diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f23064da26319..f25002799fde1 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -126,6 +126,7 @@ sycl/test-e2e/KernelFusion @intel/dpcpp-kernel-fusion-reviewers sycl/include/sycl/ext/oneapi/matrix/ @intel/sycl-matrix-reviewers sycl/test-e2e/Matrix @intel/sycl-matrix-reviewers sycl/test/matrix @intel/sycl-matrix-reviewers +sycl/test/check_device_code/matrix @intel/sycl-matrix-reviewers # Native CPU llvm/**/*SYCLNativeCPU* @intel/dpcpp-nativecpu-pi-reviewers @@ -164,3 +165,15 @@ sycl/test-e2e/DeviceCodeSplit/ @intel/dpcpp-tools-reviewers sycl/test-e2e/SeparateCompile/ @intel/dpcpp-tools-reviewers sycl/test-e2e/Printf/ @intel/dpcpp-tools-reviewers @intel/llvm-reviewers-runtime sycl/test-e2e/SpecConstants/ @intel/dpcpp-tools-reviewers + +# Sanitizer +clang/lib/Driver/SanitizerArgs.cpp @intel/dpcpp-sanitizers-review +libdevice/sanitizer_utils.cpp @intel/dpcpp-sanitizers-review +libdevice/include/asan_libdevice.hpp @intel/dpcpp-sanitizers-review +libdevice/include/sanitizer_utils.hpp @intel/dpcpp-sanitizers-review +llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @intel/dpcpp-sanitizers-review +sycl/test-e2e/AddressSanitizer/ @intel/dpcpp-sanitizers-review +llvm/test/Instrumentation/AddressSanitizer/ @intel/dpcpp-sanitizers-review +llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h @intel/dpcpp-sanitizers-review +llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h @intel/dpcpp-sanitizers-review +llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h @intel/dpcpp-sanitizers-review diff --git a/.github/workflows/sycl-linux-precommit-aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml index f7fe4cad3ea96..990fb89dcaca8 100644 --- a/.github/workflows/sycl-linux-precommit-aws.yml +++ b/.github/workflows/sycl-linux-precommit-aws.yml @@ -19,7 +19,7 @@ permissions: jobs: create-check: - runs-on: [Linux, build] + runs-on: [Linux, aux-tasks] permissions: 
checks: write statuses: write @@ -64,7 +64,7 @@ jobs: with: name: CUDA E2E runner: '["aws_cuda-${{ github.event.workflow_run.id }}-${{ github.event.workflow_run.run_attempt }}"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab image_options: -u 1001 --gpus all --cap-add SYS_ADMIN --env NVIDIA_DISABLE_REQUIRE=1 target_devices: ext_oneapi_cuda:gpu # No idea why but that seems to work and be in sync with the main @@ -79,7 +79,7 @@ jobs: update-check: needs: [create-check, e2e-cuda] if: always() - runs-on: [Linux, build] + runs-on: [Linux, aux-tasks] permissions: checks: write statuses: write diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index f6e31541b7188..19d106fa23675 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -46,6 +46,7 @@ jobs: build_artifact_suffix: "default" build_cache_suffix: "default" changes: ${{ needs.detect_changes.outputs.filters }} + build_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab" determine_arc_tests: name: Decide which Arc tests to run @@ -77,7 +78,7 @@ jobs: include: - name: AMD/HIP runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab image_options: -u 1001 --device=/dev/dri --device=/dev/kfd target_devices: ext_oneapi_hip:gpu - name: Intel diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index f5b3453e6db98..fc0b90be7990a 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -74,13 +74,6 @@ jobs: target_devices: opencl:cpu tests_selector: e2e - - name: Self-hosted CUDA - runner: '["Linux", "cuda"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest - image_options: -u 1001 --gpus all --cap-add 
SYS_ADMIN - target_devices: ext_oneapi_cuda:gpu - tests_selector: e2e - - name: SYCL-CTS on OCL CPU runner: '["Linux", "gen12"]' image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest diff --git a/buildbot/configure.py b/buildbot/configure.py index 720ebb156eae9..f172be352ba7d 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -41,6 +41,7 @@ def do_configure(args): fusion_dir = os.path.join(abs_src_dir, "sycl-fusion") llvm_targets_to_build = args.host_target llvm_enable_projects = 'clang;' + llvm_external_projects + libclc_build_native = 'OFF' libclc_targets_to_build = '' libclc_gen_remangled_variants = 'OFF' sycl_build_pi_hip_platform = 'AMD' @@ -88,8 +89,10 @@ def do_configure(args): sycl_enabled_plugins.append("hip") if args.native_cpu: - # Todo: we should set whatever targets we support for native cpu - libclc_targets_to_build += ";x86_64-unknown-linux-gnu" + if args.native_cpu_libclc_targets: + libclc_targets_to_build += ";" + args.native_cpu_libclc_targets + else: + libclc_build_native = "ON" libclc_gen_remangled_variants = "ON" sycl_enabled_plugins.append("native_cpu") @@ -191,6 +194,7 @@ def do_configure(args): "-DLIBCLC_GENERATE_REMANGLED_VARIANTS={}".format( libclc_gen_remangled_variants ), + "-DLIBCLC_NATIVECPU_HOST_TARGET={}".format(libclc_build_native), ] ) @@ -257,8 +261,8 @@ def main(): parser.add_argument("--native_cpu", action='store_true', help="Enable SYCL Native CPU") parser.add_argument("--hip", action='store_true', help="switch from OpenCL to HIP") parser.add_argument("--hip-platform", type=str, choices=['AMD', 'NVIDIA'], default='AMD', help="choose hardware platform for HIP backend") - parser.add_argument("--host-target", default='X86', - help="host LLVM target architecture, defaults to X86, multiple targets may be provided as a semi-colon separated string") + parser.add_argument("--host-target", default='host', + help="host LLVM target architecture, defaults to \'host\', multiple targets may be provided as a semi-colon 
separated string") parser.add_argument("--enable-all-llvm-targets", action='store_true', help="build compiler with all supported targets, it doesn't change runtime build") parser.add_argument("--no-assertions", action='store_true', help="build without assertions") parser.add_argument("--docs", action='store_true', help="build Doxygen documentation") @@ -276,6 +280,7 @@ def main(): parser.add_argument("--disable-preview-lib", action='store_true', help="Disable building of the SYCL runtime major release preview library") parser.add_argument("--disable-fusion", action="store_true", help="Disable the kernel fusion JIT compiler") parser.add_argument("--add_security_flags", type=str, choices=['none', 'default', 'sanitize'], default=None, help="Enables security flags for compile & link. Two values are supported: 'default' and 'sanitize'. 'Sanitize' option is an extension of 'default' set.") + parser.add_argument('--native-cpu-libclc-targets', help='Target triples for libclc, used by the Native CPU backend') args = parser.parse_args() print("args:{}".format(args)) diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 1eb61d83ff2db..203394297a77a 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -395,6 +395,8 @@ def warn_drv_opt_requires_opt : Warning<"'%0' should be used only in conjunction with '%1'">, InGroup; def err_drv_sycl_missing_amdgpu_arch : Error< "missing AMDGPU architecture for SYCL offloading; specify it with '-Xsycl-target-backend%select{|=%1}0 --offload-arch='">; +def err_drv_sycl_thinlto_split_off: Error< + "'%0' is not supported when '%1' is set with '-fsycl'">; def warn_drv_sycl_offload_target_duplicate : Warning< "SYCL offloading target '%0' is similar to target '%1' already specified; " "will be ignored">, InGroup; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 
74c17c6646669..ebeb3d17ce205 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -12031,10 +12031,10 @@ bool ASTContext::DeclMustBeEmitted(const Decl *D) { // or `indirectly_callable' attribute must be emitted regardless of number // of actual uses if (LangOpts.SYCLIsDevice && isa(D)) { - if (auto *A = D->getAttr()) - return !A->isImplicit(); - if (auto *A = D->getAttr()) - return !A->isImplicit(); + if (D->hasAttr()) + return true; + if (D->hasAttr()) + return true; } GVALinkage Linkage = GetGVALinkageForFunction(FD); diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index acdc5d9daadd5..759b751249595 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -286,9 +286,9 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__SYCL_CUDA_ARCH__", CUDAArchCode); } else { Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); - if (GPU == CudaArch::SM_90a) - Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1"); } + if (GPU == CudaArch::SM_90a) + Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1"); } } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index b2d0d5edc9232..759c4f2fe6286 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1881,9 +1881,9 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) { } static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) { - assert(FPAccuracyStr.equals("high") || FPAccuracyStr.equals("medium") || - FPAccuracyStr.equals("low") || FPAccuracyStr.equals("sycl") || - FPAccuracyStr.equals("cuda")); + assert(FPAccuracyStr == "high" || FPAccuracyStr == "medium" || + FPAccuracyStr == "low" || FPAccuracyStr == "sycl" || + FPAccuracyStr == "cuda"); return llvm::StringSwitch(FPAccuracyStr) .Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high) .Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium) diff --git 
a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp index 5bad7146895d7..2e85f85fbc1c1 100644 --- a/clang/lib/Driver/Compilation.cpp +++ b/clang/lib/Driver/Compilation.cpp @@ -198,7 +198,7 @@ bool Compilation::CleanupFile(const char *File, bool IssueErrors) const { // when the nvptx*-nvidia-cuda is passed to -fsycl-targets. if (DefaultToolChain.getTriple().isNVPTX()) return false; - if (llvm::sys::path::extension(ActualFile).equals(".spv")) + if (llvm::sys::path::extension(ActualFile) == ".spv") return false; } } diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index abea40cd8f0c5..7ad00c96c5662 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -829,8 +829,8 @@ static bool isValidSYCLTriple(llvm::Triple T) { // SPIR/SPIRV arch, but has invalid SubArch for AOT. StringRef A(T.getArchName()); if (T.getSubArch() == llvm::Triple::NoSubArch && - ((T.getArch() == llvm::Triple::spir && !A.equals("spir")) || - (T.getArch() == llvm::Triple::spir64 && !A.equals("spir64")))) + ((T.getArch() == llvm::Triple::spir && A != "spir") || + (T.getArch() == llvm::Triple::spir64 && A != "spir64"))) return false; return true; } @@ -1149,7 +1149,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, return; const char *ArgValue = A->getValue(); for (const StringRef AllowedValue : AllowedValues) - if (AllowedValue.equals(ArgValue)) + if (AllowedValue == ArgValue) return; Diag(clang::diag::err_drv_invalid_argument_to_option) << ArgValue << A->getOption().getName(); @@ -1182,6 +1182,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, // of -fsycl*target options passed Arg *SYCLTargetsValues = SYCLTargets; if (SYCLTargetsValues) { + llvm::StringSet<> SYCLTriples; if (SYCLTargetsValues->getNumValues()) { // Multiple targets are currently not supported when using @@ -1220,15 +1221,40 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, const ToolChain *HostTC = 
C.getSingleOffloadToolChain(); llvm::Triple HostTriple = HostTC->getTriple(); - UniqueSYCLTriplesVec.push_back(HostTriple); + SYCLTriples.insert(HostTriple.normalize()); continue; } - if (!isValidSYCLTriple(MakeSYCLDeviceTriple(UserTargetName))) { + llvm::Triple DeviceTriple(MakeSYCLDeviceTriple(UserTargetName)); + if (!isValidSYCLTriple(DeviceTriple)) { Diag(clang::diag::err_drv_invalid_sycl_target) << Val; continue; } + // For any -fsycl-targets=spir64_gen additions, we will scan the + // additional -X* options for potential -device settings. These + // need to be added as a known Arch to the packager. + if (DeviceTriple.isSPIRAOT() && Arch.empty() && + DeviceTriple.getSubArch() == llvm::Triple::SPIRSubArch_gen) { + const ToolChain *HostTC = + C.getSingleOffloadToolChain(); + auto DeviceTC = std::make_unique( + *this, DeviceTriple, *HostTC, C.getInputArgs()); + assert(DeviceTC && "Device toolchain not defined."); + ArgStringList TargetArgs; + DeviceTC->TranslateBackendTargetArgs(DeviceTC->getTriple(), + C.getInputArgs(), TargetArgs); + // Look for -device and use that as the known arch to + // be associated with the current spir64_gen entry. Grab the + // right most entry. + for (int i = TargetArgs.size() - 2; i >= 0; --i) { + if (StringRef(TargetArgs[i]) == "-device") { + Arch = TargetArgs[i + 1]; + break; + } + } + } + // Make sure we don't have a duplicate triple. std::string NormalizedName = MakeSYCLDeviceTriple(Val).normalize(); auto Duplicate = FoundNormalizedTriples.find(NormalizedName); @@ -1241,11 +1267,16 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, // Store the current triple so that we can check for duplicates in // the following iterations. 
FoundNormalizedTriples[NormalizedName] = Val; - llvm::Triple DeviceTriple(MakeSYCLDeviceTriple(UserTargetName)); - UniqueSYCLTriplesVec.push_back(DeviceTriple); + SYCLTriples.insert(DeviceTriple.normalize()); if (!Arch.empty()) DerivedArchs[DeviceTriple.getTriple()].insert(Arch); } + if (!SYCLTriples.empty()) { + for (const auto &SYCLTriple : SYCLTriples) { + llvm::Triple Triple(SYCLTriple.getKey()); + UniqueSYCLTriplesVec.push_back(Triple); + } + } addSYCLDefaultTriple(C, UniqueSYCLTriplesVec); } else Diag(clang::diag::warn_drv_empty_joined_argument) @@ -1891,7 +1922,7 @@ Compilation *Driver::BuildCompilation(ArrayRef ArgList) { // an external option setting is required to target hardware. setOffloadCompileMode(FPGAEmulationMode); for (StringRef ArgString : TargetArgs) { - if (ArgString.equals("-hardware") || ArgString.equals("-simulation")) { + if (ArgString == "-hardware" || ArgString == "-simulation") { setOffloadCompileMode(FPGAHWMode); break; } @@ -5022,17 +5053,16 @@ class OffloadingActionBuilder final { } // By default, we produce an action for each device arch. - auto TC = ToolChains.begin(); - for (Action *&A : SYCLDeviceActions) { - if ((*TC)->getTriple().isNVPTX() && CurPhase >= phases::Backend) { + for (auto TargetActionInfo : + llvm::zip(SYCLDeviceActions, SYCLTargetInfoList)) { + auto &TargetInfo = std::get<1>(TargetActionInfo); + if (TargetInfo.TC->getTriple().isNVPTX() && CurPhase >= phases::Backend) // For CUDA, stop to emit LLVM IR so it can be linked later on. 
- ++TC; continue; - } + Action *&A = std::get<0>(TargetActionInfo); A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A, AssociatedOffloadKind); - ++TC; } return ABRT_Success; @@ -6241,12 +6271,12 @@ class OffloadingActionBuilder final { using namespace tools::SYCL; StringRef Device{Value.first}; if (Device.consume_front(gen::AmdGPU)) - return TargetArch.equals(Device) && TargetTriple.isAMDGCN(); + return TargetArch == Device && TargetTriple.isAMDGCN(); if (Device.consume_front(gen::NvidiaGPU)) - return TargetArch.equals(Device) && TargetTriple.isNVPTX(); + return TargetArch == Device && TargetTriple.isNVPTX(); if (Device.consume_front(gen::IntelGPU)) - return TargetArch.equals(Device) && TargetTriple.isSPIRAOT(); - return TargetArch.equals(Device) && isValidSYCLTriple(TargetTriple); + return TargetArch == Device && TargetTriple.isSPIRAOT(); + return TargetArch == Device && isValidSYCLTriple(TargetTriple); }); } else { TargetIt = TargetTable.find(TargetTriple.str()); @@ -9604,7 +9634,8 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, bool IsHIPNoRDC = JA.getOffloadingDeviceKind() == Action::OFK_HIP && !C.getArgs().hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false); - bool UseOutExtension = IsHIPNoRDC || isa(JA); + bool UseOutExtension = IsHIPNoRDC || isa(JA) || + isa(JA); if (UseOutExtension) { Output = BaseName; llvm::sys::path::replace_extension(Output, ""); @@ -9701,7 +9732,7 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, const auto &ResultFiles = C.getResultFiles(); const auto CollidingFilenameIt = llvm::find_if(ResultFiles, [NamedOutput](const auto &It) { - return StringRef(NamedOutput).equals(It.second); + return StringRef(NamedOutput) == It.second; }); if (CollidingFilenameIt != ResultFiles.end()) { // Upon any collision, a unique hash will be appended to the filename, diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 
76594ae861313..53f56a59664f6 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -1160,6 +1160,8 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args, CmdArgs.push_back("-asan-stack=0"); CmdArgs.push_back("-mllvm"); CmdArgs.push_back("-asan-globals=0"); + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back("-asan-mapping-scale=4"); } return; } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 5677175b5f867..6e5645552d55e 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4967,7 +4967,7 @@ void Clang::ConstructHostCompilerJob(Compilation &C, const JobAction &JA, if (isa(JA)) { if (IsMSVCHostCompiler) { // Check the output file, if it is 'stdout' we want to use -E. - if (StringRef(Output.getFilename()).equals("-")) { + if (StringRef(Output.getFilename()) == "-") { HostCompileArgs.push_back("-E"); OutputAdded = true; } else { @@ -5858,10 +5858,19 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-emit-llvm-uselists"); if (IsUsingLTO) { - if (IsDeviceOffloadAction && !JA.isDeviceOffloading(Action::OFK_OpenMP) && - !Args.hasFlag(options::OPT_offload_new_driver, - options::OPT_no_offload_new_driver, false) && - !Triple.isAMDGPU()) { + bool IsUsingOffloadNewDriver = + Args.hasFlag(options::OPT_offload_new_driver, + options::OPT_no_offload_new_driver, false); + Arg *SYCLSplitMode = + Args.getLastArg(options::OPT_fsycl_device_code_split_EQ); + bool IsDeviceCodeSplitDisabled = + SYCLSplitMode && StringRef(SYCLSplitMode->getValue()) == "off"; + bool IsSYCLLTOSupported = + JA.isDeviceOffloading(Action::OFK_SYCL) && IsUsingOffloadNewDriver; + if ((IsDeviceOffloadAction && + !JA.isDeviceOffloading(Action::OFK_OpenMP) && !Triple.isAMDGPU() && + !IsUsingOffloadNewDriver) || + (JA.isDeviceOffloading(Action::OFK_SYCL) && !IsSYCLLTOSupported)) { 
D.Diag(diag::err_drv_unsupported_opt_for_target) << Args.getLastArg(options::OPT_foffload_lto, options::OPT_foffload_lto_EQ) @@ -5874,6 +5883,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_foffload_lto_EQ) ->getAsString(Args) << "-fno-gpu-rdc"; + } else if (JA.isDeviceOffloading(Action::OFK_SYCL) && + IsDeviceCodeSplitDisabled && LTOMode == LTOK_Thin) { + D.Diag(diag::err_drv_sycl_thinlto_split_off) + << SYCLSplitMode->getAsString(Args) + << Args.getLastArg(options::OPT_foffload_lto, + options::OPT_foffload_lto_EQ) + ->getAsString(Args); } else { assert(LTOMode == LTOK_Full || LTOMode == LTOK_Thin); CmdArgs.push_back(Args.MakeArgString( @@ -10281,6 +10297,41 @@ void OffloadPackager::ConstructJob(Compilation &C, const JobAction &JA, for (StringRef Feature : FeatureArgs) Parts.emplace_back("feature=" + Feature.str()); + // Now that the standard parts are added to the packager string, add any + // additional supplemental options that cover compile and link opts that + // are used for SYCL based offloading. + // Here, we add the compile and link options that are required by backend + // compilers and the clang-offload-wrapper in the case of SYCL offloading. 
+ if (OffloadAction->getOffloadingDeviceKind() == Action::OFK_SYCL) { + ArgStringList BuildArgs; + auto createArgString = [&](const char *Opt) { + if (BuildArgs.empty()) + return; + SmallString<128> AL; + for (const char *A : BuildArgs) { + if (AL.empty()) { + AL = A; + continue; + } + AL += " "; + AL += A; + } + Parts.emplace_back(C.getArgs().MakeArgString(Twine(Opt) + AL)); + }; + const ArgList &Args = + C.getArgsForToolChain(nullptr, StringRef(), Action::OFK_SYCL); + const ToolChain *HostTC = C.getSingleOffloadToolChain(); + const toolchains::SYCLToolChain &SYCLTC = + static_cast(*TC); + SYCLTC.AddImpliedTargetArgs(TC->getTriple(), Args, BuildArgs, JA, *HostTC, + Arch); + SYCLTC.TranslateBackendTargetArgs(TC->getTriple(), Args, BuildArgs, Arch); + createArgString("compile-opts="); + BuildArgs.clear(); + SYCLTC.TranslateLinkerTargetArgs(TC->getTriple(), Args, BuildArgs, Arch); + createArgString("link-opts="); + } + CmdArgs.push_back(Args.MakeArgString("--image=" + llvm::join(Parts, ","))); } @@ -10436,6 +10487,7 @@ static void getOtherSPIRVTransOpts(Compilation &C, ",+SPV_INTEL_fpga_invocation_pipelining_attributes" ",+SPV_INTEL_fpga_latency_control" ",+SPV_INTEL_task_sequence" + ",+SPV_KHR_shader_clock" ",+SPV_INTEL_bindless_images"; ExtArg = ExtArg + DefaultExtArg + INTELExtArg; if (C.getDriver().IsFPGAHWMode()) @@ -10486,7 +10538,7 @@ void SPIRVTranslator::ConstructJob(Compilation &C, const JobAction &JA, // Handle -Xspirv-translator TC.TranslateTargetOpt( - TCArgs, TranslatorArgs, options::OPT_Xspirv_translator, + Triple, TCArgs, TranslatorArgs, options::OPT_Xspirv_translator, options::OPT_Xspirv_translator_EQ, JA.getOffloadingArch()); } for (auto I : Inputs) { @@ -10592,10 +10644,10 @@ static void addArgs(ArgStringList &DstArgs, const llvm::opt::ArgList &Alloc, } } -static void getOtherSYCLPostLinkOpts(const ToolChain &TC, const JobAction &JA, - const llvm::opt::ArgList &TCArgs, - ArgStringList &PostLinkArgs, - bool SpecConsts, types::ID OutputType) { 
+static void getNonTripleBasedSYCLPostLinkOpts(const ToolChain &TC, + const JobAction &JA, + const llvm::opt::ArgList &TCArgs, + ArgStringList &PostLinkArgs) { // See if device code splitting is requested if (Arg *A = TCArgs.getLastArg(options::OPT_fsycl_device_code_split_EQ)) { auto CodeSplitValue = StringRef(A->getValue()); @@ -10608,19 +10660,8 @@ static void getOtherSYCLPostLinkOpts(const ToolChain &TC, const JobAction &JA, else { // Device code split is off } } - if (OutputType == types::TY_LLVM_BC) { - // single file output requested - this means only perform necessary IR - // transformations (like specialization constant intrinsic lowering) and - // output LLVMIR - addArgs(PostLinkArgs, TCArgs, {"-ir-output-only"}); - } addArgs(PostLinkArgs, TCArgs, {StringRef(getSYCLPostLinkOptimizationLevel(TCArgs))}); - // specialization constants processing is mandatory - if (SpecConsts) - addArgs(PostLinkArgs, TCArgs, {"-spec-const=native"}); - else - addArgs(PostLinkArgs, TCArgs, {"-spec-const=emulation"}); // Process device-globals. addArgs(PostLinkArgs, TCArgs, {"-device-globals"}); @@ -10631,32 +10672,50 @@ static void getOtherSYCLPostLinkOpts(const ToolChain &TC, const JobAction &JA, addArgs(PostLinkArgs, TCArgs, {"-lower-esimd-force-stateless-mem=false"}); } -// Add any sycl-post-link options that rely on a specific Triple. -static void -getTripleBasedSYCLPostLinkOpts(const ToolChain &TC, const JobAction &JA, - const llvm::opt::ArgList &TCArgs, - llvm::Triple Triple, ArgStringList &PostLinkArgs, - bool SpecConsts, types::ID OutputType) { +// Add any sycl-post-link options that rely on a specific Triple in addition +// to user supplied options. This function is invoked only for the old +// offloading model. For the new offloading model, a slightly modified version +// of this function is called inside clang-linker-wrapper. 
+// NOTE: Any changes made here should be reflected in the similarly named +// function in clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp. +static void getTripleBasedSYCLPostLinkOpts(const ToolChain &TC, + const llvm::opt::ArgList &TCArgs, + ArgStringList &PostLinkArgs, + llvm::Triple Triple, + bool SpecConstsSupported, + types::ID OutputType) { + if (OutputType == types::TY_LLVM_BC) { + // single file output requested - this means only perform necessary IR + // transformations (like specialization constant intrinsic lowering) and + // output LLVMIR + addArgs(PostLinkArgs, TCArgs, {"-ir-output-only"}); + } + if (SpecConstsSupported) + addArgs(PostLinkArgs, TCArgs, {"-spec-const=native"}); + else + addArgs(PostLinkArgs, TCArgs, {"-spec-const=emulation"}); // See if device code splitting is requested. The logic here works along side - // the behavior in setOtherSYCLPostLinkOpts, where the option is added based - // on the user setting of-fsycl-device-code-split. - if (!(TCArgs.hasArg(options::OPT_fsycl_device_code_split_EQ) || - Triple.getArchName() == "spir64_fpga")) + // the behavior in getNonTripleBasedSYCLPostLinkOpts, where the option is + // added based on the user setting of -fsycl-device-code-split. + if (!TCArgs.hasArg(options::OPT_fsycl_device_code_split_EQ) && + (Triple.getArchName() != "spir64_fpga")) addArgs(PostLinkArgs, TCArgs, {"-split=auto"}); // On Intel targets we don't need non-kernel functions as entry points, // because it only increases amount of code for device compiler to handle, // without any actual benefits. // TODO: Try to extend this feature for non-Intel GPUs. 
- if (!TCArgs.hasFlag(options::OPT_fno_sycl_remove_unused_external_funcs, - options::OPT_fsycl_remove_unused_external_funcs, false) && - !Triple.isNVPTX() && !Triple.isAMDGPU() && !isSYCLNativeCPU(TC)) + if ((!TCArgs.hasFlag(options::OPT_fno_sycl_remove_unused_external_funcs, + options::OPT_fsycl_remove_unused_external_funcs, + false) && + !isSYCLNativeCPU(TC)) && + !Triple.isNVPTX() && !Triple.isAMDGPU()) addArgs(PostLinkArgs, TCArgs, {"-emit-only-kernels-as-entry-points"}); - if (!(Triple.isAMDGCN())) + if (!Triple.isAMDGCN()) addArgs(PostLinkArgs, TCArgs, {"-emit-param-info"}); - // Enable PI program metadata + // Enable program metadata if (Triple.isNVPTX() || Triple.isAMDGCN() || isSYCLNativeCPU(TC)) addArgs(PostLinkArgs, TCArgs, {"-emit-program-metadata"}); if (OutputType != types::TY_LLVM_BC) { @@ -10669,18 +10728,19 @@ getTripleBasedSYCLPostLinkOpts(const ToolChain &TC, const JobAction &JA, // add options unconditionally addArgs(PostLinkArgs, TCArgs, {"-symbols"}); addArgs(PostLinkArgs, TCArgs, {"-emit-exported-symbols"}); + addArgs(PostLinkArgs, TCArgs, {"-emit-imported-symbols"}); if (SplitEsimd) addArgs(PostLinkArgs, TCArgs, {"-split-esimd"}); addArgs(PostLinkArgs, TCArgs, {"-lower-esimd"}); } - bool isAOT = Triple.isNVPTX() || Triple.isAMDGCN() || + bool IsAOT = Triple.isNVPTX() || Triple.isAMDGCN() || Triple.getSubArch() == llvm::Triple::SPIRSubArch_fpga || Triple.getSubArch() == llvm::Triple::SPIRSubArch_gen || Triple.getSubArch() == llvm::Triple::SPIRSubArch_x86_64; if (TCArgs.hasFlag(options::OPT_fsycl_add_default_spec_consts_image, options::OPT_fno_sycl_add_default_spec_consts_image, false) && - isAOT) + IsAOT) addArgs(PostLinkArgs, TCArgs, {"-generate-device-image-default-spec-consts"}); } @@ -10703,10 +10763,8 @@ void SYCLPostLink::ConstructJob(Compilation &C, const JobAction &JA, ArgStringList CmdArgs; llvm::Triple T = getToolChain().getTriple(); - getOtherSYCLPostLinkOpts(getToolChain(), JA, TCArgs, CmdArgs, - 
SYCLPostLink->getRTSetsSpecConstants(), - SYCLPostLink->getTrueType()); - getTripleBasedSYCLPostLinkOpts(getToolChain(), JA, TCArgs, T, CmdArgs, + getNonTripleBasedSYCLPostLinkOpts(getToolChain(), JA, TCArgs, CmdArgs); + getTripleBasedSYCLPostLinkOpts(getToolChain(), TCArgs, CmdArgs, T, SYCLPostLink->getRTSetsSpecConstants(), SYCLPostLink->getTrueType()); @@ -10717,16 +10775,16 @@ void SYCLPostLink::ConstructJob(Compilation &C, const JobAction &JA, if (T.getSubArch() == llvm::Triple::SPIRSubArch_gen && Device.data()) OutputArg = ("intel_gpu_" + Device + "," + OutputArg).str(); - addArgs(CmdArgs, TCArgs, {"-o", OutputArg}); - const toolchains::SYCLToolChain &TC = static_cast(getToolChain()); // Handle -Xdevice-post-link - TC.TranslateTargetOpt(TCArgs, CmdArgs, options::OPT_Xdevice_post_link, + TC.TranslateTargetOpt(T, TCArgs, CmdArgs, options::OPT_Xdevice_post_link, options::OPT_Xdevice_post_link_EQ, JA.getOffloadingArch()); + addArgs(CmdArgs, TCArgs, {"-o", OutputArg}); + // Add input file assert(Inputs.size() == 1 && Inputs.front().isFilename() && "single input file expected"); @@ -10908,7 +10966,8 @@ void SpirvToIrWrapper::ConstructJob(Compilation &C, const JobAction &JA, static_cast(getToolChain()); // Handle -Xspirv-to-ir-wrapper - TC.TranslateTargetOpt(TCArgs, CmdArgs, options::OPT_Xspirv_to_ir_wrapper, + TC.TranslateTargetOpt(getToolChain().getTriple(), TCArgs, CmdArgs, + options::OPT_Xspirv_to_ir_wrapper, options::OPT_Xspirv_to_ir_wrapper_EQ, JA.getOffloadingArch()); @@ -10973,13 +11032,6 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, if (Args.hasArg(options::OPT_v)) CmdArgs.push_back("--wrapper-verbose"); - // TODO(NOM2): Pass following options to clang-linker-wrapper. - // Please refer to sycl/doc/design/OffloadDesign.md for details. 
- // sycl-device-libraries - // sycl-device-library-location - // sycl-post-link-options - // llvm-spirv-options - if (const Arg *A = Args.getLastArg(options::OPT_g_Group)) { if (!A->getOption().matches(options::OPT_g0)) CmdArgs.push_back("--device-debug"); @@ -11012,12 +11064,14 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, // Add any SYCL offloading specific options to the clang-linker-wrapper if (C.hasOffloadToolChain()) { // -sycl-device-libraries= contains all of the SYCL - // device specific libraries that are needed. This provides the list of - // files file only. - // TODO: This generic list will be populated with only device binaries - // for spir/spirv. Other targets (AOT and others) can represent a different - // set of device libraries. We will cross that bridge when we begin to - // enable the other possible targets. + // device specific libraries that are needed. This generic list will be + // populated with device binaries for all target triples in the current + // compilation flow. + + // Create a comma separated list to pass along to the linker wrapper. + SmallString<256> LibList; + // TODO: TargetTriple should not be used here for creating linker wrapper + // options. It should also not be passed to the linker wrapper. 
llvm::Triple TargetTriple; auto ToolChainRange = C.getOffloadToolChains(); for (auto &I : @@ -11026,38 +11080,24 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, if (TC->getTriple().isSPIROrSPIRV() && TC->getTriple().getSubArch() == llvm::Triple::NoSubArch) { TargetTriple = TC->getTriple(); - break; + SmallVector SYCLDeviceLibs; + bool IsSPIR = TargetTriple.isSPIROrSPIRV(); + bool IsSpirvAOT = TargetTriple.isSPIRAOT(); + bool UseJitLink = + IsSPIR && + Args.hasFlag(options::OPT_fsycl_device_lib_jit_link, + options::OPT_fno_sycl_device_lib_jit_link, false); + bool UseAOTLink = IsSPIR && (IsSpirvAOT || !UseJitLink); + SYCLDeviceLibs = SYCL::getDeviceLibraries(C, TargetTriple, UseAOTLink); + for (const auto &AddLib : SYCLDeviceLibs) { + if (LibList.size() > 0) + LibList += ","; + LibList += AddLib; + } } } - // Pass the device triple to the linker wrapper tool for SYCL offload. - // Only spir64 or spirv64 is currently passed. - // TODO(NOM1): Support target triples in a more generic way. - // TODO(NOM3): Investigate why passing spirv64-unknown-unknown does not - // work. - if (TargetTriple.isSPIR()) - CmdArgs.push_back("--triple=spir64"); - else if (TargetTriple.isSPIRV()) - CmdArgs.push_back("--triple=spirv64"); - - SmallVector SYCLDeviceLibs; - auto IsSPIR = TargetTriple.isSPIROrSPIRV(); - bool IsSpirvAOT = TargetTriple.isSPIRAOT(); - bool UseJitLink = - IsSPIR && - Args.hasFlag(options::OPT_fsycl_device_lib_jit_link, - options::OPT_fno_sycl_device_lib_jit_link, false); - bool UseAOTLink = IsSPIR && (IsSpirvAOT || !UseJitLink); - SYCLDeviceLibs = SYCL::getDeviceLibraries(C, TargetTriple, UseAOTLink); - // Create a comma separated list to pass along to the linker wrapper. - SmallString<256> LibList; - for (const auto &AddLib : SYCLDeviceLibs) { - if (LibList.size() > 0) - LibList += ","; - LibList += AddLib; - } // -sycl-device-libraries= provides a comma separate list of // libraries to add to the device linking step. 
- // SYCL device libraries can be found. if (LibList.size()) CmdArgs.push_back( Args.MakeArgString(Twine("-sycl-device-libraries=") + LibList)); @@ -11097,26 +11137,14 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, // --sycl-post-link-options="options" provides a string of options to be // passed along to the sycl-post-link tool during device link. SmallString<128> PostLinkOptString; + ArgStringList PostLinkArgs; + getNonTripleBasedSYCLPostLinkOpts(getToolChain(), JA, Args, PostLinkArgs); + for (const auto &A : PostLinkArgs) + appendOption(PostLinkOptString, A); if (Args.hasArg(options::OPT_Xdevice_post_link)) { for (const auto &A : Args.getAllArgValues(options::OPT_Xdevice_post_link)) appendOption(PostLinkOptString, A); } - ArgStringList PostLinkArgs; - bool IsSYCLNativeCPU = driver::isSYCLNativeCPU(Args); - types::ID OutputType = TargetTriple.isSPIROrSPIRV() || IsSYCLNativeCPU - ? types::TY_Tempfiletable - : types::TY_LLVM_BC; - // TODO: Items like native_cpu and Specialization Constants behaviors are - // dependent on each toolchain. Passing these along as 'general settings' - // for the clang-linker-wrapper causes for potential inconsistencies and - // would need to handled more at the device linking level. - bool SpecConsts = TargetTriple.isSPIROrSPIRV(); - getOtherSYCLPostLinkOpts(getToolChain(), JA, Args, PostLinkArgs, SpecConsts, - OutputType); - getTripleBasedSYCLPostLinkOpts(getToolChain(), JA, Args, TargetTriple, - PostLinkArgs, SpecConsts, OutputType); - for (const auto &A : PostLinkArgs) - appendOption(PostLinkOptString, A); if (!PostLinkOptString.empty()) CmdArgs.push_back( Args.MakeArgString("--sycl-post-link-options=" + PostLinkOptString)); @@ -11140,25 +11168,50 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, const toolchains::SYCLToolChain &SYCLTC = static_cast(getToolChain()); // Only store compile/link opts in the image descriptor for the SPIR-V - // target. + // target. 
For AOT, pass along the addition options via GPU or CPU + // specific clang-linker-wrapper options. const ArgList &Args = C.getArgsForToolChain(nullptr, StringRef(), Action::OFK_SYCL); - ArgStringList BuildArgs; - OptString.clear(); - SYCLTC.TranslateBackendTargetArgs(TargetTriple, Args, BuildArgs); - for (const auto &A : BuildArgs) - appendOption(OptString, A); - if (!OptString.empty()) - CmdArgs.push_back( - Args.MakeArgString("--sycl-backend-compile-options=" + OptString)); - BuildArgs.clear(); - OptString.clear(); - SYCLTC.TranslateLinkerTargetArgs(TargetTriple, Args, BuildArgs); - for (const auto &A : BuildArgs) - appendOption(OptString, A); - if (!OptString.empty()) - CmdArgs.push_back( - Args.MakeArgString("--sycl-target-link-options=" + OptString)); + for (auto &ToolChainMember : + llvm::make_range(ToolChainRange.first, ToolChainRange.second)) { + const ToolChain *TC = ToolChainMember.second; + bool IsJIT = false; + StringRef WrapperOption; + StringRef WrapperLinkOption; + if (TC->getTriple().isSPIROrSPIRV()) { + if (TC->getTriple().getSubArch() == llvm::Triple::NoSubArch) { + IsJIT = true; + WrapperOption = "--sycl-backend-compile-options="; + } + if (TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen) + WrapperOption = "--gen-tool-arg="; + if (TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_x86_64) + WrapperOption = "--cpu-tool-arg="; + } else + continue; + ArgStringList BuildArgs; + SmallString<128> BackendOptString; + SmallString<128> LinkOptString; + SYCLTC.TranslateBackendTargetArgs(TC->getTriple(), Args, BuildArgs); + for (const auto &A : BuildArgs) + appendOption(BackendOptString, A); + + BuildArgs.clear(); + SYCLTC.TranslateLinkerTargetArgs(TC->getTriple(), Args, BuildArgs); + for (const auto &A : BuildArgs) { + if (IsJIT) + appendOption(LinkOptString, A); + else + // For AOT, combine the Backend and Linker strings into one. 
+ appendOption(BackendOptString, A); + } + if (!BackendOptString.empty()) + CmdArgs.push_back( + Args.MakeArgString(Twine(WrapperOption) + BackendOptString)); + if (!LinkOptString.empty()) + CmdArgs.push_back( + Args.MakeArgString("--sycl-target-link-options=" + LinkOptString)); + } } // Construct the link job so we can wrap around it. diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 8f3b47937c512..ba0a8d928c8fa 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1568,9 +1568,8 @@ bool tools::isDependentLibAdded(const ArgList &Args, StringRef Lib) { // Check if given Lib is added via --dependent-lib SmallString<64> DepLib("--dependent-lib="); DepLib += Lib; - return llvm::any_of( - Args.getAllArgValues(options::OPT_Xclang), - [&DepLib](StringRef Option) { return Option.equals(DepLib); }); + return llvm::any_of(Args.getAllArgValues(options::OPT_Xclang), + [&DepLib](StringRef Option) { return Option == DepLib; }); } const char *tools::SplitDebugName(const JobAction &JA, const ArgList &Args, diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index edd1e7fec46c9..1db4500bd3b51 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -114,7 +114,7 @@ void SYCL::constructLLVMForeachCommand(Compilation &C, const JobAction &JA, // If fsycl-dump-device-code is passed, put the PTX files // into the path provided in fsycl-dump-device-code. 
if (T->getToolChain().getTriple().isNVPTX() && - C.getDriver().isDumpDeviceCodeEnabled() && Ext.equals("s")) { + C.getDriver().isDumpDeviceCodeEnabled() && Ext == "s") { SmallString<128> OutputDir; Arg *DumpDeviceCodeArg = @@ -235,12 +235,11 @@ SYCL::getDeviceLibraries(const Compilation &C, const llvm::Triple &TargetTriple, for (StringRef Val : A->getValues()) { if (Val == "all") { for (const auto &K : DeviceLibLinkInfo.keys()) - DeviceLibLinkInfo[K] = - true && (!NoDeviceLibs || K.equals("internal")); + DeviceLibLinkInfo[K] = true && (!NoDeviceLibs || K == "internal"); break; } auto LinkInfoIter = DeviceLibLinkInfo.find(Val); - if (LinkInfoIter == DeviceLibLinkInfo.end() || Val.equals("internal")) { + if (LinkInfoIter == DeviceLibLinkInfo.end() || Val == "internal") { // TODO: Move the diagnostic to the SYCL section of // Driver::CreateOffloadingDeviceToolChains() to minimize code // duplication. @@ -488,7 +487,7 @@ const char *SYCL::Linker::constructLLVMLinkCommand( for (const auto &L : SYCLDeviceLibList) { std::string DeviceLibName(L); DeviceLibName.append(LibPostfix); - if (StringRef(PureLibName).equals(DeviceLibName) || + if (StringRef(PureLibName) == DeviceLibName || (IsNVPTX && StringRef(PureLibName).starts_with(L))) return true; } @@ -899,7 +898,7 @@ static bool hasPVCDevice(const ArgStringList &CmdArgs) { DeviceArg = SplitArg; break; } - if (SplitArg.equals("-device")) + if (SplitArg == "-device") DeviceSeen = true; } if (DeviceSeen) @@ -982,7 +981,8 @@ void SYCL::gen::BackendCompiler::ConstructJob(Compilation &C, *HostTC, Device); TC.TranslateBackendTargetArgs(getToolChain().getTriple(), Args, CmdArgs, Device); - TC.TranslateLinkerTargetArgs(getToolChain().getTriple(), Args, CmdArgs); + TC.TranslateLinkerTargetArgs(getToolChain().getTriple(), Args, CmdArgs, + Device); SmallString<128> ExecPath( getToolChain().GetProgramPath(makeExeName(C, "ocloc"))); const char *Exec = C.getArgs().MakeArgString(ExecPath); @@ -1049,6 +1049,7 @@ StringRef 
SYCL::gen::resolveGenDevice(StringRef DeviceName) { .Case("nvidia_gpu_sm_87", "sm_87") .Case("nvidia_gpu_sm_89", "sm_89") .Case("nvidia_gpu_sm_90", "sm_90") + .Case("nvidia_gpu_sm_90a", "sm_90a") .Case("amd_gpu_gfx700", "gfx700") .Case("amd_gpu_gfx701", "gfx701") .Case("amd_gpu_gfx702", "gfx702") @@ -1135,6 +1136,7 @@ SmallString<64> SYCL::gen::getGenDeviceMacro(StringRef DeviceName) { .Case("sm_87", "NVIDIA_GPU_SM_87") .Case("sm_89", "NVIDIA_GPU_SM_89") .Case("sm_90", "NVIDIA_GPU_SM_90") + .Case("sm_90a", "NVIDIA_GPU_SM_90A") .Case("gfx700", "AMD_GPU_GFX700") .Case("gfx701", "AMD_GPU_GFX701") .Case("gfx702", "AMD_GPU_GFX702") @@ -1383,7 +1385,8 @@ static void WarnForDeprecatedBackendOpts(const Driver &D, // Expects a specific type of option (e.g. -Xsycl-target-backend) and will // extract the arguments. -void SYCLToolChain::TranslateTargetOpt(const llvm::opt::ArgList &Args, +void SYCLToolChain::TranslateTargetOpt(const llvm::Triple &Triple, + const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, OptSpecifier Opt, OptSpecifier Opt_EQ, StringRef Device) const { @@ -1393,15 +1396,21 @@ void SYCLToolChain::TranslateTargetOpt(const llvm::opt::ArgList &Args, if (A->getOption().matches(Opt_EQ)) { // Passing device args: -X= -opt=val. 
StringRef GenDevice = SYCL::gen::resolveGenDevice(A->getValue()); - bool IsGenTriple = - getTriple().isSPIR() && - getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen; - if (Device != GenDevice) - continue; - if (getDriver().MakeSYCLDeviceTriple(A->getValue()) != getTriple() && - (!IsGenTriple || (IsGenTriple && GenDevice.empty()))) - // Triples do not match, but only skip when we know we are not comparing - // against intel_gpu_* and non-spir64_gen + bool IsGenTriple = Triple.isSPIR() && + Triple.getSubArch() == llvm::Triple::SPIRSubArch_gen; + if (IsGenTriple) { + if (Device != GenDevice && !Device.empty()) + continue; + if (getDriver().MakeSYCLDeviceTriple(A->getValue()) != Triple && + GenDevice.empty()) + // Triples do not match, but only skip when we know we are not + // comparing against intel_gpu_* + continue; + if (getDriver().MakeSYCLDeviceTriple(A->getValue()) == Triple && + !Device.empty()) + // Triples match, but we are expecting a specific device to be set. + continue; + } else if (getDriver().MakeSYCLDeviceTriple(A->getValue()) != Triple) continue; } else if (!OptNoTriple) // Don't worry about any of the other args, we only want to pass what is @@ -1424,8 +1433,7 @@ void SYCLToolChain::TranslateTargetOpt(const llvm::opt::ArgList &Args, } else // Triple found, add the next argument in line. 
ArgString = A->getValue(1); - WarnForDeprecatedBackendOpts(getDriver(), getTriple(), Device, ArgString, - A); + WarnForDeprecatedBackendOpts(getDriver(), Triple, Device, ArgString, A); parseTargetOpts(ArgString, Args, CmdArgs); A->claim(); } @@ -1468,8 +1476,8 @@ void SYCLToolChain::AddImpliedTargetArgs(const llvm::Triple &Triple, auto ProcessElement = [&](StringRef Ele) { auto [DeviceName, RegAllocMode] = Ele.split(':'); StringRef BackendOptName = SYCL::gen::getGenGRFFlag(RegAllocMode); - bool IsDefault = RegAllocMode.equals("default"); - if (RegAllocMode.empty() || !DeviceName.equals("pvc") || + bool IsDefault = RegAllocMode == "default"; + if (RegAllocMode.empty() || DeviceName != "pvc" || (BackendOptName.empty() && !IsDefault)) { getDriver().Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << Ele; @@ -1517,7 +1525,7 @@ void SYCLToolChain::AddImpliedTargetArgs(const llvm::Triple &Triple, if (Args.hasArg(options::OPT_fintelfpga) && getDriver().IsFPGAHWMode() && Triple.getSubArch() == llvm::Triple::SPIRSubArch_fpga) { if (Arg *A = Args.getLastArg(options::OPT_ffp_model_EQ)) { - if (StringRef(A->getValue()).equals("fast")) + if (StringRef(A->getValue()) == "fast") BeArgs.push_back("-vpfp-relaxed"); } } @@ -1628,21 +1636,22 @@ void SYCLToolChain::TranslateBackendTargetArgs( Triple.isSPIROrSPIRV() && getDriver().isSYCLDefaultTripleImplied()) return; // Handle -Xsycl-target-backend. 
- TranslateTargetOpt(Args, CmdArgs, options::OPT_Xsycl_backend, + TranslateTargetOpt(Triple, Args, CmdArgs, options::OPT_Xsycl_backend, options::OPT_Xsycl_backend_EQ, Device); TranslateGPUTargetOpt(Args, CmdArgs, options::OPT_fsycl_targets_EQ); } -void SYCLToolChain::TranslateLinkerTargetArgs( - const llvm::Triple &Triple, const llvm::opt::ArgList &Args, - llvm::opt::ArgStringList &CmdArgs) const { +void SYCLToolChain::TranslateLinkerTargetArgs(const llvm::Triple &Triple, + const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + StringRef Device) const { // Do not process -Xsycl-target-linker for implied spir64/spirv64 if (Triple.getSubArch() == llvm::Triple::NoSubArch && Triple.isSPIROrSPIRV() && getDriver().isSYCLDefaultTripleImplied()) return; // Handle -Xsycl-target-linker. - TranslateTargetOpt(Args, CmdArgs, options::OPT_Xsycl_linker, - options::OPT_Xsycl_linker_EQ, StringRef()); + TranslateTargetOpt(Triple, Args, CmdArgs, options::OPT_Xsycl_linker, + options::OPT_Xsycl_linker_EQ, Device); } Tool *SYCLToolChain::buildBackendCompiler() const { diff --git a/clang/lib/Driver/ToolChains/SYCL.h b/clang/lib/Driver/ToolChains/SYCL.h index 4fe8dee807f49..3a62de18cd07d 100644 --- a/clang/lib/Driver/ToolChains/SYCL.h +++ b/clang/lib/Driver/ToolChains/SYCL.h @@ -181,8 +181,10 @@ class LLVM_LIBRARY_VISIBILITY SYCLToolChain : public ToolChain { StringRef Device = "") const; void TranslateLinkerTargetArgs(const llvm::Triple &Triple, const llvm::opt::ArgList &Args, - llvm::opt::ArgStringList &CmdArgs) const; - void TranslateTargetOpt(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + StringRef Device = "") const; + void TranslateTargetOpt(const llvm::Triple &Triple, + const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, llvm::opt::OptSpecifier Opt, llvm::opt::OptSpecifier Opt_EQ, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 6bdd566bb63ed..cd0c41031b551 100644 --- 
a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3858,7 +3858,7 @@ void CompilerInvocation::ParseFpAccuracyArgs(LangOptions &Opts, ArgList &Args, checkFPAccuracyIsValid(ValElement[0], Diags); // No need to fill the map if the FPaccuracy is 'default'. // The default builtin will be generated. - if (!ValElement[0].equals("default")) { + if (ValElement[0] != "default") { // if FPAccuracyFuncMap of this function has been previously set // update its value; the last fp-accuracy option in the command // line wins. diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 9f6c4b9a71f89..03e60c787f3f1 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3865,10 +3865,9 @@ bool Sema::CheckIntelSYCLAllocaBuiltinFunctionCall(unsigned BuiltinID, } // Check size is passed as a specialization constant - const auto CheckSize = [this, IsAlignedAlloca, ElementTypeIndex, - SpecNameIndex](const ASTContext &Ctx, - SourceLocation Loc, - const TemplateArgumentList *CST) { + const auto CheckSize = [this, IsAlignedAlloca, SpecNameIndex]( + const ASTContext &Ctx, SourceLocation Loc, + const TemplateArgumentList *CST) { TemplateArgument TA = CST->get(SpecNameIndex); QualType Ty = TA.getNonTypeTemplateArgumentType(); if (Ty.isNull() || !Ty->isReferenceType()) diff --git a/clang/test/CodeGenSYCL/force-emit-device-virtual-funcs.cpp b/clang/test/CodeGenSYCL/force-emit-device-virtual-funcs.cpp new file mode 100644 index 0000000000000..fb34cae42d9ae --- /dev/null +++ b/clang/test/CodeGenSYCL/force-emit-device-virtual-funcs.cpp @@ -0,0 +1,47 @@ +// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple spir64-unknown-unknown -fsycl-is-device \ +// RUN: -fsycl-allow-virtual-functions -emit-llvm %s -o %t.ll +// RUN: FileCheck %s --input-file=%t.ll --implicit-check-not _ZN7Derived3baz \ +// RUN: --implicit-check-not _ZN4Base4baz --implicit-check-not _ZN4Base3foo +// +// Some SYCL properties may be 
turned into 'sycl_device' attribute implicitly +// and we would like to ensure that functions like this (at the moment those +// would be virtual member functions only) are forcefully emitted into device +// code. + +class Base { + virtual void foo() {} + + virtual void baz(); + + [[__sycl_detail__::add_ir_attributes_function("indirectly-callable", "a")]] + virtual void bar(); +}; + +void Base::bar() {} + +void Base::baz() {} + +class Derived : public Base { +public: + [[__sycl_detail__::add_ir_attributes_function("indirectly-callable", "b")]] + void foo() override; + + [[__sycl_detail__::add_ir_attributes_function("indirectly-callable", "c")]] + void bar() override final; + + [[__sycl_detail__::add_ir_attributes_function("not-indirectly-callable", "c")]] + void baz() override final; +}; + +void Derived::foo() {} + +void Derived::bar() {} + +void Derived::baz() {} + +// CHECK: define {{.*}}spir_func void @_ZN4Base3bar{{.*}} #[[#AttrA:]] +// CHECK: define {{.*}}spir_func void @_ZN7Derived3foo{{.*}} #[[#AttrB:]] +// CHECK: define {{.*}}spir_func void @_ZN7Derived3bar{{.*}} #[[#AttrC:]] +// CHECK: attributes #[[#AttrA]] = {{.*}} "indirectly-callable"="a" +// CHECK: attributes #[[#AttrB]] = {{.*}} "indirectly-callable"="b" +// CHECK: attributes #[[#AttrC]] = {{.*}} "indirectly-callable"="c" diff --git a/clang/test/Driver/linker-wrapper-sycl-win.cpp b/clang/test/Driver/linker-wrapper-sycl-win.cpp index 1854dee476641..2ef253a019f34 100644 --- a/clang/test/Driver/linker-wrapper-sycl-win.cpp +++ b/clang/test/Driver/linker-wrapper-sycl-win.cpp @@ -1,11 +1,11 @@ // REQUIRES: system-windows /// Check for list of commands for standalone clang-linker-wrapper run for sycl -// RUN: clang-linker-wrapper -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.obj,libsycl-complex.new.obj -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--triple=spir64" "--linker-path=/usr/bin/ld" 
"--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %S/Inputs/test-sycl.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS %s +// RUN: clang-linker-wrapper -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.obj,libsycl-complex.new.obj -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %S/Inputs/test-sycl.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS %s // CHK-CMDS: "{{.*}}spirv-to-ir-wrapper.exe" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global // CHK-CMDS-NEXT: "{{.*}}llvm-link.exe" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings // CHK-CMDS-NEXT: "{{.*}}llvm-link.exe" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings -// CHK-CMDS-NEXT: "{{.*}}sycl-post-link.exe" SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc +// CHK-CMDS-NEXT: "{{.*}}sycl-post-link.exe"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc // LLVM-SPIRV is not called in dry-run // CHK-CMDS-NEXT: offload-wrapper: input: [[LLVMSPIRVOUT:.*]].table, output: [[WRAPPEROUT:.*]].bc // CHK-CMDS-NEXT: "{{.*}}llc.exe" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc diff --git a/clang/test/Driver/linker-wrapper-sycl.cpp b/clang/test/Driver/linker-wrapper-sycl.cpp index 0ba8b7414d69c..19bde37eb8be6 100644 --- a/clang/test/Driver/linker-wrapper-sycl.cpp +++ b/clang/test/Driver/linker-wrapper-sycl.cpp @@ -5,7 +5,7 @@ // CHK-CMDS: "{{.*}}spirv-to-ir-wrapper" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata 
--llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global // CHK-CMDS-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings // CHK-CMDS-NEXT: "{{.*}}llvm-link" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings -// CHK-CMDS-NEXT: "{{.*}}sycl-post-link" SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc +// CHK-CMDS-NEXT: "{{.*}}sycl-post-link"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc // LLVM-SPIRV is not called in dry-run // CHK-CMDS-NEXT: offload-wrapper: input: [[LLVMSPIRVOUT:.*]].table, output: [[WRAPPEROUT:.*]].bc // CHK-CMDS-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc diff --git a/clang/test/Driver/sycl-device-lib.cpp b/clang/test/Driver/sycl-device-lib.cpp index d478c022a7e5d..df90b29872208 100644 --- a/clang/test/Driver/sycl-device-lib.cpp +++ b/clang/test/Driver/sycl-device-lib.cpp @@ -185,7 +185,7 @@ // RUN: | FileCheck %s -check-prefix=SYCL_LLVM_LINK_NO_DEVICE_LIB // SYCL_LLVM_LINK_NO_DEVICE_LIB: clang{{.*}} "-cc1" {{.*}} "-fsycl-is-device" // SYCL_LLVM_LINK_NO_DEVICE_LIB-NOT: llvm-link{{.*}} "-only-needed" -// SYCL_LLVM_LINK_NO_DEVICE_LIB: sycl-post-link{{.*}} "-symbols" "-emit-exported-symbols"{{.*}} "-o" "{{.*}}.table" "{{.*}}.bc" +// SYCL_LLVM_LINK_NO_DEVICE_LIB: sycl-post-link{{.*}} "-symbols" "-emit-exported-symbols" "-emit-imported-symbols"{{.*}} "-o" "{{.*}}.table" "{{.*}}.bc" /// ########################################################################### /// test llvm-link behavior for special user input whose filename resembles SYCL device library diff --git a/clang/test/Driver/sycl-device-sanitizer.cpp b/clang/test/Driver/sycl-device-sanitizer.cpp index cba93ba3a9f68..5ab3c4265b21b 100644 --- a/clang/test/Driver/sycl-device-sanitizer.cpp +++ b/clang/test/Driver/sycl-device-sanitizer.cpp @@ -8,6 +8,7 @@ // 
SYCL-ASAN-SAME: "-mllvm" "-asan-constructor-kind=none" // SYCL-ASAN-SAME: "-mllvm" "-asan-stack=0" // SYCL-ASAN-SAME: "-mllvm" "-asan-globals=0" +// SYCL-ASAN-SAME: "-mllvm" "-asan-mapping-scale=4" // RUN: %clangxx -fsycl -fsanitize=address -mllvm -asan-stack=1 -c %s -### 2>&1 \ // RUN: | FileCheck --check-prefix=SYCL-ASAN-FILTER %s diff --git a/clang/test/Driver/sycl-device-traits-macros-nvptx.cpp b/clang/test/Driver/sycl-device-traits-macros-nvptx.cpp index f3fb0c536ccbd..75dd5b99c7aef 100644 --- a/clang/test/Driver/sycl-device-traits-macros-nvptx.cpp +++ b/clang/test/Driver/sycl-device-traits-macros-nvptx.cpp @@ -37,6 +37,8 @@ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-DEVICE-TRIPLE // RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvidia_gpu_sm_90 -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-DEVICE-TRIPLE +// RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvidia_gpu_sm_90a -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-DEVICE-TRIPLE // Compiling for a CUDA target passing the device arch to '--offload-arch' (using the '--cuda-gpu-arch' alias). 
// @@ -68,6 +70,8 @@ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-OFFLOAD-ARCH // RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_90 -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-OFFLOAD-ARCH +// RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_90a -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-OFFLOAD-ARCH // Check device traits macros are defined if sycl is enabled: @@ -92,4 +96,4 @@ // CHECK-SM60: "-D__SYCL_TARGET_NVIDIA_GPU_SM_60__"{{.*}} "-D__SYCL_ALL_DEVICES_HAVE_[[ASPECT]]__=1" // CHECK-SM70: "-D__SYCL_TARGET_NVIDIA_GPU_SM_70__"{{.*}} "-D__SYCL_ALL_DEVICES_HAVE_[[ASPECT]]__=1" // CHECK-SM80: "-D__SYCL_TARGET_NVIDIA_GPU_SM_80__"{{.*}} "-D__SYCL_ALL_DEVICES_HAVE_[[ASPECT]]__=1" -// CHECK-SM90: "-D__SYCL_TARGET_NVIDIA_GPU_SM_80__"{{.*}} "-D__SYCL_ALL_DEVICES_HAVE_[[ASPECT]]__=1" +// CHECK-SM90: "-D__SYCL_TARGET_NVIDIA_GPU_SM_90__"{{.*}} "-D__SYCL_ALL_DEVICES_HAVE_[[ASPECT]]__=1" diff --git a/clang/test/Driver/sycl-fno-libspirv-warn.cpp b/clang/test/Driver/sycl-fno-libspirv-warn.cpp index 842d97153e549..902576f596f3b 100644 --- a/clang/test/Driver/sycl-fno-libspirv-warn.cpp +++ b/clang/test/Driver/sycl-fno-libspirv-warn.cpp @@ -1,7 +1,7 @@ /// Test that appropriate warnings are output when -fno-sycl-libspirv is used. 
// RUN: not %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -fno-sycl-libspirv %s -### 2>&1 | FileCheck %s -// CHECK: warning: '-fno-sycl-libspirv' should not be used with target 'nvptx64-nvidia-cuda'; libspirv is required for correct behavior [-Wno-libspirv-hip-cuda] -// CHECK: warning: '-fno-sycl-libspirv' should not be used with target 'amdgcn-amd-amdhsa'; libspirv is required for correct behavior [-Wno-libspirv-hip-cuda] +// CHECK-DAG: warning: '-fno-sycl-libspirv' should not be used with target 'nvptx64-nvidia-cuda'; libspirv is required for correct behavior [-Wno-libspirv-hip-cuda] +// CHECK-DAG: warning: '-fno-sycl-libspirv' should not be used with target 'amdgcn-amd-amdhsa'; libspirv is required for correct behavior [-Wno-libspirv-hip-cuda] // RUN: %clangxx -fsycl -fsycl-targets=spir64-unknown-unknown -fno-sycl-libspirv %s -### 2>&1 | FileCheck --check-prefix=CHECK-SPIR64 %s // CHECK-SPIR64: ignoring '-fno-sycl-libspirv' option as it is not currently supported for target 'spir64-unknown-unknown' [-Woption-ignored] diff --git a/clang/test/Driver/sycl-intelfpga-aoco-win.cpp b/clang/test/Driver/sycl-intelfpga-aoco-win.cpp index bebbda92ac0f3..5cba6ff20a2ca 100755 --- a/clang/test/Driver/sycl-intelfpga-aoco-win.cpp +++ b/clang/test/Driver/sycl-intelfpga-aoco-win.cpp @@ -50,7 +50,7 @@ // CHK-FPGA-AOCO: spirv-to-ir-wrapper{{.*}} "[[LIBLIST]]" "-o" "[[LIBLIST2:.+\.txt]]" // CHK-FPGA-AOCO: llvm-link{{.*}} "-o" "[[LINKEDBC:.+\.bc]]" // CHK-FPGA-AOCO: llvm-link{{.*}} "--only-needed" "[[LINKEDBC]]" "@[[LIBLIST2]]" "-o" "[[LINKEDBC2:.+\.bc]]" -// CHK-FPGA-AOCO: sycl-post-link{{.*}} "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC2]]" +// CHK-FPGA-AOCO: sycl-post-link{{.*}} "-device-globals"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC2]]" // CHK-FPGA-AOCO: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[SPLTABLE]]" // CHK-FPGA-AOCO: llvm-spirv{{.*}} "-o" 
"[[TARGSPV:.+\.txt]]" {{.*}} "[[TABLEOUT]]" // CHK-FPGA-AOCO: clang-offload-bundler{{.*}} "-type=aoo" "-targets=sycl-fpga_aoco-intel-unknown" "-input=[[INPUTLIB]]" "-output=[[AOCOLIST:.+\.txt]]" "-unbundle" diff --git a/clang/test/Driver/sycl-intelfpga-aoco.cpp b/clang/test/Driver/sycl-intelfpga-aoco.cpp index 8220580455b87..20839e0c08370 100755 --- a/clang/test/Driver/sycl-intelfpga-aoco.cpp +++ b/clang/test/Driver/sycl-intelfpga-aoco.cpp @@ -54,7 +54,7 @@ // CHK-FPGA-AOCO: spirv-to-ir-wrapper{{.*}} "[[LIBLIST]]" "-o" "[[LIBLIST2:.+\.txt]]" // CHK-FPGA-AOCO: llvm-link{{.*}} "-o" "[[LINKEDBC:.+\.bc]]" // CHK-FPGA-AOCO: llvm-link{{.*}} "--only-needed" "[[LINKEDBC]]" "@[[LIBLIST2]]" "-o" "[[LINKEDBC2:.+\.bc]]" -// CHK-FPGA-AOCO: sycl-post-link{{.*}} "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC2]]" +// CHK-FPGA-AOCO: sycl-post-link{{.*}} "-device-globals"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC2]]" // CHK-FPGA-AOCO: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[SPLTABLE]]" // CHK-FPGA-AOCO: llvm-spirv{{.*}} "-o" "[[TARGSPV:.+\.txt]]" {{.*}} "[[TABLEOUT]]" // CHK-FPGA-AOCO: clang-offload-bundler{{.*}} "-type=aoo" "-targets=sycl-fpga_aoco-intel-unknown" "-input=[[INPUTLIB]]" "-output=[[AOCOLIST:.+\.txt]]" "-unbundle" @@ -105,7 +105,7 @@ // CHK-FPGA-AOCO-EMU: clang-offload-bundler{{.*}} "-type=aoo" "-targets=sycl-spir64_fpga-unknown-unknown" "-input=[[INPUTLIB:.+\.a]]" "-output=[[OUTLIB:.+\.txt]]" "-unbundle" // CHK-FPGA-AOCO-EMU: llvm-foreach{{.*}} "--out-ext=txt" "--in-file-list=[[OUTLIB]]" "--in-replace=[[OUTLIB]]" "--out-file-list=[[DEVICELIST:.+\.txt]]" "--out-replace=[[DEVICELIST]]" "--" {{.*}}spirv-to-ir-wrapper{{.*}} "[[OUTLIB]]" "-o" "[[DEVICELIST]]" // CHK-FPGA-AOCO-EMU: llvm-link{{.*}} "@[[DEVICELIST]]" "-o" "[[LINKEDBC:.+\.bc]]" -// CHK-FPGA-AOCO-EMU: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC]]" 
+// CHK-FPGA-AOCO-EMU: sycl-post-link{{.*}} "-O2" "-device-globals"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC]]" // CHK-FPGA-AOCO-EMU: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[SPLTABLE]]" // CHK-FPGA-AOCO-EMU: llvm-spirv{{.*}} "-o" "[[TARGSPV:.+\.txt]]" {{.*}} "[[TABLEOUT]]" // CHK-FPGA-AOCO-EMU: opencl-aot{{.*}} "-device=fpga_fast_emu" "-spv=[[TARGSPV]]" "-ir=[[AOCXOUT:.+\.aocx]]" diff --git a/clang/test/Driver/sycl-linker-wrapper-image.cpp b/clang/test/Driver/sycl-linker-wrapper-image.cpp index 37e976a6e39c5..6c9c0329e438b 100644 --- a/clang/test/Driver/sycl-linker-wrapper-image.cpp +++ b/clang/test/Driver/sycl-linker-wrapper-image.cpp @@ -4,7 +4,7 @@ // RUN: %clang -cc1 -fsycl-is-device -disable-llvm-passes -triple=spir64-unknown-unknown %s -emit-llvm-bc -o %t.device.bc // RUN: clang-offload-packager -o %t.fat --image=file=%t.device.bc,kind=sycl,triple=spir64-unknown-unknown // RUN: %clang -cc1 %s -triple=x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.fat -// RUN: clang-linker-wrapper --print-wrapped-module --host-triple=x86_64-unknown-linux-gnu --triple=spir64 \ +// RUN: clang-linker-wrapper --print-wrapped-module --host-triple=x86_64-unknown-linux-gnu \ // RUN: -sycl-device-library-location=%S/Inputs -sycl-post-link-options="-split=auto -symbols" \ // RUN: %t.o -o %t.out 2>&1 --linker-path="/usr/bin/ld" | FileCheck %s @@ -41,13 +41,14 @@ int main() { // CHECK-DAG: @prop_val = internal unnamed_addr constant [8 x i8] zeroinitializer // CHECK-DAG: @__sycl_offload_prop_sets_arr.2 = internal constant [1 x %_pi_device_binary_property_struct] [%_pi_device_binary_property_struct { ptr @prop.1, ptr @prop_val, i32 2, i64 8 }] // CHECK-DAG: @SYCL_PropSetName.3 = internal unnamed_addr constant [25 x i8] c"SYCL/device requirements\00" -// CHECK-DAG: @__sycl_offload_prop_sets_arr.4 = internal constant [2 x %_pi_device_binary_property_set_struct] [%_pi_device_binary_property_set_struct { ptr 
@SYCL_PropSetName, ptr @__sycl_offload_prop_sets_arr, ptr getelementptr inbounds ([1 x %_pi_device_binary_property_struct], ptr @__sycl_offload_prop_sets_arr, i64 1, i64 0) }, %_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName.3, ptr @__sycl_offload_prop_sets_arr.2, ptr getelementptr inbounds ([1 x %_pi_device_binary_property_struct], ptr @__sycl_offload_prop_sets_arr.2, i64 1, i64 0) }] +// CHECK-DAG: @SYCL_PropSetName.4 = internal unnamed_addr constant [22 x i8] c"SYCL/kernel param opt\00" +// CHECK-DAG: @__sycl_offload_prop_sets_arr.5 = internal constant [3 x %_pi_device_binary_property_set_struct] [%_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName, ptr @__sycl_offload_prop_sets_arr, ptr getelementptr inbounds ([1 x %_pi_device_binary_property_struct], ptr @__sycl_offload_prop_sets_arr, i64 1, i64 0) }, %_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName.3, ptr @__sycl_offload_prop_sets_arr.2, ptr getelementptr inbounds ([1 x %_pi_device_binary_property_struct], ptr @__sycl_offload_prop_sets_arr.2, i64 1, i64 0) }, %_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName.4, ptr null, ptr null }] // CHECK-DAG: @.sycl_offloading.0.data = internal unnamed_addr constant [740 x i8] // CHECK-DAG: @__sycl_offload_entry_name = internal unnamed_addr constant [25 x i8] c"_ZTSZ4mainE11fake_kernel\00" // CHECK-DAG: @__sycl_offload_entries_arr = internal constant [1 x %struct.__tgt_offload_entry] [%struct.__tgt_offload_entry { ptr null, ptr @__sycl_offload_entry_name, i64 0, i32 0, i32 0 }] // CHECK-DAG: @.sycl_offloading.0.info = internal local_unnamed_addr constant [2 x i64] [i64 ptrtoint (ptr @.sycl_offloading.0.data to i64), i64 740], section ".tgtimg", align 16 // CHECK-DAG: @llvm.used = appending global [1 x ptr] [ptr @.sycl_offloading.0.info], section "llvm.metadata" -// CHECK-DAG: @.sycl_offloading.device_images = internal unnamed_addr constant [1 x %__sycl.tgt_device_image] [%__sycl.tgt_device_image { i16 2, i8 4, i8 0, ptr 
@.sycl_offloading.target.0, ptr @.sycl_offloading.opts.compile.0, ptr @.sycl_offloading.opts.link.0, ptr null, ptr null, ptr @.sycl_offloading.0.data, ptr getelementptr inbounds ([740 x i8], ptr @.sycl_offloading.0.data, i64 1, i64 0), ptr @__sycl_offload_entries_arr, ptr getelementptr inbounds ([1 x %struct.__tgt_offload_entry], ptr @__sycl_offload_entries_arr, i64 1, i64 0), ptr @__sycl_offload_prop_sets_arr.4, ptr getelementptr inbounds ([2 x %_pi_device_binary_property_set_struct], ptr @__sycl_offload_prop_sets_arr.4, i64 1, i64 0) }] +// CHECK-DAG: @.sycl_offloading.device_images = internal unnamed_addr constant [1 x %__sycl.tgt_device_image] [%__sycl.tgt_device_image { i16 2, i8 4, i8 0, ptr @.sycl_offloading.target.0, ptr @.sycl_offloading.opts.compile.0, ptr @.sycl_offloading.opts.link.0, ptr null, ptr null, ptr @.sycl_offloading.0.data, ptr getelementptr inbounds ([740 x i8], ptr @.sycl_offloading.0.data, i64 1, i64 0), ptr @__sycl_offload_entries_arr, ptr getelementptr inbounds ([1 x %struct.__tgt_offload_entry], ptr @__sycl_offload_entries_arr, i64 1, i64 0), ptr @__sycl_offload_prop_sets_arr.5, ptr getelementptr inbounds ([3 x %_pi_device_binary_property_set_struct], ptr @__sycl_offload_prop_sets_arr.5, i64 1, i64 0) }] // CHECK-DAG: @.sycl_offloading.descriptor = internal constant %__sycl.tgt_bin_desc { i16 1, i16 1, ptr @.sycl_offloading.device_images, ptr null, ptr null } // CHECK-DAG: @llvm.global_ctors = {{.*}} { i32 1, ptr @sycl.descriptor_reg, ptr null }] // CHECK-DAG: @llvm.global_dtors = {{.*}} { i32 1, ptr @sycl.descriptor_unreg, ptr null }] diff --git a/clang/test/Driver/sycl-lto.cpp b/clang/test/Driver/sycl-lto.cpp new file mode 100644 index 0000000000000..b2b68fa5a3583 --- /dev/null +++ b/clang/test/Driver/sycl-lto.cpp @@ -0,0 +1,13 @@ +// Verify the usage of -foffload-lto with SYCL. + +// Verify we error when using the old offload driver. 
+// RUN: not %clangxx -fsycl -foffload-lto=thin %s -### 2>&1 | FileCheck -check-prefix=CHECK_ERROR %s +// CHECK_ERROR: unsupported option '-foffload-lto=thin' for target 'spir64-unknown-unknown' + +// Verify we error when using the new offload driver but with device code split set to off. +// RUN: not %clangxx -fsycl --offload-new-driver -foffload-lto=thin -fsycl-device-code-split=off %s -### 2>&1 | FileCheck -check-prefix=CHECK_SPLIT_ERROR %s +// CHECK_SPLIT_ERROR: '-fsycl-device-code-split=off' is not supported when '-foffload-lto=thin' is set with '-fsycl' + +// Verify there's no error and we see the expected cc1 flags with the new offload driver. +// RUN: %clangxx -fsycl --offload-new-driver -foffload-lto=thin %s -### 2>&1 | FileCheck -check-prefix=CHECK_SUPPORTED %s +// CHECK_SUPPORTED: clang{{.*}} "-cc1" "-triple" "spir64-unknown-unknown" {{.*}} "-flto=thin" "-flto-unit" diff --git a/clang/test/Driver/sycl-offload-aot.cpp b/clang/test/Driver/sycl-offload-aot.cpp index 4e9c615423ecb..295c4dd7eb137 100644 --- a/clang/test/Driver/sycl-offload-aot.cpp +++ b/clang/test/Driver/sycl-offload-aot.cpp @@ -298,3 +298,12 @@ // RUN: %clang -fsycl -### -fsycl-targets=spir64_fpga -Xshardware -Xsycl-target-backend "-DBLAH" %s 2>&1 \ // RUN: | FileCheck -check-prefix=DUP-OPT %s // DUP-OPT-NOT: aoc{{.*}} "-DBLAH" {{.*}} "-DBLAH" + +/// Output files from ocloc should have an extension. 
+// RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl \ +// RUN: -fsycl-targets=intel_gpu_skl %s -### 2>&1 \ +// RUN: | FileCheck -check-prefix=OCLOC_OUTPUT %s +// RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl -save-temps \ +// RUN: -fsycl-targets=intel_gpu_skl %s -### 2>&1 \ +// RUN: | FileCheck -check-prefix=OCLOC_OUTPUT %s +// OCLOC_OUTPUT: ocloc{{.*}} "-output" "{{.*}}.out" diff --git a/clang/test/Driver/sycl-offload-intelfpga-emu.cpp b/clang/test/Driver/sycl-offload-intelfpga-emu.cpp index 4a84d6437ae53..2a01cf719801e 100644 --- a/clang/test/Driver/sycl-offload-intelfpga-emu.cpp +++ b/clang/test/Driver/sycl-offload-intelfpga-emu.cpp @@ -16,7 +16,7 @@ // CHK-FPGA-LINK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_fpga-unknown-unknown" "-input=[[INPUT:.+\.o]]" "-output=[[OUTPUT1:.+\.o]]" "-unbundle" // CHK-FPGA-LINK: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]" // CHK-FPGA-LINK: llvm-link{{.*}} "[[IROUTPUT1]]" "-o" "[[OUTPUT2_1:.+\.bc]]" -// CHK-FPGA-LINK: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" {{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]" +// CHK-FPGA-LINK: sycl-post-link{{.*}} "-O2"{{.*}} "-spec-const=emulation" {{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]" // CHK-FPGA-LINK: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT2]]" // CHK-FPGA-LINK: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]" // CHK-FPGA-EARLY: opencl-aot{{.*}} "-device=fpga_fast_emu" "-spv=[[OUTPUT3]]" "-ir=[[OUTPUT4:.+\.aocr]]" "--bo=-g" @@ -41,7 +41,7 @@ // CHK-FPGA-LINK-WIN: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_fpga-unknown-unknown{{.*}}" "-input=[[INPUT:.+\.obj]]" "-output=[[OUTPUT1:.+\.obj]]" "-unbundle" // CHK-FPGA-LINK-WIN: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]" // CHK-FPGA-LINK-WIN: llvm-link{{.*}} "[[IROUTPUT1]]" "-o" "[[OUTPUT2_1:.+\.bc]]" -// CHK-FPGA-LINK-WIN: sycl-post-link{{.*}} "-O2" 
"-spec-const=emulation" {{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]" +// CHK-FPGA-LINK-WIN: sycl-post-link{{.*}} "-O2"{{.*}} "-spec-const=emulation" {{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]" // CHK-FPGA-LINK-WIN: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT2]]" // CHK-FPGA-LINK-WIN: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]" // CHK-FPGA-LINK-WIN: opencl-aot{{.*}} "-device=fpga_fast_emu" "-spv=[[OUTPUT3]]" "-ir=[[OUTPUT4:.+\.aocr]]" "--bo=-g" @@ -115,7 +115,7 @@ // CHK-FPGA: clang-offload-bundler{{.*}} "-type=o" "-targets=host-x86_64-unknown-linux-gnu,sycl-spir64_fpga-unknown-unknown" {{.*}} "-output=[[FINALLINKx:.+\.o]]" "-output=[[OUTPUT1:.+\.o]]" "-unbundle" // CHK-FPGA: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]" // CHK-FPGA: llvm-link{{.*}} "[[IROUTPUT1]]"{{.*}} "-o" "[[OUTPUT2_BC:.+\.bc]]" -// CHK-FPGA: sycl-post-link{{.*}} "-O2" "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT3_TABLE:.+\.table]]" "[[OUTPUT2_BC]]" +// CHK-FPGA: sycl-post-link{{.*}} "-O2"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT3_TABLE:.+\.table]]" "[[OUTPUT2_BC]]" // CHK-FPGA: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT3_TABLE]]" // CHK-FPGA: llvm-spirv{{.*}} "-o" "[[OUTPUT5:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]" // CHK-FPGA: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-fpga_dep" {{.*}} "-output=[[DEPFILE:.+\.d]]" "-unbundle" @@ -178,7 +178,7 @@ // CHK-FPGA-AOCX-SRC: clang-offload-wrapper{{.*}} "-o=[[WRAPOUT:.+\.bc]]" {{.*}} "-target=spir64_fpga" "-kind=sycl" "--sym-prop-bc-files=[[SYM_AND_PROP]]" "-batch" "[[TABLEOUT]]" // CHK-FPGA-AOCX-SRC: llc{{.*}} "-filetype=obj" "-o" "[[LLCOUT:.+\.(o|obj)]]" "[[WRAPOUT]]" // CHK-FPGA-AOCX-SRC: llvm-link{{.*}} "[[DEVICEBC]]" "-o" "[[LLVMLINKOUT:.+\.bc]]" "--suppress-warnings" -// CHK-FPGA-AOCX-SRC: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" 
"[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]] +// CHK-FPGA-AOCX-SRC: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]] // CHK-FPGA-AOCX-SRC: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[POSTLINKOUT]]" // CHK-FPGA-AOCX-SRC: llvm-spirv{{.*}} "-o" "[[LLVMSPVOUT:.+\.txt]]" {{.*}} "[[TABLEOUT]]" // CHK-FPGA-AOCX-SRC: opencl-aot{{.*}} "-device=fpga_fast_emu" "-spv=[[LLVMSPVOUT]]" "-ir=[[OUTPUT4:.+\.aocx]]" "--bo=-g" @@ -204,7 +204,7 @@ // CHK-FPGA-AOCX-OBJ: clang-offload-bundler{{.*}} "-type=o" {{.*}} "-output=[[HOSTOBJx:.+\.(o|obj)]]" "-output=[[DEVICEOBJ:.+\.(o|obj)]]" "-unbundle" // CHK-FPGA-AOCX-OBJ: spirv-to-ir-wrapper{{.*}} "[[DEVICEOBJ]]" "-o" "[[IROUTPUT:.+\.bc]]" // CHK-FPGA-AOCX-OBJ: llvm-link{{.*}} "[[IROUTPUT]]" "-o" "[[LLVMLINKOUT:.+\.bc]]" "--suppress-warnings" -// CHK-FPGA-AOCX-OBJ: sycl-post-link{{.*}} "-O2" "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]] +// CHK-FPGA-AOCX-OBJ: sycl-post-link{{.*}} "-O2"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]] // CHK-FPGA-AOCX-OBJ: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[POSTLINKOUT]]" // CHK-FPGA-AOCX-OBJ: llvm-spirv{{.*}} "-o" "[[LLVMSPVOUT:.+\.txt]]" {{.*}} "[[TABLEOUT]]" // CHK-FPGA-AOCX-OBJ: opencl-aot{{.*}} "-device=fpga_fast_emu" "-spv=[[LLVMSPVOUT]]" "-ir=[[OUTPUT4:.+\.aocx]]" "--bo=-g" diff --git a/clang/test/Driver/sycl-offload-intelfpga-link.cpp b/clang/test/Driver/sycl-offload-intelfpga-link.cpp index 5d3292be10917..d816f01b4a6e7 100644 --- a/clang/test/Driver/sycl-offload-intelfpga-link.cpp +++ b/clang/test/Driver/sycl-offload-intelfpga-link.cpp @@ -13,7 +13,7 @@ // CHK-FPGA-LINK-NOT: clang-offload-bundler{{.*}} // CHK-FPGA-LINK: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]" // CHK-FPGA-LINK: llvm-link{{.*}} "[[IROUTPUT1]]" "-o" "[[OUTPUT2_1:.+\.bc]]" -// CHK-FPGA-LINK: sycl-post-link{{.*}} "-O2" 
"-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]" +// CHK-FPGA-LINK: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]" // CHK-FPGA-LINK: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT2]]" // CHK-FPGA-LINK: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]" // CHK-FPGA-EARLY: aoc{{.*}} "-o" "[[OUTPUT4:.+\.aocr]]" "[[OUTPUT3]]" "-sycl" "-rtl" @@ -46,7 +46,7 @@ // CHK-FPGA-LINK-WIN-NOT: clang-offload-bundler{{.*}} // CHK-FPGA-LINK-WIN: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]" // CHK-FPGA-LINK-WIN: llvm-link{{.*}} "[[IROUTPUT1]]" "-o" "[[OUTPUT2_1:.+\.bc]]" -// CHK-FPGA-LINK-WIN: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]" +// CHK-FPGA-LINK-WIN: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]" // CHK-FPGA-LINK-WIN: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT2]]" // CHK-FPGA-LINK-WIN: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]" // CHK-FPGA-LINK-WIN: aoc{{.*}} "-o" "[[OUTPUT5:.+\.aocr]]" "[[OUTPUT3]]" "-sycl" "-rtl" @@ -175,7 +175,7 @@ // CHK-FPGA: clang-offload-bundler{{.*}} "-type=o" "-targets=host-x86_64-unknown-linux-gnu,sycl-spir64_fpga-unknown-unknown" {{.*}} "-output=[[FINALLINK2x:.+\.o]]" "-output=[[OUTPUT1:.+\.o]]" "-unbundle" // CHK-FPGA: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]" // CHK-FPGA: llvm-link{{.*}} "[[IROUTPUT1]]" "-o" "[[OUTPUT2_BC:.+\.bc]]" -// CHK-FPGA: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[OUTPUT3_TABLE:.+\.table]]" "[[OUTPUT2_BC]]" +// CHK-FPGA: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT3_TABLE:.+\.table]]" "[[OUTPUT2_BC]]" // 
CHK-FPGA: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT3_TABLE]]" // CHK-FPGA: llvm-spirv{{.*}} "-o" "[[OUTPUT5:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]" // CHK-FPGA: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-fpga_dep" {{.*}} "-output=[[DEPFILE:.+\.d]]" "-unbundle" @@ -237,7 +237,7 @@ // CHK-FPGA-AOCX-SRC: clang-offload-wrapper{{.*}} "-o=[[WRAPOUT:.+\.bc]]" {{.*}} "-target=spir64_fpga" "-kind=sycl" "--sym-prop-bc-files=[[SYM_AND_PROP]]" "-batch" "[[TABLEOUT]]" // CHK-FPGA-AOCX-SRC: llc{{.*}} "-filetype=obj" "-o" "[[LLCOUT:.+\.(o|obj)]]" "[[WRAPOUT]]" // CHK-FPGA-AOCX-SRC: llvm-link{{.*}} "[[DEVICEBC]]" "-o" "[[LLVMLINKOUT:.+\.bc]]" "--suppress-warnings" -// CHK-FPGA-AOCX-SRC: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]] +// CHK-FPGA-AOCX-SRC: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]] // CHK-FPGA-AOCX-SRC: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[POSTLINKOUT]]" // CHK-FPGA-AOCX-SRC: llvm-spirv{{.*}} "-o" "[[LLVMSPVOUT:.+\.txt]]" {{.*}} "[[TABLEOUT]]" // CHK-FPGA-AOCX-SRC: aoc{{.*}} "-o" "[[AOCOUT:.+\.aocx]]" "[[LLVMSPVOUT]]" "-sycl" @@ -263,7 +263,7 @@ // CHK-FPGA-AOCX-OBJ: clang-offload-bundler{{.*}} "-type=o" {{.*}} "-output=[[HOSTOBJx:.+\.(o|obj)]]" "-output=[[DEVICEOBJ:.+\.(o|obj)]]" "-unbundle" // CHK-FPGA-AOCX-OBJ: spirv-to-ir-wrapper{{.*}} "[[DEVICEOBJ]]" "-o" "[[IROUTPUT:.+\.bc]]" // CHK-FPGA-AOCX-OBJ: llvm-link{{.*}} "[[IROUTPUT]]" "-o" "[[LLVMLINKOUT:.+\.bc]]" "--suppress-warnings" -// CHK-FPGA-AOCX-OBJ: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]] +// CHK-FPGA-AOCX-OBJ: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]] // CHK-FPGA-AOCX-OBJ: file-table-tform{{.*}} "-o" 
"[[TABLEOUT:.+\.txt]]" "[[POSTLINKOUT]]" // CHK-FPGA-AOCX-OBJ: llvm-spirv{{.*}} "-o" "[[LLVMSPVOUT:.+\.txt]]" {{.*}} "[[TABLEOUT]]" // CHK-FPGA-AOCX-OBJ: aoc{{.*}} "-o" "[[AOCOUT:.+\.aocx]]" "[[LLVMSPVOUT]]" "-sycl" @@ -283,7 +283,7 @@ // CHK-FPGA-AOCX-OBJ2: clang-offload-bundler{{.*}} "-type=o" {{.*}} "-output=[[HOSTOBJx:.+\.(o|obj)]]" "-output=[[DEVICEOBJ:.+\.(o|obj)]]" "-output=[[DEVICEOBJ2:.+\.(o|obj)]]" "-unbundle" // CHK-FPGA-AOCX-OBJ2: spirv-to-ir-wrapper{{.*}} "[[DEVICEOBJ]]" "-o" "[[IROUTPUT:.+\.bc]]" // CHK-FPGA-AOCX-OBJ2: llvm-link{{.*}} "[[IROUTPUT]]" "-o" "[[LLVMLINKOUT:.+\.bc]]" "--suppress-warnings" -// CHK-FPGA-AOCX-OBJ2: sycl-post-link{{.*}} "-O2" "-spec-const=native" "-device-globals"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]" +// CHK-FPGA-AOCX-OBJ2: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=native"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]" // CHK-FPGA-AOCX-OBJ2: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[POSTLINKOUT]]" // CHK-FPGA-AOCX-OBJ2: llvm-spirv{{.*}} "-o" "[[LLVMSPVOUT:.+\.txt]]" {{.*}} "[[TABLEOUT]]" // CHK-FPGA-AOCX-OBJ2: file-table-tform{{.*}} "-replace=Code,Code" "-o" "[[TFORM_OUT:.+\.table]]" "[[POSTLINKOUT]]" "[[LLVMSPVOUT]]" @@ -291,7 +291,7 @@ // CHK-FPGA-AOCX-OBJ2: llc{{.*}} "-filetype=obj" "-o" "[[LLCOUT:.+\.(o|obj)]]" "[[WRAPOUT]]" // CHK-FPGA-AOCX-OBJ2: spirv-to-ir-wrapper{{.*}} "[[DEVICEOBJ2]]" "-o" "[[IROUTPUT2:.+\.bc]]" // CHK-FPGA-AOCX-OBJ2: llvm-link{{.*}} "[[IROUTPUT2]]" "-o" "[[LLVMLINKOUT2:.+\.bc]]" "--suppress-warnings" -// CHK-FPGA-AOCX-OBJ2: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[POSTLINKOUT2:.+\.table]]" "[[LLVMLINKOUT2]]" +// CHK-FPGA-AOCX-OBJ2: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT2:.+\.table]]" "[[LLVMLINKOUT2]]" // CHK-FPGA-AOCX-OBJ2: file-table-tform{{.*}} "-o" "[[TABLEOUT2:.+\.txt]]" "[[POSTLINKOUT2]]" // CHK-FPGA-AOCX-OBJ2: llvm-spirv{{.*}} "-o" 
"[[LLVMSPVOUT2:.+\.txt]]" {{.*}} "[[TABLEOUT2]]" // CHK-FPGA-AOCX-OBJ2: aoc{{.*}} "-o" "[[AOCOUT:.+\.aocx]]" "[[LLVMSPVOUT2]]" "-sycl" diff --git a/clang/test/Driver/sycl-offload-new-driver.c b/clang/test/Driver/sycl-offload-new-driver.c index a4b91621bfb94..0a4a5067457d6 100644 --- a/clang/test/Driver/sycl-offload-new-driver.c +++ b/clang/test/Driver/sycl-offload-new-driver.c @@ -25,17 +25,16 @@ /// Check the toolflow for SYCL compilation using new offload model // RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl -fsycl-targets=spir64 --offload-new-driver %s 2>&1 | FileCheck -check-prefix=CHK-FLOW %s // CHK-FLOW: clang{{.*}} "-cc1" "-triple" "spir64-unknown-unknown" "-aux-triple" "x86_64-unknown-linux-gnu" "-fsycl-is-device" {{.*}} "-fsycl-int-header=[[HEADER:.*]].h" "-fsycl-int-footer=[[FOOTER:.*]].h" {{.*}} "--offload-new-driver" {{.*}} "-o" "[[CC1DEVOUT:.*]]" "-x" "c++" "[[INPUT:.*]]" -// CHK-FLOW-NEXT: clang-offload-packager{{.*}} "-o" "[[PACKOUT:.*]]" "--image=file=[[CC1DEVOUT]],triple=spir64-unknown-unknown,arch=,kind=sycl" +// CHK-FLOW-NEXT: clang-offload-packager{{.*}} "-o" "[[PACKOUT:.*]]" "--image=file=[[CC1DEVOUT]],triple=spir64-unknown-unknown,arch=,kind=sycl{{.*}}" // CHK-FLOW-NEXT: append-file{{.*}} "[[INPUT]]" "--append=[[FOOTER]].h" "--orig-filename=[[INPUT]]" "--output=[[APPENDOUT:.*]]" "--use-include" // CHK-FLOW-NEXT: clang{{.*}} "-cc1" "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-include" "[[HEADER]].h" "-dependency-filter" "[[HEADER]].h" {{.*}} "-fsycl-is-host"{{.*}} "-full-main-file-name" "[[INPUT]]" {{.*}} "--offload-new-driver" {{.*}} "-fembed-offload-object=[[PACKOUT]]" {{.*}} "-o" "[[CC1FINALOUT:.*]]" "-x" "c++" "[[APPENDOUT]]" -// CHK-FLOW-NEXT: clang-linker-wrapper{{.*}} "--host-triple=x86_64-unknown-linux-gnu" "--triple=spir64"{{.*}} "--linker-path={{.*}}/ld" {{.*}} "[[CC1FINALOUT]]" +// CHK-FLOW-NEXT: clang-linker-wrapper{{.*}} "--host-triple=x86_64-unknown-linux-gnu"{{.*}} "--linker-path={{.*}}/ld" {{.*}} 
"[[CC1FINALOUT]]" /// Verify options passed to clang-linker-wrapper // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \ // RUN: --sysroot=%S/Inputs/SYCL -### %s 2>&1 \ // RUN: | FileCheck -check-prefix WRAPPER_OPTIONS %s -// WRAPPER_OPTIONS: clang-linker-wrapper{{.*}} "--triple=spir64" -// WRAPPER_OPTIONS-SAME: "-sycl-device-libraries=libsycl-crt.new.o,libsycl-complex.new.o,libsycl-complex-fp64.new.o,libsycl-cmath.new.o,libsycl-cmath-fp64.new.o,libsycl-imf.new.o,libsycl-imf-fp64.new.o,libsycl-imf-bf16.new.o,libsycl-fallback-cassert.new.o,libsycl-fallback-cstring.new.o,libsycl-fallback-complex.new.o,libsycl-fallback-complex-fp64.new.o,libsycl-fallback-cmath.new.o,libsycl-fallback-cmath-fp64.new.o,libsycl-fallback-imf.new.o,libsycl-fallback-imf-fp64.new.o,libsycl-fallback-imf-bf16.new.o,libsycl-itt-user-wrappers.new.o,libsycl-itt-compiler-wrappers.new.o,libsycl-itt-stubs.new.o" +// WRAPPER_OPTIONS: clang-linker-wrapper{{.*}} "-sycl-device-libraries=libsycl-crt.new.o,libsycl-complex.new.o,libsycl-complex-fp64.new.o,libsycl-cmath.new.o,libsycl-cmath-fp64.new.o,libsycl-imf.new.o,libsycl-imf-fp64.new.o,libsycl-imf-bf16.new.o,libsycl-fallback-cassert.new.o,libsycl-fallback-cstring.new.o,libsycl-fallback-complex.new.o,libsycl-fallback-complex-fp64.new.o,libsycl-fallback-cmath.new.o,libsycl-fallback-cmath-fp64.new.o,libsycl-fallback-imf.new.o,libsycl-fallback-imf-fp64.new.o,libsycl-fallback-imf-bf16.new.o,libsycl-itt-user-wrappers.new.o,libsycl-itt-compiler-wrappers.new.o,libsycl-itt-stubs.new.o" // WRAPPER_OPTIONS-SAME: "-sycl-device-library-location={{.*}}/lib" /// Verify phases used to generate SPIR-V instead of LLVM-IR @@ -56,14 +55,12 @@ // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \ // RUN: -Xspirv-translator -translator-opt -### %s 2>&1 \ // RUN: | FileCheck -check-prefix WRAPPER_OPTIONS_TRANSLATOR %s -// WRAPPER_OPTIONS_TRANSLATOR: clang-linker-wrapper{{.*}} "--triple=spir64" -// 
WRAPPER_OPTIONS_TRANSLATOR-SAME: "--llvm-spirv-options={{.*}}-translator-opt{{.*}}" +// WRAPPER_OPTIONS_TRANSLATOR: clang-linker-wrapper{{.*}} "--llvm-spirv-options={{.*}}-translator-opt{{.*}}" // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \ // RUN: -Xdevice-post-link -post-link-opt -### %s 2>&1 \ // RUN: | FileCheck -check-prefix WRAPPER_OPTIONS_POSTLINK %s -// WRAPPER_OPTIONS_POSTLINK: clang-linker-wrapper{{.*}} "--triple=spir64" -// WRAPPER_OPTIONS_POSTLINK-SAME: "--sycl-post-link-options=-post-link-opt -O2 -spec-const=native -device-globals -split=auto -emit-only-kernels-as-entry-points -emit-param-info -symbols -emit-exported-symbols -split-esimd -lower-esimd" +// WRAPPER_OPTIONS_POSTLINK: clang-linker-wrapper{{.*}} "--sycl-post-link-options=-O2 -device-globals -post-link-opt" // -fsycl-device-only behavior // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \ @@ -94,6 +91,17 @@ // RUN: | FileCheck -check-prefix=CHK_ARCH \ // RUN: -DTRIPLE=spir64_gen-unknown-unknown -DARCH=pvc %s // RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \ +// RUN: -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen \ +// RUN: "-device pvc" --offload-new-driver %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK_ARCH \ +// RUN: -DTRIPLE=spir64_gen-unknown-unknown -DARCH=pvc %s +// RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \ +// RUN: -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen \ +// RUN: "-device pvc" -Xsycl-target-backend=spir64_gen "-device dg1" \ +// RUN: --offload-new-driver %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK_ARCH \ +// RUN: -DTRIPLE=spir64_gen-unknown-unknown -DARCH=dg1 %s +// RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \ // RUN: -fno-sycl-libspirv -fsycl-targets=amd_gpu_gfx900 \ // RUN: -nogpulib --offload-new-driver %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK_ARCH \ @@ -105,17 +113,63 @@ // RUN: -DTRIPLE=nvptx64-nvidia-cuda -DARCH=sm_50 
%s // CHK_ARCH: clang{{.*}} "-triple" "[[TRIPLE]]" // CHK_ARCH-SAME: "-fsycl-is-device" {{.*}} "--offload-new-driver"{{.*}} "-o" "[[CC1DEVOUT:.+\.bc]]" -// CHK_ARCH-NEXT: clang-offload-packager{{.*}} "--image=file=[[CC1DEVOUT]],triple=[[TRIPLE]],arch=[[ARCH]],kind=sycl" +// CHK_ARCH-NEXT: clang-offload-packager{{.*}} "--image=file=[[CC1DEVOUT]],triple=[[TRIPLE]],arch=[[ARCH]],kind=sycl{{.*}}" + +// Verify offload-packager option values +// RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \ +// RUN: -fsycl-targets=spir64,intel_gpu_pvc \ +// RUN: -Xsycl-target-backend=spir64 -spir64-opt \ +// RUN: -Xsycl-target-backend=intel_gpu_pvc -spir64_gen-opt \ +// RUN: -Xsycl-target-linker=spir64 -spir64-link-opt \ +// RUN: -Xsycl-target-linker=intel_gpu_pvc -spir64_gen-link-opt \ +// RUN: --offload-new-driver %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK_PACKAGER_OPTS %s +// CHK_PACKAGER_OPTS: clang-offload-packager{{.*}} "-o" +// CHK_PACKAGER_OPTS-SAME: {{.*}}triple=spir64_gen-unknown-unknown,arch=pvc,kind=sycl,compile-opts={{.*}}-spir64_gen-opt,link-opts=-spir64_gen-link-opt +// CHK_PACKAGER_OPTS-SAME: {{.*}}triple=spir64-unknown-unknown,arch=,kind=sycl,compile-opts={{.*}}-spir64-opt,link-opts=-spir64-link-opt + +/// Check phases with multiple intel_gpu settings +// RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl \ +// RUN: -fsycl-targets=intel_gpu_dg1,intel_gpu_pvc \ +// RUN: --offload-new-driver -ccc-print-phases %s 2>&1 \ +// RUN: | FileCheck -check-prefix=MULT_TARG_PHASES %s +// MULT_TARG_PHASES: 0: input, "[[INPUT:.+\.c]]", c++, (host-sycl) +// MULT_TARG_PHASES: 1: append-footer, {0}, c++, (host-sycl) +// MULT_TARG_PHASES: 2: preprocessor, {1}, c++-cpp-output, (host-sycl) +// MULT_TARG_PHASES: 3: compiler, {2}, ir, (host-sycl) +// MULT_TARG_PHASES: 4: input, "[[INPUT]]", c++, (device-sycl, dg1) +// MULT_TARG_PHASES: 5: preprocessor, {4}, c++-cpp-output, (device-sycl, dg1) +// MULT_TARG_PHASES: 6: compiler, {5}, ir, (device-sycl, dg1) +// 
MULT_TARG_PHASES: 7: backend, {6}, ir, (device-sycl, dg1) +// MULT_TARG_PHASES: 8: offload, "device-sycl (spir64_gen-unknown-unknown:dg1)" {7}, ir +// MULT_TARG_PHASES: 9: input, "[[INPUT]]", c++, (device-sycl, pvc) +// MULT_TARG_PHASES: 10: preprocessor, {9}, c++-cpp-output, (device-sycl, pvc) +// MULT_TARG_PHASES: 11: compiler, {10}, ir, (device-sycl, pvc) +// MULT_TARG_PHASES: 12: backend, {11}, ir, (device-sycl, pvc) +// MULT_TARG_PHASES: 13: offload, "device-sycl (spir64_gen-unknown-unknown:pvc)" {12}, ir +// MULT_TARG_PHASES: 14: clang-offload-packager, {8, 13}, image, (device-sycl) +// MULT_TARG_PHASES: 15: offload, "host-sycl (x86_64-unknown-linux-gnu)" {3}, "device-sycl (x86_64-unknown-linux-gnu)" {14}, ir +// MULT_TARG_PHASES: 16: backend, {15}, assembler, (host-sycl) +// MULT_TARG_PHASES: 17: assembler, {16}, object, (host-sycl) /// Test option passing behavior for clang-offload-wrapper options. // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \ // RUN: -Xsycl-target-backend -backend-opt -### %s 2>&1 \ // RUN: | FileCheck -check-prefix WRAPPER_OPTIONS_BACKEND %s -// WRAPPER_OPTIONS_BACKEND: clang-linker-wrapper{{.*}} "--triple=spir64" -// WRAPPER_OPTIONS_BACKEND-SAME: "--sycl-backend-compile-options={{.*}}-backend-opt{{.*}}" +// WRAPPER_OPTIONS_BACKEND: clang-linker-wrapper{{.*}} "--sycl-backend-compile-options={{.*}}-backend-opt{{.*}}" // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \ // RUN: -Xsycl-target-linker -link-opt -### %s 2>&1 \ // RUN: | FileCheck -check-prefix WRAPPER_OPTIONS_LINK %s -// WRAPPER_OPTIONS_LINK: clang-linker-wrapper{{.*}} "--triple=spir64" -// WRAPPER_OPTIONS_LINK-SAME: "--sycl-target-link-options={{.*}}-link-opt{{.*}}" +// WRAPPER_OPTIONS_LINK: clang-linker-wrapper{{.*}} "--sycl-target-link-options={{.*}}-link-opt{{.*}}" + +/// Test option passing behavior for clang-offload-wrapper options for AOT. 
+// RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \ +// RUN: -fsycl-targets=spir64_gen,spir64_x86_64 \ +// RUN: -Xsycl-target-backend=spir64_gen -backend-gen-opt \ +// RUN: -Xsycl-target-backend=spir64_x86_64 -backend-cpu-opt \ +// RUN: -### %s 2>&1 \ +// RUN: | FileCheck -check-prefix WRAPPER_OPTIONS_BACKEND_AOT %s +// WRAPPER_OPTIONS_BACKEND_AOT: clang-linker-wrapper{{.*}} "--host-triple=x86_64-unknown-linux-gnu" +// WRAPPER_OPTIONS_BACKEND_AOT-SAME: "--gen-tool-arg={{.*}}-backend-gen-opt" +// WRAPPER_OPTIONS_BACKEND_AOT-SAME: "--cpu-tool-arg={{.*}}-backend-cpu-opt" diff --git a/clang/test/Driver/sycl-offload-with-split.c b/clang/test/Driver/sycl-offload-with-split.c index d081d083c3a91..3e304b78e2e52 100644 --- a/clang/test/Driver/sycl-offload-with-split.c +++ b/clang/test/Driver/sycl-offload-with-split.c @@ -205,7 +205,7 @@ // CHK-TOOLS-AOT: clang{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-int-header=[[INPUT1:.+\-header.+\.h]]" "-fsycl-int-footer={{.*}}"{{.*}} "-o" "[[OUTPUT1:.+\.bc]]" // CHK-TOOLS-AOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-o" "[[OUTPUT10:.+\.o]]" // CHK-TOOLS-AOT: llvm-link{{.*}} "[[OUTPUT1]]" "-o" "[[OUTPUT2:.+\.bc]]" -// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-split=auto" {{.*}} "-spec-const=emulation" {{.*}} "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]" +// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-split=auto"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]" // CHK-TOOLS-AOT: file-table-tform{{.*}} "-o" "[[OUTPUT4:.+\.txt]]" "[[OUTPUT3]]" // CHK-TOOLS-AOT: llvm-foreach{{.*}} "--in-file-list=[[OUTPUT4]]" "--in-replace=[[OUTPUT4]]" "--out-ext=spv" "--out-file-list=[[OUTPUT5:.+\.txt]]" "--out-replace=[[OUTPUT5]]" "--" "{{.*}}llvm-spirv{{.*}}" "-o" "[[OUTPUT5]]" {{.*}} "[[OUTPUT4]]" // CHK-TOOLS-FPGA: llvm-foreach{{.*}} "--out-file-list=[[OUTPUT6:.+\.txt]]{{.*}} "--" "{{.*}}aoc{{.*}} "-o" "[[OUTPUT6]]" "[[OUTPUT5]]" diff --git a/clang/test/Driver/sycl-oneapi-gpu-nvidia.cpp 
b/clang/test/Driver/sycl-oneapi-gpu-nvidia.cpp index cc5795bad3aca..097571e21edf5 100644 --- a/clang/test/Driver/sycl-oneapi-gpu-nvidia.cpp +++ b/clang/test/Driver/sycl-oneapi-gpu-nvidia.cpp @@ -28,11 +28,14 @@ // RUN: FileCheck %s --check-prefixes=DEVICE_NVIDIA,MACRO_NVIDIA -DDEV_STR=sm_89 -DMAC_STR=SM_89 // RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvidia_gpu_sm_90 -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE_NVIDIA,MACRO_NVIDIA -DDEV_STR=sm_90 -DMAC_STR=SM_90 +// RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvidia_gpu_sm_90a -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE_NVIDIA,MACRO_NVIDIA -DDEV_STR=sm_90a -DMAC_STR=SM_90A // MACRO_NVIDIA: clang{{.*}} "-fsycl-is-host" // MACRO_NVIDIA: "-D__SYCL_TARGET_NVIDIA_GPU_[[MAC_STR]]__" // MACRO_NVIDIA: clang{{.*}} "-triple" "nvptx64-nvidia-cuda" // DEVICE_NVIDIA: llvm-foreach{{.*}} "--gpu-name" "[[DEV_STR]]" + /// test for invalid nvidia arch // RUN: not %clangxx -c -fsycl -fsycl-targets=nvidia_gpu_bad -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefix=BAD_NVIDIA_INPUT diff --git a/clang/test/Driver/sycl-post-link-options-win.cpp b/clang/test/Driver/sycl-post-link-options-win.cpp new file mode 100644 index 0000000000000..65a802d1f0210 --- /dev/null +++ b/clang/test/Driver/sycl-post-link-options-win.cpp @@ -0,0 +1,16 @@ +// REQUIRES: system-windows +/// Verify same set of sycl-post-link options generated for old and new offloading model +// RUN: %clangxx -### --target=x86_64-pc-windows-msvc -fsycl \ +// RUN: -Xdevice-post-link -O0 %s 2>&1 \ +// RUN: | FileCheck -check-prefix OPTIONS_POSTLINK_JIT_OLD %s +// OPTIONS_POSTLINK_JIT_OLD: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=native" "-split=auto" "-emit-only-kernels-as-entry-points" "-emit-param-info" "-symbols" "-emit-exported-symbols" "-emit-imported-symbols" "-split-esimd" "-lower-esimd" "-O0" + +// RUN: %clang -cc1 %s -triple x86_64-pc-windows-msvc -emit-obj -o %t.elf.o +// RUN: clang-offload-packager -o %t.out 
--image=file=%t.elf.o,kind=sycl,triple=spir64 +// RUN: %clang -cc1 %s -triple x86_64-pc-windows-msvc -emit-obj -o %t.o \ +// RUN: -fembed-offload-object=%t.out +// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-pc-windows-msvc \ +// RUN: -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.obj \ +// RUN: --sycl-post-link-options="-O2 -device-globals -O0" \ +// RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck --check-prefix OPTIONS_POSTLINK_JIT_NEW %s +// OPTIONS_POSTLINK_JIT_NEW: sycl-post-link{{.*}} -spec-const=native -split=auto -emit-only-kernels-as-entry-points -emit-param-info -symbols -emit-exported-symbols -emit-imported-symbols -split-esimd -lower-esimd -O2 -device-globals -O0 diff --git a/clang/test/Driver/sycl-post-link-options.cpp b/clang/test/Driver/sycl-post-link-options.cpp new file mode 100644 index 0000000000000..4f81fb424ec7c --- /dev/null +++ b/clang/test/Driver/sycl-post-link-options.cpp @@ -0,0 +1,16 @@ +// REQUIRES: system-linux +/// Verify same set of sycl-post-link options generated for old and new offloading model +// RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl -### \ +// RUN: -Xdevice-post-link -O0 %s 2>&1 \ +// RUN: | FileCheck -check-prefix OPTIONS_POSTLINK_JIT_OLD %s +// OPTIONS_POSTLINK_JIT_OLD: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=native" "-split=auto" "-emit-only-kernels-as-entry-points" "-emit-param-info" "-symbols" "-emit-exported-symbols" "-emit-imported-symbols" "-split-esimd" "-lower-esimd" "-O0" + +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.elf.o +// RUN: clang-offload-packager -o %t.out --image=file=%t.elf.o,kind=sycl,triple=spir64 +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \ +// RUN: -fembed-offload-object=%t.out +// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \ +// RUN: -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.o \ +// 
RUN: --sycl-post-link-options="-O2 -device-globals -O0" \ +// RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck --check-prefix OPTIONS_POSTLINK_JIT_NEW %s +// OPTIONS_POSTLINK_JIT_NEW: sycl-post-link{{.*}} -spec-const=native -split=auto -emit-only-kernels-as-entry-points -emit-param-info -symbols -emit-exported-symbols -emit-imported-symbols -split-esimd -lower-esimd -O2 -device-globals -O0 diff --git a/clang/test/Driver/sycl-spirv-ext.c b/clang/test/Driver/sycl-spirv-ext.c index a306b9eb1ea4d..eb4d24197b1af 100644 --- a/clang/test/Driver/sycl-spirv-ext.c +++ b/clang/test/Driver/sycl-spirv-ext.c @@ -48,6 +48,7 @@ // CHECK-DEFAULT-SAME:,+SPV_INTEL_fpga_invocation_pipelining_attributes // CHECK-DEFAULT-SAME:,+SPV_INTEL_fpga_latency_control // CHECK-DEFAULT-SAME:,+SPV_INTEL_task_sequence +// CHECK-DEFAULT-SAME:,+SPV_KHR_shader_clock // CHECK-DEFAULT-SAME:,+SPV_INTEL_bindless_images // CHECK-DEFAULT-SAME:,+SPV_INTEL_token_type // CHECK-DEFAULT-SAME:,+SPV_INTEL_bfloat16_conversion @@ -125,4 +126,3 @@ // CHECK-CPU-SAME:,+SPV_KHR_non_semantic_info // CHECK-CPU-SAME:,+SPV_KHR_cooperative_matrix // CHECK-CPU-SAME:,+SPV_INTEL_fp_max_error" - diff --git a/clang/test/Preprocessor/sycl-macro.cpp b/clang/test/Preprocessor/sycl-macro.cpp index cab506771e74c..81645d3ac3736 100644 --- a/clang/test/Preprocessor/sycl-macro.cpp +++ b/clang/test/Preprocessor/sycl-macro.cpp @@ -12,6 +12,8 @@ // RUN: %clang_cc1 %s -triple nvptx64-nvidia-cuda -target-cpu sm_80 -fsycl-is-device -E -dM | FileCheck --check-prefix=CHECK-CUDA %s // RUN: %clang_cc1 %s -triple amdgcn-amd-amdhsa -target-cpu gfx906 -fsycl-is-device -E -dM | FileCheck --check-prefix=CHECK-HIP %s +// RUN: %clang_cc1 %s -triple nvptx64-nvidia-cuda -target-cpu sm_90a -fsycl-is-device -E -dM | FileCheck --check-prefix=CHECK-CUDA-FEATURE %s + // CHECK-NOT:#define __SYCL_DEVICE_ONLY__ 1 // CHECK-NOT:#define SYCL_EXTERNAL // CHECK-NOT:#define CL_SYCL_LANGUAGE_VERSION 121 @@ -37,3 +39,5 @@ // CHECK-CUDA-NOT:#define 
__CUDA_ARCH__ 800 // CHECK-HIP:#define __CUDA_ARCH__ 0 + +// CHECK-CUDA-FEATURE:#define __CUDA_ARCH_FEAT_SM90_ALL 1 diff --git a/clang/test/SemaSYCL/accessor-type-diagnostics.cpp b/clang/test/SemaSYCL/accessor-type-diagnostics.cpp index e18a1880fcd9b..d7c8e15cde20c 100644 --- a/clang/test/SemaSYCL/accessor-type-diagnostics.cpp +++ b/clang/test/SemaSYCL/accessor-type-diagnostics.cpp @@ -1,9 +1,7 @@ // RUN: %clang_cc1 -triple spir64 -fsycl-is-device -verify \ -// RUN: -aux-triple x86_64-unknown-linux-gnu -fsyntax-only \ -// RUN: -Wno-sycl-2017-compat %s +// RUN: -aux-triple x86_64-unknown-linux-gnu -fsyntax-only %s // RUN: %clang_cc1 -triple spir64 -fsycl-is-device -verify \ -// RUN: -aux-triple x86_64-pc-windows-msvc -fsyntax-only \ -// RUN: -Wno-sycl-2017-compat %s +// RUN: -aux-triple x86_64-pc-windows-msvc -fsyntax-only %s // // Ensure SYCL type restrictions are applied to accessors as well. diff --git a/clang/test/SemaSYCL/buffer_location.cpp b/clang/test/SemaSYCL/buffer_location.cpp index c733d8e7c8315..95fb14eb207b1 100644 --- a/clang/test/SemaSYCL/buffer_location.cpp +++ b/clang/test/SemaSYCL/buffer_location.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fsycl-is-device -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s -// RUN: %clang_cc1 -fsycl-is-device -Wno-sycl-2017-compat -verify -pedantic -DTRIGGER_ERROR %s +// RUN: %clang_cc1 -fsycl-is-device -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -fsycl-is-device -verify -pedantic -DTRIGGER_ERROR %s #include "Inputs/sycl.hpp" diff --git a/clang/test/SemaSYCL/deferred-diagnostics-aux-builtin.cpp b/clang/test/SemaSYCL/deferred-diagnostics-aux-builtin.cpp index cde5eb40559a1..65141587bc93c 100644 --- a/clang/test/SemaSYCL/deferred-diagnostics-aux-builtin.cpp +++ b/clang/test/SemaSYCL/deferred-diagnostics-aux-builtin.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -aux-triple x86_64-unknown-linux-gnu -Wno-sycl-2017-compat -verify -fsyntax-only %s +// 
RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -aux-triple x86_64-unknown-linux-gnu -verify -fsyntax-only %s #include "sycl.hpp" diff --git a/clang/test/SemaSYCL/implicit-sycl-device-attr.cpp b/clang/test/SemaSYCL/implicit-sycl-device-attr.cpp index 8b2c0618795c3..4b30e9ef31f5a 100644 --- a/clang/test/SemaSYCL/implicit-sycl-device-attr.cpp +++ b/clang/test/SemaSYCL/implicit-sycl-device-attr.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -triple spir64 \ // RUN: -aux-triple x86_64-unknown-linux-gnu -Wno-return-type -verify \ -// RUN: -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s +// RUN: -fsyntax-only -std=c++17 %s // add_ir_attributes_function attribute used to represent compile-time SYCL // properties and some of those properties are intended to be turned into diff --git a/clang/test/SemaSYCL/inline-asm.cpp b/clang/test/SemaSYCL/inline-asm.cpp index a4a308fe0875a..85d5fbeed2b1d 100644 --- a/clang/test/SemaSYCL/inline-asm.cpp +++ b/clang/test/SemaSYCL/inline-asm.cpp @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s -DLINUX_ASM -// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s -DLINUX_ASM -DSPIR_CHECK -triple spir64-unknown-unknown -// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify -triple x86_64-windows -fasm-blocks %s +// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s -DLINUX_ASM +// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s -DLINUX_ASM -DSPIR_CHECK -triple spir64-unknown-unknown +// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify -triple x86_64-windows -fasm-blocks %s #ifndef SPIR_CHECK //expected-no-diagnostics diff --git a/clang/test/SemaSYCL/intel-fpga-loop-ast.cpp b/clang/test/SemaSYCL/intel-fpga-loop-ast.cpp index be824787f07d6..8e133b853456f 100644 --- a/clang/test/SemaSYCL/intel-fpga-loop-ast.cpp +++ 
b/clang/test/SemaSYCL/intel-fpga-loop-ast.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -ast-dump %s | FileCheck %s // Add AST tests for Loop attributes: [[intel::enable_loop_pipelining]], // [[intel::max_interleaving()]], [[intel::loop_coalesce]], diff --git a/clang/test/SemaSYCL/intel-fpga-no-global-work-offset-ast.cpp b/clang/test/SemaSYCL/intel-fpga-no-global-work-offset-ast.cpp index 67343e725d8d8..5256194c2becc 100644 --- a/clang/test/SemaSYCL/intel-fpga-no-global-work-offset-ast.cpp +++ b/clang/test/SemaSYCL/intel-fpga-no-global-work-offset-ast.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -ast-dump %s | FileCheck %s // Tests for AST of Intel FPGA no_global_work_offset function attribute. 
diff --git a/clang/test/SemaSYCL/intel-fpga-nofusion.cpp b/clang/test/SemaSYCL/intel-fpga-nofusion.cpp index fd64b2ac7fcac..e8a931b124f5d 100644 --- a/clang/test/SemaSYCL/intel-fpga-nofusion.cpp +++ b/clang/test/SemaSYCL/intel-fpga-nofusion.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -fsyntax-only -ast-dump -Wno-sycl-2017-compat -verify %s | FileCheck %s +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -fsyntax-only -ast-dump -verify %s | FileCheck %s // expected-no-diagnostics #include "sycl.hpp" diff --git a/clang/test/SemaSYCL/intel-max-global-work-dim-device-ast.cpp b/clang/test/SemaSYCL/intel-max-global-work-dim-device-ast.cpp index 381433e7c6087..7ec419972dea2 100644 --- a/clang/test/SemaSYCL/intel-max-global-work-dim-device-ast.cpp +++ b/clang/test/SemaSYCL/intel-max-global-work-dim-device-ast.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 %s -fsyntax-only -ast-dump -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -Wno-sycl-2017-compat -triple spir64 | FileCheck %s +// RUN: %clang_cc1 %s -fsyntax-only -ast-dump -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -triple spir64 | FileCheck %s // The test checks AST of [[intel::max_global_work_dim()]] attribute. diff --git a/clang/test/SemaSYCL/intel-max-global-work-dim-device.cpp b/clang/test/SemaSYCL/intel-max-global-work-dim-device.cpp index b453985c2ae79..a100c9728615f 100644 --- a/clang/test/SemaSYCL/intel-max-global-work-dim-device.cpp +++ b/clang/test/SemaSYCL/intel-max-global-work-dim-device.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 %s -fsyntax-only -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2020 -Wno-sycl-2017-compat -triple spir64 -verify +// RUN: %clang_cc1 %s -fsyntax-only -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2020 -triple spir64 -verify // The test checks support and functionality of [[intel::max_global_work_dim()]] attribute. 
diff --git a/clang/test/SemaSYCL/intel-max-global-work-dim-host.cpp b/clang/test/SemaSYCL/intel-max-global-work-dim-host.cpp index 2519c18672778..0359de526a33e 100644 --- a/clang/test/SemaSYCL/intel-max-global-work-dim-host.cpp +++ b/clang/test/SemaSYCL/intel-max-global-work-dim-host.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-host -Wno-sycl-2017-compat -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -verify %s // expected-no-diagnostics [[intel::max_global_work_dim(2)]] void func_do_not_ignore() {} diff --git a/clang/test/SemaSYCL/intel-max-work-group-size-host.cpp b/clang/test/SemaSYCL/intel-max-work-group-size-host.cpp index cce684221dfcb..5faa2ff420269 100644 --- a/clang/test/SemaSYCL/intel-max-work-group-size-host.cpp +++ b/clang/test/SemaSYCL/intel-max-work-group-size-host.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -Wno-sycl-2017-compat -verify %s +// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -verify %s // expected-no-diagnostics [[intel::max_work_group_size(2, 2, 2)]] void func_do_not_ignore() {} diff --git a/clang/test/SemaSYCL/intel-reqd-work-group-size-ast-device.cpp b/clang/test/SemaSYCL/intel-reqd-work-group-size-ast-device.cpp index 412a8a88b8b1e..5ce303af1cb58 100644 --- a/clang/test/SemaSYCL/intel-reqd-work-group-size-ast-device.cpp +++ b/clang/test/SemaSYCL/intel-reqd-work-group-size-ast-device.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump %s | FileCheck %s // Test for AST of reqd_work_group_size kernel attribute in SYCL 1.2.1. 
diff --git a/clang/test/SemaSYCL/intel-reqd-work-group-size-host.cpp b/clang/test/SemaSYCL/intel-reqd-work-group-size-host.cpp index f9d98ef6be877..445f0815670e3 100644 --- a/clang/test/SemaSYCL/intel-reqd-work-group-size-host.cpp +++ b/clang/test/SemaSYCL/intel-reqd-work-group-size-host.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-host -Wno-sycl-2017-compat -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -verify %s // expected-no-diagnostics [[sycl::reqd_work_group_size(4)]] void f4x1x1() {} diff --git a/clang/test/SemaSYCL/intel-work-group-size-hint-ast-device.cpp b/clang/test/SemaSYCL/intel-work-group-size-hint-ast-device.cpp index aefe2202ce43f..caf429e5a7bc4 100644 --- a/clang/test/SemaSYCL/intel-work-group-size-hint-ast-device.cpp +++ b/clang/test/SemaSYCL/intel-work-group-size-hint-ast-device.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump %s | FileCheck %s // Test for AST of work_group_size_hint kernel attribute in SYCL 1.2.1. diff --git a/clang/test/SemaSYCL/invalid-kernel-arguments.cpp b/clang/test/SemaSYCL/invalid-kernel-arguments.cpp index cf62e6117e47b..10b60eacc9e4c 100644 --- a/clang/test/SemaSYCL/invalid-kernel-arguments.cpp +++ b/clang/test/SemaSYCL/invalid-kernel-arguments.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s +// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s // This test checks that compiler doesn't crash if type of kernel argument is // invalid. 
diff --git a/clang/test/SemaSYCL/kernel-arg-opt-report.cpp b/clang/test/SemaSYCL/kernel-arg-opt-report.cpp index a9cca45099b62..95f2106d72655 100644 --- a/clang/test/SemaSYCL/kernel-arg-opt-report.cpp +++ b/clang/test/SemaSYCL/kernel-arg-opt-report.cpp @@ -1,9 +1,9 @@ // RUN: %clang_cc1 -triple spir64-unknown-unknown -fsycl-is-device \ -// RUN: -Wno-sycl-2017-compat -emit-llvm-bc %s -o %t-host.bc -opt-record-file %t-host.yaml +// RUN: -emit-llvm-bc %s -o %t-host.bc -opt-record-file %t-host.yaml // RUN: FileCheck -check-prefix=SPIR --input-file %t-host.yaml %s // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fsycl-is-device \ -// RUN: -Wno-sycl-2017-compat -emit-llvm-bc %s -o %t-host.bc -opt-record-file %t-host.yaml +// RUN: -emit-llvm-bc %s -o %t-host.bc -opt-record-file %t-host.yaml // RUN: FileCheck -check-prefix=NVPTX --input-file %t-host.yaml %s // The test generates remarks about the kernel argument, their location and type // in the resulting yaml file. diff --git a/clang/test/SemaSYCL/kernel-function-type.cpp b/clang/test/SemaSYCL/kernel-function-type.cpp index 9a2036d7d5eee..bef5a5d7ab0b9 100644 --- a/clang/test/SemaSYCL/kernel-function-type.cpp +++ b/clang/test/SemaSYCL/kernel-function-type.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s +// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s // expected-no-diagnostics // The kernel_single_task call is emitted as an OpenCL kernel function. 
The call diff --git a/clang/test/SemaSYCL/lambda_implicit_capture_this.cpp b/clang/test/SemaSYCL/lambda_implicit_capture_this.cpp index 4e7d41b142815..9bd49d777977c 100644 --- a/clang/test/SemaSYCL/lambda_implicit_capture_this.cpp +++ b/clang/test/SemaSYCL/lambda_implicit_capture_this.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s +// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s // // This test checks that the compiler issues an error on attempt to capture // "this" pointer by lambdas passed to the device code (directly and indirectly) diff --git a/clang/test/SemaSYCL/loop_fusion_ast.cpp b/clang/test/SemaSYCL/loop_fusion_ast.cpp index 52829b05d669c..d3f05382e103b 100644 --- a/clang/test/SemaSYCL/loop_fusion_ast.cpp +++ b/clang/test/SemaSYCL/loop_fusion_ast.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -ast-dump %s | FileCheck %s // Tests for AST of Intel FPGA loop fusion function attributes #include "sycl.hpp" diff --git a/clang/test/SemaSYCL/loop_unroll.cpp b/clang/test/SemaSYCL/loop_unroll.cpp index 2fcf31480b183..61796397ece05 100644 --- a/clang/test/SemaSYCL/loop_unroll.cpp +++ b/clang/test/SemaSYCL/loop_unroll.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify -pedantic %s +// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify -pedantic %s template void bar() { diff --git a/clang/test/SemaSYCL/markfunction-astconsumer.cpp b/clang/test/SemaSYCL/markfunction-astconsumer.cpp index 207d2b9e5d47b..0b690c004b68f 100644 --- a/clang/test/SemaSYCL/markfunction-astconsumer.cpp +++ b/clang/test/SemaSYCL/markfunction-astconsumer.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -Wno-return-type -verify -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s +// RUN: %clang_cc1 -fsycl-is-device 
-Wno-return-type -verify -fsyntax-only -std=c++17 %s void bar(); template diff --git a/clang/test/SemaSYCL/max-concurrency-ast.cpp b/clang/test/SemaSYCL/max-concurrency-ast.cpp index 654ce182fed67..f61332135a0a5 100644 --- a/clang/test/SemaSYCL/max-concurrency-ast.cpp +++ b/clang/test/SemaSYCL/max-concurrency-ast.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -ast-dump %s | FileCheck %s // Tests for AST of Intel FPGA max concurrency function attribute. #include "sycl.hpp" diff --git a/clang/test/SemaSYCL/no-vtables.cpp b/clang/test/SemaSYCL/no-vtables.cpp index 28400fe2834c3..905a40da9dc5b 100644 --- a/clang/test/SemaSYCL/no-vtables.cpp +++ b/clang/test/SemaSYCL/no-vtables.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -verify -Wno-sycl-2017-compat -emit-llvm-only %s +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -verify -emit-llvm-only %s // expected-no-diagnostics // Should never fail, since the type is never used in kernel code. 
diff --git a/clang/test/SemaSYCL/no-vtables2.cpp b/clang/test/SemaSYCL/no-vtables2.cpp index 721786c96c1e9..fd0313574f491 100644 --- a/clang/test/SemaSYCL/no-vtables2.cpp +++ b/clang/test/SemaSYCL/no-vtables2.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -Wno-return-type -verify -Wno-sycl-2017-compat -fsyntax-only %s +// RUN: %clang_cc1 -fsycl-is-device -Wno-return-type -verify -fsyntax-only %s struct Base { virtual void f() const {} diff --git a/clang/test/SemaSYCL/num_simd_work_items.cpp b/clang/test/SemaSYCL/num_simd_work_items.cpp index 71bd2f38bc21a..8f47fdb2d2434 100644 --- a/clang/test/SemaSYCL/num_simd_work_items.cpp +++ b/clang/test/SemaSYCL/num_simd_work_items.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 %s -fsycl-is-device -internal-isystem %S/Inputs -triple spir64 -fsyntax-only -sycl-std=2020 -Wno-sycl-2017-compat -verify +// RUN: %clang_cc1 %s -fsycl-is-device -internal-isystem %S/Inputs -triple spir64 -fsyntax-only -sycl-std=2020 -verify // The test checks support and functionality of [[intel::num_simd_work_items()]] attribute. diff --git a/clang/test/SemaSYCL/num_simd_work_items_ast.cpp b/clang/test/SemaSYCL/num_simd_work_items_ast.cpp index 5da7471260989..c2dbc246c6511 100644 --- a/clang/test/SemaSYCL/num_simd_work_items_ast.cpp +++ b/clang/test/SemaSYCL/num_simd_work_items_ast.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 %s -fsycl-is-device -internal-isystem %S/Inputs -triple spir64 -fsyntax-only -sycl-std=2017 -Wno-sycl-2017-compat -ast-dump | FileCheck %s +// RUN: %clang_cc1 %s -fsycl-is-device -internal-isystem %S/Inputs -triple spir64 -fsyntax-only -sycl-std=2017 -ast-dump | FileCheck %s // The test checks AST of [[intel::num_simd_work_items()]] attribute. 
diff --git a/clang/test/SemaSYCL/num_simd_work_items_host.cpp b/clang/test/SemaSYCL/num_simd_work_items_host.cpp index 8558a574572d9..d82cdd0e1ec38 100644 --- a/clang/test/SemaSYCL/num_simd_work_items_host.cpp +++ b/clang/test/SemaSYCL/num_simd_work_items_host.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -Wno-sycl-2017-compat -verify %s +// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -verify %s // expected-no-diagnostics [[intel::num_simd_work_items(2)]] void func_do_not_ignore() {} diff --git a/clang/test/SemaSYCL/pointer-to-vla.cpp b/clang/test/SemaSYCL/pointer-to-vla.cpp index cd2d925f92dad..16f89caa07b5c 100644 --- a/clang/test/SemaSYCL/pointer-to-vla.cpp +++ b/clang/test/SemaSYCL/pointer-to-vla.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -fsyntax-only -Wno-sycl-2017-compat -verify %s +// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -fsyntax-only -verify %s // // This test checks if compiler reports compilation error on an attempt to pass // a pointer to VLA as kernel argument diff --git a/clang/test/SemaSYCL/prohibit-thread-local.cpp b/clang/test/SemaSYCL/prohibit-thread-local.cpp index c87c2439ad02b..c7b00d04befdf 100644 --- a/clang/test/SemaSYCL/prohibit-thread-local.cpp +++ b/clang/test/SemaSYCL/prohibit-thread-local.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -verify -Wno-sycl-2017-compat -fsyntax-only %s +// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -verify -fsyntax-only %s thread_local const int prohobit_ns_scope = 0; thread_local int prohobit_ns_scope2 = 0; diff --git a/clang/test/SemaSYCL/reference-kernel-param.cpp b/clang/test/SemaSYCL/reference-kernel-param.cpp index 81350f48d552f..080ecaef42684 100644 --- a/clang/test/SemaSYCL/reference-kernel-param.cpp +++ b/clang/test/SemaSYCL/reference-kernel-param.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -verify -Wno-sycl-2017-compat -fsyntax-only %s +// RUN: %clang_cc1 -fsycl-is-device -verify -fsyntax-only 
%s // This test checks if compiler reports compilation error on an attempt to pass // a reference as SYCL kernel parameter. diff --git a/clang/test/SemaSYCL/reqd-sub-group-size-ast.cpp b/clang/test/SemaSYCL/reqd-sub-group-size-ast.cpp index dde7140595dd4..512e8d5f326ea 100644 --- a/clang/test/SemaSYCL/reqd-sub-group-size-ast.cpp +++ b/clang/test/SemaSYCL/reqd-sub-group-size-ast.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump %s | FileCheck %s // The test checks AST of [[intel::reqd_sub_group_size()]] attribute. diff --git a/clang/test/SemaSYCL/restrict-recursion.cpp b/clang/test/SemaSYCL/restrict-recursion.cpp index fdfd4ba8fb4e8..a2ede51efe027 100644 --- a/clang/test/SemaSYCL/restrict-recursion.cpp +++ b/clang/test/SemaSYCL/restrict-recursion.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -verify -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s +// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -verify -fsyntax-only -std=c++17 %s // This recursive function is not called from sycl kernel, // so it should not be diagnosed. diff --git a/clang/test/SemaSYCL/restrict-recursion2.cpp b/clang/test/SemaSYCL/restrict-recursion2.cpp index a92705aa81bbd..8e9e1b32f3c53 100644 --- a/clang/test/SemaSYCL/restrict-recursion2.cpp +++ b/clang/test/SemaSYCL/restrict-recursion2.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -verify -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s +// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -verify -fsyntax-only -std=c++17 %s // This recursive function is not called from sycl kernel, // so it should not be diagnosed. 
diff --git a/clang/test/SemaSYCL/restrict-recursion3.cpp b/clang/test/SemaSYCL/restrict-recursion3.cpp index b66e3cd580cc3..5e8b3fae83c0b 100644 --- a/clang/test/SemaSYCL/restrict-recursion3.cpp +++ b/clang/test/SemaSYCL/restrict-recursion3.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -Wno-sycl-2017-compat -Wno-error=sycl-strict -verify -fsyntax-only -std=c++17 %s +// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -Wno-error=sycl-strict -verify -fsyntax-only -std=c++17 %s // This recursive function is not called from sycl kernel, // so it should not be diagnosed. diff --git a/clang/test/SemaSYCL/restrict-recursion4.cpp b/clang/test/SemaSYCL/restrict-recursion4.cpp index ee0fdb20ce4c0..30a5c5e2b6296 100644 --- a/clang/test/SemaSYCL/restrict-recursion4.cpp +++ b/clang/test/SemaSYCL/restrict-recursion4.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -Wno-sycl-2017-compat -Wno-error=sycl-strict -verify -fsyntax-only -std=c++17 %s +// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -Wno-error=sycl-strict -verify -fsyntax-only -std=c++17 %s // This recursive function is not called from sycl kernel, // so it should not be diagnosed. 
diff --git a/clang/test/SemaSYCL/stall_enable_device.cpp b/clang/test/SemaSYCL/stall_enable_device.cpp index b8682461fc9ba..ceba235e5e820 100644 --- a/clang/test/SemaSYCL/stall_enable_device.cpp +++ b/clang/test/SemaSYCL/stall_enable_device.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 %s -fsyntax-only -internal-isystem %S/Inputs -fsycl-is-device -Wno-sycl-2017-compat -DTRIGGER_ERROR -verify -// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -fsyntax-only -ast-dump -Wno-sycl-2017-compat %s | FileCheck %s +// RUN: %clang_cc1 %s -fsyntax-only -internal-isystem %S/Inputs -fsycl-is-device -DTRIGGER_ERROR -verify +// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -fsyntax-only -ast-dump %s | FileCheck %s // Test that checks [[intel::use_stall_enable_clusters]] attribute support on function. diff --git a/clang/test/SemaSYCL/sycl-callstack.cpp b/clang/test/SemaSYCL/sycl-callstack.cpp index e6eb7a34a3c3c..28f36d741be7b 100644 --- a/clang/test/SemaSYCL/sycl-callstack.cpp +++ b/clang/test/SemaSYCL/sycl-callstack.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -verify -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s +// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -verify -fsyntax-only -std=c++17 %s template __attribute__((sycl_kernel)) void kernel_single_task(const Func &kernelFunc) { diff --git a/clang/test/SemaSYCL/sycl-cconv.cpp b/clang/test/SemaSYCL/sycl-cconv.cpp index 2f30d8474423c..143e5bb9a750a 100644 --- a/clang/test/SemaSYCL/sycl-cconv.cpp +++ b/clang/test/SemaSYCL/sycl-cconv.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-windows -aux-triple x86_64-pc-windows-msvc -fsyntax-only -Wno-sycl-2017-compat -verify %s +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-windows -aux-triple x86_64-pc-windows-msvc -fsyntax-only -verify %s // expected-no-warning@+1 __inline __cdecl int printf(char const* const _Format, ...) 
{ return 0; } diff --git a/clang/test/SemaSYCL/sycl-device-const-static.cpp b/clang/test/SemaSYCL/sycl-device-const-static.cpp index 6a785f17725cb..69b75e936abc0 100644 --- a/clang/test/SemaSYCL/sycl-device-const-static.cpp +++ b/clang/test/SemaSYCL/sycl-device-const-static.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -verify -Wno-sycl-2017-compat -fsyntax-only %s +// RUN: %clang_cc1 -fsycl-is-device -verify -fsyntax-only %s struct Base {}; struct S { diff --git a/clang/test/SemaSYCL/sycl-device-static-restrict.cpp b/clang/test/SemaSYCL/sycl-device-static-restrict.cpp index 6a9d5092689e2..733285a1325a4 100644 --- a/clang/test/SemaSYCL/sycl-device-static-restrict.cpp +++ b/clang/test/SemaSYCL/sycl-device-static-restrict.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -verify -Wno-sycl-2017-compat -fsyntax-only %s +// RUN: %clang_cc1 -fsycl-is-device -verify -fsyntax-only %s const int glob1 = 1; int glob2 = 2; template diff --git a/clang/test/SemaSYCL/sycl-device-template-diag.cpp b/clang/test/SemaSYCL/sycl-device-template-diag.cpp index f1a8942aed6d6..7ae5f7b57fca5 100644 --- a/clang/test/SemaSYCL/sycl-device-template-diag.cpp +++ b/clang/test/SemaSYCL/sycl-device-template-diag.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -verify -Wno-sycl-2017-compat -fsyntax-only %s -internal-isystem %S/Inputs +// RUN: %clang_cc1 -fsycl-is-device -verify -fsyntax-only %s -internal-isystem %S/Inputs // This test verifies that we generate deferred diagnostics when // such diagnostics are in a function template. 
diff --git a/clang/test/SemaSYCL/sycl-dllimport-dllexport.cpp b/clang/test/SemaSYCL/sycl-dllimport-dllexport.cpp index e703b6fcb117b..ea67ee0d2ebed 100644 --- a/clang/test/SemaSYCL/sycl-dllimport-dllexport.cpp +++ b/clang/test/SemaSYCL/sycl-dllimport-dllexport.cpp @@ -1,19 +1,19 @@ // RUN: %clang_cc1 -triple spir64-unknown-unknown -fms-extensions \ // RUN: -aux-triple x86_64-unknown-linux-gnu -fsycl-is-device \ -// RUN: -fsyntax-only -Wno-sycl-2017-compat -DWARNCHECK %s -o /dev/null 2>&1 | FileCheck %s +// RUN: -fsyntax-only -DWARNCHECK %s -o /dev/null 2>&1 | FileCheck %s // check random triple aux-triple with sycl-device -// RUN: %clang_cc1 -triple spir64-unknown-windows -Wno-sycl-2017-compat -fsyntax-only \ +// RUN: %clang_cc1 -triple spir64-unknown-windows -fsyntax-only \ // RUN: -fms-extensions -DWARNCHECK %s -o /dev/null 2>&1 | FileCheck --check-prefixes CHECKALL %s // check without -aux-triple but sycl-device // RUN: %clang_cc1 -triple spir64-unknown-windows \ // RUN: -fsycl-is-device -aux-triple x86_64-pc-windows-msvc -fms-extensions \ -// RUN: -fsyntax-only -Wno-sycl-2017-compat -DWARNCHECK %s -o /dev/null 2>&1 | \ +// RUN: -fsyntax-only -DWARNCHECK %s -o /dev/null 2>&1 | \ // RUN: FileCheck %s --check-prefixes CHECKALL // check -aux-tripe without sycl-device -// RUN: %clang_cc1 -triple spir64-unknown-windows -Wno-sycl-2017-compat -fsyntax-only \ +// RUN: %clang_cc1 -triple spir64-unknown-windows -fsyntax-only \ // RUN: -aux-triple x86_64-pc-windows-msvc -fsycl-is-device \ // RUN: -fms-extensions -verify %s // check error message when dllimport function gets called in sycl-kernel code diff --git a/clang/test/SemaSYCL/sycl-fptr-lambda.cpp b/clang/test/SemaSYCL/sycl-fptr-lambda.cpp index eac66a9106ab8..480c4fbb6e6c7 100644 --- a/clang/test/SemaSYCL/sycl-fptr-lambda.cpp +++ b/clang/test/SemaSYCL/sycl-fptr-lambda.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -std=c++14 -verify -Wno-sycl-2017-compat -fsyntax-only %s +// RUN: %clang_cc1 
-fsycl-is-device -std=c++14 -verify -fsyntax-only %s // expected-no-diagnostics template diff --git a/clang/test/SemaSYCL/sycl-pseudo-dtor.cpp b/clang/test/SemaSYCL/sycl-pseudo-dtor.cpp index 3635e22ffac43..3e90e58a0ec94 100644 --- a/clang/test/SemaSYCL/sycl-pseudo-dtor.cpp +++ b/clang/test/SemaSYCL/sycl-pseudo-dtor.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -verify -Wno-sycl-2017-compat -fsyntax-only %s +// RUN: %clang_cc1 -fsycl-is-device -verify -fsyntax-only %s template struct functor_wrapper{ diff --git a/clang/test/SemaSYCL/sycl-restrict.cpp b/clang/test/SemaSYCL/sycl-restrict.cpp index f52f6964712ff..c7187fd737dea 100644 --- a/clang/test/SemaSYCL/sycl-restrict.cpp +++ b/clang/test/SemaSYCL/sycl-restrict.cpp @@ -1,14 +1,14 @@ // RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -triple spir64 \ // RUN: -aux-triple x86_64-unknown-linux-gnu -Wno-return-type -verify \ -// RUN: -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s +// RUN: -fsyntax-only -std=c++17 %s // RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -triple spir64 \ // RUN: -aux-triple x86_64-unknown-linux-gnu -fno-sycl-allow-func-ptr \ -// RUN: -Wno-return-type -verify -Wno-sycl-2017-compat -fsyntax-only \ +// RUN: -Wno-return-type -verify -fsyntax-only \ // RUN: -std=c++17 %s // RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -triple spir64 \ // RUN: -aux-triple x86_64-unknown-linux-gnu -DALLOW_FP=1 \ // RUN: -fsycl-allow-func-ptr -Wno-return-type -verify \ -// RUN: -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s +// RUN: -fsyntax-only -std=c++17 %s namespace std { class type_info; diff --git a/clang/test/SemaSYCL/tls_error.cpp b/clang/test/SemaSYCL/tls_error.cpp index a43a5ee9b6bbc..80b081107b3e5 100644 --- a/clang/test/SemaSYCL/tls_error.cpp +++ b/clang/test/SemaSYCL/tls_error.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -verify -Wno-sycl-2017-compat -fsyntax-only %s +// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -verify -fsyntax-only %s extern 
__thread void* __once_callable; // expected-no-error extern __thread void (*__once_call)(); // expected-no-error diff --git a/clang/test/SemaSYCL/unevaluated-function.cpp b/clang/test/SemaSYCL/unevaluated-function.cpp index 2d0059eaef06d..5b07ed1764f65 100644 --- a/clang/test/SemaSYCL/unevaluated-function.cpp +++ b/clang/test/SemaSYCL/unevaluated-function.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -verify -Wno-sycl-2017-compat -fsyntax-only %s +// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -verify -fsyntax-only %s // Check that a function used in an unevaluated context is not subject // to delayed device diagnostics. diff --git a/clang/test/SemaSYCL/unsupported_math.cpp b/clang/test/SemaSYCL/unsupported_math.cpp index c1ed10ccf496f..ca65f234d3f03 100644 --- a/clang/test/SemaSYCL/unsupported_math.cpp +++ b/clang/test/SemaSYCL/unsupported_math.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s +// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s template __attribute__((sycl_kernel)) void kernel(const Func &kernelFunc) { kernelFunc(); diff --git a/clang/test/SemaSYCL/variadic-func-call.cpp b/clang/test/SemaSYCL/variadic-func-call.cpp index 96b19f1d7d905..4da94418fafe6 100644 --- a/clang/test/SemaSYCL/variadic-func-call.cpp +++ b/clang/test/SemaSYCL/variadic-func-call.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsyntax-only -Wno-sycl-2017-compat -verify %s +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsyntax-only -verify %s void variadic(int, ...) 
{} namespace NS { diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 48cd2c9a4d77b..2f958f31a208f 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -522,6 +522,71 @@ static Expected convertSPIRVToIR(StringRef Filename, return *TempFileOrErr; } +// Add any sycl-post-link options that rely on a specific Triple in addition +// to user supplied options. +// NOTE: Any changes made here should be reflected in the similarly named +// function in clang/lib/Driver/ToolChains/Clang.cpp. +static void +getTripleBasedSYCLPostLinkOpts(const ArgList &Args, + SmallVector &PostLinkArgs, + const llvm::Triple Triple) { + const llvm::Triple HostTriple(Args.getLastArgValue(OPT_host_triple_EQ)); + bool SYCLNativeCPU = (HostTriple == Triple); + bool SpecConstsSupported = (!Triple.isNVPTX() && !Triple.isAMDGCN() && + !Triple.isSPIRAOT() && !SYCLNativeCPU); + if (SpecConstsSupported) + PostLinkArgs.push_back("-spec-const=native"); + else + PostLinkArgs.push_back("-spec-const=emulation"); + + // See if device code splitting is already requested. If not requested, then + // set -split=auto for non-FPGA targets. + bool NoSplit = true; + for (auto Arg : PostLinkArgs) + if (Arg.contains("-split=")) { + NoSplit = false; + break; + } + if (NoSplit && (Triple.getSubArch() != llvm::Triple::SPIRSubArch_fpga)) + PostLinkArgs.push_back("-split=auto"); + + // On Intel targets we don't need non-kernel functions as entry points, + // because it only increases amount of code for device compiler to handle, + // without any actual benefits. + // TODO: Try to extend this feature for non-Intel GPUs.
+ if ((!Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs, + OPT_sycl_remove_unused_external_funcs, false) && + !SYCLNativeCPU) && + !Triple.isNVPTX() && !Triple.isAMDGPU()) + PostLinkArgs.push_back("-emit-only-kernels-as-entry-points"); + + if (!Triple.isAMDGCN()) + PostLinkArgs.push_back("-emit-param-info"); + // Enable program metadata + if (Triple.isNVPTX() || Triple.isAMDGCN() || SYCLNativeCPU) + PostLinkArgs.push_back("-emit-program-metadata"); + + bool SplitEsimdByDefault = Triple.isSPIROrSPIRV(); + bool SplitEsimd = + Args.hasFlag(OPT_sycl_device_code_split_esimd, + OPT_no_sycl_device_code_split_esimd, SplitEsimdByDefault); + + // Symbol file and specialization constant info generation is mandatory - + // add options unconditionally + PostLinkArgs.push_back("-symbols"); + PostLinkArgs.push_back("-emit-exported-symbols"); + PostLinkArgs.push_back("-emit-imported-symbols"); + if (SplitEsimd) + PostLinkArgs.push_back("-split-esimd"); + PostLinkArgs.push_back("-lower-esimd"); + + bool IsAOT = Triple.isNVPTX() || Triple.isAMDGCN() || Triple.isSPIRAOT(); + if (Args.hasFlag(OPT_sycl_add_default_spec_consts_image, + OPT_no_sycl_add_default_spec_consts_image, false) && + IsAOT) + PostLinkArgs.push_back("-generate-device-image-default-spec-consts"); +} + // Run sycl-post-link tool static Expected runSYCLPostLink(ArrayRef InputFiles, const ArgList &Args) { @@ -536,12 +601,13 @@ static Expected runSYCLPostLink(ArrayRef InputFiles, if (!TempFileOrErr) return TempFileOrErr.takeError(); + SmallVector CmdArgs; + CmdArgs.push_back(*SYCLPostLinkPath); + const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); + getTripleBasedSYCLPostLinkOpts(Args, CmdArgs, Triple); StringRef SYCLPostLinkOptions; if (Arg *A = Args.getLastArg(OPT_sycl_post_link_options_EQ)) SYCLPostLinkOptions = A->getValue(); - - SmallVector CmdArgs; - CmdArgs.push_back(*SYCLPostLinkPath); SYCLPostLinkOptions.split(CmdArgs, " ", /* MaxSplit = */ -1, /* KeepEmpty = */ false); 
CmdArgs.push_back("-o"); diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td index fb9189d99ff87..1c93fa2e0cf1e 100644 --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -170,3 +170,18 @@ def sycl_post_link_options_EQ : Joined<["--", "-"], "sycl-post-link-options=">, def llvm_spirv_options_EQ : Joined<["--", "-"], "llvm-spirv-options=">, Flags<[WrapperOnlyOption]>, HelpText<"Options that will control llvm-spirv step">; + +// Extra SYCL options to help generate sycl-post-link options that also depend +// on the target triple. +def sycl_remove_unused_external_funcs : Flag<["--", "-"], "sycl-remove-unused-external-funcs">, + Flags<[WrapperOnlyOption, HelpHidden]>; +def no_sycl_remove_unused_external_funcs : Flag<["--", "-"], "no-sycl-remove-unused-external-funcs">, + Flags<[WrapperOnlyOption, HelpHidden]>; +def sycl_device_code_split_esimd : Flag<["--", "-"], "sycl-device-code-split-esimd">, + Flags<[WrapperOnlyOption, HelpHidden]>; +def no_sycl_device_code_split_esimd : Flag<["--", "-"], "no-sycl-device-code-split-esimd">, + Flags<[WrapperOnlyOption, HelpHidden]>; +def sycl_add_default_spec_consts_image : Flag<["--", "-"], "sycl-add-default-spec-consts-image">, + Flags<[WrapperOnlyOption, HelpHidden]>; +def no_sycl_add_default_spec_consts_image : Flag<["--", "-"], "no-sycl-add-default-spec-consts-image">, + Flags<[WrapperOnlyOption, HelpHidden]>; diff --git a/devops/cts_exclude_filter b/devops/cts_exclude_filter index 318c23eea2168..cebc93ef30fd0 100644 --- a/devops/cts_exclude_filter +++ b/devops/cts_exclude_filter @@ -5,4 +5,3 @@ marray math_builtin_api # https://github.com/intel/llvm/issues/13574 hierarchical -accessor diff --git a/devops/dependencies-igc-dev.json b/devops/dependencies-igc-dev.json index 39729a23fec3a..e36ef6f0c970d 100644 --- a/devops/dependencies-igc-dev.json +++ b/devops/dependencies-igc-dev.json @@ 
-1,10 +1,10 @@ { "linux": { "igc_dev": { - "github_tag": "igc-dev-4627f1f", - "version": "4627f1f", - "updated_at": "2024-05-26T23:48:05Z", - "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/1539236241/zip", + "github_tag": "igc-dev-6fe460a", + "version": "6fe460a", + "updated_at": "2024-06-24T01:03:13Z", + "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/1629761341/zip", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" } } diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 0f515afdc9875..e8655b0c2e839 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -32,7 +32,7 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS r600/libspirv/SOURCES; spirv/lib/SOURCES; spirv64/lib/SOURCES - x86_64-unknown-linux/libspirv/SOURCES + native_cpu-unknown-linux/libspirv/SOURCES ) set( LIBCLC_MIN_LLVM 3.9.0 ) @@ -42,6 +42,9 @@ set( LIBCLC_TARGETS_TO_BUILD "all" option( ENABLE_RUNTIME_SUBNORMAL "Enable runtime linking of subnormal support." OFF ) +set( LIBCLC_NATIVECPU_FLAGS_X86_64 "" + CACHE STRING "Semicolon-separated list of compiler flags for x86_64 libclc target.") + if( LIBCLC_STANDALONE_BUILD OR CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR ) # Out-of-tree configuration set( LIBCLC_STANDALONE_BUILD TRUE ) @@ -156,6 +159,12 @@ if( LIBCLC_TARGETS_TO_BUILD STREQUAL "all" ) set( LIBCLC_TARGETS_TO_BUILD ${LIBCLC_TARGETS_ALL} ) endif() +option( LIBCLC_NATIVECPU_HOST_TARGET "Build libclc for Native CPU using the host triple." Off) + +if( LIBCLC_NATIVECPU_HOST_TARGET ) + list(APPEND LIBCLC_TARGETS_TO_BUILD ${LLVM_TARGET_TRIPLE}) +endif() + list( SORT LIBCLC_TARGETS_TO_BUILD ) # Verify that the user hasn't requested mesa3d targets without an available @@ -195,6 +204,7 @@ set( spirv-mesa3d-_devices none ) set( spirv64-mesa3d-_devices none ) # TODO: Does this need to be set for each possible triple? 
set( x86_64-unknown-linux-gnu_devices none ) +set( aarch64-unknown-linux-gnu_devices none ) # Setup aliases set( cedar_aliases palm sumo sumo2 redwood juniper ) @@ -272,6 +282,8 @@ else(LIBCLC_STANDALONE_BUILD) endif(LIBCLC_STANDALONE_BUILD) file( TO_CMAKE_PATH ${LIBCLC_LIBRARY_OUTPUT_INTDIR}/clc LIBCLC_LIBRARY_OUTPUT_INTDIR ) +set(NATIVECPU_SUPPORTED_ARCH "x86_64;aarch64") + foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) message( STATUS "libclc target '${t}' is enabled" ) string( REPLACE "-" ";" TRIPLE ${t} ) @@ -297,6 +309,18 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( DARCH ${ARCH} ) endif() + set(IS_NATIVE_CPU_ARCH FALSE) + if( ARCH IN_LIST NATIVECPU_SUPPORTED_ARCH ) + set(IS_NATIVE_CPU_ARCH TRUE) + endif() + + if( IS_NATIVE_CPU_ARCH AND OS STREQUAL linux) + LIST( APPEND dirs native_cpu-unknown-linux ) + elseif( IS_NATIVE_CPU_ARCH AND NOT OS STREQUAL linux ) + message(WARNING "libclc is being built for an unsupported ARCH/OS" + " configuration, some SYCL programs may fail to build.") + endif() + set( lib_files ) set( lib_gen_files ) libclc_configure_lib_source(lib_files lib_gen_files @@ -332,11 +356,11 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) # AMDGCN needs libclc to be compiled to high bc version since all atomic # clang builtins need to be accessible list( APPEND flags -mcpu=gfx940 -mllvm --amdgpu-oclc-reflect-enable=false ) - elseif( ARCH STREQUAL x86_64) - # TODO: This is used by SYCL Native Cpu, we should define an option to set this flags - list( APPEND flags -Xclang -target-feature -Xclang +avx - -Xclang -fsycl-is-native-cpu - -Xclang -target-feature -Xclang +avx512f) + elseif( IS_NATIVE_CPU_ARCH ) + list( APPEND flags -Xclang -fsycl-is-native-cpu ) + if( ARCH STREQUAL x86_64 ) + list( APPEND flags ${LIBCLC_NATIVECPU_FLAGS_X86_64}) + endif() endif() endif() @@ -369,8 +393,8 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( has_distinct_generic_addrspace FALSE ) elseif( ARCH STREQUAL amdgcn ) set( opt_flags -O3 --amdgpu-oclc-reflect-enable=false ) - elseif( 
ARCH STREQUAL x86_64) - set( opt_flags ) + elseif( IS_NATIVE_CPU_ARCH ) + set( opt_flags -O3 ) set( has_distinct_generic_addrspace FALSE ) else() set( opt_flags -O3 ) @@ -391,6 +415,9 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) "+__opencl_c_3d_image_writes," "+__opencl_c_images," "+cl_khr_3d_image_writes") + if( ARCH STREQUAL "aarch64") + string( APPEND CL_3_0_EXTENSIONS ",+cl_clang_storage_class_specifiers,+__opencl_c_fp64,+cl_khr_int64_base_atomics" ) + endif() if( supports_generic_addrspace ) string( APPEND CL_3_0_EXTENSIONS ",+__opencl_c_generic_address_space" ) if( has_distinct_generic_addrspace ) diff --git a/libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl b/libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl index 16d5a06acc530..0d09c0f49e313 100644 --- a/libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl +++ b/libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl @@ -10,42 +10,56 @@ #include #include -#define BUILTIN_FENCE(semantics, scope_memory) \ - if (semantics & Acquire) \ - return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, scope_memory); \ - else if (semantics & Release) \ - return __builtin_amdgcn_fence(__ATOMIC_RELEASE, scope_memory); \ - else if (semantics & AcquireRelease) \ - return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory); \ - else if (semantics & SequentiallyConsistent) \ - return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, scope_memory); \ - else \ - return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory); -_CLC_DEF _CLC_OVERLOAD void __mem_fence(unsigned int scope_memory, - unsigned int semantics) { +#define BUILTIN_FENCE(order, scope_memory) \ + /* None implies Monotonic (for llvm/AMDGPU), or relaxed in C++. \ + * This does not make sense as ordering argument for a fence instruction \ + * and is not part of the supported orderings for a fence in AMDGPU. 
*/ \ + if (order != None) { \ + switch (order) { \ + case Acquire: \ + return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, scope_memory); \ + case Release: \ + return __builtin_amdgcn_fence(__ATOMIC_RELEASE, scope_memory); \ + case AcquireRelease: \ + return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory); \ + case SequentiallyConsistent: \ + return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, scope_memory); \ + default: \ + __builtin_trap(); \ + __builtin_unreachable(); \ + } \ + } + +_CLC_INLINE void builtin_fence_order(unsigned int scope_memory, + unsigned int order) { switch ((enum Scope)scope_memory) { case CrossDevice: - BUILTIN_FENCE(semantics, "") + BUILTIN_FENCE(order, "") case Device: - BUILTIN_FENCE(semantics, "agent") + BUILTIN_FENCE(order, "agent") case Workgroup: - BUILTIN_FENCE(semantics, "workgroup") + BUILTIN_FENCE(order, "workgroup") case Subgroup: - BUILTIN_FENCE(semantics, "wavefront") + BUILTIN_FENCE(order, "wavefront") case Invocation: - BUILTIN_FENCE(semantics, "singlethread") + BUILTIN_FENCE(order, "singlethread") } } #undef BUILTIN_FENCE +_CLC_DEF _CLC_OVERLOAD void __mem_fence(unsigned int scope_memory, + unsigned int semantics) { + builtin_fence_order(scope_memory, semantics & 0x1F); +} + _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int scope_memory, unsigned int semantics) { __mem_fence(scope_memory, semantics); } _CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void -__spirv_ControlBarrier(unsigned int scope_execution, unsigned scope_memory, +__spirv_ControlBarrier(unsigned int scope_execution, unsigned int scope_memory, unsigned int semantics) { if (semantics) { __mem_fence(scope_memory, semantics); diff --git a/libclc/generic/libspirv/math/acos.inc b/libclc/generic/libspirv/math/acos.inc index 4b437283f60ec..947730bead1f8 100644 --- a/libclc/generic/libspirv/math/acos.inc +++ b/libclc/generic/libspirv/math/acos.inc @@ -19,9 +19,6 @@ * precision of #4 may be better. 
*/ -// TODO: Enable half precision when atan2 is implemented -#if __CLC_FPSIZE > 16 - #if __CLC_FPSIZE == 64 #define __CLC_CONST(x) x #elif __CLC_FPSIZE == 32 @@ -38,5 +35,3 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __spirv_ocl_acos(__CLC_GENTYPE x) { } #undef __CLC_CONST - -#endif diff --git a/libclc/generic/libspirv/math/acosh.cl b/libclc/generic/libspirv/math/acosh.cl index 6945d3f6e2c8d..cb7931795466f 100644 --- a/libclc/generic/libspirv/math/acosh.cl +++ b/libclc/generic/libspirv/math/acosh.cl @@ -113,3 +113,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_acosh(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_acosh, double) #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_acosh, __builtin_acosh, half) + +#endif diff --git a/libclc/generic/libspirv/math/asin.inc b/libclc/generic/libspirv/math/asin.inc index ebacd008f0352..f32aca0fb7c5c 100644 --- a/libclc/generic/libspirv/math/asin.inc +++ b/libclc/generic/libspirv/math/asin.inc @@ -5,8 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// TODO: Enable half precision when atan2 is implemented -#if __CLC_FPSIZE > 16 #if __CLC_FPSIZE == 64 #define __CLC_CONST(x) x @@ -22,5 +20,3 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __spirv_ocl_asin(__CLC_GENTYPE x) { } #undef __CLC_CONST - -#endif diff --git a/libclc/generic/libspirv/math/asinh.cl b/libclc/generic/libspirv/math/asinh.cl index 10d206846fd45..76a32eb4ed1b5 100644 --- a/libclc/generic/libspirv/math/asinh.cl +++ b/libclc/generic/libspirv/math/asinh.cl @@ -361,3 +361,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_asinh(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_asinh, double) #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_asinh, __builtin_asinh, half) + 
+#endif diff --git a/libclc/generic/libspirv/math/atan.cl b/libclc/generic/libspirv/math/atan.cl index 4dadde766f286..f8f2fb90d40c9 100644 --- a/libclc/generic/libspirv/math/atan.cl +++ b/libclc/generic/libspirv/math/atan.cl @@ -173,3 +173,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_atan(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_atan, double); #endif // cl_khr_fp64 + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_atan, __builtin_atan, half) + +#endif diff --git a/libclc/generic/libspirv/math/atan2.cl b/libclc/generic/libspirv/math/atan2.cl index e6cce7868ff3b..f71c0188314a6 100644 --- a/libclc/generic/libspirv/math/atan2.cl +++ b/libclc/generic/libspirv/math/atan2.cl @@ -245,3 +245,11 @@ _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_atan2, double, double); #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_BINARY_BUILTIN(half, __spirv_ocl_atan2, __builtin_atan2, half, half) + +#endif diff --git a/libclc/generic/libspirv/math/cbrt.cl b/libclc/generic/libspirv/math/cbrt.cl index 98bff27b9c979..c34b91901a11f 100644 --- a/libclc/generic/libspirv/math/cbrt.cl +++ b/libclc/generic/libspirv/math/cbrt.cl @@ -144,3 +144,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_cbrt(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_cbrt, double) #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_cbrt, __builtin_cbrt, half) + +#endif diff --git a/libclc/generic/libspirv/math/clc_exp10.cl b/libclc/generic/libspirv/math/clc_exp10.cl index d5b9621aa9888..154d4f457b27b 100644 --- a/libclc/generic/libspirv/math/clc_exp10.cl +++ b/libclc/generic/libspirv/math/clc_exp10.cl @@ -150,3 +150,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) } _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_exp10, double) #endif 
+ +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __clc_exp10, __builtin_exp10, half) + +#endif diff --git a/libclc/generic/libspirv/math/clc_fmod.cl b/libclc/generic/libspirv/math/clc_fmod.cl index f84c1155b49c3..6a773d8ab082c 100644 --- a/libclc/generic/libspirv/math/clc_fmod.cl +++ b/libclc/generic/libspirv/math/clc_fmod.cl @@ -166,3 +166,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) { _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_fmod, double, double); #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_BINARY_BUILTIN(half, __clc_fmod, __builtin_fmod, half, half) + +#endif diff --git a/libclc/generic/libspirv/math/clc_hypot.cl b/libclc/generic/libspirv/math/clc_hypot.cl index d99fceccf77c7..b34a5e5107b4a 100644 --- a/libclc/generic/libspirv/math/clc_hypot.cl +++ b/libclc/generic/libspirv/math/clc_hypot.cl @@ -85,3 +85,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_hypot, double, double) #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_BINARY_BUILTIN(half, __clc_hypot, __builtin_hypot, half, half) + +#endif diff --git a/libclc/generic/libspirv/math/clc_ldexp.cl b/libclc/generic/libspirv/math/clc_ldexp.cl index be582b88445cb..6183638f388b9 100644 --- a/libclc/generic/libspirv/math/clc_ldexp.cl +++ b/libclc/generic/libspirv/math/clc_ldexp.cl @@ -130,3 +130,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) { } #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_BINARY_BUILTIN(half, __clc_ldexp, __builtin_ldexp, half, int) + +#endif diff --git a/libclc/generic/libspirv/math/clc_remainder.cl b/libclc/generic/libspirv/math/clc_remainder.cl index ccef76690571a..16e75a0a1319d 100644 --- a/libclc/generic/libspirv/math/clc_remainder.cl +++ 
b/libclc/generic/libspirv/math/clc_remainder.cl @@ -202,3 +202,12 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) { _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_remainder, double, double); #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_BINARY_BUILTIN(half, __clc_remainder, __builtin_remainder, half, + half) + +#endif diff --git a/libclc/generic/libspirv/math/copysign.cl b/libclc/generic/libspirv/math/copysign.cl index d839f9f7e88b3..6c241dd8170b0 100644 --- a/libclc/generic/libspirv/math/copysign.cl +++ b/libclc/generic/libspirv/math/copysign.cl @@ -25,13 +25,7 @@ _CLC_DEFINE_BINARY_BUILTIN(double, __spirv_ocl_copysign, __builtin_copysign, #pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_copysign(half x, half y) { - ushort sign_x = as_ushort(x) & 0x8000u; - ushort unsigned_y = as_ushort(y) & 0x7ffffu; - - return as_half((ushort)(sign_x | unsigned_y)); -} -_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_copysign, half, - half) +_CLC_DEFINE_BINARY_BUILTIN(half, __spirv_ocl_copysign, __builtin_copysign, half, + half) #endif diff --git a/libclc/generic/libspirv/math/cos.cl b/libclc/generic/libspirv/math/cos.cl index 0a47bf9956af3..b53551d0d2c90 100644 --- a/libclc/generic/libspirv/math/cos.cl +++ b/libclc/generic/libspirv/math/cos.cl @@ -62,3 +62,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_cos(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_cos, double); #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_cos, __builtin_cos, half) + +#endif diff --git a/libclc/generic/libspirv/math/cosh.cl b/libclc/generic/libspirv/math/cosh.cl index 0c737d091a0cc..ff1da9632b2e5 100644 --- a/libclc/generic/libspirv/math/cosh.cl +++ b/libclc/generic/libspirv/math/cosh.cl @@ -209,3 +209,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_cosh(double x) { 
_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_cosh, double) #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_cosh, __builtin_cosh, half) + +#endif diff --git a/libclc/generic/libspirv/math/cospi.cl b/libclc/generic/libspirv/math/cospi.cl index ec02fee7daae7..fcf7b8d9e4b16 100644 --- a/libclc/generic/libspirv/math/cospi.cl +++ b/libclc/generic/libspirv/math/cospi.cl @@ -120,3 +120,16 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_cospi(double x) { } _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_cospi, double); #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_cospi(half x) { + float f = x; + return __spirv_ocl_cospi(f); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_cospi, half) + +#endif diff --git a/libclc/generic/libspirv/math/erf.cl b/libclc/generic/libspirv/math/erf.cl index f50358917caa3..510ee76820f30 100644 --- a/libclc/generic/libspirv/math/erf.cl +++ b/libclc/generic/libspirv/math/erf.cl @@ -540,3 +540,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_erf(double y) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_erf, double); #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_erf, __builtin_erf, half) + +#endif diff --git a/libclc/generic/libspirv/math/erfc.cl b/libclc/generic/libspirv/math/erfc.cl index 9f5db45f5aa50..fb1d88d4f13ae 100644 --- a/libclc/generic/libspirv/math/erfc.cl +++ b/libclc/generic/libspirv/math/erfc.cl @@ -549,3 +549,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_erfc(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_erfc, double); #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_erfc, __builtin_erfc, half) + +#endif diff --git 
a/libclc/generic/libspirv/math/exp.cl b/libclc/generic/libspirv/math/exp.cl index 3fdc69a44fa13..2d6421f2d6f25 100644 --- a/libclc/generic/libspirv/math/exp.cl +++ b/libclc/generic/libspirv/math/exp.cl @@ -75,3 +75,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_exp(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_exp, double) #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_exp, __builtin_exp, half) + +#endif diff --git a/libclc/generic/libspirv/math/exp2.cl b/libclc/generic/libspirv/math/exp2.cl index 6e6a736722379..7720e78be7754 100644 --- a/libclc/generic/libspirv/math/exp2.cl +++ b/libclc/generic/libspirv/math/exp2.cl @@ -70,3 +70,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_exp2(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_exp2, double) #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_exp2, __builtin_exp2, half) + +#endif diff --git a/libclc/generic/libspirv/math/expm1.cl b/libclc/generic/libspirv/math/expm1.cl index 31407f8a689bd..710a67e2fe25c 100644 --- a/libclc/generic/libspirv/math/expm1.cl +++ b/libclc/generic/libspirv/math/expm1.cl @@ -149,3 +149,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_expm1(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_expm1, double) #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_expm1, __builtin_expm1, half) + +#endif diff --git a/libclc/generic/libspirv/math/fdim.cl b/libclc/generic/libspirv/math/fdim.cl index a4818b9ecf812..6385d8036c5cd 100644 --- a/libclc/generic/libspirv/math/fdim.cl +++ b/libclc/generic/libspirv/math/fdim.cl @@ -12,3 +12,13 @@ #define __CLC_BODY #include + +#include + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_BINARY_BUILTIN(half, 
__spirv_ocl_fdim, __builtin_fdim, half, half) + +#endif diff --git a/libclc/generic/libspirv/math/frexp.cl b/libclc/generic/libspirv/math/frexp.cl index 6b05fe88832b4..b893bb63ea2be 100644 --- a/libclc/generic/libspirv/math/frexp.cl +++ b/libclc/generic/libspirv/math/frexp.cl @@ -8,6 +8,7 @@ #include #include +#include #define __CLC_BODY #define __CLC_ADDRESS_SPACE private @@ -30,3 +31,22 @@ #include #undef __CLC_ADDRESS_SPACE #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#define _CLC_DEFINE_NO_VEC(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \ + ARG2_TYPE) \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ + return BUILTIN(x, y); \ + } + +_CLC_DEFINE_NO_VEC(half, __spirv_ocl_frexp, __builtin_frexp, half, global int *) +_CLC_V_V_VP_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, half, __spirv_ocl_frexp, half, global, int) +_CLC_DEFINE_NO_VEC(half, __spirv_ocl_frexp, __builtin_frexp, half, local int *) +_CLC_V_V_VP_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, half, __spirv_ocl_frexp, half, local, int) +_CLC_DEFINE_NO_VEC(half, __spirv_ocl_frexp, __builtin_frexp, half, int *) +_CLC_V_V_VP_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, half, __spirv_ocl_frexp, half, , int) + +#endif diff --git a/libclc/generic/libspirv/math/ilogb.cl b/libclc/generic/libspirv/math/ilogb.cl index 2e991afa2f50a..0387cbdc109ab 100644 --- a/libclc/generic/libspirv/math/ilogb.cl +++ b/libclc/generic/libspirv/math/ilogb.cl @@ -41,3 +41,17 @@ _CLC_OVERLOAD _CLC_DEF int __spirv_ocl_ilogb(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, __spirv_ocl_ilogb, double); #endif // cl_khr_fp64 + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEF _CLC_OVERLOAD int __spirv_ocl_ilogb(half x) { + float f = x; + return __spirv_ocl_ilogb(f); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, __spirv_ocl_ilogb, half) + + +#endif diff --git a/libclc/generic/libspirv/math/lgamma.cl b/libclc/generic/libspirv/math/lgamma.cl index 
701d898bc0538..63003e9b58ef5 100644 --- a/libclc/generic/libspirv/math/lgamma.cl +++ b/libclc/generic/libspirv/math/lgamma.cl @@ -27,3 +27,16 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_lgamma(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_lgamma, double) #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_OVERLOAD _CLC_DEF half __spirv_ocl_lgamma(half x) { + int s; + return __spirv_ocl_lgamma_r(x, &s); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_lgamma, half) + +#endif diff --git a/libclc/generic/libspirv/math/log.cl b/libclc/generic/libspirv/math/log.cl index dab1368109a1c..b9e986260c2b7 100644 --- a/libclc/generic/libspirv/math/log.cl +++ b/libclc/generic/libspirv/math/log.cl @@ -32,3 +32,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_log(double x) _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_log, double); #endif // cl_khr_fp64 + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_log, __builtin_log, half) + +#endif diff --git a/libclc/generic/libspirv/math/log10.cl b/libclc/generic/libspirv/math/log10.cl index 9a6bcc996759d..74fbd1ec112ea 100644 --- a/libclc/generic/libspirv/math/log10.cl +++ b/libclc/generic/libspirv/math/log10.cl @@ -24,3 +24,11 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_log10, float); #ifdef cl_khr_fp64 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_log10, double); #endif // cl_khr_fp64 + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_log10, __builtin_log10, half) + +#endif diff --git a/libclc/generic/libspirv/math/log1p.cl b/libclc/generic/libspirv/math/log1p.cl index b05873155f73e..ad6f94d2ecf25 100644 --- a/libclc/generic/libspirv/math/log1p.cl +++ b/libclc/generic/libspirv/math/log1p.cl @@ -166,3 +166,11 @@ _CLC_OVERLOAD _CLC_DEF double 
__spirv_ocl_log1p(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_log1p, double); #endif // cl_khr_fp64 + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_log1p, __builtin_log1p, half) + +#endif diff --git a/libclc/generic/libspirv/math/log2.cl b/libclc/generic/libspirv/math/log2.cl index 46cc5e545aa27..d8be06b9c5b4a 100644 --- a/libclc/generic/libspirv/math/log2.cl +++ b/libclc/generic/libspirv/math/log2.cl @@ -24,3 +24,11 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_log2, float); #ifdef cl_khr_fp64 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_log2, double); #endif // cl_khr_fp64 + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_log2, __builtin_log2, half) + +#endif diff --git a/libclc/generic/libspirv/math/logb.cl b/libclc/generic/libspirv/math/logb.cl index 224b7e3042618..bd62b84fe7965 100644 --- a/libclc/generic/libspirv/math/logb.cl +++ b/libclc/generic/libspirv/math/logb.cl @@ -38,3 +38,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_logb(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_logb, double) #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_logb, __builtin_logb, half) + +#endif diff --git a/libclc/generic/libspirv/math/sin.cl b/libclc/generic/libspirv/math/sin.cl index 8e4f7c06577be..679aa304dcd38 100644 --- a/libclc/generic/libspirv/math/sin.cl +++ b/libclc/generic/libspirv/math/sin.cl @@ -64,3 +64,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_sin(double x) { _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_sin, double); #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_sin, __builtin_sin, half) + +#endif diff --git a/libclc/generic/libspirv/math/sinpi.cl 
b/libclc/generic/libspirv/math/sinpi.cl index c2b273e1fcdd8..d0e7c1a830030 100644 --- a/libclc/generic/libspirv/math/sinpi.cl +++ b/libclc/generic/libspirv/math/sinpi.cl @@ -115,3 +115,16 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_sinpi(double x) _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_sinpi, double) #endif + +#ifdef cl_khr_fp16 + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_sinpi(half x) { + float f = x; + return __spirv_ocl_sinpi(f); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_sinpi, half) + +#endif diff --git a/libclc/x86_64-unknown-linux/libspirv/SOURCES b/libclc/native_cpu-unknown-linux/libspirv/SOURCES similarity index 95% rename from libclc/x86_64-unknown-linux/libspirv/SOURCES rename to libclc/native_cpu-unknown-linux/libspirv/SOURCES index b5ebcbf787bf6..3bd1204f6a449 100644 --- a/libclc/x86_64-unknown-linux/libspirv/SOURCES +++ b/libclc/native_cpu-unknown-linux/libspirv/SOURCES @@ -16,5 +16,4 @@ math/native_sqrt.cl math/rint.cl math/round.cl math/trunc.cl -shared/helpers.ll cl_khr_int64_extended_atomics/minmax_helpers.ll diff --git a/libclc/x86_64-unknown-linux/libspirv/cl_khr_int64_extended_atomics/minmax_helpers.ll b/libclc/native_cpu-unknown-linux/libspirv/cl_khr_int64_extended_atomics/minmax_helpers.ll similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/cl_khr_int64_extended_atomics/minmax_helpers.ll rename to libclc/native_cpu-unknown-linux/libspirv/cl_khr_int64_extended_atomics/minmax_helpers.ll diff --git a/libclc/native_cpu-unknown-linux/libspirv/integer/popcount.cl b/libclc/native_cpu-unknown-linux/libspirv/integer/popcount.cl new file mode 100644 index 0000000000000..448b7ed50a98d --- /dev/null +++ b/libclc/native_cpu-unknown-linux/libspirv/integer/popcount.cl @@ -0,0 +1,13 @@ +#include +#include +#include + +_CLC_DEFINE_UNARY_BUILTIN(int, __spirv_ocl_popcount, __builtin_popcount, int) +_CLC_DEFINE_UNARY_BUILTIN(uint, 
__spirv_ocl_popcount, __builtin_popcount, uint) +_CLC_DEFINE_UNARY_BUILTIN(short, __spirv_ocl_popcount, __builtin_popcount, short) +_CLC_DEFINE_UNARY_BUILTIN(ushort, __spirv_ocl_popcount, __builtin_popcount, ushort) +_CLC_DEFINE_UNARY_BUILTIN(long, __spirv_ocl_popcount, __builtin_popcount, long) +_CLC_DEFINE_UNARY_BUILTIN(ulong, __spirv_ocl_popcount, __builtin_popcount, ulong) +_CLC_DEFINE_UNARY_BUILTIN(char, __spirv_ocl_popcount, __builtin_popcount, char) +_CLC_DEFINE_UNARY_BUILTIN(uchar, __spirv_ocl_popcount, __builtin_popcount, uchar) +_CLC_DEFINE_UNARY_BUILTIN(schar, __spirv_ocl_popcount, __builtin_popcount, schar) diff --git a/libclc/x86_64-unknown-linux/libspirv/math/ceil.cl b/libclc/native_cpu-unknown-linux/libspirv/math/ceil.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/ceil.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/ceil.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/clc_sqrt.cl b/libclc/native_cpu-unknown-linux/libspirv/math/clc_sqrt.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/clc_sqrt.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/clc_sqrt.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/fabs.cl b/libclc/native_cpu-unknown-linux/libspirv/math/fabs.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/fabs.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/fabs.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/floor.cl b/libclc/native_cpu-unknown-linux/libspirv/math/floor.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/floor.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/floor.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/fma.cl b/libclc/native_cpu-unknown-linux/libspirv/math/fma.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/fma.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/fma.cl diff --git 
a/libclc/x86_64-unknown-linux/libspirv/math/helpers.h b/libclc/native_cpu-unknown-linux/libspirv/math/helpers.h similarity index 76% rename from libclc/x86_64-unknown-linux/libspirv/math/helpers.h rename to libclc/native_cpu-unknown-linux/libspirv/math/helpers.h index 0178a74ad6c96..1dec19d63414c 100644 --- a/libclc/x86_64-unknown-linux/libspirv/math/helpers.h +++ b/libclc/native_cpu-unknown-linux/libspirv/math/helpers.h @@ -1,26 +1,7 @@ #include "func.h" #include "types.h" -#ifdef NO_CLANG_BUILTINS - -#define GEN_UNARY_BUILTIN_T(NAME, TYPE) \ - _CLC_OVERLOAD TYPE __##NAME##_helper(TYPE); \ - _CLC_OVERLOAD TYPE __spirv_ocl_##NAME(TYPE n) { return __##NAME##_helper(n); } - -#define GEN_TERNARY_BUILTIN_T(NAME, TYPE) \ - _CLC_OVERLOAD TYPE __##NAME##_helper(TYPE, TYPE, TYPE); \ - _CLC_OVERLOAD TYPE __spirv_ocl_##NAME(TYPE a, TYPE b, TYPE c) { \ - return __##NAME##_helper(a, b, c); \ - } -#define GEN_UNARY_BUILTIN(NAME) \ - GEN_UNARY_BUILTIN_T(NAME, float) \ - GEN_UNARY_BUILTIN_T(NAME, double) - -#define GEN_TERNARY_BUILTIN(NAME) \ - GEN_TERNARY_BUILTIN_T(NAME, float) \ - GEN_TERNARY_BUILTIN_T(NAME, double) - -#else +#pragma OPENCL EXTENSION cl_khr_fp16 : enable #ifndef IS_NATIVE #define GETNAME(ID) __spirv_ocl_##ID @@ -54,8 +35,10 @@ return __builtin_##NAME##f(n); \ } \ _CLC_OVERLOAD double GETNAME(NAME)(double n) { return __builtin_##NAME(n); } \ + _CLC_OVERLOAD half GETNAME(NAME)(half n) { return __builtin_##NAME(n); } \ GEN_UNARY_VECTOR_BUILTIN_T(NAME, float) \ - GEN_UNARY_VECTOR_BUILTIN_T(NAME, double) + GEN_UNARY_VECTOR_BUILTIN_T(NAME, double) \ + GEN_UNARY_VECTOR_BUILTIN_T(NAME, half) #define GEN_TERNARY_VECTOR_BUILTIN(NAME, TYPE, NUM) \ _CLC_OVERLOAD TYPE##NUM GETNAME(NAME)(TYPE##NUM n1, TYPE##NUM n2, \ @@ -77,6 +60,9 @@ _CLC_OVERLOAD double GETNAME(NAME)(double n1, double n2, double n3) { \ return __builtin_##NAME(n1, n2, n3); \ } \ + _CLC_OVERLOAD half GETNAME(NAME)(half n1, half n2, half n3) { \ + return __builtin_##NAME(n1, n2, n3); \ + } \ 
GEN_TERNARY_VECTOR_BUILTIN_T(NAME, float) \ - GEN_TERNARY_VECTOR_BUILTIN_T(NAME, double) -#endif + GEN_TERNARY_VECTOR_BUILTIN_T(NAME, double) \ + GEN_TERNARY_VECTOR_BUILTIN_T(NAME, half) diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_cos.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_cos.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/native_cos.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/native_cos.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_exp.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_exp.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/native_exp.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/native_exp.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_exp2.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_exp2.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/native_exp2.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/native_exp2.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_log.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_log.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/native_log.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/native_log.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_log10.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_log10.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/native_log10.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/native_log10.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_log2.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_log2.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/native_log2.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/native_log2.cl diff --git 
a/libclc/x86_64-unknown-linux/libspirv/math/native_sin.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_sin.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/native_sin.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/native_sin.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_sqrt.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_sqrt.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/native_sqrt.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/native_sqrt.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/rint.cl b/libclc/native_cpu-unknown-linux/libspirv/math/rint.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/rint.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/rint.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/round.cl b/libclc/native_cpu-unknown-linux/libspirv/math/round.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/round.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/round.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/sqrt.cl b/libclc/native_cpu-unknown-linux/libspirv/math/sqrt.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/sqrt.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/sqrt.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/math/trunc.cl b/libclc/native_cpu-unknown-linux/libspirv/math/trunc.cl similarity index 100% rename from libclc/x86_64-unknown-linux/libspirv/math/trunc.cl rename to libclc/native_cpu-unknown-linux/libspirv/math/trunc.cl diff --git a/libclc/x86_64-unknown-linux/libspirv/integer/helpers.h b/libclc/x86_64-unknown-linux/libspirv/integer/helpers.h deleted file mode 100644 index 50e7c39cb3d23..0000000000000 --- a/libclc/x86_64-unknown-linux/libspirv/integer/helpers.h +++ /dev/null @@ -1,9 +0,0 @@ -#include "func.h" - -#define GEN_UNARY_BUILTIN_T(NAME, TYPE) \ - 
_CLC_OVERLOAD TYPE __##NAME##_helper(TYPE); \ - _CLC_OVERLOAD TYPE __spirv_ocl_##NAME(TYPE n) { return __##NAME##_helper(n); } - -#define GEN_UNARY_BUILTIN(NAME) \ - GEN_UNARY_BUILTIN_T(NAME, int) \ - GEN_UNARY_BUILTIN_T(NAME, signed char) diff --git a/libclc/x86_64-unknown-linux/libspirv/integer/popcount.cl b/libclc/x86_64-unknown-linux/libspirv/integer/popcount.cl deleted file mode 100644 index fae953de0c340..0000000000000 --- a/libclc/x86_64-unknown-linux/libspirv/integer/popcount.cl +++ /dev/null @@ -1,3 +0,0 @@ -#include "helpers.h" - -GEN_UNARY_BUILTIN(popcount) diff --git a/libclc/x86_64-unknown-linux/libspirv/shared/helpers.ll b/libclc/x86_64-unknown-linux/libspirv/shared/helpers.ll deleted file mode 100644 index b3d7d5e2daa9f..0000000000000 --- a/libclc/x86_64-unknown-linux/libspirv/shared/helpers.ll +++ /dev/null @@ -1,17 +0,0 @@ -declare i32 @llvm.ctpop.i32(i32 %n) -declare i8 @llvm.ctpop.i8(i8 %n) - - -define dso_local i32 @_Z17__popcount_helperi(i32 %x) { -entry: - %call = call i32 @llvm.ctpop.i32(i32 %x) - ret i32 %call -} - - -define dso_local i8 @_Z17__popcount_helpera(i8 %x) { -entry: - %call = call i8 @llvm.ctpop.i8(i8 %x) - ret i8 %call -} - diff --git a/libdevice/bfloat16_wrapper.cpp b/libdevice/bfloat16_wrapper.cpp index a0b6b96d4a293..4d2902420f1b0 100644 --- a/libdevice/bfloat16_wrapper.cpp +++ b/libdevice/bfloat16_wrapper.cpp @@ -11,6 +11,8 @@ #if defined(__SPIR__) || defined(__SPIRV__) #include +#include +#include #include DEVICE_EXTERN_C_INLINE @@ -23,4 +25,42 @@ float __devicelib_ConvertBF16ToFINTEL(const uint16_t &x) { return __spirv_ConvertBF16ToFINTEL(x); } +// For vector of size 1. 
+DEVICE_EXTERN_C_INLINE +void __devicelib_ConvertFToBF16INTELVec1(const float *src, uint16_t *dst) { + dst[0] = __spirv_ConvertFToBF16INTEL(src[0]); +} +DEVICE_EXTERN_C_INLINE +void __devicelib_ConvertBF16ToFINTELVec1(const uint16_t *src, float *dst) { + dst[0] = __spirv_ConvertBF16ToFINTEL(src[0]); +} + +// Generate the conversion functions for vector of size 2, 3, 4, 8, 16. +#define GenerateConvertFunctionForVec(size) \ + DEVICE_EXTERN_C_INLINE \ + void __devicelib_ConvertFToBF16INTELVec##size(const float *src, \ + uint16_t *dst) { \ + __ocl_vec_t x = \ + *__builtin_bit_cast(const __ocl_vec_t *, src); \ + __ocl_vec_t y = __spirv_ConvertFToBF16INTEL(x); \ + *__builtin_bit_cast(__ocl_vec_t *, dst) = y; \ + } \ + DEVICE_EXTERN_C_INLINE \ + void __devicelib_ConvertBF16ToFINTELVec##size(const uint16_t *src, \ + float *dst) { \ + __ocl_vec_t x = \ + *__builtin_bit_cast(const __ocl_vec_t *, src); \ + __ocl_vec_t y = __spirv_ConvertBF16ToFINTEL(x); \ + *__builtin_bit_cast(__ocl_vec_t *, dst) = y; \ + } + +// clang-format off +GenerateConvertFunctionForVec(2) +GenerateConvertFunctionForVec(3) +GenerateConvertFunctionForVec(4) +GenerateConvertFunctionForVec(8) +GenerateConvertFunctionForVec(16) +// clang-format on +#undef GenerateConvertFunctionForVec + #endif // __SPIR__ || __SPIRV__ diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake index e8c96d0099823..095b7a0cd1583 100644 --- a/libdevice/cmake/modules/SYCLLibdevice.cmake +++ b/libdevice/cmake/modules/SYCLLibdevice.cmake @@ -91,7 +91,7 @@ function(add_devicelib_obj obj_filename) set(devicelib-obj-file-new-offload ${obj_new_offload_binary_dir}/${obj_filename}.${new-offload-lib-suffix}) add_custom_command(OUTPUT ${devicelib-obj-file-new-offload} - COMMAND ${clang} -fsycl -c --offload-new-driver + COMMAND ${clang} -fsycl -c --offload-new-driver -foffload-lto=thin ${compile_opts} ${sycl_targets_opt} ${OBJ_EXTRA_ARGS} ${CMAKE_CURRENT_SOURCE_DIR}/${OBJ_SRC} -o 
${devicelib-obj-file-new-offload} @@ -159,7 +159,12 @@ set(imf_obj_deps device_imf.hpp imf_half.hpp imf_bf16.hpp imf_rounding_op.hpp im set(itt_obj_deps device_itt.h spirv_vars.h device.h sycl-compiler) set(bfloat16_obj_deps sycl-headers sycl-compiler) if (NOT MSVC) - set(sanitizer_obj_deps device.h atomic.hpp spirv_vars.h include/sanitizer_utils.hpp include/spir_global_var.hpp sycl-compiler) + set(sanitizer_obj_deps + device.h atomic.hpp spirv_vars.h + include/asan_libdevice.hpp + include/sanitizer_utils.hpp + include/spir_global_var.hpp + sycl-compiler) endif() add_devicelib(libsycl-itt-stubs SRC itt_stubs.cpp DEP ${itt_obj_deps}) @@ -219,7 +224,8 @@ set(imf_host_cxx_flags -c ) if (NOT WIN32) - list(APPEND imf_host_cxx_flags -fPIC) + list(APPEND imf_host_cxx_flags -fPIC -fcf-protection=full) + list(APPEND imf_host_cxx_flags -fcf-protection=full) endif() add_custom_command(OUTPUT ${imf_fp32_fallback_src} @@ -270,7 +276,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf.${new-offload-lib-suffix} - COMMAND ${clang} -fsycl -c --offload-new-driver + COMMAND ${clang} -fsycl -c --offload-new-driver -foffload-lto=thin ${compile_opts} ${sycl_targets_opt} ${imf_fp32_fallback_src} -I ${CMAKE_CURRENT_SOURCE_DIR}/imf -o ${obj_binary_dir}/libsycl-fallback-imf.${new-offload-lib-suffix} @@ -286,7 +292,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp32-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp32-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin -I ${CMAKE_CURRENT_SOURCE_DIR}/imf ${imf_fp32_fallback_src} -o ${obj_binary_dir}/fallback-imf-fp32-host.${new-offload-lib-suffix} @@ -321,7 +327,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-fp64.${lib-suff 
add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-fp64.${new-offload-lib-suffix} COMMAND ${clang} -fsycl -c -I ${CMAKE_CURRENT_SOURCE_DIR}/imf - --offload-new-driver + --offload-new-driver -foffload-lto=thin ${compile_opts} ${sycl_targets_opt} ${imf_fp64_fallback_src} -o ${obj_binary_dir}/libsycl-fallback-imf-fp64.${new-offload-lib-suffix} @@ -337,7 +343,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp64-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp64-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin -I ${CMAKE_CURRENT_SOURCE_DIR}/imf ${imf_fp64_fallback_src} -o ${obj_binary_dir}/fallback-imf-fp64-host.${new-offload-lib-suffix} @@ -372,7 +378,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-bf16.${lib-suff add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-bf16.${new-offload-lib-suffix} COMMAND ${clang} -fsycl -c -I ${CMAKE_CURRENT_SOURCE_DIR}/imf - --offload-new-driver + --offload-new-driver -foffload-lto=thin ${compile_opts} ${sycl_targets_opt} ${imf_bf16_fallback_src} -o ${obj_binary_dir}/libsycl-fallback-imf-bf16.${new-offload-lib-suffix} @@ -388,7 +394,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-bf16-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-bf16-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin -I ${CMAKE_CURRENT_SOURCE_DIR}/imf ${imf_bf16_fallback_src} -o ${obj_binary_dir}/fallback-imf-bf16-host.${new-offload-lib-suffix} @@ -437,7 +443,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp32-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp32-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} 
--offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper.cpp -o ${obj_binary_dir}/imf-fp32-host.${new-offload-lib-suffix} MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper.cpp @@ -453,7 +459,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp64-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp64-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_fp64.cpp -o ${obj_binary_dir}/imf-fp64-host.${new-offload-lib-suffix} MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_fp64.cpp @@ -469,7 +475,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/imf-bf16-host.${lib-suffix} VERBATIM) add_custom_command(OUTPUT ${obj_binary_dir}/imf-bf16-host.${new-offload-lib-suffix} - COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver + COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_bf16.cpp -o ${obj_binary_dir}/imf-bf16-host.${new-offload-lib-suffix} MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_bf16.cpp diff --git a/libdevice/cmath_wrapper.cpp b/libdevice/cmath_wrapper.cpp index e5e36045a7b8b..cf40373a90efb 100644 --- a/libdevice/cmath_wrapper.cpp +++ b/libdevice/cmath_wrapper.cpp @@ -39,6 +39,18 @@ extern "C" SYCL_EXTERNAL float __devicelib_fminf(float, float); DEVICE_EXTERN_C_INLINE float fminf(float x, float y) { return __devicelib_fminf(x, y); } +DEVICE_EXTERN_C_INLINE +float truncf(float x) { return __devicelib_truncf(x); } + +DEVICE_EXTERN_C_INLINE +float sinpif(float x) { return __devicelib_sinpif(x); } + +DEVICE_EXTERN_C_INLINE +float rsqrtf(float x) { return __devicelib_rsqrtf(x); } + +DEVICE_EXTERN_C_INLINE +float exp10f(float x) { return __devicelib_exp10f(x); } + DEVICE_EXTERN_C_INLINE div_t div(int x, 
int y) { return __devicelib_div(x, y); } diff --git a/libdevice/cmath_wrapper_fp64.cpp b/libdevice/cmath_wrapper_fp64.cpp index 5624ef2ad9b51..bfc1a122f0f18 100644 --- a/libdevice/cmath_wrapper_fp64.cpp +++ b/libdevice/cmath_wrapper_fp64.cpp @@ -36,6 +36,18 @@ extern "C" SYCL_EXTERNAL double __devicelib_fmin(double, double); DEVICE_EXTERN_C_INLINE double fmin(double x, double y) { return __devicelib_fmin(x, y); } +DEVICE_EXTERN_C_INLINE +double trunc(double x) { return __devicelib_trunc(x); } + +DEVICE_EXTERN_C_INLINE +double sinpi(double x) { return __devicelib_sinpi(x); } + +DEVICE_EXTERN_C_INLINE +double rsqrt(double x) { return __devicelib_rsqrt(x); } + +DEVICE_EXTERN_C_INLINE +double exp10(double x) { return __devicelib_exp10(x); } + DEVICE_EXTERN_C_INLINE double log(double x) { return __devicelib_log(x); } diff --git a/libdevice/device_math.h b/libdevice/device_math.h index f62c4c632f4d0..01085013dae57 100644 --- a/libdevice/device_math.h +++ b/libdevice/device_math.h @@ -76,6 +76,30 @@ float __devicelib_fminf(float x, float y); DEVICE_EXTERN_C double __devicelib_fmin(double x, double y); +DEVICE_EXTERN_C +float __devicelib_truncf(float x); + +DEVICE_EXTERN_C +double __devicelib_trunc(double x); + +DEVICE_EXTERN_C +double __devicelib_sinpi(double x); + +DEVICE_EXTERN_C +float __devicelib_sinpif(float x); + +DEVICE_EXTERN_C +double __devicelib_rsqrt(double x); + +DEVICE_EXTERN_C +float __devicelib_rsqrtf(float x); + +DEVICE_EXTERN_C +double __devicelib_exp10(double x); + +DEVICE_EXTERN_C +float __devicelib_exp10f(float x); + DEVICE_EXTERN_C div_t __devicelib_div(int x, int y); diff --git a/libdevice/fallback-bfloat16.cpp b/libdevice/fallback-bfloat16.cpp index 84015d03b35b0..4f7e35b6f2718 100644 --- a/libdevice/fallback-bfloat16.cpp +++ b/libdevice/fallback-bfloat16.cpp @@ -43,4 +43,31 @@ __devicelib_ConvertBF16ToFINTEL(const uint16_t &a) { return floatValue; } +// Generate the conversion functions for vector of size 1, 2, 3, 4, 8, 16. 
+#define GenerateConvertFunctionForVec(size) \ + DEVICE_EXTERN_C_INLINE \ + void __devicelib_ConvertFToBF16INTELVec##size(const float *src, \ + uint16_t *dst) { \ + for (int i = 0; i < size; ++i) { \ + dst[i] = __devicelib_ConvertFToBF16INTEL(src[i]); \ + } \ + } \ + DEVICE_EXTERN_C_INLINE \ + void __devicelib_ConvertBF16ToFINTELVec##size(const uint16_t *src, \ + float *dst) { \ + for (int i = 0; i < size; ++i) { \ + dst[i] = __devicelib_ConvertBF16ToFINTEL(src[i]); \ + } \ + } + +// clang-format off +GenerateConvertFunctionForVec(1) +GenerateConvertFunctionForVec(2) +GenerateConvertFunctionForVec(3) +GenerateConvertFunctionForVec(4) +GenerateConvertFunctionForVec(8) +GenerateConvertFunctionForVec(16) +// clang-format on +#undef GenerateConvertFunctionForVec + #endif // __SPIR__ || __SPIRV__ diff --git a/libdevice/fallback-cmath-fp64.cpp b/libdevice/fallback-cmath-fp64.cpp index e3db88d7db7b6..49832ef966b5f 100644 --- a/libdevice/fallback-cmath-fp64.cpp +++ b/libdevice/fallback-cmath-fp64.cpp @@ -35,6 +35,18 @@ double __devicelib_fmax(double x, double y) { return __spirv_ocl_fmax(x, y); } DEVICE_EXTERN_C_INLINE double __devicelib_fmin(double x, double y) { return __spirv_ocl_fmin(x, y); } +DEVICE_EXTERN_C_INLINE +double __devicelib_trunc(double x) { return __spirv_ocl_trunc(x); } + +DEVICE_EXTERN_C_INLINE +double __devicelib_sinpi(double x) { return __spirv_ocl_sinpi(x); } + +DEVICE_EXTERN_C_INLINE +double __devicelib_rsqrt(double x) { return __spirv_ocl_rsqrt(x); } + +DEVICE_EXTERN_C_INLINE +double __devicelib_exp10(double x) { return __spirv_ocl_exp10(x); } + DEVICE_EXTERN_C_INLINE double __devicelib_log(double x) { return __spirv_ocl_log(x); } diff --git a/libdevice/fallback-cmath.cpp b/libdevice/fallback-cmath.cpp index 1e1d0f59a9ba6..6289126272da4 100644 --- a/libdevice/fallback-cmath.cpp +++ b/libdevice/fallback-cmath.cpp @@ -45,6 +45,18 @@ float __devicelib_fmaxf(float x, float y) { return __spirv_ocl_fmax(x, y); } DEVICE_EXTERN_C_INLINE float 
__devicelib_fminf(float x, float y) { return __spirv_ocl_fmin(x, y); } +DEVICE_EXTERN_C_INLINE +float __devicelib_truncf(float x) { return __spirv_ocl_trunc(x); } + +DEVICE_EXTERN_C_INLINE +float __devicelib_sinpif(float x) { return __spirv_ocl_sinpi(x); } + +DEVICE_EXTERN_C_INLINE +float __devicelib_rsqrtf(float x) { return __spirv_ocl_rsqrt(x); } + +DEVICE_EXTERN_C_INLINE +float __devicelib_exp10f(float x) { return __spirv_ocl_exp10(x); } + DEVICE_EXTERN_C_INLINE div_t __devicelib_div(int x, int y) { return {x / y, x % y}; } diff --git a/libdevice/include/asan_libdevice.hpp b/libdevice/include/asan_libdevice.hpp index 9a1e20368fd77..3107c428df426 100644 --- a/libdevice/include/asan_libdevice.hpp +++ b/libdevice/include/asan_libdevice.hpp @@ -75,7 +75,7 @@ struct LaunchInfo { LocalArgsInfo *LocalArgs = nullptr; // ordered by ArgIndex }; -constexpr unsigned ASAN_SHADOW_SCALE = 3; +constexpr unsigned ASAN_SHADOW_SCALE = 4; constexpr unsigned ASAN_SHADOW_GRANULARITY = 1ULL << ASAN_SHADOW_SCALE; // Based on the observation, only the last 24 bits of the address of the private diff --git a/libdevice/sanitizer_utils.cpp b/libdevice/sanitizer_utils.cpp index 0ea65215a3012..959776009c9aa 100644 --- a/libdevice/sanitizer_utils.cpp +++ b/libdevice/sanitizer_utils.cpp @@ -109,17 +109,17 @@ __SYCL_PRIVATE__ void *ToPrivate(void *ptr) { } inline uptr MemToShadow_CPU(uptr addr) { - return __AsanShadowMemoryGlobalStart + (addr >> 3); + return __AsanShadowMemoryGlobalStart + (addr >> ASAN_SHADOW_SCALE); } inline uptr MemToShadow_DG2(uptr addr, uint32_t as) { uptr shadow_ptr = 0; if (addr & (~0xffffffffffff)) { - shadow_ptr = - (((addr & 0xffffffffffff) >> 3) + __AsanShadowMemoryGlobalStart) | - (~0xffffffffffff); + shadow_ptr = (((addr & 0xffffffffffff) >> ASAN_SHADOW_SCALE) + + __AsanShadowMemoryGlobalStart) | + (~0xffffffffffff); } else { - shadow_ptr = (addr >> 3) + __AsanShadowMemoryGlobalStart; + shadow_ptr = (addr >> ASAN_SHADOW_SCALE) + __AsanShadowMemoryGlobalStart; } if 
(shadow_ptr > __AsanShadowMemoryGlobalEnd) { @@ -163,8 +163,8 @@ inline uptr MemToShadow_PVC(uptr addr, uint32_t as) { if (as == ADDRESS_SPACE_GLOBAL) { // global uptr shadow_ptr; if (addr & 0xFF00000000000000) { // Device USM - shadow_ptr = __AsanShadowMemoryGlobalStart + 0x200000000000 + - ((addr & 0xFFFFFFFFFFFF) >> 3); + shadow_ptr = __AsanShadowMemoryGlobalStart + 0x80000000000 + + ((addr & 0xFFFFFFFFFFFF) >> ASAN_SHADOW_SCALE); } else { // Only consider 47bit VA shadow_ptr = __AsanShadowMemoryGlobalStart + ((addr & 0x7FFFFFFFFFFF) >> ASAN_SHADOW_SCALE); @@ -204,7 +204,7 @@ inline uptr MemToShadow_PVC(uptr addr, uint32_t as) { uptr shadow_ptr = shadow_offset + ((wg_lid * SLM_SIZE) >> ASAN_SHADOW_SCALE) + - ((addr & (SLM_SIZE - 1)) >> 3); + ((addr & (SLM_SIZE - 1)) >> ASAN_SHADOW_SCALE); if (shadow_ptr > shadow_offset_end) { if (__asan_report_out_of_shadow_bounds() && __AsanDebug) { @@ -483,12 +483,14 @@ bool __asan_region_is_value(uptr addr, uint32_t as, std::size_t size, return true; } -// NOTE: size < 8 -inline int __asan_address_is_poisoned(uptr a, uint32_t as, size_t size) { +// NOTE: size <= 16 +inline int __asan_address_is_poisoned(uptr a, uint32_t as, size_t size = 1) { auto *shadow_address = (__SYCL_GLOBAL__ s8 *)MemToShadow(a, as); if (shadow_address) { auto shadow_value = *shadow_address; if (shadow_value) { + if (size == ASAN_SHADOW_GRANULARITY) + return true; s8 last_accessed_byte = (a & (ASAN_SHADOW_GRANULARITY - 1)) + size - 1; return (last_accessed_byte >= shadow_value); } @@ -496,11 +498,6 @@ inline int __asan_address_is_poisoned(uptr a, uint32_t as, size_t size) { return false; } -// NOTE: size = 1 -inline int __asan_address_is_poisoned(uptr a, uint32_t as) { - return __asan_address_is_poisoned(a, as, 1); -} - inline uptr __asan_region_is_poisoned(uptr beg, uint32_t as, size_t size) { if (!size) return 0; @@ -564,34 +561,13 @@ inline uptr __asan_region_is_poisoned(uptr beg, uint32_t as, size_t size) { ASAN_REPORT_ERROR(load, false, 1) 
ASAN_REPORT_ERROR(load, false, 2) ASAN_REPORT_ERROR(load, false, 4) +ASAN_REPORT_ERROR(load, false, 8) +ASAN_REPORT_ERROR(load, false, 16) ASAN_REPORT_ERROR(store, true, 1) ASAN_REPORT_ERROR(store, true, 2) ASAN_REPORT_ERROR(store, true, 4) - -#define ASAN_REPORT_ERROR_BYTE(type, is_write, size) \ - DEVICE_EXTERN_C_NOINLINE void __asan_##type##size( \ - uptr addr, uint32_t as, const char __SYCL_CONSTANT__ *file, \ - uint32_t line, const char __SYCL_CONSTANT__ *func) { \ - auto *shadow_address = (__SYCL_GLOBAL__ u##size *)MemToShadow(addr, as); \ - if (shadow_address && *shadow_address) { \ - __asan_report_access_error(addr, as, size, is_write, addr, file, line, \ - func); \ - } \ - } \ - DEVICE_EXTERN_C_NOINLINE void __asan_##type##size##_noabort( \ - uptr addr, uint32_t as, const char __SYCL_CONSTANT__ *file, \ - uint32_t line, const char __SYCL_CONSTANT__ *func) { \ - auto *shadow_address = (__SYCL_GLOBAL__ u##size *)MemToShadow(addr, as); \ - if (shadow_address && *shadow_address) { \ - __asan_report_access_error(addr, as, size, is_write, addr, file, line, \ - func, true); \ - } \ - } - -ASAN_REPORT_ERROR_BYTE(load, false, 8) -ASAN_REPORT_ERROR_BYTE(load, false, 16) -ASAN_REPORT_ERROR_BYTE(store, true, 8) -ASAN_REPORT_ERROR_BYTE(store, true, 16) +ASAN_REPORT_ERROR(store, true, 8) +ASAN_REPORT_ERROR(store, true, 16) #define ASAN_REPORT_ERROR_N(type, is_write) \ DEVICE_EXTERN_C_NOINLINE void __asan_##type##N( \ diff --git a/llvm/docs/requirements-hashed.txt b/llvm/docs/requirements-hashed.txt index fdf7682926b2b..07e3ed9d19030 100644 --- a/llvm/docs/requirements-hashed.txt +++ b/llvm/docs/requirements-hashed.txt @@ -360,7 +360,7 @@ sphinxcontrib-serializinghtml==1.1.10 \ --hash=sha256:326369b8df80a7d2d8d7f99aa5ac577f51ea51556ed974e7716cfd4fca3f6cb7 \ --hash=sha256:93f3f5dc458b91b192fe10c397e324f262cf163d79f3282c158e8436a2c4511f # via sphinx -urllib3==2.2.1 \ - --hash=sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d \ - 
--hash=sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19 +urllib3==2.2.2 \ + --hash=sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472 \ + --hash=sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168 # via requests diff --git a/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h b/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h index f2c1f96b65d35..6cc29369d0c02 100644 --- a/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h +++ b/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h @@ -57,7 +57,7 @@ namespace detail { /// /// @returns \c false if the value of \c Value equals to "false", \c true /// otherwise. -inline bool toBool(StringRef Value) { return !Value.equals("false"); } +inline bool toBool(StringRef Value) { return Value != "false"; } } // namespace detail diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index f74db3c8726ba..38d5f2512a1c4 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -206,6 +206,8 @@ def : CudaTargetInfo<"nvidia_gpu_sm_89", !listconcat(CudaMinAspects, CudaBindles [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>; def : CudaTargetInfo<"nvidia_gpu_sm_90", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>; +def : CudaTargetInfo<"nvidia_gpu_sm_90a", !listconcat(CudaMinAspects, CudaBindlessImagesAspects, + [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>; // // HIP / AMDGPU device aspects diff --git a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h index 9ae433cedc668..085e424249d5c 100644 --- 
a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h +++ b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h @@ -19,6 +19,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/Function.h" #include "llvm/Support/Error.h" +#include "llvm/Support/PropertySetIO.h" #include #include @@ -196,6 +197,8 @@ class ModuleDesc { ModuleDesc clone() const; + std::string makeSymbolTable() const; + const SYCLDeviceRequirements &getOrComputeDeviceRequirements() const { if (!Reqs.has_value()) Reqs = computeDeviceRequirements(*this); @@ -270,6 +273,33 @@ void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false, const char *msg = "", int Tab = 0); #endif // NDEBUG +struct SplitModule { + std::string ModuleFilePath; + util::PropertySetRegistry Properties; + std::string Symbols; + + SplitModule() = default; + SplitModule(const SplitModule &) = default; + SplitModule &operator=(const SplitModule &) = default; + SplitModule(SplitModule &&) = default; + SplitModule &operator=(SplitModule &&) = default; + + SplitModule(std::string_view File, util::PropertySetRegistry Properties, + std::string Symbols) + : ModuleFilePath(File), Properties(std::move(Properties)), + Symbols(std::move(Symbols)) {} +}; + +struct ModuleSplitterSettings { + IRSplitMode Mode; + bool OutputAssembly = false; // Bitcode or LLVM IR. + StringRef OutputPrefix; +}; + +/// Splits the given module \p M according to the given \p Settings. 
+Expected> +splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings); + } // namespace module_split } // namespace llvm diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h b/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h index abb78b51af154..8891f7f550c5f 100644 --- a/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h +++ b/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h @@ -30,7 +30,19 @@ class PropertyValue; } struct SYCLDeviceRequirements { - std::set Aspects; + struct AspectNameValuePair { + llvm::SmallString<64> Name; + uint32_t Value; + AspectNameValuePair(StringRef Name, uint32_t Value) + : Name(Name), Value(Value) {} + bool operator<(const AspectNameValuePair &rhs) const { + return Value < rhs.Value; + } + bool operator==(const AspectNameValuePair &rhs) const { + return Value == rhs.Value; + } + }; + std::set Aspects; std::set FixedTarget; std::optional> ReqdWorkGroupSize; std::optional WorkGroupNumDim; diff --git a/llvm/include/llvm/Support/PropertySetIO.h b/llvm/include/llvm/Support/PropertySetIO.h index 93e045256ed93..bbda6c548825f 100644 --- a/llvm/include/llvm/Support/PropertySetIO.h +++ b/llvm/include/llvm/Support/PropertySetIO.h @@ -205,6 +205,7 @@ class PropertySetRegistry { static constexpr char SYCL_MISC_PROP[] = "SYCL/misc properties"; static constexpr char SYCL_ASSERT_USED[] = "SYCL/assert used"; static constexpr char SYCL_EXPORTED_SYMBOLS[] = "SYCL/exported symbols"; + static constexpr char SYCL_IMPORTED_SYMBOLS[] = "SYCL/imported symbols"; static constexpr char SYCL_DEVICE_GLOBALS[] = "SYCL/device globals"; static constexpr char SYCL_DEVICE_REQUIREMENTS[] = "SYCL/device requirements"; static constexpr char SYCL_HOST_PIPES[] = "SYCL/host pipes"; diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index 7f2edaae323a9..22ad42e836135 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -88,10 +88,12 @@ 
add_llvm_component_library(LLVMSYCLLowerIR LLVMDemangle LLVMTargetParser LLVMTransformUtils - + LINK_COMPONENTS Analysis + BitWriter Core + IRPrinter Support ipo ) diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMDSlmReservation.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMDSlmReservation.cpp index c43b5895b94fc..7dbab9e127778 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMDSlmReservation.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMDSlmReservation.cpp @@ -360,6 +360,7 @@ class ScopedCallGraph { continue; } if (CallInst *ScopeStartCI = IsScopeEnd(&I)) { + (void)ScopeStartCI; ScopeMet = true; // Scope end marker encountered - verify all enclosed scopes have // ended and truncate current scope path to the enclosing node. diff --git a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp index cf41aee46df28..900e1578c7adf 100644 --- a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp +++ b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp @@ -12,16 +12,19 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Bitcode/BitcodeWriterPass.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" +#include "llvm/IRPrinter/IRPrintingPasses.h" #include "llvm/SYCLLowerIR/DeviceGlobals.h" #include "llvm/SYCLLowerIR/LowerInvokeSimd.h" #include "llvm/SYCLLowerIR/SYCLUtils.h" #include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/GlobalDCE.h" #include "llvm/Transforms/IPO/StripDeadPrototypes.h" @@ -733,6 +736,14 @@ void EntryPointGroup::rebuild(const Module &M) { Functions.insert(const_cast(&F)); } +std::string ModuleDesc::makeSymbolTable() const { + std::string ST; + for (const Function *F : EntryPoints.Functions) + ST += (Twine(F->getName()) + "\n").str(); + + return ST; +} + namespace { // 
This is a helper class, which allows to group/categorize function based on // provided rules. It is intended to be used in device code split @@ -1143,5 +1154,62 @@ SmallVector splitByESIMD(ModuleDesc &&MD, return Result; } +static Error saveModuleIRInFile(Module &M, StringRef FilePath, + bool OutputAssembly) { + int FD = -1; + if (std::error_code EC = sys::fs::openFileForWrite(FilePath, FD)) + return errorCodeToError(EC); + + raw_fd_ostream OS(FD, true); + ModulePassManager MPM; + ModuleAnalysisManager MAM; + MAM.registerPass([&] { return PassInstrumentationAnalysis(); }); + if (OutputAssembly) + MPM.addPass(PrintModulePass(OS)); + else + MPM.addPass(BitcodeWriterPass(OS)); + + MPM.run(M, MAM); + return Error::success(); +} + +static Expected saveModuleDesc(ModuleDesc &MD, std::string Prefix, + bool OutputAssembly) { + SplitModule SM; + Prefix += OutputAssembly ? ".ll" : ".bc"; + Error E = saveModuleIRInFile(MD.getModule(), Prefix, OutputAssembly); + if (E) + return E; + + SM.ModuleFilePath = Prefix; + SM.Symbols = MD.makeSymbolTable(); + return SM; +} + +Expected> +splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { + ModuleDesc MD = std::move(M); // makeModuleDesc() ? + // FIXME: false arguments are temporary for now. 
+ auto Splitter = + getDeviceCodeSplitter(std::move(MD), Settings.Mode, false, false); + size_t ID = 0; + std::vector OutputImages; + while (Splitter->hasMoreSplits()) { + ModuleDesc MD2 = Splitter->nextSplit(); + MD2.fixupLinkageOfDirectInvokeSimdTargets(); + + std::string OutIRFileName = (Settings.OutputPrefix + "_" + Twine(ID)).str(); + auto SplittedImageOrErr = + saveModuleDesc(MD2, OutIRFileName, Settings.OutputAssembly); + if (!SplittedImageOrErr) + return SplittedImageOrErr.takeError(); + + OutputImages.emplace_back(std::move(*SplittedImageOrErr)); + ++ID; + } + + return OutputImages; +} + } // namespace module_split } // namespace llvm diff --git a/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp b/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp index 8ebec7f54013d..60424c04027fa 100644 --- a/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp @@ -43,19 +43,20 @@ llvm::computeDeviceRequirements(const module_split::ModuleDesc &MD) { // Process all functions in the module for (const Function &F : MD.getModule()) { if (auto *MDN = F.getMetadata("sycl_used_aspects")) { - for (auto &MDOp : MDN->operands()) { - int64_t Val; - if (auto Pair = dyn_cast(MDOp)) { + for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) { + StringRef AspectName = ""; + int64_t AspectValue; + if (auto Pair = dyn_cast(MDN->getOperand(I))) { assert(Pair->getNumOperands() == 2); - Val = mdconst::extract(Pair->getOperand(1)) - ->getZExtValue(); + AspectName = ExtractStringFromMDNodeOperand(Pair, 0); + AspectValue = ExtractSignedIntegerFromMDNodeOperand(Pair, 1); } else { - Val = mdconst::extract(MDOp)->getZExtValue(); + AspectValue = ExtractSignedIntegerFromMDNodeOperand(MDN, I); } // Don't put internal aspects (with negative integer value) into the // requirements, they are used only for device image splitting. 
- if (Val >= 0) - Reqs.Aspects.insert(Val); + if (AspectValue >= 0) + Reqs.Aspects.insert({AspectName, uint32_t(AspectValue)}); } } @@ -133,8 +134,11 @@ std::map SYCLDeviceRequirements::asMap() const { // For all properties except for "aspects", we'll only add the // value to the map if the corresponding value from // SYCLDeviceRequirements has a value/is non-empty. - Requirements["aspects"] = - std::vector(Aspects.begin(), Aspects.end()); + std::vector AspectValues; + AspectValues.reserve(Aspects.size()); + for (auto Aspect : Aspects) + AspectValues.push_back(Aspect.Value); + Requirements["aspects"] = std::move(AspectValues); if (!FixedTarget.empty()) Requirements["fixed_target"] = diff --git a/llvm/lib/Support/PropertySetIO.cpp b/llvm/lib/Support/PropertySetIO.cpp index 96593d4aa26be..f14f8cd5b16cb 100644 --- a/llvm/lib/Support/PropertySetIO.cpp +++ b/llvm/lib/Support/PropertySetIO.cpp @@ -202,6 +202,7 @@ constexpr char PropertySetRegistry::SYCL_PROGRAM_METADATA[]; constexpr char PropertySetRegistry::SYCL_MISC_PROP[]; constexpr char PropertySetRegistry::SYCL_ASSERT_USED[]; constexpr char PropertySetRegistry::SYCL_EXPORTED_SYMBOLS[]; +constexpr char PropertySetRegistry::SYCL_IMPORTED_SYMBOLS[]; constexpr char PropertySetRegistry::SYCL_DEVICE_GLOBALS[]; constexpr char PropertySetRegistry::SYCL_DEVICE_REQUIREMENTS[]; constexpr char PropertySetRegistry::SYCL_HOST_PIPES[]; diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index 7a965dbed6c96..6e2c59109e10c 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -147,6 +147,7 @@ set(LLVM_TEST_DEPENDS sanstats spirv-to-ir-wrapper sycl-post-link + sycl-module-split split-file verify-uselistorder yaml-bench diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index c26ee26caa8e2..1d35fdaa55bbe 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -244,6 +244,7 @@ def get_asan_rtlib(): "sanstats", "llvm-remarkutil", "spirv-to-ir-wrapper", + "sycl-module-split", ] ) diff --git 
a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll index 09261f7f61088..0583cfde3af23 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll @@ -5,6 +5,13 @@ ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; By default auto mode is equal to source mode +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-TXT + target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spir64-unknown-linux" diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll index e911800bf429a..4ff2095f42bbb 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll @@ -10,6 +10,12 @@ ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-TXT + target datalayout = 
"e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spir64-unknown-linux" diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll index f5915c7ac57b6..a5c62a5912338 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll @@ -14,6 +14,18 @@ ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-SYM ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-SYM ; +; +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0-IR \ +; RUN: --implicit-check-not TU0_kernel --implicit-check-not _Z3foov \ +; RUN: --implicit-check-not _Z4foo3v +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-TU1-IR \ +; RUN: --implicit-check-not TU1_kernel --implicit-check-not _Z4foo2v \ +; RUN: --implicit-check-not _Z4foo1v +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-SYM +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-SYM + ; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel0 ; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel1 ; diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll index 458485bf53aa6..730d9a5cd8efc 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll @@ -4,6 +4,12 @@ ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix=CHECK-IR0 ; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix=CHECK-IR1 +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix=CHECK-SYM0 +; RUN: FileCheck %s 
-input-file=%t2_1.sym --check-prefix=CHECK-SYM1 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix=CHECK-IR0 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix=CHECK-IR1 + ; This test checkes that we can properly perform device code split by tracking ; all uses of functions (not only direct calls) diff --git a/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll b/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll index 2a86625eeb27e..48d58248d0095 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll @@ -3,6 +3,12 @@ ; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT + +; RUN: sycl-module-split -split=source -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-TXT ; ModuleID = 'basic-module-split.ll' source_filename = "basic-module-split.ll" target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" diff --git a/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll b/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll index d26f97f9d70a0..064471405a58d 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll @@ -12,6 +12,17 @@ ; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ ; RUN: --implicit-check-not @kernel_C ; +; RUN: sycl-module-split -split=auto -S < %s -o 
%t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C +; ; RUN: sycl-post-link -split=source -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ ; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ @@ -23,6 +34,17 @@ ; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ ; RUN: --implicit-check-not @kernel_C ; +; RUN: sycl-module-split -split=source -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C +; ; RUN: sycl-post-link -split=kernel -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ ; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ @@ -33,6 +55,17 @@ ; RUN: --implicit-check-not @foo --implicit-check-not @bar \ ; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ ; RUN: --implicit-check-not @kernel_C +; +; RUN: sycl-module-split -split=kernel -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix 
CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C ; CHECK0-DAG: define spir_kernel void @kernel_C ; CHECK0-DAG: define spir_func i32 @bar diff --git a/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll b/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll index 715929861b356..0197a2edd4a1b 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll @@ -5,6 +5,15 @@ ; RUN: FileCheck %s -input-file=%t.files_1.sym --check-prefixes CHECK-MODULE1-TXT ; RUN: FileCheck %s -input-file=%t.files_2.ll --check-prefixes CHECK-MODULE2,CHECK ; RUN: FileCheck %s -input-file=%t.files_2.sym --check-prefixes CHECK-MODULE2-TXT +; +; RUN: sycl-module-split -split=kernel -S < %s -o %t2.files +; RUN: FileCheck %s -input-file=%t2.files_0.ll --check-prefixes CHECK-MODULE0,CHECK +; RUN: FileCheck %s -input-file=%t2.files_0.sym --check-prefixes CHECK-MODULE0-TXT +; RUN: FileCheck %s -input-file=%t2.files_1.ll --check-prefixes CHECK-MODULE1,CHECK +; RUN: FileCheck %s -input-file=%t2.files_1.sym --check-prefixes CHECK-MODULE1-TXT +; RUN: FileCheck %s -input-file=%t2.files_2.ll --check-prefixes CHECK-MODULE2,CHECK +; RUN: FileCheck %s -input-file=%t2.files_2.sym --check-prefixes CHECK-MODULE2-TXT + ; ModuleID = 'one-kernel-per-module.ll' source_filename = "one-kernel-per-module.ll" target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" diff 
--git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll index faec71a602ffd..51a2895f4d326 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll @@ -21,6 +21,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 + ; RUN: sycl-post-link -split=source -symbols -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -35,6 +49,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: sycl-module-split -split=source -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll 
--check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 + ; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -49,6 +77,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: sycl-module-split -split=kernel -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 + ; Regardless of device code split mode, each kernel should go into a 
separate ; device image diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll index 773424fa91fcb..f4d66822b261c 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll @@ -15,6 +15,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel3 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 + ; CHECK-TABLE: Code ; CHECK-TABLE-NEXT: _0.sym ; CHECK-TABLE-NEXT: _1.sym diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll index 5c1f743997816..523477a07573b 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll @@ -17,6 +17,22 @@ ; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK-M1-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not bar +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not foo --implicit-check-not 
kernel1 +; +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not foo --implicit-check-not kernel0 +; +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not foo \ +; RUN: --implicit-check-not bar +; +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not bar + ; We expect to see 3 modules generated: ; ; CHECK-TABLE: Code diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll index 282a0dd0dc79e..543a892415fa4 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll @@ -15,6 +15,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -25,6 +35,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; 
RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -39,6 +59,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \ ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 + ; CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll 
b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll index 5472093bda677..6c054fc579659 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll @@ -16,6 +16,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ ; RUN: --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel2 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K2 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 @@ -26,6 +36,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ ; RUN: --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel2 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: 
FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -40,6 +60,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \ ; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 + ; CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll index c85f636459fa2..fd64b234b2c6f 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll @@ -15,6 +15,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not 
Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -25,6 +35,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -39,6 +59,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \ ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 
--implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 + ; CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll index f13f9caf01ed7..4c4a4bc8a1a6e 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll @@ -16,6 +16,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ ; RUN: --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel2 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K2 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not 
Kernel3 @@ -26,6 +36,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ ; RUN: --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel2 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -40,6 +60,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \ ; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 + ; 
CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll index f4a312ded5c7e..fe995542deba1 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll @@ -16,6 +16,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -26,6 +36,16 @@ ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ ; RUN: --implicit-check-not Kernel3 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not 
Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 @@ -40,6 +60,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \ ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2 + ; CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll index b33aba9a2ad06..25fd2e26f3ca4 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll @@ -33,6 +33,32 @@ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ ; RUN: --implicit-check-not Kernel3 
--implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3,CHECK-IR-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K6 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 \ +; RUN: --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_3.ll --check-prefixes CHECK-IR-K4 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3,CHECK-SYMS-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K6 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 \ +; RUN: --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_3.sym --check-prefixes CHECK-SYMS-K4 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 + ; RUN: sycl-post-link -split=source -symbols -S %s 
-o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3,CHECK-IR-K5 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ @@ -59,6 +85,32 @@ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3,CHECK-IR-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K6 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 \ +; RUN: --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_3.ll --check-prefixes CHECK-IR-K4 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3,CHECK-SYMS-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K6 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 \ +; RUN: --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: 
FileCheck %s -input-file=%t2_3.sym --check-prefixes CHECK-SYMS-K4 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K6 \ ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ @@ -97,6 +149,44 @@ ; RUN: --implicit-check-not Kernel12 --implicit-check-not Kernel3 \ ; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K6 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K4 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_3.ll --check-prefixes CHECK-IR-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_4.ll --check-prefixes CHECK-IR-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_5.ll --check-prefixes CHECK-IR-K1 \ +; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 \ +; RUN:
--implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K6 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K5 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K4 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_3.sym --check-prefixes CHECK-SYMS-K3 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_4.sym --check-prefixes CHECK-SYMS-K2 \ +; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 +; RUN: FileCheck %s -input-file=%t2_5.sym --check-prefixes CHECK-SYMS-K1 \ +; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 \ +; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6 + ; CHECK-IR-K1: define {{.*}} @Kernel1 ; CHECK-IR-K2: define {{.*}} @Kernel2 ; CHECK-IR-K3: define {{.*}} @Kernel3 diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll index cd890e158c734..393943b63db43 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll +++
b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll @@ -21,6 +21,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -35,6 +49,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=kernel -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s 
-input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -49,6 +77,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=source -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; Regardless of device code split mode, each kernel should go into a separate ; device image diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll index 155b843c390a5..1efeb364cb2e3 100644 --- 
a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll @@ -16,6 +16,21 @@ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ ; RUN: --implicit-check-not kernel2 +; RUN: sycl-module-split -split=auto -S %s -o %t2 +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 +; +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ +; RUN: --implicit-check-not kernel3 + +; +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 + ; CHECK-TABLE: Code ; CHECK-TABLE-NEXT: _0.sym ; CHECK-TABLE-NEXT: _1.sym diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll index fa5ffe782a7db..b156d71b1e3f6 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll @@ -21,6 +21,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not
kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; RUN: sycl-post-link -split=source -symbols -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -35,6 +49,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=source -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -49,6 +77,20 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: 
--implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: sycl-module-split -split=kernel -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + ; Regardless of device code split mode, each kernel should go into a separate ; device image diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll index cb38a596a7ba9..c92ae8dbc9c03 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll @@ -15,6 +15,20 @@ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ ; RUN: --implicit-check-not kernel0 +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel3 \ +; RUN: --implicit-check-not kernel2 
+; +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ +; RUN: --implicit-check-not kernel0 + ; CHECK-TABLE: Code ; CHECK-TABLE-NEXT: _0.sym ; CHECK-TABLE-NEXT: _1.sym diff --git a/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll b/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll index 595427a786e7b..82213e4b3beeb 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll @@ -6,11 +6,22 @@ ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-PER-SOURCE-SYM0 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-PER-SOURCE-SYM1 ; -; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t1.table -; RUN: FileCheck %s -input-file=%t1.table --check-prefix CHECK-PER-KERNEL-TABLE -; RUN: FileCheck %s -input-file=%t1_0.sym --check-prefix CHECK-PER-KERNEL-SYM1 -; RUN: FileCheck %s -input-file=%t1_1.sym --check-prefix CHECK-PER-KERNEL-SYM2 -; RUN: FileCheck %s -input-file=%t1_2.sym --check-prefix CHECK-PER-KERNEL-SYM0 +; RUN: sycl-module-split -split=source -S < %s -o %t1 +; RUN: FileCheck %s -input-file=%t1.table --check-prefix CHECK-PER-SOURCE-TABLE +; RUN: FileCheck %s -input-file=%t1_0.sym --check-prefix CHECK-PER-SOURCE-SYM0 +; RUN: FileCheck %s -input-file=%t1_1.sym --check-prefix CHECK-PER-SOURCE-SYM1 +; +; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t2.table +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-PER-KERNEL-TABLE +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-PER-KERNEL-SYM1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-PER-KERNEL-SYM2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-PER-KERNEL-SYM0 +; +; RUN: sycl-module-split -split=kernel -S < %s -o %t3 +; RUN: FileCheck %s 
-input-file=%t3.table --check-prefix CHECK-PER-KERNEL-TABLE +; RUN: FileCheck %s -input-file=%t3_0.sym --check-prefix CHECK-PER-KERNEL-SYM1 +; RUN: FileCheck %s -input-file=%t3_1.sym --check-prefix CHECK-PER-KERNEL-SYM2 +; RUN: FileCheck %s -input-file=%t3_2.sym --check-prefix CHECK-PER-KERNEL-SYM0 ; With per-source split, there should be two device images ; CHECK-PER-SOURCE-TABLE: [Code|Properties|Symbols] diff --git a/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll b/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll index 02d289fa772e0..cb9fd1f77cf78 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll @@ -42,6 +42,9 @@ ; RUN: sycl-post-link -split=auto -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll ; +; RUN: sycl-module-split -split=auto -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2_0.ll +; ; CHECK-DAG: @_ZTV8Derived1 = {{.*}} @_ZN8Derived17displayEv ; CHECK-DAG: @_ZTV8Derived2 = {{.*}} @_ZN8Derived27displayEv ; diff --git a/llvm/test/tools/sycl-post-link/emit_imported_symbols.ll b/llvm/test/tools/sycl-post-link/emit_imported_symbols.ll new file mode 100644 index 0000000000000..ae824d293b9ea --- /dev/null +++ b/llvm/test/tools/sycl-post-link/emit_imported_symbols.ll @@ -0,0 +1,113 @@ +; This test checks that the -emit-imported-symbols option generates a list of imported symbols +; Function names were chosen so that no function with a 'inside' in their function name is imported +; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Test with -split=kernel +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; RUN: sycl-post-link -symbols -emit-imported-symbols -split=kernel -S < %s -o %t_kernel.table + +; RUN: FileCheck %s -input-file=%t_kernel_0.sym --check-prefixes CHECK-KERNEL-SYM-0 +; RUN: 
FileCheck %s -input-file=%t_kernel_1.sym --check-prefixes CHECK-KERNEL-SYM-1 +; RUN: FileCheck %s -input-file=%t_kernel_2.sym --check-prefixes CHECK-KERNEL-SYM-2 + +; RUN: FileCheck %s -input-file=%t_kernel_0.prop --check-prefixes CHECK-KERNEL-IMPORTED-SYM-0 +; RUN: FileCheck %s -input-file=%t_kernel_1.prop --check-prefixes CHECK-KERNEL-IMPORTED-SYM-1 +; RUN: FileCheck %s -input-file=%t_kernel_2.prop --check-prefixes CHECK-KERNEL-IMPORTED-SYM-2 + +; CHECK-KERNEL-SYM-0: middle +; CHECK-KERNEL-IMPORTED-SYM-0: [SYCL/imported symbols] +; CHECK-KERNEL-IMPORTED-SYM-0-NEXT: childD +; CHECK-KERNEL-IMPORTED-SYM-0-EMPTY: + +; CHECK-KERNEL-SYM-1: foo +; CHECK-KERNEL-IMPORTED-SYM-1: [SYCL/imported symbols] +; CHECK-KERNEL-IMPORTED-SYM-1-NEXT: childA +; CHECK-KERNEL-IMPORTED-SYM-1-NEXT: childC +; CHECK-KERNEL-IMPORTED-SYM-1-NEXT: childD +; CHECK-KERNEL-IMPORTED-SYM-1-EMPTY: + + +; CHECK-KERNEL-SYM-2: bar +; CHECK-KERNEL-IMPORTED-SYM-2: [SYCL/imported symbols] +; CHECK-KERNEL-IMPORTED-SYM-2-NEXT: childB +; CHECK-KERNEL-IMPORTED-SYM-2-NEXT: childC +; CHECK-KERNEL-IMPORTED-SYM-2-NEXT: childD +; CHECK-KERNEL-IMPORTED-SYM-2-NEXT: _Z7outsidev +; CHECK-KERNEL-IMPORTED-SYM-2-EMPTY: + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Test with -split=source +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; RUN: sycl-post-link -symbols -emit-imported-symbols -split=source -S < %s -o %t_source.table +; RUN: FileCheck %s -input-file=%t_source_0.sym --check-prefixes CHECK-SOURCE-SYM-0 +; RUN: FileCheck %s -input-file=%t_source_0.prop --check-prefixes CHECK-SOURCE-IMPORTED-SYM-0 + +; RUN: sycl-post-link -symbols -emit-imported-symbols -split=source -S < %s -o %t_source.table -O0 +; RUN: FileCheck %s -input-file=%t_source_0.sym --check-prefixes CHECK-SOURCE-SYM-0 +; RUN: FileCheck %s -input-file=%t_source_0.prop --check-prefixes 
CHECK-SOURCE-IMPORTED-SYM-0 + +; CHECK-SOURCE-SYM-0-DAG: foo +; CHECK-SOURCE-SYM-0-DAG: bar +; CHECK-SOURCE-SYM-0-DAG: middle + +; CHECK-SOURCE-IMPORTED-SYM-0: [SYCL/imported symbols] +; CHECK-SOURCE-IMPORTED-SYM-0-NEXT: childA +; CHECK-SOURCE-IMPORTED-SYM-0-NEXT: childB +; CHECK-SOURCE-IMPORTED-SYM-0-NEXT: childC +; CHECK-SOURCE-IMPORTED-SYM-0-NEXT: childD +; CHECK-SOURCE-IMPORTED-SYM-0-NEXT: _Z7outsidev +; CHECK-SOURCE-IMPORTED-SYM-0-EMPTY: + +target triple = "spir64-unknown-unknown" + +@llvm.used = appending global [2 x ptr] [ptr @foo, ptr @bar], section "llvm.metadata" + +define weak_odr spir_kernel void @foo() #0 { + call void @childA() + call void @childC() + call void @middle() + ret void +} + +define weak_odr spir_kernel void @bar() #0 { + ;; Functions that are not SYCL External (i.e. they have no sycl-module-id) cannot be imported + call spir_func void @__itt_offload_wi_start_wrapper() + + call void @childB() + call void @childC() + call void @middle() + ;; LLVM intrinsics cannot be imported + %dummy = call i8 @llvm.bitreverse.i8(i8 0) + ;; Functions with a demangled name prefixed with a '__' are not imported + call void @_Z8__insidev() + call void @_Z7outsidev() + + ;; Functions that are not SYCL External (i.e. 
they have no sycl-module-id) cannot be imported + call spir_func void @__itt_offload_wi_finish_wrapper() + ret void +} + +define void @middle() #0 { + call void @childD() + ret void +} + +declare void @childA() #1 +declare void @childB() #1 +declare void @childC() #1 +declare void @childD() #1 + +declare void @_Z7outsidev() #1 +;; Verify unused functions are not imported +declare void @insideUnusedFunction() #1 +declare void @_Z8__insidev() #1 +declare i8 @llvm.bitreverse.i8(i8) + +declare spir_func void @__itt_offload_wi_start_wrapper() +declare spir_func void @__itt_offload_wi_finish_wrapper() + +attributes #0 = { "sycl-module-id"="a.cpp" } +attributes #1 = { "sycl-module-id"="external.cpp" } diff --git a/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll b/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll index 1f014410d0a1c..7c2ab6e91b925 100644 --- a/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll +++ b/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll @@ -65,136 +65,56 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) local_unnamed_addr #0 !srcloc !65 !kernel_arg_buffer_location !66 !sycl_used_aspects !67 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 { +define spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) #0 !sycl_used_aspects !67 { entry: - %0 = load double, ptr addrspace(1) %_arg_out, align 8, !tbaa !70 + %0 = load double, ptr addrspace(1) %_arg_out, align 8 %mul.i = fmul double %0, 2.000000e-01 - store double %mul.i, ptr addrspace(1) %_arg_out, align 8, !tbaa !70 + store double %mul.i, ptr addrspace(1) %_arg_out, align 8 ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel 
void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) local_unnamed_addr #0 !srcloc !74 !kernel_arg_buffer_location !66 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 { +define spir_kernel void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) #0 { entry: - %0 = load float, ptr addrspace(1) %_arg_out, align 4, !tbaa !75 + %0 = load float, ptr addrspace(1) %_arg_out, align 4 %mul.i = fmul float %0, 0x3FC99999A0000000 - store float %mul.i, ptr addrspace(1) %_arg_out, align 4, !tbaa !75 + store float %mul.i, ptr addrspace(1) %_arg_out, align 4 ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_8() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !78 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_8() #0 !intel_reqd_sub_group_size !78 { entry: ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_16() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !79 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_16() #0 !intel_reqd_sub_group_size !79 { entry: ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_32() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !80 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_32() #0 !intel_reqd_sub_group_size !80 { entry: ret void } -; Function Attrs: mustprogress norecurse nounwind -define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_64() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !81 !sycl_fixed_targets !68 
!sycl_kernel_omit_args !68 { +define spir_kernel void @reqd_sub_group_size_kernel_64() #0 !intel_reqd_sub_group_size !81 { entry: ret void } -declare dso_local spir_func i32 @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2), ...) - attributes #0 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="double.cpp" "sycl-optlevel"="3" "uniform-work-group-size"="true" } !llvm.module.flags = !{!0, !1} !opencl.spir.version = !{!2} !spirv.Source = !{!3} -!sycl_aspects = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63} !llvm.ident = !{!64} !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{i32 7, !"frame-pointer", i32 2} !2 = !{i32 1, i32 2} !3 = !{i32 4, i32 100000} -!4 = !{!"cpu", i32 1} -!5 = !{!"gpu", i32 2} -!6 = !{!"accelerator", i32 3} -!7 = !{!"custom", i32 4} -!8 = !{!"fp16", i32 5} !9 = !{!"fp64", i32 6} -!10 = !{!"image", i32 9} -!11 = !{!"online_compiler", i32 10} -!12 = !{!"online_linker", i32 11} -!13 = !{!"queue_profiling", i32 12} -!14 = !{!"usm_device_allocations", i32 13} -!15 = !{!"usm_host_allocations", i32 14} -!16 = !{!"usm_shared_allocations", i32 15} -!17 = !{!"usm_system_allocations", i32 17} -!18 = !{!"ext_intel_pci_address", i32 18} -!19 = !{!"ext_intel_gpu_eu_count", i32 19} -!20 = !{!"ext_intel_gpu_eu_simd_width", i32 20} -!21 = !{!"ext_intel_gpu_slices", i32 21} -!22 = !{!"ext_intel_gpu_subslices_per_slice", i32 22} -!23 = !{!"ext_intel_gpu_eu_count_per_subslice", i32 23} -!24 = !{!"ext_intel_max_mem_bandwidth", i32 24} -!25 = !{!"ext_intel_mem_channel", i32 25} -!26 = !{!"usm_atomic_host_allocations", i32 26} -!27 = !{!"usm_atomic_shared_allocations", i32 27} -!28 = !{!"atomic64", i32 28} -!29 = !{!"ext_intel_device_info_uuid", 
i32 29} -!30 = !{!"ext_oneapi_srgb", i32 30} -!31 = !{!"ext_oneapi_native_assert", i32 31} -!32 = !{!"host_debuggable", i32 32} -!33 = !{!"ext_intel_gpu_hw_threads_per_eu", i32 33} -!34 = !{!"ext_oneapi_cuda_async_barrier", i32 34} -!35 = !{!"ext_oneapi_bfloat16_math_functions", i32 35} -!36 = !{!"ext_intel_free_memory", i32 36} -!37 = !{!"ext_intel_device_id", i32 37} -!38 = !{!"ext_intel_memory_clock_rate", i32 38} -!39 = !{!"ext_intel_memory_bus_width", i32 39} -!40 = !{!"emulated", i32 40} -!41 = !{!"ext_intel_legacy_image", i32 41} -!42 = !{!"ext_oneapi_bindless_images", i32 42} -!43 = !{!"ext_oneapi_bindless_images_shared_usm", i32 43} -!44 = !{!"ext_oneapi_bindless_images_1d_usm", i32 44} -!45 = !{!"ext_oneapi_bindless_images_2d_usm", i32 45} -!46 = !{!"ext_oneapi_interop_memory_import", i32 46} -!47 = !{!"ext_oneapi_interop_memory_export", i32 47} -!48 = !{!"ext_oneapi_interop_semaphore_import", i32 48} -!49 = !{!"ext_oneapi_interop_semaphore_export", i32 49} -!50 = !{!"ext_oneapi_mipmap", i32 50} -!51 = !{!"ext_oneapi_mipmap_anisotropy", i32 51} -!52 = !{!"ext_oneapi_mipmap_level_reference", i32 52} -!53 = !{!"ext_intel_esimd", i32 53} -!54 = !{!"ext_oneapi_ballot_group", i32 54} -!55 = !{!"ext_oneapi_fixed_size_group", i32 55} -!56 = !{!"ext_oneapi_opportunistic_group", i32 56} -!57 = !{!"ext_oneapi_tangle_group", i32 57} -!58 = !{!"ext_intel_matrix", i32 58} -!59 = !{!"int64_base_atomics", i32 7} -!60 = !{!"int64_extended_atomics", i32 8} -!61 = !{!"usm_system_allocator", i32 17} -!62 = !{!"usm_restricted_shared_allocations", i32 16} -!63 = !{!"host", i32 0} !64 = !{!"clang version 19.0.0git (/ws/llvm/clang a7f3a637bdd6299831f903bbed9e8d069fea5c86)"} -!65 = !{i32 233} -!66 = !{i32 -1} -!67 = !{i32 6} -!68 = !{} -!69 = !{i1 false} -!70 = !{!71, !71, i64 0} -!71 = !{!"double", !72, i64 0} -!72 = !{!"omnipotent char", !73, i64 0} -!73 = !{!"Simple C++ TBAA"} -!74 = !{i32 364} -!75 = !{!76, !76, i64 0} -!76 = !{!"float", !72, i64 0} -!77 = !{i32 529} +!67 = 
!{!9} !78 = !{i32 8} !79 = !{i32 16} !80 = !{i32 32} diff --git a/llvm/tools/sycl-module-split/CMakeLists.txt b/llvm/tools/sycl-module-split/CMakeLists.txt new file mode 100644 index 0000000000000..0c29be481e538 --- /dev/null +++ b/llvm/tools/sycl-module-split/CMakeLists.txt @@ -0,0 +1,10 @@ +set(LLVM_LINK_COMPONENTS + Core + IRReader + Support + SYCLLowerIR + ) + +add_llvm_tool(sycl-module-split + sycl-module-split.cpp + ) diff --git a/llvm/tools/sycl-module-split/sycl-module-split.cpp b/llvm/tools/sycl-module-split/sycl-module-split.cpp new file mode 100644 index 0000000000000..89d8b9e10b2b7 --- /dev/null +++ b/llvm/tools/sycl-module-split/sycl-module-split.cpp @@ -0,0 +1,130 @@ +//==-- sycl-module-split: command line tool for testing SYCL Module Splitting // +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This program can be used only to test the SYCL Module Splitting. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/SYCLLowerIR/ModuleSplitter.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/PropertySetIO.h" +#include "llvm/Support/SimpleTable.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + +using namespace llvm; +using namespace llvm::util; +using namespace module_split; + +static cl::OptionCategory SplitCategory("Split options"); + +static cl::opt InputFilename(cl::Positional, cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +static cl::opt + OutputFilenamePrefix("o", cl::desc("output filename prefix"), + cl::value_desc("filename prefix"), cl::init("output"), + cl::cat(SplitCategory)); + +cl::opt OutputAssembly{"S", cl::desc("Write output as LLVM assembly"), + cl::cat(SplitCategory)}; + +cl::opt SplitMode( + "split", cl::desc("split input module"), cl::Optional, cl::init(SPLIT_NONE), + cl::values(clEnumValN(module_split::SPLIT_PER_TU, "source", + "1 output module per source (translation unit)"), + clEnumValN(module_split::SPLIT_PER_KERNEL, "kernel", + "1 output module per kernel"), + clEnumValN(module_split::SPLIT_AUTO, "auto", + "Choose split mode automatically")), + cl::cat(SplitCategory)); + +void writeStringToFile(const std::string &Content, StringRef Path) { + std::error_code EC; + raw_fd_ostream OS(Path, EC); + if (EC) { + errs() << formatv("error opening file: {0}\n", Path); + exit(1); + } + + OS << Content << "\n"; +} + +void writePropertiesToFile(const PropertySetRegistry &Properties, + StringRef Path) { + std::error_code EC; + raw_fd_ostream OS(Path, EC); + if (EC) { + errs() << formatv("error opening file: {0}\n", Path); + exit(1); + } + + Properties.write(OS); +} + +void dumpModulesAsTable(const std::vector &SplitModules, + StringRef 
Path) { + std::vector Columns = {"Code", "Properties", "Symbols"}; + auto TableOrErr = SimpleTable::create(Columns); + if (!TableOrErr) { + errs() << "can't create a table\n"; + exit(1); + } + + std::unique_ptr Table = std::move(*TableOrErr); + for (const auto &[I, SM] : enumerate(SplitModules)) { + std::string SymbolsFile = (Twine(Path) + "_" + Twine(I) + ".sym").str(); + std::string PropertiesFile = (Twine(Path) + "_" + Twine(I) + ".prop").str(); + writePropertiesToFile(SM.Properties, PropertiesFile); + writeStringToFile(SM.Symbols, SymbolsFile); + SmallVector Row = {SM.ModuleFilePath, PropertiesFile, + SymbolsFile}; + Table->addRow(Row); + } + + std::error_code EC; + raw_fd_ostream OS((Path + ".table").str(), EC); + if (EC) { + errs() << formatv("error opening file: {0}\n", Path); + exit(1); + } + + Table->write(OS); +} + +int main(int argc, char *argv[]) { + LLVMContext C; + SMDiagnostic Err; + cl::ParseCommandLineOptions(argc, argv, "SYCL Module Splitter\n"); + + std::unique_ptr M = parseIRFile(InputFilename, Err, C); + if (!M) { + Err.print(argv[0], errs()); + return 1; + } + + ModuleSplitterSettings Settings; + Settings.Mode = SplitMode; + Settings.OutputAssembly = OutputAssembly; + Settings.OutputPrefix = OutputFilenamePrefix; + auto SplitModulesOrErr = splitSYCLModule(std::move(M), Settings); + if (!SplitModulesOrErr) { + Err.print(argv[0], errs()); + return 1; + } + + dumpModulesAsTable(*SplitModulesOrErr, OutputFilenamePrefix); +} diff --git a/llvm/tools/sycl-post-link/CMakeLists.txt b/llvm/tools/sycl-post-link/CMakeLists.txt index cfb9b1a27560f..aa98f4942edbc 100644 --- a/llvm/tools/sycl-post-link/CMakeLists.txt +++ b/llvm/tools/sycl-post-link/CMakeLists.txt @@ -1,6 +1,7 @@ set(LLVM_LINK_COMPONENTS BitWriter Core + Demangle IPO IRPrinter IRReader diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index 8f8dd8267c771..1554e81751668 100644 --- 
a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -668,6 +668,30 @@ SYCLDeviceLibFuncMap SDLMap = { DeviceLibExt::cl_intel_devicelib_bfloat16}, {"__devicelib_ConvertBF16ToFINTEL", DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertFToBF16INTELVec1", + DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertBF16ToFINTELVec1", + DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertFToBF16INTELVec2", + DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertBF16ToFINTELVec2", + DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertFToBF16INTELVec3", + DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertBF16ToFINTELVec3", + DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertFToBF16INTELVec4", + DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertBF16ToFINTELVec4", + DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertFToBF16INTELVec8", + DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertBF16ToFINTELVec8", + DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertFToBF16INTELVec16", + DeviceLibExt::cl_intel_devicelib_bfloat16}, + {"__devicelib_ConvertBF16ToFINTELVec16", + DeviceLibExt::cl_intel_devicelib_bfloat16}, }; // Each fallback device library corresponds to one bit in "require mask" which diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 6c6db956c383a..9afa25c3a6552 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -25,6 +25,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeWriterPass.h" +#include "llvm/Demangle/Demangle.h" #include "llvm/GenXIntrinsics/GenXSPIRVWriterAdaptor.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/LLVMContext.h" @@ -228,6 
+229,10 @@ cl::opt EmitExportedSymbols{"emit-exported-symbols", cl::desc("emit exported symbols"), cl::cat(PostLinkCat)}; +cl::opt EmitImportedSymbols{"emit-imported-symbols", + cl::desc("emit imported symbols"), + cl::cat(PostLinkCat)}; + cl::opt EmitOnlyKernelsAsEntryPoints{ "emit-only-kernels-as-entry-points", cl::desc("Consider only sycl_kernel functions as entry points for " @@ -250,6 +255,7 @@ struct GlobalBinImageProps { bool EmitKernelParamInfo; bool EmitProgramMetadata; bool EmitExportedSymbols; + bool EmitImportedSymbols; bool EmitDeviceGlobalPropSet; }; @@ -411,6 +417,25 @@ std::string saveModuleIR(Module &M, int I, StringRef Suff) { return OutFilename; } +bool isImportedFunction(const Function &F) { + if (!F.isDeclaration() || F.isIntrinsic() || + !llvm::sycl::utils::isSYCLExternalFunction(&F)) + return false; + + // StripDeadPrototypes is called during module splitting + // cleanup. At this point all function decls should have uses. + assert(!F.use_empty() && "Function F has no uses"); + + bool ReturnValue = true; + if (char *NameStr = itaniumDemangle(F.getName())) { + StringRef DemangledName(NameStr); + if (DemangledName.starts_with("__")) + ReturnValue = false; + free(NameStr); + } + return ReturnValue; +} + std::string saveModuleProperties(module_split::ModuleDesc &MD, const GlobalBinImageProps &GlobProps, int I, StringRef Suff) { @@ -474,10 +499,21 @@ std::string saveModuleProperties(module_split::ModuleDesc &MD, // so they won't make it into the export list. Should the check be // F->getCallingConv() != CallingConv::SPIR_KERNEL? 
if (F->getCallingConv() == CallingConv::SPIR_FUNC) { - PropSet.add(PropSetRegTy::SYCL_EXPORTED_SYMBOLS, F->getName(), true); + PropSet.add(PropSetRegTy::SYCL_EXPORTED_SYMBOLS, F->getName(), + /*PropVal=*/true); } } } + + if (GlobProps.EmitImportedSymbols) { + // record imported functions in the property set + for (const auto &F : M) { + if (isImportedFunction(F)) + PropSet.add(PropSetRegTy::SYCL_IMPORTED_SYMBOLS, F.getName(), + /*PropVal=*/true); + } + } + // Metadata names may be composite so we keep them alive until the // properties have been written. SmallVector MetadataNames; @@ -730,7 +766,8 @@ IrPropSymFilenameTriple saveModule(module_split::ModuleDesc &MD, int I, Res.Ir = saveModuleIR(MD.getModule(), I, Suffix); } GlobalBinImageProps Props = {EmitKernelParamInfo, EmitProgramMetadata, - EmitExportedSymbols, DeviceGlobals}; + EmitExportedSymbols, EmitImportedSymbols, + DeviceGlobals}; Res.Prop = saveModuleProperties(MD, Props, I, Suffix); if (DoSymGen) { @@ -1014,41 +1051,12 @@ bool isTargetCompatibleWithModule(const std::optional &Target, DeviceConfigFile::TargetTable[*Target]; const SYCLDeviceRequirements &ModuleReqs = IrMD.getOrComputeDeviceRequirements(); - // The device config file data stores the target's supported - // aspects as a vector of the strings, so we need to translate - // the values to a common format. - const NamedMDNode *Node = IrMD.getModule().getNamedMetadata("sycl_aspects"); - if (Node) { - SmallMapVector AspectNameToValue; - for (const MDNode *N : Node->operands()) { - assert(N->getNumOperands() == 2 && - "Each operand of sycl_aspects must be a pair."); - - // The aspect's name is the first operand. - const auto *AspectName = cast(N->getOperand(0)); - - // The aspect's integral value is the second operand. 
- const auto *AspectCAM = cast(N->getOperand(1)); - const Constant *AspectC = AspectCAM->getValue(); - - AspectNameToValue[AspectName->getString()] = - cast(AspectC)->getSExtValue(); - } - - // Make the set of aspects values the target supports. - SmallSet TargetAspectValueSet; - for (const auto &Aspect : TargetInfo.aspects) { - auto It = AspectNameToValue.find(Aspect); - assert(It != AspectNameToValue.end() && "Aspect value mapping unknown!"); - TargetAspectValueSet.insert(It->second); - } - // Now check to see if all the requirements of the input module - // are compatbile with the target. - for (const auto &Aspect : ModuleReqs.Aspects) { - if (!TargetAspectValueSet.contains(Aspect)) - return false; - } + // Check to see if all the requirements of the input module + // are compatbile with the target. + for (const auto &Aspect : ModuleReqs.Aspects) { + if (!is_contained(TargetInfo.aspects, Aspect.Name)) + return false; } // Check if module sub group size is compatible with the target. @@ -1278,13 +1286,14 @@ int main(int argc, char **argv) { bool DoParamInfo = EmitKernelParamInfo.getNumOccurrences() > 0; bool DoProgMetadata = EmitProgramMetadata.getNumOccurrences() > 0; bool DoExportedSyms = EmitExportedSymbols.getNumOccurrences() > 0; + bool DoImportedSyms = EmitImportedSymbols.getNumOccurrences() > 0; bool DoDeviceGlobals = DeviceGlobals.getNumOccurrences() > 0; bool DoGenerateDeviceImageWithDefaulValues = GenerateDeviceImageWithDefaultSpecConsts.getNumOccurrences() > 0; if (!DoSplit && !DoSpecConst && !DoSymGen && !DoParamInfo && - !DoProgMetadata && !DoSplitEsimd && !DoExportedSyms && !DoDeviceGlobals && - !DoLowerEsimd) { + !DoProgMetadata && !DoSplitEsimd && !DoExportedSyms && !DoImportedSyms && + !DoDeviceGlobals && !DoLowerEsimd) { errs() << "no actions specified; try --help for usage info\n"; return 1; } @@ -1318,6 +1327,11 @@ int main(int argc, char **argv) { << " -" << IROutputOnly.ArgStr << "\n"; return 1; } + if (IROutputOnly && DoImportedSyms) { + 
errs() << "error: -" << EmitImportedSymbols.ArgStr << " can't be used with" + << " -" << IROutputOnly.ArgStr << "\n"; + return 1; + } if (IROutputOnly && DoGenerateDeviceImageWithDefaulValues) { errs() << "error: -" << GenerateDeviceImageWithDefaultSpecConsts.ArgStr << " can't be used with -" << IROutputOnly.ArgStr << "\n"; diff --git a/llvm/utils/git/requirements.txt b/llvm/utils/git/requirements.txt index de84e17104954..656f64bb7315e 100644 --- a/llvm/utils/git/requirements.txt +++ b/llvm/utils/git/requirements.txt @@ -240,7 +240,7 @@ smmap==5.0.1 \ --hash=sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62 \ --hash=sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da # via gitdb -urllib3==2.2.1 \ +urllib3==2.2.2 \ --hash=sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d \ --hash=sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19 # via requests diff --git a/llvm/utils/git/requirements_formatting.txt b/llvm/utils/git/requirements_formatting.txt index 6fadbaffcb7c1..2741c03fa26b7 100644 --- a/llvm/utils/git/requirements_formatting.txt +++ b/llvm/utils/git/requirements_formatting.txt @@ -46,7 +46,7 @@ requests==2.32.0 # via pygithub toml==0.10.2 # via darker -urllib3==2.2.1 +urllib3==2.2.2 # via requests wrapt==1.16.0 # via deprecated diff --git a/mlir/utils/vscode/package-lock.json b/mlir/utils/vscode/package-lock.json index 9f4d8f51f31c6..11edbbbf968f4 100644 --- a/mlir/utils/vscode/package-lock.json +++ b/mlir/utils/vscode/package-lock.json @@ -285,11 +285,11 @@ } }, "node_modules/braces": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", - "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": 
"sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", "dependencies": { - "fill-range": "^7.0.1" + "fill-range": "^7.1.1" }, "engines": { "node": ">=8" @@ -724,9 +724,9 @@ } }, "node_modules/fill-range": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", - "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", "dependencies": { "to-regex-range": "^5.0.1" }, @@ -2208,11 +2208,11 @@ } }, "braces": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", - "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", "requires": { - "fill-range": "^7.0.1" + "fill-range": "^7.1.1" } }, "buffer": { @@ -2533,9 +2533,9 @@ } }, "fill-range": { - "version": "7.0.1", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", - "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", "requires": { "to-regex-range": "^5.0.1" } diff --git a/sycl/ReleaseNotes.md b/sycl/ReleaseNotes.md index b80e6640b9d18..bb592c570db92 100644 --- a/sycl/ReleaseNotes.md +++ b/sycl/ReleaseNotes.md @@ -1,3 +1,153 @@ +# Mar'24 release notes +Release notes for commit 
range [f4e0d3177338](https://github.com/intel/llvm/commit/f4ed132f243ab43816ebe826669d978139964df2).. [d2817d6d317db1](https://github.com/intel/llvm/commit/d2817d6d317db1143bb227168e85c409d5ab7c82) + +## New Features +### SYCL Compiler + +- Added more available CPU for `-march` option in OpenCL AOT compiler. [7911773c] +- Added support for additional AMD GPU targets. [c1ce15944] +- Supported detecting out-of-bound errors on CPU device, static local memory, and device globals via AddressSanitizer. [f331ba2063] [a14cfdd7999] +- Provide a preprocessor macro to locate the CUPTI library when XPTI tracing is enabled during compiler build. [e15ebd08] [acf89a6c90] +- Made `-fsycl-dump-device-code` save PTX files generated for the CUDA backend. [16e06ff] +- When multiple floating point accuracy-related options are specified on the CLI, made the last option take precedence over others. [69e2b91] +- Added a new `-fsycl-dump-device-code` option to dump device code generated during SYCL compilation into a user-specified directory. [96ce6ea] +- Added support for `-fsycl-link` with ahead-of-time (AOT) compilation. [22fab5a] +- Added support for `-O3` on Windows when using `clang-cl`. [0af4ac7] + +### SYCL Library + +- Implemented [ext_oneapi_kernel_compiler](https://github.com/intel/llvm/blob/096676e8d4d87475860723ed8a4d8c256bcd98c2/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler.asciidoc) SYCL extension. [096676e8] [e5826540] [67086100] +- Implemented [ext_intel_fp_control](https://github.com/intel/llvm/blob/bf8ea96f/sycl/doc/extensions/experimental/sycl_ext_intel_fp_control.asciidoc) SYCL extension. [bf8ea96f] +- Implemented [ext_oneapi_kernel_compiler_opencl](https://github.com/intel/llvm/blob/6344ead19/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler_opencl.asciidoc) SYCL extension. [6344ead19] +- Enabled kernel fusion with heterogeneous ND ranges for HIP targets. 
[e44888873] +- Enabled [ext_oneapi_graph](https://github.com/intel/llvm/blob/5d7524543/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension for OpenCL and HIP backend. [5d7524543] [897b27076] +- Supported graph partitioning for host task dependencies in [ext_oneapi_graph](https://github.com/intel/llvm/blob/d53f123a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension. [d53f123a] +- Added ESIMD APIs for stochastic rounding, property-based gather, masked-gather, and ReaD timestamp counting. [aa4e87801] [3eca2d473] [1261e0518] +- Added out-of-bounds `load`,`store`,`fill` and overloads accepting annotated pointers in [ext_oneapi_matrix](https://github.com/intel/llvm/blob/4c17a7f39/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc) SYCL extension [4c17a7f39] [f3137e99] +- Added support for `queue::mem_advise` on HIP backends. [a669374b7] [ab86d0db] +- Supported `fill` and `memset` nodes in [ext_oneapi_graph](https://github.com/intel/llvm/blob/8ea022954/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension. [8ea022954] +- Implemented [ext_oneapi_in_order_queue_events](https://github.com/intel/llvm/blob/19072756e/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc) SYCL extension. [19072756e] +- Implemented [ext_oneapi_address_cast](https://github.com/intel/llvm/blob/123705190/sycl/doc/extensions/proposed/sycl_ext_oneapi_address_cast.asciidoc) SYCL extension. [123705190] +- Implemented [ext_oneapi_kernel_compiler_spirv](https://github.com/intel/llvm/blob/36e123d3e1/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler_spirv.asciidoc) SYCL extension. [36e123d3e1] +- Implemented [ext_oneapi_composite_device](https://github.com/intel/llvm/blob/2db1a4f6a5/sycl/doc/extensions/experimental/sycl_ext_oneapi_composite_device.asciidoc) SYCL extension. 
[2db1a4f6a5] +- Implemented joint matrix query from [ext_oneapi_matrix](https://github.com/intel/llvm/blob/00eebe1e4/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc) SYCL extension on CUDA and HIP backends. [00eebe1e4] +- Added support for unsampled image arrays in [ext_oneapi_bindless_images](https://github.com/intel/llvm/blob/76ec3f0f7/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc) SYCL extension. [76ec3f0f7] +- Added `__imf_rcp64h` - equivalent to CUDA's `__nv_rcp64h` - and `sqrt` function with selectable rounding modes to Intel math libdevice. [ce70cb521] [6c1dde4243b5] +- Integrated OneAPI construction kit's vectorizer to Native CPU backend. [330ac57d6] +- Added ability to compare device architecture and support for PVC-VG to [ext_oneapi_device_architecture](https://github.com/intel/llvm/blob/68445467/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc) SYCL extension. [68445467] [ac0e142e12] +- Added `sycl::length` wrapper and a helper functions in SYCLCompat library for occupancy calculation in Intel GPUs. [b209b321] [2525570] +- Added support for SYCL barriers on Native CPU. [3c39d132a] +- Added support for `bfloat16` to `sycl::vec`. [bbbe8839] +- Added vectorized binary and unary operations through callable structs in the SYCLCompat library. [5505e03] +- Supported profiling information for default-constructed events when `ext_oneapi_barrier` is submitted to an empty in-order queue. [200694b] +- Implemented `ext_oneapi_private_alloca` by adding code generation capabilities for `private_alloca`. [f4e0d31] +- Added support for memory attributes on `non-const` device global variables on FPGA. [3bb5f40] [3fc6708] +- Added `set_default_queue` functionality to SYCLCompat library to enable changing the default queue of the current device. [e72b85c] +- Propagate annotations from `annotated_ptr` to the underlying raw pointers to enable additional optimization opportunities. 
[8f182cd] + +### Documentation +- Proposed [ext_intel_fp_control](https://github.com/intel/llvm/blob/bf8ea96f4/sycl/doc/extensions/experimental/sycl_ext_intel_fp_control.asciidoc) extension to allow specifying the rounding and denorm mode for floating-point operations in SYCL kernels. [bf8ea96f4] +- Proposed [ext_oneapi_raw_kernel_arg](https://github.com/intel/llvm/blob/4168793978/sycl/doc/extensions/proposed/sycl_ext_oneapi_raw_kernel_arg.asciidoc) SYCL extension to allow opaque types to be passed to SYCL kernels. [4168793978] +- Proposed [ext_oneapi_composite_device](https://github.com/intel/llvm/blob/9a1b9084/sycl/doc/extensions/experimental/sycl_ext_oneapi_composite_device.asciidoc) SYCL extension to allow card-level device access on PVC GPUs. [9a1b9084] +- Proposed [ext_oneapi_in_order_queue_events](https://github.com/intel/llvm/blob/19072756e/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc) SYCL extension to allow getting event from the last submitted command and setting an external event as an implicit dependence on the next command submitted to the queue [19072756e] +- Proposed [ext_oneapi_profiling_tag](https://github.com/intel/llvm/blob/b4ade420/sycl/doc/extensions/proposed/sycl_ext_oneapi_profiling_tag.asciidoc) SYCL extension to time commands submitted to the queue. [b4ade420] +- Proposed [ext_oneapi_private_alloca](https://github.com/intel/llvm/blob/aaf7a58863/sycl/doc/extensions/experimental/sycl_ext_oneapi_private_alloca.asciidoc) SYCL extension to have specialization constant-length private memory allocations. 
[aaf7a58863] +- Added `joint_matrix_prefetch` and overloads of load and store with `annotated_ptr` in [ext_intel_matrix](https://github.com/intel/llvm/blob/04a222f7bb3022f3623ad40c9de70fd97579061a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc) and [ext_oneapi_matrix](https://github.com/intel/llvm/blob/04a222f7bb3022f3623ad40c9de70fd97579061a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc) SYCL extensions. [04a222f] + +### Other changes +- Created an additional version-agnostic copy of the SYCL import library during compiler build. [2d2e418c] + +## Improvements +### SYCL Compiler +- Enabled default selection of general register file (GRF) size on Linux for PVC GPUs. [8083f8a8] +- Disabled passing `-sycl-opt` for NativeCPU to enable the original full LLVM optimization pipeline. [3fe77b9] +- Enabled `-fsycl-esimd-force-stateless-mem` flag by default. [f316273] +- Enabled `-emit-only-kernels-as-entry-points` by default on Intel backends for `sycl-post-link` to prevent device code bloating. [70fddbb] + + +### SYCL Library +- Improved error messages for invalid properties specified on non-pointer types. [728b132a5] +- Adopted a unified and scalable way to pass alignment and cache flags to all ESIMD functions. [a2208484ab] [960d898c] [5ef8df837d] [a57a96c77] [19cd6144a] [646ab086e5] [0bf2e666c] +- Added default constructor to bindless sampler and image handler in [ext_oneapi_bindless_images](https://github.com/intel/llvm/blob/d65f3aa560/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc) SYCL extension. [d65f3aa560] [7bfdcfd4cabf] +- Added `SYCL_CACHE_IN_MEM` environment variable to disable in-memory caching of programs and facilitated automatic program cache cleaning when running out of memory. [9322d14ce] [6cf1ae081ac] +- Improved templated and convertible builtins after clarification in SYCL 2020 revision 8. [92861835] +- Allowed generic_space `multi_ptr` in math builtins. 
[eda8a587f1] +- Improved error message when writing beyond the bounds of `simd_view` object. [197c33a2b] +- Optimized `ext_oneapi_submit_barrier` from [ext_oneapi_enqueue_barrier](https://github.com/intel/llvm/blob/7e08c15dd/sycl/doc/extensions/supported/sycl_ext_oneapi_enqueue_barrier.asciidoc) into `NOP` for in-order queues with empty waitlist. [7e08c15dd] +- Supported prefetch, memory advise, and automatic management of dependencies for multiple command-buffer submissions in [ext_oneapi_graph](https://github.com/intel/llvm/blob/c6fbac59/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension. [c6fbac59] [56f8d38c] +- Added support for profiling command buffers. [b04f894dbd06b] +- Implemented ESIMD APIs that accept compile-time properties. [655ab100] [5582ce4db] [d286f4ab1c] [961793913] [0cfe7e35] [656b8be7] +- Removed deprecated esimd_emulators from device filters and deprecated `SYCL_DEVICE_FILTER` in favor of `ONEAPI_DEVICE_SELECTOR`. [9d0888ca3] [8d0fa9875] +- Improved error message when trying to fuse kernels with incompatible ND-Ranges in [ext_codeplay_kernel_fusion](https://github.com/intel/llvm/blob/7d492f87ec97/sycl/doc/extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc). [7d492f87ec97] +- Made user functions always inline in the SYCL kernels to reduce overhead in SYCLCompat library. [e121c8811] +- Made runtime choose device image with inlined specialization constant when `-fsycl-add-default-spec-consts-image` option is used. [73d34739b] +- Made `nd_item` stateless to reduce initialization overhead. [7999e27b] +- Improved warning messages and added `-ignore-device-selector` flag to `sycl-ls` to ignore device selection environment variables. 
[6e3aa218] +- Improved error handling when calling `matrix_combinations` query on platforms unsupported by [ext_oneapi_device_architecture](https://github.com/intel/llvm/blob/c00305b73/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc) SYCL extension. [c00305b73] +- Made default `sycl::queue` context reusable on Windows. [491e6e4ea] +- Changed default cache hints for `prefetch` ESIMD API. [984c88c] +- Limited `bfloat16` ESIMD operations to data types convertible to `float`, as required by the SPEC. [f81b5a2] +- Removed the implicitly passed `-ze-take-global-address` IGC option as it is by default enabled on newer IGC versions. [7e414a9] +- Improved product security by ensuring that `pi_win_proxy_loader.dll` is loaded only from trusted directories. [85b7145] [218d9fe] [9c504a5] +- Aligned `sycl-ls` output with `ONEAPI_DEVICE_SELECTOR` environment variable syntax. [38ce764] [f720291] +- Improved error message when kernel compilation fails. [eba7b7e] + + +### Documentation +- Updated [ext_oneapi_kernel_compiler_opencl](https://github.com/intel/llvm/blob/6344ead19e/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler_opencl.asciidoc) SYCL extension to allow querying OpenCL version. [6344ead19e] +- Updated [ext_intel_data_flow_pipes_properties](https://github.com/intel/llvm/blob/2a0911892/sycl/doc/extensions/experimental/sycl_ext_intel_data_flow_pipes_properties.asciidoc) to include AXI streaming as a protocol choice on FPGAs. [2a0911892] +- Updated [KernelFusionJIT](https://github.com/intel/llvm/blob/b9854a12/sycl/doc/design/KernelFusionJIT.md) to include details on local/private memory allocation size, different promotion hints, etc. [b9854a12] +- Updated [ext_oneapi_in_order_queue_events](https://github.com/intel/llvm/blob/b0f584c675f9/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc) to make external events wait when queue is waited on. 
[b0f584c675f9] +- Improved [ext_oneapi_address_cast](https://github.com/intel/llvm/blob/84a92e03/sycl/doc/extensions/proposed/sycl_ext_oneapi_address_cast.asciidoc) SYCL extension to allow casting raw pointers to multi_ptr. [84a92e03] + +## Bug Fixes +### SYCL Compiler +- Made the device binary generated by `-fsycl-link=image` linkable by adding more information into the binary. [219d4ef54] +- Fixed linking error when separately compiling and linking a SYCL program with SYCL libraries. [d6eecfa] +- Fixed `clangd` parsing crash with `-fsycl` flag when using `!nullptr` asserts. [f42bbcc] + +### SYCL Library +- Fixed computation of submit time based on host timestamps. [254756369c] +- Fixed SYCL CTS failures for Unified Runtime's OpenCL adapter. [4c0780e76] +- Fixed strict aliasing violations in `sycl::vec` routines. [a9d0e1b8] +- Fixed logical operations and integer conversions among sycl::vec types. [3d5e41fddf] [ff48612f] [7868596d] +- Fixed compound operators on `annoted_ptr` when the user-defined type only defines a compound operator. [c43a90f2] +- Fixed exponential slowdown in multiple calls to `queue::ext_oneapi_submit_barrier`. [079fc97b] +- Fixed input handling for `ONEAPI_DEVICE_SELECTOR` environment variable. [90b6aee46] +- Fixed in-order dependency filtering for isolated kernels. [8e7995df] +- Fixed double-free bug in kernel-program cache. [04ff5b81] +- Fixed resource leak in `SYCL_FALLBACK_ASSERT`. [b478d2fa] +- Fixed deadlock in in-order queue when submitting a host task and simultaneously accessing stream service events. [3031733] +- Made `sycl::vec` interface consistent with `sycl::marray` and `sycl::buffer` by defining `value_type` alias. [33e5b10] +- Fix handling of enumeration specialization constants. [1f0dc36] +- Fixes `-O0 -fno-inline-functions` ESIMD failures by inlining some non-inline functions due to VC limitations. 
[89327e0] + +### Documentation +- Clarified [ext_oneapi_graph](https://github.com/intel/llvm/blob/2581123a1/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension to make it illegal for graph nodes to depend on events from outside the graph. [2581123a1] +- Updated [ext_oneapi_non_uniform_groups](https://github.com/intel/llvm/blob/90a55a5/sycl/doc/extensions/experimental/sycl_ext_oneapi_non_uniform_groups.asciidoc) to invert group numbering for ballot groups. [90a55a5] +- Updated [ext_oneapi_free_function_kernels](https://github.com/intel/llvm/blob/a452e06a0ebcbabbfecbeb2ca05675265bddbf8d/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc) to remove `range kernels` from the extension. [a452e06] + +## Known Issues +- On Windows, the Unified Runtime's Level Zero leak check does not work correctly with +the default contexts on Windows. This is because on Windows the release +of the plugin DLLs races against the release of static global variables +(like the default context). +- Intel Graphic Compiler's Vector Compute backend does not support O0 code and often gets miscompiled, produces wrong answers and crashes. This issue directly affects ESIMD code at O0. As a temporary workaround, we have optimize ESIMD code even in O0 mode. [00749b1e8](https://github.com/intel/llvm/commit/00749b1e8e3085acfdc63108f073a255842533e2) +- `multi_ptr` relational operators assume the lowest possible value of `std::null_ptr` which might cause issues with the CUDA and AMDGPU backends. This will be fixed in the next release. ([13201](https://github.com/intel/llvm/pull/13201)) +- When `-fsycl-device-code-split=off` is set, having kernels with different `reqd_work_group_size` attributes could lead to runtime errors about local size mismatching the attribute value. The issue is also reproducible when there is a kernel with `reqd_work_group_size` attribute, but other kernels don't have that attribute set. This will be fixed in the next release. 
([#13523](https://github.com/intel/llvm/pull/13523)) +- Having default-constructed `local_accessor` as unused kernel argument could lead to runtime errors during kernel arguments setting. The issue is reproducible when optimizations are explicitly disabled through `-O0`, or when optimizations failed to remove that unused kernel argument. This will be fixed in the next release. ([#13382](https://github.com/intel/llvm/pull/13382)) +- ONEAPI_DEVICE_SELECTOR incorrectly parses `!` from discard filters. This will be fixed in the next release. ([SYCL] Fix ONEAPI_DEVICE_SELECTOR handling of discard filters. #13927) + +## API/ABI breaking changes +- Renamed and removed some APIs from [ext_oneapi_free_function_queries](https://github.com/intel/llvm/commit/287fd3733#diff-4ab48d4a7f26c356939d42c6aed9c67d4d59aafac11565f3bfe71d7e053a4db4) SYCL extension. [287fd3733] + +## Upcoming API/ABI breakages +The following changes ared only in effect if the `-fpreview-breaking-changes` flag is set. +- Changed return type of `abs_diff` to be same as that of the input. [2a3e1ab82] +- Added a preview of pre-C++11 ABI support for GCC on Linux. This feature allows users to set a GCC compiler flag -D_GLIBCXX_USE_CXX11_ABI=0 to use pre-C++11 ABI. Details about GCC C++11 ABI is available at https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html. In this release, this feature is enabled under the flag -fpreview-breaking-changes, and the support is incomplete and may not work for some cases. [459e122a] +- Removed some sub-group class APIs that do not appear in SYCL 2020 Spec. 
[2985395] + + # Nov'23 release notes Release notes for commit range f4e0d3177338..f4ed132f243a diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 0f36f0d38eeb4..de14612ba53e5 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -439,17 +439,17 @@ run the following commands # Extract OpenCL CPU RT mkdir -p /opt/intel/oclcpuexp_ cd /opt/intel/oclcpuexp_ - tar -zxvf oclcpu_rt_.tar.gz + tar -zxvf oclcpuexp_.tar.gz ``` -2) Create ICD file pointing to the new runtime (requires root access) +2) Create ICD file pointing to the new runtime (requires sudo access) ```bash # OpenCL FPGA emulation RT - echo /opt/intel/oclfpgaemu_/x64/libintelocl_emu.so > + echo /opt/intel/oclfpgaemu_/x64/libintelocl_emu.so | sudo tee /etc/OpenCL/vendors/intel_fpgaemu.icd # OpenCL CPU RT - echo /opt/intel/oclcpuexp_/x64/libintelocl.so > + echo /opt/intel/oclcpuexp_/x64/libintelocl.so | sudo tee /etc/OpenCL/vendors/intel_expcpu.icd ``` @@ -469,32 +469,32 @@ folder: ```bash # OpenCL FPGA emulation RT ln -s /opt/intel/oneapi-tbb-/lib/intel64/gcc4.8/libtbb.so - /opt/intel/oclfpgaemu_/x64 + /opt/intel/oclfpgaemu_/x64/libtbb.so ln -s /opt/intel/oneapi-tbb-/lib/intel64/gcc4.8/libtbbmalloc.so - /opt/intel/oclfpgaemu_/x64 + /opt/intel/oclfpgaemu_/x64/libtbbmalloc.so ln -s /opt/intel/oneapi-tbb-/lib/intel64/gcc4.8/libtbb.so.12 - /opt/intel/oclfpgaemu_/x64 + /opt/intel/oclfpgaemu_/x64/libtbb.so.12 ln -s /opt/intel/oneapi-tbb-/lib/intel64/gcc4.8/libtbbmalloc.so.2 - /opt/intel/oclfpgaemu_/x64 + /opt/intel/oclfpgaemu_/x64/libtbbmalloc.so.2 # OpenCL CPU RT ln -s /opt/intel/oneapi-tbb-/lib/intel64/gcc4.8/libtbb.so - /opt/intel/oclcpuexp_/x64 + /opt/intel/oclcpuexp_/x64/libtbb.so ln -s /opt/intel/oneapi-tbb-/lib/intel64/gcc4.8/libtbbmalloc.so - /opt/intel/oclcpuexp_/x64 + /opt/intel/oclcpuexp_/x64/libtbbmalloc.so ln -s /opt/intel/oneapi-tbb-/lib/intel64/gcc4.8/libtbb.so.12 - /opt/intel/oclcpuexp_/x64 + /opt/intel/oclcpuexp_/x64/libtbb.so.12 ln -s 
/opt/intel/oneapi-tbb-/lib/intel64/gcc4.8/libtbbmalloc.so.2 - /opt/intel/oclcpuexp_/x64 + /opt/intel/oclcpuexp_/x64/libtbbmalloc.so.2 ``` -5) Configure library paths (requires root access) +5) Configure library paths (requires sudo access) ```bash - echo /opt/intel/oclfpgaemu_/x64 > + echo /opt/intel/oclfpgaemu_/x64 | sudo tee /etc/ld.so.conf.d/libintelopenclexp.conf - echo /opt/intel/oclcpuexp_/x64 >> + echo /opt/intel/oclcpuexp_/x64 | sudo tee -a /etc/ld.so.conf.d/libintelopenclexp.conf - ldconfig -f /etc/ld.so.conf.d/libintelopenclexp.conf + sudo ldconfig -f /etc/ld.so.conf.d/libintelopenclexp.conf ``` **Windows (64-bit)**: diff --git a/sycl/doc/UsersManual.md b/sycl/doc/UsersManual.md index e354f2e605a6f..638d4e2c2f080 100644 --- a/sycl/doc/UsersManual.md +++ b/sycl/doc/UsersManual.md @@ -91,6 +91,7 @@ and not recommended to use in production environment. * nvidia_gpu_sm_87 - NVIDIA Jetson/Drive AGX Orin architecture * nvidia_gpu_sm_89 - NVIDIA Ada Lovelace architecture * nvidia_gpu_sm_90 - NVIDIA Hopper architecture + * nvidia_gpu_sm_90a - NVIDIA Hopper architecture (with wgmma and setmaxnreg instructions) * amd_gpu_gfx700 - AMD GCN GFX7 (Sea Islands (CI)) architecture * amd_gpu_gfx701 - AMD GCN GFX7 (Sea Islands (CI)) architecture * amd_gpu_gfx702 - AMD GCN GFX7 (Sea Islands (CI)) architecture diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 9519067a00484..f36c40af07403 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -438,6 +438,24 @@ Level Zero: Future work will include exploring L0 API extensions to improve the mapping of UR command-buffer to L0 command-list. +#### Copy Engine + +For performance considerations, the Unified Runtime Level Zero adapter uses +different Level Zero command-queues to submit compute kernels and memory +operations when the device has a dedicated copy engine. 
To take advantage of the +copy engine when available, the graph workload can also be split between memory +operations and compute kernels. To achieve this, two graph workload +command-lists live simultaneously in a command-buffer. + +When the command-buffer is finalized, memory operations (e.g. buffer copy, +buffer fill, ...) are enqueued in the *copy* command-list while the other +commands are enqueued in the compute command-list. On submission, if not empty, +the *copy* command-list is sent to the main copy command-queue while the compute +command-list is sent to the compute command-queue. + +Both are executed concurrently. Synchronization between the command-lists is +handled by Level Zero events. + ### CUDA The SYCL Graph CUDA backend relies on the diff --git a/sycl/doc/design/DeviceIf.md b/sycl/doc/design/DeviceIf.md index b9cbb1cf3de73..93a92934842b7 100644 --- a/sycl/doc/design/DeviceIf.md +++ b/sycl/doc/design/DeviceIf.md @@ -183,6 +183,7 @@ one of the following corresponding C++ macro names: * `__SYCL_TARGET_NVIDIA_GPU_SM87__` * `__SYCL_TARGET_NVIDIA_GPU_SM89__` * `__SYCL_TARGET_NVIDIA_GPU_SM90__` +* `__SYCL_TARGET_NVIDIA_GPU_SM90A__` * `__SYCL_TARGET_AMD_GPU_GFX700__` * `__SYCL_TARGET_AMD_GPU_GFX701__` * `__SYCL_TARGET_AMD_GPU_GFX702__` diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md index d2fc7d3b484e8..28d19de097e76 100644 --- a/sycl/doc/design/SYCLNativeCPU.md +++ b/sycl/doc/design/SYCLNativeCPU.md @@ -38,6 +38,12 @@ python buildbot/configure.py \ # other options here ``` +### libclc target triples + +SYCL Native CPU uses [libclc](https://github.com/intel/llvm/tree/sycl/libclc) to implement many SPIRV builtins. When Native CPU is enabled, the default target triple for libclc will be `LLVM_TARGET_TRIPLE` (same as the default target triple used by `clang`). This can be overridden by setting the `--native-cpu-libclc-targets` option in `configure.py`. 
+ +### oneAPI Construction Kit + SYCL Native CPU uses the [oneAPI Construction Kit](https://github.com/codeplaysoftware/oneapi-construction-kit) (OCK) in order to support some core SYCL functionalities and improve performances, the OCK is fetched by default when SYCL Native CPU is enabled, and can optionally be disabled using the `NATIVECPU_USE_OCK` CMake variable (please note that disabling the OCK will result in limited functionalities and performances on the SYCL Native CPU backend): ``` diff --git a/sycl/doc/developer/ContributeToDPCPP.md b/sycl/doc/developer/ContributeToDPCPP.md index ee60eb5a59d70..a096d99b33397 100644 --- a/sycl/doc/developer/ContributeToDPCPP.md +++ b/sycl/doc/developer/ContributeToDPCPP.md @@ -164,3 +164,39 @@ These tests verify SYCL specification conformance. All implementation details are out of scope for the tests. See DPC++ compiler invocation definitions at [FindIntel_SYCL](https://github.com/KhronosGroup/SYCL-CTS/blob/SYCL-1.2.1/master/cmake/FindIntel_SYCL.cmake)) + +## Unified Runtime Updates + +To integrate changes from the [Unified Runtime][ur] project into DPC++ there +two main options which depend on the scope of those changes and the current +state of DPC++. + +1. Synchronized update: + * When: If the Unified Runtime change touches the API/ABI, more than one + adapter, or common code such as the loader. + * How: Update the `UNIFIED_RUNTIME_TAG` to point at the desired commit or tag + name in the Unified Runtime repository and ensure that any tag for specific + adapters are set to use `${UNIFIED_RUNTIME_TAG}`. + +2. Decoupled update: + * When: If only a single Unified Runtime adatper has changed. + * How: Update the tag used in the `fetch_adapter_source()` call for a + specific Unified Runtime adapter, e.g. Level Zero, OpenCL, CUDA, HIP, or + Native CPU. + +In general, a synchronized update should be the default. However, when there +are a lot of changes in flight in parallel always synchronizing the tag can be +troublesome. 
This is when a decoupled update can help sustain the merge +velocity of Unified Runtime changes. + +The [intel/unified-runtime-reviewers][ur-reviewers-team] team is responsible +for ensuring that the Unified Runtime tag is updated correctly and will only +provide code owner approval to pull requests once the following criteria are +met: + +* Tags are pointing to a valid commit or tag on Unified Runtime main branch. +* Changes to additional code owned files are in a good state. +* GitHub Actions checks are passing. + +[ur]: https://github.com/oneapi-src/unified-runtime +[ur-reviewers-team]: https://github.com/orgs/intel/teams/unified-runtime-reviewers diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index e755c6f9ee414..9fb1dd1503237 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -1085,6 +1085,13 @@ multiple of 4 when `T` is `float`; where `T` is the type of the `joint_matrix` elements. When `T` is not `half` or `float` there are no restrictions to `stride`. +IMPORTANT: For some devices it is important to use the sm version +(Compute Capability) corresponding to the device that will run the +program when specifying e.g. `-fsycl-targets=nvidia_gpu_sm_xx` during +compilation. This particularly affects matrix operations using `half`. 
+For more information on this issue consult +https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-restrictions + ==== AMD Matrix Cores Supported Combinations The complete set of matrix data types and dimensions that are supported by the `ext_oneapi_hip` backend are represented in the following @@ -1139,4 +1146,5 @@ supported combinations load/store overloads |11 |2024-04-29 |Yury Plyakhin | Add 1x64x16 supported combination for Intel XMX (intel_gpu_pvc) +|12 |2024-06-14 |Jack Kirk | Add note on sm version device matching issue. |====================== diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc index 3261a94b17cdf..6359515a67b9d 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc @@ -103,286 +103,556 @@ then it supports the `bfloat16` math functions described in the next section. === Math Functions -The following functions are only available when `T` is `bfloat16` or -`sycl::marray`, where `{N}` means any positive value of -`size_t` type. - ==== isnan ```c++ namespace sycl::ext::oneapi::experimental { -bool isnan(bfloat16 x); +bool isnan(bfloat16 x); (1) -template -sycl::marray isnan(sycl::marray x); +template +/*return type*/ isnan(NonScalar x); (2) } // namespace sycl::ext::oneapi::experimental ``` ===== Description +====== Overload (1) + +Returns `true` if `x` is a NaN value, otherwise returns `false`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + + - `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and + - The element type is `bfloat16`. -Returns true if x is NAN value, otherwise returns false. +*Returns:* If `NonScalar` is `marray`, returns `true` for each element of `x` only if `x[i]` has a NaN value. 
If `NonScalar` is `vec` or the `[code]#+__swizzled_vec__+#` type, returns -1 for each element of `x` if `x[i]` is a NaN value and returns 0 otherwise. + +The return type depends on `NonScalar`. For `marray`, the return type is `marray` and for `vec`, `[code]#+__swizzled_vec__+#` type, the return type is `vec`. ==== fma ```c++ namespace sycl::ext::oneapi::experimental { -template -T fma(T a, T b, T c); +bfloat16 fma(bfloat16 a, bfloat16 b, bfloat16 c); (1) + +template (2) +/*return-type*/ fma(NonScalar1 a, NonScalar2 b, NonScalar3 c) } // namespace sycl::ext::oneapi::experimental ``` ===== Description -Returns the correctly rounded floating-point representation of the +====== Overload (1) + +*Returns:* Returns the correctly rounded floating-point representation of the sum of `c` with the infinitely precise product of `a` and `b`. Rounding of intermediate products shall not occur. The mantissa LSB rounds to the nearest even. Subnormal numbers are supported. +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* One of the following conditions must hold for `NonScalar1`, `NonScalar2`, and `NonScalar3`: +** `NonScalar1`, `NonScalar2`, and `NonScalar3` are each `marray`; or +** `NonScalar1`, `NonScalar2`, and `NonScalar3` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type; +* `NonScalar1`, `NonScalar2`, and `NonScalar3` have the same number of elements; +* `NonScalar1`, `NonScalar2`, and `NonScalar3` have the same element type; and +* The element type of `NonScalar1`, `NonScalar2`, and `NonScalar3` is `bfloat16`. + +*Returns:* For each element of `a`, `b`, and `c`; the correctly rounded floating-point representation of the sum of `c[i]` with the infinitely precise product of `a[i]` and `b[i]`. Rounding of intermediate products shall not occur. Edge case behavior is per the IEEE 754-2008 standard. 
+ +The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. + ==== fmax ```c++ namespace sycl::ext::oneapi::experimental { -template -T fmax(T x, T y); +bfloat16 fmax(bfloat16 x, bfloat16 y); (1) + +template (2) +/*return-type*/ fmax(NonScalar1 x, NonScalar2 y) + +template (3) +/*return-type*/ fmax(NonScalar x, bfloat16 y) } // namespace sycl::ext::oneapi::experimental ``` ===== Description -Returns `y` if -`x < y`, otherwise it -returns `x`. If one argument is a -NaN, `fmax()` returns the other -argument. If both arguments are -NaNs, `fmax()` returns a NaN. +====== Overload (1) + +Returns `y` if `x < y`, otherwise it returns `x`. If one argument is a NaN, `fmax()` returns the other +argument. If both arguments are NaNs, `fmax()` returns a NaN. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* One of the following conditions must hold for `NonScalar1` and `NonScalar2`: +** Both `NonScalar1` and `NonScalar2` are `marray`; or +** `NonScalar1` and `NonScalar2` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type; +* `NonScalar1` and `NonScalar2` have the same number of elements; +* `NonScalar1` and `NonScalar2` have the same element type; and +* The element type of `NonScalar1` and `NonScalar2` is bfloat16. + +*Returns:* For each element of `x` and `y`, the value `y[i]` if `x[i] < y[i]`, otherwise `x[i]`. If one element is a NaN, the result is the other element. If both elements are NaNs, the result is NaN. + +The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. + +====== Overload (3) + +*Constraints:* Available only if all of the following conditions are met: + +* NonScalar is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is bfloat16. 
*Returns:* For each element of `x`, the value `y` if `x[i] < y`, otherwise `x[i]`. If one value is a NaN, the result is the other value. If both values are NaNs, the result is a NaN.
+ +====== Overload (3) + +*Constraints:* Available only if all of the following conditions are met: + +* NonScalar is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is bfloat16. + +*Returns:* For each element of `x`, the value `x[i]` if `x[i] < y`, otherwise `y`. If one value is a NaN, the result is the other value. If both value are NaNs, the result is a NaN. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== fabs ```c++ namespace sycl::ext::oneapi::experimental { -template -T fabs(T x); +bfloat16 fabs(bfloat16 x); (1) + +template (2) +/*return-type*/ fabs(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Compute absolute value(s) of a scalar `bfloat16` value. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. -Compute absolute value of a `bfloat16` value or `sycl::marray`. +*Returns:* For each element of `x`, the absolute value of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== ceil ```c++ namespace sycl::ext::oneapi::experimental { -template -T ceil(T x); +bfloat16 ceil(bfloat16 x); (1) + +template (2) +/*return-type*/ ceil(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +The value `x` rounded to an integral value using the round to positive infinity rounding mode. 
+ +====== Overload (2) -Returns `x` rounded to an integral value using the round to positive infinity rounding mode +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to positive infinity rounding mode. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== floor ```c++ namespace sycl::ext::oneapi::experimental { -template -T floor(T x); +bfloat16 floor(bfloat16 x); (1) + +template (2) +/*return-type*/ floor(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +The value `x` rounded to an integral value using the round to negative infinity rounding mode. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. -Returns `x` rounded to an integral value using the round to negative infinity rounding mode -for a `bfloat16` value or `sycl::marray`. +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to negative infinity rounding mode. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== cos ```c++ namespace sycl::ext::oneapi::experimental { -template -T cos(T x); +bfloat16 cos(bfloat16 x); (1) + +template (2) +/*return-type*/ cos(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the cosine of `x`. -Compute cosine of a `bfloat16` value or `sycl::marray`. 
+====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the cosine of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== sin ```c++ namespace sycl::ext::oneapi::experimental { -template -T sin(T x); +bfloat16 sin(bfloat16 x); (1) + +template (2) +/*return-type*/ sin(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the sine of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -Compute sine of a `bfloat16` value or `sycl::marray`. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. +*Returns:* For each element of `x`, the sine of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== exp ```c++ namespace sycl::ext::oneapi::experimental { -template -T exp(T x); +bfloat16 exp(bfloat16 x); (1) + +template (2) +/*return-type*/ exp(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-e exponential of `x`. + +====== Overload (2) -Compute the base-e exponential of a `bfloat16` value or `sycl::marray`. +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-e exponential of `x[i]`. 
+ +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== exp2 ```c++ namespace sycl::ext::oneapi::experimental { -template -T exp2(T x); +bfloat16 exp2(bfloat16 x); (1) + +template (2) +/*return-type*/ exp2(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-2 exponential of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. -Compute the base-2 exponential of a `bfloat16` value or `sycl::marray`. +*Returns:* For each element of `x`, the base-2 exponential of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== exp10 ```c++ namespace sycl::ext::oneapi::experimental { -template -T exp10(T x); +bfloat16 exp10(bfloat16 x); (1) + +template (2) +/*return-type*/ exp10(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-10 exponential of `x`. -Compute the base-10 exponential of a `bfloat16` value or `sycl::marray`. +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-10 exponential of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. 
==== log ```c++ namespace sycl::ext::oneapi::experimental { -template -T log(T x); +bfloat16 log(bfloat16 x); (1) + +template (2) +/*return-type*/ log(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the natural logarithm of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -Compute natural logarithm of a `bfloat16` value or `sycl::marray`. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the natural logarithm of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== log2 ```c++ namespace sycl::ext::oneapi::experimental { -template -T log2(T x); +bfloat16 log2(bfloat16 x); (1) + +template (2) +/*return-type*/ log2(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-2 logarithm of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-2 logarithm of `x[i]`. -Compute base-2 logarithm of a `bfloat16` value or `sycl::marray`. +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== log10 ```c++ namespace sycl::ext::oneapi::experimental { -template -T log10(T x); +bfloat16 log10(bfloat16 x); (1) + +template (2) +/*return-type*/ log10(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-10 logarithm of `x`. 
+ +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -Compute base-10 logarithm of a `bfloat16` value or `sycl::marray`. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-10 logarithm of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== rint ```c++ namespace sycl::ext::oneapi::experimental { -template -T rint(T x); +bfloat16 rint(bfloat16 x); (1) + +template (2) +/*return-type*/ rint(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) -Returns `x` rounded to an integral value using the round to nearest even rounding mode -for a `bfloat16` value or `sycl::marray`. +Returns the value `x` rounded to an integral value (using round to nearest even rounding mode) in floating-point format. Refer to section 7.1 of the OpenCL 1.2 specification document: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#opencl12 for a description of the rounding modes. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value (using round to nearest even rounding mode) in floating-point format. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. 
==== sqrt ```c++ namespace sycl::ext::oneapi::experimental { -template -T sqrt(T x); +bfloat16 sqrt(bfloat16 x); (1) + +template (2) +/*return-type*/ sqrt(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the square root of `x`. + +====== Overload (2) -Compute square root of a `bfloat16` value or `sycl::marray`. +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the square root of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== rsqrt ```c++ namespace sycl::ext::oneapi::experimental { -template -T rsqrt(T x); +bfloat16 rsqrt(bfloat16 x); (1) + +template (2) +/*return-type*/ rsqrt(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the inverse square root of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. -Compute inverse square root of a `bfloat16` value or `sycl::marray`. +*Returns:* For each element of `x`, the inverse square root of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== trunc ```c++ namespace sycl::ext::oneapi::experimental { -template -T trunc(T x); +bfloat16 trunc(bfloat16 x); (1) + +template (2) +/*return-type*/ trunc(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) -Returns `x` rounded to an integral value using the round to zero rounding mode -for a `bfloat16` value or `sycl::marray`. 
+Returns the value `x` rounded to an integral value using the round to zero rounding mode. -== Issues +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -1. The CUDA backend does not have a use case that would necessitate support -of the `vec` class in bfloat16 math functions, and `marray` would always be -preferred over `vec` if `vec` support were to be added in the CUDA backend. -For portability reasons, support for the `vec` class can be easily added if -other backends require it. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to zero rounding mode. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. + +== Issues -2. We should decide on a roadmap to extend support of `bfloat16` to other +1. We should decide on a roadmap to extend support of `bfloat16` to other SYCL 2020 math functions. diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc index 8c47f17f3adfc..71bc0dc031d64 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc @@ -1364,7 +1364,7 @@ and the array index. // Fetch an unsampled image array template DataT fetch_image_array(const unsampled_image_handle &ImageHandle, - const CoordT &Coords, const unsigned int ArrayLayer); + const CoordT &Coords, unsigned int ArrayLayer); ``` Fetching an image array follows the same restrictions on what coordinate types @@ -1390,7 +1390,7 @@ provided that type is trivially copyable. 
// Write to an unsampled image array template DataT write_image_array(unsampled_image_handle ImageHandle, - const CoordT &Coords, const unsigned int ArrayLayer + const CoordT &Coords, unsigned int ArrayLayer, const DataT &Color); ``` @@ -1495,7 +1495,7 @@ sampling depends on the sampler attributes passed upon creation of the cubemap. template DataT fetch_cubemap(const unsampled_image_handle &ImageHandle, const int2 &Coords, - const int Face); + int Face); // Sampled cubemap read template @@ -1506,7 +1506,7 @@ DataT sample_cubemap(const sampled_image_handle &ImageHandle, template void write_cubemap(unsampled_image_handle ImageHandle, const int2 &Coords, - const int Face, + int Face, const DataT &Color); ``` @@ -1594,10 +1594,18 @@ struct. ```cpp namespace sycl::ext::oneapi::experimental { +// Types of external memory handles +enum class external_mem_handle_type { + opaque_fd = 0, + win32_nt_handle = 1, + win32_nt_dx12_resource = 2, +}; + // Descriptor templated on specific resource type template struct external_mem_descriptor { ResourceType external_resource; + external_mem_handle_type handle_type; size_t size_in_bytes; }; @@ -1609,9 +1617,13 @@ handle type, `ResourceType`, for their purposes, e.g. `resource_fd` to describe a POSIX file descriptor resource on Linux systems, or a `resource_win32_handle` for Windows NT resource handles. -Once the user populates the `external_mem_descriptor` with the appropriate -`ResourceType` values, and the size of the external memory in bytes, -they can then import that memory into SYCL through `import_external_memory`. +The user must populate the `external_mem_descriptor` with the appropriate +`ResourceType` values, a `handle_type`, and the size of the external memory in +bytes, before they can then import that memory into SYCL through +`import_external_memory`.
Note that some handle types can only be used in +combination with certain resource types, for example the `opaque_fd` handle type +is only used on Linux systems and is only compatible with the `resource_fd` +resource type. ```cpp namespace sycl::ext::oneapi::experimental { @@ -1690,16 +1702,32 @@ memory resources handles can take different forms of structure and type depending on the API and operating system, so do external semaphore resource handles. +It is important to note that the use of imported external semaphore objects +within SYCL has the restriction in that imported external semaphores can only +be used in conjunction with SYCL queues that have been constructed with the +`property::queue::in_order` property. The semaphore synchronization mechanism +is not supported for the default SYCL out-of-order queues. Use of the semaphore +synchronization mechanism with SYCL queues which were not constructed with the +`property::queue::in_order` property will result in undefined behaviour. + External semaphore import is facilitated through the following proposed descriptor struct. ```cpp namespace sycl::ext::oneapi::experimental { +// Types of external semaphore handles +enum class external_semaphore_handle_type { + opaque_fd = 0, + win32_nt_handle = 1, + win32_nt_dx12_fence = 2, +}; + // Descriptor templated on specific resource type template struct external_semaphore_descriptor { ResourceType external_resource; + external_semaphore_handle_type handle_type; }; } @@ -1710,9 +1738,12 @@ appropriate handle type, `ResourceType`, for their purposes, e.g. `resource_fd` to describe a POSIX file descriptor resource on Linux systems, or a `resource_win32_handle` for Windows NT resource handles. -Once the user populates the `external_semaphore_descriptor` with the appropriate -`ResourceType` values, they can then import that semaphore into SYCL through -`import_external_semaphore`.
+The user must populate the `external_semaphore_descriptor` with the appropriate +`ResourceType` values, and `handle_type`, before they can then import that +semaphore into SYCL through `import_external_semaphore`. Note that some handle +types can only be used in combination with certain resource types, for example +the `opaque_fd` handle type is only used on Linux systems and is only +compatible with the `resource_fd` resource type. ```cpp namespace sycl::ext::oneapi::experimental { @@ -1728,7 +1759,6 @@ interop_semaphore_handle import_external_semaphore( externalSemaphoreDescriptor, const sycl::device &syclDevice, const sycl::context &syclContext); -} template interop_semaphore_handle import_external_semaphore( @@ -1739,8 +1769,11 @@ interop_semaphore_handle import_external_semaphore( ``` The resulting `interop_semaphore_handle` can then be used in a SYCL command -group, to either wait until the semaphore is in the signaled state, or set the -semaphore to a signaled state. +group, to either wait until the semaphore is signalled, or signal the semaphore. + +If the type of semaphore imported supports setting the state to a discrete +semaphore value (the semaphore type is `win32_nt_dx12_fence`), then the user +can specify which value the semaphore operation should wait on, or signal. We propose to extend the SYCL queue and handler classes with semaphore waiting and signalling operations.
@@ -1754,9 +1787,19 @@ public: ext::oneapi::experimental::interop_semaphore_handle interop_semaphore_handle); + void ext_oneapi_wait_external_semaphore( + ext::oneapi::experimental::interop_semaphore_handle + interop_semaphore_handle, + uint64_t wait_value); + void ext_oneapi_signal_external_semaphore( ext::oneapi::experimental::interop_semaphore_handle interop_semaphore_handle); + + void ext_oneapi_signal_external_semaphore( + ext::oneapi::experimental::interop_semaphore_handle + interop_semaphore_handle, + uint64_t signal_value); }; class queue { @@ -1773,6 +1816,21 @@ public: interop_semaphore_handle, const std::vector &DepEvents); + event ext_oneapi_wait_external_semaphore( + ext::oneapi::experimental::interop_semaphore_handle + interop_semaphore_handle, + uint64_t wait_value); + event ext_oneapi_wait_external_semaphore( + ext::oneapi::experimental::interop_semaphore_handle + interop_semaphore_handle, + uint64_t wait_value, + event DepEvent); + event ext_oneapi_wait_external_semaphore( + ext::oneapi::experimental::interop_semaphore_handle + interop_semaphore_handle, + uint64_t wait_value, + const std::vector &DepEvents); + event ext_oneapi_signal_external_semaphore( ext::oneapi::experimental::interop_semaphore_handle interop_semaphore_handle); @@ -1784,17 +1842,46 @@ public: ext::oneapi::experimental::interop_semaphore_handle interop_semaphore_handle, const std::vector &DepEvents); + + event ext_oneapi_signal_external_semaphore( + ext::oneapi::experimental::interop_semaphore_handle + interop_semaphore_handle, + uint64_t signal_value); + event ext_oneapi_signal_external_semaphore( + ext::oneapi::experimental::interop_semaphore_handle + interop_semaphore_handle, + uint64_t signal_value, + event DepEvent); + event ext_oneapi_signal_external_semaphore( + ext::oneapi::experimental::interop_semaphore_handle + interop_semaphore_handle, + uint64_t signal_value, + const std::vector &DepEvents); }; } ``` -Any operations submitted to the queue after a 
-`ext_oneapi_wait_external_semaphore` call will not begin until the imported -semaphore is in a signaled state. +The behaviour of waiting on a semaphore will depend on the type of the +semaphore which was imported. + +If the semaphore does not support setting of a discrete state value (the +semaphore type is not `win32_nt_dx12_fence`), then any operations submitted to +the queue after an `ext_oneapi_wait_external_semaphore` call will not begin +until the imported semaphore is in a signalled state. After this, the semaphore +will be reset to a non-signalled state. + +If the semaphore does support setting of a discrete state value (the semaphore +type is `win32_nt_dx12_fence`), then any operations submitted to the queue +after an `ext_oneapi_wait_external_semaphore` call will not begin until the +imported semaphore is in a state greater than or equal to the `wait_value`. The +state of this type of semaphore will not be altered by the call to +`ext_oneapi_wait_external_semaphore`. When `ext_oneapi_signal_external_semaphore` is called, the external semaphore -will be set to the signaled state after all commands submitted to the queue -prior to the `ext_oneapi_signal_external_semaphore` call complete. +will either be set to a signalled state, or the state of the semaphore will be +set to `signal_value`, depending on the type of semaphore which was imported. +This signalling will be done after all commands submitted to the queue prior to +the `ext_oneapi_signal_external_semaphore` call complete. `ext_oneapi_wait_external_semaphore` and `ext_oneapi_signal_external_semaphore` are non-blocking, asynchronous operations.
@@ -2366,13 +2453,17 @@ int external_output_image_file_descriptor = /* passed from external API */ // Extension: populate external memory descriptors sycl::ext::oneapi::experimental::external_mem_descriptor< sycl::ext::oneapi::experimental::resource_fd> - input_ext_mem_desc{external_input_image_file_descriptor, - img_size_in_bytes}; + input_ext_mem_desc{ + external_input_image_file_descriptor, + sycl::ext::oneapi::experimental::external_mem_handle_type::opaque_fd, + img_size_in_bytes}; sycl::ext::oneapi::experimental::external_mem_descriptor< sycl::ext::oneapi::experimental::resource_fd> - output_ext_mem_desc{external_output_image_file_descriptor, - img_size_in_bytes}; + output_ext_mem_desc{ + external_output_image_file_descriptor, + sycl::ext::oneapi::experimental::external_mem_handle_type::opaque_fd, + img_size_in_bytes}; // An external API semaphore will signal this semaphore before our SYCL commands // can begin execution @@ -2386,11 +2477,13 @@ int done_semaphore_file_descriptor = /* passed from external API */; // We assume POSIX file descriptor resource types sycl::ext::oneapi::experimental::external_semaphore_descriptor< sycl::ext::oneapi::experimental::resource_fd> - wait_external_semaphore_desc{wait_semaphore_file_descriptor}; + wait_external_semaphore_desc{wait_semaphore_file_descriptor, + sycl::ext::oneapi::experimental::external_semaphore_handle_type::opaque_fd}; sycl::ext::oneapi::experimental::external_semaphore_descriptor< sycl::ext::oneapi::experimental::resource_fd> - done_external_semaphore_desc{done_semaphore_file_descriptor}; + done_external_semaphore_desc{done_semaphore_file_descriptor, + sycl::ext::oneapi::experimental::external_semaphore_handle_type::opaque_fd}; try { // Extension: import external semaphores @@ -2682,4 +2775,15 @@ These features still need to be handled: This function is redundant since images don't have a notion of channel order, only the channel size. Use `get_num_channels()` instead. 
+|5.11|2024-05-27| - Added `external_mem_handle_type` and + `external_semaphore_handle_type` enums. These will allow + multiple handle types to be consumed by the same interop API. + - Added `handle_type` field to the `external_mem_descriptor` + and `external_semaphore_descriptor` structs. This allows + multiple handle types to be consumed by the API, such as + file descriptors, Windows NT handles, and other handles in + the future. + - Added semaphore operations which can accept values. These + are only supported for certain semaphore types + (e.g. `win32_nt_dx12_fence`). |====================== diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc index 8006bf651b1ad..f2ebcc5944462 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc @@ -121,6 +121,16 @@ which version of this extension first included each of these enumerators. |Added in version |Description +3+^|*Unknown architecture* + +a| +[source] +---- +unknown +---- +|- +|Some architecture which is not one of those listed below. + 3+^|*Intel CPU family* a| @@ -528,6 +538,7 @@ a| [source] ---- nvidia_gpu_sm_90 +nvidia_gpu_sm_90a ---- |- |NVIDIA Hopper architecture. @@ -1018,7 +1029,8 @@ struct architecture; _Return type:_ `sycl::ext::oneapi::experimental::architecture` -_Returns:_ The architecture of the device. +_Returns:_ The architecture of the device if architecture is supported, otherwise +`ext::oneapi::experimental::architecture::unknown`. |==== @@ -1106,6 +1118,9 @@ They currently exist only for use with the link:sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc[sycl_ext_oneapi_matrix] extension. +The architecture enumeration `unknown` is not currently supported with the +`if_architecture_is` function. 
+ == Implementation notes diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc similarity index 97% rename from sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc rename to sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc index ed85566c99fbc..70898ecf61a10 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc @@ -44,11 +44,12 @@ SYCL specification refer to that revision. == Status -This is a proposed extension specification, intended to gather community -feedback. Interfaces defined in this specification may not be implemented yet -or may be in a preliminary state. The specification itself may also change in -incompatible ways before it is finalized. *Shipping software products should -not rely on APIs defined in this specification.* +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. +*Shipping software products should not rely on APIs defined in this +specification.* == Overview @@ -79,7 +80,7 @@ This extension makes SYCL simpler and easier to document. It is also expected to improve the performance of many SYCL applications, where `event` objects are not required to describe application behavior. -All functions proposed in this extension accept as their first argument an +All functions in this extension accept as their first argument an object that represents where a command should be submitted, allowing the new functions to be used either at command-group scope or as a replacement for existing queue shortcuts. 
A future version of this extension may adjust this @@ -89,7 +90,7 @@ by accepting a scheduler and returning a sender). === Usage example -The example below demonstrates that the syntax proposed here requires only +The example below demonstrates that the syntax here requires only minor changes to existing applications, while retaining their structure. @@ -117,7 +118,7 @@ sycl::free(output, q); ---- -==== Proposed syntax +==== Syntax [source,c++] ---- diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 961f87462af6c..77fab2ebe5fb1 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -1047,9 +1047,11 @@ Constraints: Parameters: -* `propList` - Optional parameter for passing properties. The only property - that is valid to pass here is `property::graph::updatable`, to enable the +* `propList` - Optional parameter for passing properties. Two properties + are valid to pass here. One is `property::graph::updatable` to enable the returned executable graph to be <>. + The other is <> + to enable profiling events returned from submissions of the executable graph. Returns: A new executable graph object which can be submitted to a queue. diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc index f2832edc31156..7a471b7fa36c6 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc @@ -773,6 +773,62 @@ int main() { ``` +== {dpcpp} guaranteed compatibility with Level Zero and OpenCL backends + +The contents of this section are non-normative and apply only to the {dpcpp} +implementation. 
+Kernels written using the free function kernel syntax can be submitted to a +device by using the Level Zero or OpenCL backends, without going through the +SYCL host runtime APIs. +This works only when the kernel is AOT compiled to native device code using the +`-fsycl-targets` compiler option. + +The interface to the kernel in the native device code module is only guaranteed +when the kernel adheres to the following restrictions: + +* The kernel is written in the free function kernel syntax; +* The kernel function is declared as `extern "C"`; +* Each formal argument to the kernel is either a {cpp} trivially copyable type + or the `work_group_memory` type (see + link:../proposed/sycl_ext_oneapi_work_group_memory.asciidoc[ + sycl_ext_oneapi_work_group_memory]); and +* The translation unit containing the kernel is compiled with the + `-fno-sycl-dead-args-optimization` option. + +Both Level Zero and OpenCL identify a kernel via a _name_ string. +(See `zeKernelCreate` and `clCreateKernel` in their respective specifications.) +When a kernel is defined according to the restrictions above, the _name_ is +guaranteed to be the same as the name of the kernel's function in the {cpp} +source code but with "++__sycl_kernel_++" prefixed. +For example, if the function name is "foo", the kernel's name in the native +device code module is "++__sycl_kernel_foo++". + +Both Level Zero and OpenCL set kernel argument values using three pieces of +information: + +* The index of the argument; +* The size (in bytes) of the value; and +* A pointer to the start of the value. + +(See `zeKernelSetArgumentValue` and `clSetKernelArg` in their respective +specifications.) + +When a kernel is defined according to the restrictions above, the argument +indices are the same as the positions of the formal kernel arguments in the +{cpp} source code. +The first argument has index 0, the next has index 1, etc. 
+ +If an argument has a trivially copyable type, the size must be the size of that +type, and the pointer must point to a memory region that has the same size and +representation as that trivially copyable type. + +If an argument has the type `work_group_memory`, the size must be the size (in +bytes) of the device local memory that is represented by the +`work_group_memory` argument. +The pointer passed to `zeKernelSetArgumentValue` or `clSetKernelArg` must be +NULL in this case. + + == Implementation notes === Compiler diagnostics diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_profiling_tag.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_profiling_tag.asciidoc index 9d971079b0e56..30ead43b272e3 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_profiling_tag.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_profiling_tag.asciidoc @@ -162,10 +162,10 @@ Implementations are encouraged to transition the event directly from the "submitted" status to the "complete" status and are encouraged to set the "command_start" timestamp to the same value as the "command_end" timestamp. -_Throws:_ A synchronous `exception` with the `errc::invalid` error code if the -queue was not constructed with the `property::queue::enable_profiling` property -and if the queue's device does not have the aspect -`ext_oneapi_queue_profiling_tag`. +_Throws:_ A synchronous `exception` with the `errc::invalid` error code if +the queue's device does not have the aspect `ext_oneapi_queue_profiling_tag` +and the queue was not constructed with the `property::queue::enable_profiling` +property. 
[_Note:_ In order to understand why the "command_start" and "command_end" timestamps are encouraged to be the same, think of the barrier as an empty diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc new file mode 100644 index 0000000000000..9a7875c6987ab --- /dev/null +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc @@ -0,0 +1,553 @@ += sycl_ext_oneapi_work_group_memory + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] +:endnote: —{nbsp}end{nbsp}note + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2024 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 8 specification. +All references below to the "core SYCL specification" or to section numbers in +the SYCL specification refer to that revision. + +This extension also depends on the following other SYCL extensions: + +* link:../experimental/sycl_ext_oneapi_properties.asciidoc[ + sycl_ext_oneapi_properties] + + +== Status + +This is a proposed extension specification, intended to gather community +feedback. +Interfaces defined in this specification may not be implemented yet or may be +in a preliminary state. 
+The specification itself may also change in incompatible ways before it is +finalized. +*Shipping software products should not rely on APIs defined in this +specification.* + + +== Overview + +This extension adds a lower overhead way to allocate device local memory, +memory which is shared by all work-items in a work-group. +The `local_accessor` class in the core SYCL specification provides a mechanism +to do this also, but `local_accessor` has higher overhead because it +encapsulates both a pointer to the memory and the size of that memory. +When a `local_accessor` has multiple dimensions, it contains the size in +each dimension. +By comparison, the `work_group_memory` class in this extension encapsulates +only a pointer to the memory without any size information. +The functionality of `work_group_memory` is, of course, less than +`local_accessor`, but many applications do not need the extra features. + + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. +An implementation supporting this extension must predefine the macro +`SYCL_EXT_ONEAPI_WORK_GROUP_MEMORY` to one of the values defined in the table +below. +Applications can test for the existence of this macro to determine if the +implementation supports this feature, or applications can test the macro's +value to determine which of the extension's features the implementation +supports. + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. 
+|=== + +=== New `work_group_memory` class + +This extension adds the following new class: + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +class work_group_memory { + public: + using value_type = std::remove_all_extents_t; + + work_group_memory(); + work_group_memory(const work_group_memory& rhs); + work_group_memory(handler& cgh); + work_group_memory(size_t num, handler& cgh); + work_group_memory& operator=(const work_group_memory& rhs); + + operator DataT&() const; + const work_group_memory& operator=(const DataT& value) const; + DataT* operator&() const; + + template + multi_ptr get_multi_ptr() const; +}; + +} // namespace sycl::ext::oneapi::experimental +---- + +The `work_group_memory` class allocates device local memory and provides access +to this memory from within a SYCL kernel function. +The local memory that is allocated is shared between all work-items of a +work-group. +If multiple work-groups execute simultaneously, each of those work-groups +receives its own independent copy of the allocated local memory. + +The `work_group_memory` type is a legal kernel parameter type as defined in +section 4.12.4 "Rules for parameter passing to kernels" of the core SYCL +specification. +Applications typically construct an object of type `work_group_memory` in +command group scope, pass the object as a kernel parameter, and then reference +the object inside the kernel in order to access the device local memory that it +contains. + +The `work_group_memory` class may only be used in an nd-range kernel. +If an application passes a `work_group_memory` object as an argument to a +single-task kernel or to a simple "range" kernel, the implementation must throw +a synchronous `exception` with the `errc::kernel_argument` error code when the +kernel is enqueued. + +The `DataT` template parameter identifies the type of the objects created in +device local memory, and this type must be one of the types that is supported +in device code.
+In order to create an array of objects, `DataT` should be an array type. +For example, `work_group_memory<float[10]>` creates an array of 10 `float` +objects in device local memory. +In order to create an array of objects where the number of elements is +determined at runtime, specify an unbounded array type such as +`work_group_memory<float[]>` and use the constructor overload that takes a +`num` parameter. + +If `DataT` is an implicit-lifetime type as defined in the {cpp} core language, +`work_group_memory` implicitly creates objects of that type with indeterminate +values. +For other types, `work_group_memory` merely allocates uninitialized memory, and +the application is responsible for constructing objects in that memory (e.g. by +calling placement-new). + +The `PropertyListT` template parameter currently has no meaning and must have +its default value of `empty_properties_t`. +This template parameter may be used in the future to associate compile-time +properties with the `work_group_memory`. + +==== Type aliases + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +using value_type = std::remove_all_extents_t; +---- +!==== + +This type alias provides the data type of the device local memory with all +array extents removed. + +==== Constructors and copy assignment + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +work_group_memory(); +---- +!==== + +_Effects:_ Constructs a "dummy" `work_group_memory` object that does not +represent any device local memory. +The only valid operation for a dummy object is the copy-assignment operator, +which overwrites the object with the right-hand-side of the assignment. +Passing a dummy object as a kernel argument or calling any of its other +member functions or operators produces undefined behavior. + +[_Note:_ This constructor may be called in either host code or device code.
+_{endnote}_] + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +work_group_memory(const work_group_memory& rhs); +---- +!==== + +_Effects:_ Constructs a `work_group_memory` object which is a copy of the +`rhs` object. +The new object represents the same underlying device local memory as `rhs`. + +[_Note:_ This constructor may be called in either host code or device code. +_{endnote}_] + +[_Note:_ The copied object does not always represent the same underlying device +local memory when the copy constructor is called in host code. +See the open issues. +_{endnote}_] + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +work_group_memory(handler& cgh); (1) +work_group_memory(size_t num, handler& cgh); (2) +---- +!==== + +_Preconditions:_ These constructors must be called from host code. + +_Constraints (1):_ Available only when `DataT` is not an unbounded array. + +_Constraints (2):_ Available only when `DataT` is an unbounded array. + +_Effects:_ Constructs a `work_group_memory` object which represents device +local memory of type `DataT` in the kernel that is enqueued via the `cgh` +handler. +Overload (2) uses `num` to determine the number of elements in the unbounded +array `DataT`. + +_Remarks:_ Attempting to pass the `work_group_memory` object as an argument +to a kernel that is _not_ launched via the `cgh` handler produces undefined +behavior. + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +work_group_memory& operator=(const work_group_memory& rhs); +---- +!==== + +_Effects:_ Replaces the `work_group_memory` object with a copy of the `rhs` object. +The replaced object represents the same underlying device local memory as `rhs`. + +_Returns:_ A reference to the `work_group_memory` object. + +[_Note:_ This operator may be called in either host code or device code. 
+_{endnote}_] + +[_Note:_ The replaced object does not always represent the same underlying +device local memory when the assignment operator is called in host code. +See the open issues. +_{endnote}_] + +==== Member functions and operators + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +operator DataT&() const; +---- +!==== + +_Preconditions:_ This operator must be called from device code. + +_Effects:_ Implicit conversion to the underlying `DataT`. + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +const work_group_memory& operator=(const DataT& value) const; +---- +!==== + +_Preconditions:_ This operator must be called from device code. + +_Constraints:_ Available only when `DataT` is not an array. + +_Effects:_ Assigns the value `value` to the underlying device local memory +object. + +_Returns:_ A reference to the `work_group_memory` object. + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +DataT* operator&() const; +---- +!==== + +_Preconditions:_ This operator must be called from device code. + +_Returns:_ A pointer to the underlying device local memory object. + +''' + +[frame=all,grid=none,separator="@"] +!==== +a@ +[source,c++] +---- +template +multi_ptr get_multi_ptr() const; +---- +!==== + +_Preconditions:_ This function must be called from device code. + +_Returns:_ A `multi_ptr` to the underlying device local memory object. + + +== Examples + +=== Basic usage + +The following example illustrates a typical use of the `work_group_memory` +class. + +[source,c++] +---- +#include +namespace syclexp = sycl::ext::oneapi::experimental; + +constexpr size_t SIZE = 4096; +constexpr size_t WGSIZE = 256; + +int main() { + sycl::queue q; + + q.submit([&](sycl::handler &cgh) { + // Allocate one element for each work-item in the work-group. 
+ syclexp::work_group_memory mem{cgh}; + + sycl::nd_range ndr{{SIZE}, {WGSIZE}}; + cgh.parallel_for(ndr, [=](sycl::nd_item<> it) { + size_t id = it.get_local_linear_id(); + + // Each work-item has its own dedicated element of the array. + mem[id] = /*...*/; + }); + }).wait(); +} +---- + +=== Operations on types + +The following example illustrates various operations that can be done with the +`work_group_memory` class when it is templated with different `DataT` types. + +[source,c++] +---- +#include +namespace syclexp = sycl::ext::oneapi::experimental; + +constexpr size_t SIZE = 4096; +constexpr size_t WGSIZE = 256; + +struct point { + int x; + int y; +}; + +int main() { + sycl::queue q; + + q.submit([&](sycl::handler &cgh) { + syclexp::work_group_memory mem1{cgh}; // scalar + syclexp::work_group_memory mem2{cgh}; // bounded array + syclexp::work_group_memory mem3{5, cgh}; // unbounded array + syclexp::work_group_memory mem4{2, cgh}; // multi-dimensional array + syclexp::work_group_memory mem5{cgh}; // array of struct + + sycl::nd_range ndr{{SIZE}, {WGSIZE}}; + cgh.parallel_for(ndr, [=](sycl::nd_item<> it) { + if (it.get_group().leader()) { + // A "work_group_memory" templated on a scalar type acts much like the + // enclosed scalar type. + ++mem1; + mem1++; + mem1 += 1; + mem1 = mem1 + 1; + int *p1 = &mem1; + + // A "work_group_memory" templated on an array type (either bounded or + // unbounded) acts like an array. + ++mem2[4]; + mem2[4]++; + mem2[4] = mem2[4] + 1; + int *p2 = &mem2[4]; + + // A multi-dimensional array works as expected. + mem4[1][5] = mem4[1][5] + 1; + mem4[1][7] = mem4[1][7] + 1; + + // An array of structs works as expected too. + mem5[1].x++; + mem5[1].y = mem5[1].y + 1; + } + }); + }).wait(); +} +---- + +=== Usage with a free function kernel + +The following example illustrates usage of `work_group_memory` in a free +function kernel. 
+ +[source,c++] +---- +#include +namespace syclexp = sycl::ext::oneapi::experimental; +namespace syclext = sycl::ext::oneapi; + +constexpr size_t SIZE = 4096; +constexpr size_t WGSIZE = 256; + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) +void mykernel(syclexp::work_group_memory mem) { + size_t id = syclext::this_work_item::get_nd_item().get_local_linear_id(); + + // Each work-item has its own dedicated element of the device local memory + // array. + mem[id] = /*...*/; +} + +int main() { + sycl::queue q; + sycl::context ctxt = q.get_context(); + + // Get the kernel object for the "mykernel" kernel. + auto exe_bndl = + syclexp::get_kernel_bundle(ctxt); + sycl::kernel k_mykernel = exe_bndl.ext_oneapi_get_kernel(); + + q.submit([&](sycl::handler &cgh) { + // Allocate an array of device local memory with one element for each + // work-item in the work-group. + syclexp::work_group_memory mem{cgh}; + cgh.set_args(mem); + + sycl::nd_range ndr{{SIZE}, {WGSIZE}}; + cgh.parallel_for(ndr, k_mykernel); + }).wait(); +} +---- + + +== Issues + +* We have not agreed on the way in which `work_group_memory` should be created + when there is a property list. + One option is to add a new constructor that takes a `PropertyListT` parameter + and use CTAD to deduce the class template parameters. + However, we need some way to deduce `DataT` because CTAD does not work unless + it deduces all of the template parameters. + This leads to a constructor that requires a tag-type parameter like: ++ +[source,c++] +---- +template +struct type_tag {}; + +template +inline constexpr type_tag type; + +template +class work_group_memory { + work_group_memory(const type_tag&, handler& cgh, + const PropertyListT& props = {}); +}; + +// Deduction guide for the constructor that takes "type_tag". 
+template +work_group_memory(const type_tag&, handler&, const PropertyListT&) -> + work_group_memory; +---- ++ +Usage would be like: ++ +[source,c++] +---- +syclexp::work_group_memory mem{syclexp::type, cgh, props}; +---- ++ +Another option is to add a factory function like: ++ +[source,c++] +---- +template +work_group_memory +make_work_group_memory(handler& cgh, const PropertyListT& props = {}); +---- ++ +In which case, usage would be like: ++ +[source,c++] +---- +auto mem = syclexp::make_work_group_memory(cgh, props); +---- ++ +We decided to defer this decision for now because we don't have any properties +defined for this class yet anyways. + +* The copy constructor and copy assignment operator say that the copied object + "represents the same underlying device local memory as ``rhs``". + This is not currently the case in {dpcpp} when the copy happens in host code. + If you pass two `work_group_memory` objects as kernel parameters, each object + creates a unique device local memory region, even if one `work_group_memory` + object is a copy of the other. + The `local_accessor` class behaves the same way. + See https://github.com/KhronosGroup/SYCL-Docs/issues/552[this issue] against + the SYCL specification. diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index 6b776cfb777b3..a35d69cb2a0fa 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -399,7 +399,7 @@ static void destroy_event(event_ptr event); } // syclcompat ``` -### Memory Allocation +### Memory Operations This library provides interfaces to allocate memory to be accessed within kernel functions and on the host. 
The `syclcompat::malloc` function allocates device @@ -489,10 +489,12 @@ sycl::event memset_async(pitched_data pitch, int val, sycl::range<3> size, sycl::queue q = get_default_queue()); // 3D matrix +// Free +void wait_and_free(void *ptr, sycl::queue q = get_default_queue()); void free(void *ptr, sycl::queue q = get_default_queue()); -sycl::event free_async(const std::vector &pointers, - const std::vector &events, - sycl::queue q = get_default_queue()); +sycl::event enqueue_free(const std::vector &pointers, + const std::vector &events, + sycl::queue q = get_default_queue()); // Queries pointer allocation type class pointer_attributes { @@ -508,6 +510,64 @@ public: } // syclcompat ``` +The `syclcompat::experimental` namespace contains currently unsupported `memcpy` overloads which take a `syclcompat::experimental::memcpy_parameter` argument. These are included for forwards compatibility and currently throw a `std::runtime_error`. + +```cpp +namespace syclcompat { +namespace experimental { +// Forward declarations for types relating to unsupported memcpy_parameter API: + +enum memcpy_direction { + host_to_host, + host_to_device, + device_to_host, + device_to_device, + automatic +}; + +#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES +class image_mem_wrapper; +#endif +class image_matrix; + +/// Memory copy parameters for 2D/3D memory data. +struct memcpy_parameter { + struct data_wrapper { + pitched_data pitched{}; + sycl::id<3> pos{}; +#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES + experimental::image_mem_wrapper *image_bindless{nullptr}; +#endif + image_matrix *image{nullptr}; + }; + data_wrapper from{}; + data_wrapper to{}; + sycl::range<3> size{}; + syclcompat::detail::memcpy_direction direction{syclcompat::detail::memcpy_direction::automatic}; +}; + +/// [UNSUPPORTED] Synchronously copies 2D/3D memory data specified by \p param . +/// The function will return after the copy is completed. +/// +/// \param param Memory copy parameters. +/// \param q Queue to execute the copy task. 
+/// \returns no return value. +static inline void memcpy(const memcpy_parameter &param, + sycl::queue q = get_default_queue()); + +/// [UNSUPPORTED] Asynchronously copies 2D/3D memory data specified by \p param +/// . The return of the function does NOT guarantee the copy is completed. +/// +/// \param param Memory copy parameters. +/// \param q Queue to execute the copy task. +/// \returns no return value. +static inline void memcpy_async(const memcpy_parameter &param, + sycl::queue q = get_default_queue()); + +} // namespace experimental +} // namespace syclcompat +``` + Finally, the class `pitched_data`, which manages memory allocation for 3D spaces, padded to avoid uncoalesced memory accesses. @@ -760,7 +820,9 @@ public: unsigned int get_global_mem_cache_size() const; int get_image1d_max() const; auto get_image2d_max() const; + auto get_image2d_max(); auto get_image3d_max() const; + auto get_image3d_max(); void set_name(const char *name); void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes); @@ -844,9 +906,24 @@ static inline sycl::context get_default_context(); // Util function to get a CPU device. static inline device_ext &cpu_device(); +/// Filter out devices; only keep the device whose name contains one of the +/// subnames in \p dev_subnames. +/// May break device id mapping and change current device. It's better to be +/// called before other SYCLcompat or SYCL APIs. +static inline void filter_device(const std::vector &dev_subnames); + +/// Print all the devices (and their IDs) in the dev_mgr +static inline void list_devices(); + // Util function to select a device by its id static inline unsigned int select_device(unsigned int id); +// Util function to get the device id from a device +static inline unsigned int get_device_id(const sycl::device &dev); + +// Util function to get the number of available devices +static inline unsigned int device_count(); + } // syclcompat ``` @@ -861,13 +938,19 @@ independently of what is set in this parameter. 
Devices are managed through a helper class, `device_ext`. The `device_ext` class associates a vector of `sycl::queues` with its `sycl::device`. The `device_ext` destructor waits on a set of `sycl::event` which can be added to via -`add_event`. This is used, for example, to implement `syclcompat::free_async` to +`add_event`. This is used, for example, to implement `syclcompat::enqueue_free` to schedule release of memory after a kernel or `mempcy`. SYCL device properties can be queried through `device_ext` as well. `device_ext` also provides the `has_capability_or_fail` member function, which throws a `sycl::exception` if the device does not have the specified list of `sycl::aspect`. +Devices can be listed and filtered using `syclcompat::list_devices()` and +`syclcompat::filter_device()`. If `SYCLCOMPAT_VERBOSE` is defined at compile +time, the available SYCL devices are printed to the standard output both at +initialization time, and when the device list is filtered using +`syclcompat::filter_device`. + Users can manage queues through the `syclcompat::set_default_queue(sycl::queue q)` free function, and the `device_ext` `set_saved_queue`, `set_default_queue`, and `get_saved_queue` member functions. @@ -1511,6 +1594,13 @@ without modulo overflow for vector types. The functions `cmul`,`cdiv`,`cabs`, `cmul_add`, and `conj` define complex math operations which accept `sycl::vec` arguments representing complex values. +The `dp4a` function returns the 4-way 8-bit dot product accumulate for unsigned +and signed 32-bit integer values. The `dp2a_lo` and `dp2a_hi` functions return the +two-way 16-bit to 8-bit dot product using the second and first 16 bits of the +second operand, respectively. These three APIs return a single 32-bit value with +the accumulated result, which is unsigned if both operands are `uint32_t` and +signed otherwise. 
+ ```cpp inline unsigned int funnelshift_l(unsigned int low, unsigned int high, unsigned int shift); @@ -1692,6 +1782,24 @@ inline sycl::marray cmul_add(const sycl::marray a, template sycl::vec conj(sycl::vec x); template inline ValueT reverse_bits(ValueT a); + + +template +using dot_product_acc_t = + std::conditional_t && std::is_unsigned_v, + uint32_t, int32_t>; + +template +inline dot_product_acc_t dp2a_lo(T1 a, T2 b, + dot_product_acc_t c); + +template +inline dot_product_acc_t dp2a_hi(T1 a, T2 b, + dot_product_acc_t c); + +template +inline dot_product_acc_t dp4a(T1 a, T2 b, + dot_product_acc_t c); ``` `vectorized_binary` computes the `BinaryOperation` for two operands, @@ -1754,7 +1862,7 @@ struct sub_sat { } // namespace syclcompat ``` -Finally, the math header provides a set of functions to extend 32-bit operations +The math header provides a set of functions to extend 32-bit operations to 33 bit, and handle sign extension internally. There is support for `add`, `sub`, `absdiff`, `min` and `max` operations. Each operation provides overloads to include a second, separate, `BinaryOperation` after the first, and include @@ -1838,6 +1946,591 @@ inline constexpr RetT extend_max_sat(AT a, BT b, CT c, BinaryOperation second_op); ``` +Another set of vectorized extend 32-bit operations is provided in the math +header. These APIs treat each of the 32-bit operands as a 2-element vector +(16-bits each) while handling sign extension to 17-bits internally. There is +support for `add`, `sub`, `absdiff`, `min`, `max` and `avg` binary operations. +Each operation has a `_sat` variant which determines if the returning +value is saturated or not, and a `_add` variant that computes the binary sum +of the initial operation outputs and a third operand. + +```cpp +/// Compute vectorized addition of \p a and \p b, with each value treated as a +/// 2 elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized addition of the two values +template +inline constexpr RetT extend_vadd2(AT a, BT b, RetT c); + +/// Compute vectorized addition of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized addition of the two +/// values and the third value +template +inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c); + +/// Compute vectorized addition of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized addition of the two values with saturation +template +inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c); + +/// Compute vectorized subtraction of \p a and \p b, with each value treated as +/// a 2 elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values +template +inline constexpr RetT extend_vsub2(AT a, BT b, RetT c); + +/// Compute vectorized subtraction of \p a and \p b, with each value treated as +/// a 2 elements vector type and extend each element to 17 bit. Then add each +/// half of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized subtraction of the +/// two values and the third value +template +inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c); + +/// Compute vectorized subtraction of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values with saturation +template +inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c); + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values +template +inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c); + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized abs_diff of the +/// two values and the third value +template +inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c); + +/// Compute vectorized abs_diff of \p a and \p b with saturation, with each +/// value treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values with saturation +template +inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c); + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values +template +inline constexpr RetT extend_vmin2(AT a, BT b, RetT c); + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized minimum of the +/// two values and the third value +template +inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c); + +/// Compute vectorized minimum of \p a and \p b with saturation, with each value +/// treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values with saturation +template +inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c); + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized maximum of the two values +template +inline constexpr RetT extend_vmax2(AT a, BT b, RetT c); + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized maximum of the +/// two values and the third value +template +inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c); + +/// Compute vectorized maximum of \p a and \p b with saturation, with each value +/// treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized maximum of the two values with saturation +template +inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c); + +/// Compute vectorized average of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized average of the two values +template +inline constexpr RetT extend_vavrg2(AT a, BT b, RetT c); + +/// Compute vectorized average of \p a and \p b, with each value treated as a 2 +/// elements vector type and extend each element to 17 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized average of the +/// two values and the third value +template +inline constexpr RetT extend_vavrg2_add(AT a, BT b, RetT c); + +/// Compute vectorized average of \p a and \p b with saturation, with each value +/// treated as a 2 elements vector type and extend each element to 17 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized average of the two values with saturation +template +inline constexpr RetT extend_vavrg2_sat(AT a, BT b, RetT c); +``` + +Similarly, a set of vectorized extend 32-bit operations is provided in the math +header treating each of the 32-bit operands as a 4-element vector (8-bits each) +while handling sign extension to 9-bits internally. There is support for `add`, +`sub`, `absdiff`, `min`, `max` and `avg` binary operations. +Each operation has a `_sat` variant which determines if the returning +value is saturated or not, and a `_add` variant that computes the binary sum +of the initial operation outputs and a third operand. + +```cpp +/// Compute vectorized addition of \p a and \p b, with each value treated as a +/// 4 elements vector type and extend each element to 9 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized addition of the two values +template +inline constexpr RetT extend_vadd4(AT a, BT b, RetT c); + +/// Compute vectorized addition of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. Then add each half +/// of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized addition of the two +/// values and the third value +template +inline constexpr RetT extend_vadd4_add(AT a, BT b, RetT c); + +/// Compute vectorized addition of \p a and \p b with saturation, with each +/// value treated as a 4 elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized addition of the two values with saturation +template +inline constexpr RetT extend_vadd4_sat(AT a, BT b, RetT c); + +/// Compute vectorized subtraction of \p a and \p b, with each value treated as +/// a 4 elements vector type and extend each element to 9 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values +template +inline constexpr RetT extend_vsub4(AT a, BT b, RetT c); + +/// Compute vectorized subtraction of \p a and \p b, with each value treated as +/// a 4 elements vector type and extend each element to 9 bit. Then add each +/// half of the result and add with \p c. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized subtraction of the +/// two values and the third value +template +inline constexpr RetT extend_vsub4_add(AT a, BT b, RetT c); + +/// Compute vectorized subtraction of \p a and \p b with saturation, with each +/// value treated as a 4 elements vector type and extend each element to 9 bit. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized subtraction of the two values with saturation +template +inline constexpr RetT extend_vsub4_sat(AT a, BT b, RetT c); + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values +template +inline constexpr RetT extend_vabsdiff4(AT a, BT b, RetT c); + +/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized abs_diff of the +/// two values and the third value +template +inline constexpr RetT extend_vabsdiff4_add(AT a, BT b, RetT c); + +/// Compute vectorized abs_diff of \p a and \p b with saturation, with each +/// value treated as a 4 elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized abs_diff of the two values with saturation +template +inline constexpr RetT extend_vabsdiff4_sat(AT a, BT b, RetT c); + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values +template +inline constexpr RetT extend_vmin4(AT a, BT b, RetT c); + +/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized minimum of the +/// two values and the third value +template +inline constexpr RetT extend_vmin4_add(AT a, BT b, RetT c); + +/// Compute vectorized minimum of \p a and \p b with saturation, with each value +/// treated as a 4 elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized minimum of the two values with saturation +template +inline constexpr RetT extend_vmin4_sat(AT a, BT b, RetT c); + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized maximum of the two values +template +inline constexpr RetT extend_vmax4(AT a, BT b, RetT c); + +/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized maximum of the +/// two values and the third value +template +inline constexpr RetT extend_vmax4_add(AT a, BT b, RetT c); + +/// Compute vectorized maximum of \p a and \p b with saturation, with each value +/// treated as a 4 elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized maximum of the two values with saturation +template +inline constexpr RetT extend_vmax4_sat(AT a, BT b, RetT c); + +/// Compute vectorized average of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized average of the two values +template +inline constexpr RetT extend_vavrg4(AT a, BT b, RetT c); + +/// Compute vectorized average of \p a and \p b, with each value treated as a 4 +/// elements vector type and extend each element to 9 bit. Then add each half +/// of the result and add with \p c. 
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The addition of each half of extend vectorized average of the +/// two values and the third value +template +inline constexpr RetT extend_vavrg4_add(AT a, BT b, RetT c); + +/// Compute vectorized average of \p a and \p b with saturation, with each value +/// treated as a 4 elements vector type and extend each element to 9 bit. +/// \tparam [in] RetT The type of the return value, can only be 32 bit integer +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \returns The extend vectorized average of the two values with saturation +template +inline constexpr RetT extend_vavrg4_sat(AT a, BT b, RetT c); +``` + +Vectorized comparison APIs also provided in the math header behave similarly +and support a `std` comparison operator parameter which can be `greater`, +`less`, `greater_equal`, `less_equal`, `equal_to` or `not_equal_to`. These APIs +cover both the 2-elements *(16-bits each)* and 4-elements *(8-bits each)* +variants, as well as an additional `_add` variant that computes the sum of the +2/4 output elements. + +```cpp +/// Extend \p a and \p b to 33 bit and vectorized compare input values using +/// specified comparison \p cmp . 
+/// +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \tparam [in] BinaryOperation The type of the compare operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] cmp The comparison operator +/// \returns The comparison result of the two extended values. +template <typename AT, typename BT, typename BinaryOperation> +inline constexpr unsigned extend_vcompare2(AT a, BT b, BinaryOperation cmp); + +/// Extend Inputs to 33 bit, and vectorized compare input values using specified +/// comparison \p cmp , then add the result with \p c . +/// +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \tparam [in] BinaryOperation The type of the compare operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] cmp The comparison operator +/// \returns The comparison result of the two extended values, and add the +/// result with \p c . +template <typename AT, typename BT, typename BinaryOperation> +inline constexpr unsigned extend_vcompare2_add(AT a, BT b, unsigned c, + BinaryOperation cmp); + +/// Extend \p a and \p b to 33 bit and vectorized compare input values using +/// specified comparison \p cmp . +/// +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \tparam [in] BinaryOperation The type of the compare operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] cmp The comparison operator +/// \returns The comparison result of the two extended values. +template <typename AT, typename BT, typename BinaryOperation> +inline constexpr unsigned extend_vcompare4(AT a, BT b, BinaryOperation cmp); + +/// Extend Inputs to 33 bit, and vectorized compare input values using specified +/// comparison \p cmp , then add the result with \p c . 
+/// +/// \tparam [in] AT The type of the first value, can only be 32 bit integer +/// \tparam [in] BT The type of the second value, can only be 32 bit integer +/// \tparam [in] BinaryOperation The type of the compare operation +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] cmp The comparison operator +/// \returns The comparison result of the two extended values, and add the +/// result with \p c . +template <typename AT, typename BT, typename BinaryOperation> +inline constexpr unsigned extend_vcompare4_add(AT a, BT b, unsigned c, + BinaryOperation cmp); +``` + +The math header file provides APIs for bit-field insertion (`bfi_safe`) and +bit-field extraction (`bfe_safe`). These are bounds-checked variants of +underlying `detail` APIs (`detail::bfi`, `detail::bfe`) which, in future +releases, will be exposed to the user. + +```c++ + +/// Bitfield-insert with boundary checking. +/// +/// Align and insert a bit field from \p x into \p y . Source \p +/// bit_start gives the starting bit position for the insertion, and source +/// \p num_bits gives the bit field length in bits. +/// +/// \tparam T The type of \p x and \p y , must be an unsigned integer. +/// \param x The source of the bitfield. +/// \param y The source where bitfield is inserted. +/// \param bit_start The position to start insertion. +/// \param num_bits The number of bits to insert. +template <typename T> +inline T bfi_safe(const T x, const T y, const uint32_t bit_start, + const uint32_t num_bits); + +/// Bitfield-extract with boundary checking. +/// +/// Extract bit field from \p source and return the zero or sign-extended +/// result. Source \p bit_start gives the bit field starting bit position, +/// and source \p num_bits gives the bit field length in bits. +/// +/// The result is padded with the sign bit of the extracted field. If `num_bits` +/// is zero, the result is zero. 
If the start position is beyond the msb of the +/// input, the result is filled with the replicated sign bit of the extracted +/// field. +/// +/// \tparam T The type of \p source value, must be an integer. +/// \param source The source value to extract from. +/// \param bit_start The position to start extracting. +/// \param num_bits The number of bits to extract. +template <typename T> +inline T bfe_safe(const T source, const uint32_t bit_start, + const uint32_t num_bits); +``` + ## Sample Code Below is a simple linear algebra sample, which computes `y = mx + b` implemented diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 6fd4b9ebf63db..3031e73a3c113 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -1019,10 +1019,16 @@ extern __DPCPP_SYCL_EXTERNAL void __spirv_ocl_prefetch(const __attribute__((opencl_global)) char *Ptr, size_t NumBytes) noexcept; -extern __DPCPP_SYCL_EXTERNAL uint16_t -__spirv_ConvertFToBF16INTEL(float) noexcept; extern __DPCPP_SYCL_EXTERNAL float __spirv_ConvertBF16ToFINTEL(uint16_t) noexcept; +extern __DPCPP_SYCL_EXTERNAL uint16_t +__spirv_ConvertFToBF16INTEL(float) noexcept; +template +extern __DPCPP_SYCL_EXTERNAL __ocl_vec_t + __spirv_ConvertBF16ToFINTEL(__ocl_vec_t) noexcept; +template +extern __DPCPP_SYCL_EXTERNAL __ocl_vec_t + __spirv_ConvertFToBF16INTEL(__ocl_vec_t) noexcept; __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT __ocl_vec_t @@ -1280,6 +1286,7 @@ __CLC_BF16_SCAL_VEC(uint32_t) extern __DPCPP_SYCL_EXTERNAL int32_t __spirv_BuiltInGlobalHWThreadIDINTEL(); extern __DPCPP_SYCL_EXTERNAL int32_t __spirv_BuiltInSubDeviceIDINTEL(); +extern __DPCPP_SYCL_EXTERNAL uint64_t __spirv_ReadClockKHR(int); template extern __DPCPP_SYCL_EXTERNAL diff --git a/sycl/include/sycl/atomic_ref.hpp b/sycl/include/sycl/atomic_ref.hpp index 5c163cd5fe8e2..1a1c4a63000f7 100644 --- a/sycl/include/sycl/atomic_ref.hpp +++ b/sycl/include/sycl/atomic_ref.hpp @@ 
-568,9 +568,14 @@ class [[__sycl_detail__::__uses_aspects__(aspect::atomic64)]] atomic_ref_impl< // Partial specialization for pointer types // Arithmetic is emulated because target's representation of T* is unknown // TODO: Find a way to use intptr_t or uintptr_t atomics instead -template -class atomic_ref_impl +template +#ifndef __SYCL_DEVICE_ONLY__ +class atomic_ref_impl< +#else +class [[__sycl_detail__::__uses_aspects__(aspect::atomic64)]] atomic_ref_impl< +#endif + T *, IsAspectAtomic64AttrUsed, DefaultOrder, DefaultScope, AddressSpace> : public atomic_ref_base { diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp index f0616dcce51b9..8d823c109ee34 100644 --- a/sycl/include/sycl/detail/cg.hpp +++ b/sycl/include/sycl/detail/cg.hpp @@ -534,33 +534,41 @@ class CGCopyImage : public CG { /// "Semaphore Wait" command group class. class CGSemaphoreWait : public CG { sycl::detail::pi::PiInteropSemaphoreHandle MInteropSemaphoreHandle; + std::optional MWaitValue; public: CGSemaphoreWait( sycl::detail::pi::PiInteropSemaphoreHandle InteropSemaphoreHandle, - CG::StorageInitHelper CGData, detail::code_location loc = {}) + std::optional WaitValue, CG::StorageInitHelper CGData, + detail::code_location loc = {}) : CG(SemaphoreWait, std::move(CGData), std::move(loc)), - MInteropSemaphoreHandle(InteropSemaphoreHandle) {} + MInteropSemaphoreHandle(InteropSemaphoreHandle), MWaitValue(WaitValue) { + } sycl::detail::pi::PiInteropSemaphoreHandle getInteropSemaphoreHandle() const { return MInteropSemaphoreHandle; } + std::optional getWaitValue() const { return MWaitValue; } }; /// "Semaphore Signal" command group class. 
class CGSemaphoreSignal : public CG { sycl::detail::pi::PiInteropSemaphoreHandle MInteropSemaphoreHandle; + std::optional MSignalValue; public: CGSemaphoreSignal( sycl::detail::pi::PiInteropSemaphoreHandle InteropSemaphoreHandle, - CG::StorageInitHelper CGData, detail::code_location loc = {}) + std::optional SignalValue, CG::StorageInitHelper CGData, + detail::code_location loc = {}) : CG(SemaphoreSignal, std::move(CGData), std::move(loc)), - MInteropSemaphoreHandle(InteropSemaphoreHandle) {} + MInteropSemaphoreHandle(InteropSemaphoreHandle), + MSignalValue(SignalValue) {} sycl::detail::pi::PiInteropSemaphoreHandle getInteropSemaphoreHandle() const { return MInteropSemaphoreHandle; } + std::optional getSignalValue() const { return MSignalValue; } }; /// "Execute command-buffer" command group class. diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp index 3b0ce7988f576..bca2fd27eeb49 100644 --- a/sycl/include/sycl/detail/generic_type_traits.hpp +++ b/sycl/include/sycl/detail/generic_type_traits.hpp @@ -17,6 +17,8 @@ #include // for BIsRepresentationT #include // for multi_ptr, address_spa... +#include // for bfloat16 storage type. + #include // for byte #include // for uint8_t #include // for numeric_limits @@ -252,6 +254,16 @@ inline constexpr bool is_genfloatptr_marray_v = (IsDecorated == access::decorated::yes || IsDecorated == access::decorated::no); +template +using is_byte = typename +#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + std::is_same; +#else + std::false_type; +#endif + +template inline constexpr bool is_byte_v = is_byte::value; + template using make_floating_point_t = make_type_t; @@ -332,6 +344,8 @@ template auto convertToOpenCLType(T &&x) { std::declval()))>, no_ref::size()>; #ifdef __SYCL_DEVICE_ONLY__ + +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES // TODO: for some mysterious reasons on NonUniformGroups E2E tests fail if // we use the "else" version only. 
I suspect that's an issues with // non-uniform groups implementation. @@ -340,6 +354,10 @@ template auto convertToOpenCLType(T &&x) { else return static_cast( x.template as()); +#else // __INTEL_PREVIEW_BREAKING_CHANGES + return sycl::bit_cast(x); +#endif // __INTEL_PREVIEW_BREAKING_CHANGES + #else return x.template as(); #endif @@ -370,7 +388,13 @@ template auto convertToOpenCLType(T &&x) { static_assert(sizeof(OpenCLType) == sizeof(T)); return static_cast(x); } else if constexpr (is_bfloat16_v) { + // On host, don't interpret BF16 as uint16. +#ifdef __SYCL_DEVICE_ONLY__ + using OpenCLType = sycl::ext::oneapi::detail::Bfloat16StorageT; + return sycl::bit_cast(x); +#else return std::forward(x); +#endif } else if constexpr (std::is_floating_point_v) { static_assert(std::is_same_v || std::is_same_v, diff --git a/sycl/include/sycl/detail/group_sort_impl.hpp b/sycl/include/sycl/detail/group_sort_impl.hpp index af060edbbdc4c..6974413492a7c 100644 --- a/sycl/include/sycl/detail/group_sort_impl.hpp +++ b/sycl/include/sycl/detail/group_sort_impl.hpp @@ -15,11 +15,43 @@ #include #include #include +#include + +#include namespace sycl { inline namespace _V1 { namespace detail { +// Helpers for sorting algorithms +#ifdef __SYCL_DEVICE_ONLY__ +template +static __SYCL_ALWAYS_INLINE T *align_scratch(sycl::span scratch, + Group g, + size_t number_of_elements) { + // Adjust the scratch pointer based on alignment of the type T. + // Per extension specification if scratch size is less than the value + // returned by memory_required then behavior is undefined, so we don't check + // that the scratch size satisfies the requirement. + T *scratch_begin = nullptr; + // We must have a barrier here before array placement new because it is + // possible that scratch memory is already in use, so we need to synchronize + // work items. 
+ sycl::group_barrier(g); + if (g.leader()) { + void *scratch_ptr = scratch.data(); + size_t space = scratch.size(); + scratch_ptr = std::align(alignof(T), number_of_elements * sizeof(T), + scratch_ptr, space); + scratch_begin = ::new (scratch_ptr) T[number_of_elements]; + } + // Broadcast leader's pointer (the beginning of the scratch) to all work + // items in the group. + scratch_begin = sycl::group_broadcast(g, scratch_begin); + return scratch_begin; +} +#endif + // ---- merge sort implementation // following two functions could be useless if std::[lower|upper]_bound worked @@ -68,22 +100,10 @@ struct GetValueType> { using type = ElementType; }; -// since we couldn't assign data to raw memory, it's better to use placement -// for first assignment -template -void set_value(Acc ptr, const size_t idx, const T &val, bool is_first) { - if (is_first) { - ::new (ptr + idx) T(val); - } else { - ptr[idx] = val; - } -} - template void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const size_t start_1, const size_t end_1, const size_t end_2, - const size_t start_out, Compare comp, const size_t chunk, - bool is_first) { + const size_t start_out, Compare comp, const size_t chunk) { const size_t start_2 = end_1; // Borders of the sequences to merge within this call const size_t local_start_1 = @@ -111,8 +131,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const size_t l_shift_1 = local_start_1 - start_1; const size_t l_shift_2 = l_search_bound_2 - start_2; - set_value(out_acc1, start_out + l_shift_1 + l_shift_2, local_l_item_1, - is_first); + out_acc1[start_out + l_shift_1 + l_shift_2] = local_l_item_1; size_t r_search_bound_2{}; // find right border in 2nd sequence @@ -123,8 +142,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const auto r_shift_1 = local_end_1 - 1 - start_1; const auto r_shift_2 = r_search_bound_2 - start_2; - set_value(out_acc1, start_out + r_shift_1 + r_shift_2, local_r_item_1, - is_first); + 
out_acc1[start_out + r_shift_1 + r_shift_2] = local_r_item_1; } // Handle intermediate items @@ -138,8 +156,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const size_t shift_1 = idx - start_1; const size_t shift_2 = l_search_bound_2 - start_2; - set_value(out_acc1, start_out + shift_1 + shift_2, intermediate_item_1, - is_first); + out_acc1[start_out + shift_1 + shift_2] = intermediate_item_1; } } // Process 2nd sequence @@ -152,8 +169,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const size_t l_shift_1 = l_search_bound_1 - start_1; const size_t l_shift_2 = local_start_2 - start_2; - set_value(out_acc1, start_out + l_shift_1 + l_shift_2, local_l_item_2, - is_first); + out_acc1[start_out + l_shift_1 + l_shift_2] = local_l_item_2; size_t r_search_bound_1{}; // find right border in 1st sequence @@ -164,8 +180,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const size_t r_shift_1 = r_search_bound_1 - start_1; const size_t r_shift_2 = local_end_2 - 1 - start_2; - set_value(out_acc1, start_out + r_shift_1 + r_shift_2, local_r_item_2, - is_first); + out_acc1[start_out + r_shift_1 + r_shift_2] = local_r_item_2; } // Handle intermediate items @@ -179,8 +194,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1, const size_t shift_1 = l_search_bound_1 - start_1; const size_t shift_2 = idx - start_2; - set_value(out_acc1, start_out + shift_1 + shift_2, intermediate_item_2, - is_first); + out_acc1[start_out + shift_1 + shift_2] = intermediate_item_2; } } } @@ -200,10 +214,9 @@ void bubble_sort(Iter first, const size_t begin, const size_t end, } } -template +template void merge_sort(Group group, Iter first, const size_t n, Compare comp, - std::byte *scratch) { - using T = typename GetValueType::type; + T *scratch) { const size_t idx = group.get_local_linear_id(); const size_t local = group.get_local_range().size(); const size_t chunk = (n - 1) / local + 1; @@ -212,9 +225,7 @@ void 
merge_sort(Group group, Iter first, const size_t n, Compare comp, bubble_sort(first, idx * chunk, sycl::min((idx + 1) * chunk, n), comp); sycl::group_barrier(group); - T *temp = reinterpret_cast(scratch); - bool data_in_temp = false; - bool is_first = true; + bool data_in_scratch = false; size_t sorted_size = 1; while (sorted_size * chunk < n) { const size_t start_1 = @@ -223,26 +234,24 @@ void merge_sort(Group group, Iter first, const size_t n, Compare comp, const size_t end_2 = sycl::min(end_1 + sorted_size * chunk, n); const size_t offset = chunk * (idx % sorted_size); - if (!data_in_temp) { - merge(offset, first, temp, start_1, end_1, end_2, start_1, comp, chunk, - is_first); + if (!data_in_scratch) { + merge(offset, first, scratch, start_1, end_1, end_2, start_1, comp, + chunk); } else { - merge(offset, temp, first, start_1, end_1, end_2, start_1, comp, chunk, - /*is_first*/ false); + merge(offset, scratch, first, start_1, end_1, end_2, start_1, comp, + chunk); } sycl::group_barrier(group); - data_in_temp = !data_in_temp; + data_in_scratch = !data_in_scratch; sorted_size *= 2; - if (is_first) - is_first = false; } // copy back if data is in a temporary storage - if (data_in_temp) { + if (data_in_scratch) { for (size_t i = 0; i < chunk; ++i) { if (idx * chunk + i < n) { - first[idx * chunk + i] = temp[idx * chunk + i]; + first[idx * chunk + i] = scratch[idx * chunk + i]; } } sycl::group_barrier(group); @@ -601,7 +610,7 @@ template void performRadixIterStaticSize(GroupT group, const uint32_t radix_iter, const uint32_t last_iter, KeysT *keys, - ValsT vals, const ScratchMemory &memory) { + ValsT *vals, const ScratchMemory &memory) { const uint32_t radix_states = getStatesInBits(radix_bits); const size_t wgsize = group.get_local_linear_range(); const size_t idx = group.get_local_linear_id(); diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 66efb7c750ebe..995579d612afb 100644 --- a/sycl/include/sycl/detail/pi.def +++ 
b/sycl/include/sycl/detail/pi.def @@ -206,9 +206,11 @@ _PI_API(piextMemMipmapFree) // Interop _PI_API(piextMemImportOpaqueFD) +_PI_API(piextImportExternalMemory) _PI_API(piextMemReleaseInterop) _PI_API(piextMemMapExternalArray) _PI_API(piextImportExternalSemaphoreOpaqueFD) +_PI_API(piextImportExternalSemaphore) _PI_API(piextDestroyExternalSemaphore) _PI_API(piextWaitExternalSemaphore) _PI_API(piextSignalExternalSemaphore) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 6f6821360207a..f4e67f7ba6113 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -176,9 +176,24 @@ // piextMemSampledImageCreate // 15.52 Added piEnqueueTimestampRecordingExp and // PI_EXT_ONEAPI_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT. +// 15.53 Added new extension functions that enable importing various external +// handle types: +// - piextImportExternalMemory +// - piextImportExternalSemaphore +// Deprecated no longer necessary functions: +// - piextImportExternalSemaphoreOpaqueFD +// - piextMemImportOpaqueFD +// The following interop semaphore related functions now take extra +// `bool` and `pi_uint64` values: +// - `piextWaitExternalSemaphore` +// - `piextSignalExternalSemaphore` +// The `pi_external_mem_handle_type` enum now has a new +// `win32_nt_dx12_resource` value. +// the `pi_external_semaphore_handle_type` enum now has a new +// `win32_nt_dx12_fence` value. #define _PI_H_VERSION_MAJOR 15 -#define _PI_H_VERSION_MINOR 52 +#define _PI_H_VERSION_MINOR 53 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -1161,6 +1176,60 @@ struct pi_device_binaries_struct { }; using pi_device_binaries = pi_device_binaries_struct *; +// This union encapsulates the two external handles we currently support. +// When choosing the correct field from the union we need to look at the value +// of the enum `pi_external_mem_handle_type` or +// `pi_external_semaphore_handle_type`. 
+union pi_external_handle { + // Used universally for all Linux based interoperability functionality. + // The associated enum `pi_external_mem_handle_type` in + // `pi_external_mem_descriptor` should always be set to + // `pi_external_mem_handle_type::opaque_fd`. Likewise for semaphore handles. + int file_descriptor; + + // Could be Win32 NT, KMT, or various DX12 handle types. + // The `void *` type is used for all of these. + // The exact handle type depends on the enum `pi_external_mem_handle_type`. + // This enum is found in `pi_external_mem_descriptor`. + // It could be a regular NT handle type (`win32_nt_handle`) or a DX12 specific + // resource handle type (`win32_nt_dx12_resource`), etc. + void *win32_handle; +}; + +// This enum enumerates the specific external memory handles types that we want +// to import. +enum class pi_external_mem_handle_type { + opaque_fd = 0, + win32_nt_handle = 1, + win32_nt_dx12_resource = 2, +}; + +// This struct holds all the information required to import external memory. +struct pi_external_mem_descriptor { + // The type of the external memory handle. + pi_external_mem_handle_type handleType; + // Union encapsulates both Opaque FD (linux) and Win32 handles (Windows). + pi_external_handle handle; + // Size of the external memory in bytes. + size_t memorySizeBytes; +}; + +// This enum enumerates the specific external semaphore handles types that we +// want to import. +enum class pi_external_semaphore_handle_type { + opaque_fd = 0, + win32_nt_handle = 1, + win32_nt_dx12_fence = 2, +}; + +// This struct holds all the information required to import external semaphores. +struct pi_external_semaphore_descriptor { + // The type of the external semaphore handle. + pi_external_semaphore_handle_type handleType; + // Union encapsulates both Opaque FD (linux) and Win32 handles (Windows). + pi_external_handle handle; +}; + // Opaque types that make reading build log errors easier. 
struct _pi_platform; struct _pi_device; @@ -2856,6 +2925,9 @@ __SYCL_EXPORT pi_result piextMemImageGetInfo( const pi_image_mem_handle mem_handle, pi_image_info param_name, void *param_value, size_t *param_value_size_ret); +/// [DEPRECATED] This function is deprecated in favor of +/// `piextImportExternalMemory` +/// /// API to import external memory in the form of a file descriptor. /// /// \param context is the pi_context @@ -2864,9 +2936,23 @@ __SYCL_EXPORT pi_result piextMemImageGetInfo( /// \param file_descriptor is the file descriptor /// \param ret_handle is the returned interop memory handle to the external /// memory +__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of " + "`piextImportExternalMemory`") +pi_result piextMemImportOpaqueFD(pi_context context, pi_device device, + size_t size, int file_descriptor, + pi_interop_mem_handle *ret_handle); + +/// API to import external memory +/// +/// \param context is the pi_context +/// \param device is the pi_device +/// \param mem_descriptor is the interop memory descriptor +/// \param ret_handle is the returned interop memory handle to the external +/// memory __SYCL_EXPORT pi_result -piextMemImportOpaqueFD(pi_context context, pi_device device, size_t size, - int file_descriptor, pi_interop_mem_handle *ret_handle); +piextImportExternalMemory(pi_context context, pi_device device, + pi_external_mem_descriptor *mem_descriptor, + pi_interop_mem_handle *ret_handle); /// API to map an interop memory handle to an image memory handle. /// @@ -2890,6 +2976,9 @@ __SYCL_EXPORT pi_result piextMemMapExternalArray( __SYCL_EXPORT pi_result piextMemReleaseInterop( pi_context context, pi_device device, pi_interop_mem_handle memory_handle); +/// [DEPRECATED] This function is deprecated in favor of +/// `piextImportExternalSemaphore` +/// /// API to import an external semaphore in the form of a file descriptor. 
/// /// \param context is the pi_context @@ -2897,9 +2986,24 @@ __SYCL_EXPORT pi_result piextMemReleaseInterop( /// \param file_descriptor is the file descriptor /// \param ret_handle is the returned interop semaphore handle to the external /// semaphore -__SYCL_EXPORT pi_result piextImportExternalSemaphoreOpaqueFD( - pi_context context, pi_device device, int file_descriptor, - pi_interop_semaphore_handle *ret_handle); +__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of " + "`piextImportExternalSemaphore`") +pi_result +piextImportExternalSemaphoreOpaqueFD(pi_context context, pi_device device, + int file_descriptor, + pi_interop_semaphore_handle *ret_handle); + +/// API to import an external semaphore +/// +/// \param context is the pi_context +/// \param device is the pi_device +/// \param sem_descriptor is the interop semaphore descriptor +/// \param ret_handle is the returned interop semaphore handle to the external +/// semaphore +__SYCL_EXPORT pi_result +piextImportExternalSemaphore(pi_context context, pi_device device, + pi_external_semaphore_descriptor *sem_descriptor, + pi_interop_semaphore_handle *ret_handle); /// API to destroy the external semaphore handle. /// @@ -2915,12 +3019,20 @@ piextDestroyExternalSemaphore(pi_context context, pi_device device, /// /// \param command_queue is the queue instructed to wait /// \param sem_handle is the interop semaphore handle +/// \param has_wait_value indicates whether the semaphore is capable of setting +/// user defined state passed through `wait_value`. +/// Otherwise `wait_value` is ignored. +/// \param wait_value is the user defined value of the semaphore state for +/// which this operation will wait upon, provided the +/// semaphore type has this capability, and +/// `has_wait_value` is `true`. 
/// \param num_events_in_wait_list is the number of events in the wait list /// \param event_wait_list is the list of events to wait on before this /// operation /// \param event is the returned event representing this operation __SYCL_EXPORT pi_result piextWaitExternalSemaphore( pi_queue command_queue, pi_interop_semaphore_handle sem_handle, + bool has_wait_value, pi_uint64 wait_value, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); @@ -2929,12 +3041,19 @@ __SYCL_EXPORT pi_result piextWaitExternalSemaphore( /// /// \param command_queue is the queue instructed to signal /// \param sem_handle is the interop semaphore handle to signal +/// \param has_signal_value indicates whether the semaphore is capable of +/// setting user defined state passed through +/// `signal_value`. Otherwise `signal_value` is ignored. +/// \param signal_value is the user defined value to which the state of the +/// semaphore will be set, provided the semaphore type has +/// this capability, and `has_signal_value` is `true`. 
/// \param num_events_in_wait_list is the number of events in the wait list /// \param event_wait_list is the list of events to wait on before this /// operation /// \param event is the returned event representing this operation __SYCL_EXPORT pi_result piextSignalExternalSemaphore( pi_queue command_queue, pi_interop_semaphore_handle sem_handle, + bool has_signal_value, pi_uint64 signal_value, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); diff --git a/sycl/include/sycl/detail/pi.hpp b/sycl/include/sycl/detail/pi.hpp index 9442d6f2a86bc..0a6713dab1096 100644 --- a/sycl/include/sycl/detail/pi.hpp +++ b/sycl/include/sycl/detail/pi.hpp @@ -146,6 +146,8 @@ using PiImageMemHandle = ::pi_image_mem_handle; using PiImageCopyFlags = ::pi_image_copy_flags; using PiInteropMemHandle = ::pi_interop_mem_handle; using PiInteropSemaphoreHandle = ::pi_interop_semaphore_handle; +using PiExternalMemDescriptor = ::pi_external_mem_descriptor; +using PiExternalSemaphoreDescriptor = ::pi_external_semaphore_descriptor; using PiImageOffset = ::pi_image_offset_struct; using PiImageRegion = ::pi_image_region_struct; diff --git a/sycl/include/sycl/detail/vector_arith.hpp b/sycl/include/sycl/detail/vector_arith.hpp new file mode 100644 index 0000000000000..7a2bce152c1d3 --- /dev/null +++ b/sycl/include/sycl/detail/vector_arith.hpp @@ -0,0 +1,394 @@ +//=== vector_arith.hpp --- Implementation of arithmetic ops on sycl::vec ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include // for half, cl_char, cl_int +#include // for is_sigeninteger, is_s... 
+#include // for is_contained +#include // for is_floating_point + +#include // bfloat16 + +#include +#include // for enable_if_t, is_same + +namespace sycl { +inline namespace _V1 { + +template class vec; + +namespace detail { + +template class VecAccess; + +// Macros to populate binary operation on sycl::vec. +#if defined(__SYCL_BINOP) || defined(BINOP_BASE) +#error "Undefine __SYCL_BINOP and BINOP_BASE macro" +#endif + +#ifdef __SYCL_DEVICE_ONLY__ +#define BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND) \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs, \ + const vec_t & Rhs) { \ + vec_t Ret; \ + if constexpr (vec_t::IsBfloat16) { \ + for (size_t I = 0; I < NumElements; ++I) { \ + Ret[I] = Lhs[I] BINOP Rhs[I]; \ + } \ + } else { \ + auto ExtVecLhs = sycl::bit_cast(Lhs); \ + auto ExtVecRhs = sycl::bit_cast(Rhs); \ + Ret = vec(ExtVecLhs BINOP ExtVecRhs); \ + if constexpr (std::is_same_v && CONVERT) { \ + vec_arith_common::ConvertToDataT(Ret); \ + } \ + } \ + return Ret; \ + } +#else // __SYCL_DEVICE_ONLY__ + +#define BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND) \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs, \ + const vec_t & Rhs) { \ + vec_t Ret{}; \ + for (size_t I = 0; I < NumElements; ++I) { \ + Ret[I] = Lhs[I] BINOP Rhs[I]; \ + } \ + return Ret; \ + } +#endif // __SYCL_DEVICE_ONLY__ + +#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT, COND) \ + BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND) \ + \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs, \ + const DataT & Rhs) { \ + return Lhs BINOP vec_t(Rhs); \ + } \ + template \ + friend std::enable_if_t<(COND), vec_t> operator BINOP(const DataT & Lhs, \ + const vec_t & Rhs) { \ + return vec_t(Lhs) BINOP Rhs; \ + } \ + template \ + friend std::enable_if_t<(COND), vec_t> &operator OPASSIGN( \ + vec_t & Lhs, const vec_t & Rhs) { \ + Lhs = Lhs BINOP Rhs; \ + return Lhs; \ + } \ + template \ + friend 
std::enable_if_t<(Num != 1) && (COND), vec_t &> operator OPASSIGN( \ + vec_t & Lhs, const DataT & Rhs) { \ + Lhs = Lhs BINOP vec_t(Rhs); \ + return Lhs; \ + } + +/**************************************************************** + * vec_arith_common + * / | \ + * / | \ + * vec_arith vec_arith ... vec_arith + * \ | / + * \ | / + * sycl::vec + * + * vec_arith_common is the base class for vec_arith. It contains + * the common math operators of sycl::vec for all types. + * vec_arith is the derived class that contains the math operators + * specialized for certain types. sycl::vec inherits from vec_arith. + * *************************************************************/ +template class vec_arith_common; +template struct vec_helper; + +template +class vec_arith : public vec_arith_common { +protected: + using vec_t = vec; + using ocl_t = detail::select_cl_scalar_integral_signed_t; + template using vec_data = vec_helper; + + // operator!. + friend vec operator!(const vec_t &Rhs) { +#ifdef __SYCL_DEVICE_ONLY__ + if constexpr (!vec_t::IsBfloat16) { + auto extVec = sycl::bit_cast(Rhs); + vec Ret{ + (typename vec::vector_t) !extVec}; + return Ret; + } else +#endif // __SYCL_DEVICE_ONLY__ + { + vec Ret{}; + for (size_t I = 0; I < NumElements; ++I) { + // static_cast will work here as the output of ! operator is either 0 or + // -1. + Ret[I] = static_cast(-1 * (!Rhs[I])); + } + return Ret; + } + } + + // operator +. + friend vec_t operator+(const vec_t &Lhs) { +#ifdef __SYCL_DEVICE_ONLY__ + auto extVec = sycl::bit_cast(Lhs); + return vec_t{+extVec}; +#else + vec_t Ret{}; + for (size_t I = 0; I < NumElements; ++I) + Ret[I] = +Lhs[I]; + return Ret; +#endif + } + + // operator -. 
+ friend vec_t operator-(const vec_t &Lhs) { + vec_t Ret{}; + if constexpr (vec_t::IsBfloat16) { + for (size_t I = 0; I < NumElements; I++) + Ret[I] = -Lhs[I]; + } else { +#ifndef __SYCL_DEVICE_ONLY__ + for (size_t I = 0; I < NumElements; ++I) + Ret[I] = -Lhs[I]; +#else + auto extVec = sycl::bit_cast(Lhs); + Ret = vec_t{-extVec}; + if constexpr (std::is_same_v) { + vec_arith_common::ConvertToDataT(Ret); + } +#endif + } + return Ret; + } + +// Unary operations on sycl::vec +// FIXME: Don't allow Unary operators on vec after +// https://github.com/KhronosGroup/SYCL-CTS/issues/896 gets fixed. +#ifdef __SYCL_UOP +#error "Undefine __SYCL_UOP macro" +#endif +#define __SYCL_UOP(UOP, OPASSIGN) \ + friend vec_t &operator UOP(vec_t & Rhs) { \ + Rhs OPASSIGN DataT{1}; \ + return Rhs; \ + } \ + friend vec_t operator UOP(vec_t &Lhs, int) { \ + vec_t Ret(Lhs); \ + Lhs OPASSIGN DataT{1}; \ + return Ret; \ + } + + __SYCL_UOP(++, +=) + __SYCL_UOP(--, -=) +#undef __SYCL_UOP + + // The logical operations on scalar types results in 0/1, while for vec<>, + // logical operations should result in 0 and -1 (similar to OpenCL vectors). + // That's why, for vec, we need to invert the result of the logical + // operations since we store vec as scalar type on the device. +#if defined(__SYCL_RELLOGOP) || defined(RELLOGOP_BASE) +#error "Undefine __SYCL_RELLOGOP and RELLOGOP_BASE macro." +#endif + +#ifdef __SYCL_DEVICE_ONLY__ +#define RELLOGOP_BASE(RELLOGOP, COND) \ + template \ + friend std::enable_if_t<(COND), vec> operator RELLOGOP( \ + const vec_t & Lhs, const vec_t & Rhs) { \ + vec Ret{}; \ + /* ext_vector_type does not support bfloat16, so for these */ \ + /* we do element-by-element operation on the underlying std::array. 
*/ \ + if constexpr (vec_t::IsBfloat16) { \ + for (size_t I = 0; I < NumElements; ++I) { \ + Ret[I] = static_cast(-(Lhs[I] RELLOGOP Rhs[I])); \ + } \ + } else { \ + auto ExtVecLhs = sycl::bit_cast(Lhs); \ + auto ExtVecRhs = sycl::bit_cast(Rhs); \ + /* Cast required to convert unsigned char ext_vec_type to */ \ + /* char ext_vec_type. */ \ + Ret = vec( \ + (typename vec::vector_t)( \ + ExtVecLhs RELLOGOP ExtVecRhs)); \ + /* For NumElements == 1, we use scalar instead of ext_vector_type. */ \ + if constexpr (NumElements == 1) { \ + Ret *= -1; \ + } \ + } \ + return Ret; \ + } +#else // __SYCL_DEVICE_ONLY__ +#define RELLOGOP_BASE(RELLOGOP, COND) \ + template \ + friend std::enable_if_t<(COND), vec> operator RELLOGOP( \ + const vec_t & Lhs, const vec_t & Rhs) { \ + vec Ret{}; \ + for (size_t I = 0; I < NumElements; ++I) { \ + Ret[I] = static_cast(-(Lhs[I] RELLOGOP Rhs[I])); \ + } \ + return Ret; \ + } +#endif + +#define __SYCL_RELLOGOP(RELLOGOP, COND) \ + RELLOGOP_BASE(RELLOGOP, COND) \ + \ + template \ + friend std::enable_if_t<(COND), vec> operator RELLOGOP( \ + const vec_t & Lhs, const DataT & Rhs) { \ + return Lhs RELLOGOP vec_t(Rhs); \ + } \ + template \ + friend std::enable_if_t<(COND), vec> operator RELLOGOP( \ + const DataT & Lhs, const vec_t & Rhs) { \ + return vec_t(Lhs) RELLOGOP Rhs; \ + } + + // OP is: ==, !=, <, >, <=, >=, &&, || + // vec operatorOP(const vec &Rhs) const; + // vec operatorOP(const DataT &Rhs) const; + __SYCL_RELLOGOP(==, true) + __SYCL_RELLOGOP(!=, true) + __SYCL_RELLOGOP(>, true) + __SYCL_RELLOGOP(<, true) + __SYCL_RELLOGOP(>=, true) + __SYCL_RELLOGOP(<=, true) + + // Only available to integral types. + __SYCL_RELLOGOP(&&, (!detail::is_vgenfloat_v)) + __SYCL_RELLOGOP(||, (!detail::is_vgenfloat_v)) +#undef __SYCL_RELLOGOP +#undef RELLOGOP_BASE + + // Binary operations on sycl::vec<> for all types except std::byte. 
+ __SYCL_BINOP(+, +=, true, true) + __SYCL_BINOP(-, -=, true, true) + __SYCL_BINOP(*, *=, false, true) + __SYCL_BINOP(/, /=, false, true) + + // The following OPs are available only when: DataT != cl_float && + // DataT != cl_double && DataT != cl_half && DataT != BF16. + __SYCL_BINOP(%, %=, false, (!detail::is_vgenfloat_v)) + // Bitwise operations are allowed for std::byte. + __SYCL_BINOP(|, |=, false, (!detail::is_vgenfloat_v)) + __SYCL_BINOP(&, &=, false, (!detail::is_vgenfloat_v)) + __SYCL_BINOP(^, ^=, false, (!detail::is_vgenfloat_v)) + __SYCL_BINOP(>>, >>=, false, (!detail::is_vgenfloat_v)) + __SYCL_BINOP(<<, <<=, true, (!detail::is_vgenfloat_v)) + + // friends + template friend class vec; +}; // class vec_arith<> + +#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) +template +class vec_arith + : public vec_arith_common { +protected: + // NumElements can never be zero. Still using the redundant check to avoid + // incomplete type errors. + using DataT = typename std::conditional_t; + using vec_t = vec; + template using vec_data = vec_helper; + + // Special <<, >> operators for std::byte. + // std::byte is not an arithmetic type and it only supports the following + // overloads of >> and << operators. 
+ // + // 1 template + // constexpr std::byte operator<<( std::byte b, IntegerType shift ) + // noexcept; + friend vec_t operator<<(const vec_t &Lhs, int shift) { + vec_t Ret; + for (size_t I = 0; I < NumElements; ++I) { + Ret[I] = Lhs[I] << shift; + } + return Ret; + } + friend vec_t &operator<<=(vec_t &Lhs, int shift) { + Lhs = Lhs << shift; + return Lhs; + } + + // 2 template + // constexpr std::byte operator>>( std::byte b, IntegerType shift ) + // noexcept; + friend vec_t operator>>(const vec_t &Lhs, int shift) { + vec_t Ret; + for (size_t I = 0; I < NumElements; ++I) { + Ret[I] = Lhs[I] >> shift; + } + return Ret; + } + friend vec_t &operator>>=(vec_t &Lhs, int shift) { + Lhs = Lhs >> shift; + return Lhs; + } + + __SYCL_BINOP(|, |=, false, true) + __SYCL_BINOP(&, &=, false, true) + __SYCL_BINOP(^, ^=, false, true) + + // friends + template friend class vec; +}; +#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + +template class vec_arith_common { +protected: + using vec_t = vec; + + static constexpr bool IsBfloat16 = + std::is_same_v; + + // operator~() available only when: dataT != float && dataT != double + // && dataT != half + template + friend std::enable_if_t, vec_t> + operator~(const vec_t &Rhs) { +#ifdef __SYCL_DEVICE_ONLY__ + auto extVec = sycl::bit_cast(Rhs); + vec_t Ret{~extVec}; + if constexpr (std::is_same_v) { + ConvertToDataT(Ret); + } + return Ret; +#else + vec_t Ret{}; + for (size_t I = 0; I < NumElements; ++I) { + Ret[I] = ~Rhs[I]; + } + return Ret; +#endif + } + +#ifdef __SYCL_DEVICE_ONLY__ + using vec_bool_t = vec; + // Required only for std::bool. 
+ static void ConvertToDataT(vec_bool_t &Ret) { + for (size_t I = 0; I < NumElements; ++I) { + Ret[I] = bit_cast(Ret[I]) != 0; + } + } +#endif + + // friends + template friend class vec; +}; + +#undef __SYCL_BINOP +#undef BINOP_BASE + +} // namespace detail +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/detail/vector_convert.hpp b/sycl/include/sycl/detail/vector_convert.hpp index c018fce5bcfa3..e459c59f79202 100644 --- a/sycl/include/sycl/detail/vector_convert.hpp +++ b/sycl/include/sycl/detail/vector_convert.hpp @@ -57,12 +57,100 @@ #include // for is_sigeninteger, is_s... #include // for errc +#include // bfloat16 + #ifndef __SYCL_DEVICE_ONLY__ #include // for fesetround, fegetround #endif #include +// Enable on only intel devices. +#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__)) +extern "C" { +// For converting BF16 to other types. +extern __DPCPP_SYCL_EXTERNAL float __imf_bfloat162float(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rd(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rn(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_ru(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rz(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned short +__imf_bfloat162ushort_rd(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned short +__imf_bfloat162ushort_rn(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned short +__imf_bfloat162ushort_ru(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned short +__imf_bfloat162ushort_rz(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned long long +__imf_bfloat162ull_rd(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned long long +__imf_bfloat162ull_rn(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned long long +__imf_bfloat162ull_ru(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned long long +__imf_bfloat162ull_rz(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL int 
__imf_bfloat162int_rd(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_rn(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_ru(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_rz(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rd(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rn(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_ru(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rz(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rd(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rn(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_ru(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rz(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat16_as_short(uint16_t x); +extern __DPCPP_SYCL_EXTERNAL unsigned short +__imf_bfloat16_as_ushort(uint16_t x); + +// For converting other types to BF16. 
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16(float x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rd(float x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rn(float x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_ru(float x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rz(float x); +extern __DPCPP_SYCL_EXTERNAL uint16_t +__imf_ushort2bfloat16_rd(unsigned short x); +extern __DPCPP_SYCL_EXTERNAL uint16_t +__imf_ushort2bfloat16_rn(unsigned short x); +extern __DPCPP_SYCL_EXTERNAL uint16_t +__imf_ushort2bfloat16_ru(unsigned short x); +extern __DPCPP_SYCL_EXTERNAL uint16_t +__imf_ushort2bfloat16_rz(unsigned short x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rd(unsigned int x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rn(unsigned int x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_ru(unsigned int x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rz(unsigned int x); +extern __DPCPP_SYCL_EXTERNAL uint16_t +__imf_ull2bfloat16_rd(unsigned long long x); +extern __DPCPP_SYCL_EXTERNAL uint16_t +__imf_ull2bfloat16_rn(unsigned long long x); +extern __DPCPP_SYCL_EXTERNAL uint16_t +__imf_ull2bfloat16_ru(unsigned long long x); +extern __DPCPP_SYCL_EXTERNAL uint16_t +__imf_ull2bfloat16_rz(unsigned long long x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rd(short x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rn(short x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_ru(short x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rz(short x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rd(int x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rn(int x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_ru(int x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rz(int x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rd(long long x); 
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rn(long long x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_ru(long long x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rz(long long x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_double2bfloat16(double x); +extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short_as_bfloat16(short x); +extern __DPCPP_SYCL_EXTERNAL uint16_t +__imf_ushort_as_bfloat16(unsigned short x); +} +#endif // __SYCL_DEVICE_ONLY__ && (defined(__SPIR__) || defined(__SPIRV__)) + namespace sycl { enum class rounding_mode { automatic = 0, rte = 1, rtz = 2, rtp = 3, rtn = 4 }; @@ -81,6 +169,10 @@ inline double trunc(double); #endif namespace detail { +template +NativeToT convertImpl(NativeFromT); + template using is_sint_to_sint = std::bool_constant && is_sigeninteger_v>; @@ -123,6 +215,8 @@ using is_float_to_float = std::bool_constant::value && detail::is_floating_point::value>; +using bfloat16 = sycl::ext::oneapi::bfloat16; + #ifndef __SYCL_DEVICE_ONLY__ template > @@ -196,8 +290,29 @@ template (Value); } -#else +template +inline NativeToT ConvertFromBF16Scalar(bfloat16 val) { + // On host, NativeBF16T is bfloat16. Convert BF16 to float losslessly. + float fval = static_cast(val); + + if constexpr (std::is_same_v) + return fval; + else + // Convert float to the desired type. + return convertImpl( + fval); +} + +template +bfloat16 ConvertToBF16Scalar(NativeFromT val) { + + constexpr int rm = static_cast(RoundingMode); + return sycl::ext::oneapi::detail::ConvertToBfloat16:: + getBfloat16WithRoundingMode(val); +} + +#else // Bunch of helpers to "specialize" each template for its own destination type // and vector size. 
@@ -498,8 +613,188 @@ __SYCL_FLOAT_FLOAT_CONVERT_FOR_TYPE(double) #undef __SYCL_FLOAT_FLOAT_CONVERT #undef __SYCL_FLOAT_FLOAT_CONVERT_FOR_TYPE +template +inline NativeFloatT ConvertBF16ToFVec(NativeBFT vec) { + bfloat16 *src = sycl::bit_cast(&vec); + + // OpenCL vector of 3 elements is aligned to 4 multiplied by + // the size of data type. + constexpr int AdjustedSize = (VecSize == 3) ? 4 : VecSize; + float dst[AdjustedSize]; + sycl::ext::oneapi::detail::BF16VecToFloatVec(src, dst); + + return sycl::bit_cast(dst); +} + +template +inline NativeBFT ConvertFToBF16Vec(NativeFloatT vec) { + float *src = sycl::bit_cast(&vec); + + // OpenCL vector of 3 elements is aligned to 4 multiplied by + // the size of data type. + constexpr int AdjustedSize = (VecSize == 3) ? 4 : VecSize; + bfloat16 dst[AdjustedSize]; + + sycl::ext::oneapi::detail::FloatVecToBF16Vec(src, dst); + return sycl::bit_cast(dst); +} + +/* Emit _imf_* funcs only on Intel hardware. */ +#if defined(__SPIR__) || defined(__SPIRV__) +#define EXPAND_BF16_ROUNDING_MODE(type, type_str, rmode, rmode_str) \ + template \ + std::enable_if_t<(std::is_same_v && RoundingMode == rmode), \ + NativeToT> \ + ConvertFromBF16Scalar(uint16_t val) { \ + return __imf_bfloat162##type_str##_##rmode_str(val); \ + } \ + template \ + std::enable_if_t< \ + (std::is_same_v && RoundingMode == rmode), uint16_t> \ + ConvertToBF16Scalar(NativeFromT val) { \ + return __imf_##type_str##2bfloat16_##rmode_str(val); \ + } + +#else // __SYCL_DEVICE_ONLY__ && (defined(__SPIR__) || defined(__SPIRV__)) +// On non-Intel HWs, convert BF16 to float (losslessly) and convert float +// to the desired type. 
+#define EXPAND_BF16_ROUNDING_MODE(type, type_str, rmode, rmode_str) \ + template \ + std::enable_if_t<(std::is_same_v && RoundingMode == rmode), \ + NativeToT> \ + ConvertFromBF16Scalar(uint16_t val) { \ + bfloat16 bfval = sycl::bit_cast(val); \ + float fval = static_cast(bfval); \ + return convertImpl( \ + fval); \ + } \ + template \ + std::enable_if_t< \ + (std::is_same_v && RoundingMode == rmode), uint16_t> \ + ConvertToBF16Scalar(NativeFromT val) { \ + constexpr int rm = static_cast(RoundingMode); \ + bfloat16 bfval = sycl::ext::oneapi::detail::ConvertToBfloat16:: \ + getBfloat16WithRoundingMode(val); \ + return sycl::bit_cast(bfval); \ + } +#endif // __SYCL_DEVICE_ONLY__ && (defined(__SPIR__) || defined(__SPIRV__)) + +#define EXPAND_BF16_TYPE(type, type_str) \ + EXPAND_BF16_ROUNDING_MODE(type, type_str, sycl::rounding_mode::automatic, \ + rn) \ + EXPAND_BF16_ROUNDING_MODE(type, type_str, sycl::rounding_mode::rte, rn) \ + EXPAND_BF16_ROUNDING_MODE(type, type_str, sycl::rounding_mode::rtp, ru) \ + EXPAND_BF16_ROUNDING_MODE(type, type_str, sycl::rounding_mode::rtn, rd) \ + EXPAND_BF16_ROUNDING_MODE(type, type_str, sycl::rounding_mode::rtz, rz) + +EXPAND_BF16_TYPE(uint, uint) +EXPAND_BF16_TYPE(int, int) +EXPAND_BF16_TYPE(ushort, ushort) +EXPAND_BF16_TYPE(short, short) +EXPAND_BF16_TYPE(long, ll) +EXPAND_BF16_TYPE(unsigned long long, ull) + +#undef EXPAND_BF16_TYPE +#undef EXPAND_BF16_ROUNDING_MODE + +// Mapping from BF16 to float is 1:1, lossless, so we accept all +// rounding modes. 
+template +std::enable_if_t, NativeToT> +ConvertFromBF16Scalar(uint16_t val) { + bfloat16 bfval = sycl::bit_cast(val); + return static_cast(bfval); +} + +template +std::enable_if_t, uint16_t> +ConvertToBF16Scalar(NativeFromT val) { +#if defined(__SPIR__) || defined(__SPIRV__) + return __imf_double2bfloat16(val); +#else + constexpr int rm = static_cast(RoundingMode); + bfloat16 bfval = + sycl::ext::oneapi::detail::ConvertToBfloat16::getBfloat16WithRoundingMode< + NativeFromT, rm>(val); + return sycl::bit_cast(bfval); +#endif +} + +template +std::enable_if_t, uint16_t> +ConvertToBF16Scalar(NativeFromT val) { + +#if defined(__SPIR__) || defined(__SPIRV__) + if constexpr (RoundingMode == sycl::rounding_mode::automatic || + RoundingMode == sycl::rounding_mode::rte) + return __imf_float2bfloat16_rn(val); + else if constexpr (RoundingMode == sycl::rounding_mode::rtp) + return __imf_float2bfloat16_ru(val); + else if constexpr (RoundingMode == sycl::rounding_mode::rtn) + return __imf_float2bfloat16_rd(val); + else if constexpr (RoundingMode == sycl::rounding_mode::rtz) + return __imf_float2bfloat16_rz(val); + else + static_assert(false, "Invalid rounding mode."); +#else + constexpr int rm = static_cast(RoundingMode); + bfloat16 bfval = + sycl::ext::oneapi::detail::ConvertToBfloat16::getBfloat16WithRoundingMode< + float, rm>(val); + return sycl::bit_cast(bfval); +#endif +} + #endif // __SYCL_DEVICE_ONLY__ +// Wrapper function for scalar and vector conversions from BF16 type. +template +NativeToT ConvertFromBF16(NativeFromT val) { +#ifdef __SYCL_DEVICE_ONLY__ + // Use vector conversion from BF16 to float for all rounding modes. + if constexpr (std::is_same_v && VecSize > 1) + return ConvertBF16ToFVec(val); + else +#endif + // For VecSize > 1. Only for device. + if constexpr (VecSize > 1) { + NativeToT retval; + for (int i = 0; i < VecSize; i++) { + retval[i] = ConvertFromBF16Scalar(val[i]); + } + return retval; + } + // For VecSize == 1. 
+ else + return ConvertFromBF16Scalar(val); +} + +// Wrapper function for scalar and vector conversions to BF16 type. +template +NativeToT ConvertToBF16(NativeFromT val) { +#ifdef __SYCL_DEVICE_ONLY__ + // Use vector conversion to BF16 from float for RNE rounding mode. + if constexpr (std::is_same_v && VecSize > 1 && + (RoundingMode == sycl::rounding_mode::automatic || + RoundingMode == sycl::rounding_mode::rte)) + return ConvertFToBF16Vec(val); + else +#endif + // For VecSize > 1. Only for device. + if constexpr (VecSize > 1) { + NativeToT retval; + for (int i = 0; i < VecSize; i++) { + retval[i] = ConvertToBF16Scalar(val[i]); + } + return retval; + } + // For VecSize == 1. + else + return ConvertToBF16Scalar(val); +} + /// Entry point helper for all kinds of converts between scalars and vectors, it /// dispatches to a right function depending on source and destination types. /// @@ -537,6 +832,14 @@ NativeToT convertImpl(NativeFromT Value) { else if constexpr (is_float_to_float::value) return FConvert( Value); + // BF16 conversion to other types. + else if constexpr (std::is_same_v) + return ConvertFromBF16( + Value); + // conversion from other types to BF16. 
+ else if constexpr (std::is_same_v) + return ConvertToBF16( + Value); else if constexpr (is_float_to_sint::value) return ConvertFToS( Value); @@ -558,6 +861,15 @@ NativeToT convertImpl(NativeFromT Value) { } } +#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) +template +auto ConvertImpl(std::byte val) { + return convertImpl( + (std::int8_t)val); +} +#endif + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp index 2f6584a4bd640..d8022f48a9a1d 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp @@ -244,8 +244,6 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N) __esimd_ssdp4a_sat(__ESIMD_raw_vec_t(T2, N) src0, __ESIMD_raw_vec_t(T3, N) src1, __ESIMD_raw_vec_t(T4, N) src2) __ESIMD_INTRIN_END; -__ESIMD_INTRIN __ESIMD_raw_vec_t(uint32_t, 4) - __esimd_timestamp() __ESIMD_INTRIN_END; template __ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) diff --git a/sycl/include/sycl/ext/intel/esimd/math.hpp b/sycl/include/sycl/ext/intel/esimd/math.hpp index 096c33a2fda93..67bcaace80673 100644 --- a/sycl/include/sycl/ext/intel/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/esimd/math.hpp @@ -1844,8 +1844,11 @@ __ESIMD_API uint32_t subb(uint32_t &borrow, uint32_t src0, uint32_t src1) { /// rdtsc - get the value of timestamp counter. 
/// @return the current value of timestamp counter __ESIMD_API uint64_t rdtsc() { - __ESIMD_NS::simd retv = __esimd_timestamp(); - return retv.template bit_cast_view()[0]; +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv_ReadClockKHR(0); +#else + __ESIMD_UNSUPPORTED_ON_HOST; +#endif } /// @} sycl_esimd_math diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index b8f36e8f57255..6272d8ce97d10 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -550,7 +550,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask mask, /// simd byte_offsets, /// simd_mask mask, PassThruSimdViewT pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. Loads ("gathers") elements of the type 'T' /// from memory locations addressed by the base pointer \p p and byte offsets \p /// byte_offsets, and returns the loaded elements. Access to any element's @@ -591,7 +591,7 @@ gather(const T *p, simd byte_offsets, simd_mask mask, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, PassThruSimdViewT pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. Loads ("gathers") elements of the type 'T' /// from memory locations addressed by the base pointer \p p and byte offsets \p /// byte_offsets, and returns the loaded elements. 
Access to any element's @@ -637,7 +637,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask mask, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, simd pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. Loads ("gathers") elements of the type 'T' /// from memory locations addressed by the base pointer \p p and byte offsets \p /// byte_offsets, and returns the loaded elements. Access to any element's @@ -711,7 +711,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask mask, /// simd gather(const T *p, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. Loads ("gathers") elements of the type 'T' /// from memory locations addressed by the base pointer \p p and byte offsets \p /// byte_offsets, and returns the loaded elements. Access to any element's @@ -772,7 +772,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) { /// simd gather(const T *p, /// OffsetSimdViewT byte_offsets, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. Loads ("gathers") elements of the type 'T' /// from memory locations addressed by the base pointer \p p and byte offsets \p /// byte_offsets, and returns the loaded elements. 
@@ -925,7 +925,7 @@ scatter(T *p, simd byte_offsets, simd vals, /// void scatter(T *p, simd byte_offsets, ValuesSimdViewT vals, /// simd_mask mask, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a @@ -993,7 +993,7 @@ scatter(T *p, simd byte_offsets, simd vals, /// void scatter(T *p, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// simd_mask mask, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a @@ -1033,7 +1033,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// void scatter(T *p, simd byte_offsets, ValuesSimdViewT vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a @@ -1101,7 +1101,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, /// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, /// simd_mask mask, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. 
/// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a @@ -1140,7 +1140,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, /// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a @@ -1214,7 +1214,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, /// void scatter(T *p, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a @@ -1971,6 +1971,53 @@ block_load(const T *ptr, simd_mask<1> pred, simd pass_thru, return detail::block_load_impl(ptr, pred, pass_thru); } +/// simd block_load(const T* ptr, simd_mask<1> pred, +/// PassThruSimdViewT pass_thru, props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function loads a contiguous memory block from USM pointer \p ptr. If +/// the predicate \p pred is set to 0, then the load is omitted and the vector +/// \p pass_thru is returned. +/// +/// This function has temporary restrictions. See details in the 'Restrictions' +/// section below. The restrictions will be relaxed in the future. 
+/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is the minimally required element-size +/// alignment. Note that additional/temporary restrictions are applied +/// (see Restrictions below). +/// +/// Restrictions - cache hint and mask imposed - temporary: +/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or +/// smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. 
+template < + typename PassThruSimdViewT, typename T, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +block_load(const T *ptr, simd_mask<1> pred, PassThruSimdViewT pass_thru, + PropertyListT props = {}) { + return block_load(ptr, pred, pass_thru.read(), props); +} + /// simd block_load(const T* ptr, size_t byte_offset, /// simd_mask<1> pred, simd pass_thru, /// props={}); // (usm-bl-6) @@ -2017,6 +2064,55 @@ block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred, return block_load(AdjustedPtr, pred, pass_thru, props); } +/// simd block_load(const T* ptr, size_t byte_offset, +/// simd_mask<1> pred, PassThruSimdViewT pass_thru, +/// props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function loads a contiguous memory block from address referenced +/// by USM pointer \p ptr and the given \p byte_offset. +/// If the predicate \p pred is set to 0, then the load is omitted and the +/// vector \p pass_thru is returned. +/// +/// This function has temporary restrictions. See details in the 'Restrictions' +/// section below. The restrictions will be relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is the minimally required element-size +/// alignment. 
Note that additional/temporary restrictions are applied +/// (see Restrictions below). +/// +/// Restrictions - cache hint and mask imposed - temporary: +/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or +/// smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename PassThruSimdViewT, typename T, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred, + PassThruSimdViewT pass_thru, PropertyListT props = {}) { + return block_load(ptr, byte_offset, pred, pass_thru.read(), props); +} + /// Loads a contiguous block of memory from the given memory address \p addr /// and returns the loaded data as a vector. /// The generated code depends on the combination {T, N, Flags}. @@ -2294,6 +2390,57 @@ block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, pass_thru); } +/// simd +/// block_load(AccessorT acc, OffsetT byte_offset, simd_mask<1> pred, +/// PassThruSimdViewT pass_thru, props = {}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function loads a contiguous memory block referenced +/// by accessor \p acc and the given \p byte_offset. 
+/// If the predicate \p pred is set to 0, then the load is omitted and the +/// \p pass_thru value is returned. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the \p byte_offset must be at least 4-byte aligned for elements of 4-bytes +/// or smaller and 8-byte aligned for 8-byte elements. +/// +/// Restrictions - cache hint and predicate imposed - temporary: +/// R1: \p byte_offset must be at least 4-byte aligned for elements of 4-bytes +/// or smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. 
+template < + typename PassThruSimdViewT, + typename T = PassThruSimdViewT::value_type::element_type, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v, + simd> +block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, + simd_mask<1> pred, PassThruSimdViewT pass_thru, + PropertyListT props = {}) { + return block_load(acc, byte_offset, pred, pass_thru.read(), props); +} + /// simd /// block_load(AccessorT acc, OffsetT byte_offset, simd_mask<1> pred, /// props = {}); // (acc-bl-4) @@ -2383,6 +2530,53 @@ block_load(AccessorT acc, simd_mask<1> pred, simd pass_thru, return block_load(acc, 0, pred, pass_thru, NewPropertyListT{}); } +/// block_load(AccessorT acc, simd_mask<1> pred, +/// PassThruSimdViewT pass_thru, props = {}); +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function loads a contiguous memory block referenced +/// by accessor \p acc and implied offset=0. +/// If the predicate \p pred is set to 0, then the load is omitted and the +/// \p pass_thru value is returned. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2. Other properties are ignored. If \p props +/// specifies the alignment property, then it is ignored because this +/// variant implies zero offset, which means the most favourable 16-byte +/// alignment is used. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. 
+/// +/// Restrictions - cache hint and predicate imposed - temporary: +/// R1: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R2: The target device must be DG2, PVC or newer GPU. +template < + typename PassThruSimdViewT, + typename T = PassThruSimdViewT::value_type::element_type, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v, + simd> +block_load(AccessorT acc, simd_mask<1> pred, PassThruSimdViewT pass_thru, + PropertyListT props = {}) { + return block_load(acc, pred, pass_thru.read(), props); +} + /// simd /// block_load(AccessorT acc, simd_mask<1> pred, props = {}); // (acc-bl-6) /// This function loads a contiguous memory block referenced @@ -2638,29 +2832,15 @@ block_store(T *ptr, size_t byte_offset, simd vals, simd_mask<1> pred, block_store(AdjustedPtr, vals, pred, props); } -/// Each of the following block_store functions stores the vector 'vals' to a -/// contiguous memory block at the address referenced by accessor 'acc', or from -/// 'acc + byte_offset', The parameter 'pred' is the one element predicate. If -/// it is set to 1, then all 'N' elements are stored. Otherwise, the block store -/// operation is a NO-OP. 
The parameter 'props' specifies the optional -/// compile-time properties of the type esimd::properties and may include -/// esimd::cache_hint_L1, esimd::cache_hint_L2, esimd::cache_hint_L3, -/// esimd::alignment. - -/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-1) -/// simd vals, props = {}); - -/// void block_store(AccessorT acc, simd vals, props = {}); // (acc-bs-2) -/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-3) -/// simd vals, simd_mask<1> pred, props = {}); - -/// void block_store(AccessorT acc, simd vals, // (acc-bs-4) -/// simd_mask<1> pred, props = {}); - -/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-1) -/// simd vals, props = {}); -/// This function stores a contiguous memory block to -/// accessor \p acc and \p byte_offset with data specified by \p vals. +/// void block_store(T* ptr, ValuesSimdViewT vals, props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to USM pointer \p ptr +/// with data specified by \p vals. +/// +/// There may be temporary restrictions depending on L1, L2 cache hints, +/// See details in the 'Restrictions' section below. The restrictions will be +/// relaxed in the future. /// /// The parameter \p props specifies the optional compile-time properties /// of the type esimd::properties and may include esimd::cache_hint_L1, @@ -2670,61 +2850,263 @@ block_store(T *ptr, size_t byte_offset, simd vals, simd_mask<1> pred, /// the cache_hint::none value is assumed by default. /// /// Alignment: If \p props does not specify the 'alignment' property, then -/// the \p byte_offset must be at least 16-byte aligned if (!(b) && (c)) -/// from the below restrictions, and must be at least 4-byte aligned for -/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements -/// otherwise. 
If the 'alignment' property is specified as less than 16 bytes, -/// then the target device must be DG2 or PVC (not Gen12). The alignment -/// requirement may be less strict if stateless memory mode is ON, see -/// block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements. -/// -/// Restrictions: there may be some extra restrictions depending on -/// a) stateless memory mode enforcement is ON, -/// b) cache hints are used, -/// c) number of bytes stored is either 16,32,64, or 128. -/// d) the 'alignment' property is specified as less than 16 bytes. -/// -/// If (b) || !(c) || (d), then the target device must be DG2 or PVC (not -/// Gen12). -/// If (a) && !(b), then there is no restriction on the number of -/// elements to be stored and \p byte_offset must be only element-aligned. +/// the default assumed alignment is 16 bytes if \p props does not specify any +/// L1 or L2 cache hints, and the minimally required element-size +/// alignment otherwise. Note that additional/temporary restrictions may apply +/// (see Restrictions below). /// -/// Gen12 requirements: !(b) && (c) && !(d). -/// It can store 16-, 32-, 64-, or 128-bytes only. -/// DG2/PVC requirements: -/// It can store such number of elements depending on the type 'T': -/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; -/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// Restrictions - cache hint imposed - temporary: +/// If L1 or L2 cache hint is passed, then: +/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or +/// smaller and 8-byte aligned for 8-byte elements. 
+/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, /// or 128(only if alignment is 8-bytes or more); -/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, /// or 256(only if alignment is 8-bytes or more); -/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, /// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. template < - typename T, int N, typename AccessorT, + typename ValuesSimdViewT, typename T, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), typename PropertyListT = ext::oneapi::experimental::empty_properties_t> -__ESIMD_API std::enable_if_t< - ext::oneapi::experimental::is_property_list_v && - detail::is_device_accessor_with_v> -block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, - simd vals, PropertyListT props = {}) { -#ifdef __ESIMD_FORCE_STATELESS_MEM - block_store(detail::accessorToPointer(acc, byte_offset), vals, - props); -#else - constexpr int DefaultLSCAlignment = (sizeof(T) <= 4) ? 
4 : sizeof(T); - constexpr size_t Alignment = - detail::getPropertyValue( - DefaultLSCAlignment); - constexpr bool AlignmentRequiresLSC = - PropertyListT::template has_property() && Alignment < 16; - using Tx = detail::__raw_t; - constexpr unsigned Sz = sizeof(Tx) * N; - constexpr bool SzRequiresLSC = - Sz < detail::OperandSize::OWORD || Sz % detail::OperandSize::OWORD != 0 || - !detail::isPowerOf2(Sz / detail::OperandSize::OWORD) || - Sz > 8 * detail::OperandSize::OWORD; +__ESIMD_API std::enable_if_t && + detail::is_property_list_v> +block_store(T *ptr, ValuesSimdViewT vals, PropertyListT props = {}) { + block_store(ptr, vals.read(), props); +} + +/// void block_store(T* ptr, size_t byte_offset, +/// ValuesSimdViewT vals, props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to USM pointer \p ptr and +/// byte-offset \p byte_offset with data specified by \p vals. +/// +/// There may be temporary restrictions depending on L1, L2 cache hints, +/// See details in the 'Restrictions' section below. The restrictions will be +/// relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is 16 bytes if \p props does not specify any +/// L1 or L2 cache hints, and the minimally required element-size +/// alignment otherwise. Note that additional/temporary restrictions may apply +/// (see Restrictions below). 
+/// +/// Restrictions - cache hint imposed - temporary: +/// If L1 or L2 cache hint is passed, then: +/// R1: The pointer plus byte offset must be at least 4-byte aligned for +/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename ValuesSimdViewT, typename T, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +block_store(T *ptr, size_t byte_offset, ValuesSimdViewT vals, + PropertyListT props = {}) { + block_store(ptr, byte_offset, vals.read(), props); +} + +/// void block_store(T* ptr, ValuesSimdViewT vals, +/// simd_mask<1> pred, props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to USM pointer \p ptr +/// with data specified by \p vals. If the predicate \p pred is set to 0, +/// then the store is omitted. +/// +/// There are temporary restrictions. See details in the 'Restrictions' +/// section below. The restrictions will be relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. 
+/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is the minimally required element-size +/// alignment. Note that additional/temporary restrictions apply (see +/// Restrictions below). +/// +/// Restrictions - predicate imposed - temporary: +/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or +/// smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename ValuesSimdViewT, typename T, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t && + detail::is_property_list_v> +block_store(T *ptr, ValuesSimdViewT vals, simd_mask<1> pred, + PropertyListT props = {}) { + block_store(ptr, vals.read(), pred, props); +} + +/// void block_store(T* ptr, size_t byte_offset, +/// ValuesSimdViewT vals, simd_mask<1> pred, props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to USM pointer \p ptr +/// and byte-offset \p byte_offset with data specified by \p vals. +/// If the predicate \p pred is set to 0, then the store is omitted. +/// +/// There may be temporary restrictions depending on L1, L2 cache hints, +/// See details in the 'Restrictions' section below. 
The restrictions will be +/// relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is 16 bytes if \p props does not specify any +/// L1 or L2 cache hints and \p pred is set to 1, and +/// the minimally required element-size alignment otherwise. +/// Note that additional/temporary restrictions may apply +/// (see Restrictions below). +/// +/// Restrictions - cache hint or predicate imposed - temporary: +/// If a predicate, L1 or L2 cache hint is passed, then: +/// R1: The pointer plus byte offset must be at least 4-byte aligned for +/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. 
+template < + typename ValuesSimdViewT, typename T, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +block_store(T *ptr, size_t byte_offset, ValuesSimdViewT vals, simd_mask<1> pred, + PropertyListT props = {}) { + block_store(ptr, byte_offset, vals.read(), pred, props); +} + +/// Each of the following block_store functions stores the vector 'vals' to a +/// contiguous memory block at the address referenced by accessor 'acc', or from +/// 'acc + byte_offset', The parameter 'pred' is the one element predicate. If +/// it is set to 1, then all 'N' elements are stored. Otherwise, the block store +/// operation is a NO-OP. The parameter 'props' specifies the optional +/// compile-time properties of the type esimd::properties and may include +/// esimd::cache_hint_L1, esimd::cache_hint_L2, esimd::cache_hint_L3, +/// esimd::alignment. + +/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-1) +/// simd vals, props = {}); + +/// void block_store(AccessorT acc, simd vals, props = {}); // (acc-bs-2) +/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-3) +/// simd vals, simd_mask<1> pred, props = {}); + +/// void block_store(AccessorT acc, simd vals, // (acc-bs-4) +/// simd_mask<1> pred, props = {}); + +/// void block_store(AccessorT acc, OffsetT byte_offset, // (acc-bs-1) +/// simd vals, props = {}); +/// This function stores a contiguous memory block to +/// accessor \p acc and \p byte_offset with data specified by \p vals. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. 
+/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the \p byte_offset must be at least 16-byte aligned if (!(b) && (c)) +/// from the below restrictions, and must be at least 4-byte aligned for +/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements +/// otherwise. If the 'alignment' property is specified as less than 16 bytes, +/// then the target device must be DG2 or PVC (not Gen12). The alignment +/// requirement may be less strict if stateless memory mode is ON, see +/// block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements. +/// +/// Restrictions: there may be some extra restrictions depending on +/// a) stateless memory mode enforcement is ON, +/// b) cache hints are used, +/// c) number of bytes stored is either 16,32,64, or 128. +/// d) the 'alignment' property is specified as less than 16 bytes. +/// +/// If (b) || !(c) || (d), then the target device must be DG2 or PVC (not +/// Gen12). +/// If (a) && !(b), then there is no restriction on the number of +/// elements to be stored and \p byte_offset must be only element-aligned. +/// +/// Gen12 requirements: !(b) && (c) && !(d). +/// It can store 16-, 32-, 64-, or 128-bytes only. +/// DG2/PVC requirements: +/// It can store such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). 
+template < + typename T, int N, typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v> +block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, + simd vals, PropertyListT props = {}) { +#ifdef __ESIMD_FORCE_STATELESS_MEM + block_store(detail::accessorToPointer(acc, byte_offset), vals, + props); +#else + constexpr int DefaultLSCAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); + constexpr size_t Alignment = + detail::getPropertyValue( + DefaultLSCAlignment); + constexpr bool AlignmentRequiresLSC = + PropertyListT::template has_property() && Alignment < 16; + using Tx = detail::__raw_t; + constexpr unsigned Sz = sizeof(Tx) * N; + constexpr bool SzRequiresLSC = + Sz < detail::OperandSize::OWORD || Sz % detail::OperandSize::OWORD != 0 || + !detail::isPowerOf2(Sz / detail::OperandSize::OWORD) || + Sz > 8 * detail::OperandSize::OWORD; if constexpr (detail::has_cache_hints() || AlignmentRequiresLSC || SzRequiresLSC) { using NewPropertyListT = @@ -2871,49 +3253,247 @@ block_store(AccessorT acc, simd vals, simd_mask<1> pred, block_store(acc, 0, vals, pred, NewPropertyListT{}); } -/// @} sycl_esimd_memory_block - -/// @} sycl_esimd_memory - -/// @cond ESIMD_DETAIL - -// Implementations of accessor-based gather and scatter functions -namespace detail { -template -ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t< - std::is_same_v || - is_accessor_with_v> -scatter_impl(AccessorTy acc, simd vals, simd offsets, - uint32_t glob_offset, simd_mask mask) { - - static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length"); - if constexpr (sizeof(T) == 8) { - scatter_impl( - acc, vals.template bit_cast_view().template select(0), - offsets, glob_offset, mask); - scatter_impl( - acc, vals.template bit_cast_view().template select(1), - offsets, glob_offset + sizeof(uint32_t), mask); - } else { - constexpr int 
TypeSizeLog2 = detail::ElemsPerAddrEncoding(); - // TODO (performance) use hardware-supported scale once BE supports it - constexpr int16_t scale = 0; - const auto si = __ESIMD_NS::get_surface_index(acc); - - if constexpr (sizeof(T) < 4) { - using Tint = std::conditional_t, T, - detail::uint_type_t>; - using Treal = __raw_t; - simd vals_int = bitcast(std::move(vals).data()); - using PromoT = typename std::conditional_t::value, - int32_t, uint32_t>; - const simd promo_vals = convert(std::move(vals_int)); - __esimd_scatter_scaled( - mask.data(), si, glob_offset, offsets.data(), promo_vals.data()); - } else { - using Treal = __raw_t; - if constexpr (!std::is_same_v) { - simd Values = vals.template bit_cast_view(); +/// void block_store(AccessorT acc, OffsetT byte_offset, +/// ValuesSimdViewT vals, props = {}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to +/// accessor \p acc and \p byte_offset with data specified by \p vals. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the \p byte_offset must be at least 16-byte aligned if (!(b) && (c)) +/// from the below restrictions, and must be at least 4-byte aligned for +/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements +/// otherwise. If the 'alignment' property is specified as less than 16 bytes, +/// then the target device must be DG2 or PVC (not Gen12). 
The alignment +/// requirement may be less strict if stateless memory mode is ON, see +/// block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements. +/// +/// Restrictions: there may be some extra restrictions depending on +/// a) stateless memory mode enforcement is ON, +/// b) cache hints are used, +/// c) number of bytes stored is either 16,32,64, or 128. +/// d) the 'alignment' property is specified as less than 16 bytes. +/// +/// If (b) || !(c) || (d), then the target device must be DG2 or PVC (not +/// Gen12). +/// If (a) && !(b), then there is no restriction on the number of +/// elements to be stored and \p byte_offset must be only element-aligned. +/// +/// Gen12 requirements: !(b) && (c) && !(d). +/// It can store 16-, 32-, 64-, or 128-bytes only. +/// DG2/PVC requirements: +/// It can store such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). 
+template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v> +block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, + ValuesSimdViewT vals, PropertyListT props = {}) { + block_store(acc, byte_offset, vals.read(), props); +} + +/// void block_store(AccessorT acc, ValuesSimdViewT vals, props = {}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to +/// accessor \p acc with data specified by \p vals and implied offset=0. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2. Other properties are ignored. If \p props specifies +/// the alignment property, then it is ignored because this variant implies +/// zero offset, which means the most favourable 16-byte alignment is used. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Restrictions: there may be some extra restrictions depending on +/// a) stateless memory mode enforcement is ON, +/// b) cache hints are used, +/// c) number of bytes stored is either 16,32,64, or 128. +/// If (b) || !(c), then the target device must be DG2 or PVC (not Gen12). +/// If (a) && !(b), then there is no restriction on the number of elements +/// to be stored. +/// +/// Gen12 requirements: !(b) && (c). +/// It can store 16-, 32-, 64-, or 128-bytes only. 
+/// DG2/PVC requirements: +/// It can store such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), or 128; +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), or 256; +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), or 512. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v> +block_store(AccessorT acc, ValuesSimdViewT vals, PropertyListT props = {}) { + block_store(acc, vals.read(), props); +} + +/// void block_store(AccessorT acc, OffsetT byte_offset, +/// ValuesSimdViewT vals, simd_mask<1> pred, props = {}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to +/// accessor \p acc and \p byte_offset with data specified by \p vals. +/// If the predicate \p pred is set to 0, then the store is omitted. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the \p byte_offset must be at least 4-byte aligned for elements of 4-bytes +/// or smaller and 8-byte aligned for 8-byte elements. 
+/// The alignment requirement may be less strict if stateless memory mode is ON, +/// see block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements. +/// +/// Restrictions: +/// R1: The target device must be DG2 or PVC (not Gen12). +/// +/// R2: +/// It can store such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v> +block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, + ValuesSimdViewT vals, simd_mask<1> pred, PropertyListT props = {}) { + block_store(acc, byte_offset, vals.read(), pred, props); +} + +/// void block_store(AccessorT acc, ValuesSimdViewT vals, +/// simd_mask<1> pred, props = {}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// This function stores a contiguous memory block to +/// accessor \p acc with data specified by \p vals and implied offset=0. +/// If the predicate \p pred is set to 0, then the store is omitted. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2. Other properties are ignored. 
If \p props specifies +/// the alignment property, then it is ignored because this variant implies +/// zero offset, which means the most favourable 16-byte alignment is used. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Restrictions: +/// R1: The target device must be DG2 or PVC (not Gen12). +/// +/// R2: +/// It can store such number of elements depending on the type 'T': +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), or 128; +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), or 256; +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), or 512. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_device_accessor_with_v> +block_store(AccessorT acc, ValuesSimdViewT vals, simd_mask<1> pred, + PropertyListT props = {}) { + block_store(acc, vals.read(), pred, props); +} + +/// @} sycl_esimd_memory_block + +/// @} sycl_esimd_memory + +/// @cond ESIMD_DETAIL + +// Implementations of accessor-based gather and scatter functions +namespace detail { +template +ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t< + std::is_same_v || + is_accessor_with_v> +scatter_impl(AccessorTy acc, simd vals, simd offsets, + uint32_t glob_offset, simd_mask mask) { + + static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length"); + if constexpr (sizeof(T) == 8) { + scatter_impl( + acc, vals.template bit_cast_view().template select(0), + offsets, glob_offset, mask); + scatter_impl( + acc, vals.template bit_cast_view().template select(1), + offsets, 
glob_offset + sizeof(uint32_t), mask); + } else { + constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding(); + // TODO (performance) use hardware-supported scale once BE supports it + constexpr int16_t scale = 0; + const auto si = __ESIMD_NS::get_surface_index(acc); + + if constexpr (sizeof(T) < 4) { + using Tint = std::conditional_t, T, + detail::uint_type_t>; + using Treal = __raw_t; + simd vals_int = bitcast(std::move(vals).data()); + using PromoT = typename std::conditional_t::value, + int32_t, uint32_t>; + const simd promo_vals = convert(std::move(vals_int)); + __esimd_scatter_scaled( + mask.data(), si, glob_offset, offsets.data(), promo_vals.data()); + } else { + using Treal = __raw_t; + if constexpr (!std::is_same_v) { + simd Values = vals.template bit_cast_view(); __esimd_scatter_scaled( mask.data(), si, glob_offset, offsets.data(), Values.data()); } else { @@ -4327,7 +4907,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -4368,7 +4948,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. 
@@ -4409,7 +4989,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -4457,7 +5037,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. @@ -4502,7 +5082,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -4545,7 +5125,7 @@ scatter(AccessorTy acc, simd byte_offsets, /// void scatter(AccessorTy acc, simd byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. 
/// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the accessor \p acc and byte offsets \p byte_offsets. @@ -5469,7 +6049,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask mask, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, simd pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Loads ("gathers") elements of the type 'T' from Shared Local Memory /// locations addressed by byte offsets \p byte_offsets, and returns the loaded @@ -5514,7 +6094,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask mask, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, PassThruSimdViewT pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. /// Loads ("gathers") elements of the type 'T' from Shared Local Memory /// locations addressed by byte offsets \p byte_offsets, and returns the loaded @@ -5563,7 +6143,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask mask, /// OffsetSimdViewT byte_offsets, /// simd_mask mask, PassThruSimdViewT pass_thru, /// PropertyListT props = {}); -/// Variation of the API that allows to use \c simd_view without specifying \c T +/// Variation of the API that allows using \c simd_view without specifying \c T /// and \c N template parameters. 
/// Loads ("gathers") elements of the type 'T' from Shared Local Memory /// locations addressed by byte offsets \p byte_offsets, and returns the loaded @@ -5844,7 +6424,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// void slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// simd_mask mask, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. @@ -5876,7 +6456,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// void slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. @@ -5911,7 +6491,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. 
@@ -5948,7 +6528,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// void slm_scatter(OffsetSimdViewT byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. @@ -5986,7 +6566,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. @@ -6018,7 +6598,7 @@ slm_scatter(simd byte_offsets, ValuesSimdViewT vals, /// void slm_scatter(simd byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to Shared Local Memory /// locations addressed by byte offsets \p byte_offsets. @@ -6350,6 +6930,51 @@ slm_block_load(uint32_t offset, simd_mask<1> pred, simd pass_thru, return Result.template bit_cast_view(); } +/// simd slm_block_load(uint32_t byte_offset, +/// simd_mask<1> pred, +/// PassThruSimdViewT pass_thru, props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Loads a contiguous memory block from SLM (Shared Local Memory) at the +/// given \p byte_offset. 
+/// The parameter \p pred is the one-element predicate. If it is set to 1, +/// then all 'N' elements are loaded. Otherwise, the block load operation +/// is a NO-OP. +/// The parameter 'pass_thru' specifies the values being copied to the returned +/// result if 'pred' is set to 0. +/// +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below). +/// +/// Restrictions - predicate imposed - temporary: +/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller +/// elements and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. 
+template < + typename PassThruSimdViewT, + typename T = PassThruSimdViewT::value_type::element_type, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +slm_block_load(uint32_t offset, simd_mask<1> pred, PassThruSimdViewT pass_thru, + PropertyListT props = {}) { + return slm_block_load(offset, pred, pass_thru.read(), props); +} + /// simd block_load(local_accessor lacc, uint32_t byte_offset, /// props={}); // (lacc-bl-1) /// Loads a contiguous memory block from SLM (Shared Local Memory) associated @@ -6539,7 +7164,53 @@ block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred, return slm_block_load(byte_offset, pred, pass_thru, props); } -/// simd block_load(local_accessor lacc, +/// simd block_load(local_accessor lacc, uint32_t byte_offset, +/// simd_mask<1> pred, PassThruSimdViewT pass_thru, +/// props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Loads a contiguous memory block from SLM (Shared Local Memory) associated +/// with the local accessor \p lacc at the given \p byte_offset. +/// The parameter \p pred is the one-element predicate. If it is set to 1, +/// then all 'N' elements are loaded. Otherwise, the block load operation +/// is a NO-OP, and \p pass_thru value is returned. +/// +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below).
+/// +/// Restrictions - predicate imposed - temporary: +/// R1: The \p lacc + \p byte_offset must be at least 4-byte aligned for 4-byte +/// or smaller elements and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename PassThruSimdViewT, + typename T = PassThruSimdViewT::value_type::element_type, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v, + simd> +block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred, + PassThruSimdViewT pass_thru, PropertyListT props = {}) { + return block_load(lacc, byte_offset, pred, pass_thru.read(), props); +} + +/// simd block_load(local_accessor lacc, /// simd_mask<1> pred, simd pass_thru, /// props={}); // (lacc-bl-6) /// Loads a contiguous memory block from SLM (Shared Local Memory) associated @@ -6579,6 +7250,51 @@ block_load(AccessorT lacc, simd_mask<1> pred, simd pass_thru, pass_thru, props); } +/// simd block_load(local_accessor lacc, +/// simd_mask<1> pred, PassThruSimdViewT pass_thru, +/// props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Loads a contiguous memory block from SLM (Shared Local Memory) associated +/// with the local accessor \p lacc at zero offset. 
+/// +/// The parameter \p pred is the one-element predicate. If it is set to 1, +/// then all 'N' elements are loaded. Otherwise, the block load operation +/// is a NO-OP, and \p pass_thru value is returned. +/// +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below). +/// +/// Restrictions - predicate imposed - temporary: +/// R1: The local accessor \p lacc must point to memory at least 4-byte aligned +/// for elements of 4-bytes or smaller and 8-byte aligned for 8-byte +/// elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), or 128; +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), or 256; +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), or 512. +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename PassThruSimdViewT, + typename T = PassThruSimdViewT::value_type::element_type, + int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v, + simd> +block_load(AccessorT lacc, simd_mask<1> pred, PassThruSimdViewT pass_thru, + PropertyListT props = {}) { + return block_load(lacc, pred, pass_thru.read(), props); +} + /// Stores elements of the vector \p vals to a contiguous block of SLM memory /// at the given byte-offset \p offset. /// The generated code depends on the combination {T, N, Flags}.
@@ -6745,6 +7461,76 @@ slm_block_store(uint32_t byte_offset, simd vals, sycl::bit_cast<__ESIMD_DNS::vector_type_t>(vals.data())); } +/// void slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, +/// simd_mask<1> pred, props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local +/// Memory) at the given \p byte_offset. The parameter \p pred is the +/// one-element predicate. If it is set to 1, then all 'N' elements are stored. +/// Otherwise, the block store operation is a NO-OP. +/// +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below). +/// +/// Restrictions - predicate imposed - temporary: +/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller +/// elements and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU.
+template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, simd_mask<1> pred, + PropertyListT props = {}) { + slm_block_store(byte_offset, vals.read(), pred, props); +} + +/// void slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, +/// props = {}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM +/// (Shared Local Memory) at the given \p byte_offset. The parameter 'props' +/// specifies the optional compile-time properties list. Only esimd::alignment +/// property is used. Other properties are ignored. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is 16-bytes to generate block_store +/// instruction on all known target devices (Gen12, DG2, PVC, etc). +/// On Gen12 (as opposed to DG2 and PVC) the alignment smaller than 8-bytes +/// is valid, but requires JIT compiler generating a slower SCATTER instead +/// of faster BLOCK_STORE. +/// !!! Passing \p byte_offset not aligned by 16-bytes and not specifying +/// the actual alignment in \p props produces incorrect store results on Gen12.
+template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, + PropertyListT props = {}) { + slm_block_store(byte_offset, vals.read(), props); +} + /// void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-1) /// simd vals, props={}); /// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local @@ -6876,79 +7662,232 @@ block_store(AccessorT lacc, simd vals, simd_mask<1> pred, PropertyListT props = {}) { slm_block_store(detail::localAccessorToOffset(lacc), vals, pred, props); } -namespace detail { -// lsc_atomic_update() operations may share atomic_op values for data types -// of the same (fp vs integral) class for convenience (e.g. re-use 'fmax' for -// all FP types). In fact those data types may require using different internal -// opcodes. This function returns the corresponding internal opcode for -// the input type 'T' and operation 'Op'. -template -constexpr int lsc_to_internal_atomic_op() { - constexpr __ESIMD_NS::native::lsc::atomic_op LSCOp = - __ESIMD_DNS::to_lsc_atomic_op(); - return static_cast(LSCOp); +/// void block_store(local_accessor lacc, uint32_t byte_offset, +/// ValuesSimdViewT vals, props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local +/// Memory) associated with the local accessor \p lacc at the given \p +/// byte_offset. The parameter 'props' specifies the optional compile-time +/// properties list. Only esimd::alignment property is used. Other properties +/// are ignored. 
+/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is 16-bytes to generate block_store +/// instruction on all known target devices (Gen12, DG2, PVC, etc). +/// On Gen12 (as opposed to DG2 and PVC) the alignment smaller than 8-bytes +/// is valid, but requires JIT compiler generating a slower SCATTER instead +/// of faster BLOCK_STORE. +/// !!! Passing \p byte_offset not aligned by 16-bytes and not specifying +/// the actual alignment in \p props produces incorrect store results on Gen12. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +block_store(AccessorT lacc, uint32_t byte_offset, ValuesSimdViewT vals, + PropertyListT props = {}) { + block_store(lacc, byte_offset, vals.read(), props); } -/// SLM atomic. -/// Supported platforms: DG2, PVC -/// VISA instruction: lsc_atomic_.slm -/// -/// @tparam Op is operation type. -/// @tparam T is element type. -/// @tparam N is the number of channels (platform dependent). -/// @tparam DS is the data size. -/// @param offsets is the zero-based offsets. -/// @param pred is predicate. +/// void block_store(local_accessor lacc, ValuesSimdViewT vals, +/// props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM +/// (Shared Local Memory) associated with the local accessor \p lacc. The +/// parameter 'props' specifies the optional compile-time properties list. Only +/// esimd::alignment property is used. Other properties are ignored.
/// -/// @return A vector of the old values at the memory locations before the -/// update. - -template -__ESIMD_API std::enable_if_t() == 0, simd> -slm_atomic_update_impl(simd offsets, simd_mask pred) { - check_lsc_data_size(); - check_atomic(); - constexpr uint16_t AddressScale = 1; - constexpr int ImmOffset = 0; - constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); - constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); - constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; - using MsgT = typename lsc_expand_type::type; - constexpr int IOp = lsc_to_internal_atomic_op(); - simd Tmp = - __esimd_lsc_xatomic_slm_0(pred.data(), offsets.data()); - return lsc_format_ret(Tmp); +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is 16-bytes to generate block_store +/// instruction on all known target devices (Gen12, DG2, PVC, etc). +/// On Gen12 (as opposed to DG2 and PVC) the alignment smaller than 8-bytes +/// is valid, but requires JIT compiler generating a slower SCATTER instead +/// of faster BLOCK_STORE. +/// !!! Passing \p byte_offset not aligned by 16-bytes and not specifying +/// the actual alignment in \p props produces incorrect store results on Gen12. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +block_store(AccessorT lacc, ValuesSimdViewT vals, PropertyListT props = {}) { + block_store(lacc, vals.read(), props); } -/// SLM atomic. 
-/// Supported platforms: DG2, PVC -/// VISA instruction: lsc_atomic_.slm +/// void block_store(local_accessor lacc, uint32_t byte_offset, +/// ValuesSimdViewT vals, simd_mask<1> pred, props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local +/// Memory) associated with the local accessor \p lacc at the given \p +/// byte_offset. The parameter \p pred is the one-element predicate. If it is +/// set to 1, then all 'N' elements are stored. Otherwise, the block store +/// operation is a NO-OP. /// -/// @tparam Op is operation type. -/// @tparam T is element type. -/// @tparam N is the number of channels (platform dependent). -/// @tparam DS is the data size. -/// @param offsets is the zero-based offsets. -/// @param src0 is the first atomic operand. -/// @param pred is predicate. +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. /// -/// @return A vector of the old values at the memory locations before the -/// update. -template -__ESIMD_API std::enable_if_t() == 1, simd> -slm_atomic_update_impl(simd offsets, simd src0, - simd_mask pred) { - check_lsc_data_size(); - check_atomic(); - constexpr uint16_t AddressScale = 1; - constexpr int ImmOffset = 0; - constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); - constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); - constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; - constexpr int IOp = lsc_to_internal_atomic_op(); +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below). 
+/// +/// Restrictions - predicate imposed - temporary: +/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller +/// elements and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +block_store(AccessorT lacc, uint32_t byte_offset, ValuesSimdViewT vals, + simd_mask<1> pred, PropertyListT props = {}) { + block_store(lacc, byte_offset, vals.read(), pred, props); +} + +/// void block_store(local_accessor lacc, ValuesSimdViewT vals, +/// simd_mask<1> pred, props={}); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local +/// Memory) associated with the local accessor \p lacc. The parameter \p pred is +/// the one-element predicate. If it is set to 1, then all 'N' elements are +/// stored. Otherwise, the block store operation is a NO-OP. +/// +/// The parameter 'props' specifies the optional compile-time properties +/// list. Only esimd::alignment property is used. Other properties are ignored. 
+/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default expected alignment is the minimally required (see (R1) below). +/// +/// Restrictions - predicate imposed - temporary: +/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller +/// elements and 8-byte aligned for 8-byte elements. +/// R2: The number of elements must be: +/// for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template < + typename ValuesSimdViewT, + typename T = ValuesSimdViewT::value_type::element_type, + int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(), + typename AccessorT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +block_store(AccessorT lacc, ValuesSimdViewT vals, simd_mask<1> pred, + PropertyListT props = {}) { + block_store(lacc, vals.read(), pred, props); +} +namespace detail { + +// lsc_atomic_update() operations may share atomic_op values for data types +// of the same (fp vs integral) class for convenience (e.g. re-use 'fmax' for +// all FP types). In fact those data types may require using different internal +// opcodes. This function returns the corresponding internal opcode for +// the input type 'T' and operation 'Op'. +template +constexpr int lsc_to_internal_atomic_op() { + constexpr __ESIMD_NS::native::lsc::atomic_op LSCOp = + __ESIMD_DNS::to_lsc_atomic_op(); + return static_cast(LSCOp); +} + +/// SLM atomic. 
+/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_atomic_.slm +/// +/// @tparam Op is operation type. +/// @tparam T is element type. +/// @tparam N is the number of channels (platform dependent). +/// @tparam DS is the data size. +/// @param offsets is the zero-based offsets. +/// @param pred is predicate. +/// +/// @return A vector of the old values at the memory locations before the +/// update. + +template +__ESIMD_API std::enable_if_t() == 0, simd> +slm_atomic_update_impl(simd offsets, simd_mask pred) { + check_lsc_data_size(); + check_atomic(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + constexpr int IOp = lsc_to_internal_atomic_op(); + simd Tmp = + __esimd_lsc_xatomic_slm_0(pred.data(), offsets.data()); + return lsc_format_ret(Tmp); +} + +/// SLM atomic. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_atomic_.slm +/// +/// @tparam Op is operation type. +/// @tparam T is element type. +/// @tparam N is the number of channels (platform dependent). +/// @tparam DS is the data size. +/// @param offsets is the zero-based offsets. +/// @param src0 is the first atomic operand. +/// @param pred is predicate. +/// +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t() == 1, simd> +slm_atomic_update_impl(simd offsets, simd src0, + simd_mask pred) { + check_lsc_data_size(); + check_atomic(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + constexpr int IOp = lsc_to_internal_atomic_op(); if constexpr (std::is_same_v || std::is_same_v) { return __esimd_lsc_xatomic_slm_1 byte_offset, /// atomic_update(local_accessor lacc, /// simd byte_offset, /// simd src0, -/// simd_mask<1> pred = 1); // (lacc-au1-1) +/// simd_mask mask = 1); // (lacc-au1-1) /// /// Usage of cache hints or non-standard operation width N requires DG2 or PVC. @@ -7158,11 +8097,102 @@ slm_atomic_update(simd byte_offset, simd src0, } } +/// simd +/// slm_atomic_update(simd byte_offset, +/// SrcSimdViewT src0, +/// simd_mask mask = 1) +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(simd byte_offset, SrcSimdViewT src0, + simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset, src0.read(), mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// simd src0, +/// simd_mask mask = 1) +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, simd src0, + simd_mask mask = 1) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset.read(), src0, mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// simd_mask mask = 1) +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. 
+/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd_mask mask = 1) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset.read(), src0.read(), mask); +} + /// simd /// atomic_update(local_accessor lacc, /// simd byte_offset, /// simd src0, -/// simd_mask<1> pred = 1); // (lacc-au1-1) +/// simd_mask<N> mask = 1); // (lacc-au1-1) /// /// Atomically updates \c N memory locations in SLM indicated by /// local accessor \p lacc and a vector of offsets, and returns a vector of old @@ -7186,6 +8216,105 @@ atomic_update(AccessorT lacc, simd byte_offset, simd src0, return slm_atomic_update(byte_offset, src0, mask); } +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// simd src0, +/// simd_mask<N> mask = 1); +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// local accessor \p lacc and a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. 
+/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd src0, + simd_mask mask = 1) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset.read(), src0, mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// SrcSimdViewT src0, +/// simd_mask<N> mask = 1); +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// local accessor \p lacc and a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, SrcSimdViewT src0, + simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset, src0.read(), mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// simd_mask<N> mask = 1); +/// +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// local accessor \p lacc and a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd_mask mask = 1) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset.read(), src0.read(), mask); +} /// Two argument variant of the atomic update operation. 
/// simd @@ -7198,7 +8327,7 @@ atomic_update(AccessorT lacc, simd byte_offset, simd src0, /// simd byte_offset, /// simd src0, /// simd src1, -/// simd_mask<1> pred = 1); // (lacc-au2-1) +/// simd_mask<N> mask = 1); // (lacc-au2-1) /// /// simd @@ -7241,11 +8370,232 @@ slm_atomic_update(simd byte_offset, simd src0, } /// simd -/// atomic_update(local_accessor lacc, +/// slm_atomic_update(simd byte_offset, +/// SrcSimdViewT src0, simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(simd byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset and src1 parameters."); + return slm_atomic_update(byte_offset, src0.read(), src1, mask); +} + +/// simd +/// slm_atomic_update(simd byte_offset, +/// simd src0, SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. 
+/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(simd byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset and src0 parameters."); + return slm_atomic_update(byte_offset, src0, src1.read(), mask); +} + +/// simd +/// slm_atomic_update(simd byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset, src0.read(), src1.read(), + mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// simd src0, simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, simd src0, simd src1, + simd_mask mask = 1) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset.read(), src0, src1, mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. 
+/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * + OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset and src1 parameters."); + return slm_atomic_update(byte_offset.read(), src0.read(), src1, + mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// simd src0, SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * + OffsetSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset and src0 parameters."); + return slm_atomic_update(byte_offset.read(), src0, src1.read(), + mask); +} + +/// simd +/// slm_atomic_update(OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Atomically updates \c N memory locations in SLM indicated by +/// a vector of offsets, and returns a vector of old +/// values found at the memory locations before update. +/// @tparam Op The atomic operation. +/// @param byte_offset The vector of 32-bit offsets. +/// @param src0 is the first atomic operand (new value). +/// @param src1 is the second atomic operand (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return slm_atomic_update(byte_offset.read(), src0, src1, mask); +} + +/// simd +/// atomic_update(local_accessor lacc, /// simd byte_offset, /// simd src0, /// simd src1, -/// simd_mask<1> pred = 1); // (lacc-au2-1) +/// simd_mask mask = 1); // (lacc-au2-1) template __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && __ESIMD_DNS::is_rw_local_accessor_v, @@ -7256,6 +8606,175 @@ atomic_update(AccessorT lacc, simd byte_offset, simd src0, return slm_atomic_update(byte_offset, src0, src1, mask); } +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// SrcSimdViewT src0, +/// simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset and src1 parameters."); + return atomic_update(lacc, byte_offset, src0.read(), src1, mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// simd src0, +/// SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset and src0 parameters."); + return atomic_update(lacc, byte_offset, src0, src1.read(), mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// simd byte_offset, +/// SrcSimdViewT src0, +/// SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset, src0.read(), src1.read(), + mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// simd src0, +/// simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd src0, + simd src1, simd_mask mask = 1) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset.read(), src0, src1, mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// simd src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset and src1 parameters."); + return atomic_update(lacc, byte_offset.read(), src0.read(), src1, + mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// simd src0, +/// SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. 
+template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * + OffsetSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset and src0 parameters."); + return atomic_update(lacc, byte_offset.read(), src0, src1.read(), + mask); +} + +/// simd +/// atomic_update(local_accessor lacc, +/// OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// SrcSimdViewT src1, +/// simd_mask mask = 1); +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +template +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> +atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask = 1) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(lacc, byte_offset.read(), src0.read(), + src1.read(), mask); +} + /// @} sycl_esimd_memory_slm namespace detail { @@ -7744,8 +9263,37 @@ __ESIMD_API std::enable_if_t< detail::is_simd_view_type_v, simd> atomic_update(T *p, OffsetSimdViewT byte_offset, PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(p, byte_offset.read(), mask, props); + return atomic_update(p, byte_offset.read(), props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// props = {}); +/// +/// A variation of \c atomic_update API with \c offsets represented as +/// 
\c simd_view object without mask operand and allows the use without +/// specifying \c T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be \c atomic_op::inc, +/// \c atomic_op::dec, or \c atomic_op::load. +/// @param p The USM pointer. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +/// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +template < + atomic_op Op, typename OffsetSimdViewT, typename T, + int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 0 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, PropertyListT props = {}) { + return atomic_update(p, byte_offset.read(), props); } /// A variation of \c atomic_update API with \c offset represented as @@ -7881,21 +9429,27 @@ atomic_update(T *p, simd byte_offset, simd src0, /// simd /// atomic_update(T *ptr, simd byte_offset, -/// simd src0, props = {}); // (usm-au1-2) - -/// A variation of \c atomic_update API without mask operand. - +/// SrcSimdViewT src0, simd_mask mask, props = {}); +/// +/// Atomically updates \c N memory locations represented by a USM pointer and +/// a vector of offsets relative to the pointer, and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 1 additional argument. +/// A variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. 
+/// /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, /// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, /// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, /// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c /// atomic_op::fsub, \c atomic_op::store. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. /// @param p The USM pointer. /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. /// @param src0 The additional argument. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time /// properties list. Only L1/L2 properties are used. Other properties are /// ignored. @@ -7903,20 +9457,100 @@ atomic_update(T *p, simd byte_offset, simd src0, /// update. 
/// template < - atomic_op Op, typename T, int N, typename Toffset, + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(T *p, simd byte_offset, simd src0, - PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(p, byte_offset, src0, mask, props); +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), mask, props); } /// simd -/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// atomic_update(T *ptr, simd byte_offset, +/// simd src0, props = {}); // (usm-au1-2) + +/// A variation of \c atomic_update API without mask operand. + +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The additional argument. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template < + atomic_op Op, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, simd src0, + PropertyListT props = {}) { + simd_mask mask = 1; + return atomic_update(p, byte_offset, src0, mask, props); +} + +/// simd +/// atomic_update(T *ptr, simd byte_offset, +/// SrcSimdViewT src0, props = {}); + +/// A variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object and no mask operand and allows the use without +/// specifying \c T and \c N template parameters. + +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The additional argument. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, /// simd src0, /// simd_mask mask, props = {}); // (usm-au1-3) /// @@ -7955,6 +9589,54 @@ atomic_update(T *p, OffsetSimdViewT offsets, simd src0, simd_mask mask, return atomic_update(p, offsets.read(), src0, mask, props); } +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset and \c src0 +/// represented as \c simd_view object and allows the use without specifying \c +/// T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param p The USM pointer. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// @param src0 The additional argument. 
+/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT offsets, SrcSimdViewT src0, + simd_mask mask, PropertyListT props = {}) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() && + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 and offsets parameters must correspond to the size of " + "mask parameter."); + return atomic_update(p, offsets.read(), src0.read(), mask, props); +} + /// simd /// atomic_update(T *p, OffsetSimdViewT byte_offset, /// simd src0, @@ -7994,6 +9676,48 @@ atomic_update(T *p, OffsetSimdViewT offsets, simd src0, return atomic_update(p, offsets.read(), src0, mask, props); } +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset represented as +/// \c simd_view object and no mask operand and allows the use without +/// specifying \c T and \c N template parameters. 
+/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @param p The USM pointer. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// @param src0 The additional argument. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT, typename T, + int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT offsets, SrcSimdViewT src0, + PropertyListT props = {}) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "offsets parameter."); + return atomic_update(p, offsets.read(), src0.read(), props); +} + /// A variation of \c atomic_update API with \c offset represented as /// scalar object. 
/// @@ -8125,17 +9849,21 @@ atomic_update(T *p, simd byte_offset, simd src0, /// simd /// atomic_update(T *p, simd byte_offset, -/// simd src0, simd src1, -/// props = {}); // (usm-au2-2) -// +/// SrcSimdViewT src0, simd src1, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. + /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. /// @param p The USM pointer. /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. /// @param src0 The first additional argument (new value). /// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time /// properties list. Only L1/L2 properties are used. // Other properties are ignored. @@ -8143,27 +9871,73 @@ atomic_update(T *p, simd byte_offset, simd src0, /// update. 
/// template < - atomic_op Op, typename T, int N, typename Toffset, + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), src1, mask, + props); +} + +/// simd +/// atomic_update(T *p, simd byte_offset, +/// simd src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c src1 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. + +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && ext::oneapi::experimental::is_property_list_v, simd> atomic_update(T *p, simd byte_offset, simd src0, - simd src1, PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(p, byte_offset, src0, src1, mask, props); + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0, src1.read(), mask, + props); } /// simd -/// atomic_update(T *p, OffsetSimdViewT byte_offset, -/// simd src0, simd src1, -/// simd_mask mask, props = {}) // (usm-au2-3) +/// atomic_update(T *p, simd byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}); /// +/// A variation of \c atomic_update API with \c src0 and \c src1 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. + /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. /// @param p The USM pointer. /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. /// @param src0 The first additional argument (new value). @@ -8175,25 +9949,30 @@ atomic_update(T *p, simd byte_offset, simd src0, // Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. 
+/// template < - atomic_op Op, typename T, int N, typename OffsetSimdViewT, + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 2 && - ext::oneapi::experimental::is_property_list_v && - detail::is_simd_view_type_v, + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, - simd src1, simd_mask mask, PropertyListT props = {}) { - return atomic_update(p, byte_offset.read(), src0, src1, mask, +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), src1.read(), mask, props); } /// simd -/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// atomic_update(T *p, simd byte_offset, /// simd src0, simd src1, -/// props = {}) // (usm-au2-4) -/// +/// props = {}); // (usm-au2-2) +// /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. /// @tparam T The vector element type. @@ -8207,82 +9986,1255 @@ atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, // Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. 
+/// template < - atomic_op Op, typename T, int N, typename OffsetSimdViewT, + atomic_op Op, typename T, int N, typename Toffset, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 2 && - ext::oneapi::experimental::is_property_list_v && - detail::is_simd_view_type_v, + ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, +atomic_update(T *p, simd byte_offset, simd src0, simd src1, PropertyListT props = {}) { simd_mask mask = 1; - return atomic_update(p, byte_offset.read(), src0, src1, mask, - props); + return atomic_update(p, byte_offset, src0, src1, mask, props); } -/// A variation of \c atomic_update API with \c byte_offset represented as -/// scalar. +/// simd +/// atomic_update(T *p, simd byte_offset, +/// SrcSimdViewT src0, simd src1, +/// props = {}); /// +/// A variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object without \c mask operand and allows the use without +/// specifying \c T and \c N template parameters. + /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. -/// @tparam Tx The vector element type. -/// @tparam N The number of memory locations to update. /// @param p The USM pointer. -/// @param byte_offset The scalar 32-bit or 64-bit offset in bytes. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. /// @param src0 The first additional argument (new value). /// @param src1 The second additional argument (expected value). -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. 
/// -template -__ESIMD_API std::enable_if_t, simd> -atomic_update(Tx *p, Toffset byte_offset, simd src0, simd src1, - simd_mask mask) { - return atomic_update(p, simd(byte_offset), src0, src1, - mask); +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + simd src1, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), src1, props); } -/// @anchor accessor_atomic_update0 -/// @brief No-argument variant of the atomic update operation. -/// -/// simd -/// atomic_update(AccessorT acc, simd byte_offset, -/// simd_mask mask, props = {}); /// (acc-au0-1) -/// simd -/// atomic_update(AccessorT acc, simd byte_offset, -/// props = {}); /// (acc-au0-2) -/// simd -/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset, -/// simd_mask mask, props = {}); /// (acc-au0-3) /// simd -/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset, -/// props = {}); /// (acc-au0-4) +/// atomic_update(T *p, simd byte_offset, +/// simd src0, SrcSimdViewT src1, +/// props = {}); /// +/// A variation of \c atomic_update API with \c src1 represented as +/// \c simd_view object without \c mask operand and allows the use without +/// specifying \c T and \c N template parameters. -/// Usage of cache hints or non-standard operation width N requires DG2 or PVC. 
-/// -/// simd -/// atomic_update(AccessorT acc, simd byte_offset, -/// simd_mask mask, props = {}); /// (acc-au0-1) -/// -/// Atomically updates \c N memory locations represented by an accessor and -/// a vector of offsets, and returns a vector of old values found at the +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, simd src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0, src1.read(), props); +} + +/// simd +/// atomic_update(T *p, simd byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// props = {}); +/// +/// A variation of \c atomic_update API with \c src0 and \c src1 represented as +/// \c simd_view object without \c mask operand and allows the use without +/// specifying \c T and \c N template parameters. + +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. 
+/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(T *p, simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 and src0 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(p, byte_offset, src0.read(), src1.read(), + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// simd src0, simd src1, +/// simd_mask mask, props = {}) // (usm-au2-3) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. 
Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +template < + atomic_op Op, typename T, int N, typename OffsetSimdViewT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, + simd src1, simd_mask mask, PropertyListT props = {}) { + return atomic_update(p, byte_offset.read(), src0, src1, mask, + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, simd src1, +/// simd_mask mask, props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 and byte_offset parameters must correspond to the size of " + "mask parameter."); + return atomic_update(p, byte_offset.read(), src0.read(), src1, mask, + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// simd src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and byte_offset parameters must correspond to the size of " + "mask parameter."); + return atomic_update(p, byte_offset.read(), src0, src1.read(), mask, + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * + OffsetSimdViewT::getSizeY(), + "Size of src0, src1 and byte_offset parameters must correspond " + "to the size of " + "mask parameter."); + return atomic_update(p, byte_offset.read(), src0.read(), + src1.read(), mask, props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// simd src0, simd src1, +/// props = {}) // (usm-au2-4) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +/// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update.
+template < + atomic_op Op, typename T, int N, typename OffsetSimdViewT, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, + simd src1, PropertyListT props = {}) { + simd_mask mask = 1; + return atomic_update(p, byte_offset.read(), src0, src1, mask, + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, simd src1, +/// props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +/// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update.
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd src1, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 and byte_offset parameters must correspond to the size of " + "src1 parameter."); + return atomic_update(p, byte_offset.read(), src0.read(), src1, + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// simd src0, SrcSimdViewT src1, +/// props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +/// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update.
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, simd src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and byte_offset parameters must correspond to the size of " + "src0 parameter."); + return atomic_update(p, byte_offset.read(), src0, src1.read(), + props); +} + +/// simd +/// atomic_update(T *p, OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +/// props = {}) +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @param p The USM pointer. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +/// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update.
+template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0, src1 and byte_offset parameters must be equal."); + return atomic_update(p, byte_offset.read(), src0.read(), + src1.read(), props); +} + +/// A variation of \c atomic_update API with \c byte_offset represented as +/// scalar. The scalar offset is wrapped in a \c simd object and the call is +/// forwarded to the vector-offset overload. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam Tx The vector element type. +/// @tparam N The number of memory locations to update. +/// @param p The USM pointer. +/// @param byte_offset The scalar 32-bit or 64-bit offset in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template +__ESIMD_API std::enable_if_t, simd> +atomic_update(Tx *p, Toffset byte_offset, simd src0, simd src1, + simd_mask mask) { + return atomic_update(p, simd(byte_offset), src0, src1, + mask); +} + +/// @anchor accessor_atomic_update0 +/// @brief No-argument variant of the atomic update operation.
+/// +/// simd +/// atomic_update(AccessorT acc, simd byte_offset, +/// simd_mask mask, props = {}); /// (acc-au0-1) +/// simd +/// atomic_update(AccessorT acc, simd byte_offset, +/// props = {}); /// (acc-au0-2) +/// simd +/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset, +/// simd_mask mask, props = {}); /// (acc-au0-3) +/// simd +/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset, +/// props = {}); /// (acc-au0-4) +/// + +/// Usage of cache hints or non-standard operation width N requires DG2 or PVC. +/// +/// simd +/// atomic_update(AccessorT acc, simd byte_offset, +/// simd_mask mask, props = {}); /// (acc-au0-1) +/// +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets, and returns a vector of old values found at the /// memory locations before update. The update operation has no arguments /// in addition to the value at the memory location. /// -/// @tparam Op The atomic operation - can be \c atomic_op::inc, -/// \c atomic_op::dec, or \c atomic_op::load. +/// @tparam Op The atomic operation - can be \c atomic_op::inc, +/// \c atomic_op::dec, or \c atomic_op::load. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, i.e. +/// accessor based accesses are automatically converted to stateless accesses. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +/// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update.
+/// +template < + atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 0 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, simd_mask mask, + PropertyListT props = {}) { +#ifdef __ESIMD_FORCE_STATELESS_MEM + return atomic_update(__ESIMD_DNS::accessorToPointer(acc), + byte_offset, mask, props); +#else + static_assert(std::is_integral_v, "Unsupported offset type"); + + if constexpr (detail::has_cache_hints() || + !detail::isPowerOf2(N, 32) || sizeof(T) < 4) { + return detail::atomic_update_impl< + Op, T, N, detail::lsc_data_size::default_size, PropertyListT>( + acc, byte_offset, mask); + } else { + if constexpr (Op == atomic_op::load) { + if constexpr (std::is_integral_v) { + return atomic_update( + acc, byte_offset, simd(0), mask, props); + } else { + using Tint = detail::uint_type_t; + simd Res = atomic_update( + acc, byte_offset, simd(0), mask, props); + return Res.template bit_cast_view(); + } + } else { + detail::check_atomic(); + static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported"); + + static_assert(sizeof(T) == 4, "Only 32 bit data is supported"); + const auto si = get_surface_index(acc); + using Tx = typename detail::__raw_t; + return __esimd_dword_atomic0(mask.data(), si, + byte_offset.data()); + } + } +#endif +} + +/// simd +/// atomic_update(AccessorT acc, simd byte_offset, +/// props = {}); /// (acc-au0-2) +/// A variation of \c atomic_update API without mask operand; the call is +/// forwarded to the masked overload with a mask initialized to 1. +/// +/// @tparam Op The atomic operation - can be \c atomic_op::inc, +/// \c atomic_op::dec, or \c atomic_op::load. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, i.e. +/// accessor based accesses are automatically converted to stateless accesses. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +/// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 0 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, + PropertyListT props = {}) { + simd_mask mask = 1; + return atomic_update(acc, byte_offset, mask, props); +} + +/// simd +/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset, +/// simd_mask mask, props = {}); /// (acc-au0-3) +/// A variation of \c atomic_update API with \c byte_offset represented as +/// \c simd_view object. +/// +/// @tparam Op The atomic operation - can be \c atomic_op::inc, +/// \c atomic_op::dec, or \c atomic_op::load. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e. accessor based accesses are automatically converted to +/// stateless accesses. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list.
Only L1/L2 properties are used. +/// Other properties are ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename T, int N, typename OffsetSimdViewT, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 0 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd_mask mask, + PropertyListT props = {}) { + return atomic_update(acc, byte_offset.read(), mask, props); +} + +/// simd +/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset, +/// props = {}); /// (acc-au0-4) +/// A variation of \c atomic_update API with \c byte_offset represented as +/// \c simd_view object and no mask operand. +/// +/// @tparam Op The atomic operation - can be \c atomic_op::inc, +/// \c atomic_op::dec, or \c atomic_op::load. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e. accessor based accesses are automatically converted to +/// stateless accesses. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update.
+/// +template < + atomic_op Op, typename T, int N, typename OffsetSimdViewT, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 0 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, + PropertyListT props = {}) { + simd_mask mask = 1; + return atomic_update(acc, byte_offset.read(), mask, props); +} + +/// A variation of \c atomic_update API with \c byte_offset represented as +/// scalar. +/// +/// @tparam Op The atomic operation - can be \c atomic_op::inc, +/// \c atomic_op::dec, or \c atomic_op::load. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The scalar 32-bit or 64-bit offset in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template +__ESIMD_API + std::enable_if_t<__ESIMD_DNS::get_num_args() == 0 && + __ESIMD_DNS::is_rw_device_accessor_v, + simd> + atomic_update(AccessorTy acc, Toffset byte_offset, simd_mask mask) { + return atomic_update(acc, simd(byte_offset), mask); +} + +/// A variation of \c atomic_update API with \p byte_offset represented as +/// scalar using \c local_accessor. +/// +/// @tparam Op The atomic operation - can be \c atomic_op::inc, +/// \c atomic_op::dec, or \c atomic_op::load. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update.
+/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The scalar 32-bit offset in bytes. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template +__ESIMD_API + std::enable_if_t<__ESIMD_DNS::get_num_args() == 0 && + __ESIMD_DNS::is_rw_local_accessor_v, + simd> + atomic_update(AccessorTy acc, uint32_t byte_offset, simd_mask mask) { + return atomic_update(acc, simd(byte_offset), mask); +} + +/// @anchor accessor_atomic_update1 +/// @brief Single-argument variant of the atomic update operation. +/// +/// simd +/// atomic_update(AccessorT acc, simd byte_offset, +/// simd src0, simd_mask mask, props = {});//(acc-au1-1) +/// simd +/// atomic_update(AccessorT acc, simd byte_offset, +/// simd src0, props = {}); // (acc-au1-2) +/// +/// simd +/// atomic_update(AccessorT acc, +/// OffsetSimdViewT byte_offset, +/// simd src0, +/// simd_mask mask, props = {}); // (acc-au1-3) +/// simd +/// atomic_update(AccessorT acc, +/// OffsetSimdViewT byte_offset, +/// simd src0, +/// props = {}); // (acc-au1-4) +/// + +/// simd +/// atomic_update(AccessorT acc, simd byte_offset, +/// simd src0, simd_mask mask, props = {});//(acc-au1-1) +/// +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets, and returns a vector of old values found at the +/// memory locations before update. The update operation has 1 additional +/// argument.
+/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, +/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, +/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, +/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c +/// atomic_op::fsub, \c atomic_op::store. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, i.e. +/// accessor based accesses are automatically converted to stateless accesses. +/// @param src0 The additional argument. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// + +template < + atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, simd src0, + simd_mask mask, PropertyListT props = {}) { +#ifdef __ESIMD_FORCE_STATELESS_MEM + return atomic_update(__ESIMD_DNS::accessorToPointer(acc), + byte_offset, src0, mask, props); +#else + static_assert(std::is_integral_v, "Unsupported offset type"); + static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported"); + // Auto-convert FP atomics to LSC version. The LSC-based implementation is + // also used when detail::has_cache_hints() is true, when + // isPowerOf2(N, 32) does not hold, or when sizeof(T) < 4. + if constexpr (detail::has_cache_hints() || + Op == atomic_op::fmin || Op == atomic_op::fmax || + Op == atomic_op::fadd || Op == atomic_op::fsub || + !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) { + return detail::atomic_update_impl< + Op, T, N, detail::lsc_data_size::default_size, PropertyListT>( + acc, byte_offset, src0, mask); + } else if constexpr (Op == atomic_op::store) { + if constexpr (std::is_integral_v) { + return atomic_update(acc, byte_offset, src0, mask, + props); + } else { + using Tint = detail::uint_type_t; + simd Res = atomic_update( + acc, byte_offset, src0.template bit_cast_view(), mask, props); + return Res.template bit_cast_view(); + } + } else { + detail::check_atomic(); + static_assert(sizeof(T) == 4, "Only 32 bit data is supported"); + const auto si = __ESIMD_NS::get_surface_index(acc); + using Tx = typename detail::__raw_t; + return __esimd_dword_atomic1( + mask.data(), si, byte_offset.data(), + sycl::bit_cast<__ESIMD_DNS::vector_type_t>(src0.data())); + } +#endif +} + +/// simd +/// atomic_update(AccessorT acc, simd byte_offset, +/// SrcSimdViewT src0, simd_mask mask, props = {}); +/// +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets,
and returns a vector of old values found at the +/// memory locations before update. The update operation has 1 additional +/// argument. +/// This is a variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object, which allows the use without +/// specifying \c T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, +/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, +/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, +/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c +/// atomic_op::fsub, \c atomic_op::store. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, i.e. +/// accessor based accesses are automatically converted to stateless accesses. +/// @param src0 The additional argument. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update.
+/// + +template < + atomic_op Op, typename SrcSimdViewT, typename Toffset, + typename T = SrcSimdViewT::value_type::element_type, int N, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), mask, props); +} + +/// simd +/// atomic_update(AccessorT acc, simd byte_offset, +/// simd src0, props = {}); // (acc-au1-2) +/// +/// A variation of \c atomic_update API with no mask operand; the call is +/// forwarded to the masked overload with a mask initialized to 1. +/// +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets, and returns a vector of old values found at the +/// memory locations before update. The update operation has 1 additional +/// argument. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e.
accessor based accesses are automatically converted to +/// stateless accesses. +/// @param src0 The additional argument. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, simd src0, + PropertyListT props = {}) { + simd_mask mask = 1; + return atomic_update(acc, byte_offset, src0, mask, props); +} + +/// simd +/// atomic_update(AccessorT acc, simd byte_offset, +/// SrcSimdViewT src0, props = {}); +/// +/// A variation of \c atomic_update API with no mask operand and \c src0 +/// represented as \c simd_view object that allows the use without specifying +/// \c T and \c N template parameters. +/// +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets, and returns a vector of old values found at the +/// memory locations before update. The update operation has 1 additional +/// argument. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e. accessor based accesses are automatically converted to +/// stateless accesses. +/// @param src0 The additional argument. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename SrcSimdViewT, typename Toffset, + typename T = SrcSimdViewT::value_type::element_type, int N, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), props); +} + +/// simd +/// atomic_update(AccessorT acc, +/// OffsetSimdViewT byte_offset, +/// simd src0, +/// simd_mask mask, props = {}); // (acc-au1-3) +/// +/// A variation of \c atomic_update API with \c byte_offset represented as +/// \c simd_view object. The offsets are obtained with +/// \c byte_offset.read() and the call is forwarded to the vector-offset +/// overload. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update.
+/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e. accessor based accesses are automatically converted to +/// stateless accesses. +/// @param src0 The additional argument. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename T, int N, typename OffsetSimdViewT, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, + simd_mask mask, PropertyListT props = {}) { + return atomic_update(acc, byte_offset.read(), src0, mask, props); +} + +/// simd +/// atomic_update(AccessorT acc, +/// OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset and \c src0 +/// represented as \c simd_view objects that allows the use without specifying +/// \c T and \c N template parameters. \c N defaults to the size of \c src0 and +/// must match the size of \c byte_offset.
+/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e. accessor based accesses are automatically converted to +/// stateless accesses. +/// @param src0 The additional argument. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, + int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd_mask mask, PropertyListT props = {}) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset.read(), src0.read(), mask, + props); +} + +/// simd +/// atomic_update(AccessorT acc, +/// OffsetSimdViewT byte_offset, +/// simd src0, +/// props = {}); // (acc-au1-4) +/// +/// A variation of \c atomic_update API with \c byte_offset represented as +/// \c simd_view object and no mask operand. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam T The vector element type. +/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e. 
accessor based accesses are automatically converted to +/// stateless accesses. +/// @param src0 The additional argument. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename T, int N, typename OffsetSimdViewT, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, + PropertyListT props = {}) { + simd_mask mask = 1; + return atomic_update(acc, byte_offset.read(), src0, mask, props); +} + +/// simd +/// atomic_update(AccessorT acc, +/// OffsetSimdViewT byte_offset, +/// SrcSimdViewT src0, +/// props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset and \c src0 +/// represented as \c simd_view object and no \c mask operand that allows the +/// use without specifying \c T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c +/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c +/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c +/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c +/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. +/// 64-bit offsets are supported only when stateless memory accesses are +/// enforced, i.e. 
accessor based accesses are automatically converted to +/// stateless accesses. +/// @param src0 The additional argument. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. Other properties are +/// ignored. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, + int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + PropertyListT props = {}) { + static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset.read(), src0.read(), props); +} + +/// A variation of \c atomic_update API with \c offset represented as +/// scalar object. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, +/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, +/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, +/// \c atomic_op::fmax, \c atomic_op::fmin \c atomic_op::store. +/// @tparam Tx The vector element type. +/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param offset The scalar 32-bit or 64-bit offset in bytes. 
64-bit +/// offset are supported only when stateless memory accesses are enforced, i.e. +/// accessor based accesses are automatically converted to stateless accesses. +/// @param src0 The additional argument. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. +/// +template +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::is_rw_device_accessor_v && + ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1), + simd> +atomic_update(AccessorTy acc, Toffset offset, simd src0, + simd_mask mask) { + return atomic_update(acc, simd(offset), src0, mask); +} + +/// A variation of \c atomic_update API with \c offset represented as +/// scalar object and uses \c local_accessor. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, +/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, +/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, +/// \c atomic_op::fmax, \c atomic_op::fmin \c atomic_op::store. +/// @tparam Tx The vector element type. +/// @tparam N The number of memory locations to update. +/// @tparam AccessorTy type of the SYCL accessor. +/// @param acc The SYCL accessor. +/// @param offset The scalar 32-bit offset in bytes. +/// @param src0 The additional argument. +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @return A vector of the old values at the memory locations before the +/// update. 
+/// +template +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::is_rw_local_accessor_v && + ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1), + simd> +atomic_update(AccessorTy acc, uint32_t offset, simd src0, + simd_mask mask) { + return atomic_update(acc, simd(offset), src0, mask); +} + +/// @anchor accessor_atomic_update2 +/// @brief Two-argument variant of the atomic update operation. +/// +/// simd +/// atomic_update(AccessorTy acc, simd byte_offset, +/// simd src0, simd src1, +// simd_mask mask,props = {}); // (acc-au2-1) +/// +/// simd +/// atomic_update(AccessorTy acc, simd byte_offset, +/// simd src0, simd src1, +/// props = {}); // (acc-au2-2) +/// simd +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, simd src0, simd src1, +/// simd_mask mask, props = {}); // (acc-au2-3) +/// +/// simd +/// atomic_update(AccessorTy acc, +/// OffsetSimdViewT, byte_offset, +/// simd src0, simd src1, props = {}); // (acc-au2-4) +/// + +/// simd +/// atomic_update(AccessorTy acc, simd byte_offset, +/// simd src0, simd src1, +// simd_mask mask,props = {}); // (acc-au2-1) +/// +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. /// @tparam T The vector element type. /// @tparam N The number of memory locations to update. /// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit -/// offsets are supported only when stateless memory accesses are enforced, i.e. -/// accessor based accesses are automatically converted to stateless accesses. +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. 
accessor based accesses are automatically converted to stateless +/// accesses. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). /// @param mask Operation mask, only locations with non-zero in the /// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time @@ -8295,62 +11247,69 @@ template < atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 0 && + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && __ESIMD_DNS::is_rw_device_accessor_v && ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(AccessorTy acc, simd byte_offset, simd_mask mask, - PropertyListT props = {}) { +atomic_update(AccessorTy acc, simd byte_offset, simd src0, + simd src1, simd_mask mask, PropertyListT props = {}) { #ifdef __ESIMD_FORCE_STATELESS_MEM return atomic_update(__ESIMD_DNS::accessorToPointer(acc), - byte_offset, mask, props); + byte_offset, src0, src1, mask, props); #else static_assert(std::is_integral_v, "Unsupported offset type"); - + static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported"); + // Use LSC atomic when cache hints are present, FP atomics is used, + // non-power of two length is used, operation width greater than 32, or the + // data size is less than 4 bytes, if constexpr (detail::has_cache_hints() || - !detail::isPowerOf2(N, 32) || sizeof(T) < 4) { + Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) || + sizeof(T) < 4) { + // 2-argument lsc_atomic_update arguments order matches the standard one - + // expected value first, then new value. But atomic_update uses reverse + // order, hence the src1/src0 swap. 
return detail::atomic_update_impl< Op, T, N, detail::lsc_data_size::default_size, PropertyListT>( - acc, byte_offset, mask); + acc, byte_offset, src1, src0, mask); } else { - if constexpr (Op == atomic_op::load) { - if constexpr (std::is_integral_v) { - return atomic_update( - acc, byte_offset, simd(0), mask, props); - } else { - using Tint = detail::uint_type_t; - simd Res = atomic_update( - acc, byte_offset, simd(0), mask, props); - return Res.template bit_cast_view(); - } - } else { - detail::check_atomic(); - static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported"); - - static_assert(sizeof(T) == 4, "Only 32 bit data is supported"); - const auto si = get_surface_index(acc); - using Tx = typename detail::__raw_t; - return __esimd_dword_atomic0(mask.data(), si, - byte_offset.data()); - } + detail::check_atomic(); + static_assert(sizeof(T) == 4, "Only 32 bit data is supported"); + const auto si = __ESIMD_NS::get_surface_index(acc); + using Tx = typename detail::__raw_t; + return __esimd_dword_atomic2( + mask.data(), si, byte_offset.data(), + sycl::bit_cast<__ESIMD_DNS::vector_type_t>(src0.data()), + sycl::bit_cast<__ESIMD_DNS::vector_type_t>(src1.data())); } #endif } /// simd -/// atomic_update(AccessorT acc, simd byte_offset, -/// props = {}); /// (acc-au0-2) -/// A variation of \c atomic_update API without mask operand +/// atomic_update(AccessorTy acc, simd byte_offset, +/// SrcSimdViewT src0, simd src1, +// simd_mask mask,props = {}); /// -/// @tparam Op The atomic operation - can be \c atomic_op::inc, -/// \c atomic_op::dec, or \c atomic_op::load. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. 
+/// +/// A variation of \c atomic_update API with \c src0 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. /// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit -/// offsets are supported only when stateless memory accesses are enforced, i.e. -/// accessor based accesses are automatically converted to stateless accesses. +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time /// properties list. Only L1/L2 properties are used. // Other properties are ignored. @@ -8358,35 +11317,48 @@ atomic_update(AccessorTy acc, simd byte_offset, simd_mask mask, /// update. 
/// template < - atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 0 && + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && __ESIMD_DNS::is_rw_device_accessor_v && ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(AccessorTy acc, simd byte_offset, - PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(acc, byte_offset, mask, props); +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), src1, mask, + props); } /// simd -/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset, -/// simd_mask mask, props = {}); /// (acc-au0-3) -/// A variation of \c atomic_update API with \c offsets represented as -/// \c simd_view object. +/// atomic_update(AccessorTy acc, simd byte_offset, +/// simd src0, SrcSimdViewT src1, +// simd_mask mask,props = {}); /// -/// @tparam Op The atomic operation - can be \c atomic_op::inc, -/// \c atomic_op::dec, or \c atomic_op::load. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. 
+/// +/// A variation of \c atomic_update API with \c src1 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. /// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. -/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. -/// 64-bit offsets are supported only when stateless memory accesses are -/// enforced, i.e. accessor based accesses are automatically converted to -/// stateless accesses. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). /// @param mask Operation mask, only locations with non-zero in the /// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time @@ -8396,509 +11368,490 @@ atomic_update(AccessorTy acc, simd byte_offset, /// update. 
/// template < - atomic_op Op, typename T, int N, typename OffsetSimdViewT, + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 0 && + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && __ESIMD_DNS::is_rw_device_accessor_v && - ext::oneapi::experimental::is_property_list_v && - detail::is_simd_view_type_v, + ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd_mask mask, - PropertyListT props = {}) { - return atomic_update(acc, byte_offset.read(), mask, props); +atomic_update(AccessorTy acc, simd byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0, src1.read(), mask, + props); } /// simd -/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset, -/// props = {}); /// (acc-au0-4) -/// A variation of \c atomic_update API with \c offsets represented as -/// \c simd_view object and no mask operand. +/// atomic_update(AccessorTy acc, simd byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +// simd_mask mask,props = {}); /// -/// @tparam Op The atomic operation - can be \c atomic_op::inc, -/// \c atomic_op::dec, or \c atomic_op::load. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. +/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. 
+/// +/// A variation of \c atomic_update API with \c src0 and \c src1 represented as +/// \c simd_view object and allows the use without specifying \c T and \c N +/// template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. /// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. -/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. -/// 64-bit offsets are supported only when stateless memory accesses are -/// enforced, i.e. accessor based accesses are automatically converted to -/// stateless accesses. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. 
/// template < - atomic_op Op, typename T, int N, typename OffsetSimdViewT, - typename AccessorTy, + atomic_op Op, typename SrcSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, int N, + typename Toffset, typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 0 && + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && __ESIMD_DNS::is_rw_device_accessor_v && - ext::oneapi::experimental::is_property_list_v && - detail::is_simd_view_type_v, + ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, - PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(acc, byte_offset.read(), mask, props); +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 and src1 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), src1.read(), + mask, props); } -/// A variation of \c atomic_update API with \c offset represented as -/// scalar. +/// simd +/// atomic_update(AccessorTy acc, simd byte_offset, +/// simd src0, simd src1, +/// props = {}); // (acc-au2-2) /// -/// @tparam Op The atomic operation - can be \c atomic_op::inc, -/// \c atomic_op::dec, or \c atomic_op::load. +/// A variation of \c atomic_update API with no mask operand. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. /// @tparam T The vector element type. /// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. -/// @param byte_offset The scalar 32-bit or 64-bit offset in bytes. 
64-bit -/// offset are supported only when stateless memory accesses are enforced, -/// i.e. accessor based accesses are automatically converted to stateless -/// accesses. -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. /// -template -__ESIMD_API - std::enable_if_t<__ESIMD_DNS::get_num_args() == 0 && - __ESIMD_DNS::is_rw_device_accessor_v, - simd> - atomic_update(AccessorTy acc, Toffset byte_offset, simd_mask mask) { - return atomic_update(acc, simd(byte_offset), mask); +template < + atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, simd src0, + simd src1, PropertyListT props = {}) { + simd_mask mask = 1; + return atomic_update(acc, byte_offset, src0, src1, mask, props); } -/// A variation of \c atomic_update API with \p byte_offset represented as -/// scalar using \c local_accessor. +/// simd +/// atomic_update(AccessorTy acc, simd byte_offset, +/// SrcSimdViewT src0, simd src1, +// props = {}); /// -/// @tparam Op The atomic operation - can be \c atomic_op::inc, -/// \c atomic_op::dec, or \c atomic_op::load. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. 
+/// Atomically updates \c N memory locations represented by an accessor and +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. +/// +/// A variation of \c atomic_update API with no \c mask operand and with \c src0 +/// represented as \c simd_view object and allows the use without specifying \c +/// T and \c N template parameters. +/// +/// @tparam Op The atomic operation - can be one of the following: +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. /// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. -/// @param byte_offset The scalar 32-bit or 64-bit offset in bytes. 64-bit -/// offset are supported only when stateless memory accesses are enforced, +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit +/// offsets are supported only when stateless memory accesses are enforced, /// i.e. accessor based accesses are automatically converted to stateless /// accesses. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). /// @param mask Operation mask, only locations with non-zero in the /// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. 
/// -template -__ESIMD_API - std::enable_if_t<__ESIMD_DNS::get_num_args() == 0 && - __ESIMD_DNS::is_rw_local_accessor_v, - simd> - atomic_update(AccessorTy acc, uint32_t byte_offset, simd_mask mask) { - return atomic_update(acc, simd(byte_offset), mask); +template < + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v, + simd> +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + simd src1, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), src1, props); } -/// @anchor accessor_atomic_update1 -/// @brief Single-argument variant of the atomic update operation. 
-/// -/// simd -/// atomic_update(AccessorT acc, simd byte_offset, -/// simd src0, simd_mask mask, props = {});//(acc-au1-1) -/// simd -/// atomic_update(AccessorT acc, simd byte_offset, -/// simd src0, props = {}); // (acc-au1-2) -/// -/// simd -/// atomic_update(AccessorT acc, -//// OffsetSimdViewT byte_offset, -/// simd src0, -/// simd_mask mask, props = {}); // (acc-au1-3) -/// simd -/// atomic_update(AccessorT acc, -/// OffsetSimdViewT byte_offset, -/// simd src0, -/// props = {}); // (acc-au1-4) -/// - /// simd -/// atomic_update(AccessorT acc, simd byte_offset, -/// simd src0, simd_mask mask, props = {});//(acc-au1-1) +/// atomic_update(AccessorTy acc, simd byte_offset, +/// simd src0, SrcSimdViewT src1, +// props = {}); /// /// Atomically updates \c N memory locations represented by an accessor and -/// a vector of offsets, and returns a vector of old values found at the -/// memory locations before update. The update operation has 1 additional -/// argument. +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. +/// +/// A variation of \c atomic_update API with no \c mask operand with \c src1 +/// represented as \c simd_view object and allows the use without specifying \c +/// T and \c N template parameters. /// /// @tparam Op The atomic operation - can be one of the following: -/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, -/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, -/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, -/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c -/// atomic_op::fsub, \c atomic_op::store. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. /// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. 
/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit -/// offsets are supported only when stateless memory accesses are enforced, i.e. -/// accessor based accesses are automatically converted to stateless accesses. -/// @param src0 The additional argument. +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). /// @param mask Operation mask, only locations with non-zero in the /// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time -/// properties list. Only L1/L2 properties are used. Other properties are -/// ignored. +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. /// - template < - atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset, + typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && __ESIMD_DNS::is_rw_device_accessor_v && ext::oneapi::experimental::is_property_list_v, simd> atomic_update(AccessorTy acc, simd byte_offset, simd src0, - simd_mask mask, PropertyListT props = {}) { -#ifdef __ESIMD_FORCE_STATELESS_MEM - return atomic_update(__ESIMD_DNS::accessorToPointer(acc), - byte_offset, src0, mask, props); -#else - static_assert(std::is_integral_v, "Unsupported offset type"); - static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported"); - // Auto-convert FP atomics to LSC version. 
- if constexpr (detail::has_cache_hints() || - Op == atomic_op::fmin || Op == atomic_op::fmax || - Op == atomic_op::fadd || Op == atomic_op::fsub || - !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) { - return detail::atomic_update_impl< - Op, T, N, detail::lsc_data_size::default_size, PropertyListT>( - acc, byte_offset, src0, mask); - } else if constexpr (Op == atomic_op::store) { - if constexpr (std::is_integral_v) { - return atomic_update(acc, byte_offset, src0, mask, - props); - } else { - using Tint = detail::uint_type_t; - simd Res = atomic_update( - acc, byte_offset, src0.template bit_cast_view(), mask, props); - return Res.template bit_cast_view(); - } - } else { - detail::check_atomic(); - static_assert(sizeof(T) == 4, "Only 32 bit data is supported"); - const auto si = __ESIMD_NS::get_surface_index(acc); - using Tx = typename detail::__raw_t; - return __esimd_dword_atomic1( - mask.data(), si, byte_offset.data(), - sycl::bit_cast<__ESIMD_DNS::vector_type_t>(src0.data())); - } -#endif + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src1 parameter must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0, src1.read(), props); } /// simd -/// atomic_update(AccessorT acc, simd byte_offset, -/// simd src0, props = {}); // (acc-au1-2) -/// -/// A variation of \c atomic_update API with no mask operand. +/// atomic_update(AccessorTy acc, simd byte_offset, +/// SrcSimdViewT src0, SrcSimdViewT src1, +// props = {}); /// /// Atomically updates \c N memory locations represented by an accessor and -/// a vector of offsets, and returns a vector of old values found at the -/// memory locations before update. The update operation has 1 additional -/// argument. +/// a vector of offsets and returns a vector of old +/// values found at the memory locations before update. The update operation +/// has 2 additional arguments. 
+/// +/// A variation of \c atomic_update API with no \c mask operand with \c src0 and +/// \c src1 represented as \c simd_view object and allows the use without +/// specifying \c T and \c N template parameters. /// /// @tparam Op The atomic operation - can be one of the following: -/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, -/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, -/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, -/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c -/// atomic_op::fsub, \c atomic_op::store. -/// @tparam T The vector element type. -/// @tparam N The number of memory locations to update. +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. /// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit -/// offsets are supported only when stateless memory accesses are enforced, i.e. -/// accessor based accesses are automatically converted to stateless accesses. -/// @param src0 The additional argument. +/// offsets are supported only when stateless memory accesses are enforced, +/// i.e. accessor based accesses are automatically converted to stateless +/// accesses. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time -/// properties list. Only L1/L2 properties are used. Other properties are -/// ignored. +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. 
/// template < - atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + atomic_op Op, typename SrcSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, int N, + typename Toffset, typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + detail::is_simd_view_type_v && __ESIMD_DNS::is_rw_device_accessor_v && ext::oneapi::experimental::is_property_list_v, simd> -atomic_update(AccessorTy acc, simd byte_offset, simd src0, - PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(acc, byte_offset, src0, mask, props); +atomic_update(AccessorTy acc, simd byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), + "Size of src0 and src1 parameters must correspond to the size of " + "byte_offset parameter."); + return atomic_update(acc, byte_offset, src0.read(), src1.read(), + props); } /// simd -/// atomic_update(AccessorT acc, -/// OffsetSimdViewT byte_offset, -/// simd src0, -/// simd_mask mask, props = {}); // (acc-au1-3) +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, simd src0, simd src1, +/// simd_mask mask, props = {}); // (acc-au2-3) /// /// A variation of \c atomic_update API with \c byte_offset represented as -/// \c simd_view object. +/// a \c simd_view object. /// /// @tparam Op The atomic operation - can be one of the following: -/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c -/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c -/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c -/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c -/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. 
/// @tparam T The vector element type. /// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. -/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. -/// 64-bit offsets are supported only when stateless memory accesses are -/// enforced, i.e. accessor based accesses are automatically converted to -/// stateless accesses. -/// @param src0 The additional argument. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). /// @param mask Operation mask, only locations with non-zero in the /// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time -/// properties list. Only L1/L2 properties are used. Other properties are -/// ignored. +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. 
-/// template < atomic_op Op, typename T, int N, typename OffsetSimdViewT, typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::get_num_args() == 2 && __ESIMD_DNS::is_rw_device_accessor_v && ext::oneapi::experimental::is_property_list_v && detail::is_simd_view_type_v, simd> atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, - simd_mask mask, PropertyListT props = {}) { - return atomic_update(acc, byte_offset.read(), src0, mask, props); + simd src1, simd_mask mask, PropertyListT props = {}) { + return atomic_update(acc, byte_offset.read(), src0, src1, mask, + props); } /// simd -/// atomic_update(AccessorT acc, -/// OffsetSimdViewT byte_offset, -/// simd src0, -/// props = {}); // (acc-au1-4) +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, SrcSimdViewT src0, simd src1, +/// simd_mask mask, props = {}); /// -/// A variation of \c atomic_update API with \c byte_offset represented as -/// \c simd_view object and no mask operand. +/// A variation of \c atomic_update API with \c byte_offset and \c src0 +/// represented as \c simd_view object and allows the use without specifying \c +/// T and \c N template parameters. /// /// @tparam Op The atomic operation - can be one of the following: -/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c -/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c -/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c -/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c -/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store. +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. /// @tparam T The vector element type. /// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. 
-/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes. -/// 64-bit offsets are supported only when stateless memory accesses are -/// enforced, i.e. accessor based accesses are automatically converted to -/// stateless accesses. -/// @param src0 The additional argument. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). +/// @param mask Operation mask, only locations with non-zero in the +/// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time -/// properties list. Only L1/L2 properties are used. Other properties are -/// ignored. +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. -/// template < - atomic_op Op, typename T, int N, typename OffsetSimdViewT, - typename AccessorTy, + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 1 && + __ESIMD_DNS::get_num_args() == 2 && __ESIMD_DNS::is_rw_device_accessor_v && ext::oneapi::experimental::is_property_list_v && - detail::is_simd_view_type_v, + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, simd> -atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, - PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(acc, byte_offset.read(), src0, mask, props); +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + simd src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size 
of src0 and byte_offset parameters must correspond to the size of " + "src1 parameter."); + return atomic_update(acc, byte_offset.read(), src0.read(), src1, + mask, props); } - -/// A variation of \c atomic_update API with \c offset represented as -/// scalar object. + +/// simd +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, simd src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset and \c src1 +/// represented as \c simd_view object and allows the use without specifying \c +/// T and \c N template parameters. /// /// @tparam Op The atomic operation - can be one of the following: -/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, -/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, -/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, -/// \c atomic_op::fmax, \c atomic_op::fmin \c atomic_op::store. -/// @tparam Tx The vector element type. +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. /// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. -/// @param offset The scalar 32-bit or 64-bit offset in bytes. 64-bit -/// offset are supported only when stateless memory accesses are enforced, i.e. -/// accessor based accesses are automatically converted to stateless accesses. -/// @param src0 The additional argument. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). /// @param mask Operation mask, only locations with non-zero in the /// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. 
/// @return A vector of the old values at the memory locations before the /// update. -/// -template +template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::is_rw_device_accessor_v && - ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1), + __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, simd> -atomic_update(AccessorTy acc, Toffset offset, simd src0, - simd_mask mask) { - return atomic_update(acc, simd(offset), src0, mask); +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and byte_offset parameters must correspond to the size of " + "src0 parameter."); + return atomic_update(acc, byte_offset.read(), src0, src1.read(), + mask, props); } -/// A variation of \c atomic_update API with \c offset represented as -/// scalar object and uses \c local_accessor. +/// simd +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, SrcSimdViewT src0, SrcSimdViewT src1, +/// simd_mask mask, props = {}); +/// +/// A variation of \c atomic_update API with \c byte_offset, \c src0 and +/// \c src1 represented as \c simd_view object and allows the use without +/// specifying \c T and \c N template parameters. 
/// /// @tparam Op The atomic operation - can be one of the following: -/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max, -/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or, -/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint, -/// \c atomic_op::fmax, \c atomic_op::fmin \c atomic_op::store. -/// @tparam Tx The vector element type. +/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. +/// @tparam T The vector element type. /// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. -/// @param offset The scalar 32-bit offset in bytes. -/// @param src0 The additional argument. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. +/// @param src0 The first additional argument (new value). +/// @param src1 The second additional argument (expected value). /// @param mask Operation mask, only locations with non-zero in the /// corresponding mask element are updated. +/// @param props The parameter 'props' specifies the optional compile-time +/// properties list. Only L1/L2 properties are used. +// Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. 
-/// -template +template < + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, int N, + typename AccessorTy, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::is_rw_local_accessor_v && - ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1), - simd> -atomic_update(AccessorTy acc, uint32_t offset, simd src0, - simd_mask mask) { - return atomic_update(acc, simd(offset), src0, mask); + __ESIMD_DNS::get_num_args() == 2 && + __ESIMD_DNS::is_rw_device_accessor_v && + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, + simd> +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, simd_mask mask, PropertyListT props = {}) { + static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * + OffsetSimdViewT::getSizeY(), + "Size of src0, src1 and byte_offset parameters must correspond " + "to the size of " + "mask parameter."); + return atomic_update(acc, byte_offset.read(), src0.read(), + src1.read(), mask, props); } -/// @anchor accessor_atomic_update2 -/// @brief Two-argument variant of the atomic update operation. 
-/// -/// simd -/// atomic_update(AccessorTy acc, simd byte_offset, -/// simd src0, simd src1, -// simd_mask mask,props = {}); // (acc-au2-1) -/// -/// simd -/// atomic_update(AccessorTy acc, simd byte_offset, -/// simd src0, simd src1, -/// props = {}); // (acc-au2-2) -/// simd -/// atomic_update(AccessorTy acc, OffsetSimdViewT -/// byte_offset, simd src0, simd src1, -/// simd_mask mask, props = {}); // (acc-au2-3) -/// /// simd /// atomic_update(AccessorTy acc, /// OffsetSimdViewT, byte_offset, /// simd src0, simd src1, props = {}); // (acc-au2-4) /// - -/// simd -/// atomic_update(AccessorTy acc, simd byte_offset, -/// simd src0, simd src1, -// simd_mask mask,props = {}); // (acc-au2-1) -/// -/// Atomically updates \c N memory locations represented by an accessor and -/// a vector of offsets and returns a vector of old -/// values found at the memory locations before update. The update operation -/// has 2 additional arguments. +/// A variation of \c atomic_update API with \c byte_offset represented as +/// a \c simd_view object and no mask operand. /// /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. /// @tparam T The vector element type. /// @tparam N The number of memory locations to update. -/// @tparam AccessorTy type of the SYCL accessor. /// @param acc The SYCL accessor. -/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit -/// offsets are supported only when stateless memory accesses are enforced, -/// i.e. accessor based accesses are automatically converted to stateless -/// accesses. +/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. /// @param src0 The first additional argument (new value). /// @param src1 The second additional argument (expected value). -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. 
/// @param props The parameter 'props' specifies the optional compile-time /// properties list. Only L1/L2 properties are used. // Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. -/// template < - atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + atomic_op Op, typename T, int N, typename OffsetSimdViewT, + typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< - __ESIMD_DNS::get_num_args() == 2 && std::is_integral_v && + __ESIMD_DNS::get_num_args() == 2 && __ESIMD_DNS::is_rw_device_accessor_v && - ext::oneapi::experimental::is_property_list_v, + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v, simd> -atomic_update(AccessorTy acc, simd byte_offset, simd src0, - simd src1, simd_mask mask, PropertyListT props = {}) { -#ifdef __ESIMD_FORCE_STATELESS_MEM - return atomic_update(__ESIMD_DNS::accessorToPointer(acc), - byte_offset, src0, src1, mask, props); -#else - static_assert(std::is_integral_v, "Unsupported offset type"); - static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported"); - // Use LSC atomic when cache hints are present, FP atomics is used, - // non-power of two length is used, operation width greater than 32, or the - // data size is less than 4 bytes, - if constexpr (detail::has_cache_hints() || - Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) || - sizeof(T) < 4) { - // 2-argument lsc_atomic_update arguments order matches the standard one - - // expected value first, then new value. But atomic_update uses reverse - // order, hence the src1/src0 swap. 
- return detail::atomic_update_impl< - Op, T, N, detail::lsc_data_size::default_size, PropertyListT>( - acc, byte_offset, src1, src0, mask); - } else { - detail::check_atomic(); - static_assert(sizeof(T) == 4, "Only 32 bit data is supported"); - const auto si = __ESIMD_NS::get_surface_index(acc); - using Tx = typename detail::__raw_t; - return __esimd_dword_atomic2( - mask.data(), si, byte_offset.data(), - sycl::bit_cast<__ESIMD_DNS::vector_type_t>(src0.data()), - sycl::bit_cast<__ESIMD_DNS::vector_type_t>(src1.data())); - } -#endif +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, + simd src1, PropertyListT props = {}) { + simd_mask mask = 1; + return atomic_update(acc, byte_offset.read(), src0, src1, mask, + props); } /// simd -/// atomic_update(AccessorTy acc, simd byte_offset, -/// simd src0, simd src1, -/// props = {}); // (acc-au2-2) +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, SrcSimdViewT src0, simd src1, +/// props = {}); /// -/// A variation of \c atomic_update API with no mask operand. +/// A variation of \c atomic_update API with no mask operand and \c +/// byte_offset and \c src0 represented as \c simd_view object and allows the +/// use without specifying \c T and \c N template parameters. /// /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. @@ -8913,28 +11866,36 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, // Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. 
-/// template < - atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy, + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 2 && __ESIMD_DNS::is_rw_device_accessor_v && - ext::oneapi::experimental::is_property_list_v, + ext::oneapi::experimental::is_property_list_v && + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, simd> -atomic_update(AccessorTy acc, simd byte_offset, simd src0, +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, simd src1, PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(acc, byte_offset, src0, src1, mask, props); + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0 and byte_offset parameters must correspond to the size of " + "src1 parameter."); + return atomic_update(acc, byte_offset.read(), src0.read(), src1, + props); } /// simd /// atomic_update(AccessorTy acc, OffsetSimdViewT -/// byte_offset, simd src0, simd src1, -/// simd_mask mask, props = {}); // (acc-au2-3) +/// byte_offset, simd src0, SrcSimdViewT src1, +/// props = {}); /// -/// A variation of \c atomic_update API with \c byte_offset represented as -/// a \c simd_view object. +/// A variation of \c atomic_update API with no mask operand and \c byte_offset +/// and \c src1 represented as \c simd_view object and allows the use without +/// specifying \c T and \c N template parameters. /// /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. @@ -8944,36 +11905,41 @@ atomic_update(AccessorTy acc, simd byte_offset, simd src0, /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 
/// @param src0 The first additional argument (new value). /// @param src1 The second additional argument (expected value). -/// @param mask Operation mask, only locations with non-zero in the -/// corresponding mask element are updated. /// @param props The parameter 'props' specifies the optional compile-time /// properties list. Only L1/L2 properties are used. // Other properties are ignored. /// @return A vector of the old values at the memory locations before the /// update. template < - atomic_op Op, typename T, int N, typename OffsetSimdViewT, - typename AccessorTy, + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T, + int N, typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 2 && __ESIMD_DNS::is_rw_device_accessor_v && ext::oneapi::experimental::is_property_list_v && - detail::is_simd_view_type_v, + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, simd> atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, - simd src1, simd_mask mask, PropertyListT props = {}) { - return atomic_update(acc, byte_offset.read(), src0, src1, mask, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert( + N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() && + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src1 and byte_offset parameters must correspond to the size of " + "src0 parameter."); + return atomic_update(acc, byte_offset.read(), src0, src1.read(), props); } /// simd -/// atomic_update(AccessorTy acc, -/// OffsetSimdViewT, byte_offset, -/// simd src0, simd src1, props = {}); // (acc-au2-4) +/// atomic_update(AccessorTy acc, OffsetSimdViewT +/// byte_offset, SrcSimdViewT src0, SrcSimdViewT src1, +/// props = {}); /// -/// A variation of \c atomic_update API with \c byte_offset represented as -/// a \c simd_view object and no mask operand. 
+/// A variation of \c atomic_update API with no mask operand and \c byte_offset, +/// \c src0 and \c src1 represented as \c simd_view object and allows the use +/// without specifying \c T and \c N template parameters. /// /// @tparam Op The atomic operation - can be one of the following: /// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg. @@ -8989,20 +11955,25 @@ atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, /// @return A vector of the old values at the memory locations before the /// update. template < - atomic_op Op, typename T, int N, typename OffsetSimdViewT, + atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, + typename T = SrcSimdViewT::value_type::element_type, + int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(), typename AccessorTy, typename PropertyListT = ext::oneapi::experimental::empty_properties_t> __ESIMD_API std::enable_if_t< __ESIMD_DNS::get_num_args() == 2 && __ESIMD_DNS::is_rw_device_accessor_v && ext::oneapi::experimental::is_property_list_v && - detail::is_simd_view_type_v, + detail::is_simd_view_type_v && + detail::is_simd_view_type_v, simd> -atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd src0, - simd src1, PropertyListT props = {}) { - simd_mask mask = 1; - return atomic_update(acc, byte_offset.read(), src0, src1, mask, - props); +atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0, + SrcSimdViewT src1, PropertyListT props = {}) { + static_assert( + N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(), + "Size of src0, src1 and byte_offset parameters must correspond."); + return atomic_update(acc, byte_offset.read(), src0.read(), + src1.read(), props); } /// A variation of \c atomic_update API with \c offsets represented as @@ -9860,7 +12831,7 @@ scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, /// simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// 
Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -9901,7 +12872,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. @@ -9942,7 +12913,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -9990,7 +12961,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. 
@@ -10035,7 +13006,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals, /// ValuesSimdViewT vals, simd_mask mask, /// PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. Access to any @@ -10078,7 +13049,7 @@ scatter(AccessorTy acc, simd byte_offsets, /// void scatter(AccessorTy acc, simd byte_offsets, /// ValuesSimdViewT vals, PropertyListT props = {}); /// -/// Variation of the API that allows to use \c simd_view without specifying +/// Variation of the API that allows using \c simd_view without specifying /// \c T and \c N template parameters. /// Stores ("scatters") elements of the type 'T' to memory locations addressed /// by the local accessor \p acc and byte offsets \p byte_offsets. @@ -10388,6 +13359,66 @@ prefetch(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) { prefetch(p, byte_offsets.read(), props); } +/// template +/// void prefetch(const T *p, OffsetSimdViewT byte_offsets, +/// simd_mask mask, PropertyListT props = {}); +/// Supported platforms: DG2, PVC only. +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Prefetches elements of the type 'T' from memory locations +/// addressed by the base pointer \p p and byte offsets \p byte_offsets to the +/// cache. Access to any element's memory location can be disabled via the input +/// vector of predicates \p mask. If mask[i] is unset, then the load from (p + +/// byte_offsets[i]) is skipped. +/// @tparam VS Vector size. It can also be read as the number of reads per +/// each address. The parameter 'N' must be divisible by 'VS'. +/// @param p The base address. 
+/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only cache hint +/// properties are used. +template < + int VS = 1, typename OffsetSimdViewT, typename T, + int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() * VS, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +prefetch(const T *p, OffsetSimdViewT byte_offsets, simd_mask mask, + PropertyListT props = {}) { + prefetch(p, byte_offsets.read(), mask, props); +} + +/// template +/// void prefetch(const T *p, OffsetSimdViewT byte_offsets, +/// PropertyListT props = {}); +/// Supported platforms: DG2, PVC only. +/// Variation of the API that allows using \c simd_view without specifying +/// \c T and \c N template parameters. +/// Prefetches elements of the type 'T' from memory locations +/// addressed by the base pointer \p p and byte offsets \p byte_offsets to the +/// cache. +/// @tparam VS Vector size. It can also be read as the number of reads per +/// each address. The parameter 'N' must be divisible by 'VS'. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// @param props The optional compile-time properties. Only cache hint +/// properties are used. 
+template < + int VS = 1, typename OffsetSimdViewT, typename T, + int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() * VS, + typename PropertyListT = ext::oneapi::experimental::empty_properties_t> +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +prefetch(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) { + prefetch(p, byte_offsets.read(), props); +} + /// template /// void prefetch(const T *p, OffsetT byte_offset, simd_mask<1> mask, diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp index 3a16dcd244b4c..de97de176e53d 100644 --- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp +++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp @@ -18,6 +18,30 @@ extern "C" __DPCPP_SYCL_EXTERNAL uint16_t __devicelib_ConvertFToBF16INTEL(const float &) noexcept; extern "C" __DPCPP_SYCL_EXTERNAL float __devicelib_ConvertBF16ToFINTEL(const uint16_t &) noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertFToBF16INTELVec1(const float *, uint16_t *) noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertBF16ToFINTELVec1(const uint16_t *, float *) noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertFToBF16INTELVec2(const float *, uint16_t *) noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertBF16ToFINTELVec2(const uint16_t *, float *) noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertFToBF16INTELVec3(const float *, uint16_t *) noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertBF16ToFINTELVec3(const uint16_t *, float *) noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertFToBF16INTELVec4(const float *, uint16_t *) noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertBF16ToFINTELVec4(const uint16_t *, float *) noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertFToBF16INTELVec8(const float *, uint16_t *) 
noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertBF16ToFINTELVec8(const uint16_t *, float *) noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertFToBF16INTELVec16(const float *, uint16_t *) noexcept; +extern "C" __DPCPP_SYCL_EXTERNAL void +__devicelib_ConvertBF16ToFINTELVec16(const uint16_t *, float *) noexcept; namespace sycl { inline namespace _V1 { @@ -29,9 +53,35 @@ namespace detail { using Bfloat16StorageT = uint16_t; Bfloat16StorageT bfloat16ToBits(const bfloat16 &Value); bfloat16 bitsToBfloat16(const Bfloat16StorageT Value); +// Class to convert different data types to Bfloat16 +// with different rounding modes. +class ConvertToBfloat16; + +template void BF16VecToFloatVec(const bfloat16 src[N], float dst[N]) { +#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__)) + const uint16_t *src_i16 = sycl::bit_cast(src); + if constexpr (N == 1) + __devicelib_ConvertBF16ToFINTELVec1(src_i16, dst); + else if constexpr (N == 2) + __devicelib_ConvertBF16ToFINTELVec2(src_i16, dst); + else if constexpr (N == 3) + __devicelib_ConvertBF16ToFINTELVec3(src_i16, dst); + else if constexpr (N == 4) + __devicelib_ConvertBF16ToFINTELVec4(src_i16, dst); + else if constexpr (N == 8) + __devicelib_ConvertBF16ToFINTELVec8(src_i16, dst); + else if constexpr (N == 16) + __devicelib_ConvertBF16ToFINTELVec16(src_i16, dst); +#else + for (int i = 0; i < N; ++i) { + dst[i] = (float)src[i]; + } +#endif +} // sycl::vec support namespace bf16 { +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES #ifdef __SYCL_DEVICE_ONLY__ using Vec2StorageT = Bfloat16StorageT __attribute__((ext_vector_type(2))); using Vec3StorageT = Bfloat16StorageT __attribute__((ext_vector_type(3))); @@ -45,6 +95,7 @@ using Vec4StorageT = std::array; using Vec8StorageT = std::array; using Vec16StorageT = std::array; #endif +#endif // __INTEL_PREVIEW_BREAKING_CHANGES } // namespace bf16 } // namespace detail @@ -56,6 +107,7 @@ class bfloat16 { detail::bfloat16ToBits(const 
bfloat16 &Value); friend inline bfloat16 detail::bitsToBfloat16(const detail::Bfloat16StorageT Value); + friend class detail::ConvertToBfloat16; public: bfloat16() = default; @@ -237,6 +289,30 @@ class bfloat16 { namespace detail { +template void FloatVecToBF16Vec(float src[N], bfloat16 dst[N]) { +#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__)) + uint16_t *dst_i16 = sycl::bit_cast(dst); + if constexpr (N == 1) + __devicelib_ConvertFToBF16INTELVec1(src, dst_i16); + else if constexpr (N == 2) + __devicelib_ConvertFToBF16INTELVec2(src, dst_i16); + else if constexpr (N == 3) + __devicelib_ConvertFToBF16INTELVec3(src, dst_i16); + else if constexpr (N == 4) + __devicelib_ConvertFToBF16INTELVec4(src, dst_i16); + else if constexpr (N == 8) + __devicelib_ConvertFToBF16INTELVec8(src, dst_i16); + else if constexpr (N == 16) + __devicelib_ConvertFToBF16INTELVec16(src, dst_i16); +#else + for (int i = 0; i < N; ++i) { + // No need to cast as bfloat16 has a assignment op overload that takes + // a float. + dst[i] = src[i]; + } +#endif +} + // Helper function for getting the internal representation of a bfloat16. inline Bfloat16StorageT bfloat16ToBits(const bfloat16 &Value) { return Value.value; @@ -250,6 +326,315 @@ inline bfloat16 bitsToBfloat16(const Bfloat16StorageT Value) { return res; } +// Class to convert different data types to Bfloat16 +// with different rounding modes. +class ConvertToBfloat16 { + + // The automatic rounding mode is RTE. + enum SYCLRoundingMode { automatic = 0, rte = 1, rtz = 2, rtp = 3, rtn = 4 }; + + // Function to get the most significant bit position of a number. + template static size_t get_msb_pos(const Ty &x) { + assert(x != 0); + size_t idx = 0; + Ty mask = ((Ty)1 << (sizeof(Ty) * 8 - 1)); + for (idx = 0; idx < (sizeof(Ty) * 8); ++idx) { + if ((x & mask) == mask) + break; + mask >>= 1; + } + + return (sizeof(Ty) * 8 - 1 - idx); + } + + // Helper function to get BF16 from float with different rounding modes. 
+ // Reference: + // https://github.com/intel/llvm/blob/sycl/libdevice/imf_bf16.hpp#L30 + static bfloat16 + getBFloat16FromFloatWithRoundingMode(const float &f, + SYCLRoundingMode roundingMode) { + + if (roundingMode == SYCLRoundingMode::automatic || + roundingMode == SYCLRoundingMode::rte) { + // Use the default rounding mode. + return bfloat16{f}; + } else { + uint32_t u32_val = sycl::bit_cast(f); + uint16_t bf16_sign = static_cast((u32_val >> 31) & 0x1); + uint16_t bf16_exp = static_cast((u32_val >> 23) & 0x7FF); + uint32_t f_mant = u32_val & 0x7F'FFFF; + uint16_t bf16_mant = static_cast(f_mant >> 16); + // +/-infinity and NAN + if (bf16_exp == 0xFF) { + if (!f_mant) + return bitsToBfloat16(bf16_sign ? 0xFF80 : 0x7F80); + else + return bitsToBfloat16((bf16_sign << 15) | (bf16_exp << 7) | + bf16_mant); + } + + // +/-0 + if (!bf16_exp && !f_mant) { + return bitsToBfloat16(bf16_sign ? 0x8000 : 0x0); + } + + uint16_t mant_discard = static_cast(f_mant & 0xFFFF); + switch (roundingMode) { + case SYCLRoundingMode::rtn: + if (bf16_sign && mant_discard) + bf16_mant++; + break; + case SYCLRoundingMode::rtz: + break; + case SYCLRoundingMode::rtp: + if (!bf16_sign && mant_discard) + bf16_mant++; + break; + + // Should not reach here. Adding these just to suppress the warning. + case SYCLRoundingMode::automatic: + case SYCLRoundingMode::rte: + break; + } + + // if overflow happens, bf16_exp will be 0xFF and bf16_mant will be 0, + // infinity will be returned. + if (bf16_mant == 0x80) { + bf16_mant = 0; + bf16_exp++; + } + + return bitsToBfloat16((bf16_sign << 15) | (bf16_exp << 7) | bf16_mant); + } + } + + // Helper function to get BF16 from unsigned integral data types + // with different rounding modes. 
+ // Reference: + // https://github.com/intel/llvm/blob/sycl/libdevice/imf_bf16.hpp#L302 + template + static bfloat16 + getBFloat16FromUIntegralWithRoundingMode(T &u, + SYCLRoundingMode roundingMode) { + + size_t msb_pos = get_msb_pos(u); + // return half representation for 1 + if (msb_pos == 0) + return bitsToBfloat16(0x3F80); + + T mant = u & ((static_cast(1) << msb_pos) - 1); + // Unsigned integral value can be represented by 1.mant * (2^msb_pos), + // msb_pos is also the bit number of mantissa, 0 < msb_pos < sizeof(Ty) * 8, + // exponent of bfloat16 precision value range is [-126, 127]. + + uint16_t b_exp = msb_pos; + uint16_t b_mant; + + if (msb_pos <= 7) { + // No need to round off if we can losslessly fit the input value in + // mantissa of bfloat16. + mant <<= (7 - msb_pos); + b_mant = static_cast(mant); + } else { + b_mant = static_cast(mant >> (msb_pos - 7)); + T mant_discard = mant & ((static_cast(1) << (msb_pos - 7)) - 1); + T mid = static_cast(1) << (msb_pos - 8); + switch (roundingMode) { + case SYCLRoundingMode::automatic: + case SYCLRoundingMode::rte: + if ((mant_discard > mid) || + ((mant_discard == mid) && ((b_mant & 0x1) == 0x1))) + b_mant++; + break; + case SYCLRoundingMode::rtp: + if (mant_discard) + b_mant++; + break; + case SYCLRoundingMode::rtn: + case SYCLRoundingMode::rtz: + break; + } + } + if (b_mant == 0x80) { + b_exp++; + b_mant = 0; + } + + b_exp += 127; + return bitsToBfloat16((b_exp << 7) | b_mant); + } + + // Helper function to get BF16 from signed integral data types. + // Reference: + // https://github.com/intel/llvm/blob/sycl/libdevice/imf_bf16.hpp#L353 + template + static bfloat16 + getBFloat16FromSIntegralWithRoundingMode(T &i, + SYCLRoundingMode roundingMode) { + // Get unsigned type corresponding to T. + typedef typename std::make_unsigned_t UTy; + + uint16_t b_sign = (i >= 0) ? 0 : 0x8000; + UTy ui = (i > 0) ? 
static_cast(i) : static_cast(-i); + size_t msb_pos = get_msb_pos(ui); + if (msb_pos == 0) + return bitsToBfloat16(b_sign ? 0xBF80 : 0x3F80); + UTy mant = ui & ((static_cast(1) << msb_pos) - 1); + + uint16_t b_exp = msb_pos; + uint16_t b_mant; + if (msb_pos <= 7) { + mant <<= (7 - msb_pos); + b_mant = static_cast(mant); + } else { + b_mant = static_cast(mant >> (msb_pos - 7)); + T mant_discard = mant & ((static_cast(1) << (msb_pos - 7)) - 1); + T mid = static_cast(1) << (msb_pos - 8); + switch (roundingMode) { + case SYCLRoundingMode::automatic: + case SYCLRoundingMode::rte: + if ((mant_discard > mid) || + ((mant_discard == mid) && ((b_mant & 0x1) == 0x1))) + b_mant++; + break; + case SYCLRoundingMode::rtp: + if (mant_discard && !b_sign) + b_mant++; + break; + case SYCLRoundingMode::rtn: + if (mant_discard && b_sign) + b_mant++; + case SYCLRoundingMode::rtz: + break; + } + } + + if (b_mant == 0x80) { + b_exp++; + b_mant = 0; + } + b_exp += 127; + return bitsToBfloat16(b_sign | (b_exp << 7) | b_mant); + } + + // Helper function to get BF16 from double with RTE rounding modes. + // Reference: + // https://github.com/intel/llvm/blob/sycl/libdevice/imf_bf16.hpp#L79 + static bfloat16 getBFloat16FromDoubleWithRTE(const double &d) { + + uint64_t u64_val = sycl::bit_cast(d); + int16_t bf16_sign = (u64_val >> 63) & 0x1; + uint16_t fp64_exp = static_cast((u64_val >> 52) & 0x7FF); + uint64_t fp64_mant = (u64_val & 0xF'FFFF'FFFF'FFFF); + uint16_t bf16_mant; + // handling +/-infinity and NAN for double input + if (fp64_exp == 0x7FF) { + if (!fp64_mant) { + return bf16_sign ? 0xFF80 : 0x7F80; + } else { + // returns a quiet NaN + return 0x7FC0; + } + } + + // Subnormal double precision is converted to 0 + if (fp64_exp == 0) { + return bf16_sign ? 0x8000 : 0x0; + } + + fp64_exp -= 1023; + // handling overflow, convert to +/-infinity + if (static_cast(fp64_exp) > 127) { + return bf16_sign ? 
0xFF80 : 0x7F80; + } + + // handling underflow + if (static_cast(fp64_exp) < -133) { + return bf16_sign ? 0x8000 : 0x0; + } + + //-133 <= fp64_exp <= 127, 1.signicand * 2^fp64_exp + // For these numbers, they are NOT subnormal double-precision numbers but + // will turn into subnormal when converting to bfloat16 + uint64_t discard_bits; + if (static_cast(fp64_exp) < -126) { + fp64_mant |= 0x10'0000'0000'0000; + fp64_mant >>= -126 - static_cast(fp64_exp) - 1; + discard_bits = fp64_mant & 0x3FFF'FFFF'FFFF; + bf16_mant = static_cast(fp64_mant >> 46); + if (discard_bits > 0x2000'0000'0000 || + ((discard_bits == 0x2000'0000'0000) && ((bf16_mant & 0x1) == 0x1))) + bf16_mant += 1; + fp64_exp = 0; + if (bf16_mant == 0x80) { + bf16_mant = 0; + fp64_exp = 1; + } + return (bf16_sign << 15) | (fp64_exp << 7) | bf16_mant; + } + + // For normal value, discard 45 bits from mantissa + discard_bits = fp64_mant & 0x1FFF'FFFF'FFFF; + bf16_mant = static_cast(fp64_mant >> 45); + if (discard_bits > 0x1000'0000'0000 || + ((discard_bits == 0x1000'0000'0000) && ((bf16_mant & 0x1) == 0x1))) + bf16_mant += 1; + + if (bf16_mant == 0x80) { + if (fp64_exp != 127) { + bf16_mant = 0; + fp64_exp++; + } else { + return bf16_sign ? 0xFF80 : 0x7F80; + } + } + fp64_exp += 127; + + return (bf16_sign << 15) | (fp64_exp << 7) | bf16_mant; + } + +public: + template + static bfloat16 getBfloat16WithRoundingMode(const Ty &a) { + + if (!a) + return bfloat16{0.0f}; + + constexpr SYCLRoundingMode roundingMode = static_cast(rm); + + // Float. + if constexpr (std::is_same_v) { + return getBFloat16FromFloatWithRoundingMode(a, roundingMode); + } + // Double. 
+ else if constexpr (std::is_same_v) { + static_assert( + roundingMode == SYCLRoundingMode::automatic || + roundingMode == SYCLRoundingMode::rte, + "Only automatic/RTE rounding mode is supported for double type."); + return getBFloat16FromDoubleWithRTE(a); + } + // Half + else if constexpr (std::is_same_v) { + // Convert half to float and then convert to bfloat16. + // Conversion of half to float is lossless as the latter + // have a wider dynamic range. + return getBFloat16FromFloatWithRoundingMode(static_cast(a), + roundingMode); + } + // Unsigned integral types. + else if constexpr (std::is_integral_v && std::is_unsigned_v) { + return getBFloat16FromUIntegralWithRoundingMode(a, roundingMode); + } + // Signed integral types. + else if constexpr (std::is_integral_v && std::is_signed_v) { + return getBFloat16FromSIntegralWithRoundingMode(a, roundingMode); + } else { + static_assert(std::is_integral_v || std::is_floating_point_v, + "Only integral and floating point types are supported."); + } + } +}; // class ConvertToBfloat16. 
} // namespace detail } // namespace ext::oneapi diff --git a/sycl/include/sycl/ext/oneapi/bindless_images.hpp b/sycl/include/sycl/ext/oneapi/bindless_images.hpp index 696301e5c3098..4a8f618a78959 100644 --- a/sycl/include/sycl/ext/oneapi/bindless_images.hpp +++ b/sycl/include/sycl/ext/oneapi/bindless_images.hpp @@ -181,7 +181,7 @@ void free_mipmap_mem(image_mem_handle handle, const sycl::queue &syclQueue); * @return Memory handle to the individual mipmap image */ __SYCL_EXPORT image_mem_handle get_mip_level_mem_handle( - const image_mem_handle mipMem, const unsigned int level, + const image_mem_handle mipMem, unsigned int level, const sycl::device &syclDevice, const sycl::context &syclContext); /** @@ -192,9 +192,9 @@ __SYCL_EXPORT image_mem_handle get_mip_level_mem_handle( * @param syclQueue The queue in which we created our memory handle * @return Memory handle to the individual mipmap image */ -__SYCL_EXPORT image_mem_handle get_mip_level_mem_handle( - const image_mem_handle mipMem, const unsigned int level, - const sycl::queue &syclQueue); +__SYCL_EXPORT image_mem_handle +get_mip_level_mem_handle(const image_mem_handle mipMem, unsigned int level, + const sycl::queue &syclQueue); /** * @brief Import external memory taking an external memory handle (the type @@ -1299,7 +1299,7 @@ template DataT fetch_image_array(const unsampled_image_handle &imageHandle [[maybe_unused]], const CoordT &coords [[maybe_unused]], - const int arrayLayer [[maybe_unused]]) { + int arrayLayer [[maybe_unused]]) { detail::assert_unsampled_coords(); constexpr size_t coordSize = detail::coord_size(); static_assert(coordSize == 1 || coordSize == 2, @@ -1347,7 +1347,7 @@ DataT fetch_image_array(const unsampled_image_handle &imageHandle */ template DataT fetch_cubemap(const unsampled_image_handle &imageHandle, - const int2 &coords, const unsigned int face) { + const int2 &coords, unsigned int face) { return fetch_image_array(imageHandle, coords, face); } @@ -1442,7 +1442,7 @@ void 
write_image(unsampled_image_handle imageHandle [[maybe_unused]], template void write_image_array(unsampled_image_handle imageHandle [[maybe_unused]], const CoordT &coords [[maybe_unused]], - const int arrayLayer [[maybe_unused]], + int arrayLayer [[maybe_unused]], const DataT &color [[maybe_unused]]) { detail::assert_unsampled_coords(); constexpr size_t coordSize = detail::coord_size(); @@ -1482,7 +1482,7 @@ void write_image_array(unsampled_image_handle imageHandle [[maybe_unused]], */ template void write_cubemap(unsampled_image_handle imageHandle, const sycl::int2 &coords, - const int face, const DataT &color) { + int face, const DataT &color) { return write_image_array(imageHandle, coords, face, color); } @@ -1774,5 +1774,138 @@ inline event queue::ext_oneapi_copy( }, CodeLoc); } + +inline event queue::ext_oneapi_wait_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + event DepEvent, const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle); + }, + CodeLoc); +} + +inline event queue::ext_oneapi_wait_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + const std::vector &DepEvents, const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle); + }, + CodeLoc); +} + +inline event queue::ext_oneapi_wait_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t WaitValue, const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle, WaitValue); + }, + CodeLoc); +} + +inline event 
queue::ext_oneapi_wait_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t WaitValue, event DepEvent, const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle, WaitValue); + }, + CodeLoc); +} + +inline event queue::ext_oneapi_wait_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t WaitValue, const std::vector &DepEvents, + const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle, WaitValue); + }, + CodeLoc); +} + +inline event queue::ext_oneapi_signal_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle); + }, + CodeLoc); +} + +inline event queue::ext_oneapi_signal_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + event DepEvent, const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle); + }, + CodeLoc); +} + +inline event queue::ext_oneapi_signal_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + const std::vector &DepEvents, const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle); + }, + 
CodeLoc); +} + +inline event queue::ext_oneapi_signal_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t SignalValue, const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle, SignalValue); + }, + CodeLoc); +} + +inline event queue::ext_oneapi_signal_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t SignalValue, event DepEvent, + const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle, SignalValue); + }, + CodeLoc); +} + +inline event queue::ext_oneapi_signal_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t SignalValue, const std::vector &DepEvents, + const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle, SignalValue); + }, + CodeLoc); +} + } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp b/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp index f7caddc1b5bf7..3992f5d93075d 100644 --- a/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp +++ b/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp @@ -16,6 +16,20 @@ namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { +// Types of external memory handles +enum class external_mem_handle_type { + opaque_fd = 0, + win32_nt_handle = 1, + win32_nt_dx12_resource = 2, +}; + +// Types of external semaphore handles +enum class external_semaphore_handle_type { + opaque_fd = 0, + win32_nt_handle = 1, + 
win32_nt_dx12_fence = 2, +}; + /// Opaque interop memory handle type struct interop_mem_handle { using raw_handle_type = pi_uint64; @@ -26,6 +40,7 @@ struct interop_mem_handle { struct interop_semaphore_handle { using raw_handle_type = pi_uint64; raw_handle_type raw_handle; + external_semaphore_handle_type handle_type; }; // External resource file descriptor type @@ -46,12 +61,14 @@ struct resource_win32_name { /// Opaque external memory descriptor type template struct external_mem_descriptor { ResourceType external_resource; + external_mem_handle_type handle_type; size_t size_in_bytes; }; // Opaque external semaphore descriptor type template struct external_semaphore_descriptor { ResourceType external_resource; + external_semaphore_handle_type handle_type; }; /// EVERYTHING BELOW IS DEPRECATED diff --git a/sycl/include/sycl/ext/oneapi/experimental/architectures.def b/sycl/include/sycl/ext/oneapi/experimental/architectures.def new file mode 100644 index 0000000000000..47741b0ba3778 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/architectures.def @@ -0,0 +1,179 @@ + +// If new element is added to this enum: +// +// Update +// - "detail::min__architecture" below if needed +// - "detail::max__architecture" below if needed +// - sycl_ext_oneapi_device_architecture specification doc +// - "-fsycl-targets" description in sycl/doc/UsersManual.md +// +// Add +// - new value for -fsycl-targets option to the compiler driver in +// accordance with changes from sycl/doc/UsersManual.md and update the +// compiler driver tests +// - ___SYCL_TARGET___ to the compiler driver and to all places below +// - the unique ID of the new architecture to the SYCL RT source code to +// support querying the device architecture through +// device::get_info +// - alias of architecture if this is Intel GPU architecture in format +// intel_gpu_ +// +// Important note about keeping architecture IDs below unique: +// - the architecture ID must be a hex number with 16 digits +// - the 
architecture ID must suit the following template: +// 0x AA BBBB CCCCCCCC DD (without spaces), where +// - AA is 2-digit ID of the architecture family which must be unique +// - BBBB is 4-digit number reserved for future modifications +// to keep uniqueness. It should be always 0000 for now +// - CCCCCCCC is 8-digit number of architecture itself. It must be +// unique for all architectures inside the family +// - DD is 2-digit number reserved for future unexpected modifications +// to keep uniqueness. It should be always 00 for now +// +__SYCL_ARCHITECTURE(unknown, 0x9900000000000000) +// +// Intel CPU architectures +// +// AA is 03, +// CCCCCCCC is the architecture ID from the DEVICE_IP_VERSION extension of +// underlied backend +// Note: CCCCCCCC for x86_64 consists of all zeros +__SYCL_ARCHITECTURE(x86_64, 0x0300000000000000) +__SYCL_ARCHITECTURE(intel_cpu_spr, 0x0300000000000800) +__SYCL_ARCHITECTURE(intel_cpu_gnr, 0x0300000000000900) +// +// Intel GPU architectures +// +// AA is 00, +// CCCCCCCC is GMDID of that architecture +__SYCL_ARCHITECTURE(intel_gpu_bdw, 0x0000000200000000) // Intel(R) microarchitecture code name Broadwell +__SYCL_ARCHITECTURE(intel_gpu_skl, 0x0000000240000900) // Intel(R) microarchitecture code name Skylake +__SYCL_ARCHITECTURE(intel_gpu_kbl, 0x0000000240400900) // Kaby Lake +__SYCL_ARCHITECTURE(intel_gpu_cfl, 0x0000000240800900) // Coffee Lake +__SYCL_ARCHITECTURE(intel_gpu_apl, 0x0000000240c00000) // Apollo Lake +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_bxt, intel_gpu_apl) // Broxton +__SYCL_ARCHITECTURE(intel_gpu_glk, 0x0000000241000000) // Gemini Lake +__SYCL_ARCHITECTURE(intel_gpu_whl, 0x0000000241400000) // Whiskey Lake +__SYCL_ARCHITECTURE(intel_gpu_aml, 0x0000000241800000) // Amber Lake +__SYCL_ARCHITECTURE(intel_gpu_cml, 0x0000000241c00000) // Comet Lake +__SYCL_ARCHITECTURE(intel_gpu_icllp, 0x00000002c0000000) // Ice Lake +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_icl, intel_gpu_icllp) // Ice Lake +__SYCL_ARCHITECTURE(intel_gpu_ehl, 
0x00000002c0800000) // Elkhart Lake +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_jsl, intel_gpu_ehl) // Jasper Lake +__SYCL_ARCHITECTURE(intel_gpu_tgllp, 0x0000000300000000) // Tiger Lake +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_tgl, intel_gpu_tgllp) // Tiger Lake +__SYCL_ARCHITECTURE(intel_gpu_rkl, 0x0000000300400000) // Rocket Lake +__SYCL_ARCHITECTURE(intel_gpu_adl_s, 0x0000000300800000) // Alder Lake S +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_rpl_s, intel_gpu_adl_s) // Raptor Lake +__SYCL_ARCHITECTURE(intel_gpu_adl_p, 0x0000000300c00000) // Alder Lake P +__SYCL_ARCHITECTURE(intel_gpu_adl_n, 0x0000000301000000) // Alder Lake N +__SYCL_ARCHITECTURE(intel_gpu_dg1, 0x0000000302800000) // DG1 +__SYCL_ARCHITECTURE(intel_gpu_acm_g10, 0x000000030dc00800) // Alchemist G10 +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_dg2_g10, intel_gpu_acm_g10) // Alchemist G10 +__SYCL_ARCHITECTURE(intel_gpu_acm_g11, 0x000000030e000500) // Alchemist G11 +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_dg2_g11, intel_gpu_acm_g11) // Alchemist G11 +__SYCL_ARCHITECTURE(intel_gpu_acm_g12, 0x000000030e400000) // Alchemist G12 +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_dg2_g12, intel_gpu_acm_g12) // Alchemist G12 +__SYCL_ARCHITECTURE(intel_gpu_pvc, 0x000000030f000700) // Ponte Vecchio +__SYCL_ARCHITECTURE(intel_gpu_pvc_vg, 0x000000030f400700) // Ponte Vecchio VG +__SYCL_ARCHITECTURE(intel_gpu_mtl_u, 0x0000000311800400) // Meteor Lake U +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_mtl_s, intel_gpu_mtl_u) // Meteor Lake S +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_arl_u, intel_gpu_mtl_u) // Arrow Lake U +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_arl_s, intel_gpu_mtl_u) // Arrow Lake S +__SYCL_ARCHITECTURE(intel_gpu_mtl_h, 0x0000000311c00400) // Meteor Lake H +__SYCL_ARCHITECTURE(intel_gpu_arl_h, 0x0000000312800400) // Arrow Lake H +__SYCL_ARCHITECTURE(intel_gpu_bmg_g21, 0x0000000500400400) // Battlemage G21 +__SYCL_ARCHITECTURE(intel_gpu_lnl_m, 0x0000000501000400) // Lunar Lake +// +// NVIDIA architectures +// +// AA is 01, +// CCCCCCCC is the SM version 
ID of that architecture +__SYCL_ARCHITECTURE(nvidia_gpu_sm_50, 0x0100000000005000) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_52, 0x0100000000005200) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_53, 0x0100000000005300) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_60, 0x0100000000006000) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_61, 0x0100000000006100) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_62, 0x0100000000006200) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_70, 0x0100000000007000) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_72, 0x0100000000007200) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_75, 0x0100000000007500) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_80, 0x0100000000008000) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_86, 0x0100000000008600) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_87, 0x0100000000008700) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_89, 0x0100000000008900) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_90, 0x0100000000009000) +__SYCL_ARCHITECTURE(nvidia_gpu_sm_90a, 0x01000000000090a0) +// +// AMD architectures +// +// AA is 02, +// CCCCCCCC is the GFX version ID of that architecture +__SYCL_ARCHITECTURE(amd_gpu_gfx700, 0x0200000000070000) +__SYCL_ARCHITECTURE(amd_gpu_gfx701, 0x0200000000070100) +__SYCL_ARCHITECTURE(amd_gpu_gfx702, 0x0200000000070200) +__SYCL_ARCHITECTURE(amd_gpu_gfx801, 0x0200000000080100) +__SYCL_ARCHITECTURE(amd_gpu_gfx802, 0x0200000000080200) +__SYCL_ARCHITECTURE(amd_gpu_gfx803, 0x0200000000080300) +__SYCL_ARCHITECTURE(amd_gpu_gfx805, 0x0200000000080500) +__SYCL_ARCHITECTURE(amd_gpu_gfx810, 0x0200000000081000) +__SYCL_ARCHITECTURE(amd_gpu_gfx900, 0x0200000000090000) +__SYCL_ARCHITECTURE(amd_gpu_gfx902, 0x0200000000090200) +__SYCL_ARCHITECTURE(amd_gpu_gfx904, 0x0200000000090400) +__SYCL_ARCHITECTURE(amd_gpu_gfx906, 0x0200000000090600) +__SYCL_ARCHITECTURE(amd_gpu_gfx908, 0x0200000000090800) +__SYCL_ARCHITECTURE(amd_gpu_gfx909, 0x0200000000090900) +__SYCL_ARCHITECTURE(amd_gpu_gfx90a, 0x0200000000090a00) +__SYCL_ARCHITECTURE(amd_gpu_gfx90c, 0x0200000000090c00) +__SYCL_ARCHITECTURE(amd_gpu_gfx940, 0x0200000000094000) 
+__SYCL_ARCHITECTURE(amd_gpu_gfx941, 0x0200000000094100) +__SYCL_ARCHITECTURE(amd_gpu_gfx942, 0x0200000000094200) +__SYCL_ARCHITECTURE(amd_gpu_gfx1010, 0x0200000000101000) +__SYCL_ARCHITECTURE(amd_gpu_gfx1011, 0x0200000000101100) +__SYCL_ARCHITECTURE(amd_gpu_gfx1012, 0x0200000000101200) +__SYCL_ARCHITECTURE(amd_gpu_gfx1013, 0x0200000000101300) +__SYCL_ARCHITECTURE(amd_gpu_gfx1030, 0x0200000000103000) +__SYCL_ARCHITECTURE(amd_gpu_gfx1031, 0x0200000000103100) +__SYCL_ARCHITECTURE(amd_gpu_gfx1032, 0x0200000000103200) +__SYCL_ARCHITECTURE(amd_gpu_gfx1033, 0x0200000000103300) +__SYCL_ARCHITECTURE(amd_gpu_gfx1034, 0x0200000000103400) +__SYCL_ARCHITECTURE(amd_gpu_gfx1035, 0x0200000000103500) +__SYCL_ARCHITECTURE(amd_gpu_gfx1036, 0x0200000000103600) +__SYCL_ARCHITECTURE(amd_gpu_gfx1100, 0x0200000000110000) +__SYCL_ARCHITECTURE(amd_gpu_gfx1101, 0x0200000000110100) +__SYCL_ARCHITECTURE(amd_gpu_gfx1102, 0x0200000000110200) +__SYCL_ARCHITECTURE(amd_gpu_gfx1103, 0x0200000000110300) +__SYCL_ARCHITECTURE(amd_gpu_gfx1150, 0x0200000000115000) +__SYCL_ARCHITECTURE(amd_gpu_gfx1151, 0x0200000000115100) +__SYCL_ARCHITECTURE(amd_gpu_gfx1200, 0x0200000000120000) +__SYCL_ARCHITECTURE(amd_gpu_gfx1201, 0x0200000000120100) +// +// Aliases for Intel graphics architectures +// +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_8_0_0, intel_gpu_bdw) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_0_9, intel_gpu_skl) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_1_9, intel_gpu_kbl) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_2_9, intel_gpu_cfl) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_3_0, intel_gpu_apl) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_4_0, intel_gpu_glk) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_5_0, intel_gpu_whl) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_6_0, intel_gpu_aml) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_7_0, intel_gpu_cml) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_11_0_0, intel_gpu_icllp) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_11_2_0, intel_gpu_ehl) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_0_0, intel_gpu_tgllp) 
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_1_0, intel_gpu_rkl) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_2_0, intel_gpu_adl_s) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_3_0, intel_gpu_adl_p) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_4_0, intel_gpu_adl_n) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_10_0, intel_gpu_dg1) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_55_8, intel_gpu_acm_g10) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_56_5, intel_gpu_acm_g11) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_57_0, intel_gpu_acm_g12) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_60_7, intel_gpu_pvc) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_61_7, intel_gpu_pvc_vg) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_70_4, intel_gpu_mtl_u) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_71_4, intel_gpu_mtl_h) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_74_4, intel_gpu_arl_h) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_20_1_4, intel_gpu_bmg_g21) +__SYCL_ARCHITECTURE_ALIAS(intel_gpu_20_4_4, intel_gpu_lnl_m) \ No newline at end of file diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp index 2b611f46ddadd..fb4b49a44d4d3 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp @@ -9,6 +9,7 @@ #pragma once #include // for ceil, cos, exp, exp10, exp2 +#include // For simplify_if_swizzle, is_swizzle #include // sycl::detail::memcpy #include // for bfloat16, bfloat16ToBits #include // for marray @@ -30,6 +31,17 @@ uint32_t to_uint32_t(sycl::marray x, size_t start) { } } // namespace detail +// Trait to check if the type is a vector or swizzle of bfloat16. 
+template +constexpr bool is_vec_or_swizzle_bf16_v = + sycl::detail::is_vec_or_swizzle_v && + sycl::detail::is_valid_elem_type_v; + +template +constexpr int num_elements_v = sycl::detail::num_elements::value; + +/******************* isnan ********************/ + // According to bfloat16 format, NAN value's exponent field is 0xFF and // significand has non-zero bits. template @@ -46,6 +58,21 @@ template sycl::marray isnan(sycl::marray x) { return res; } +// Overload for BF16 vec and swizzles. +template > +std::enable_if_t, sycl::vec> +isnan(T x) { + sycl::vec res; + for (size_t i = 0; i < N; i++) { + // The result of isnan is 0 or 1 but SPEC requires + // isnan() of vec/swizzle to return -1 or 0. + res[i] = isnan(x[i]) ? -1 : 0; + } + return res; +} + +/******************* fabs ********************/ + template std::enable_if_t, T> fabs(T x) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -89,6 +116,19 @@ sycl::marray fabs(sycl::marray x) { return res; } +// Overload for BF16 vec and swizzles. +template > +std::enable_if_t, sycl::vec> +fabs(T x) { + sycl::vec res; + for (size_t i = 0; i < N; i++) { + res[i] = fabs(x[i]); + } + return res; +} + +/******************* fmin ********************/ + template std::enable_if_t, T> fmin(T x, T y) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -146,6 +186,22 @@ sycl::marray fmin(sycl::marray x, return res; } +// Overload for different combination of BF16 vec and swizzles. 
+template , + int N2 = num_elements_v> +std::enable_if_t && is_vec_or_swizzle_bf16_v && + N1 == N2, + sycl::vec> +fmin(T1 x, T2 y) { + sycl::vec res; + for (size_t i = 0; i < N1; i++) { + res[i] = fmin(x[i], y[i]); + } + return res; +} + +/******************* fmax ********************/ + template std::enable_if_t, T> fmax(T x, T y) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -202,6 +258,22 @@ sycl::marray fmax(sycl::marray x, return res; } +// Overload for different combination of BF16 vec and swizzles. +template , + int N2 = num_elements_v> +std::enable_if_t && is_vec_or_swizzle_bf16_v && + N1 == N2, + sycl::vec> +fmax(T1 x, T2 y) { + sycl::vec res; + for (size_t i = 0; i < N1; i++) { + res[i] = fmax(x[i], y[i]); + } + return res; +} + +/******************* fma *********************/ + template std::enable_if_t, T> fma(T x, T y, T z) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -248,6 +320,22 @@ sycl::marray fma(sycl::marray x, return res; } +// Overload for different combination of BF16 vec and swizzles. +template , + int N2 = num_elements_v, int N3 = num_elements_v> +std::enable_if_t && is_vec_or_swizzle_bf16_v && + is_vec_or_swizzle_bf16_v && N1 == N2 && N2 == N3, + sycl::vec> +fma(T1 x, T2 y, T3 z) { + sycl::vec res; + for (size_t i = 0; i < N1; i++) { + res[i] = fma(x[i], y[i], z[i]); + } + return res; +} + +/******************* unary math operations ********************/ + #define BFLOAT16_MATH_FP32_WRAPPERS(op) \ template \ std::enable_if_t::value, T> op(T x) { \ @@ -264,37 +352,77 @@ sycl::marray fma(sycl::marray x, return res; \ } +#define BFLOAT16_MATH_FP32_WRAPPERS_VEC(op) \ + /* Overload for BF16 vec and swizzles. 
*/ \ + template > \ + std::enable_if_t, sycl::vec> op( \ + T x) { \ + sycl::vec res; \ + for (size_t i = 0; i < N; i++) { \ + res[i] = op(x[i]); \ + } \ + return res; \ + } + BFLOAT16_MATH_FP32_WRAPPERS(ceil) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(ceil) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(ceil) + BFLOAT16_MATH_FP32_WRAPPERS(cos) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(cos) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(cos) + BFLOAT16_MATH_FP32_WRAPPERS(exp) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp) + BFLOAT16_MATH_FP32_WRAPPERS(exp10) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp10) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp10) + BFLOAT16_MATH_FP32_WRAPPERS(exp2) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp2) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp2) + BFLOAT16_MATH_FP32_WRAPPERS(floor) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(floor) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(floor) + BFLOAT16_MATH_FP32_WRAPPERS(log) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(log) + BFLOAT16_MATH_FP32_WRAPPERS(log2) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log2) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(log2) + BFLOAT16_MATH_FP32_WRAPPERS(log10) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log10) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(log10) + BFLOAT16_MATH_FP32_WRAPPERS(rint) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rint) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(rint) + BFLOAT16_MATH_FP32_WRAPPERS(rsqrt) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rsqrt) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(rsqrt) + BFLOAT16_MATH_FP32_WRAPPERS(sin) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sin) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(sin) + BFLOAT16_MATH_FP32_WRAPPERS(sqrt) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sqrt) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(sqrt) + BFLOAT16_MATH_FP32_WRAPPERS(trunc) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(trunc) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(trunc) #undef BFLOAT16_MATH_FP32_WRAPPERS #undef BFLOAT16_MATH_FP32_WRAPPERS_MARRAY +#undef BFLOAT16_MATH_FP32_WRAPPERS_VEC } // namespace ext::oneapi::experimental } // namespace _V1 } // namespace sycl 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp index c6d367dbda959..0d83f9f84f790 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp @@ -17,183 +17,11 @@ inline namespace _V1 { namespace ext::oneapi::experimental { enum class architecture : uint64_t { - // If new element is added to this enum: - // - // Update - // - "detail::min__architecture" below if needed - // - "detail::max__architecture" below if needed - // - sycl_ext_oneapi_device_architecture specification doc - // - "-fsycl-targets" description in sycl/doc/UsersManual.md - // - // Add - // - new value for -fsycl-targets option to the compiler driver in - // accordance with changes from sycl/doc/UsersManual.md and update the - // compiler driver tests - // - ___SYCL_TARGET___ to the compiler driver and to all places below - // - the unique ID of the new architecture to the SYCL RT source code to - // support querying the device architecture through - // device::get_info - // - alias of architecture if this is Intel GPU architecture in format - // intel_gpu_ - // - // Important note about keeping architecture IDs below unique: - // - the architecture ID must be a hex number with 16 digits - // - the architecture ID must suit the following template: - // 0x AA BBBB CCCCCCCC DD (without spaces), where - // - AA is 2-digit ID of the architecture family which must be unique - // - BBBB is 4-digit number reserved for future modifications - // to keep uniqueness. It should be always 0000 for now - // - CCCCCCCC is 8-digit number of architecture itself. It must be - // unique for all architectures inside the family - // - DD is 2-digit number reserved for future unexpected modifications - // to keep uniqueness. 
It should be always 00 for now - // - x86_64 = 0x9900000000000000, - // - // Intel CPU architectures - // - // AA is 03, - // CCCCCCCC is the architecture ID from the DEVICE_IP_VERSION extension of - // underlied backend - intel_cpu_spr = 0x0300000000000800, - intel_cpu_gnr = 0x0300000000000900, - // - // Intel GPU architectures - // - // AA is 00, - // CCCCCCCC is GMDID of that architecture - intel_gpu_bdw = - 0x0000000200000000, // Intel(R) microarchitecture code name Broadwell - intel_gpu_skl = - 0x0000000240000900, // Intel(R) microarchitecture code name Skylake - intel_gpu_kbl = 0x0000000240400900, // Kaby Lake - intel_gpu_cfl = 0x0000000240800900, // Coffee Lake - intel_gpu_apl = 0x0000000240c00000, // Apollo Lake - intel_gpu_bxt = intel_gpu_apl, // Broxton - intel_gpu_glk = 0x0000000241000000, // Gemini Lake - intel_gpu_whl = 0x0000000241400000, // Whiskey Lake - intel_gpu_aml = 0x0000000241800000, // Amber Lake - intel_gpu_cml = 0x0000000241c00000, // Comet Lake - intel_gpu_icllp = 0x00000002c0000000, // Ice Lake - intel_gpu_icl = intel_gpu_icllp, // Ice Lake - intel_gpu_ehl = 0x00000002c0800000, // Elkhart Lake - intel_gpu_jsl = intel_gpu_ehl, // Jasper Lake - intel_gpu_tgllp = 0x0000000300000000, // Tiger Lake - intel_gpu_tgl = intel_gpu_tgllp, // Tiger Lake - intel_gpu_rkl = 0x0000000300400000, // Rocket Lake - intel_gpu_adl_s = 0x0000000300800000, // Alder Lake S - intel_gpu_rpl_s = intel_gpu_adl_s, // Raptor Lake - intel_gpu_adl_p = 0x0000000300c00000, // Alder Lake P - intel_gpu_adl_n = 0x0000000301000000, // Alder Lake N - intel_gpu_dg1 = 0x0000000302800000, // DG1 - intel_gpu_acm_g10 = 0x000000030dc00800, // Alchemist G10 - intel_gpu_dg2_g10 = intel_gpu_acm_g10, // Alchemist G10 - intel_gpu_acm_g11 = 0x000000030e000500, // Alchemist G11 - intel_gpu_dg2_g11 = intel_gpu_acm_g11, // Alchemist G11 - intel_gpu_acm_g12 = 0x000000030e400000, // Alchemist G12 - intel_gpu_dg2_g12 = intel_gpu_acm_g12, // Alchemist G12 - intel_gpu_pvc = 0x000000030f000700, // 
Ponte Vecchio - intel_gpu_pvc_vg = 0x000000030f400700, // Ponte Vecchio VG - intel_gpu_mtl_u = 0x0000000311800400, // Meteor Lake U - intel_gpu_mtl_s = intel_gpu_mtl_u, // Meteor Lake S - intel_gpu_arl_u = intel_gpu_mtl_u, // Arrow Lake U - intel_gpu_arl_s = intel_gpu_mtl_u, // Arrow Lake S - intel_gpu_mtl_h = 0x0000000311c00400, // Meteor Lake H - intel_gpu_arl_h = 0x0000000312800400, // Arrow Lake H - intel_gpu_bmg_g21 = 0x0000000500400400, // Battlemage G21 - intel_gpu_lnl_m = 0x0000000501000400, // Lunar Lake - // - // NVIDIA architectures - // - // AA is 01, - // CCCCCCCC is the SM version ID of that architecture - nvidia_gpu_sm_50 = 0x0100000000005000, - nvidia_gpu_sm_52 = 0x0100000000005200, - nvidia_gpu_sm_53 = 0x0100000000005300, - nvidia_gpu_sm_60 = 0x0100000000006000, - nvidia_gpu_sm_61 = 0x0100000000006100, - nvidia_gpu_sm_62 = 0x0100000000006200, - nvidia_gpu_sm_70 = 0x0100000000007000, - nvidia_gpu_sm_72 = 0x0100000000007200, - nvidia_gpu_sm_75 = 0x0100000000007500, - nvidia_gpu_sm_80 = 0x0100000000008000, - nvidia_gpu_sm_86 = 0x0100000000008600, - nvidia_gpu_sm_87 = 0x0100000000008700, - nvidia_gpu_sm_89 = 0x0100000000008900, - nvidia_gpu_sm_90 = 0x0100000000009000, - // - // AMD architectures - // - // AA is 02, - // CCCCCCCC is the GFX version ID of that architecture - amd_gpu_gfx700 = 0x0200000000070000, - amd_gpu_gfx701 = 0x0200000000070100, - amd_gpu_gfx702 = 0x0200000000070200, - amd_gpu_gfx801 = 0x0200000000080100, - amd_gpu_gfx802 = 0x0200000000080200, - amd_gpu_gfx803 = 0x0200000000080300, - amd_gpu_gfx805 = 0x0200000000080500, - amd_gpu_gfx810 = 0x0200000000081000, - amd_gpu_gfx900 = 0x0200000000090000, - amd_gpu_gfx902 = 0x0200000000090200, - amd_gpu_gfx904 = 0x0200000000090400, - amd_gpu_gfx906 = 0x0200000000090600, - amd_gpu_gfx908 = 0x0200000000090800, - amd_gpu_gfx909 = 0x0200000000090900, - amd_gpu_gfx90a = 0x0200000000090a00, - amd_gpu_gfx90c = 0x0200000000090c00, - amd_gpu_gfx940 = 0x0200000000094000, - amd_gpu_gfx941 = 
0x0200000000094100, - amd_gpu_gfx942 = 0x0200000000094200, - amd_gpu_gfx1010 = 0x0200000000101000, - amd_gpu_gfx1011 = 0x0200000000101100, - amd_gpu_gfx1012 = 0x0200000000101200, - amd_gpu_gfx1013 = 0x0200000000101300, - amd_gpu_gfx1030 = 0x0200000000103000, - amd_gpu_gfx1031 = 0x0200000000103100, - amd_gpu_gfx1032 = 0x0200000000103200, - amd_gpu_gfx1033 = 0x0200000000103300, - amd_gpu_gfx1034 = 0x0200000000103400, - amd_gpu_gfx1035 = 0x0200000000103500, - amd_gpu_gfx1036 = 0x0200000000103600, - amd_gpu_gfx1100 = 0x0200000000110000, - amd_gpu_gfx1101 = 0x0200000000110100, - amd_gpu_gfx1102 = 0x0200000000110200, - amd_gpu_gfx1103 = 0x0200000000110300, - amd_gpu_gfx1150 = 0x0200000000115000, - amd_gpu_gfx1151 = 0x0200000000115100, - amd_gpu_gfx1200 = 0x0200000000120000, - amd_gpu_gfx1201 = 0x0200000000120100, - // - // Aliases for Intel graphics architectures - // - intel_gpu_8_0_0 = intel_gpu_bdw, - intel_gpu_9_0_9 = intel_gpu_skl, - intel_gpu_9_1_9 = intel_gpu_kbl, - intel_gpu_9_2_9 = intel_gpu_cfl, - intel_gpu_9_3_0 = intel_gpu_apl, - intel_gpu_9_4_0 = intel_gpu_glk, - intel_gpu_9_5_0 = intel_gpu_whl, - intel_gpu_9_6_0 = intel_gpu_aml, - intel_gpu_9_7_0 = intel_gpu_cml, - intel_gpu_11_0_0 = intel_gpu_icllp, - intel_gpu_11_2_0 = intel_gpu_ehl, - intel_gpu_12_0_0 = intel_gpu_tgllp, - intel_gpu_12_1_0 = intel_gpu_rkl, - intel_gpu_12_2_0 = intel_gpu_adl_s, - intel_gpu_12_3_0 = intel_gpu_adl_p, - intel_gpu_12_4_0 = intel_gpu_adl_n, - intel_gpu_12_10_0 = intel_gpu_dg1, - intel_gpu_12_55_8 = intel_gpu_acm_g10, - intel_gpu_12_56_5 = intel_gpu_acm_g11, - intel_gpu_12_57_0 = intel_gpu_acm_g12, - intel_gpu_12_60_7 = intel_gpu_pvc, - intel_gpu_12_61_7 = intel_gpu_pvc_vg, - intel_gpu_12_70_4 = intel_gpu_mtl_u, - intel_gpu_12_71_4 = intel_gpu_mtl_h, - intel_gpu_12_74_4 = intel_gpu_arl_h, - intel_gpu_20_1_4 = intel_gpu_bmg_g21, - intel_gpu_20_4_4 = intel_gpu_lnl_m, +#define __SYCL_ARCHITECTURE(NAME, VAL) NAME = VAL, +#define __SYCL_ARCHITECTURE_ALIAS(NAME, VAL) NAME = VAL, 
+#include +#undef __SYCL_ARCHITECTURE +#undef __SYCL_ARCHITECTURE_ALIAS }; enum class arch_category { @@ -231,7 +59,7 @@ static constexpr ext::oneapi::experimental::architecture ext::oneapi::experimental::architecture::nvidia_gpu_sm_50; static constexpr ext::oneapi::experimental::architecture max_nvidia_gpu_architecture = - ext::oneapi::experimental::architecture::nvidia_gpu_sm_90; + ext::oneapi::experimental::architecture::nvidia_gpu_sm_90a; static constexpr ext::oneapi::experimental::architecture min_amd_gpu_architecture = diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp index d33502b7e3f24..2885a7673795b 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp @@ -15,9 +15,10 @@ #include // for PI_ERROR_INVALID_DEVICE #include // for sycl_category, exception #include // for bfloat16 -#include // for memory_scope -#include // for range -#include // for span +#include +#include // for memory_scope +#include // for range +#include // for span #ifdef __SYCL_DEVICE_ONLY__ #include @@ -36,6 +37,54 @@ namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { +enum class group_algorithm_data_placement { blocked, striped }; + +struct input_data_placement_key + : detail::compile_time_property_key { + template + using value_t = + property_value(Placement)>>; +}; + +struct output_data_placement_key + : detail::compile_time_property_key { + template + using value_t = + property_value(Placement)>>; +}; + +template +inline constexpr input_data_placement_key::value_t + input_data_placement; + +template +inline constexpr output_data_placement_key::value_t + output_data_placement; + +namespace detail { + +template +constexpr bool isInputBlocked(Properties properties) { + if constexpr (properties.template has_property()) + return properties.template get_property() 
== + input_data_placement; + else + return true; +} + +template +constexpr bool isOutputBlocked(Properties properties) { + if constexpr (properties.template has_property()) + return properties.template get_property() == + output_data_placement; + else + return true; +} + +} // namespace detail + // ---- group helpers template class group_with_scratchpad { Group g; @@ -48,7 +97,7 @@ template class group_with_scratchpad { sycl::span get_memory() const { return scratch; } }; -// ---- sorters +// Default sorter provided by the first version of the extension specification. template > class default_sorter { Compare comp; sycl::span scratch; @@ -63,10 +112,10 @@ template > class default_sorter { void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first, [[maybe_unused]] Ptr last) { #ifdef __SYCL_DEVICE_ONLY__ - // Per extension specification if scratch size is less than the value - // returned by memory_required then behavior is undefined, so we don't check - // that the scratch size statisfies the requirement. - sycl::detail::merge_sort(g, first, last - first, comp, scratch.data()); + using T = typename sycl::detail::GetValueType::type; + size_t n = last - first; + T *scratch_begin = sycl::detail::align_scratch(scratch, g, n); + sycl::detail::merge_sort(g, first, n, comp, scratch_begin); #else throw sycl::exception( std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), @@ -77,16 +126,14 @@ template > class default_sorter { template T operator()([[maybe_unused]] Group g, T val) { #ifdef __SYCL_DEVICE_ONLY__ - // Per extension specification if scratch size is less than the value - // returned by memory_required then behavior is undefined, so we don't check - // that the scratch size statisfies the requirement. 
+ std::size_t local_id = g.get_local_linear_id(); auto range_size = g.get_local_range().size(); - size_t local_id = g.get_local_linear_id(); - T *temp = reinterpret_cast(scratch.data()); - ::new (temp + local_id) T(val); - sycl::detail::merge_sort(g, temp, range_size, comp, - scratch.data() + range_size * sizeof(T)); - val = temp[local_id]; + T *scratch_begin = sycl::detail::align_scratch( + scratch, g, /* output storage and temporary storage */ 2 * range_size); + scratch_begin[local_id] = val; + sycl::detail::merge_sort(g, scratch_begin, range_size, comp, + scratch_begin + range_size); + val = scratch_begin[local_id]; #else throw sycl::exception( std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), @@ -122,6 +169,7 @@ template struct ConvertToComp { }; } // namespace detail +// Radix sorter provided by the first version of the extension specification. template class radix_sorter { @@ -199,6 +247,318 @@ class radix_sorter { } }; +// Default sorters provided by the second version of the extension +// specification. 
+namespace default_sorters { + +template > class joint_sorter { + CompareT comp; + sycl::span scratch; + +public: + template + joint_sorter(sycl::span scratch_, + CompareT comp_ = CompareT()) + : comp(comp_), scratch(scratch_) {} + + template + void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first, + [[maybe_unused]] Ptr last) { +#ifdef __SYCL_DEVICE_ONLY__ + using T = typename sycl::detail::GetValueType::type; + size_t n = last - first; + T *scratch_begin = sycl::detail::align_scratch(scratch, g, n); + sycl::detail::merge_sort(g, first, n, comp, scratch_begin); +#else + throw sycl::exception( + std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), + "default_sorter constructor is not supported on host device."); +#endif + } + + template + static size_t memory_required(sycl::memory_scope, size_t range_size) { + return range_size * sizeof(T) + alignof(T); + } +}; + +template , + std::size_t ElementsPerWorkItem = 1> +class group_sorter { + CompareT comp; + sycl::span scratch; + +public: + template + group_sorter(sycl::span scratch_, + CompareT comp_ = CompareT{}) + : comp(comp_), scratch(scratch_) {} + + template T operator()([[maybe_unused]] Group g, T val) { +#ifdef __SYCL_DEVICE_ONLY__ + std::size_t local_id = g.get_local_linear_id(); + auto range_size = g.get_local_range().size(); + T *scratch_begin = sycl::detail::align_scratch( + scratch, g, /* output storage and temporary storage */ 2 * range_size); + scratch_begin[local_id] = val; + sycl::detail::merge_sort(g, scratch_begin, range_size, comp, + scratch_begin + range_size); + val = scratch_begin[local_id]; +#else + throw sycl::exception( + std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), + "default_sorter operator() is not supported on host device."); +#endif + return val; + } + + template + void operator()([[maybe_unused]] Group g, + [[maybe_unused]] sycl::span values, + [[maybe_unused]] Properties properties) { +#ifdef __SYCL_DEVICE_ONLY__ + std::size_t local_id = 
g.get_local_linear_id(); + auto wg_size = g.get_local_range().size(); + auto number_of_elements = wg_size * ElementsPerWorkItem; + T *scratch_begin = sycl::detail::align_scratch( + scratch, g, + /* output storage and temporary storage */ 2 * number_of_elements); + for (std::uint32_t i = 0; i < ElementsPerWorkItem; ++i) + scratch_begin[local_id * ElementsPerWorkItem + i] = values[i]; + sycl::detail::merge_sort(g, scratch_begin, number_of_elements, comp, + scratch_begin + number_of_elements); + + std::size_t shift{}; + for (std::uint32_t i = 0; i < ElementsPerWorkItem; ++i) { + if constexpr (detail::isOutputBlocked(properties)) { + shift = local_id * ElementsPerWorkItem + i; + } else { + shift = i * wg_size + local_id; + } + values[i] = scratch_begin[shift]; + } +#endif + } + + static std::size_t memory_required(sycl::memory_scope scope, + size_t range_size) { + return 2 * joint_sorter<>::template memory_required( + scope, range_size * ElementsPerWorkItem); + } +}; + +template , + std::size_t ElementsPerWorkItem = 1> +class group_key_value_sorter { + CompareT comp; + sycl::span scratch; + +public: + template + group_key_value_sorter(sycl::span scratch_, + CompareT comp_ = {}) + : comp(comp_), scratch(scratch_) {} + + template + std::tuple operator()(Group g, KeyTy key, ValueTy value) { + static_assert(ElementsPerWorkItem == 1, + "ElementsPerWorkItem must be equal 1"); + + using KeyValue = std::tuple; + auto comp_key_value = [this_comp = this->comp](const KeyValue &lhs, + const KeyValue &rhs) { + return this_comp(std::get<0>(lhs), std::get<0>(rhs)); + }; + return group_sorter(scratch, comp_key_value)( + g, KeyValue(key, value)); + } + + static std::size_t memory_required(sycl::memory_scope scope, + std::size_t range_size) { + return group_sorter, CompareT, + ElementsPerWorkItem>::memory_required(scope, + range_size); + } +}; +} // namespace default_sorters + +// Radix sorters provided by the second version of the extension specification. 
+namespace radix_sorters { + +template +class joint_sorter { + + sycl::span scratch; + uint32_t first_bit = 0; + uint32_t last_bit = 0; + + static constexpr uint32_t bits = BitsPerPass; + using bitset_t = std::bitset; + +public: + template + joint_sorter(sycl::span scratch_, + const bitset_t mask = bitset_t{}.set()) + : scratch(scratch_) { + static_assert((std::is_arithmetic::value || + std::is_same::value || + std::is_same::value), + "radix sort is not supported for the given type"); + + for (first_bit = 0; first_bit < mask.size() && !mask[first_bit]; + ++first_bit) + ; + for (last_bit = first_bit; last_bit < mask.size() && mask[last_bit]; + ++last_bit) + ; + } + + template + void operator()([[maybe_unused]] GroupT g, [[maybe_unused]] PtrT first, + [[maybe_unused]] PtrT last) { +#ifdef __SYCL_DEVICE_ONLY__ + sycl::detail::privateDynamicSort( + g, first, /*empty*/ first, last - first, scratch.data(), first_bit, + last_bit); +#else + throw sycl::exception( + std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), + "radix_sorter is not supported on host device."); +#endif + } + + static constexpr std::size_t + memory_required([[maybe_unused]] sycl::memory_scope scope, + std::size_t range_size) { + return range_size * sizeof(ValT) + + (1 << bits) * range_size * sizeof(uint32_t) + alignof(uint32_t); + } +}; + +template +class group_sorter { + + sycl::span scratch; + uint32_t first_bit = 0; + uint32_t last_bit = 0; + + static constexpr uint32_t bits = BitsPerPass; + using bitset_t = std::bitset; + +public: + template + group_sorter(sycl::span scratch_, + const bitset_t mask = bitset_t{}.set()) + : scratch(scratch_) { + static_assert((std::is_arithmetic::value || + std::is_same::value || + std::is_same::value), + "radix sort is not usable"); + + for (first_bit = 0; first_bit < mask.size() && !mask[first_bit]; + ++first_bit) + ; + for (last_bit = first_bit; last_bit < mask.size() && mask[last_bit]; + ++last_bit) + ; + } + + template + ValT 
operator()([[maybe_unused]] GroupT g, [[maybe_unused]] ValT val) { +#ifdef __SYCL_DEVICE_ONLY__ + ValT result[]{val}; + sycl::detail::privateStaticSort( + g, result, /*empty*/ result, scratch.data(), first_bit, last_bit); + return result[0]; +#else + throw sycl::exception( + std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), + "radix_sorter is not supported on host device."); +#endif + } + + template + void operator()([[maybe_unused]] Group g, + [[maybe_unused]] sycl::span values, + [[maybe_unused]] Properties properties) { +#ifdef __SYCL_DEVICE_ONLY__ + sycl::detail::privateStaticSort< + /*is_key_value=*/false, detail::isOutputBlocked(properties), + OrderT == sorting_order::ascending, ElementsPerWorkItem, bits>( + g, values.data(), /*empty*/ values.data(), scratch.data(), first_bit, + last_bit); +#endif + } + + static constexpr size_t + memory_required([[maybe_unused]] sycl::memory_scope scope, + size_t range_size) { + return (std::max)(range_size * sizeof(ValT), + range_size * (1 << bits) * sizeof(uint32_t)); + } +}; + +template +class group_key_value_sorter { + sycl::span scratch; + uint32_t first_bit; + uint32_t last_bit; + + static constexpr uint32_t bits = BitsPerPass; + using bitset_t = std::bitset; + +public: + template + group_key_value_sorter(sycl::span scratch_, + const bitset_t mask = bitset_t{}.set()) + : scratch(scratch_) { + static_assert((std::is_arithmetic::value || + std::is_same::value), + "radix sort is not usable"); + for (first_bit = 0; first_bit < mask.size() && !mask[first_bit]; + ++first_bit) + ; + for (last_bit = first_bit; last_bit < mask.size() && mask[last_bit]; + ++last_bit) + ; + } + + template + std::tuple operator()([[maybe_unused]] Group g, KeyTy key, + ValueTy val) { + static_assert(ElementsPerWorkItem == 1, "ElementsPerWorkItem must be 1"); + KeyTy key_result[]{key}; + ValueTy val_result[]{val}; +#ifdef __SYCL_DEVICE_ONLY__ + sycl::detail::privateStaticSort< + /*is_key_value=*/true, + /*is_blocked=*/true, Order == 
sorting_order::ascending, 1, bits>( + g, key_result, val_result, scratch.data(), first_bit, last_bit); +#endif + key = key_result[0]; + val = val_result[0]; + return {key, val}; + } + + static constexpr std::size_t memory_required(sycl::memory_scope, + std::size_t range_size) { + return (std::max)(range_size * ElementsPerWorkItem * + (sizeof(KeyTy) + sizeof(ValueTy)), + range_size * (1 << bits) * sizeof(uint32_t)); + } +}; +} // namespace radix_sorters + } // namespace ext::oneapi::experimental } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp index 092fec5c7da0c..5dece1c54f7c4 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp @@ -68,6 +68,19 @@ struct is_sorter_impl struct is_sorter : decltype(is_sorter_impl::test(0)) { }; + +template +struct is_key_value_sorter : std::false_type {}; + +template +struct is_key_value_sorter< + Sorter, Group, Key, Value, + std::enable_if_t< + std::is_same_v, + std::tuple> && + sycl::is_group_v>> : std::true_type {}; + } // namespace detail // ---- sort_over_group @@ -90,7 +103,7 @@ sort_over_group(experimental::group_with_scratchpad exec, T value, Compare comp) { return sort_over_group( exec.get_group(), value, - experimental::default_sorter(exec.get_memory(), comp)); + default_sorters::group_sorter(exec.get_memory(), comp)); } template @@ -98,7 +111,60 @@ std::enable_if_t>, T> sort_over_group(experimental::group_with_scratchpad exec, T value) { return sort_over_group(exec.get_group(), value, - experimental::default_sorter<>(exec.get_memory())); + default_sorters::group_sorter(exec.get_memory())); +} + +template +std::enable_if_t>, + void> +sort_over_group([[maybe_unused]] Group g, + [[maybe_unused]] sycl::span values, + [[maybe_unused]] Sorter sorter, + [[maybe_unused]] Properties properties = {}) { +#ifdef __SYCL_DEVICE_ONLY__ + 
return sorter(g, values, properties); +#else + throw sycl::exception( + std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), + "Group algorithms are not supported on host device."); +#endif +} + +template +std::enable_if_t>, + void> +sort_over_group(experimental::group_with_scratchpad exec, + sycl::span values, + Properties properties = {}) { + return sort_over_group( + exec.get_group(), values, + default_sorters::group_sorter, ElementsPerWorkItem>( + exec.get_memory()), + properties); +} + +template +std::enable_if_t> && + sycl::ext::oneapi::experimental::is_property_list_v< + std::decay_t>, + void> +sort_over_group(experimental::group_with_scratchpad exec, + sycl::span values, Compare comp, + Properties properties = {}) { + return sort_over_group( + exec.get_group(), values, + default_sorters::group_sorter( + exec.get_memory(), comp), + properties); } // ---- joint_sort @@ -120,7 +186,7 @@ std::enable_if_t::value, void> joint_sort(experimental::group_with_scratchpad exec, Iter first, Iter last, Compare comp) { joint_sort(exec.get_group(), first, last, - experimental::default_sorter(exec.get_memory(), comp)); + default_sorters::joint_sorter(exec.get_memory(), comp)); } template @@ -128,7 +194,49 @@ std::enable_if_t>, void> joint_sort(experimental::group_with_scratchpad exec, Iter first, Iter last) { joint_sort(exec.get_group(), first, last, - experimental::default_sorter<>(exec.get_memory())); + default_sorters::joint_sorter<>(exec.get_memory())); +} + +template +std::enable_if_t< + detail::is_key_value_sorter::value, + std::tuple> +sort_key_value_over_group([[maybe_unused]] Group g, [[maybe_unused]] KeyTy key, + [[maybe_unused]] ValueTy value, + [[maybe_unused]] Sorter sorter) { +#ifdef __SYCL_DEVICE_ONLY__ + return sorter(g, key, value); +#else + throw sycl::exception( + std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), + "Group algorithms are not supported on host device."); +#endif +} + +template +std::enable_if_t< + 
!detail::is_key_value_sorter::value, + std::tuple> +sort_key_value_over_group( + experimental::group_with_scratchpad exec, KeyTy key, + ValueTy value, Compare comp) { + return sort_key_value_over_group( + exec.get_group(), key, value, + default_sorters::group_key_value_sorter( + exec.get_memory(), comp)); +} + +template +std::enable_if_t>, + std::tuple> +sort_key_value_over_group( + experimental::group_with_scratchpad exec, KeyTy key, + ValueTy value) { + return sort_key_value_over_group( + exec.get_group(), key, value, + default_sorters::group_key_value_sorter( + exec.get_memory())); } } // namespace ext::oneapi::experimental diff --git a/sycl/include/sycl/ext/oneapi/experimental/virtual_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/virtual_functions.hpp new file mode 100644 index 0000000000000..a173689cbc652 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/virtual_functions.hpp @@ -0,0 +1,74 @@ +#pragma once + +#include +#include + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { +struct indirectly_callable_key { + template + using value_t = + sycl::ext::oneapi::experimental::property_value; +}; + +template +inline constexpr indirectly_callable_key::value_t indirectly_callable; + +struct calls_indirectly_key { + template + using value_t = + sycl::ext::oneapi::experimental::property_value; +}; + +template +inline constexpr calls_indirectly_key::value_t calls_indirectly; + +template <> struct is_property_key : std::true_type {}; +template <> struct is_property_key : std::true_type {}; + +namespace detail { + +template <> +struct IsCompileTimeProperty : std::true_type {}; +template <> +struct IsCompileTimeProperty : std::true_type {}; + +template <> struct PropertyToKind { + static constexpr PropKind Kind = PropKind::IndirectlyCallable; +}; + +template <> struct PropertyToKind { + static constexpr PropKind Kind = PropKind::CallsIndirectly; +}; + +template +struct PropertyMetaInfo> { + static constexpr const 
char *name = "indirectly-callable"; + static constexpr const char *value = +#ifdef __SYCL_DEVICE_ONLY__ + __builtin_sycl_unique_stable_name(Set); +#else + ""; +#endif +}; + +template +struct PropertyMetaInfo> { + static constexpr const char *name = "calls-indirectly"; + static constexpr const char *value = +#ifdef __SYCL_DEVICE_ONLY__ + // FIXME: we should handle Rest... here as well + __builtin_sycl_unique_stable_name(First); +#else + ""; +#endif +}; + +} // namespace detail + +} // namespace ext::oneapi::experimental +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp index 89d7dd7852a8a..3f1bb28268d39 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp @@ -205,8 +205,12 @@ enum PropKind : uint32_t { WorkItemProgress = 64, NDRangeKernel = 65, SingleTaskKernel = 66, + IndirectlyCallable = 67, + CallsIndirectly = 68, + InputDataPlacement = 69, + OutputDataPlacement = 70, // PropKindSize must always be the last value. 
- PropKindSize = 67, + PropKindSize = 71, }; struct property_key_base_tag {}; diff --git a/sycl/include/sycl/half_type.hpp b/sycl/include/sycl/half_type.hpp index 951146f2cdfbb..799ff9fb186e9 100644 --- a/sycl/include/sycl/half_type.hpp +++ b/sycl/include/sycl/half_type.hpp @@ -249,11 +249,14 @@ using StorageT = _Float16; using BIsRepresentationT = _Float16; using VecElemT = _Float16; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES using Vec2StorageT = VecElemT __attribute__((ext_vector_type(2))); using Vec3StorageT = VecElemT __attribute__((ext_vector_type(3))); using Vec4StorageT = VecElemT __attribute__((ext_vector_type(4))); using Vec8StorageT = VecElemT __attribute__((ext_vector_type(8))); using Vec16StorageT = VecElemT __attribute__((ext_vector_type(16))); +#endif // __INTEL_PREVIEW_BREAKING_CHANGES + #else // SYCL_DEVICE_ONLY using StorageT = detail::host_half_impl::half; // No need to extract underlying data type for built-in functions operating on @@ -261,6 +264,7 @@ using StorageT = detail::host_half_impl::half; using BIsRepresentationT = half; using VecElemT = half; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES // On the host side we cannot use OpenCL cl_half# types as an underlying type // for vec because they are actually defined as an integer type under the // hood. 
As a result half values will be converted to the integer and passed @@ -270,6 +274,8 @@ using Vec3StorageT = std::array; using Vec4StorageT = std::array; using Vec8StorageT = std::array; using Vec16StorageT = std::array; +#endif // __INTEL_PREVIEW_BREAKING_CHANGES + #endif // SYCL_DEVICE_ONLY #ifndef __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 2c6df91a6589b..a71f5400a813d 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -962,6 +963,10 @@ class __SYCL_EXPORT handler { sycl::ext::intel::experimental::fp_control_key>() && KI::isESIMD()), "Floating point control property is supported for ESIMD kernels only."); + static_assert( + !PropertiesT::template has_property< + sycl::ext::oneapi::experimental::indirectly_callable_key>(), + "indirectly_callable property cannot be applied to SYCL kernels"); if constexpr (PropertiesT::template has_property< sycl::ext::intel::experimental::cache_config_key>()) { auto Config = Props.template get_property< @@ -3291,22 +3296,48 @@ class __SYCL_EXPORT handler { size_t DeviceRowPitch, sycl::range<3> HostExtent, sycl::range<3> CopyExtent); - /// Instruct the queue with a non-blocking wait on an external semaphore. - /// An exception is thrown if \p SemaphoreHandle is incomplete. + /// Submit a non-blocking device-side wait on an external + /// semaphore to the queue. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore requires an explicit value to wait upon. + /// + /// \param SemaphoreHandle is an opaque external interop semaphore handle + void ext_oneapi_wait_external_semaphore( + ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle); + + /// Submit a non-blocking device-side wait on an external + /// semaphore to the queue.
+ /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore does not support waiting on an explicitly passed value. /// /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param WaitValue is the value that this semaphore will wait upon, until it + /// allows any further commands to execute on the queue. void ext_oneapi_wait_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle - SemaphoreHandle); + ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t WaitValue); /// Instruct the queue to signal the external semaphore once all previous - /// commands have completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete. + /// commands submitted to the queue have completed execution. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore requires an explicit value to signal. + /// + /// \param SemaphoreHandle is an opaque external interop semaphore handle + void ext_oneapi_signal_external_semaphore( + ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle); + + /// Instruct the queue to set the state of the external semaphore to + /// \p SignalValue once all previous commands submitted to the queue have + /// completed execution. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore does not support signalling an explicitly passed value. /// /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param SignalValue is the value that this semaphore signals, once all + /// prior operations on the queue complete.
void ext_oneapi_signal_external_semaphore( - sycl::ext::oneapi::experimental::interop_semaphore_handle - SemaphoreHandle); + ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t SignalValue); private: std::shared_ptr MImpl; diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 73a5ea8e7307a..4d32218ab09d4 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -1851,9 +1851,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue with a non-blocking wait on an external semaphore. - /// An exception is thrown if \p SemaphoreHandle is incomplete. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore requires an explicit value to wait upon. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param SemaphoreHandle is an opaque external interop semaphore handle. /// \return an event representing the wait operation. event ext_oneapi_wait_external_semaphore( sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, @@ -1867,7 +1868,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } /// Instruct the queue with a non-blocking wait on an external semaphore. - /// An exception is thrown if \p SemaphoreHandle is incomplete. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore requires an explicit value to wait upon. /// /// \param SemaphoreHandle is an opaque external interop semaphore handle /// \param DepEvent is an event that specifies the kernel dependencies. 
@@ -1875,56 +1877,78 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { event ext_oneapi_wait_external_semaphore( sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, event DepEvent, - const detail::code_location &CodeLoc = detail::code_location::current()) { - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle); - }, - CodeLoc); - } + const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue with a non-blocking wait on an external semaphore. - /// An exception is thrown if \p SemaphoreHandle is incomplete. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore requires an explicit value to wait upon. /// - /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param SemaphoreHandle is an opaque external interop semaphore handle. /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. /// \return an event representing the wait operation. event ext_oneapi_wait_external_semaphore( sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, const std::vector &DepEvents, - const detail::code_location &CodeLoc = detail::code_location::current()) { - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle); - }, - CodeLoc); - } + const detail::code_location &CodeLoc = detail::code_location::current()); + + /// Instruct the queue with a non-blocking wait on an external semaphore. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore does not support waiting on an explicitly passed value. 
+ /// + /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param WaitValue is the value that this semaphore will wait upon, until it + /// allows any further commands to execute on the queue. + /// \return an event representing the wait operation. + event ext_oneapi_wait_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t WaitValue, + const detail::code_location &CodeLoc = detail::code_location::current()); + + /// Instruct the queue with a non-blocking wait on an external semaphore. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore does not support waiting on an explicitly passed value. + /// + /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param WaitValue is the value that this semaphore will wait upon, until it + /// allows any further commands to execute on the queue. + /// \param DepEvent is an event that specifies the kernel dependencies. + /// \return an event representing the wait operation. + event ext_oneapi_wait_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t WaitValue, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()); + + /// Instruct the queue with a non-blocking wait on an external semaphore. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore does not support waiting on an explicitly passed value. + /// + /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param WaitValue is the value that this semaphore will wait upon, until it + /// allows any further commands to execute on the queue. + /// \param DepEvents is a vector of events that specifies the kernel + /// dependencies. + /// \return an event representing the wait operation. 
+ event ext_oneapi_wait_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t WaitValue, const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue to signal the external semaphore once all previous /// commands have completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore requires an explicit value to signal. /// /// \param SemaphoreHandle is an opaque external interop semaphore handle /// \return an event representing the signal operation. event ext_oneapi_signal_external_semaphore( sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, - const detail::code_location &CodeLoc = detail::code_location::current()) { - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle); - }, - CodeLoc); - } + const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue to signal the external semaphore once all previous /// commands have completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore requires an explicit value to signal. /// /// \param SemaphoreHandle is an opaque external interop semaphore handle /// \param DepEvent is an event that specifies the kernel dependencies. 
@@ -1932,19 +1956,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { event ext_oneapi_signal_external_semaphore( sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, event DepEvent, - const detail::code_location &CodeLoc = detail::code_location::current()) { - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle); - }, - CodeLoc); - } + const detail::code_location &CodeLoc = detail::code_location::current()); /// Instruct the queue to signal the external semaphore once all previous /// commands have completed execution. - /// An exception is thrown if \p SemaphoreHandle is incomplete. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore requires an explicit value to signal. /// /// \param SemaphoreHandle is an opaque external interop semaphore handle /// \param DepEvents is a vector of events that specifies the kernel @@ -1953,15 +1970,52 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { event ext_oneapi_signal_external_semaphore( sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, const std::vector &DepEvents, - const detail::code_location &CodeLoc = detail::code_location::current()) { - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle); - }, - CodeLoc); - } + const detail::code_location &CodeLoc = detail::code_location::current()); + + /// Instruct the queue to signal the external semaphore once all previous + /// commands have completed execution. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore does not support signalling an explicitly passed value. 
+ /// + /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param SignalValue is the value that this semaphore signals, once all + /// prior operations on the queue complete. + /// \return an event representing the signal operation. + event ext_oneapi_signal_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t SignalValue, + const detail::code_location &CodeLoc = detail::code_location::current()); + + /// Instruct the queue to signal the external semaphore once all previous + /// commands have completed execution. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore does not support signalling an explicitly passed value. + /// + /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param SignalValue is the value that this semaphore signals, once all + /// prior operations on the queue complete. + /// \param DepEvent is an event that specifies the kernel dependencies. + /// \return an event representing the signal operation. + event ext_oneapi_signal_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t SignalValue, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()); + + /// Instruct the queue to signal the external semaphore once all previous + /// commands have completed execution. + /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the + /// type of semaphore does not support signalling an explicitly passed value. + /// + /// \param SemaphoreHandle is an opaque external interop semaphore handle + /// \param SignalValue is the value that this semaphore signals, once all + /// prior operations on the queue complete. + /// \param DepEvents is a vector of events that specifies the kernel + /// dependencies. + /// \return an event representing the signal operation.
+ event ext_oneapi_signal_external_semaphore( + sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle, + uint64_t SignalValue, const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()); /// single_task version with a kernel represented as a lambda. /// diff --git a/sycl/include/sycl/vector.hpp b/sycl/include/sycl/vector.hpp index 200a77a9adf83..d5d193bfa7add 100644 --- a/sycl/include/sycl/vector.hpp +++ b/sycl/include/sycl/vector.hpp @@ -786,9 +786,27 @@ template class vec { detail::ConvertToOpenCLType_t>>, vec> convert() const { + using bfloat16 = sycl::ext::oneapi::bfloat16; static_assert(std::is_integral_v> || - detail::is_floating_point::value, + detail::is_floating_point::value || + // Conversion to BF16 available only for float. + (std::is_same_v && + std::is_same_v), "Unsupported convertT"); + + // Currently, for float ---> bfloat16 conversion, we only support + // Round-to-even rounding mode. + constexpr bool isFloatToBF16Conv = + std::is_same_v && std::is_same_v; + constexpr bool isBF16ToFloatConv = + std::is_same_v && std::is_same_v; + if constexpr (isFloatToBF16Conv) { + static_assert(roundingMode == rounding_mode::automatic || + roundingMode == rounding_mode::rte, + "Currently, we only supoort round-to-even rounding mode \ + for float ---> bfloat16 conversion."); + } + using T = vec_data_t; using R = vec_data_t; using OpenCLT = detail::ConvertToOpenCLType_t; @@ -828,10 +846,19 @@ template class vec { { // Otherwise, we fallback to per-element conversion: for (size_t I = 0; I < NumElements; ++I) { - Result.setValue( - I, vec_data::get( - detail::convertImpl( - vec_data::get(getValue(I))))); + // For float -> bf16. + if constexpr (isFloatToBF16Conv) { + Result[I] = bfloat16((*this)[I]); + } else + // For bf16 -> float. 
+ if constexpr (isBF16ToFloatConv) { + Result[I] = (float)((*this)[I]); + } else { + Result.setValue(I, vec_data::get( + detail::convertImpl( + vec_data::get(getValue(I))))); + } } } diff --git a/sycl/include/sycl/vector_preview.hpp b/sycl/include/sycl/vector_preview.hpp index 7300bc0e088a0..c6993fd27c73f 100644 --- a/sycl/include/sycl/vector_preview.hpp +++ b/sycl/include/sycl/vector_preview.hpp @@ -26,10 +26,6 @@ #error "SYCL device compiler is built without ext_vector_type support" #endif -#if defined(__SYCL_DEVICE_ONLY__) -#define __SYCL_USE_EXT_VECTOR_TYPE__ -#endif - #include // for decorated, address_space #include // for half, cl_char, cl_int #include // for ArrayCreator, RepeatV... @@ -39,14 +35,16 @@ #include // for memcpy #include // for is_contained #include // for is_floating_point +#include #include // for convertImpl #include // for vector_alignment #include // for StorageT, half, Vec16... #include // bfloat16 +#include // for std::min #include // for array -#include // for assert +#include // for assert #include // for size_t, NULL, byte #include // for uint8_t, int16_t, int... #include // for divides, multiplies @@ -86,313 +84,75 @@ struct elem { }; namespace detail { -// select_apply_cl_t selects from T8/T16/T32/T64 basing on -// sizeof(_IN). expected to handle scalar types in _IN. 
-template -using select_apply_cl_t = std::conditional_t< - sizeof(_IN) == 1, T8, - std::conditional_t>>; - -template struct vec_helper { - using RetType = T; - static constexpr RetType get(T value) { return value; } - static constexpr RetType set(T value) { return value; } -}; -template <> struct vec_helper { - using RetType = select_apply_cl_t; - static constexpr RetType get(bool value) { return value; } - static constexpr RetType set(bool value) { return value; } -}; - -template <> struct vec_helper { - using RetType = sycl::ext::oneapi::bfloat16; - using BFloat16StorageT = sycl::ext::oneapi::detail::Bfloat16StorageT; - static constexpr RetType get(BFloat16StorageT value) { -#if defined(__SYCL_BITCAST_IS_CONSTEXPR) - return sycl::bit_cast(value); -#else - // awkward workaround. sycl::bit_cast isn't constexpr in older GCC - // C++20 will give us both std::bit_cast and constexpr reinterpet for void* - // but neither available yet. - union { - sycl::ext::oneapi::bfloat16 bf16; - sycl::ext::oneapi::detail::Bfloat16StorageT storage; - } result = {}; - result.storage = value; - return result.bf16; -#endif - } - - static constexpr RetType get(RetType value) { return value; } - - static constexpr BFloat16StorageT set(RetType value) { -#if defined(__SYCL_BITCAST_IS_CONSTEXPR) - return sycl::bit_cast(value); -#else - union { - sycl::ext::oneapi::bfloat16 bf16; - sycl::ext::oneapi::detail::Bfloat16StorageT storage; - } result = {}; - result.bf16 = value; - return result.storage; -#endif - } -}; - -#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) -template <> struct vec_helper { - using RetType = std::uint8_t; - static constexpr RetType get(std::byte value) { return (RetType)value; } - static constexpr RetType set(std::byte value) { return (RetType)value; } - static constexpr std::byte get(std::uint8_t value) { - return (std::byte)value; - } - static constexpr std::byte set(std::uint8_t value) { - return (std::byte)value; - } -}; -#endif - template class 
OperationCurrentT, int... Indexes> class SwizzleOp; -template struct VecStorage; - -// Element type for relational operator return value. -template -using rel_t = typename std::conditional_t< - sizeof(DataT) == sizeof(opencl::cl_char), opencl::cl_char, - typename std::conditional_t< - sizeof(DataT) == sizeof(opencl::cl_short), opencl::cl_short, - typename std::conditional_t< - sizeof(DataT) == sizeof(opencl::cl_int), opencl::cl_int, - typename std::conditional_t>>>; - // Special type indicating that SwizzleOp should just read value from vector - // not trying to perform any operations. Should not be called. template class GetOp { public: using DataT = T; - DataT getValue(size_t) const { return (DataT)0; } - DataT operator()(DataT, DataT) { return (DataT)0; } -}; - -// Forward declarations -template -class RoundedRangeKernel; -template -class RoundedRangeKernelWithKH; - -// Vectors of size 1 are handled separately and therefore 1 is not included in -// the check below. -constexpr bool isValidVectorSize(int N) { - return N == 2 || N == 3 || N == 4 || N == 8 || N == 16; -} -template struct VecStorage { - static_assert( - isValidVectorSize(N) || N == 1, - "Incorrect number of elements for sycl::vec: only 1, 2, 3, 4, 8 " - "or 16 are supported"); - static_assert(!std::is_same_v, "Incorrect data type for sycl::vec"); -}; - -#ifdef __SYCL_DEVICE_ONLY__ -// device always has ext vector support, but for huge vectors -// we switch to std::array, so that we can use a smaller alignment (64) -// this is to support MSVC, which has a max of 64 for direct params. -template struct VecStorageImpl { - static constexpr size_t Num = (N == 3) ? 
4 : N; - static constexpr size_t Sz = Num * sizeof(T); - using DataType = - typename std::conditional>::type; - using VectorDataType = T __attribute__((ext_vector_type(N))); -}; -#else // __SYCL_DEVICE_ONLY__ -template struct VecStorageImpl { - using DataType = std::array; -}; -#endif // __SYCL_DEVICE_ONLY__ - -// Single element bool -template <> struct VecStorage { - using DataType = bool; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = bool; -#endif // __SYCL_DEVICE_ONLY__ -}; - -// Multiple element bool -template -struct VecStorage> { - using DataType = - typename VecStorageImpl, - N>::DataType; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = - typename VecStorageImpl, - N>::VectorDataType; -#endif // __SYCL_DEVICE_ONLY__ -}; - -#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) -// Single element byte. Multiple elements will propagate through a later -// specialization. -template <> struct VecStorage { - using DataType = std::int8_t; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = std::int8_t; -#endif // __SYCL_DEVICE_ONLY__ -}; -#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) - -// Single element signed integers -template -struct VecStorage>> { - using DataType = T; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = DataType; -#endif // __SYCL_DEVICE_ONLY__ -}; - -// Single element unsigned integers -template -struct VecStorage>> { - using DataType = T; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = DataType; -#endif // __SYCL_DEVICE_ONLY__ -}; - -// Single element floating-point (except half/bfloat16) -template -struct VecStorage< - T, 1, - typename std::enable_if_t && is_sgenfloat_v>> { - using DataType = T; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = DataType; -#endif // __SYCL_DEVICE_ONLY__ -}; -// Multiple elements signed/unsigned integers and floating-point (except -// half/bfloat16) -template -struct VecStorage< - T, N, - typename std::enable_if_t || - (is_sgenfloat_v && !is_half_or_bf16_v))>> { - using 
DataType = - typename VecStorageImpl::DataType, N>::DataType; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = - typename VecStorageImpl::DataType, - N>::VectorDataType; -#endif // __SYCL_DEVICE_ONLY__ -}; - -// Single element half -template <> struct VecStorage { - using DataType = sycl::detail::half_impl::StorageT; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = sycl::detail::half_impl::StorageT; -#endif // __SYCL_DEVICE_ONLY__ + DataT getValue(size_t) const { + if constexpr (std::is_same_v) + return DataT{0.0f}; + else + return (DataT)0; + } + DataT operator()(DataT, DataT) { + if constexpr (std::is_same_v) + return DataT{0.0f}; + else + return (DataT)0; + } }; -// Multiple elements half -#if defined(__SYCL_DEVICE_ONLY__) -#define __SYCL_DEFINE_HALF_VECSTORAGE(Num) \ - template <> struct VecStorage { \ - using DataType = sycl::detail::half_impl::Vec##Num##StorageT; \ - using VectorDataType = sycl::detail::half_impl::Vec##Num##StorageT; \ - }; -#else // defined(__SYCL_DEVICE_ONLY__) -#define __SYCL_DEFINE_HALF_VECSTORAGE(Num) \ - template <> struct VecStorage { \ - using DataType = sycl::detail::half_impl::Vec##Num##StorageT; \ - }; -#endif // defined(__SYCL_DEVICE_ONLY__) - -__SYCL_DEFINE_HALF_VECSTORAGE(2) -__SYCL_DEFINE_HALF_VECSTORAGE(3) -__SYCL_DEFINE_HALF_VECSTORAGE(4) -__SYCL_DEFINE_HALF_VECSTORAGE(8) -__SYCL_DEFINE_HALF_VECSTORAGE(16) -#undef __SYCL_DEFINE_HALF_VECSTORAGE - -// Single element bfloat16 -template <> struct VecStorage { - using DataType = sycl::ext::oneapi::detail::Bfloat16StorageT; - // using VectorDataType = sycl::ext::oneapi::bfloat16; - using VectorDataType = sycl::ext::oneapi::detail::Bfloat16StorageT; -}; -// Multiple elements bfloat16 -#define __SYCL_DEFINE_BF16_VECSTORAGE(Num) \ - template <> struct VecStorage { \ - using DataType = sycl::ext::oneapi::detail::bf16::Vec##Num##StorageT; \ - using VectorDataType = \ - sycl::ext::oneapi::detail::bf16::Vec##Num##StorageT; \ - }; -__SYCL_DEFINE_BF16_VECSTORAGE(2) 
-__SYCL_DEFINE_BF16_VECSTORAGE(3) -__SYCL_DEFINE_BF16_VECSTORAGE(4) -__SYCL_DEFINE_BF16_VECSTORAGE(8) -__SYCL_DEFINE_BF16_VECSTORAGE(16) -#undef __SYCL_DEFINE_BF16_VECSTORAGE } // namespace detail -template using vec_data = detail::vec_helper; - -template -using vec_data_t = typename detail::vec_helper::RetType; - ///////////////////////// class sycl::vec ///////////////////////// -/// Provides a cross-patform vector class template that works efficiently on -/// SYCL devices as well as in host C++ code. -/// -/// \ingroup sycl_api -template class vec { - using DataT = Type; +// Provides a cross-platform vector class template that works efficiently on +// SYCL devices as well as in host C++ code. +template +class vec : public detail::vec_arith { + + static_assert(NumElements == 1 || NumElements == 2 || NumElements == 3 || + NumElements == 4 || NumElements == 8 || NumElements == 16, + "Invalid number of elements for sycl::vec: only 1, 2, 3, 4, 8 " + "or 16 are supported"); + static_assert(sizeof(bool) == sizeof(int8_t), "bool size is not 1 byte"); + + // https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#memory-layout-and-alignment + // It is required by the SPEC to align vec with vec. + static constexpr size_t AdjustedNum = (NumElements == 3) ? 4 : NumElements; // This represent type of underlying value. There should be only one field // in the class, so vec should be equal to float16 in memory. 
- using DataType = typename detail::VecStorage::DataType; + using DataType = std::array; - static constexpr bool IsHostHalf = - std::is_same_v && - std::is_same_v; +#ifdef __SYCL_DEVICE_ONLY__ + using element_type_for_vector_t = typename detail::map_type< + DataT, +#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + std::byte, /*->*/ std::uint8_t, // +#endif + bool, /*->*/ std::int8_t, // + sycl::half, /*->*/ sycl::detail::half_impl::StorageT, // + sycl::ext::oneapi::bfloat16, + /*->*/ sycl::ext::oneapi::detail::Bfloat16StorageT, // + DataT, /*->*/ DataT // + >::type; - static constexpr bool IsBfloat16 = - std::is_same_v; +public: + // Type used for passing sycl::vec to SPIRV builtins. + // We can not use ext_vector_type(1) as it's not supported by SPIRV + // plugins (CTS fails). + using vector_t = + typename std::conditional_t; - static constexpr size_t AdjustedNum = (NumElements == 3) ? 4 : NumElements; - static constexpr size_t Sz = sizeof(DataT) * AdjustedNum; - static constexpr bool IsSizeGreaterThanMaxAlign = - (Sz > detail::MaxVecAlignment); - - // TODO: There is no support for vector half type on host yet. - // Also, when Sz is greater than alignment, we use std::array instead of - // vector extension. This is for MSVC compatibility, which has a max alignment - // of 64 for direct params. If we drop MSVC, we can have alignment the same as - // size and use vector extensions for all sizes. - static constexpr bool IsUsingArrayOnDevice = - (IsHostHalf || IsBfloat16 || IsSizeGreaterThanMaxAlign); - -#if defined(__SYCL_DEVICE_ONLY__) - static constexpr bool NativeVec = NumElements > 1 && !IsUsingArrayOnDevice; - static constexpr bool IsUsingArrayOnHost = false; // not compiling for host. -#else - static constexpr bool NativeVec = false; - static constexpr bool IsUsingArrayOnHost = true; // host always std::array. 
-#endif +private: +#endif // __SYCL_DEVICE_ONLY__ static constexpr int getNumElements() { return NumElements; } @@ -411,7 +171,7 @@ template class vec { template static constexpr std::array VecToArray(const vec &V, std::index_sequence) { - return {static_cast(V.getValue(Is))...}; + return {static_cast(V[Is])...}; } template class T4, int... T5, std::size_t... Is> @@ -446,7 +206,9 @@ template class vec { } template static constexpr auto FlattenVecArgHelper(const T &A) { - return std::array{vec_data::get(static_cast(A))}; + // static_cast required to avoid narrowing conversion warning + // when T = unsigned long int and DataT_ = int. + return std::array{static_cast(A)}; } template struct FlattenVecArg { constexpr auto operator()(const T &A) const { @@ -541,205 +303,89 @@ template class vec { using EnableIfSuitableNumElements = typename std::enable_if_t::value>; - template - constexpr vec(const std::array, NumElements> &Arr, - std::index_sequence) - : m_Data{([&](vec_data_t v) constexpr { - if constexpr (std::is_same_v) - return v.value; - else - return vec_data_t(static_cast(v)); - })(Arr[Is])...} {} + // Element type for relational operator return value. + using rel_t = detail::select_cl_scalar_integral_signed_t; public: + // Aliases required by SYCL 2020 to make sycl::vec consistent + // with that of marray and buffer. using element_type = DataT; using value_type = DataT; - using rel_t = detail::rel_t; -#ifdef __SYCL_DEVICE_ONLY__ - using vector_t = - typename detail::VecStorage::VectorDataType; -#endif // __SYCL_DEVICE_ONLY__ + /****************** Constructors **************/ vec() = default; - constexpr vec(const vec &Rhs) = default; constexpr vec(vec &&Rhs) = default; - constexpr vec &operator=(const vec &Rhs) = default; - - // W/o this, things like "vec = vec" doesn't work. 
- template - typename std::enable_if_t && - std::is_convertible_v, rel_t>, - vec &> - operator=(const vec &Rhs) { - *this = Rhs.template as(); - return *this; - } - -#ifdef __SYCL_USE_EXT_VECTOR_TYPE__ - template - using EnableIfNotHostHalf = typename std::enable_if_t; - - template - using EnableIfHostHalf = typename std::enable_if_t; - - template - using EnableIfUsingArrayOnDevice = - typename std::enable_if_t; - - template - using EnableIfNotUsingArrayOnDevice = - typename std::enable_if_t; -#endif // __SYCL_USE_EXT_VECTOR_TYPE__ - - template - using EnableIfUsingArray = - typename std::enable_if_t; - - template - using EnableIfNotUsingArray = - typename std::enable_if_t; - -#ifdef __SYCL_USE_EXT_VECTOR_TYPE__ - - template - explicit constexpr vec(const EnableIfNotUsingArrayOnDevice &arg) - : m_Data{DataType(vec_data::get(arg))} {} +private: + // Implementation detail for the next public ctor. + template + constexpr vec(const std::array &Arr, + std::index_sequence) + : m_Data{Arr[Is]...} {} - template - typename std::enable_if_t< - std::is_fundamental_v> || - detail::is_half_or_bf16_v>, - vec &> - operator=(const EnableIfNotUsingArrayOnDevice &Rhs) { - m_Data = (DataType)vec_data::get(Rhs); - return *this; - } +public: + explicit constexpr vec(const DataT &arg) + : vec{detail::RepeatValue(arg), + std::make_index_sequence()} {} - template - explicit constexpr vec(const EnableIfUsingArrayOnDevice &arg) - : vec{detail::RepeatValue( - static_cast>(arg)), + // Constructor from values of base type or vec of base type. Checks that + // base types are match and that the NumElements == sum of lengths of args. 
+ template , + typename = EnableIfSuitableNumElements> + constexpr vec(const argTN &...args) + : vec{VecArgArrayCreator::Create(args...), std::make_index_sequence()} {} + /****************** Assignment Operators **************/ + constexpr vec &operator=(const vec &Rhs) = default; + + // Template required to prevent ambiguous overload with the copy assignment + // when NumElements == 1. The template prevents implicit conversion from + // vec<_, 1> to DataT. template typename std::enable_if_t< - std::is_fundamental_v> || + std::is_fundamental_v || detail::is_half_or_bf16_v>, vec &> - operator=(const EnableIfUsingArrayOnDevice &Rhs) { - for (int i = 0; i < NumElements; ++i) { - setValue(i, Rhs); - } + operator=(const DataT &Rhs) { + *this = vec{Rhs}; return *this; } -#else // __SYCL_USE_EXT_VECTOR_TYPE__ - explicit constexpr vec(const DataT &arg) - : vec{detail::RepeatValue( - static_cast>(arg)), - std::make_index_sequence()} {} + // W/o this, things like "vec = vec" doesn't work. template typename std::enable_if_t< - std::is_fundamental_v> || - detail::is_half_or_bf16_v>, - vec &> - operator=(const DataT &Rhs) { - for (int i = 0; i < NumElements; ++i) { - setValue(i, Rhs); - } + !std::is_same_v && std::is_convertible_v, vec &> + operator=(const vec &Rhs) { + *this = Rhs.template as(); return *this; } -#endif // __SYCL_USE_EXT_VECTOR_TYPE__ - -#ifdef __SYCL_USE_EXT_VECTOR_TYPE__ - // Optimized naive constructors with NumElements of DataT values. - // We don't expect compilers to optimize vararg recursive functions well. - - // Helper type to make specific constructors available only for specific - // number of elements. 
- template - using EnableIfMultipleElems = typename std::enable_if_t< - std::is_convertible_v && NumElements == IdxNum, DataT>; - template - constexpr vec(const EnableIfMultipleElems<2, Ty> Arg0, - const EnableIfNotUsingArrayOnDevice Arg1) - : m_Data{vec_data::get(Arg0), vec_data::get(Arg1)} {} - template - constexpr vec(const EnableIfMultipleElems<3, Ty> Arg0, - const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2) - : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), - vec_data::get(Arg2)} {} - template - constexpr vec(const EnableIfMultipleElems<4, Ty> Arg0, - const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, - const Ty Arg3) - : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), - vec_data::get(Arg2), vec_data::get(Arg3)} {} - template - constexpr vec(const EnableIfMultipleElems<8, Ty> Arg0, - const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, - const DataT Arg3, const DataT Arg4, const DataT Arg5, - const DataT Arg6, const DataT Arg7) - : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), - vec_data::get(Arg2), vec_data::get(Arg3), - vec_data::get(Arg4), vec_data::get(Arg5), - vec_data::get(Arg6), vec_data::get(Arg7)} {} - template - constexpr vec(const EnableIfMultipleElems<16, Ty> Arg0, - const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, - const DataT Arg3, const DataT Arg4, const DataT Arg5, - const DataT Arg6, const DataT Arg7, const DataT Arg8, - const DataT Arg9, const DataT ArgA, const DataT ArgB, - const DataT ArgC, const DataT ArgD, const DataT ArgE, - const DataT ArgF) - : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), - vec_data::get(Arg2), vec_data::get(Arg3), - vec_data::get(Arg4), vec_data::get(Arg5), - vec_data::get(Arg6), vec_data::get(Arg7), - vec_data::get(Arg8), vec_data::get(Arg9), - vec_data::get(ArgA), vec_data::get(ArgB), - vec_data::get(ArgC), vec_data::get(ArgD), - vec_data::get(ArgE), vec_data::get(ArgF)} {} -#endif // __SYCL_USE_EXT_VECTOR_TYPE__ - - // Constructor from values of base type or vec of 
base type. Checks that - // base types are match and that the NumElements == sum of lengths of args. - template , - typename = EnableIfSuitableNumElements> - constexpr vec(const argTN &...args) - : vec{VecArgArrayCreator, argTN...>::Create(args...), - std::make_index_sequence()} {} #ifdef __SYCL_DEVICE_ONLY__ - template && - !std::is_same_v>> - constexpr vec(vector_t openclVector) { - if constexpr (!IsUsingArrayOnDevice) { - m_Data = openclVector; - } else { - m_Data = bit_cast(openclVector); - } - } - - operator vector_t() const { - if constexpr (!IsUsingArrayOnDevice) { - return m_Data; - } else { - auto ptr = bit_cast((&m_Data)->data()); - return *ptr; - } - } + // Make it a template to avoid ambiguity with `vec(const DataT &)` when + // `vector_t` is the same as `DataT`. Not that the other ctor isn't a template + // so we don't even need a smart `enable_if` condition here, the mere fact of + // this being a template makes the other ctor preferred. + template < + typename vector_t_ = vector_t, + typename = typename std::enable_if_t>> + constexpr vec(vector_t_ openclVector) { + m_Data = sycl::bit_cast(openclVector); + } + + /* @SYCL2020 + * Available only when: compiled for the device. + * Converts this SYCL vec instance to the underlying backend-native vector + * type defined by vector_t. + */ + operator vector_t() const { return sycl::bit_cast(m_Data); } #endif // __SYCL_DEVICE_ONLY__ // Available only when: NumElements == 1 template operator typename std::enable_if_t() const { - return vec_data::get(m_Data); + return m_Data[0]; } __SYCL2020_DEPRECATED("get_count() is deprecated, please use size() instead") @@ -750,105 +396,124 @@ template class vec { static constexpr size_t get_size() { return byte_size(); } static constexpr size_t byte_size() noexcept { return sizeof(m_Data); } - // convertImpl can't be called with the same From and To types and therefore - // we need this version of convert which is mostly no-op. 
- template - std::enable_if_t< - std::is_same_v, vec_data_t> || - std::is_same_v>, - detail::ConvertToOpenCLType_t>>, - vec> - convert() const { - static_assert(std::is_integral_v> || - detail::is_floating_point::value, - "Unsupported convertT"); - if constexpr (!std::is_same_v) { - // Dummy conversion for cases like vec -> vec - vec Result; - for (size_t I = 0; I < NumElements; ++I) - Result.setValue(I, static_cast(getValue(I))); +private: + // We interpret bool as int8_t, std::byte as uint8_t for conversion to other + // types. + template + using ConvertBoolAndByteT = + typename detail::map_type*/ std::uint8_t, // +#endif + bool, /*->*/ std::int8_t, // + T, /*->*/ T // + >::type; - return Result; - } else { - // No conversion necessary - return *this; - } + // getValue should be able to operate on different underlying + // types: enum cl_float#N , builtin vector float#N, builtin type float. + constexpr auto getValue(int Index) const { + using RetType = + typename std::conditional_t, int8_t, +#ifdef __SYCL_DEVICE_ONLY__ + element_type_for_vector_t +#else + DataT +#endif + >; + +#ifdef __SYCL_DEVICE_ONLY__ + if constexpr (std::is_same_v) + return sycl::bit_cast(m_Data[Index]); + else +#endif + return static_cast(m_Data[Index]); } +public: template - std::enable_if_t< - !std::is_same_v, vec_data_t> && - !std::is_same_v>, - detail::ConvertToOpenCLType_t>>, - vec> - convert() const { - static_assert(std::is_integral_v> || - detail::is_floating_point::value, + vec convert() const { + + using T = ConvertBoolAndByteT; + using R = ConvertBoolAndByteT; + using bfloat16 = sycl::ext::oneapi::bfloat16; + static_assert(std::is_integral_v || + detail::is_floating_point::value || + std::is_same_v, "Unsupported convertT"); - using T = vec_data_t; - using R = vec_data_t; + using OpenCLT = detail::ConvertToOpenCLType_t; using OpenCLR = detail::ConvertToOpenCLType_t; vec Result; -#if defined(__SYCL_DEVICE_ONLY__) - using OpenCLVecT = OpenCLT 
__attribute__((ext_vector_type(NumElements))); - using OpenCLVecR = OpenCLR __attribute__((ext_vector_type(NumElements))); - // Whole vector conversion can only be done, if: - constexpr bool canUseNativeVectorConvert = + // convertImpl can't be called with the same From and To types and therefore + // we need some special processing in a few cases. + if constexpr (std::is_same_v) { + return *this; + } else if constexpr (std::is_same_v || + std::is_same_v) { + for (size_t I = 0; I < NumElements; ++I) + Result[I] = static_cast(getValue(I)); + return Result; + } else { + +#ifdef __SYCL_DEVICE_ONLY__ + using OpenCLVecT = OpenCLT __attribute__((ext_vector_type(NumElements))); + using OpenCLVecR = OpenCLR __attribute__((ext_vector_type(NumElements))); + + auto NativeVector = sycl::bit_cast(*this); + using ConvertTVecType = typename vec::vector_t; + + // Whole vector conversion can only be done, if: + constexpr bool canUseNativeVectorConvert = #ifdef __NVPTX__ - // - we are not on CUDA, see intel/llvm#11840 - false && + // TODO: Likely unnecessary as + // https://github.com/intel/llvm/issues/11840 has been closed + // already. + false && #endif - // - both vectors are represented using native vector types; - NativeVec && vec::NativeVec && - // - vec storage has an equivalent OpenCL native vector it is implicitly - // convertible to. There are some corner cases where it is not the - // case with char, long and long long types. - std::is_convertible_v && - std::is_convertible_v && - // - it is not a signed to unsigned (or vice versa) conversion - // see comments within 'convertImpl' for more details; - !detail::is_sint_to_from_uint::value && - // - destination type is not bool. bool is stored as integer under the - // hood and therefore conversion to bool looks like conversion between - // two integer types. 
Since bit pattern for true and false is not - // defined, there is no guarantee that integer conversion yields - // right results here; - !std::is_same_v; - if constexpr (canUseNativeVectorConvert) { - Result.m_Data = detail::convertImpl(m_Data); - } else -#endif // defined(__SYCL_DEVICE_ONLY__) - { - // Otherwise, we fallback to per-element conversion: - for (size_t I = 0; I < NumElements; ++I) { - Result.setValue( - I, vec_data::get( - detail::convertImpl( - vec_data::get(getValue(I))))); + NumElements > 1 && + // - vec storage has an equivalent OpenCL native vector it is + // implicitly convertible to. There are some corner cases where it + // is not the case with char, long and long long types. + std::is_convertible_v && + std::is_convertible_v && + // - it is not a signed to unsigned (or vice versa) conversion + // see comments within 'convertImpl' for more details; + !detail::is_sint_to_from_uint::value && + // - destination type is not bool. bool is stored as integer under the + // hood and therefore conversion to bool looks like conversion + // between two integer types. Since bit pattern for true and false + // is not defined, there is no guarantee that integer conversion + // yields right results here; + !std::is_same_v; + + if constexpr (canUseNativeVectorConvert) { + Result.m_Data = sycl::bit_cast( + detail::convertImpl(NativeVector)); + } else +#endif // __SYCL_DEVICE_ONLY__ + { + // Otherwise, we fallback to per-element conversion: + for (size_t I = 0; I < NumElements; ++I) { + auto val = + detail::convertImpl( + getValue(I)); +#ifdef __SYCL_DEVICE_ONLY__ + // On device, we interpret BF16 as uint16. 
+ if constexpr (std::is_same_v) + Result[I] = sycl::bit_cast(val); + else +#endif + Result[I] = static_cast(val); + } } } - return Result; } - template asT as() const { - static_assert((sizeof(*this) == sizeof(asT)), - "The new SYCL vec type must have the same storage size in " - "bytes as this SYCL vec"); - static_assert( - detail::is_contained::value || - detail::is_contained::value, - "asT must be SYCL vec of a different element type and " - "number of elements specified by asT"); - asT Result; - detail::memcpy(&Result.m_Data, &m_Data, sizeof(decltype(Result.m_Data))); - return Result; - } + template asT as() const { return sycl::bit_cast(*this); } template Swizzle swizzle() { return this; @@ -859,60 +524,11 @@ template class vec { return this; } - // ext_vector_type is used as an underlying type for sycl::vec on device. - // The problem is that for clang vector types the return of operator[] is a - // temporary and not a reference to the element in the vector. In practice - // reinterpret_cast(&m_Data)[i]; is working. According to - // http://llvm.org/docs/GetElementPtr.html#can-gep-index-into-vector-elements - // this is not disallowed now. But could probably be disallowed in the future. - // That is why tests are added to check that behavior of the compiler has - // not changed. - // - // Implement operator [] in the same way for host and device. - // TODO: change host side implementation when underlying type for host side - // will be changed to std::array. - // NOTE: aliasing the incompatible types of bfloat16 may lead to problems if - // aggressively optimized. Specializing with noinline to avoid as workaround. 
+ const DataT &operator[](int i) const { return m_Data[i]; } - template - typename std::enable_if_t, - const DataT &> - operator[](int i) const { - return reinterpret_cast(&m_Data)[i]; - } + DataT &operator[](int i) { return m_Data[i]; } - template - typename std::enable_if_t, - DataT &> - operator[](int i) { - return reinterpret_cast(&m_Data)[i]; - } - -#ifdef _MSC_VER -#define __SYCL_NOINLINE_BF16 __declspec(noinline) -#else -#define __SYCL_NOINLINE_BF16 __attribute__((noinline)) -#endif - - template - __SYCL_NOINLINE_BF16 - typename std::enable_if_t, - const DataT &> - operator[](int i) const { - return reinterpret_cast(&m_Data)[i]; - } - - template - __SYCL_NOINLINE_BF16 - typename std::enable_if_t, - DataT &> - operator[](int i) { - return reinterpret_cast(&m_Data)[i]; - } - -#undef __SYCL_NOINLINE_BF16 - - // Begin hi/lo, even/odd, xyzw, and rgba swizzles. + // Begin hi/lo, even/odd, xyzw, and rgba swizzles. @{ private: // Indexer used in the swizzles.def // Currently it is defined as a template struct. Replacing it with a constexpr @@ -930,13 +546,13 @@ template class vec { #define __SYCL_ACCESS_RETURN this #include "swizzles.def" #undef __SYCL_ACCESS_RETURN - // End of hi/lo, even/odd, xyzw, and rgba swizzles. + // }@ End of hi/lo, even/odd, xyzw, and rgba swizzles. 
template void load(size_t Offset, multi_ptr Ptr) { for (int I = 0; I < NumElements; I++) { - setValue(I, *multi_ptr( - Ptr + Offset * NumElements + I)); + m_Data[I] = *multi_ptr( + Ptr + Offset * NumElements + I); } } template @@ -961,7 +577,7 @@ template class vec { multi_ptr Ptr) const { for (int I = 0; I < NumElements; I++) { *multi_ptr(Ptr + Offset * NumElements + - I) = getValue(I); + I) = m_Data[I]; } } template class vec { store(Offset, MultiPtr); } - void ConvertToDataT() { - for (size_t i = 0; i < NumElements; ++i) { - DataT tmp = getValue(i); - setValue(i, tmp); - } - } - -#ifdef __SYCL_BINOP -#error "Undefine __SYCL_BINOP macro" -#endif - -#ifdef __SYCL_USE_EXT_VECTOR_TYPE__ -#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ - friend vec operator BINOP(const vec &Lhs, const vec &Rhs) { \ - vec Ret; \ - if constexpr (IsUsingArrayOnDevice) { \ - for (size_t I = 0; I < NumElements; ++I) { \ - Ret.setValue(I, (Lhs.getValue(I) BINOP Rhs.getValue(I))); \ - } \ - } else { \ - Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data; \ - if constexpr (std::is_same_v && CONVERT) { \ - Ret.ConvertToDataT(); \ - } \ - } \ - return Ret; \ - } \ - friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) { \ - return Lhs BINOP vec(Rhs); \ - } \ - friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) { \ - return vec(Lhs) BINOP Rhs; \ - } \ - friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) { \ - Lhs = Lhs BINOP Rhs; \ - return Lhs; \ - } \ - template \ - friend typename std::enable_if_t operator OPASSIGN( \ - vec & Lhs, const DataT & Rhs) { \ - Lhs = Lhs BINOP vec(Rhs); \ - return Lhs; \ - } - -#else // __SYCL_USE_EXT_VECTOR_TYPE__ - -#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ - friend vec operator BINOP(const vec &Lhs, const vec &Rhs) { \ - vec Ret{}; \ - if constexpr (NativeVec) \ - Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data; \ - else \ - for (size_t I = 0; I < NumElements; ++I) \ - Ret.setValue(I, (DataT)(vec_data::get(Lhs.getValue( \ - I)) BINOP 
vec_data::get(Rhs.getValue(I)))); \ - return Ret; \ - } \ - friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) { \ - return Lhs BINOP vec(Rhs); \ - } \ - friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) { \ - return vec(Lhs) BINOP Rhs; \ - } \ - friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) { \ - Lhs = Lhs BINOP Rhs; \ - return Lhs; \ - } \ - template \ - friend typename std::enable_if_t operator OPASSIGN( \ - vec & Lhs, const DataT & Rhs) { \ - Lhs = Lhs BINOP vec(Rhs); \ - return Lhs; \ - } - -#endif // __SYCL_USE_EXT_VECTOR_TYPE__ - - __SYCL_BINOP(+, +=, true) - __SYCL_BINOP(-, -=, true) - __SYCL_BINOP(*, *=, false) - __SYCL_BINOP(/, /=, false) - - // TODO: The following OPs are available only when: DataT != cl_float && - // DataT != cl_double && DataT != cl_half - __SYCL_BINOP(%, %=, false) - __SYCL_BINOP(|, |=, false) - __SYCL_BINOP(&, &=, false) - __SYCL_BINOP(^, ^=, false) - __SYCL_BINOP(>>, >>=, false) - __SYCL_BINOP(<<, <<=, true) -#undef __SYCL_BINOP -#undef __SYCL_BINOP_HELP - - // Note: vec<>/SwizzleOp logical value is 0/-1 logic, as opposed to 0/1 logic. - // As far as CTS validation is concerned, 0/-1 logic also applies when - // NumElements is equal to one, which is somewhat inconsistent with being - // transparent with scalar data. - // TODO: Determine if vec<, NumElements=1> is needed at all, remove this - // inconsistency if not by disallowing one-element vectors (as in OpenCL) - -#ifdef __SYCL_RELLOGOP -#error "Undefine __SYCL_RELLOGOP macro" -#endif -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ -#define __SYCL_RELLOGOP(RELLOGOP) \ - friend vec operator RELLOGOP(const vec & Lhs, \ - const vec & Rhs) { \ - vec Ret{}; \ - /* This special case is needed since there are no standard operator|| */ \ - /* or operator&& functions for std::array. 
*/ \ - if constexpr (IsUsingArrayOnDevice && \ - (std::string_view(#RELLOGOP) == "||" || \ - std::string_view(#RELLOGOP) == "&&")) { \ - for (size_t I = 0; I < NumElements; ++I) { \ - /* We cannot use SetValue here as the operator is not a friend of*/ \ - /* Ret on Windows. */ \ - Ret[I] = static_cast(-(vec_data::get( \ - Lhs.getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ - } \ - } else { \ - Ret = vec( \ - (typename vec::vector_t)( \ - Lhs.m_Data RELLOGOP Rhs.m_Data)); \ - if (NumElements == 1) /*Scalar 0/1 logic was applied, invert*/ \ - Ret *= -1; \ - } \ - return Ret; \ - } \ - friend vec operator RELLOGOP(const vec & Lhs, \ - const DataT & Rhs) { \ - return Lhs RELLOGOP vec(Rhs); \ - } \ - friend vec operator RELLOGOP(const DataT & Lhs, \ - const vec & Rhs) { \ - return vec(Lhs) RELLOGOP Rhs; \ - } - -#else -#define __SYCL_RELLOGOP(RELLOGOP) \ - friend vec operator RELLOGOP(const vec & Lhs, \ - const vec & Rhs) { \ - vec Ret{}; \ - for (size_t I = 0; I < NumElements; ++I) { \ - /* We cannot use SetValue here as the operator is not a friend of*/ \ - /* Ret on Windows. */ \ - Ret[I] = static_cast(-(vec_data::get( \ - Lhs.getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ - } \ - return Ret; \ - } \ - friend vec operator RELLOGOP(const vec & Lhs, \ - const DataT & Rhs) { \ - return Lhs RELLOGOP vec(Rhs); \ - } \ - friend vec operator RELLOGOP(const DataT & Lhs, \ - const vec & Rhs) { \ - return vec(Lhs) RELLOGOP Rhs; \ - } -#endif - - __SYCL_RELLOGOP(==) - __SYCL_RELLOGOP(!=) - __SYCL_RELLOGOP(>) - __SYCL_RELLOGOP(<) - __SYCL_RELLOGOP(>=) - __SYCL_RELLOGOP(<=) - // TODO: limit to integral types. 
- __SYCL_RELLOGOP(&&) - __SYCL_RELLOGOP(||) -#undef __SYCL_RELLOGOP - -#ifdef __SYCL_UOP -#error "Undefine __SYCL_UOP macro" -#endif -#define __SYCL_UOP(UOP, OPASSIGN) \ - friend vec &operator UOP(vec & Rhs) { \ - Rhs OPASSIGN vec_data::get(1); \ - return Rhs; \ - } \ - friend vec operator UOP(vec &Lhs, int) { \ - vec Ret(Lhs); \ - Lhs OPASSIGN vec_data::get(1); \ - return Ret; \ - } - - __SYCL_UOP(++, +=) - __SYCL_UOP(--, -=) -#undef __SYCL_UOP - - // operator~() available only when: dataT != float && dataT != double - // && dataT != half - friend vec operator~(const vec &Rhs) { - if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) { - Ret.setValue(I, ~Rhs.getValue(I)); - } - return Ret; - } else { - vec Ret{(typename vec::DataType) ~Rhs.m_Data}; - if constexpr (std::is_same_v) { - Ret.ConvertToDataT(); - } - return Ret; - } - } - - // operator! - friend vec, NumElements> operator!(const vec &Rhs) { - if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) { -#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) - // std::byte neither supports ! unary op or casting, so special handling - // is needed. And, worse, Windows has a conflict with 'byte'. 
- if constexpr (std::is_same_v) { - Ret.setValue(I, std::byte{!vec_data::get(Rhs.getValue(I))}); - } else -#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) - { - Ret.setValue(I, !vec_data::get(Rhs.getValue(I))); - } - } - return Ret.template as, NumElements>>(); - } else { - return vec{(typename vec::DataType) !Rhs.m_Data} - .template as, NumElements>>(); - } - } - - // operator + - friend vec operator+(const vec &Lhs) { - if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) - Ret.setValue( - I, vec_data::get(+vec_data::get(Lhs.getValue(I)))); - return Ret; - } else { - return vec{+Lhs.m_Data}; - } - } - - // operator - - friend vec operator-(const vec &Lhs) { - namespace oneapi = sycl::ext::oneapi; - vec Ret{}; - if constexpr (IsBfloat16 && NumElements == 1) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data); - oneapi::bfloat16 w = -v; - Ret.m_Data = oneapi::detail::bfloat16ToBits(w); - } else if constexpr (IsBfloat16) { - for (size_t I = 0; I < NumElements; I++) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data[I]); - oneapi::bfloat16 w = -v; - Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w); - } - } else if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { - for (size_t I = 0; I < NumElements; ++I) - Ret.setValue( - I, vec_data::get(-vec_data::get(Lhs.getValue(I)))); - return Ret; - } else { - Ret = vec{-Lhs.m_Data}; - if constexpr (std::is_same_v) { - Ret.ConvertToDataT(); - } - return Ret; - } - } - - // OP is: &&, || - // vec operatorOP(const vec &Rhs) const; - // vec operatorOP(const DataT &Rhs) const; - - // OP is: ==, !=, <, >, <=, >= - // vec operatorOP(const vec &Rhs) const; - // vec operatorOP(const DataT &Rhs) const; private: - // Generic method that execute "Operation" on underlying values. - -#ifdef __SYCL_USE_EXT_VECTOR_TYPE__ - template