From 28e0907111075dead53a931004b0ba6f1fe0d793 Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Wed, 24 Jul 2024 20:55:43 +0200 Subject: [PATCH 1/3] nontemporal_store: make sure that the intrinsic is truly just a hint --- .../src/intrinsics/mod.rs | 3 +- compiler/rustc_codegen_gcc/src/builder.rs | 2 ++ compiler/rustc_codegen_llvm/src/builder.rs | 30 ++++++++++++++----- library/core/src/intrinsics.rs | 10 +++---- tests/codegen/intrinsics/nontemporal.rs | 19 ++++++++++-- 5 files changed, 49 insertions(+), 15 deletions(-) diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs index b21c559e6686c..29deac6073035 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs @@ -725,7 +725,8 @@ fn codegen_regular_intrinsic_call<'tcx>( // Cranelift treats stores as volatile by default // FIXME correctly handle unaligned_volatile_store - // FIXME actually do nontemporal stores if requested + // FIXME actually do nontemporal stores if requested (but do not just emit MOVNT on x86; + // see the LLVM backend for details) let dest = CPlace::for_ptr(Pointer::new(ptr), val.layout()); dest.write_cvalue(fx, val); } diff --git a/compiler/rustc_codegen_gcc/src/builder.rs b/compiler/rustc_codegen_gcc/src/builder.rs index a64371a3d891e..47b378cc1cd82 100644 --- a/compiler/rustc_codegen_gcc/src/builder.rs +++ b/compiler/rustc_codegen_gcc/src/builder.rs @@ -1127,6 +1127,8 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> { self.llbb().add_assignment(self.location, aligned_destination, val); // TODO(antoyo): handle align and flags. // NOTE: dummy value here since it's never used. FIXME(antoyo): API should not return a value here? + // When adding support for NONTEMPORAL, make sure to not just emit MOVNT on x86; see the + // LLVM backend for details. self.cx.context.new_rvalue_zero(self.type_i32()) } diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index 77e51f941508c..1686bc59c7646 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -727,13 +727,29 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { llvm::LLVMSetVolatile(store, llvm::True); } if flags.contains(MemFlags::NONTEMPORAL) { - // According to LLVM [1] building a nontemporal store must - // *always* point to a metadata value of the integer 1. - // - // [1]: https://llvm.org/docs/LangRef.html#store-instruction - let one = self.cx.const_i32(1); - let node = llvm::LLVMMDNodeInContext(self.cx.llcx, &one, 1); - llvm::LLVMSetMetadata(store, llvm::MD_nontemporal as c_uint, node); + // Make sure that the current target architectures supports "sane" non-temporal + // stores, i.e., non-temporal stores that are equivalent to regular stores except + // for performance. LLVM doesn't seem to care about this, and will happily treat + // `!nontemporal` stores as-if they were normal stores (for reordering optimizations + // etc) even on x86, despite later lowering them to MOVNT which do *not* behave like + // regular stores but require special fences. + // So we keep a list of architectures where `!nontemporal` is known to be truly just + // a hint, and use regular stores everywhere else. + // (In the future, we could alternatively ensure that an sfence gets emitted after a sequence of movnt + // before any kind of synchronizing operation. But it's not clear how to do that with LLVM.) + const WELL_BEHAVED_NONTEMPORAL_ARCHS: &[&str] = &["aarch64", "arm"]; + + let use_nontemporal = + WELL_BEHAVED_NONTEMPORAL_ARCHS.contains(&&*self.cx.tcx.sess.target.arch); + if use_nontemporal { + // According to LLVM [1] building a nontemporal store must + // *always* point to a metadata value of the integer 1. + // + // [1]: https://llvm.org/docs/LangRef.html#store-instruction + let one = self.cx.const_i32(1); + let node = llvm::LLVMMDNodeInContext(self.cx.llcx, &one, 1); + llvm::LLVMSetMetadata(store, llvm::MD_nontemporal as c_uint, node); + } } store } diff --git a/library/core/src/intrinsics.rs b/library/core/src/intrinsics.rs index e9eacbcd25a0a..3f3011af0a326 100644 --- a/library/core/src/intrinsics.rs +++ b/library/core/src/intrinsics.rs @@ -2386,12 +2386,12 @@ extern "rust-intrinsic" { #[rustc_nounwind] pub fn catch_unwind(try_fn: fn(*mut u8), data: *mut u8, catch_fn: fn(*mut u8, *mut u8)) -> i32; - /// Emits a `!nontemporal` store according to LLVM (see their docs). - /// Probably will never become stable. + /// Emits a `nontemporal` store, which gives a hint to the CPU that the data should not be held + /// in cache. Except for performance, this is fully equivalent to `ptr.write(val)`. /// - /// Do NOT use this intrinsic; "nontemporal" operations do not exist in our memory model! - /// It exists to support current stdarch, but the plan is to change stdarch and remove this intrinsic. - /// See for some more discussion. + /// Not all architectures provide such an operation. For instance, x86 does not: while `MOVNT` + /// exists, that operation is *not* equivalent to `ptr.write(val)` (`MOVNT` writes can be reordered + /// in ways that are not allowed for regular writes). #[rustc_nounwind] pub fn nontemporal_store(ptr: *mut T, val: T); diff --git a/tests/codegen/intrinsics/nontemporal.rs b/tests/codegen/intrinsics/nontemporal.rs index 076d6d6d9da49..828cb7e828761 100644 --- a/tests/codegen/intrinsics/nontemporal.rs +++ b/tests/codegen/intrinsics/nontemporal.rs @@ -1,13 +1,28 @@ //@ compile-flags: -O +//@ compile-flags: --target aarch64-unknown-linux-gnu +//@ needs-llvm-components: aarch64 -#![feature(core_intrinsics)] +#![feature(no_core, lang_items, intrinsics)] +#![no_core] #![crate_type = "lib"] +#[lang = "sized"] +pub trait Sized {} +#[lang = "copy"] +pub trait Copy {} + +impl Copy for u32 {} +impl Copy for *mut T {} + +extern "rust-intrinsic" { + pub fn nontemporal_store(ptr: *mut T, val: T); +} + #[no_mangle] pub fn a(a: &mut u32, b: u32) { // CHECK-LABEL: define{{.*}}void @a // CHECK: store i32 %b, ptr %a, align 4, !nontemporal unsafe { - std::intrinsics::nontemporal_store(a, b); + nontemporal_store(a, b); } } From 697787a92d8858b17bfe3e55c3e82877149f8f83 Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Thu, 25 Jul 2024 18:54:00 +0200 Subject: [PATCH 2/3] RISC-V also has sane nontemporal stores --- compiler/rustc_codegen_llvm/src/builder.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index 1686bc59c7646..20bf580a716a9 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -737,7 +737,8 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { // a hint, and use regular stores everywhere else. // (In the future, we could alternatively ensure that an sfence gets emitted after a sequence of movnt // before any kind of synchronizing operation. But it's not clear how to do that with LLVM.) - const WELL_BEHAVED_NONTEMPORAL_ARCHS: &[&str] = &["aarch64", "arm"]; + const WELL_BEHAVED_NONTEMPORAL_ARCHS: &[&str] = + &["aarch64", "arm", "riscv32", "riscv64"]; let use_nontemporal = WELL_BEHAVED_NONTEMPORAL_ARCHS.contains(&&*self.cx.tcx.sess.target.arch); From 75743dc5a057cdc0678d88523edbbf3fdd1bf901 Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Mon, 12 Aug 2024 11:10:26 +0200 Subject: [PATCH 3/3] make the codegen test also cover an ill-behaved arch, and add links --- compiler/rustc_codegen_llvm/src/builder.rs | 2 ++ tests/codegen/intrinsics/nontemporal.rs | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index 20bf580a716a9..8e7a99e46a5b0 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -737,6 +737,8 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { // a hint, and use regular stores everywhere else. // (In the future, we could alternatively ensure that an sfence gets emitted after a sequence of movnt // before any kind of synchronizing operation. But it's not clear how to do that with LLVM.) + // For more context, see and + // . const WELL_BEHAVED_NONTEMPORAL_ARCHS: &[&str] = &["aarch64", "arm", "riscv32", "riscv64"]; diff --git a/tests/codegen/intrinsics/nontemporal.rs b/tests/codegen/intrinsics/nontemporal.rs index 828cb7e828761..ff2d629606688 100644 --- a/tests/codegen/intrinsics/nontemporal.rs +++ b/tests/codegen/intrinsics/nontemporal.rs @@ -1,6 +1,14 @@ //@ compile-flags: -O -//@ compile-flags: --target aarch64-unknown-linux-gnu -//@ needs-llvm-components: aarch64 +//@revisions: with_nontemporal without_nontemporal +//@[with_nontemporal] compile-flags: --target aarch64-unknown-linux-gnu +//@[with_nontemporal] needs-llvm-components: aarch64 +//@[without_nontemporal] compile-flags: --target x86_64-unknown-linux-gnu +//@[without_nontemporal] needs-llvm-components: x86 + +// Ensure that we *do* emit the `!nontemporal` flag on architectures where it +// is well-behaved, but do *not* emit it on architectures where it is ill-behaved. +// For more context, see and +// . #![feature(no_core, lang_items, intrinsics)] #![no_core] @@ -21,7 +29,8 @@ extern "rust-intrinsic" { #[no_mangle] pub fn a(a: &mut u32, b: u32) { // CHECK-LABEL: define{{.*}}void @a - // CHECK: store i32 %b, ptr %a, align 4, !nontemporal + // with_nontemporal: store i32 %b, ptr %a, align 4, !nontemporal + // without_nontemporal-NOT: nontemporal unsafe { nontemporal_store(a, b); }