From c30322aafc9c72b4f5f1ee5ce21ff5340dbb9173 Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Sat, 2 Jul 2022 23:54:30 +0200 Subject: [PATCH 1/6] Align destination in mem* instructions. While misaligned reads are generally fast, misaligned writes aren't and can have severe penalties. --- src/mem/x86_64.rs | 130 +++++++++++++++++++++++++++++++++------------- 1 file changed, 94 insertions(+), 36 deletions(-) diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs index 3b372d10..68ef17f1 100644 --- a/src/mem/x86_64.rs +++ b/src/mem/x86_64.rs @@ -16,6 +16,7 @@ // feature is present at compile-time. We don't bother detecting other features. // Note that ERMSB does not enhance the backwards (DF=1) "rep movsb". +use core::arch::asm; use core::intrinsics; use core::mem; @@ -34,40 +35,61 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { #[inline(always)] #[cfg(not(target_feature = "ermsb"))] -pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { - let qword_count = count >> 3; - let byte_count = count & 0b111; - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. - core::arch::asm!( - "repe movsq (%rsi), (%rdi)", - "mov {byte_count:e}, %ecx", - "repe movsb (%rsi), (%rdi)", - byte_count = in(reg) byte_count, +pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) { + let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); + // Separating the blocks gives the compiler more freedom to reorder instructions. + // It also allows us to trivially skip the rep movsb, which is faster when memcpying + // aligned data. + if pre_byte_count > 0 { + asm!( + "rep movsb", + inout("ecx") pre_byte_count => _, + inout("rdi") dest => dest, + inout("rsi") src => src, + options(nostack, preserves_flags) + ); + } + asm!( + "rep movsq", inout("rcx") qword_count => _, - inout("rdi") dest => _, - inout("rsi") src => _, - options(att_syntax, nostack, preserves_flags) + inout("rdi") dest => dest, + inout("rsi") src => src, + options(nostack, preserves_flags) ); + if byte_count > 0 { + asm!( + "rep movsb", + inout("ecx") byte_count => _, + inout("rdi") dest => _, + inout("rsi") src => _, + options(nostack, preserves_flags) + ); + } } #[inline(always)] pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { - let qword_count = count >> 3; - let byte_count = count & 0b111; - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. 
- core::arch::asm!( + let (pre_byte_count, qword_count, byte_count) = rep_param_rev(dest, count); + // We can't separate this block due to std/cld + asm!( "std", - "repe movsq (%rsi), (%rdi)", - "movl {byte_count:e}, %ecx", - "addq $7, %rdi", - "addq $7, %rsi", - "repe movsb (%rsi), (%rdi)", + "rep movsb", + "sub rsi, 7", + "sub rdi, 7", + "mov rcx, {qword_count}", + "rep movsq", + "add rsi, 7", + "add rdi, 7", + "mov ecx, {byte_count:e}", + "rep movsb", "cld", byte_count = in(reg) byte_count, - inout("rcx") qword_count => _, - inout("rdi") dest.add(count).wrapping_sub(8) => _, - inout("rsi") src.add(count).wrapping_sub(8) => _, - options(att_syntax, nostack) + qword_count = in(reg) qword_count, + inout("ecx") pre_byte_count => _, + inout("rdi") dest.add(count - 1) => _, + inout("rsi") src.add(count - 1) => _, + // We modify flags, but we restore it afterwards + options(nostack, preserves_flags) ); } @@ -86,20 +108,36 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { #[inline(always)] #[cfg(not(target_feature = "ermsb"))] -pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { - let qword_count = count >> 3; - let byte_count = count & 0b111; - // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust. - core::arch::asm!( - "repe stosq %rax, (%rdi)", - "mov {byte_count:e}, %ecx", - "repe stosb %al, (%rdi)", - byte_count = in(reg) byte_count, +pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { + let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); + // Separating the blocks gives the compiler more freedom to reorder instructions. + // It also allows us to trivially skip the rep stosb, which is faster when memcpying + // aligned data. + if pre_byte_count > 0 { + asm!( + "rep stosb", + inout("ecx") pre_byte_count => _, + inout("rdi") dest => dest, + in("al") c, + options(nostack, preserves_flags) + ); + } + asm!( + "rep stosq", inout("rcx") qword_count => _, - inout("rdi") dest => _, + inout("rdi") dest => dest, in("rax") (c as u64) * 0x0101010101010101, - options(att_syntax, nostack, preserves_flags) + options(nostack, preserves_flags) ); + if byte_count > 0 { + asm!( + "rep stosb", + inout("ecx") byte_count => _, + inout("rdi") dest => _, + in("al") c, + options(nostack, preserves_flags) + ); + } } #[inline(always)] @@ -156,3 +194,23 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 { c16(a.cast(), b.cast(), n) } } + +/// Determine optimal parameters for a `rep` instruction. +fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) { + // Unaligned writes are still slow on modern processors, so align the destination address. + let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count); + count -= pre_byte_count; + let qword_count = count >> 3; + let byte_count = count & 0b111; + (pre_byte_count, qword_count, byte_count) +} + +/// Determine optimal parameters for a reverse `rep` instruction (i.e. direction bit is set). +fn rep_param_rev(dest: *mut u8, mut count: usize) -> (usize, usize, usize) { + // Unaligned writes are still slow on modern processors, so align the destination address. 
+ let pre_byte_count = ((dest as usize + count) & 0b111).min(count); + count -= pre_byte_count; + let qword_count = count >> 3; + let byte_count = count & 0b111; + (pre_byte_count, qword_count, byte_count) +} From 314354d2b42460c21273e0ce5ec163491e94a796 Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Thu, 7 Jul 2022 11:53:44 +0200 Subject: [PATCH 2/6] Fix suboptimal codegen in memset --- src/mem/x86_64.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs index 68ef17f1..3cbbbba5 100644 --- a/src/mem/x86_64.rs +++ b/src/mem/x86_64.rs @@ -109,6 +109,7 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) { #[inline(always)] #[cfg(not(target_feature = "ermsb"))] pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { + let c = c as u64 * 0x0101_0101_0101_0101; let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); // Separating the blocks gives the compiler more freedom to reorder instructions. // It also allows us to trivially skip the rep stosb, which is faster when memcpying @@ -118,7 +119,7 @@ pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { "rep stosb", inout("ecx") pre_byte_count => _, inout("rdi") dest => dest, - in("al") c, + in("rax") c, options(nostack, preserves_flags) ); } @@ -126,7 +127,7 @@ pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { "rep stosq", inout("rcx") qword_count => _, inout("rdi") dest => dest, - in("rax") (c as u64) * 0x0101010101010101, + in("rax") c, options(nostack, preserves_flags) ); if byte_count > 0 { @@ -134,7 +135,7 @@ pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { "rep stosb", inout("ecx") byte_count => _, inout("rdi") dest => _, - in("al") c, + in("rax") c, options(nostack, preserves_flags) ); } From a1dd5a8946f6019b77e43e792446a0e4ec3e4671 Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Thu, 7 Jul 2022 13:13:19 +0200 Subject: [PATCH 3/6] Remove rep_param_rev --- src/mem/x86_64.rs | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs index 3cbbbba5..483a3e31 100644 --- a/src/mem/x86_64.rs +++ b/src/mem/x86_64.rs @@ -69,7 +69,7 @@ pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) #[inline(always)] pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { - let (pre_byte_count, qword_count, byte_count) = rep_param_rev(dest, count); + let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); // We can't separate this block due to std/cld asm!( "std", @@ -80,12 +80,12 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { "rep movsq", "add rsi, 7", "add rdi, 7", - "mov ecx, {byte_count:e}", + "mov ecx, {pre_byte_count:e}", "rep movsb", "cld", - byte_count = in(reg) byte_count, + pre_byte_count = in(reg) pre_byte_count, qword_count = in(reg) qword_count, - inout("ecx") pre_byte_count => _, + inout("ecx") byte_count => _, inout("rdi") dest.add(count - 1) => _, inout("rsi") src.add(count - 1) => _, // We modify flags, but we restore it afterwards @@ -205,13 +205,3 @@ fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) { let byte_count = count & 0b111; (pre_byte_count, qword_count, byte_count) } - -/// Determine optimal parameters for a reverse `rep` instruction (i.e. direction bit is set). 
-fn rep_param_rev(dest: *mut u8, mut count: usize) -> (usize, usize, usize) { - // Unaligned writes are still slow on modern processors, so align the destination address. - let pre_byte_count = ((dest as usize + count) & 0b111).min(count); - count -= pre_byte_count; - let qword_count = count >> 3; - let byte_count = count & 0b111; - (pre_byte_count, qword_count, byte_count) -} From 387f83ea1d8b91f03fd8d03e8597b3c5a19236f2 Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Thu, 7 Jul 2022 13:19:06 +0200 Subject: [PATCH 4/6] Use att_syntax for now --- src/mem/x86_64.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs index 483a3e31..a1015f61 100644 --- a/src/mem/x86_64.rs +++ b/src/mem/x86_64.rs @@ -46,7 +46,7 @@ pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) inout("ecx") pre_byte_count => _, inout("rdi") dest => dest, inout("rsi") src => src, - options(nostack, preserves_flags) + options(att_syntax, nostack, preserves_flags) ); } asm!( @@ -54,7 +54,7 @@ pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) inout("rcx") qword_count => _, inout("rdi") dest => dest, inout("rsi") src => src, - options(nostack, preserves_flags) + options(att_syntax, nostack, preserves_flags) ); if byte_count > 0 { asm!( @@ -62,7 +62,7 @@ pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) inout("ecx") byte_count => _, inout("rdi") dest => _, inout("rsi") src => _, - options(nostack, preserves_flags) + options(att_syntax, nostack, preserves_flags) ); } } @@ -74,13 +74,13 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { asm!( "std", "rep movsb", - "sub rsi, 7", - "sub rdi, 7", - "mov rcx, {qword_count}", + "sub $7, %rsi", + "sub $7, %rdi", + "mov {qword_count}, %rcx", "rep movsq", - "add rsi, 7", - "add rdi, 7", - "mov ecx, {pre_byte_count:e}", + "add $7, %rsi", + "add $7, %rdi", + "mov {pre_byte_count:e}, %ecx", "rep movsb", "cld", pre_byte_count = in(reg) pre_byte_count, @@ -89,7 +89,7 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { inout("rdi") dest.add(count - 1) => _, inout("rsi") src.add(count - 1) => _, // We modify flags, but we restore it afterwards - options(nostack, preserves_flags) + options(att_syntax, nostack, preserves_flags) ); } @@ -120,7 +120,7 @@ pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { inout("ecx") pre_byte_count => _, inout("rdi") dest => dest, in("rax") c, - options(nostack, preserves_flags) + options(att_syntax, nostack, preserves_flags) ); } asm!( @@ -128,7 +128,7 @@ pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { inout("rcx") qword_count => _, inout("rdi") dest => dest, in("rax") c, - options(nostack, preserves_flags) + options(att_syntax, nostack, preserves_flags) ); if byte_count > 0 { asm!( @@ -136,7 +136,7 @@ pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { inout("ecx") byte_count => _, inout("rdi") dest => _, in("rax") c, - options(nostack, preserves_flags) + options(att_syntax, nostack, preserves_flags) ); } } From ae557bde4efcd85bf1fc75c488c4370749d969a7 Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Thu, 7 Jul 2022 13:20:41 +0200 Subject: [PATCH 5/6] Skip rep movsb in copy_backward if possible There is currently no measureable performance difference in benchmarks but it likely will make a difference in real workloads. 
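A minimal sketch of the resulting control flow, in plain Rust for illustration only
(the helper closures below are invented stand-ins for the `rep movsb`/`rep movsq`
runs that the real code performs with the direction flag set inside a single asm!
block):

    // Hypothetical model of copy_backward's phases after this change.
    fn copy_backward_sketch(pre_byte_count: usize, qword_count: usize, byte_count: usize) {
        let rep_movsb_back = |n: usize| println!("copy {n} bytes, descending");
        let rep_movsq_back = |n: usize| println!("copy {n} qwords, descending");
        if byte_count > 0 {
            rep_movsb_back(byte_count); // tail bytes past the last aligned qword
        }
        rep_movsq_back(qword_count); // no guard needed: rcx == 0 is already a no-op
        if pre_byte_count > 0 {
            rep_movsb_back(pre_byte_count); // head bytes before the aligned region
        }
    }

The `rep movsq` phase needs no branch because a zero rcx already makes it do
nothing; only the two byte phases are skipped outright, avoiding the `rep movsb`
startup overhead when there is nothing to copy, which is what the added
`test`/`jz` pairs implement in the asm below.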
--- src/mem/x86_64.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs index a1015f61..e9c1c56d 100644 --- a/src/mem/x86_64.rs +++ b/src/mem/x86_64.rs @@ -73,15 +73,21 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { // We can't separate this block due to std/cld asm!( "std", + "test %ecx, %ecx", + "jz 1f", "rep movsb", + "1:", "sub $7, %rsi", "sub $7, %rdi", "mov {qword_count}, %rcx", "rep movsq", + "test {pre_byte_count:e}, {pre_byte_count:e}", + "jz 1f", "add $7, %rsi", "add $7, %rdi", "mov {pre_byte_count:e}, %ecx", "rep movsb", + "1:", "cld", pre_byte_count = in(reg) pre_byte_count, qword_count = in(reg) qword_count, From ef37a23d8417afdb9fb6f215ec4c651e18146366 Mon Sep 17 00:00:00 2001 From: David Hoppenbrouwers Date: Thu, 28 Jul 2022 18:43:45 +0200 Subject: [PATCH 6/6] Remove branches around rep movsb/stosb While it is measurably faster for older CPUs, removing them keeps the code smaller and is likely more beneficial for newer CPUs. --- src/mem/x86_64.rs | 73 ++++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 45 deletions(-) diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs index e9c1c56d..dd98e37c 100644 --- a/src/mem/x86_64.rs +++ b/src/mem/x86_64.rs @@ -38,17 +38,13 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) { pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) { let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); // Separating the blocks gives the compiler more freedom to reorder instructions. - // It also allows us to trivially skip the rep movsb, which is faster when memcpying - // aligned data. - if pre_byte_count > 0 { - asm!( - "rep movsb", - inout("ecx") pre_byte_count => _, - inout("rdi") dest => dest, - inout("rsi") src => src, - options(att_syntax, nostack, preserves_flags) - ); - } + asm!( + "rep movsb", + inout("ecx") pre_byte_count => _, + inout("rdi") dest => dest, + inout("rsi") src => src, + options(att_syntax, nostack, preserves_flags) + ); asm!( "rep movsq", inout("rcx") qword_count => _, @@ -56,15 +52,13 @@ pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) inout("rsi") src => src, options(att_syntax, nostack, preserves_flags) ); - if byte_count > 0 { - asm!( - "rep movsb", - inout("ecx") byte_count => _, - inout("rdi") dest => _, - inout("rsi") src => _, - options(att_syntax, nostack, preserves_flags) - ); - } + asm!( + "rep movsb", + inout("ecx") byte_count => _, + inout("rdi") dest => _, + inout("rsi") src => _, + options(att_syntax, nostack, preserves_flags) + ); } #[inline(always)] @@ -73,21 +67,16 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { // We can't separate this block due to std/cld asm!( "std", - "test %ecx, %ecx", - "jz 1f", "rep movsb", - "1:", "sub $7, %rsi", "sub $7, %rdi", "mov {qword_count}, %rcx", "rep movsq", "test {pre_byte_count:e}, {pre_byte_count:e}", - "jz 1f", "add $7, %rsi", "add $7, %rdi", "mov {pre_byte_count:e}, %ecx", "rep movsb", - "1:", "cld", pre_byte_count = in(reg) pre_byte_count, qword_count = in(reg) qword_count, @@ -118,17 +107,13 @@ pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { let c = c as u64 * 0x0101_0101_0101_0101; let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count); // Separating the blocks gives the compiler more freedom to reorder instructions. 
- // It also allows us to trivially skip the rep stosb, which is faster when memcpying - // aligned data. - if pre_byte_count > 0 { - asm!( - "rep stosb", - inout("ecx") pre_byte_count => _, - inout("rdi") dest => dest, - in("rax") c, - options(att_syntax, nostack, preserves_flags) - ); - } + asm!( + "rep stosb", + inout("ecx") pre_byte_count => _, + inout("rdi") dest => dest, + in("rax") c, + options(att_syntax, nostack, preserves_flags) + ); asm!( "rep stosq", inout("rcx") qword_count => _, @@ -136,15 +121,13 @@ pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) { in("rax") c, options(att_syntax, nostack, preserves_flags) ); - if byte_count > 0 { - asm!( - "rep stosb", - inout("ecx") byte_count => _, - inout("rdi") dest => _, - in("rax") c, - options(att_syntax, nostack, preserves_flags) - ); - } + asm!( + "rep stosb", + inout("ecx") byte_count => _, + inout("rdi") dest => _, + in("rax") c, + options(att_syntax, nostack, preserves_flags) + ); } #[inline(always)]
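The end state of the series is therefore a destination-aligned split with an
unconditional `rep` run for each phase. As a reference model only — a safe-Rust
sketch of the same three-phase structure, with invented names (`rep_param_model`,
`copy_forward_model`) and slice copies standing in for the `asm!` blocks:

    // Same split as rep_param: head bytes to align dest, whole qwords, tail bytes.
    fn rep_param_model(dest_addr: usize, count: usize) -> (usize, usize, usize) {
        let pre_byte_count = ((8 - (dest_addr & 0b111)) & 0b111).min(count);
        let rest = count - pre_byte_count;
        (pre_byte_count, rest >> 3, rest & 0b111)
    }

    fn copy_forward_model(dest: &mut [u8], src: &[u8]) {
        assert_eq!(dest.len(), src.len());
        let (pre, qwords, tail) = rep_param_model(dest.as_ptr() as usize, dest.len());
        let mid = qwords * 8;
        dest[..pre].copy_from_slice(&src[..pre]);                   // leading "rep movsb" (may be 0 bytes)
        dest[pre..pre + mid].copy_from_slice(&src[pre..pre + mid]); // "rep movsq" over dest-aligned qwords
        dest[pre + mid..].copy_from_slice(&src[pre + mid..]);       // trailing "rep movsb" (may be 0 bytes)
        debug_assert_eq!(pre + mid + tail, dest.len());             // the three phases cover the whole range
    }

    fn main() {
        let src: Vec<u8> = (0..200u8).collect();
        let mut dst = vec![0u8; src.len()];
        copy_forward_model(&mut dst, &src);
        assert_eq!(dst, src);
    }

Only `dest` is aligned here; `src` may stay misaligned, matching the rationale in
the first patch that misaligned writes, not reads, are the expensive case. After
the final patch every phase executes even when its count is zero, trading a little
speed on older CPUs for smaller code.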