Undesirable auto-vectorization on a simple for-loop #102709

Nugine · 2022-10-05T14:17:16Z

I tried this code:

https://rust.godbolt.org/z/5dE9h9G3T

Compiler flags: -C opt-level=3 -C target-feature=+avx2

pub fn check(data: &[u8], lut: &[u8; 256]) -> bool {
    let mut flag = 0;
    for &x in data {
        flag |= lut[x as usize];
    }
    flag != 0xff
}

I expected to see this happen: The function should be compiled to a small scalar loop.

Instead, this happened:

Huge ASM

        movzx   ecx, byte ptr [r9 + rax - 63]
        movzx   ecx, byte ptr [rdx + rcx]
        vmovd   xmm4, ecx
        movzx   ecx, byte ptr [r9 + rax - 62]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 1
        movzx   ecx, byte ptr [r9 + rax - 47]
        movzx   ecx, byte ptr [rdx + rcx]
        vmovd   xmm5, ecx
        movzx   ecx, byte ptr [r9 + rax - 46]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 1
        movzx   ecx, byte ptr [r9 + rax - 31]
        movzx   ecx, byte ptr [rdx + rcx]
        vmovd   xmm6, ecx
        movzx   ecx, byte ptr [r9 + rax - 30]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 1
        movzx   ecx, byte ptr [r9 + rax - 15]
        movzx   ecx, byte ptr [rdx + rcx]
        vmovd   xmm7, ecx
        movzx   ecx, byte ptr [r9 + rax - 14]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 1
        movzx   ecx, byte ptr [r9 + rax - 61]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 2
        movzx   ecx, byte ptr [r9 + rax - 45]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 2
        movzx   ecx, byte ptr [r9 + rax - 29]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 2
        movzx   ecx, byte ptr [r9 + rax - 13]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 2
        movzx   ecx, byte ptr [r9 + rax - 60]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 3
        movzx   ecx, byte ptr [r9 + rax - 44]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 3
        movzx   ecx, byte ptr [r9 + rax - 28]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 3
        movzx   ecx, byte ptr [r9 + rax - 12]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 3
        movzx   ecx, byte ptr [r9 + rax - 59]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 4
        movzx   ecx, byte ptr [r9 + rax - 43]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 4
        movzx   ecx, byte ptr [r9 + rax - 27]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 4
        movzx   ecx, byte ptr [r9 + rax - 11]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 4
        movzx   ecx, byte ptr [r9 + rax - 58]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 5
        movzx   ecx, byte ptr [r9 + rax - 42]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 5
        movzx   ecx, byte ptr [r9 + rax - 26]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 5
        movzx   ecx, byte ptr [r9 + rax - 10]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 5
        movzx   ecx, byte ptr [r9 + rax - 57]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 6
        movzx   ecx, byte ptr [r9 + rax - 41]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 6
        movzx   ecx, byte ptr [r9 + rax - 25]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 6
        movzx   ecx, byte ptr [r9 + rax - 9]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 6
        movzx   ecx, byte ptr [r9 + rax - 56]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 7
        movzx   ecx, byte ptr [r9 + rax - 40]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 7
        movzx   ecx, byte ptr [r9 + rax - 24]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 7
        movzx   ecx, byte ptr [r9 + rax - 8]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 7
        movzx   ecx, byte ptr [r9 + rax - 55]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 8
        movzx   ecx, byte ptr [r9 + rax - 39]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 8
        movzx   ecx, byte ptr [r9 + rax - 23]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 8
        movzx   ecx, byte ptr [r9 + rax - 7]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 8
        movzx   ecx, byte ptr [r9 + rax - 54]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 9
        movzx   ecx, byte ptr [r9 + rax - 38]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 9
        movzx   ecx, byte ptr [r9 + rax - 22]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 9
        movzx   ecx, byte ptr [r9 + rax - 6]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 9
        movzx   ecx, byte ptr [r9 + rax - 53]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 10
        movzx   ecx, byte ptr [r9 + rax - 37]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 10
        movzx   ecx, byte ptr [r9 + rax - 21]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 10
        movzx   ecx, byte ptr [r9 + rax - 5]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 10
        movzx   ecx, byte ptr [r9 + rax - 52]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 11
        movzx   ecx, byte ptr [r9 + rax - 36]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 11
        movzx   ecx, byte ptr [r9 + rax - 20]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 11
        movzx   ecx, byte ptr [r9 + rax - 4]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 11
        movzx   ecx, byte ptr [r9 + rax - 51]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 12
        movzx   ecx, byte ptr [r9 + rax - 35]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 12
        movzx   ecx, byte ptr [r9 + rax - 19]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 12
        movzx   ecx, byte ptr [r9 + rax - 3]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 12
        movzx   ecx, byte ptr [r9 + rax - 50]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 13
        movzx   ecx, byte ptr [r9 + rax - 34]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 13
        movzx   ecx, byte ptr [r9 + rax - 18]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 13
        movzx   ecx, byte ptr [r9 + rax - 2]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 13
        movzx   ecx, byte ptr [r9 + rax - 49]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 14
        movzx   ecx, byte ptr [r9 + rax - 33]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 14
        movzx   ecx, byte ptr [r9 + rax - 17]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 14
        movzx   ecx, byte ptr [r9 + rax - 48]
        vpinsrb xmm4, xmm4, byte ptr [rdx + rcx], 15
        movzx   ecx, byte ptr [r9 + rax - 32]
        vpinsrb xmm5, xmm5, byte ptr [rdx + rcx], 15
        movzx   ecx, byte ptr [r9 + rax - 16]
        vpinsrb xmm6, xmm6, byte ptr [rdx + rcx], 15
        movzx   ecx, byte ptr [r9 + rax - 1]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 14
        movzx   ecx, byte ptr [r9 + rax]
        vpinsrb xmm7, xmm7, byte ptr [rdx + rcx], 15
        vpor    xmm0, xmm4, xmm0
        vpor    xmm1, xmm5, xmm1
        vpor    xmm2, xmm6, xmm2
        vpor    xmm3, xmm7, xmm3
        add     rax, 64
        cmp     r8, rax
        jne     .LBB0_12
        vpor    xmm0, xmm1, xmm0
        vpor    xmm0, xmm2, xmm0
        vpor    xmm0, xmm3, xmm0
        vpshufd xmm1, xmm0, 238
        vpor    xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 85
        vpor    xmm0, xmm0, xmm1
        vpsrld  xmm1, xmm0, 16
        vpor    xmm0, xmm0, xmm1
        vpsrlw  xmm1, xmm0, 8
        vpor    xmm0, xmm0, xmm1
        vmovd   ecx, xmm0
        cmp     r8, rsi
        je      .LBB0_9
        test    sil, 56
        je      .LBB0_15
        ...

Meta

rustc --version --verbose:

rustc 1.64.0 (a55dd71d5 2022-09-19)
binary: rustc
commit-hash: a55dd71d5fb0ec5a6a3a9e8c27b2127ba491ce52
commit-date: 2022-09-19
host: x86_64-unknown-linux-gnu
release: 1.64.0
LLVM version: 14.0.6

rustc 1.66.0-nightly (01af5040f 2022-10-04)
binary: rustc
commit-hash: 01af5040fdada6ef8f1b749cda798d80a8590b2c
commit-date: 2022-10-04
host: x86_64-unknown-linux-gnu
release: 1.66.0-nightly
LLVM version: 15.0.2

The text was updated successfully, but these errors were encountered:

Rageking8 · 2022-10-05T14:43:53Z

@rustbot label +T-compiler +A-codegen +I-heavy +I-slow

leonardo-m · 2022-10-05T17:16:06Z

The function should be compiled to a small scalar loop.

What exactly is the asm should rustc produce here?
(Currently rustc mostly/totally unrolls the loop. That's why the asm is very long.)

the8472 · 2022-10-05T17:42:42Z

-Copt-level=s produces a simple scalar loop without any unrolling. Doing a simple cargo bench with 1MB random data and a randomly initialized LUT shows that the simple loop is a bit faster than the unrolled one.

Nugine · 2022-10-05T18:31:37Z

The function should be compiled to a small scalar loop.

What exactly is the asm should rustc produce here? (Currently rustc mostly/totally unrolls the loop. That's why the asm is very long.)

The unrolled one is ok. But the auto-vectorization looks terrible since there are so many vpinsrb instructions.

scottmcm · 2022-10-15T21:54:43Z

The function should be compiled to a small scalar loop.

It's small with opt-level 1, s, and z. Not getting something small is somewhat expected with opt-level 2 or 3.

But the auto-vectorization looks terrible […]

Note that rustc itself never does any auto-vectorization. So I would encourage you to file this as an opt issue with an example of LLVM IR input (which you might get from running rustc, but LLVM devs don't like reproducers that need Rust) at https://github.com/llvm/llvm-project/issues/new/choose, as it's LLVM that makes such decisions. (Everything related to the exact instructions emitted is up to LLVM, outside inline ASM.)

Nugine · 2022-10-16T02:41:29Z

Upstream:

Undesirable auto-vectorization on a simple for-loop llvm/llvm-project#58387

Nugine added the C-bug Category: This is a bug. label Oct 5, 2022

Nugine mentioned this issue Oct 5, 2022

Tracking issue for unexpected performance degradation Nugine/simd#14

Open

Nugine changed the title ~~Incorrect auto-vectorization on a simple for-loop~~ Undesirable auto-vectorization on a simple for-loop Oct 16, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Undesirable auto-vectorization on a simple for-loop #102709

Undesirable auto-vectorization on a simple for-loop #102709

Nugine commented Oct 5, 2022

Rageking8 commented Oct 5, 2022

leonardo-m commented Oct 5, 2022

the8472 commented Oct 5, 2022

Nugine commented Oct 5, 2022

scottmcm commented Oct 15, 2022 •

edited

Loading

Nugine commented Oct 16, 2022

Undesirable auto-vectorization on a simple for-loop #102709

Undesirable auto-vectorization on a simple for-loop #102709

Comments

Nugine commented Oct 5, 2022

Meta

Rageking8 commented Oct 5, 2022

leonardo-m commented Oct 5, 2022

the8472 commented Oct 5, 2022

Nugine commented Oct 5, 2022

scottmcm commented Oct 15, 2022 • edited Loading

Nugine commented Oct 16, 2022

scottmcm commented Oct 15, 2022 •

edited

Loading