From 153137d1c2d01f36a7e99d50c8b6fd4f8805e71f Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Wed, 18 Sep 2024 20:43:09 +0900 Subject: [PATCH] [AMDGPU] Add hazard workarounds to insertIndirectBranch BranchRelaxation runs after the hazard recognizer, so workarounds for SGPR accesses need to be applied directly inline to the code it generates. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 13 + llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 937 ++++++++++++++++++ 2 files changed, 950 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 30aa36be99c95f..25e73b48f31c41 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2873,9 +2873,20 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, auto I = MBB.end(); + // Note: as this is used after hazard recognizer we need to apply some hazard + // workarounds directly. + const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) || + ST.hasVALUReadSGPRHazard(); + auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() { + if (FlushSGPRWrites) + BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); + }; + // We need to compute the offset relative to the instruction immediately after // s_getpc_b64. Insert pc arithmetic code before last terminator. MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); + ApplyHazardWorkarounds(); auto &MCCtx = MF->getContext(); MCSymbol *PostGetPCLabel = @@ -2890,10 +2901,12 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, .addReg(PCReg, RegState::Define, AMDGPU::sub0) .addReg(PCReg, 0, AMDGPU::sub0) .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); + ApplyHazardWorkarounds(); BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub1) .addReg(PCReg, 0, AMDGPU::sub1) .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); + ApplyHazardWorkarounds(); // Insert the indirect branch after the other terminator. BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 635f3e4886b875..41023ac5ae4e48 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -amdgpu-s-branch-bits=5 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-s-branch-bits=5 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s ; FIXME: We should use llvm-mc for this, but we can't even parse our own output. @@ -42,6 +44,71 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_conditional_max_short_forward_branch: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB0_1 +; GFX11-NEXT: ; %bb.3: ; %bb +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: .Lpost_getpc0: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s4, s4, (.LBB0_2-.Lpost_getpc0)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s5, s5, (.LBB0_2-.Lpost_getpc0)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[4:5] +; GFX11-NEXT: .LBB0_1: ; %bb2 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_sleep 0 +; GFX11-NEXT: .LBB0_2: ; %bb3 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: uniform_conditional_max_short_forward_branch: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s0, 0 +; GFX12-NEXT: s_cbranch_scc0 .LBB0_1 +; GFX12-NEXT: ; %bb.3: ; %bb +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: .Lpost_getpc0: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s4, s4, (.LBB0_2-.Lpost_getpc0)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, (.LBB0_2-.Lpost_getpc0)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[4:5] +; GFX12-NEXT: .LBB0_1: ; %bb2 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_sleep 0 +; GFX12-NEXT: .LBB0_2: ; %bb3 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm bb: %cmp = icmp eq i32 %cnd, 0 br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch @@ -89,6 +156,71 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_conditional_min_long_forward_branch: +; GFX11: ; %bb.0: ; %bb0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_1 +; GFX11-NEXT: ; %bb.3: ; %bb0 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: .Lpost_getpc1: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s4, s4, (.LBB1_2-.Lpost_getpc1)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s5, s5, (.LBB1_2-.Lpost_getpc1)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[4:5] +; GFX11-NEXT: .LBB1_1: ; %bb2 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: .LBB1_2: ; %bb3 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: uniform_conditional_min_long_forward_branch: +; GFX12: ; %bb.0: ; %bb0 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s0, 0 +; GFX12-NEXT: s_cbranch_scc0 .LBB1_1 +; GFX12-NEXT: ; %bb.3: ; %bb0 +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: .Lpost_getpc1: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s4, s4, (.LBB1_2-.Lpost_getpc1)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, (.LBB1_2-.Lpost_getpc1)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[4:5] +; GFX12-NEXT: .LBB1_1: ; %bb2 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: .LBB1_2: ; %bb3 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm bb0: %cmp = icmp eq i32 %cnd, 0 br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch @@ -138,6 +270,78 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_conditional_min_long_forward_vcnd_branch: +; GFX11: ; %bb.0: ; %bb0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_f32_e64 s[4:5], s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX11-NEXT: s_cbranch_vccz .LBB2_1 +; GFX11-NEXT: ; %bb.3: ; %bb0 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: .Lpost_getpc2: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s4, s4, (.LBB2_2-.Lpost_getpc2)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s5, s5, (.LBB2_2-.Lpost_getpc2)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[4:5] +; GFX11-NEXT: .LBB2_1: ; %bb2 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; 32 bytes +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: .LBB2_2: ; %bb3 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: uniform_conditional_min_long_forward_vcnd_branch: +; GFX12: ; %bb.0: ; %bb0 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_f32 s0, 0 +; GFX12-NEXT: s_cselect_b32 s1, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_vccz .LBB2_1 +; GFX12-NEXT: ; %bb.3: ; %bb0 +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: .Lpost_getpc2: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s4, s4, (.LBB2_2-.Lpost_getpc2)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, (.LBB2_2-.Lpost_getpc2)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[4:5] +; GFX12-NEXT: .LBB2_1: ; %bb2 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: ; 32 bytes +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: .LBB2_2: ; %bb3 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm bb0: %cmp = fcmp oeq float %cnd, 0.0 br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch @@ -193,6 +397,85 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: min_long_forward_vbranch: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s[2:3], s0, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s[2:3] +; GFX11-NEXT: s_mov_b64 s[0:1], exec +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_cbranch_execnz .LBB3_1 +; GFX11-NEXT: ; %bb.3: ; %bb +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: .Lpost_getpc3: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s2, s2, (.LBB3_2-.Lpost_getpc3)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s3, s3, (.LBB3_2-.Lpost_getpc3)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[2:3] +; GFX11-NEXT: .LBB3_1: ; %bb2 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; 32 bytes +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: .LBB3_2: ; %bb3 +; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: min_long_forward_vbranch: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX12-NEXT: s_cbranch_execnz .LBB3_1 +; GFX12-NEXT: ; %bb.3: ; %bb +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: .Lpost_getpc3: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s2, s2, (.LBB3_2-.Lpost_getpc3)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, (.LBB3_2-.Lpost_getpc3)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[2:3] +; GFX12-NEXT: .LBB3_1: ; %bb2 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: ; 32 bytes +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: .LBB3_2: ; %bb3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = zext i32 %tid to i64 @@ -237,6 +520,61 @@ define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: s_setpc_b64 s[2:3] ; GCN-NEXT: .LBB4_2: ; %bb3 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: long_backward_sbranch: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB4_1: ; %bb2 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_cmp_lt_i32 s0, 10 +; GFX11-NEXT: s_cbranch_scc0 .LBB4_2 +; GFX11-NEXT: ; %bb.3: ; %bb2 +; GFX11-NEXT: ; in Loop: Header=BB4_1 Depth=1 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: .Lpost_getpc4: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s2, s2, (.LBB4_1-.Lpost_getpc4)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s3, s3, (.LBB4_1-.Lpost_getpc4)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[2:3] +; GFX11-NEXT: .LBB4_2: ; %bb3 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: long_backward_sbranch: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB4_1: ; %bb2 +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_co_i32 s0, s0, 1 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_cmp_lt_i32 s0, 10 +; GFX12-NEXT: s_cbranch_scc0 .LBB4_2 +; GFX12-NEXT: ; %bb.3: ; %bb2 +; GFX12-NEXT: ; in Loop: Header=BB4_1 Depth=1 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: .Lpost_getpc4: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s2, s2, (.LBB4_1-.Lpost_getpc4)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, (.LBB4_1-.Lpost_getpc4)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[2:3] +; GFX12-NEXT: .LBB4_2: ; %bb3 +; GFX12-NEXT: s_endpgm bb: br label %bb2 @@ -311,6 +649,125 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: s_add_u32 s0, s0, (.LBB5_3-.Lpost_getpc4)&4294967295 ; GCN-NEXT: s_addc_u32 s1, s1, (.LBB5_3-.Lpost_getpc4)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] +; +; GFX11-LABEL: uniform_unconditional_min_long_forward_branch: +; GFX11: ; %bb.0: ; %bb0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-NEXT: s_cbranch_scc0 .LBB5_1 +; GFX11-NEXT: ; %bb.7: ; %bb0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: .Lpost_getpc6: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s0, s0, (.LBB5_4-.Lpost_getpc6)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB5_4-.Lpost_getpc6)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[0:1] +; GFX11-NEXT: .LBB5_1: ; %Flow +; GFX11-NEXT: s_and_not1_b64 vcc, exec, s[0:1] +; GFX11-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX11-NEXT: .LBB5_2: ; %bb2 +; GFX11-NEXT: v_mov_b32_e32 v0, 17 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB5_3: ; %bb4 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 63 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB5_4: ; %bb3 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_cbranch_execnz .LBB5_5 +; GFX11-NEXT: ; %bb.9: ; %bb3 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: .Lpost_getpc7: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc7)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc7)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[0:1] +; GFX11-NEXT: .LBB5_5: ; %bb3 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: .Lpost_getpc5: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s0, s0, (.LBB5_3-.Lpost_getpc5)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB5_3-.Lpost_getpc5)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[0:1] +; +; GFX12-LABEL: uniform_unconditional_min_long_forward_branch: +; GFX12: ; %bb.0: ; %bb0 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x2c +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s0, 0 +; GFX12-NEXT: s_mov_b32 s0, -1 +; GFX12-NEXT: s_cbranch_scc0 .LBB5_1 +; GFX12-NEXT: ; %bb.7: ; %bb0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: .Lpost_getpc6: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s0, s0, (.LBB5_4-.Lpost_getpc6)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, (.LBB5_4-.Lpost_getpc6)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[0:1] +; GFX12-NEXT: .LBB5_1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_vccnz .LBB5_3 +; GFX12-NEXT: .LBB5_2: ; %bb2 +; GFX12-NEXT: v_mov_b32_e32 v0, 17 +; GFX12-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: .LBB5_3: ; %bb4 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; GFX12-NEXT: .LBB5_4: ; %bb3 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_cbranch_execnz .LBB5_5 +; GFX12-NEXT: ; %bb.9: ; %bb3 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: .Lpost_getpc7: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s0, s0, (.LBB5_2-.Lpost_getpc7)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, (.LBB5_2-.Lpost_getpc7)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[0:1] +; GFX12-NEXT: .LBB5_5: ; %bb3 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: .Lpost_getpc5: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s0, s0, (.LBB5_3-.Lpost_getpc5)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, (.LBB5_3-.Lpost_getpc5)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[0:1] bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -356,6 +813,57 @@ define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr ad ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB6_2: ; %DummyReturnBlock ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_unconditional_min_long_backward_branch: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_and_b64 vcc, exec, -1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB6_1: ; %loop +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_cbranch_vccz .LBB6_2 +; GFX11-NEXT: ; %bb.3: ; %loop +; GFX11-NEXT: ; in Loop: Header=BB6_1 Depth=1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: .Lpost_getpc8: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc8)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc8)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[0:1] +; GFX11-NEXT: .LBB6_2: ; %DummyReturnBlock +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: uniform_unconditional_min_long_backward_branch: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX12-NEXT: .LBB6_1: ; %loop +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_cbranch_vccz .LBB6_2 +; GFX12-NEXT: ; %bb.3: ; %loop +; GFX12-NEXT: ; in Loop: Header=BB6_1 Depth=1 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: .Lpost_getpc8: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s0, s0, (.LBB6_1-.Lpost_getpc8)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, (.LBB6_1-.Lpost_getpc8)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[0:1] +; GFX12-NEXT: .LBB6_2: ; %DummyReturnBlock +; GFX12-NEXT: s_endpgm entry: br label %loop @@ -410,6 +918,92 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: expand_requires_expand: +; GFX11: ; %bb.0: ; %bb0 +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lt_i32 s0, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX11-NEXT: s_cbranch_vccnz .LBB7_2 +; GFX11-NEXT: ; %bb.1: ; %bb1 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s0, 3 +; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX11-NEXT: .LBB7_2: ; %Flow +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 vcc, exec, s[0:1] +; GFX11-NEXT: s_cbranch_vccz .LBB7_3 +; GFX11-NEXT: ; %bb.5: ; %Flow +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: .Lpost_getpc9: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc9)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc9)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[0:1] +; GFX11-NEXT: .LBB7_3: ; %bb2 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: .LBB7_4: ; %bb3 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: expand_requires_expand: +; GFX12: ; %bb.0: ; %bb0 +; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_lt_i32 s0, 0 +; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_vccnz .LBB7_2 +; GFX12-NEXT: ; %bb.1: ; %bb1 +; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_lg_u32 s0, 3 +; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: .LBB7_2: ; %Flow +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_vccz .LBB7_3 +; GFX12-NEXT: ; %bb.5: ; %Flow +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: .Lpost_getpc9: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s0, s0, (.LBB7_4-.Lpost_getpc9)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, (.LBB7_4-.Lpost_getpc9)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[0:1] +; GFX12-NEXT: .LBB7_3: ; %bb2 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: .LBB7_4: ; %bb3 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_endpgm bb0: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %cmp0 = icmp slt i32 %cond0, 0 @@ -470,6 +1064,56 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_sleep 5 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: uniform_inside_divergent: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_mov_b64 s[0:1], exec +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmpx_gt_u32_e32 16, v0 +; GFX11-NEXT: s_cbranch_execz .LBB8_3 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: global_store_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_cbranch_scc1 .LBB8_3 +; GFX11-NEXT: ; %bb.2: ; %if_uniform +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: .LBB8_3: ; %endif +; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11-NEXT: s_sleep 5 +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: uniform_inside_divergent: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_mov_b32 s4, exec_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_gt_u32_e32 16, v0 +; GFX12-NEXT: s_cbranch_execz .LBB8_3 +; GFX12-NEXT: ; %bb.1: ; %if +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_lg_u32 s2, 0 +; GFX12-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX12-NEXT: s_cbranch_scc1 .LBB8_3 +; GFX12-NEXT: ; %bb.2: ; %if_uniform +; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: .LBB8_3: ; %endif +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_sleep 5 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %d_cmp = icmp ult i32 %tid, 16 @@ -543,6 +1187,113 @@ define amdgpu_kernel void @analyze_mask_branch() #0 { ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB9_5: ; %UnifiedReturnBlock ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: analyze_mask_branch: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b64 s[0:1], exec +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_mov_b32_e64 v0, 0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_cmpx_nlt_f32_e32 0, v0 +; GFX11-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %ret +; GFX11-NEXT: v_mov_b32_e32 v0, 7 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB9_2: ; %Flow1 +; GFX11-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] +; GFX11-NEXT: s_cbranch_execnz .LBB9_3 +; GFX11-NEXT: ; %bb.6: ; %Flow1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: .Lpost_getpc10: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc10)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc10)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[0:1] +; GFX11-NEXT: .LBB9_3: ; %loop.preheader +; GFX11-NEXT: s_and_b64 vcc, exec, 0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB9_4: ; %loop +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_cbranch_vccnz .LBB9_5 +; GFX11-NEXT: ; %bb.8: ; %loop +; GFX11-NEXT: ; in Loop: Header=BB9_4 Depth=1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: .Lpost_getpc11: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc11)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc11)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[0:1] +; GFX11-NEXT: .LBB9_5: ; %UnifiedReturnBlock +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: analyze_mask_branch: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_mov_b32_e64 v0, 0 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: v_cmpx_nlt_f32_e32 0, v0 +; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execz .LBB9_2 +; GFX12-NEXT: ; %bb.1: ; %ret +; GFX12-NEXT: v_mov_b32_e32 v0, 7 +; GFX12-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: .LBB9_2: ; %Flow1 +; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB9_3 +; GFX12-NEXT: ; %bb.6: ; %Flow1 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: .Lpost_getpc10: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s0, s0, (.LBB9_5-.Lpost_getpc10)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, (.LBB9_5-.Lpost_getpc10)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[0:1] +; GFX12-NEXT: .LBB9_3: ; %loop.preheader +; GFX12-NEXT: s_mov_b32 vcc_lo, 0 +; GFX12-NEXT: .LBB9_4: ; %loop +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_cbranch_vccnz .LBB9_5 +; GFX12-NEXT: ; %bb.8: ; %loop +; GFX12-NEXT: ; in Loop: Header=BB9_4 Depth=1 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: .Lpost_getpc11: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s0, s0, (.LBB9_4-.Lpost_getpc11)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, (.LBB9_4-.Lpost_getpc11)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[0:1] +; GFX12-NEXT: .LBB9_5: ; %UnifiedReturnBlock +; GFX12-NEXT: s_endpgm entry: %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"() %cmp0 = fcmp ogt float %reg, 0.000000e+00 @@ -634,6 +1385,192 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: long_branch_hang: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX11-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX11-NEXT: s_cmp_lt_i32 s7, 6 +; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 +; GFX11-NEXT: ; %bb.8: ; %bb +; GFX11-NEXT: s_getpc_b64 s[8:9] +; GFX11-NEXT: .Lpost_getpc12: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc12)&4294967295 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc12)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[8:9] +; GFX11-NEXT: .LBB10_1: ; %bb13 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_cbranch_execz .LBB10_3 +; GFX11-NEXT: s_branch .LBB10_4 +; GFX11-NEXT: .LBB10_2: +; GFX11-NEXT: s_mov_b64 s[8:9], 0 +; GFX11-NEXT: .LBB10_3: ; %bb9 +; GFX11-NEXT: s_cmp_lt_i32 s7, 11 +; GFX11-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX11-NEXT: s_cmp_ge_i32 s6, s7 +; GFX11-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b64 s[8:9], s[10:11], s[8:9] +; GFX11-NEXT: .LBB10_4: ; %Flow5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 vcc, exec, s[8:9] +; GFX11-NEXT: s_cbranch_vccnz .LBB10_6 +; GFX11-NEXT: ; %bb.5: ; %bb14 +; GFX11-NEXT: s_cmp_lt_i32 s5, 9 +; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11-NEXT: s_cmp_lt_i32 s6, s7 +; GFX11-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX11-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX11-NEXT: s_branch .LBB10_7 +; GFX11-NEXT: .LBB10_6: +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: .LBB10_7: ; %bb19 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: long_branch_hang: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x2c +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_cmp_eq_u32 s4, 0 +; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: s_cmp_lg_u32 s4, 0 +; GFX12-NEXT: s_mov_b32 s4, -1 +; GFX12-NEXT: s_cselect_b32 s8, -1, 0 +; GFX12-NEXT: s_cmp_lt_i32 s7, 6 +; GFX12-NEXT: s_cbranch_scc0 .LBB10_1 +; GFX12-NEXT: ; %bb.18: ; %bb +; GFX12-NEXT: s_getpc_b64 s[10:11] +; GFX12-NEXT: .Lpost_getpc17: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s10, s10, (.LBB10_4-.Lpost_getpc17)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s11, s11, (.LBB10_4-.Lpost_getpc17)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[10:11] +; GFX12-NEXT: .LBB10_1: ; %Flow +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_vccnz .LBB10_2 +; GFX12-NEXT: ; %bb.10: ; %Flow +; GFX12-NEXT: s_getpc_b64 s[8:9] +; GFX12-NEXT: .Lpost_getpc13: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s8, s8, (.LBB10_5-.Lpost_getpc13)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s9, s9, (.LBB10_5-.Lpost_getpc13)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[8:9] +; GFX12-NEXT: .LBB10_2: ; %Flow5 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_vccz .LBB10_3 +; GFX12-NEXT: ; %bb.12: ; %Flow5 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: .Lpost_getpc14: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s0, s0, (.LBB10_6-.Lpost_getpc14)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, (.LBB10_6-.Lpost_getpc14)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[0:1] +; GFX12-NEXT: .LBB10_3: ; %bb14 +; GFX12-NEXT: s_cmp_lt_i32 s5, 9 +; GFX12-NEXT: s_cselect_b32 s1, -1, 0 +; GFX12-NEXT: s_cmp_lt_i32 s6, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_or_b32 s1, s4, s1 +; GFX12-NEXT: s_and_b32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-NEXT: ; %bb.8: ; %bb14 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: .Lpost_getpc12: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s0, s0, (.LBB10_7-.Lpost_getpc12)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, (.LBB10_7-.Lpost_getpc12)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[0:1] +; GFX12-NEXT: .LBB10_4: ; %bb13 +; GFX12-NEXT: s_mov_b32 s1, s8 +; GFX12-NEXT: ;;#ASMSTART +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: v_nop_e64 +; GFX12-NEXT: ;;#ASMEND +; GFX12-NEXT: s_cbranch_execz .LBB10_5 +; GFX12-NEXT: ; %bb.14: ; %bb13 +; GFX12-NEXT: s_getpc_b64 s[8:9] +; GFX12-NEXT: .Lpost_getpc15: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s8, s8, (.LBB10_2-.Lpost_getpc15)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s9, s9, (.LBB10_2-.Lpost_getpc15)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[8:9] +; GFX12-NEXT: .LBB10_5: ; %bb9 +; GFX12-NEXT: s_cmp_lt_i32 s7, 11 +; GFX12-NEXT: s_cselect_b32 s1, -1, 0 +; GFX12-NEXT: s_cmp_ge_i32 s6, s7 +; GFX12-NEXT: s_cselect_b32 s4, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s1, s4, s1 +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_vccnz .LBB10_6 +; GFX12-NEXT: ; %bb.16: ; %bb9 +; GFX12-NEXT: s_getpc_b64 s[8:9] +; GFX12-NEXT: .Lpost_getpc16: +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_u32 s8, s8, (.LBB10_3-.Lpost_getpc16)&4294967295 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s9, s9, (.LBB10_3-.Lpost_getpc16)>>32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_setpc_b64 s[8:9] +; GFX12-NEXT: .LBB10_6: +; GFX12-NEXT: ; implicit-def: $vgpr0 +; GFX12-NEXT: .LBB10_7: ; %bb19 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x3c +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm bb: %tmp = icmp slt i32 %arg2, 9 %tmp6 = icmp eq i32 %arg1, 0