From 133772315c8481f75f3ab03863b018364d233573 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 26 Feb 2025 17:47:09 +0700 Subject: [PATCH 1/3] AMDGPU: Start to use AV classes for unknown vector class Use AGPR+VGPR superclasses for gfx90a+. The type used for the class should be the broadest possible class, to be contextually restricted later. InstrEmitter clamps these to the common subclass of the context use instructions, so we're best off using the broadest possible class for all types. Note this does very little because we only use VGPR classes for FP types (though this doesn't particularly make any sense), and we legalize normal loads and stores to integer. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 41 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 11 + llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 4 + .../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 560 +++++++++--------- .../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 280 ++++----- llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 8 +- .../AMDGPU/av-split-dead-valno-crash.ll | 39 +- .../CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll | 16 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 64 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 64 +- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 144 +++-- .../CodeGen/AMDGPU/global-atomic-fadd.f64.ll | 28 +- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 16 +- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 16 +- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 24 +- llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 2 - 16 files changed, 662 insertions(+), 655 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 768c0abd2e3f1..29346781c3925 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -96,59 +96,65 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); const SIRegisterInfo *TRI = STI.getRegisterInfo(); - const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); + const TargetRegisterClass *V64RegClass = + TRI->getDefaultVectorSuperClassForBitWidth(64); addRegisterClass(MVT::f64, V64RegClass); addRegisterClass(MVT::v2f32, V64RegClass); addRegisterClass(MVT::Untyped, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); + addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96)); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128)); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); + addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160)); addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); - addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass); + addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192)); addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); - addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass); + addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224)); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256)); addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); - addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass); + addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288)); addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); - addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass); + addRegisterClass(MVT::v10f32, + TRI->getDefaultVectorSuperClassForBitWidth(320)); addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); - addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass); + addRegisterClass(MVT::v11f32, + TRI->getDefaultVectorSuperClassForBitWidth(352)); addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); - addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass); + addRegisterClass(MVT::v12f32, + TRI->getDefaultVectorSuperClassForBitWidth(384)); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, + TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512)); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v16f64, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { if (Subtarget->useRealTrue16Insts()) { @@ -180,7 +186,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, + TRI->getDefaultVectorSuperClassForBitWidth(1024)); computeRegisterProperties(Subtarget->getRegisterInfo()); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ecf3aee6048cd..0a5b51a33f02d 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3557,6 +3557,17 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { : getAnyVectorSuperClassForBitWidth(BitWidth); } +const TargetRegisterClass * +SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const { + // TODO: In principle this should use AV classes for gfx908 too. This is + // limited to 90a+ to avoid regressing special case copy optimizations which + // need new handling. The core issue is that it's not possible to directly + // copy between AGPRs on gfx908, and the current optimizations around that + // expect to see copies to VGPR. + return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth) + : getVGPRClassForBitWidth(BitWidth); +} + const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { if (BitWidth == 16 || BitWidth == 32) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index cd4dc9bc4d037..365a76d4a5006 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -215,6 +215,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { const TargetRegisterClass * getVectorSuperClassForBitWidth(unsigned BitWidth) const; + LLVM_READONLY + const TargetRegisterClass * + getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const; + LLVM_READONLY static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth); diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index 003aa049b2d1b..c2d79fdae1208 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -10009,10 +10009,10 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base -; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ; def v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc @@ -10020,39 +10020,39 @@ define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_cbranch_execz .LBB128_6 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v5 ; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB128_3 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off sc0 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB128_3: ; %Flow ; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] ; GFX950-NEXT: s_cbranch_execz .LBB128_5 ; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] -; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-NEXT: .LBB128_5: ; %Flow1 ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-NEXT: .LBB128_6: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB128_8 ; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc -; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc +; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: .LBB128_8: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] @@ -10095,14 +10095,14 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB129_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB129_4: ; %Flow3 @@ -10154,14 +10154,14 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB129_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: .LBB129_4: ; %Flow3 @@ -10195,51 +10195,51 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fsub_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB130_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB130_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB130_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: .LBB130_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB130_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB130_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10249,48 +10249,48 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB130_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB130_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB130_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB130_4: ; %Flow3 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB130_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB130_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -10710,14 +10710,14 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB135_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB135_4: ; %Flow2 @@ -10778,14 +10778,14 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB135_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB135_4: ; %Flow2 @@ -10823,59 +10823,59 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmaximum_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB136_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB136_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB136_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB136_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB136_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB136_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10885,58 +10885,58 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB136_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB136_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB136_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB136_4: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB136_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB136_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -10978,14 +10978,14 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB137_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: .LBB137_4: ; %Flow2 @@ -11046,14 +11046,14 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB137_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-NEXT: .LBB137_4: ; %Flow2 @@ -11091,59 +11091,59 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fminimum_f64_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB138_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB138_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB138_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: .LBB138_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB138_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB138_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -11153,58 +11153,58 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_mov_b64 s[2:3], 0x50 ; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[2:3] +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB138_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB138_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB138_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: .LBB138_4: ; %Flow2 ; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-NEXT: s_cbranch_execz .LBB138_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private -; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-NEXT: .LBB138_6: ; %atomicrmw.phi ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -18393,14 +18393,14 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB237_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB237_6 ; GFX90A-NEXT: .LBB237_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -18451,14 +18451,14 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB237_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB237_6 ; GFX950-NEXT: .LBB237_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -18496,43 +18496,43 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB238_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB238_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB238_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB238_6 ; GFX90A-NEXT: .LBB238_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB238_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1] -; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB238_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -18547,40 +18547,40 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB238_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: .LBB238_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB238_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB238_6 ; GFX950-NEXT: .LBB238_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB238_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1] -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB238_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -18988,14 +18988,14 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB243_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB243_6 ; GFX90A-NEXT: .LBB243_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19055,14 +19055,14 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB243_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB243_6 ; GFX950-NEXT: .LBB243_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19104,51 +19104,51 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB244_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB244_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB244_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB244_6 ; GFX90A-NEXT: .LBB244_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB244_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB244_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -19163,50 +19163,50 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB244_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB244_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB244_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB244_6 ; GFX950-NEXT: .LBB244_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB244_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB244_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] @@ -19248,14 +19248,14 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB245_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_branch .LBB245_6 ; GFX90A-NEXT: .LBB245_4: ; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19315,14 +19315,14 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB245_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_branch .LBB245_6 ; GFX950-NEXT: .LBB245_4: ; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 @@ -19364,51 +19364,51 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_cbranch_vccz .LBB246_4 ; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: .LBB246_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB246_2 ; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_branch .LBB246_6 ; GFX90A-NEXT: .LBB246_4: -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_cbranch_execz .LBB246_6 ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 ; GFX90A-NEXT: v_mov_b32_e32 v6, s4 -; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: .LBB246_6: ; %atomicrmw.phi ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -19423,50 +19423,50 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_cbranch_vccz .LBB246_4 ; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: .LBB246_2: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] -; GFX950-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc -; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB246_2 ; GFX950-NEXT: ; %bb.3: ; %Flow ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_branch .LBB246_6 ; GFX950-NEXT: .LBB246_4: -; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-NEXT: s_cbranch_execz .LBB246_6 ; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private ; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-NEXT: s_cselect_b32 s0, s0, -1 -; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 ; GFX950-NEXT: .LBB246_6: ; %atomicrmw.phi ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index 34a4899123749..b95709611999d 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -6740,14 +6740,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB129_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -6770,14 +6770,14 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB129_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -6796,23 +6796,23 @@ define void @global_atomic_fsub_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -v[6:7] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB130_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -6822,23 +6822,23 @@ define void @global_atomic_fsub_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] -; GFX950-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0 +; GFX950-NEXT: v_add_f64 v[2:3], v[4:5], -v[6:7] +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB130_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -7020,14 +7020,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB135_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -7055,14 +7055,14 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB135_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7080,28 +7080,28 @@ define void @global_atomic_fmaximum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB136_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[10:11], v[2:3] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB136_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7110,29 +7110,29 @@ define void @global_atomic_fmaximum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB136_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[4:5] -; GFX950-NEXT: v_max_f64 v[4:5], v[10:11], v[2:3] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] +; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB136_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -7164,14 +7164,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB137_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -7199,14 +7199,14 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB137_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -7224,28 +7224,28 @@ define void @global_atomic_fminimum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[6:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[4:5], v[10:11], v[2:3] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] -; GFX90A-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 glc +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB138_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ; use v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7254,29 +7254,29 @@ define void @global_atomic_fminimum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ; def v[6:7] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB138_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[4:5] -; GFX950-NEXT: v_min_f64 v[4:5], v[10:11], v[2:3] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] +; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB138_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ; use v[2:3] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12536,14 +12536,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB237_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -12567,14 +12567,14 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB237_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -12590,54 +12590,54 @@ define void @global_atomic_fsub_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fsub_f64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB238_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB238_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fsub_f64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB238_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB238_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12826,14 +12826,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB243_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -12862,14 +12862,14 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB243_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -12885,63 +12885,63 @@ define void @global_atomic_fmaximum_f64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fmaximum_f64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB244_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB244_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_f64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB244_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_max_f64 v[2:3], v[8:9], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB244_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12974,14 +12974,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB245_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a[0:1] ; GFX90A-NEXT: ;;#ASMEND @@ -13010,14 +13010,14 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB245_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a[0:1] ; GFX950-NEXT: ;;#ASMEND @@ -13033,63 +13033,63 @@ define void @global_atomic_fminimum_f64_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fminimum_f64_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB246_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[8:9], v[0:1] -; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] -; GFX90A-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB246_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ; use v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_f64_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ; def v[4:5] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB246_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] -; GFX950-NEXT: v_min_f64 v[2:3], v[8:9], v[0:1] -; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] ; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc -; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc -; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB246_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ; use v[0:1] ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 3c316f4acedb7..430f50e26fc05 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -836,7 +836,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: s_getpc_b64 s[4:5] ; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v2, s[4:5] +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v2, s[4:5] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: s_branch .LBB5_7 ; GFX90A-NEXT: .LBB5_6: ; %Flow @@ -846,7 +846,6 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: .LBB5_7: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX90A-NEXT: s_getpc_b64 s[6:7] ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 @@ -856,6 +855,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB5_7 ; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end1 @@ -926,7 +926,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: s_getpc_b64 s[0:1] ; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 ; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v2, s[0:1] +; GFX942-NEXT: global_load_dwordx2 v[4:5], v2, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: s_branch .LBB5_7 ; GFX942-NEXT: .LBB5_6: ; %Flow @@ -936,7 +936,6 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: .LBB5_7: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] ; GFX942-NEXT: s_getpc_b64 s[2:3] ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 @@ -946,6 +945,7 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5] ; GFX942-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB5_7 ; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end1 diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index 37040123ee20c..42f76c4a10d2a 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0 ; CHECK-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6 ; CHECK-NEXT: v_mov_b32_e32 v20, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 @@ -16,12 +16,10 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_bitcmp1_b32 s0, 8 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v1 ; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: v_accvgpr_write_b32 a2, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 ; CHECK-NEXT: v_mov_b32_e32 v2, 0xa17f65f6 ; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f ; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90 @@ -37,8 +35,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883 ; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4 ; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0x57b87036 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fb3b136 +; CHECK-NEXT: v_mov_b32_e32 v16, 0x57b87036 +; CHECK-NEXT: v_mov_b32_e32 v17, 0x3fb3b136 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523 ; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555 @@ -64,10 +62,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15] ; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25] -; CHECK-NEXT: v_accvgpr_read_b32 v27, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a2 +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[0:1] ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[2:3] -; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[0:1] ; CHECK-NEXT: v_accvgpr_write_b32 a0, 0 ; CHECK-NEXT: v_accvgpr_write_b32 a1, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -85,9 +81,10 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[14:15] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] -; CHECK-NEXT: v_fmac_f64_e32 v[16:17], 0, v[28:29] -; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[18:19] -; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[16:17] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[16:17] +; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[18:19] +; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] ; CHECK-NEXT: s_branch .LBB0_6 ; CHECK-NEXT: .LBB0_5: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 @@ -96,8 +93,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329 ; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a0 +; CHECK-NEXT: v_accvgpr_read_b32 v27, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a0 ; CHECK-NEXT: s_mov_b64 s[24:25], -1 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_mov_b64 vcc, s[2:3] @@ -112,9 +109,9 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v28 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v29 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1 @@ -132,13 +129,13 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_cbranch_vccz .LBB0_13 ; CHECK-NEXT: ; %bb.12: ; %._crit_edge2105.i.i.i2330.loopexit ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[28:29] +; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[26:27] ; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17] ; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17] -; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9] -; CHECK-NEXT: v_mov_b32_e32 v17, v16 +; CHECK-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[8:9] +; CHECK-NEXT: v_mov_b32_e32 v27, v26 ; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17] -; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13] +; CHECK-NEXT: global_store_dwordx2 v20, v[26:27], s[12:13] ; CHECK-NEXT: s_cselect_b32 s23, s23, 0 ; CHECK-NEXT: s_cselect_b32 s22, s22, 0 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll index 2ce54f8a463c7..30536b18674b5 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll @@ -108,7 +108,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 @@ -136,7 +136,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 @@ -164,7 +164,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 @@ -194,7 +194,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 @@ -340,7 +340,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1 @@ -374,7 +374,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr a ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1 @@ -408,7 +408,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr a ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1 @@ -444,7 +444,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]] - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:av_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec ; GFX90A_GFX942-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 59b0537b817d2..db141d6bb3d9f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -5415,54 +5415,50 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_6 -; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_4 -; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB24_2 -; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index c9c9f332fe391..e4dca8481ee12 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -5415,54 +5415,50 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow2 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_6 -; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB24_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB24_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB24_4: ; %atomicrmw.start +; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] -; GFX90A-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB24_4 -; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB24_4: ; %Flow2 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB24_2 -; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_cbranch_execz .LBB24_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 587c2ea885077..12f429454fc42 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -3295,12 +3295,11 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB16_4 ; GFX942-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX942-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 @@ -3308,6 +3307,7 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB16_2 ; GFX942-NEXT: ; %bb.3: ; %Flow @@ -3433,51 +3433,47 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 { ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_mov_b32_e32 v4, v0 -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 -; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_3 -; GFX90A-NEXT: ; %bb.1: ; %Flow3 -; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_6 -; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.phi -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_setpc_b64 s[30:31] -; GFX90A-NEXT: .LBB16_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB16_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.start +; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_4 -; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_cbranch_execnz .LBB16_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB16_4: ; %Flow3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX90A-NEXT: s_cbranch_execz .LBB16_2 -; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_cbranch_execz .LBB16_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] -; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB16_6: ; %atomicrmw.phi ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -3713,8 +3709,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b64 s[2:3], 0x7f8 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, s[2:3] +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v9 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -3726,31 +3722,31 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB17_3: ; %atomicrmw.global -; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] -; GFX942-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB17_4 ; GFX942-NEXT: ; %bb.5: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB17_2 ; GFX942-NEXT: .LBB17_6: ; %atomicrmw.private -; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] @@ -3870,10 +3866,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0x7f8, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v9 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -3885,29 +3881,29 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_4 ; GFX90A-NEXT: ; %bb.5: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB17_2 ; GFX90A-NEXT: .LBB17_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -4160,8 +4156,8 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-NEXT: s_movk_i32 s2, 0xf800 ; GFX942-NEXT: s_mov_b32 s3, -1 ; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] -; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX942-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, s[2:3] +; GFX942-NEXT: v_cmp_ne_u32_e32 vcc, s1, v9 ; GFX942-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_xor_b64 s[0:1], exec, s[0:1] @@ -4173,31 +4169,31 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_setpc_b64 s[30:31] ; GFX942-NEXT: .LBB18_3: ; %atomicrmw.global -; GFX942-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX942-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 ; GFX942-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[0:1] -; GFX942-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] +; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] sc0 +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB18_4 ; GFX942-NEXT: ; %bb.5: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX942-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX942-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB18_2 ; GFX942-NEXT: .LBB18_6: ; %atomicrmw.private -; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX942-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], -v[2:3] @@ -4317,10 +4313,10 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0 ; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v9 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -4332,29 +4328,29 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global -; GFX90A-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GFX90A-NEXT: flat_load_dwordx2 v[6:7], v[8:9] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[2:3] -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 ; GFX90A-NEXT: ; %bb.5: ; %Flow ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB18_2 ; GFX90A-NEXT: .LBB18_6: ; %atomicrmw.private -; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v8, vcc ; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll index 02884559bdaa9..3d0ebc72791bd 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll @@ -38,33 +38,35 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) % ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY6]], 0, 0, implicit $exec :: (load (s64) from %ir.ptr, addrspace 1) ; GFX90A-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.atomicrmw.start: ; GFX90A-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1 - ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_LOAD_DWORDX2_]], %bb.0, %3, %bb.1 + ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vreg_64_align2 = PHI [[COPY7]], %bb.0, %3, %bb.1 ; GFX90A-NEXT: [[V_ADD_F64_e64_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_ADD_F64_e64 0, [[PHI1]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 - ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 - ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 - ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 - ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3 - ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]] - ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY11]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub1 + ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_F64_e64_]].sub0 + ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1 + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0 + ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY11]], %subreg.sub2, killed [[COPY10]], %subreg.sub3 + ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE2]] + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY5]], killed [[COPY12]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic monotonic (s64) on %ir.ptr, addrspace 1) ; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U64_e64 [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], [[PHI1]], implicit $exec + ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]] ; GFX90A-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U64_e64_]], [[PHI]], implicit-def dead $scc ; GFX90A-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2.atomicrmw.end: - ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]], %bb.1 + ; GFX90A-NEXT: [[PHI2:%[0-9]+]]:av_64_align2 = PHI [[COPY13]], %bb.1 ; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 ; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec - ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 - ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec + ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec + ; GFX90A-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1 + ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 311faac1b7c29..0f89d8fcd7fbf 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4122,24 +4122,24 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index e2808ee9bf706..1ee83360e1aab 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4122,24 +4122,24 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] -; GFX90A-NEXT: v_min_f64 v[4:5], v[4:5], v[2:3] -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 11f0f38d2b6fa..8c101d627194c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -3760,12 +3760,11 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 @@ -3773,6 +3772,7 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3836,18 +3836,18 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4012,12 +4012,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 sc0 @@ -4025,6 +4024,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB17_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4088,18 +4088,18 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1) ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040 +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4265,12 +4265,11 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX942-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 sc0 @@ -4278,6 +4277,7 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB18_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4341,18 +4341,18 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1) ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index e330c72ba0fc4..c3a0b7485bfd5 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -2401,7 +2401,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX90A-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2 @@ -2471,7 +2470,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB9_2 Depth 2 From 7f9c3bb4600b860c810e96fe775526fa4060c389 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 18 Sep 2025 11:33:52 +0900 Subject: [PATCH 2/3] 32-bitcase Note this does very little because we only use VGPR classes for FP types (though this doesn't particularly make any sense), and we legalize normal loads and stores to integer. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 +- .../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 380 +++++++++--------- llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 4 +- .../AMDGPU/buffer-atomic-fadd.f32-rtn.ll | 16 +- .../CodeGen/AMDGPU/dag-divergence-atomic.ll | 24 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 12 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 276 +++++++------ .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 276 +++++++------ .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 92 ++--- .../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 30 +- .../AMDGPU/global-atomic-fadd.f32-rtn.ll | 54 +-- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 48 +-- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 280 ++++++------- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 280 ++++++------- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 64 +-- llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 160 ++++---- 16 files changed, 997 insertions(+), 1006 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 29346781c3925..9cfa85d47029b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -91,11 +91,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); + + const SIRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetRegisterClass *V32RegClass = + TRI->getDefaultVectorSuperClassForBitWidth(32); + addRegisterClass(MVT::f32, V32RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - const SIRegisterInfo *TRI = STI.getRegisterInfo(); const TargetRegisterClass *V64RegClass = TRI->getDefaultVectorSuperClassForBitWidth(64); diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index b95709611999d..c3531f16248e9 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -6023,12 +6023,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6051,12 +6051,12 @@ define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6072,20 +6072,20 @@ define void @global_atomic_fsub_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fsub_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB118_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6098,20 +6098,20 @@ define void @global_atomic_fsub_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fsub_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6146,13 +6146,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6176,13 +6176,13 @@ define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6198,22 +6198,22 @@ define void @global_atomic_fmax_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmax_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB120_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6226,22 +6226,22 @@ define void @global_atomic_fmax_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fmax_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6276,13 +6276,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6306,13 +6306,13 @@ define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6328,22 +6328,22 @@ define void @global_atomic_fmin_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmin_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB122_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6356,22 +6356,22 @@ define void @global_atomic_fmin_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX950-LABEL: global_atomic_fmin_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB122_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6407,13 +6407,13 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6436,12 +6436,12 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6457,49 +6457,49 @@ define void @global_atomic_fmaximum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fmaximum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB124_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6535,13 +6535,13 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -6564,12 +6564,12 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -6585,49 +6585,49 @@ define void @global_atomic_fminimum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fminimum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB126_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11796,12 +11796,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11825,12 +11825,12 @@ define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11846,54 +11846,54 @@ define void @global_atomic_fsub_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fsub_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB226_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fsub_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v0, v1, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB226_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -11923,13 +11923,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -11954,13 +11954,13 @@ define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -11976,58 +11976,58 @@ define void @global_atomic_fmax_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fmax_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v1, v2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB228_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmax_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX950-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v1, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB228_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12057,13 +12057,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12088,13 +12088,13 @@ define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) # ; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12110,58 +12110,58 @@ define void @global_atomic_fmin_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) ; GFX90A-LABEL: global_atomic_fmin_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX90A-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB230_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmin_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 ; GFX950-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_max_f32_e32 v1, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB230_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12192,13 +12192,13 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12222,12 +12222,12 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12243,57 +12243,57 @@ define void @global_atomic_fmaximum_f32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fmaximum_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v1 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB232_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fmaximum_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v2, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB232_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 @@ -12324,13 +12324,13 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -12354,12 +12354,12 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -12375,57 +12375,57 @@ define void @global_atomic_fminimum_f32_saddr_ret_av_av(ptr addrspace(1) inreg % ; GFX90A-LABEL: global_atomic_fminimum_f32_saddr_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ; def v3 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_min_f32_e32 v3, v5, v1 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc -; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB234_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: global_atomic_fminimum_f32_saddr_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ; def v3 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v1 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v2, v2 -; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB234_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 430f50e26fc05..89c3e8d615efd 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -381,17 +381,17 @@ define float @no_unsafe(ptr %addr, float %val) { ; GFX90A-LABEL: no_unsafe: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll index b80aa9324e616..4909fac4f307d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll @@ -18,7 +18,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -53,7 +53,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -89,7 +89,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -127,7 +127,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -170,7 +170,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -217,7 +217,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -265,7 +265,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; @@ -315,7 +315,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr ad ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll index 9c03c850c8242..0e86a1ac68119 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -421,19 +421,19 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; CHECK-NEXT: v_add_f32_e32 v0, 1.0, v1 +; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB18_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end @@ -458,19 +458,19 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 ; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3 -; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc +; CHECK-NEXT: v_add_f32_e32 v0, -1.0, v1 +; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execnz .LBB19_1 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 1a4a54b81c78f..5c4e25c3120e9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -4448,18 +4448,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4771,18 +4771,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -5462,18 +5462,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg ; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index db141d6bb3d9f..53e9468c5d5b6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -34,24 +34,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -79,23 +79,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -177,24 +177,24 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -224,23 +224,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -324,21 +324,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -347,6 +344,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -386,20 +384,20 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -950,24 +948,24 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -997,25 +995,25 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1252,24 +1250,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1331,23 +1329,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1445,24 +1443,24 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1490,23 +1488,23 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1592,24 +1590,24 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1637,23 +1635,23 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1735,24 +1733,24 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1782,23 +1780,23 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1882,21 +1880,18 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -1905,6 +1900,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1944,20 +1940,20 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2508,24 +2504,24 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2555,25 +2551,25 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index e4dca8481ee12..5ee3ff67aa8a0 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -34,24 +34,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -79,23 +79,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -177,24 +177,24 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -224,23 +224,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -324,21 +324,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -347,6 +344,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -386,20 +384,20 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -950,24 +948,24 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -997,25 +995,25 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1252,24 +1250,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1331,23 +1329,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1445,24 +1443,24 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1490,23 +1488,23 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1592,24 +1590,24 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1637,23 +1635,23 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1735,24 +1733,24 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1782,23 +1780,23 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1882,21 +1880,18 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v3, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: v_max_f32_e32 v1, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: v_max_f32_e32 v0, v3, v3 ; GFX942-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX942-NEXT: buffer_wbl2 sc1 @@ -1905,6 +1900,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1944,20 +1940,20 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2508,24 +2504,24 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 +; GFX942-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2555,25 +2551,25 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 12f429454fc42..0e563c26d27ea 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -49,12 +49,11 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 @@ -62,6 +61,7 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -122,18 +122,18 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 { ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -245,12 +245,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 @@ -258,6 +257,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -319,18 +319,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -445,27 +445,25 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX942-NEXT: v_sub_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -533,18 +531,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1254,12 +1252,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 @@ -1267,6 +1264,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1328,12 +1326,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc @@ -1342,6 +1339,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1656,12 +1654,11 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] +; GFX942-NEXT: flat_load_dword v5, v[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 @@ -1669,6 +1666,7 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1729,18 +1727,18 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 { ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1852,12 +1850,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 @@ -1865,6 +1862,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1926,18 +1924,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX90A-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2052,27 +2050,25 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX942-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v4, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v1 -; GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX942-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 ; GFX942-NEXT: s_movk_i32 s0, 0xf800 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX942-NEXT: flat_load_dword v0, v[0:1] +; GFX942-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX942-NEXT: flat_load_dword v7, v[4:5] ; GFX942-NEXT: s_mov_b32 s1, -1 -; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX942-NEXT: v_sub_f32_e32 v6, v7, v2 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX942-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2140,18 +2136,18 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, v4 -; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v2 ; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2861,12 +2857,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX942-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX942-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 @@ -2874,6 +2869,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2935,12 +2931,11 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX90A-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc @@ -2949,6 +2944,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 357234080235a..8a4b2c428e31a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -12687,20 +12687,19 @@ define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 -; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v1 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12812,20 +12811,19 @@ define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 -; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-SDAG-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 -; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 -; GFX950-SDAG-NEXT: v_min_f32_e32 v4, v0, v1 -; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-SDAG-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index bd9fe397bfc68..aede3928d7b0f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -46,7 +46,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]] ; GFX90A-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE - ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.1 ; GFX90A-NEXT: {{ $}} @@ -76,12 +76,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_dpp6]] ; GFX90A-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 ; GFX90A-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX90A-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec - ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} @@ -89,28 +90,30 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: successors: %bb.4(0x80000000) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2 - ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY %2 + ; GFX90A-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY10]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:av_32 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; GFX90A-NEXT: S_BRANCH %bb.4 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow: ; GFX90A-NEXT: successors: %bb.5(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4 + ; GFX90A-NEXT: [[PHI:%[0-9]+]]:av_32 = PHI [[DEF]], %bb.0, %8, %bb.4 ; GFX90A-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4 (%ir-block.35): ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[COPY11]], %bb.2 ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX90A-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec - ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec - ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] - ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec + ; GFX90A-NEXT: early-clobber %48:vgpr_32 = STRICT_WWM [[COPY8]], implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %48, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:sreg_64_xexec = COPY [[COPY9]] + ; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY13]], [[COPY12]], implicit $exec + ; GFX90A-NEXT: [[COPY14:%[0-9]+]]:av_32 = COPY [[V_CNDMASK_B32_e64_]] ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5 (%ir-block.41): @@ -128,7 +131,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX942-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]] ; GFX942-NEXT: [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE - ; GFX942-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX942-NEXT: [[DEF:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX942-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: S_BRANCH %bb.1 ; GFX942-NEXT: {{ $}} @@ -158,12 +161,13 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX942-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec ; GFX942-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, killed [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_5]], 312, 15, 15, 0, implicit $exec + ; GFX942-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_dpp6]] ; GFX942-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 ; GFX942-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]] ; GFX942-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX942-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec - ; GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX942-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] + ; GFX942-NEXT: [[DEF2:%[0-9]+]]:av_32 = IMPLICIT_DEF ; GFX942-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: S_BRANCH %bb.2 ; GFX942-NEXT: {{ $}} @@ -171,28 +175,30 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX942-NEXT: successors: %bb.4(0x80000000) ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY %2 - ; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY9]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY %2 + ; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_F32_SADDR_RTN killed [[V_MOV_B32_e32_1]], [[COPY10]], [[COPY3]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1) + ; GFX942-NEXT: [[COPY11:%[0-9]+]]:av_32 = COPY [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]] ; GFX942-NEXT: S_BRANCH %bb.4 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: bb.3.Flow: ; GFX942-NEXT: successors: %bb.5(0x80000000) ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, %8, %bb.4 + ; GFX942-NEXT: [[PHI:%[0-9]+]]:av_32 = PHI [[DEF]], %bb.0, %8, %bb.4 ; GFX942-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: S_BRANCH %bb.5 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: bb.4 (%ir-block.35): ; GFX942-NEXT: successors: %bb.3(0x80000000) ; GFX942-NEXT: {{ $}} - ; GFX942-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX942-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[COPY11]], %bb.2 ; GFX942-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX942-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec - ; GFX942-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec - ; GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] - ; GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] - ; GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec + ; GFX942-NEXT: early-clobber %47:vgpr_32 = STRICT_WWM [[COPY8]], implicit $exec + ; GFX942-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %47, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: [[COPY12:%[0-9]+]]:sreg_64_xexec = COPY [[COPY9]] + ; GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX942-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY13]], [[COPY12]], implicit $exec + ; GFX942-NEXT: [[COPY14:%[0-9]+]]:av_32 = COPY [[V_CNDMASK_B32_e64_]] ; GFX942-NEXT: S_BRANCH %bb.3 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: bb.5 (%ir-block.41): diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index a50791e10f5a2..1978e68fdae9c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -90,18 +90,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -287,18 +287,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -486,18 +486,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1283,12 +1283,11 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc @@ -1297,6 +1296,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1697,18 +1697,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1912,18 +1912,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2319,18 +2319,18 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2719,18 +2719,18 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3712,18 +3712,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4130,18 +4130,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -4532,18 +4532,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -6724,18 +6724,18 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 0f89d8fcd7fbf..dc995fb7ef79c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -35,24 +35,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -80,23 +80,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: @@ -197,24 +197,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -242,23 +242,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -361,24 +361,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -406,23 +406,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1000,24 +1000,24 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1045,25 +1045,25 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1328,24 +1328,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1407,23 +1407,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory: @@ -1559,24 +1559,24 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1604,23 +1604,23 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1725,24 +1725,24 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1770,23 +1770,23 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1887,24 +1887,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1932,23 +1932,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2051,24 +2051,24 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2096,23 +2096,23 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2690,24 +2690,24 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2735,25 +2735,25 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 1ee83360e1aab..f62e13a9d4341 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -35,24 +35,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -80,23 +80,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory: @@ -197,24 +197,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -242,23 +242,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -361,24 +361,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -406,23 +406,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -1000,24 +1000,24 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1045,25 +1045,25 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -1328,24 +1328,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1407,23 +1407,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: @@ -1559,24 +1559,24 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1604,23 +1604,23 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: @@ -1725,24 +1725,24 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1770,23 +1770,23 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: @@ -1887,24 +1887,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -1932,23 +1932,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2051,24 +2051,24 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB12_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2096,23 +2096,23 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: @@ -2690,24 +2690,24 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 -; GFX942-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX942-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX942-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX942-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX942-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 +; GFX942-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB16_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: v_mov_b32_e32 v0, v3 +; GFX942-NEXT: v_mov_b32_e32 v0, v2 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: @@ -2735,25 +2735,25 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 8c101d627194c..9e6f0fd7f13b5 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -50,12 +50,11 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 @@ -63,6 +62,7 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -123,18 +123,18 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -282,12 +282,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 @@ -295,6 +294,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -355,18 +355,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) % ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -516,12 +516,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 @@ -529,6 +528,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -589,18 +589,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) % ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1438,12 +1438,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 @@ -1451,6 +1450,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1511,12 +1511,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc @@ -1525,6 +1524,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1905,12 +1905,11 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off +; GFX942-NEXT: global_load_dword v5, v[0:1], off ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 @@ -1918,6 +1917,7 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1978,18 +1978,18 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v5, v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2137,12 +2137,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 @@ -2150,6 +2149,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB9_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2210,18 +2210,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2371,12 +2371,11 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX942-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 sc0 @@ -2384,6 +2383,7 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2444,18 +2444,18 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace ; GFX90A-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3293,12 +3293,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX942-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX942-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX942-NEXT: s_mov_b64 s[0:1], 0 ; GFX942-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX942-NEXT: buffer_wbl2 sc0 sc1 ; GFX942-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 sc0 sc1 @@ -3306,6 +3305,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX942-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v5, v3 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB14_1 ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3366,12 +3366,11 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX90A-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044 +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc @@ -3380,6 +3379,7 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index c3a0b7485bfd5..fe432e9d7594d 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -1376,47 +1376,46 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; ; GFX90A-LABEL: test_mfma_loop_sgpr_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 @@ -1438,47 +1437,46 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; ; GFX942-LABEL: test_mfma_loop_sgpr_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c -; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: v_accvgpr_write_b32 a31, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a30, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a29, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a28, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a27, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a26, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a25, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a24, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a23, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a22, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a21, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a20, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a19, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a18, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a17, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a15, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a14, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a13, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a12, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a11, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a10, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a9, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a8, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a7, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a6, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a5, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a4, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a3, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 @@ -1643,13 +1641,13 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; ; GFX90A-LABEL: test_mfma_loop_mixed_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1679,7 +1677,6 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader @@ -1706,13 +1703,13 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; ; GFX942-LABEL: test_mfma_loop_mixed_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-NEXT: v_accvgpr_write_b32 a1, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 @@ -1742,7 +1739,6 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader From f111afaa914511c67dc6208e818612d550ea9fba Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 18 Sep 2025 11:41:48 +0900 Subject: [PATCH 3/3] Regression with 32-bit case --- .../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 484 +++++++++--------- .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 12 +- 2 files changed, 253 insertions(+), 243 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index c2d79fdae1208..ae83766cd6a4a 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -9181,12 +9181,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9209,12 +9209,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB117_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9230,20 +9230,20 @@ define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fsub_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB118_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9256,20 +9256,20 @@ define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fsub_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9304,13 +9304,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9334,13 +9334,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9356,22 +9356,22 @@ define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmax_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB120_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9384,22 +9384,22 @@ define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmax_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9434,13 +9434,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9464,13 +9464,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9486,22 +9486,22 @@ define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmin_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX90A-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB122_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9514,22 +9514,22 @@ define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 { ; GFX950-LABEL: flat_atomic_fmin_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v2 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[0:1], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 ; GFX950-NEXT: .LBB122_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB122_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9565,13 +9565,13 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9594,12 +9594,12 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB123_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9615,49 +9615,49 @@ define void @flat_atomic_fmaximum_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fmaximum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB124_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmaximum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9693,13 +9693,13 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -9722,12 +9722,12 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -9743,49 +9743,49 @@ define void @flat_atomic_fminimum_f32_ret_av_av(ptr %ptr) #0 { ; GFX90A-LABEL: flat_atomic_fminimum_f32_ret_av_av: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB126_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v2 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fminimum_f32_ret_av_av: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17456,26 +17456,27 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB225_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17485,26 +17486,27 @@ define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB225_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v0, v1, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB225_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17521,20 +17523,20 @@ define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB226_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17548,20 +17550,20 @@ define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB226_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB226_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17582,29 +17584,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB227_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17614,29 +17616,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB227_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB227_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17653,29 +17655,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB228_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17683,29 +17685,29 @@ define void @flat_atomic_fmax_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB228_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB228_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 @@ -17720,29 +17722,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB229_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17752,29 +17754,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB229_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB229_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17791,29 +17793,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v2 -; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB230_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17821,29 +17823,29 @@ define void @flat_atomic_fmin_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 -; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 -; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB230_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 -; GFX950-NEXT: v_min_f32_e32 v4, v2, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB230_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 @@ -17858,29 +17860,30 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB231_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -17890,26 +17893,27 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB231_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB231_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -17926,29 +17930,30 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB232_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -17956,20 +17961,20 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB232_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end @@ -17990,29 +17995,30 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: .LBB233_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a0 ; GFX90A-NEXT: ;;#ASMEND @@ -18022,26 +18028,27 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; def a0 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: .LBB233_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v0, v1, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 -; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB233_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use a0 ; GFX950-NEXT: ;;#ASMEND @@ -18058,29 +18065,30 @@ define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] -; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ; def v4 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v4 -; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 -; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB234_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ; use v0 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -18088,20 +18096,20 @@ define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: ;;#ASMSTART -; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ; def v4 ; GFX950-NEXT: ;;#ASMEND ; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 -; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB234_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll index be1788c6ec83f..2462414992e36 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,15 +6,15 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -22,12 +22,14 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_ashr_i32 s5, s3, 31 ; GFX942-NEXT: s_mov_b32 s3, s2 ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2 +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 ; GFX942-NEXT: s_and_b32 s3, s5, s4 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3] +; GFX942-NEXT: s_nop 6 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a2 ; GFX942-NEXT: s_cbranch_execz .LBB0_4 ; GFX942-NEXT: .LBB0_2: ; %bb ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -35,7 +37,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-NEXT: s_cbranch_vccz .LBB0_1 ; GFX942-NEXT: ; %bb.3: ; GFX942-NEXT: ; implicit-def: $sgpr3 -; GFX942-NEXT: ; implicit-def: $agpr2 +; GFX942-NEXT: ; implicit-def: $vgpr0 ; GFX942-NEXT: .LBB0_4: ; %common.ret ; GFX942-NEXT: s_endpgm ;