AMDGPU: Start to use AV classes for unknown vector class #166482
Conversation
This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

AMDGPU: Start to use AV classes for unknown vector class

Use AGPR+VGPR superclasses for gfx90a+. The type used for the class should be the broadest possible class, to be contextually restricted later. InstrEmitter clamps these to the common subclass of the context use instructions, so we're best off using the broadest possible class for all types.

Note this does very little because we only use VGPR classes for FP types (though this doesn't particularly make any sense), and we legalize normal loads and stores to integer. Regression with 32-bit case.

Patch is 475.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/166482.diff

24 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8bb28084159e8..98fe923147ccc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -91,64 +91,73 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
- addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
+
+ const SIRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetRegisterClass *V32RegClass =
+ TRI->getDefaultVectorSuperClassForBitWidth(32);
+ addRegisterClass(MVT::f32, V32RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
- const SIRegisterInfo *TRI = STI.getRegisterInfo();
- const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
+ const TargetRegisterClass *V64RegClass =
+ TRI->getDefaultVectorSuperClassForBitWidth(64);
addRegisterClass(MVT::f64, V64RegClass);
addRegisterClass(MVT::v2f32, V64RegClass);
addRegisterClass(MVT::Untyped, V64RegClass);
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
- addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+ addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
- addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
- addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+ addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
- addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass);
+ addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
- addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass);
+ addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
- addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass);
+ addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
- addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass);
+ addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
- addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass);
+ addRegisterClass(MVT::v10f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(320));
addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
- addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass);
+ addRegisterClass(MVT::v11f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(352));
addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
- addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass);
+ addRegisterClass(MVT::v12f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(384));
addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v16f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(512));
addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
- addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v16f64,
+ TRI->getDefaultVectorSuperClassForBitWidth(1024));
if (Subtarget->has16BitInsts()) {
if (Subtarget->useRealTrue16Insts()) {
@@ -180,7 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
- addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v32f32,
+ TRI->getDefaultVectorSuperClassForBitWidth(1024));
computeRegisterProperties(Subtarget->getRegisterInfo());
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 152f0f85c9978..3f52e8229ac08 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3557,6 +3557,17 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
: getAnyVectorSuperClassForBitWidth(BitWidth);
}
+const TargetRegisterClass *
+SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const {
+ // TODO: In principle this should use AV classes for gfx908 too. This is
+ // limited to 90a+ to avoid regressing special case copy optimizations which
+ // need new handling. The core issue is that it's not possible to directly
+ // copy between AGPRs on gfx908, and the current optimizations around that
+ // expect to see copies to VGPR.
+ return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
+ : getVGPRClassForBitWidth(BitWidth);
+}
+
const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth == 16 || BitWidth == 32)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 7b91ba7bc581f..6e119e5e7c194 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -215,6 +215,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
const TargetRegisterClass *
getVectorSuperClassForBitWidth(unsigned BitWidth) const;
+ LLVM_READONLY
+ const TargetRegisterClass *
+ getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const;
+
LLVM_READONLY
static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index 003aa049b2d1b..ae83766cd6a4a 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -9181,12 +9181,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB117_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9209,12 +9209,12 @@ define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB117_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9230,20 +9230,20 @@ define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fsub_f32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v4
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: .LBB118_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB118_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9256,20 +9256,20 @@ define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_fsub_f32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: .LBB118_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB118_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9304,13 +9304,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB119_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9334,13 +9334,13 @@ define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB119_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9356,22 +9356,22 @@ define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmax_f32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB120_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5
-; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB120_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9384,22 +9384,22 @@ define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_fmax_f32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_max_f32_e32 v4, v2, v2
; GFX950-NEXT: .LBB120_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_max_f32_e32 v2, v5, v5
-; GFX950-NEXT: v_max_f32_e32 v4, v2, v3
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB120_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9434,13 +9434,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB121_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9464,13 +9464,13 @@ define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB121_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use a0
; GFX950-NEXT: ;;#ASMEND
@@ -9486,22 +9486,22 @@ define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 {
; GFX90A-LABEL: flat_atomic_fmin_f32_ret_av_av:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def v3
+; GFX90A-NEXT: ; def v2
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v2
-; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5
-; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc
+; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB122_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9514,22 +9514,22 @@ define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 {
; GFX950-LABEL: flat_atomic_fmin_f32_ret_av_av:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40
+; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v3
+; GFX950-NEXT: ; def v2
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
-; GFX950-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX950-NEXT: v_max_f32_e32 v4, v2, v2
; GFX950-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v5, v2
-; GFX950-NEXT: v_max_f32_e32 v2, v5, v5
-; GFX950-NEXT: v_min_f32_e32 v4, v2, v3
-; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0
+; GFX950-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX950-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB122_1
; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -9565,13 +9565,13 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 {
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB123_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; use a0
; GFX90A-NEXT: ;;#ASMEND
@@ -9594,12 +9594,12 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
; GFX950-NEXT: v_mov_b32_e32 v3, v2
; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_cbranch_execnz .LBB123_1
; GFX950-NEXT: ;...
[truncated]
Allow widening up to 128-bit registers, or if the new register class is at least as large as one of the existing register classes. This was artificially limiting. In particular, this was doing the wrong thing with sequences involving copies between VGPRs and AV registers. Nearly all test changes are improvements.

The coalescer does not just widen registers out of nowhere. If it's trying to "widen" a register, it's generally packing a register into an existing register tuple, or in a situation where the constraints imply the wider class anyway. 067a110 addressed the allocation failure concern by rejecting coalescing if there are no available registers.

The original change in a4e63ea didn't include a realistic testcase to judge if this is harmful for pressure. I would expect any issues from this to be garden-variety subreg handling issues. We could use more dynamic state information here if it really is an issue. I get the best results by removing this override completely; this is a smaller step for patch-splitting purposes.
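For context, a minimal sketch of the relaxed check this describes, taking the wording above literally. The helper name and exact form are illustrative, not the patch itself (the real logic lives in SIRegisterInfo's shouldCoalesce override):

```cpp
#include <algorithm>
#include "llvm/CodeGen/TargetRegisterInfo.h"

// Illustrative sketch only; mirrors the wording of the message above.
// Returns true if the coalescer may merge SrcRC and DstRC into NewRC.
static bool mayWiden(const llvm::TargetRegisterInfo &TRI,
                     const llvm::TargetRegisterClass &SrcRC,
                     const llvm::TargetRegisterClass &DstRC,
                     const llvm::TargetRegisterClass &NewRC) {
  unsigned SrcSize = TRI.getRegSizeInBits(SrcRC);
  unsigned DstSize = TRI.getRegSizeInBits(DstRC);
  unsigned NewSize = TRI.getRegSizeInBits(NewRC);

  // Allow widening up to 128 bits outright, or whenever the merged class is
  // at least as large as one of the classes being merged (which, for a
  // coalesce, is essentially always the case).
  return NewSize <= 128 || NewSize >= std::min(SrcSize, DstSize);
}
```

Read literally, the second clause fires almost always, which lines up with the remark that removing the override entirely gives the best results.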
qcolombet left a comment:

LGTM
AMDGPU: Start to use AV classes for unknown vector class
Use AGPR+VGPR superclasses for gfx90a+. The type used
for the class should be the broadest possible class, to
be contextually restricted later. InstrEmitter clamps these
to the common subclass of the context use instructions, so we're
best off using the broadest possible class for all types.
Note this does very little because we only use VGPR classes
for FP types (though this doesn't particularly make any sense),
and we legalize normal loads and stores to integer.
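As a rough illustration of "broadest class, contextually restricted later" (a sketch under assumptions, not code from this patch): a 32-bit virtual register can start in the AGPR+VGPR superclass and only be narrowed when a use actually demands plain VGPRs.

```cpp
#include "llvm/CodeGen/MachineRegisterInfo.h"
// The AMDGPU::AV_32RegClass / AMDGPU::VGPR_32RegClass definitions come from
// the target's generated register info headers; usage here is illustrative.

using namespace llvm;

// Sketch: create a vreg in the broad AGPR+VGPR superclass, then narrow it
// only when a using instruction requires the VGPR-only class.
static Register broadThenNarrow(MachineRegisterInfo &MRI) {
  Register R = MRI.createVirtualRegister(&AMDGPU::AV_32RegClass);
  // InstrEmitter-style narrowing: constrain to the common subclass the use
  // demands. If no use forces VGPR-only, the register stays in the
  // superclass and the allocator is free to pick an AGPR or a VGPR.
  MRI.constrainRegClass(R, &AMDGPU::VGPR_32RegClass);
  return R;
}
```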