Skip to content

Commit 6cab444

Browse files
committed
AMDGPU: Relax shouldCoalesce to allow more register tuple widening
Allow widening up to 128-bit registers, or whenever the new register class is no larger than one of the existing register classes. The previous check was artificially limiting. In particular, this was doing the wrong thing with sequences involving copies between VGPRs and AV registers. Nearly all test changes are improvements. The coalescer does not just widen registers out of nowhere. If it's trying to "widen" a register, it's generally packing a register into an existing register tuple, or in a situation where the constraints imply the wider class anyway. 067a110 addressed the allocation failure concern by rejecting coalescing if there are no available registers. The original change in a4e63ea didn't include a realistic testcase to judge if this is harmful for pressure. I would expect any issues from this to be garden-variety subreg handling issues. We could use more dynamic state information here if it really is an issue. I get the best results by removing this override completely. This is a smaller step for patch splitting purposes.
1 parent 2b4ac66 commit 6cab444

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+8007
-9193
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3741,18 +3741,11 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
37413741
unsigned DstSubReg,
37423742
const TargetRegisterClass *NewRC,
37433743
LiveIntervals &LIS) const {
3744-
unsigned SrcSize = getRegSizeInBits(*SrcRC);
3745-
unsigned DstSize = getRegSizeInBits(*DstRC);
3744+
// TODO: This should be more aggressive, but be more cautious with very wide
3745+
// tuples.
37463746
unsigned NewSize = getRegSizeInBits(*NewRC);
3747-
3748-
// Do not increase size of registers beyond dword, we would need to allocate
3749-
// adjacent registers and constraint regalloc more than needed.
3750-
3751-
// Always allow dword coalescing.
3752-
if (SrcSize <= 32 || DstSize <= 32)
3753-
return true;
3754-
3755-
return NewSize <= DstSize || NewSize <= SrcSize;
3747+
return NewSize <= 128 || NewSize <= getRegSizeInBits(*SrcRC) ||
3748+
NewSize <= getRegSizeInBits(*DstRC);
37563749
}
37573750

37583751
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,

llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll

Lines changed: 74 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -8,37 +8,34 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
88
; GFX10-LABEL: v_mul_i64_no_zext:
99
; GFX10: ; %bb.0:
1010
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
11-
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
11+
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
1212
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1313
; GFX10-NEXT: s_clause 0x1
14-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
15-
; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
14+
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
15+
; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3]
1616
; GFX10-NEXT: s_waitcnt vmcnt(0)
17-
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
18-
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
19-
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
20-
; GFX10-NEXT: v_mov_b32_e32 v5, v0
21-
; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
17+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v2, v4, 0
18+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v2, v5, v[1:2]
19+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v3, v4, v[1:2]
20+
; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
2221
; GFX10-NEXT: s_endpgm
2322
;
2423
; GFX11-LABEL: v_mul_i64_no_zext:
2524
; GFX11: ; %bb.0:
2625
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
2726
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2827
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
29-
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0
28+
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0
3029
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3130
; GFX11-NEXT: s_clause 0x1
32-
; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1]
33-
; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3]
31+
; GFX11-NEXT: global_load_b64 v[2:3], v8, s[0:1]
32+
; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3]
3433
; GFX11-NEXT: s_waitcnt vmcnt(0)
35-
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
34+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
3635
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
37-
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
38-
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
39-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
40-
; GFX11-NEXT: v_mov_b32_e32 v5, v7
41-
; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
36+
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[1:2]
37+
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v4, v[6:7]
38+
; GFX11-NEXT: global_store_b64 v8, v[0:1], s[2:3]
4239
; GFX11-NEXT: s_endpgm
4340
%tid = call i32 @llvm.amdgcn.workitem.id.x()
4441
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -58,18 +55,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
5855
; GFX10-NEXT: s_clause 0x1
5956
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6057
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
61-
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
62-
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0
58+
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
59+
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
6360
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
64-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
65-
; GFX10-NEXT: global_load_dword v4, v3, s[6:7]
61+
; GFX10-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
62+
; GFX10-NEXT: global_load_dword v4, v0, s[6:7]
6663
; GFX10-NEXT: s_waitcnt vmcnt(0)
67-
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0
68-
; GFX10-NEXT: v_mov_b32_e32 v0, v3
69-
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1]
70-
; GFX10-NEXT: v_mov_b32_e32 v3, v0
71-
; GFX10-NEXT: v_mov_b32_e32 v0, 0
72-
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
64+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, 0
65+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v4, v[1:2]
66+
; GFX10-NEXT: v_mov_b32_e32 v2, 0
67+
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
7368
; GFX10-NEXT: s_endpgm
7469
;
7570
; GFX11-LABEL: v_mul_i64_zext_src1:
@@ -80,17 +75,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
8075
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
8176
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
8277
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
83-
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
78+
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
8479
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
85-
; GFX11-NEXT: global_load_b64 v[0:1], v1, s[2:3]
86-
; GFX11-NEXT: global_load_b32 v5, v2, s[4:5]
80+
; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
81+
; GFX11-NEXT: global_load_b32 v5, v0, s[4:5]
8782
; GFX11-NEXT: s_waitcnt vmcnt(0)
88-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0
83+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v5, 0
8984
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
90-
; GFX11-NEXT: v_mov_b32_e32 v0, v3
91-
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
92-
; GFX11-NEXT: v_mov_b32_e32 v0, 0
93-
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
85+
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
86+
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v3
87+
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
9488
; GFX11-NEXT: s_endpgm
9589
%tid = call i32 @llvm.amdgcn.workitem.id.x()
9690
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -110,18 +104,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
110104
; GFX10-NEXT: s_clause 0x1
111105
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
112106
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
113-
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
114-
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
107+
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 2, v0
108+
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
115109
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
116-
; GFX10-NEXT: global_load_dword v4, v2, s[2:3]
117-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7]
110+
; GFX10-NEXT: global_load_dword v4, v3, s[2:3]
111+
; GFX10-NEXT: global_load_dwordx2 v[1:2], v0, s[6:7]
118112
; GFX10-NEXT: s_waitcnt vmcnt(0)
119-
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
120-
; GFX10-NEXT: v_mov_b32_e32 v0, v3
121-
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
122-
; GFX10-NEXT: v_mov_b32_e32 v3, v0
123-
; GFX10-NEXT: v_mov_b32_e32 v0, 0
124-
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
113+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, 0
114+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v2, v[1:2]
115+
; GFX10-NEXT: v_mov_b32_e32 v2, 0
116+
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
125117
; GFX10-NEXT: s_endpgm
126118
;
127119
; GFX11-LABEL: v_mul_i64_zext_src0:
@@ -135,14 +127,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
135127
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
136128
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
137129
; GFX11-NEXT: global_load_b32 v5, v1, s[2:3]
138-
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
130+
; GFX11-NEXT: global_load_b64 v[1:2], v0, s[4:5]
139131
; GFX11-NEXT: s_waitcnt vmcnt(0)
140-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
132+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v1, 0
141133
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
142-
; GFX11-NEXT: v_mov_b32_e32 v0, v3
143-
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
144-
; GFX11-NEXT: v_mov_b32_e32 v0, 0
145-
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
134+
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v2, v[1:2]
135+
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v3
136+
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
146137
; GFX11-NEXT: s_endpgm
147138
%tid = call i32 @llvm.amdgcn.workitem.id.x()
148139
%gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
@@ -209,18 +200,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
209200
; GFX10-NEXT: s_clause 0x1
210201
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
211202
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
212-
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
203+
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
213204
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
214205
; GFX10-NEXT: s_clause 0x1
215-
; GFX10-NEXT: global_load_dword v4, v2, s[2:3]
216-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
206+
; GFX10-NEXT: global_load_dword v3, v0, s[2:3]
207+
; GFX10-NEXT: global_load_dwordx2 v[1:2], v0, s[6:7]
217208
; GFX10-NEXT: s_waitcnt vmcnt(0)
218-
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
219-
; GFX10-NEXT: v_mov_b32_e32 v0, v3
220-
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
221-
; GFX10-NEXT: v_mov_b32_e32 v3, v0
222-
; GFX10-NEXT: v_mov_b32_e32 v0, 0
223-
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
209+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v3, v1, 0
210+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v3, v2, v[1:2]
211+
; GFX10-NEXT: v_mov_b32_e32 v2, 0
212+
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
224213
; GFX10-NEXT: s_endpgm
225214
;
226215
; GFX11-LABEL: v_mul_i64_masked_src0_hi:
@@ -234,14 +223,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
234223
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
235224
; GFX11-NEXT: s_clause 0x1
236225
; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
237-
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
226+
; GFX11-NEXT: global_load_b64 v[1:2], v0, s[4:5]
238227
; GFX11-NEXT: s_waitcnt vmcnt(0)
239-
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
228+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v1, 0
240229
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
241-
; GFX11-NEXT: v_mov_b32_e32 v0, v3
242-
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
243-
; GFX11-NEXT: v_mov_b32_e32 v0, 0
244-
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
230+
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v2, v[1:2]
231+
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v3
232+
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
245233
; GFX11-NEXT: s_endpgm
246234
%tid = call i32 @llvm.amdgcn.workitem.id.x()
247235
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
@@ -389,22 +377,20 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
389377
; GFX10-NEXT: s_clause 0x1
390378
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
391379
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
392-
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
380+
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
393381
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
394382
; GFX10-NEXT: s_clause 0x1
395-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
396-
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
383+
; GFX10-NEXT: global_load_dwordx2 v[1:2], v0, s[2:3]
384+
; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[6:7]
397385
; GFX10-NEXT: s_waitcnt vmcnt(1)
398-
; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0
386+
; GFX10-NEXT: v_and_b32_e32 v5, 0xfff00000, v1
399387
; GFX10-NEXT: s_waitcnt vmcnt(0)
400-
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0
401-
; GFX10-NEXT: v_mov_b32_e32 v0, v5
402-
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1]
403-
; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1
404-
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6]
405-
; GFX10-NEXT: v_mov_b32_e32 v5, v0
406-
; GFX10-NEXT: v_mov_b32_e32 v0, 0
407-
; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1]
388+
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v5, v3, 0
389+
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v5, v4, v[1:2]
390+
; GFX10-NEXT: v_and_b32_e32 v1, 0xf00f, v2
391+
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v1, v3, v[4:5]
392+
; GFX10-NEXT: v_mov_b32_e32 v2, 0
393+
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
408394
; GFX10-NEXT: s_endpgm
409395
;
410396
; GFX11-LABEL: v_mul_i64_partially_masked_src0:
@@ -414,24 +400,22 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
414400
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
415401
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
416402
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
417-
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
403+
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
418404
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
419405
; GFX11-NEXT: s_clause 0x1
420-
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
421-
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[4:5]
406+
; GFX11-NEXT: global_load_b64 v[1:2], v0, s[2:3]
407+
; GFX11-NEXT: global_load_b64 v[3:4], v0, s[4:5]
422408
; GFX11-NEXT: s_waitcnt vmcnt(1)
423-
; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
409+
; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v1
424410
; GFX11-NEXT: s_waitcnt vmcnt(0)
425411
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
426-
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0
427-
; GFX11-NEXT: v_mov_b32_e32 v0, v5
428-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
429-
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
430-
; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1
431-
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
412+
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v3, 0
413+
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v4, v[1:2]
414+
; GFX11-NEXT: v_and_b32_e32 v4, 0xf00f, v2
432415
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
433-
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
434-
; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1]
416+
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v4, v3, v[5:6]
417+
; GFX11-NEXT: v_mov_b32_e32 v2, 0
418+
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
435419
; GFX11-NEXT: s_endpgm
436420
%tid = call i32 @llvm.amdgcn.workitem.id.x()
437421
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid

0 commit comments

Comments
 (0)