@@ -8,37 +8,34 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
88; GFX10-LABEL: v_mul_i64_no_zext:
99; GFX10: ; %bb.0:
1010; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
11- ; GFX10-NEXT: v_lshlrev_b32_e32 v7 , 3, v0
11+ ; GFX10-NEXT: v_lshlrev_b32_e32 v6 , 3, v0
1212; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1313; GFX10-NEXT: s_clause 0x1
14- ; GFX10-NEXT: global_load_dwordx2 v[0:1 ], v7 , s[0:1]
15- ; GFX10-NEXT: global_load_dwordx2 v[2:3 ], v7 , s[2:3]
14+ ; GFX10-NEXT: global_load_dwordx2 v[2:3 ], v6 , s[0:1]
15+ ; GFX10-NEXT: global_load_dwordx2 v[4:5 ], v6 , s[2:3]
1616; GFX10-NEXT: s_waitcnt vmcnt(0)
17- ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
18- ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
19- ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
20- ; GFX10-NEXT: v_mov_b32_e32 v5, v0
21- ; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
17+ ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v2, v4, 0
18+ ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v2, v5, v[1:2]
19+ ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, v3, v4, v[1:2]
20+ ; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
2221; GFX10-NEXT: s_endpgm
2322;
2423; GFX11-LABEL: v_mul_i64_no_zext:
2524; GFX11: ; %bb.0:
2625; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
2726; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2827; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
29- ; GFX11-NEXT: v_lshlrev_b32_e32 v9 , 3, v0
28+ ; GFX11-NEXT: v_lshlrev_b32_e32 v8 , 3, v0
3029; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3130; GFX11-NEXT: s_clause 0x1
32- ; GFX11-NEXT: global_load_b64 v[0:1 ], v9 , s[0:1]
33- ; GFX11-NEXT: global_load_b64 v[2:3 ], v9 , s[2:3]
31+ ; GFX11-NEXT: global_load_b64 v[2:3 ], v8 , s[0:1]
32+ ; GFX11-NEXT: global_load_b64 v[4:5 ], v8 , s[2:3]
3433; GFX11-NEXT: s_waitcnt vmcnt(0)
35- ; GFX11-NEXT: v_mad_u64_u32 v[4:5 ], null, v0, v2 , 0
34+ ; GFX11-NEXT: v_mad_u64_u32 v[0:1 ], null, v2, v4 , 0
3635; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
37- ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
38- ; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
39- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
40- ; GFX11-NEXT: v_mov_b32_e32 v5, v7
41- ; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
36+ ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v2, v5, v[1:2]
37+ ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v3, v4, v[6:7]
38+ ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[2:3]
4239; GFX11-NEXT: s_endpgm
4340 %tid = call i32 @llvm.amdgcn.workitem.id.x ()
4441 %gep.a = getelementptr inbounds i64 , ptr addrspace (1 ) %aptr , i32 %tid
@@ -58,18 +55,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
5855; GFX10-NEXT: s_clause 0x1
5956; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
6057; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
61- ; GFX10-NEXT: v_lshlrev_b32_e32 v2 , 3, v0
62- ; GFX10-NEXT: v_lshlrev_b32_e32 v3 , 2, v0
58+ ; GFX10-NEXT: v_lshlrev_b32_e32 v3 , 3, v0
59+ ; GFX10-NEXT: v_lshlrev_b32_e32 v0 , 2, v0
6360; GFX10-NEXT: s_waitcnt lgkmcnt(0)
64- ; GFX10-NEXT: global_load_dwordx2 v[0:1 ], v2 , s[2:3]
65- ; GFX10-NEXT: global_load_dword v4, v3 , s[6:7]
61+ ; GFX10-NEXT: global_load_dwordx2 v[1:2 ], v3 , s[2:3]
62+ ; GFX10-NEXT: global_load_dword v4, v0 , s[6:7]
6663; GFX10-NEXT: s_waitcnt vmcnt(0)
67- ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v0, v4, 0
68- ; GFX10-NEXT: v_mov_b32_e32 v0, v3
69- ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1]
70- ; GFX10-NEXT: v_mov_b32_e32 v3, v0
71- ; GFX10-NEXT: v_mov_b32_e32 v0, 0
72- ; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
64+ ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v4, 0
65+ ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v4, v[1:2]
66+ ; GFX10-NEXT: v_mov_b32_e32 v2, 0
67+ ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
7368; GFX10-NEXT: s_endpgm
7469;
7570; GFX11-LABEL: v_mul_i64_zext_src1:
@@ -80,17 +75,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
8075; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
8176; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
8277; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
83- ; GFX11-NEXT: v_lshlrev_b32_e32 v2 , 2, v0
78+ ; GFX11-NEXT: v_lshlrev_b32_e32 v0 , 2, v0
8479; GFX11-NEXT: s_waitcnt lgkmcnt(0)
85- ; GFX11-NEXT: global_load_b64 v[0:1 ], v1, s[2:3]
86- ; GFX11-NEXT: global_load_b32 v5, v2 , s[4:5]
80+ ; GFX11-NEXT: global_load_b64 v[1:2 ], v1, s[2:3]
81+ ; GFX11-NEXT: global_load_b32 v5, v0 , s[4:5]
8782; GFX11-NEXT: s_waitcnt vmcnt(0)
88- ; GFX11-NEXT: v_mad_u64_u32 v[2:3 ], null, v0 , v5, 0
83+ ; GFX11-NEXT: v_mad_u64_u32 v[0:1 ], null, v1 , v5, 0
8984; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
90- ; GFX11-NEXT: v_mov_b32_e32 v0, v3
91- ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
92- ; GFX11-NEXT: v_mov_b32_e32 v0, 0
93- ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
85+ ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
86+ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v3
87+ ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
9488; GFX11-NEXT: s_endpgm
9589 %tid = call i32 @llvm.amdgcn.workitem.id.x ()
9690 %gep.a = getelementptr inbounds i64 , ptr addrspace (1 ) %aptr , i32 %tid
@@ -110,18 +104,16 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
110104; GFX10-NEXT: s_clause 0x1
111105; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
112106; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
113- ; GFX10-NEXT: v_lshlrev_b32_e32 v2 , 2, v0
114- ; GFX10-NEXT: v_lshlrev_b32_e32 v3 , 3, v0
107+ ; GFX10-NEXT: v_lshlrev_b32_e32 v3 , 2, v0
108+ ; GFX10-NEXT: v_lshlrev_b32_e32 v0 , 3, v0
115109; GFX10-NEXT: s_waitcnt lgkmcnt(0)
116- ; GFX10-NEXT: global_load_dword v4, v2 , s[2:3]
117- ; GFX10-NEXT: global_load_dwordx2 v[0:1 ], v3 , s[6:7]
110+ ; GFX10-NEXT: global_load_dword v4, v3 , s[2:3]
111+ ; GFX10-NEXT: global_load_dwordx2 v[1:2 ], v0 , s[6:7]
118112; GFX10-NEXT: s_waitcnt vmcnt(0)
119- ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
120- ; GFX10-NEXT: v_mov_b32_e32 v0, v3
121- ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
122- ; GFX10-NEXT: v_mov_b32_e32 v3, v0
123- ; GFX10-NEXT: v_mov_b32_e32 v0, 0
124- ; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
113+ ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, 0
114+ ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v2, v[1:2]
115+ ; GFX10-NEXT: v_mov_b32_e32 v2, 0
116+ ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
125117; GFX10-NEXT: s_endpgm
126118;
127119; GFX11-LABEL: v_mul_i64_zext_src0:
@@ -135,14 +127,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
135127; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
136128; GFX11-NEXT: s_waitcnt lgkmcnt(0)
137129; GFX11-NEXT: global_load_b32 v5, v1, s[2:3]
138- ; GFX11-NEXT: global_load_b64 v[0:1 ], v0, s[4:5]
130+ ; GFX11-NEXT: global_load_b64 v[1:2 ], v0, s[4:5]
139131; GFX11-NEXT: s_waitcnt vmcnt(0)
140- ; GFX11-NEXT: v_mad_u64_u32 v[2:3 ], null, v5, v0 , 0
132+ ; GFX11-NEXT: v_mad_u64_u32 v[0:1 ], null, v5, v1 , 0
141133; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
142- ; GFX11-NEXT: v_mov_b32_e32 v0, v3
143- ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
144- ; GFX11-NEXT: v_mov_b32_e32 v0, 0
145- ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
134+ ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v2, v[1:2]
135+ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v3
136+ ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
146137; GFX11-NEXT: s_endpgm
147138 %tid = call i32 @llvm.amdgcn.workitem.id.x ()
148139 %gep.a = getelementptr inbounds i32 , ptr addrspace (1 ) %aptr , i32 %tid
@@ -209,18 +200,16 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
209200; GFX10-NEXT: s_clause 0x1
210201; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
211202; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
212- ; GFX10-NEXT: v_lshlrev_b32_e32 v2 , 3, v0
203+ ; GFX10-NEXT: v_lshlrev_b32_e32 v0 , 3, v0
213204; GFX10-NEXT: s_waitcnt lgkmcnt(0)
214205; GFX10-NEXT: s_clause 0x1
215- ; GFX10-NEXT: global_load_dword v4, v2 , s[2:3]
216- ; GFX10-NEXT: global_load_dwordx2 v[0:1 ], v2 , s[6:7]
206+ ; GFX10-NEXT: global_load_dword v3, v0 , s[2:3]
207+ ; GFX10-NEXT: global_load_dwordx2 v[1:2 ], v0 , s[6:7]
217208; GFX10-NEXT: s_waitcnt vmcnt(0)
218- ; GFX10-NEXT: v_mad_u64_u32 v[2:3], s2, v4, v0, 0
219- ; GFX10-NEXT: v_mov_b32_e32 v0, v3
220- ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
221- ; GFX10-NEXT: v_mov_b32_e32 v3, v0
222- ; GFX10-NEXT: v_mov_b32_e32 v0, 0
223- ; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
209+ ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v3, v1, 0
210+ ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v3, v2, v[1:2]
211+ ; GFX10-NEXT: v_mov_b32_e32 v2, 0
212+ ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
224213; GFX10-NEXT: s_endpgm
225214;
226215; GFX11-LABEL: v_mul_i64_masked_src0_hi:
@@ -234,14 +223,13 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
234223; GFX11-NEXT: s_waitcnt lgkmcnt(0)
235224; GFX11-NEXT: s_clause 0x1
236225; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
237- ; GFX11-NEXT: global_load_b64 v[0:1 ], v0, s[4:5]
226+ ; GFX11-NEXT: global_load_b64 v[1:2 ], v0, s[4:5]
238227; GFX11-NEXT: s_waitcnt vmcnt(0)
239- ; GFX11-NEXT: v_mad_u64_u32 v[2:3 ], null, v5, v0 , 0
228+ ; GFX11-NEXT: v_mad_u64_u32 v[0:1 ], null, v5, v1 , 0
240229; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
241- ; GFX11-NEXT: v_mov_b32_e32 v0, v3
242- ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
243- ; GFX11-NEXT: v_mov_b32_e32 v0, 0
244- ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
230+ ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v2, v[1:2]
231+ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, v3
232+ ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
245233; GFX11-NEXT: s_endpgm
246234 %tid = call i32 @llvm.amdgcn.workitem.id.x ()
247235 %gep.a = getelementptr inbounds i64 , ptr addrspace (1 ) %aptr , i32 %tid
@@ -389,22 +377,20 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
389377; GFX10-NEXT: s_clause 0x1
390378; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
391379; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
392- ; GFX10-NEXT: v_lshlrev_b32_e32 v4 , 3, v0
380+ ; GFX10-NEXT: v_lshlrev_b32_e32 v0 , 3, v0
393381; GFX10-NEXT: s_waitcnt lgkmcnt(0)
394382; GFX10-NEXT: s_clause 0x1
395- ; GFX10-NEXT: global_load_dwordx2 v[0:1 ], v4 , s[2:3]
396- ; GFX10-NEXT: global_load_dwordx2 v[2:3 ], v4 , s[6:7]
383+ ; GFX10-NEXT: global_load_dwordx2 v[1:2 ], v0 , s[2:3]
384+ ; GFX10-NEXT: global_load_dwordx2 v[3:4 ], v0 , s[6:7]
397385; GFX10-NEXT: s_waitcnt vmcnt(1)
398- ; GFX10-NEXT: v_and_b32_e32 v6 , 0xfff00000, v0
386+ ; GFX10-NEXT: v_and_b32_e32 v5 , 0xfff00000, v1
399387; GFX10-NEXT: s_waitcnt vmcnt(0)
400- ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v6, v2, 0
401- ; GFX10-NEXT: v_mov_b32_e32 v0, v5
402- ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1]
403- ; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1
404- ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6]
405- ; GFX10-NEXT: v_mov_b32_e32 v5, v0
406- ; GFX10-NEXT: v_mov_b32_e32 v0, 0
407- ; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[0:1]
388+ ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v5, v3, 0
389+ ; GFX10-NEXT: v_mad_u64_u32 v[4:5], s2, v5, v4, v[1:2]
390+ ; GFX10-NEXT: v_and_b32_e32 v1, 0xf00f, v2
391+ ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, v1, v3, v[4:5]
392+ ; GFX10-NEXT: v_mov_b32_e32 v2, 0
393+ ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
408394; GFX10-NEXT: s_endpgm
409395;
410396; GFX11-LABEL: v_mul_i64_partially_masked_src0:
@@ -414,24 +400,22 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
414400; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
415401; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
416402; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
417- ; GFX11-NEXT: v_lshlrev_b32_e32 v2 , 3, v0
403+ ; GFX11-NEXT: v_lshlrev_b32_e32 v0 , 3, v0
418404; GFX11-NEXT: s_waitcnt lgkmcnt(0)
419405; GFX11-NEXT: s_clause 0x1
420- ; GFX11-NEXT: global_load_b64 v[0:1 ], v2 , s[2:3]
421- ; GFX11-NEXT: global_load_b64 v[2:3 ], v2 , s[4:5]
406+ ; GFX11-NEXT: global_load_b64 v[1:2 ], v0 , s[2:3]
407+ ; GFX11-NEXT: global_load_b64 v[3:4 ], v0 , s[4:5]
422408; GFX11-NEXT: s_waitcnt vmcnt(1)
423- ; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
409+ ; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v1
424410; GFX11-NEXT: s_waitcnt vmcnt(0)
425411; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
426- ; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0
427- ; GFX11-NEXT: v_mov_b32_e32 v0, v5
428- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
429- ; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
430- ; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1
431- ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
412+ ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v3, 0
413+ ; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v4, v[1:2]
414+ ; GFX11-NEXT: v_and_b32_e32 v4, 0xf00f, v2
432415; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
433- ; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
434- ; GFX11-NEXT: global_store_b64 v0, v[4:5], s[0:1]
416+ ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v4, v3, v[5:6]
417+ ; GFX11-NEXT: v_mov_b32_e32 v2, 0
418+ ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
435419; GFX11-NEXT: s_endpgm
436420 %tid = call i32 @llvm.amdgcn.workitem.id.x ()
437421 %gep.a = getelementptr inbounds i64 , ptr addrspace (1 ) %aptr , i32 %tid
0 commit comments