@@ -110,8 +110,9 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
110110; GFX11-NEXT: s_waitcnt lgkmcnt(0)
111111; GFX11-NEXT: s_add_u32 s2, s2, s4
112112; GFX11-NEXT: s_addc_u32 s3, s3, s5
113- ; GFX11-NEXT: v_mov_b32_e32 v0, s2
113+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
114114; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
115+ ; GFX11-NEXT: v_mov_b32_e32 v0, s2
115116; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
116117; GFX11-NEXT: s_endpgm
117118;
@@ -222,8 +223,9 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
222223; GFX11-NEXT: s_waitcnt lgkmcnt(0)
223224; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876
224225; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234
225- ; GFX11-NEXT: v_mov_b32_e32 v0, s2
226+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
226227; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
228+ ; GFX11-NEXT: v_mov_b32_e32 v0, s2
227229; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
228230; GFX11-NEXT: s_endpgm
229231;
@@ -1087,8 +1089,9 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
10871089; GFX11-NEXT: s_waitcnt lgkmcnt(0)
10881090; GFX11-NEXT: s_sub_u32 s2, s2, s4
10891091; GFX11-NEXT: s_subb_u32 s3, s3, s5
1090- ; GFX11-NEXT: v_mov_b32_e32 v0, s2
1092+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
10911093; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
1094+ ; GFX11-NEXT: v_mov_b32_e32 v0, s2
10921095; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
10931096; GFX11-NEXT: s_endpgm
10941097;
@@ -1199,8 +1202,9 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
11991202; GFX11-NEXT: s_waitcnt lgkmcnt(0)
12001203; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2
12011204; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3
1202- ; GFX11-NEXT: v_mov_b32_e32 v0, s2
1205+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
12031206; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
1207+ ; GFX11-NEXT: v_mov_b32_e32 v0, s2
12041208; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
12051209; GFX11-NEXT: s_endpgm
12061210;
@@ -2420,8 +2424,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
24202424; GFX9-NEXT: s_cmp_ge_u32 s2, s6
24212425; GFX9-NEXT: s_cselect_b32 s8, s4, s3
24222426; GFX9-NEXT: .LBB16_3:
2423- ; GFX9-NEXT: v_mov_b32_e32 v0, s8
24242427; GFX9-NEXT: v_mov_b32_e32 v2, 0
2428+ ; GFX9-NEXT: v_mov_b32_e32 v0, s8
24252429; GFX9-NEXT: v_mov_b32_e32 v1, s9
24262430; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
24272431; GFX9-NEXT: s_endpgm
@@ -2573,8 +2577,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
25732577; GFX1010-NEXT: s_cmp_ge_u32 s2, s6
25742578; GFX1010-NEXT: s_cselect_b32 s8, s4, s3
25752579; GFX1010-NEXT: .LBB16_3:
2576- ; GFX1010-NEXT: v_mov_b32_e32 v0, s8
25772580; GFX1010-NEXT: v_mov_b32_e32 v2, 0
2581+ ; GFX1010-NEXT: v_mov_b32_e32 v0, s8
25782582; GFX1010-NEXT: v_mov_b32_e32 v1, s9
25792583; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
25802584; GFX1010-NEXT: s_endpgm
@@ -2726,8 +2730,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
27262730; GFX1030W32-NEXT: s_cmp_ge_u32 s2, s4
27272731; GFX1030W32-NEXT: s_cselect_b32 s8, s5, s3
27282732; GFX1030W32-NEXT: .LBB16_3:
2729- ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8
27302733; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
2734+ ; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8
27312735; GFX1030W32-NEXT: v_mov_b32_e32 v1, s9
27322736; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
27332737; GFX1030W32-NEXT: s_endpgm
@@ -2878,8 +2882,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
28782882; GFX1030W64-NEXT: s_cmp_ge_u32 s2, s4
28792883; GFX1030W64-NEXT: s_cselect_b32 s6, s5, s3
28802884; GFX1030W64-NEXT: .LBB16_3:
2881- ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
28822885; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
2886+ ; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
28832887; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7
28842888; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
28852889; GFX1030W64-NEXT: s_endpgm
@@ -3046,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
30463050; GFX11-NEXT: s_cmp_ge_u32 s2, s4
30473051; GFX11-NEXT: s_cselect_b32 s8, s5, s3
30483052; GFX11-NEXT: .LBB16_3:
3049- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3050- ; GFX11-NEXT: v_mov_b32_e32 v0, s8
30513053; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
3054+ ; GFX11-NEXT: v_mov_b32_e32 v0, s8
30523055; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
30533056; GFX11-NEXT: s_endpgm
30543057; GFX11-NEXT: .LBB16_4:
0 commit comments