flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc Source File

flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc Source File#

Composable Kernel: flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc Source File
flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3// clang-format off
4
// Define the CK_TILE_* macros before including this file to select the kernel variant.
// Every macro defined in this file is #undef'ed again by this file.
7
// Select the MFMA flavour used by this micro-kernel. The includer may define
// CK_TILE_FLATMM_UK_MFMA before including this file; when it does not, BF16 is used.
#ifndef CK_TILE_FLATMM_UK_MFMA
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#endif

// Each supported flavour provides the same three helper macros, all of which
// expand to string literals that are concatenated into the asm body below:
//   _UK_MFMA_                  mnemonic of the 16x16x16 MFMA instruction
//   _UK_PK_CVT_(x0_, x1_, y_)  asm text converting the two f32 registers x0_/x1_
//                              into one packed 2x16-bit register y_
//   _UK_ATOMIC_ADD_            mnemonic of the packed atomic add issued by the asm body
//
// In both flavours _UK_PK_CVT_ writes y_ only in its last instruction, so y_
// may alias x0_ or x1_ (the asm body relies on this, e.g. y_ == x0_).
#if CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_BF16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_bf16"

// f32 x2 -> packed bf16.
// For each input x:
//   v_cmp_u_f32 x, x   sets s[36:37] for lanes where x is unordered with itself (NaN)
//   v_add3_u32         v50 = x + %[v_nan_lo] + 1
//   v_cndmask_b32      result = s[36:37] ? %[v_nan_hi] : v50   (v54 for x0_, v55 for x1_)
// v_perm_b32 then merges v55/v54 into y_ using the byte selector in s52
// (s52 is loaded with 0x07060302 at the start of the asm body).
// Scratch registers overwritten: v50, v54, v55, s[36:37].
// NOTE(review): %[v_nan_lo] / %[v_nan_hi] are operands supplied by the includer;
// presumably the bf16 rounding bias and the canonical NaN pattern - confirm there.
#define _UK_PK_CVT_(x0_, x1_, y_) \
    " v_cmp_u_f32 s[36:37], " x0_ ", " x0_ " \n" \
    " v_add3_u32 v50, " x0_ ", %[v_nan_lo], 1 \n" \
    " v_cndmask_b32 v54, v50, %[v_nan_hi], s[36:37] \n" \
    " v_cmp_u_f32 s[36:37], " x1_ ", " x1_ " \n" \
    " v_add3_u32 v50, " x1_ ", %[v_nan_lo], 1 \n" \
    " v_cndmask_b32 v55, v50, %[v_nan_hi], s[36:37] \n" \
    " v_perm_b32 " y_ ", v55, v54, s52 \n"

#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_bf16"

#elif CK_TILE_FLATMM_UK_MFMA == CK_TILE_FLATMM_UK_MFMA_FP16
#define _UK_MFMA_ "v_mfma_f32_16x16x16_f16"

// f32 x2 -> packed f16: convert x0_/x1_ into v54/v55, then pack both into y_.
// Scratch registers overwritten: v54, v55.
#define _UK_PK_CVT_(x0_, x1_, y_) \
    " v_cvt_f16_f32 v54, " x0_ " \n" \
    " v_cvt_f16_f32 v55, " x1_ " \n" \
    " v_pack_b32_f16 " y_ ", v54, v55 \n"

#define _UK_ATOMIC_ADD_ "global_atomic_pk_add_f16"

#else
// Without this branch an unsupported selection would leave the helper macros
// undefined and only fail later, inside the asm string concatenation.
#error "CK_TILE_FLATMM_UK_MFMA must be CK_TILE_FLATMM_UK_MFMA_BF16 or CK_TILE_FLATMM_UK_MFMA_FP16"
#endif
37
38";-------------------------------------------------------------\n"
39 " s_mov_b32 s52, 0x07060302 ; v_perm\n"
40 " s_mov_b64 s[38:39], exec ; save current exec\n"
41 " s_mov_b32 s8, %[s_res_o0] \n"
42 " s_mov_b32 s9, %[s_res_o1] \n"
43 " s_mov_b32 s12, %[s_res_b0] \n"
44 " s_mov_b32 s13, %[s_res_b1] \n"
45 " s_mov_b32 s14, %[s_res_b2] \n"
46 " s_mov_b32 s15, %[s_res_b3] \n"
47 " ds_read_b64 v[128:129], %[v_sld_y_os] offset:0 + %[sld_a_base] \n"
48 " ds_read_b64 v[130:131], %[v_sld_y_os] offset:128 + %[sld_a_base] \n"
49 " ds_read_b64 v[132:133], %[v_sld_y_os] offset:1024 + %[sld_a_base] \n"
50 " ds_read_b64 v[134:135], %[v_sld_y_os] offset:1152 + %[sld_a_base] \n"
51 " ds_read_b64 v[136:137], %[v_sld_y_os] offset:2048 + %[sld_a_base] \n"
52 " ds_read_b64 v[138:139], %[v_sld_y_os] offset:2176 + %[sld_a_base] \n"
53 " ds_read_b64 v[140:141], %[v_sld_y_os] offset:3072 + %[sld_a_base] \n"
54 " ds_read_b64 v[142:143], %[v_sld_y_os] offset:3200 + %[sld_a_base] \n"
55 " ds_read_b64 v[144:145], %[v_sld_y_os] offset:4096 + %[sld_a_base] \n"
56 " ds_read_b64 v[146:147], %[v_sld_y_os] offset:4224 + %[sld_a_base] \n"
57 " ds_read_b64 v[148:149], %[v_sld_y_os] offset:5120 + %[sld_a_base] \n"
58 " ds_read_b64 v[150:151], %[v_sld_y_os] offset:5248 + %[sld_a_base] \n"
59 " ds_read_b64 v[152:153], %[v_sld_y_os] offset:6144 + %[sld_a_base] \n"
60 " ds_read_b64 v[154:155], %[v_sld_y_os] offset:6272 + %[sld_a_base] \n"
61 " ds_read_b64 v[156:157], %[v_sld_y_os] offset:7168 + %[sld_a_base] \n"
62 " ds_read_b64 v[158:159], %[v_sld_y_os] offset:7296 + %[sld_a_base] \n"
63 " ds_read_b64 v[160:161], %[v_sld_y_os] offset:8192 + %[sld_a_base] \n"
64 " ds_read_b64 v[162:163], %[v_sld_y_os] offset:8320 + %[sld_a_base] \n"
65 " ds_read_b64 v[164:165], %[v_sld_y_os] offset:9216 + %[sld_a_base] \n"
66 " ds_read_b64 v[166:167], %[v_sld_y_os] offset:9344 + %[sld_a_base] \n"
67 " ds_read_b64 v[168:169], %[v_sld_y_os] offset:10240 + %[sld_a_base] \n"
68 " ds_read_b64 v[170:171], %[v_sld_y_os] offset:10368 + %[sld_a_base] \n"
69 " ds_read_b64 v[172:173], %[v_sld_y_os] offset:11264 + %[sld_a_base] \n"
70 " ds_read_b64 v[174:175], %[v_sld_y_os] offset:11392 + %[sld_a_base] \n"
71 " ds_read_b64 v[176:177], %[v_sld_y_os] offset:12288 + %[sld_a_base] \n"
72 " ds_read_b64 v[178:179], %[v_sld_y_os] offset:12416 + %[sld_a_base] \n"
73 " ds_read_b64 v[180:181], %[v_sld_y_os] offset:13312 + %[sld_a_base] \n"
74 " ds_read_b64 v[182:183], %[v_sld_y_os] offset:13440 + %[sld_a_base] \n"
75 " ds_read_b64 v[184:185], %[v_sld_y_os] offset:14336 + %[sld_a_base] \n"
76 " ds_read_b64 v[186:187], %[v_sld_y_os] offset:14464 + %[sld_a_base] \n"
77 " ds_read_b64 v[188:189], %[v_sld_y_os] offset:15360 + %[sld_a_base] \n"
78 " ds_read_b64 v[190:191], %[v_sld_y_os] offset:15488 + %[sld_a_base] \n"
79 " ds_read_b64 v[192:193], %[v_sld_y_os] offset:16384 + %[sld_a_base] \n"
80 " ds_read_b64 v[194:195], %[v_sld_y_os] offset:16512 + %[sld_a_base] \n"
81 " ds_read_b64 v[196:197], %[v_sld_y_os] offset:17408 + %[sld_a_base] \n"
82 " ds_read_b64 v[198:199], %[v_sld_y_os] offset:17536 + %[sld_a_base] \n"
83 " ds_read_b64 v[200:201], %[v_sld_y_os] offset:18432 + %[sld_a_base] \n"
84 " ds_read_b64 v[202:203], %[v_sld_y_os] offset:18560 + %[sld_a_base] \n"
85 " ds_read_b64 v[204:205], %[v_sld_y_os] offset:19456 + %[sld_a_base] \n"
86 " ds_read_b64 v[206:207], %[v_sld_y_os] offset:19584 + %[sld_a_base] \n"
87 " ds_read_b64 v[208:209], %[v_sld_y_os] offset:20480 + %[sld_a_base] \n"
88 " ds_read_b64 v[210:211], %[v_sld_y_os] offset:20608 + %[sld_a_base] \n"
89 " ds_read_b64 v[212:213], %[v_sld_y_os] offset:21504 + %[sld_a_base] \n"
90 " ds_read_b64 v[214:215], %[v_sld_y_os] offset:21632 + %[sld_a_base] \n"
91 " ds_read_b64 v[216:217], %[v_sld_y_os] offset:22528 + %[sld_a_base] \n"
92 " ds_read_b64 v[218:219], %[v_sld_y_os] offset:22656 + %[sld_a_base] \n"
93 " ds_read_b64 v[220:221], %[v_sld_y_os] offset:23552 + %[sld_a_base] \n"
94 " ds_read_b64 v[222:223], %[v_sld_y_os] offset:23680 + %[sld_a_base] \n"
95 " ds_read_b64 v[224:225], %[v_sld_y_os] offset:24576 + %[sld_a_base] \n"
96 " ds_read_b64 v[226:227], %[v_sld_y_os] offset:24704 + %[sld_a_base] \n"
97 " ds_read_b64 v[228:229], %[v_sld_y_os] offset:25600 + %[sld_a_base] \n"
98 " ds_read_b64 v[230:231], %[v_sld_y_os] offset:25728 + %[sld_a_base] \n"
99 " ds_read_b64 v[232:233], %[v_sld_y_os] offset:26624 + %[sld_a_base] \n"
100 " ds_read_b64 v[234:235], %[v_sld_y_os] offset:26752 + %[sld_a_base] \n"
101 " ds_read_b64 v[236:237], %[v_sld_y_os] offset:27648 + %[sld_a_base] \n"
102 " ds_read_b64 v[238:239], %[v_sld_y_os] offset:27776 + %[sld_a_base] \n"
103 " ds_read_b64 v[240:241], %[v_sld_y_os] offset:28672 + %[sld_a_base] \n"
104 " ds_read_b64 v[242:243], %[v_sld_y_os] offset:28800 + %[sld_a_base] \n"
105 " ds_read_b64 v[244:245], %[v_sld_y_os] offset:29696 + %[sld_a_base] \n"
106 " ds_read_b64 v[246:247], %[v_sld_y_os] offset:29824 + %[sld_a_base] \n"
107 " ds_read_b64 v[248:249], %[v_sld_y_os] offset:30720 + %[sld_a_base] \n"
108 " ds_read_b64 v[250:251], %[v_sld_y_os] offset:30848 + %[sld_a_base] \n"
109 " ds_read_b64 v[252:253], %[v_sld_y_os] offset:31744 + %[sld_a_base] \n"
110 " ds_read_b64 v[254:255], %[v_sld_y_os] offset:31872 + %[sld_a_base] \n"
111 " s_waitcnt 0 \n"
112 " buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen \n"
113 " buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen offset:1024 \n"
114 " buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen offset:2048 \n"
115 " buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen offset:3072 \n"
116 " buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen \n"
117 " buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen offset:1024 \n"
118 " buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen offset:2048 \n"
119 " buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen offset:3072 \n"
120 " buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen \n"
121 " buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen offset:1024 \n"
122 " buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen offset:2048 \n"
123 " buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen offset:3072 \n"
124 " buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen \n"
125 " buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen offset:1024 \n"
126 " buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen offset:2048 \n"
127 " buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen offset:3072 \n"
128 " buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen \n"
129 " buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen offset:1024 \n"
130 " buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen offset:2048 \n"
131 " buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen offset:3072 \n"
132 " buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen \n"
133 " buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen offset:1024 \n"
134 " buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen offset:2048 \n"
135 " buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen offset:3072 \n"
136 " buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen \n"
137 " buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen offset:1024 \n"
138 " buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen offset:2048 \n"
139 " buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen offset:3072 \n"
140 " buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen \n"
141 " buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen offset:1024 \n"
142 " buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen offset:2048 \n"
143 " buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen offset:3072 \n"
144 " s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
145 " s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
146 " s_add_u32 s12, s86, s12 \n"
147 " s_addc_u32 s13, 0, s13 \n"
148 " s_waitcnt 0 \n"
149 "L_start%=: \n"
150 " s_waitcnt vmcnt(32) \n"
151 " s_barrier \n" _UK_MFMA_
152 " [%[c0], %[c1], %[c2], %[c3]], acc[0:1], v[128:129], 0 \n"
153 " buffer_load_dwordx4 acc[128:131], %[v_os_b0], s[12:15], 0 offen \n" _UK_MFMA_
154 " [%[c0], %[c1], %[c2], %[c3]], acc[2:3], v[130:131], [%[c0], %[c1], %[c2], %[c3]] "
155 "\n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[4:5], v[132:133], [%[c0], %[c1], %[c2], "
156 "%[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[6:7], v[134:135], "
157 "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
158 " [%[c0], %[c1], %[c2], %[c3]], acc[8:9], v[136:137], [%[c0], %[c1], %[c2], %[c3]] \n"
159 " buffer_load_dwordx4 acc[132:135], %[v_os_b0], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
160 " [%[c0], %[c1], %[c2], %[c3]], acc[10:11], v[138:139], [%[c0], %[c1], %[c2], %[c3]] "
161 "\n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[12:13], v[140:141], [%[c0], %[c1], %[c2], "
162 "%[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[14:15], v[142:143], "
163 "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
164 " [%[c4], %[c5], %[c6], %[c7]], acc[0:1], v[192:193], 0 \n"
165 " buffer_load_dwordx4 acc[136:139], %[v_os_b0], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
166 " [%[c4], %[c5], %[c6], %[c7]], acc[2:3], v[194:195], [%[c4], %[c5], %[c6], %[c7]] "
167 "\n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[4:5], v[196:197], [%[c4], %[c5], %[c6], "
168 "%[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[6:7], v[198:199], "
169 "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
170 " [%[c4], %[c5], %[c6], %[c7]], acc[8:9], v[200:201], [%[c4], %[c5], %[c6], %[c7]] \n"
171 " buffer_load_dwordx4 acc[140:143], %[v_os_b0], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
172 " [%[c4], %[c5], %[c6], %[c7]], acc[10:11], v[202:203], [%[c4], %[c5], %[c6], %[c7]] "
173 "\n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[12:13], v[204:205], [%[c4], %[c5], %[c6], "
174 "%[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[14:15], v[206:207], "
175 "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
176 " [%[c8], %[c9], %[c10], %[c11]], acc[16:17], v[128:129], 0 \n"
177 " buffer_load_dwordx4 acc[144:147], %[v_os_b1], s[12:15], 0 offen \n" _UK_MFMA_
178 " [%[c8], %[c9], %[c10], %[c11]], acc[18:19], v[130:131], [%[c8], %[c9], %[c10], %[c11]] "
179 "\n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[20:21], v[132:133], [%[c8], %[c9], "
180 "%[c10], %[c11]] \n" _UK_MFMA_
181 " [%[c8], %[c9], %[c10], %[c11]], acc[22:23], v[134:135], [%[c8], %[c9], %[c10], %[c11]] "
182 "\n" _UK_MFMA_
183 " [%[c8], %[c9], %[c10], %[c11]], acc[24:25], v[136:137], [%[c8], %[c9], %[c10], %[c11]] \n"
184 " buffer_load_dwordx4 acc[148:151], %[v_os_b1], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
185 " [%[c8], %[c9], %[c10], %[c11]], acc[26:27], v[138:139], [%[c8], %[c9], %[c10], %[c11]] "
186 "\n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[28:29], v[140:141], [%[c8], %[c9], "
187 "%[c10], %[c11]] \n" _UK_MFMA_
188 " [%[c8], %[c9], %[c10], %[c11]], acc[30:31], v[142:143], [%[c8], %[c9], %[c10], %[c11]] "
189 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[16:17], v[192:193], 0 \n"
190 " buffer_load_dwordx4 acc[152:155], %[v_os_b1], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
191 " [%[c12], %[c13], %[c14], %[c15]], acc[18:19], v[194:195], [%[c12], %[c13], %[c14], %[c15]] "
192 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[20:21], v[196:197], [%[c12], %[c13], "
193 "%[c14], %[c15]] \n" _UK_MFMA_
194 " [%[c12], %[c13], %[c14], %[c15]], acc[22:23], v[198:199], [%[c12], %[c13], %[c14], %[c15]] "
195 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[24:25], v[200:201], [%[c12], %[c13], "
196 "%[c14], %[c15]] \n"
197 " buffer_load_dwordx4 acc[156:159], %[v_os_b1], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
198 " [%[c12], %[c13], %[c14], %[c15]], acc[26:27], v[202:203], [%[c12], %[c13], %[c14], %[c15]] "
199 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[28:29], v[204:205], [%[c12], %[c13], "
200 "%[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[30:31], "
201 "v[206:207], [%[c12], %[c13], %[c14], %[c15]] \n"
202 " s_waitcnt vmcnt(32) \n" _UK_MFMA_
203 " [%[c0], %[c1], %[c2], %[c3]], acc[32:33], v[144:145], [%[c0], %[c1], %[c2], %[c3]] \n"
204 " buffer_load_dwordx4 acc[160:163], %[v_os_b2], s[12:15], 0 offen \n" _UK_MFMA_
205 " [%[c0], %[c1], %[c2], %[c3]], acc[34:35], v[146:147], [%[c0], %[c1], %[c2], %[c3]] "
206 "\n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[36:37], v[148:149], [%[c0], %[c1], %[c2], "
207 "%[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[38:39], v[150:151], "
208 "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
209 " [%[c0], %[c1], %[c2], %[c3]], acc[40:41], v[152:153], [%[c0], %[c1], %[c2], %[c3]] \n"
210 " buffer_load_dwordx4 acc[164:167], %[v_os_b2], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
211 " [%[c0], %[c1], %[c2], %[c3]], acc[42:43], v[154:155], [%[c0], %[c1], %[c2], %[c3]] "
212 "\n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[44:45], v[156:157], [%[c0], %[c1], %[c2], "
213 "%[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[46:47], v[158:159], "
214 "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
215 " [%[c4], %[c5], %[c6], %[c7]], acc[32:33], v[208:209], [%[c4], %[c5], %[c6], %[c7]] \n"
216 " buffer_load_dwordx4 acc[168:171], %[v_os_b2], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
217 " [%[c4], %[c5], %[c6], %[c7]], acc[34:35], v[210:211], [%[c4], %[c5], %[c6], %[c7]] "
218 "\n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[36:37], v[212:213], [%[c4], %[c5], %[c6], "
219 "%[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[38:39], v[214:215], "
220 "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
221 " [%[c4], %[c5], %[c6], %[c7]], acc[40:41], v[216:217], [%[c4], %[c5], %[c6], %[c7]] \n"
222 " buffer_load_dwordx4 acc[172:175], %[v_os_b2], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
223 " [%[c4], %[c5], %[c6], %[c7]], acc[42:43], v[218:219], [%[c4], %[c5], %[c6], %[c7]] "
224 "\n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[44:45], v[220:221], [%[c4], %[c5], %[c6], "
225 "%[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[46:47], v[222:223], "
226 "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
227 " [%[c8], %[c9], %[c10], %[c11]], acc[48:49], v[144:145], [%[c8], %[c9], %[c10], %[c11]] \n"
228 " buffer_load_dwordx4 acc[176:179], %[v_os_b3], s[12:15], 0 offen \n" _UK_MFMA_
229 " [%[c8], %[c9], %[c10], %[c11]], acc[50:51], v[146:147], [%[c8], %[c9], %[c10], %[c11]] "
230 "\n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[52:53], v[148:149], [%[c8], %[c9], "
231 "%[c10], %[c11]] \n" _UK_MFMA_
232 " [%[c8], %[c9], %[c10], %[c11]], acc[54:55], v[150:151], [%[c8], %[c9], %[c10], %[c11]] "
233 "\n" _UK_MFMA_
234 " [%[c8], %[c9], %[c10], %[c11]], acc[56:57], v[152:153], [%[c8], %[c9], %[c10], %[c11]] \n"
235 " buffer_load_dwordx4 acc[180:183], %[v_os_b3], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
236 " [%[c8], %[c9], %[c10], %[c11]], acc[58:59], v[154:155], [%[c8], %[c9], %[c10], %[c11]] "
237 "\n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[60:61], v[156:157], [%[c8], %[c9], "
238 "%[c10], %[c11]] \n" _UK_MFMA_
239 " [%[c8], %[c9], %[c10], %[c11]], acc[62:63], v[158:159], [%[c8], %[c9], %[c10], %[c11]] "
240 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[48:49], v[208:209], [%[c12], %[c13], "
241 "%[c14], %[c15]] \n"
242 " buffer_load_dwordx4 acc[184:187], %[v_os_b3], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
243 " [%[c12], %[c13], %[c14], %[c15]], acc[50:51], v[210:211], [%[c12], %[c13], %[c14], %[c15]] "
244 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[52:53], v[212:213], [%[c12], %[c13], "
245 "%[c14], %[c15]] \n" _UK_MFMA_
246 " [%[c12], %[c13], %[c14], %[c15]], acc[54:55], v[214:215], [%[c12], %[c13], %[c14], %[c15]] "
247 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[56:57], v[216:217], [%[c12], %[c13], "
248 "%[c14], %[c15]] \n"
249 " buffer_load_dwordx4 acc[188:191], %[v_os_b3], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
250 " [%[c12], %[c13], %[c14], %[c15]], acc[58:59], v[218:219], [%[c12], %[c13], %[c14], %[c15]] "
251 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[60:61], v[220:221], [%[c12], %[c13], "
252 "%[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[62:63], "
253 "v[222:223], [%[c12], %[c13], %[c14], %[c15]] \n"
254 " s_waitcnt vmcnt(32) \n" _UK_MFMA_
255 " [%[c0], %[c1], %[c2], %[c3]], acc[64:65], v[160:161], [%[c0], %[c1], %[c2], %[c3]] \n"
256 " buffer_load_dwordx4 acc[192:195], %[v_os_b4], s[12:15], 0 offen \n" _UK_MFMA_
257 " [%[c0], %[c1], %[c2], %[c3]], acc[66:67], v[162:163], [%[c0], %[c1], %[c2], %[c3]] "
258 "\n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[68:69], v[164:165], [%[c0], %[c1], %[c2], "
259 "%[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[70:71], v[166:167], "
260 "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
261 " [%[c0], %[c1], %[c2], %[c3]], acc[72:73], v[168:169], [%[c0], %[c1], %[c2], %[c3]] \n"
262 " buffer_load_dwordx4 acc[196:199], %[v_os_b4], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
263 " [%[c0], %[c1], %[c2], %[c3]], acc[74:75], v[170:171], [%[c0], %[c1], %[c2], %[c3]] "
264 "\n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[76:77], v[172:173], [%[c0], %[c1], %[c2], "
265 "%[c3]] \n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[78:79], v[174:175], "
266 "[%[c0], %[c1], %[c2], %[c3]] \n" _UK_MFMA_
267 " [%[c4], %[c5], %[c6], %[c7]], acc[64:65], v[224:225], [%[c4], %[c5], %[c6], %[c7]] \n"
268 " buffer_load_dwordx4 acc[200:203], %[v_os_b4], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
269 " [%[c4], %[c5], %[c6], %[c7]], acc[66:67], v[226:227], [%[c4], %[c5], %[c6], %[c7]] "
270 "\n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[68:69], v[228:229], [%[c4], %[c5], %[c6], "
271 "%[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[70:71], v[230:231], "
272 "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
273 " [%[c4], %[c5], %[c6], %[c7]], acc[72:73], v[232:233], [%[c4], %[c5], %[c6], %[c7]] \n"
274 " buffer_load_dwordx4 acc[204:207], %[v_os_b4], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
275 " [%[c4], %[c5], %[c6], %[c7]], acc[74:75], v[234:235], [%[c4], %[c5], %[c6], %[c7]] "
276 "\n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[76:77], v[236:237], [%[c4], %[c5], %[c6], "
277 "%[c7]] \n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[78:79], v[238:239], "
278 "[%[c4], %[c5], %[c6], %[c7]] \n" _UK_MFMA_
279 " [%[c8], %[c9], %[c10], %[c11]], acc[80:81], v[160:161], [%[c8], %[c9], %[c10], %[c11]] \n"
280 " buffer_load_dwordx4 acc[208:211], %[v_os_b5], s[12:15], 0 offen \n" _UK_MFMA_
281 " [%[c8], %[c9], %[c10], %[c11]], acc[82:83], v[162:163], [%[c8], %[c9], %[c10], %[c11]] "
282 "\n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[84:85], v[164:165], [%[c8], %[c9], "
283 "%[c10], %[c11]] \n" _UK_MFMA_
284 " [%[c8], %[c9], %[c10], %[c11]], acc[86:87], v[166:167], [%[c8], %[c9], %[c10], %[c11]] "
285 "\n" _UK_MFMA_
286 " [%[c8], %[c9], %[c10], %[c11]], acc[88:89], v[168:169], [%[c8], %[c9], %[c10], %[c11]] \n"
287 " buffer_load_dwordx4 acc[212:215], %[v_os_b5], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
288 " [%[c8], %[c9], %[c10], %[c11]], acc[90:91], v[170:171], [%[c8], %[c9], %[c10], %[c11]] "
289 "\n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[92:93], v[172:173], [%[c8], %[c9], "
290 "%[c10], %[c11]] \n" _UK_MFMA_
291 " [%[c8], %[c9], %[c10], %[c11]], acc[94:95], v[174:175], [%[c8], %[c9], %[c10], %[c11]] "
292 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[80:81], v[224:225], [%[c12], %[c13], "
293 "%[c14], %[c15]] \n"
294 " buffer_load_dwordx4 acc[216:219], %[v_os_b5], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
295 " [%[c12], %[c13], %[c14], %[c15]], acc[82:83], v[226:227], [%[c12], %[c13], %[c14], %[c15]] "
296 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[84:85], v[228:229], [%[c12], %[c13], "
297 "%[c14], %[c15]] \n" _UK_MFMA_
298 " [%[c12], %[c13], %[c14], %[c15]], acc[86:87], v[230:231], [%[c12], %[c13], %[c14], %[c15]] "
299 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[88:89], v[232:233], [%[c12], %[c13], "
300 "%[c14], %[c15]] \n"
301 " buffer_load_dwordx4 acc[220:223], %[v_os_b5], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
302 " [%[c12], %[c13], %[c14], %[c15]], acc[90:91], v[234:235], [%[c12], %[c13], %[c14], %[c15]] "
303 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[92:93], v[236:237], [%[c12], %[c13], "
304 "%[c14], %[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[94:95], "
305 "v[238:239], [%[c12], %[c13], %[c14], %[c15]] \n"
306 " s_waitcnt vmcnt(32) \n" _UK_MFMA_
307 " [%[c0], %[c1], %[c2], %[c3]], acc[96:97], v[176:177], [%[c0], %[c1], %[c2], %[c3]] \n"
308 " buffer_load_dwordx4 acc[224:227], %[v_os_b6], s[12:15], 0 offen \n" _UK_MFMA_
309 " [%[c0], %[c1], %[c2], %[c3]], acc[98:99], v[178:179], [%[c0], %[c1], %[c2], %[c3]] "
310 "\n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[100:101], v[180:181], [%[c0], %[c1], "
311 "%[c2], %[c3]] \n" _UK_MFMA_
312 " [%[c0], %[c1], %[c2], %[c3]], acc[102:103], v[182:183], [%[c0], %[c1], %[c2], %[c3]] "
313 "\n" _UK_MFMA_
314 " [%[c0], %[c1], %[c2], %[c3]], acc[104:105], v[184:185], [%[c0], %[c1], %[c2], %[c3]] \n"
315 " buffer_load_dwordx4 acc[228:231], %[v_os_b6], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
316 " [%[c0], %[c1], %[c2], %[c3]], acc[106:107], v[186:187], [%[c0], %[c1], %[c2], %[c3]] "
317 "\n" _UK_MFMA_ " [%[c0], %[c1], %[c2], %[c3]], acc[108:109], v[188:189], [%[c0], %[c1], "
318 "%[c2], %[c3]] \n" _UK_MFMA_
319 " [%[c0], %[c1], %[c2], %[c3]], acc[110:111], v[190:191], [%[c0], %[c1], %[c2], %[c3]] "
320 "\n" _UK_MFMA_
321 " [%[c4], %[c5], %[c6], %[c7]], acc[96:97], v[240:241], [%[c4], %[c5], %[c6], %[c7]] \n"
322 " buffer_load_dwordx4 acc[232:235], %[v_os_b6], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
323 " [%[c4], %[c5], %[c6], %[c7]], acc[98:99], v[242:243], [%[c4], %[c5], %[c6], %[c7]] "
324 "\n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[100:101], v[244:245], [%[c4], %[c5], "
325 "%[c6], %[c7]] \n" _UK_MFMA_
326 " [%[c4], %[c5], %[c6], %[c7]], acc[102:103], v[246:247], [%[c4], %[c5], %[c6], %[c7]] "
327 "\n" _UK_MFMA_
328 " [%[c4], %[c5], %[c6], %[c7]], acc[104:105], v[248:249], [%[c4], %[c5], %[c6], %[c7]] \n"
329 " buffer_load_dwordx4 acc[236:239], %[v_os_b6], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
330 " [%[c4], %[c5], %[c6], %[c7]], acc[106:107], v[250:251], [%[c4], %[c5], %[c6], %[c7]] "
331 "\n" _UK_MFMA_ " [%[c4], %[c5], %[c6], %[c7]], acc[108:109], v[252:253], [%[c4], %[c5], "
332 "%[c6], %[c7]] \n" _UK_MFMA_
333 " [%[c4], %[c5], %[c6], %[c7]], acc[110:111], v[254:255], [%[c4], %[c5], %[c6], %[c7]] "
334 "\n" _UK_MFMA_
335 " [%[c8], %[c9], %[c10], %[c11]], acc[112:113], v[176:177], [%[c8], %[c9], %[c10], %[c11]] \n"
336 " buffer_load_dwordx4 acc[240:243], %[v_os_b7], s[12:15], 0 offen \n" _UK_MFMA_
337 " [%[c8], %[c9], %[c10], %[c11]], acc[114:115], v[178:179], [%[c8], %[c9], %[c10], %[c11]] "
338 "\n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[116:117], v[180:181], [%[c8], %[c9], "
339 "%[c10], %[c11]] \n" _UK_MFMA_
340 " [%[c8], %[c9], %[c10], %[c11]], acc[118:119], v[182:183], [%[c8], %[c9], %[c10], %[c11]] "
341 "\n" _UK_MFMA_
342 " [%[c8], %[c9], %[c10], %[c11]], acc[120:121], v[184:185], [%[c8], %[c9], %[c10], %[c11]] \n"
343 " buffer_load_dwordx4 acc[244:247], %[v_os_b7], s[12:15], 0 offen offset:1024 \n" _UK_MFMA_
344 " [%[c8], %[c9], %[c10], %[c11]], acc[122:123], v[186:187], [%[c8], %[c9], %[c10], %[c11]] "
345 "\n" _UK_MFMA_ " [%[c8], %[c9], %[c10], %[c11]], acc[124:125], v[188:189], [%[c8], %[c9], "
346 "%[c10], %[c11]] \n" _UK_MFMA_
347 " [%[c8], %[c9], %[c10], %[c11]], acc[126:127], v[190:191], [%[c8], %[c9], %[c10], %[c11]] "
348 "\n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[112:113], v[240:241], [%[c12], %[c13], "
349 "%[c14], %[c15]] \n"
350 " buffer_load_dwordx4 acc[248:251], %[v_os_b7], s[12:15], 0 offen offset:2048 \n" _UK_MFMA_
351 " [%[c12], %[c13], %[c14], %[c15]], acc[114:115], v[242:243], [%[c12], %[c13], %[c14], "
352 "%[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[116:117], v[244:245], [%[c12], "
353 "%[c13], %[c14], %[c15]] \n" _UK_MFMA_
354 " [%[c12], %[c13], %[c14], %[c15]], acc[118:119], v[246:247], [%[c12], %[c13], %[c14], "
355 "%[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[120:121], v[248:249], [%[c12], "
356 "%[c13], %[c14], %[c15]] \n"
357 " buffer_load_dwordx4 acc[252:255], %[v_os_b7], s[12:15], 0 offen offset:3072 \n" _UK_MFMA_
358 " [%[c12], %[c13], %[c14], %[c15]], acc[122:123], v[250:251], [%[c12], %[c13], %[c14], "
359 "%[c15]] \n" _UK_MFMA_ " [%[c12], %[c13], %[c14], %[c15]], acc[124:125], v[252:253], [%[c12], "
360 "%[c13], %[c14], %[c15]] \n" _UK_MFMA_
361 " [%[c12], %[c13], %[c14], %[c15]], acc[126:127], v[254:255], [%[c12], %[c13], %[c14], "
362 "%[c15]]\n"
363 " v_mul_f32 %[c0], %[scale_0], %[c0] \n"
364 " v_mul_f32 %[c1], %[scale_0], %[c1] \n"
365 " v_mul_f32 %[c2], %[scale_0], %[c2] \n"
366 " v_mul_f32 %[c3], %[scale_0], %[c3] \n"
367 " v_mul_f32 %[c4], %[scale_1], %[c4] \n"
368 " v_mul_f32 %[c5], %[scale_1], %[c5] \n"
369 " v_mul_f32 %[c6], %[scale_1], %[c6] \n"
370 " v_mul_f32 %[c7], %[scale_1], %[c7] \n"
371 " v_mul_f32 %[c8], %[scale_0], %[c8] \n"
372 " v_mul_f32 %[c9], %[scale_0], %[c9] \n"
373 " v_mul_f32 %[c10], %[scale_0], %[c10] \n"
374 " v_mul_f32 %[c11], %[scale_0], %[c11] \n"
375 " v_mul_f32 %[c12], %[scale_1], %[c12] \n"
376 " v_mul_f32 %[c13], %[scale_1], %[c13] \n"
377 " v_mul_f32 %[c14], %[scale_1], %[c14] \n"
378 " v_mul_f32 %[c15], %[scale_1], %[c15] \n" _UK_PK_CVT_(
379 "%[c0]", "%[c1]", "%[c0]") _UK_PK_CVT_("%[c2]", "%[c3]", "%[c1]")
380 _UK_PK_CVT_("%[c4]", "%[c5]", "%[c2]") _UK_PK_CVT_("%[c6]", "%[c7]", "%[c3]") _UK_PK_CVT_(
381 "%[c8]", "%[c9]", "%[c4]") _UK_PK_CVT_("%[c10]", "%[c11]", "%[c5]")
382 _UK_PK_CVT_("%[c12]", "%[c13]", "%[c6]") _UK_PK_CVT_(
383 "%[c14]",
384 "%[c15]",
385 "%[c7]") " ;------------------------------ \n"
386 " ds_write_b64 %[v_sfl_sst], [%[c0],%[c1]] offset:0 + %[shfl_base] "
387 " \n"
388 " ds_write_b64 %[v_sfl_sst], [%[c2],%[c3]] offset:4352 + %[shfl_base] "
389 " \n"
390 " ds_write_b64 %[v_sfl_sst], [%[c4],%[c5]] offset:2176 + %[shfl_base] "
391 " \n"
392 " ds_write_b64 %[v_sfl_sst], [%[c6],%[c7]] offset:6528 + %[shfl_base] "
393 " \n"
394 " s_waitcnt lgkmcnt(0) \n"
395 " s_barrier \n"
396 " ds_read_b32 %[c0], %[v_sfl_sld] offset:0 + %[shfl_base] "
397 " \n"
398 " ds_read_b32 %[c1], %[v_sfl_sld] offset:32 + %[shfl_base] "
399 " \n"
400 " ds_read_b32 %[c2], %[v_sfl_sld] offset:64 + %[shfl_base] "
401 " \n"
402 " ds_read_b32 %[c3], %[v_sfl_sld] offset:96 + %[shfl_base] "
403 " \n"
404 " ds_read_b32 %[c4], %[v_sfl_sld] offset:4352 + %[shfl_base] "
405 " \n"
406 " ds_read_b32 %[c5], %[v_sfl_sld] offset:4384 + %[shfl_base] "
407 " \n"
408 " ds_read_b32 %[c6], %[v_sfl_sld] offset:4416 + %[shfl_base] "
409 " \n"
410 " ds_read_b32 %[c7], %[v_sfl_sld] offset:4448 + %[shfl_base] "
411 " \n"
412 " s_waitcnt lgkmcnt(0) \n"
413 " s_mov_b64 exec, %[s_execflag_0] "
414 "\n" _UK_ATOMIC_ADD_ " %[v_os_o0], %[c0], s[8:9] \n"
415 " s_mov_b64 exec, %[s_execflag_1] "
416 "\n" _UK_ATOMIC_ADD_ " %[v_os_o1], %[c1], s[8:9] \n"
417 " s_mov_b64 exec, %[s_execflag_2] "
418 "\n" _UK_ATOMIC_ADD_ " %[v_os_o2], %[c2], s[8:9] \n"
419 " s_mov_b64 exec, %[s_execflag_3] "
420 "\n" _UK_ATOMIC_ADD_ " %[v_os_o3], %[c3], s[8:9] \n"
421 " s_mov_b64 exec, %[s_execflag_4] "
422 "\n" _UK_ATOMIC_ADD_ " %[v_os_o4], %[c4], s[8:9] \n"
423 " s_mov_b64 exec, %[s_execflag_5] "
424 "\n" _UK_ATOMIC_ADD_ " %[v_os_o5], %[c5], s[8:9] \n"
425 " s_mov_b64 exec, %[s_execflag_6] "
426 "\n" _UK_ATOMIC_ADD_ " %[v_os_o6], %[c6], s[8:9] \n"
427 " s_mov_b64 exec, %[s_execflag_7] "
428 "\n" _UK_ATOMIC_ADD_ " %[v_os_o7], %[c7], s[8:9] \n"
429 " s_mov_b64 exec, s[38:39] \n"
430 " s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k-- \n"
431 " s_cmp_gt_i32 %[s_loop_cnt] 0 \n"
432 " s_cbranch_scc0 L_end%= \n"
433 " s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
434 " s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
435 " s_add_u32 s12, s86, s12 \n"
436 " s_addc_u32 s13, 0, s13 \n"
437 " s_add_u32 s8, %[s_tile_os_o], s8 \n"
438 " s_addc_u32 s9, 0, s9 \n"
439 " s_waitcnt vmcnt(32) \n"
440 " s_barrier \n" _UK_MFMA_
441 " [%[c16],%[c17],%[c18],%[c19]], acc[128:129], v[128:129], 0 \n"
442 " buffer_load_dwordx4 acc[0:3], %[v_os_b0], s[12:15], 0 offen "
443 "\n" _UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[130:131], "
444 "v[130:131], [%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
445 " [%[c16],%[c17],%[c18],%[c19]], acc[132:133], v[132:133], "
446 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
447 " [%[c16],%[c17],%[c18],%[c19]], acc[134:135], v[134:135], "
448 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
449 " [%[c16],%[c17],%[c18],%[c19]], acc[136:137], v[136:137], "
450 "[%[c16],%[c17],%[c18],%[c19]] \n"
451 " buffer_load_dwordx4 acc[4:7], %[v_os_b0], s[12:15], 0 offen "
452 "offset:1024 \n" _UK_MFMA_
453 " [%[c16],%[c17],%[c18],%[c19]], acc[138:139], v[138:139], "
454 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
455 " [%[c16],%[c17],%[c18],%[c19]], acc[140:141], v[140:141], "
456 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
457 " [%[c16],%[c17],%[c18],%[c19]], acc[142:143], v[142:143], "
458 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
459 " [%[c20],%[c21],%[c22],%[c23]], acc[128:129], v[192:193], 0 \n"
460 " buffer_load_dwordx4 acc[8:11], %[v_os_b0], s[12:15], 0 offen "
461 "offset:2048 \n" _UK_MFMA_
462 " [%[c20],%[c21],%[c22],%[c23]], acc[130:131], v[194:195], "
463 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
464 " [%[c20],%[c21],%[c22],%[c23]], acc[132:133], v[196:197], "
465 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
466 " [%[c20],%[c21],%[c22],%[c23]], acc[134:135], v[198:199], "
467 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
468 " [%[c20],%[c21],%[c22],%[c23]], acc[136:137], v[200:201], "
469 "[%[c20],%[c21],%[c22],%[c23]] \n"
470 " buffer_load_dwordx4 acc[12:15], %[v_os_b0], s[12:15], 0 offen "
471 "offset:3072 \n" _UK_MFMA_
472 " [%[c20],%[c21],%[c22],%[c23]], acc[138:139], v[202:203], "
473 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
474 " [%[c20],%[c21],%[c22],%[c23]], acc[140:141], v[204:205], "
475 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
476 " [%[c20],%[c21],%[c22],%[c23]], acc[142:143], v[206:207], "
477 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
478 " [%[c24],%[c25],%[c26],%[c27]], acc[144:145], v[128:129], 0 \n"
479 " buffer_load_dwordx4 acc[16:19], %[v_os_b1], s[12:15], 0 offen "
480 "\n" _UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[146:147], "
481 "v[130:131], [%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
482 " [%[c24],%[c25],%[c26],%[c27]], acc[148:149], v[132:133], "
483 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
484 " [%[c24],%[c25],%[c26],%[c27]], acc[150:151], v[134:135], "
485 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
486 " [%[c24],%[c25],%[c26],%[c27]], acc[152:153], v[136:137], "
487 "[%[c24],%[c25],%[c26],%[c27]] \n"
488 " buffer_load_dwordx4 acc[20:23], %[v_os_b1], s[12:15], 0 offen "
489 "offset:1024 \n" _UK_MFMA_
490 " [%[c24],%[c25],%[c26],%[c27]], acc[154:155], v[138:139], "
491 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
492 " [%[c24],%[c25],%[c26],%[c27]], acc[156:157], v[140:141], "
493 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
494 " [%[c24],%[c25],%[c26],%[c27]], acc[158:159], v[142:143], "
495 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
496 " [%[c28],%[c29],%[c30],%[c31]], acc[144:145], v[192:193], 0 \n"
497 " buffer_load_dwordx4 acc[24:27], %[v_os_b1], s[12:15], 0 offen "
498 "offset:2048 \n" _UK_MFMA_
499 " [%[c28],%[c29],%[c30],%[c31]], acc[146:147], v[194:195], "
500 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
501 " [%[c28],%[c29],%[c30],%[c31]], acc[148:149], v[196:197], "
502 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
503 " [%[c28],%[c29],%[c30],%[c31]], acc[150:151], v[198:199], "
504 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
505 " [%[c28],%[c29],%[c30],%[c31]], acc[152:153], v[200:201], "
506 "[%[c28],%[c29],%[c30],%[c31]] \n"
507 " buffer_load_dwordx4 acc[28:31], %[v_os_b1], s[12:15], 0 offen "
508 "offset:3072 \n" _UK_MFMA_
509 " [%[c28],%[c29],%[c30],%[c31]], acc[154:155], v[202:203], "
510 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
511 " [%[c28],%[c29],%[c30],%[c31]], acc[156:157], v[204:205], "
512 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
513 " [%[c28],%[c29],%[c30],%[c31]], acc[158:159], v[206:207], "
514 "[%[c28],%[c29],%[c30],%[c31]] \n"
515 " s_waitcnt vmcnt(32) \n" _UK_MFMA_
516 " [%[c16],%[c17],%[c18],%[c19]], acc[160:161], v[144:145], "
517 "[%[c16],%[c17],%[c18],%[c19]] \n"
518 " buffer_load_dwordx4 acc[32:35], %[v_os_b2], s[12:15], 0 offen "
519 "\n" _UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[162:163], "
520 "v[146:147], [%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
521 " [%[c16],%[c17],%[c18],%[c19]], acc[164:165], v[148:149], "
522 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
523 " [%[c16],%[c17],%[c18],%[c19]], acc[166:167], v[150:151], "
524 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
525 " [%[c16],%[c17],%[c18],%[c19]], acc[168:169], v[152:153], "
526 "[%[c16],%[c17],%[c18],%[c19]] \n"
527 " buffer_load_dwordx4 acc[36:39], %[v_os_b2], s[12:15], 0 offen "
528 "offset:1024 \n" _UK_MFMA_
529 " [%[c16],%[c17],%[c18],%[c19]], acc[170:171], v[154:155], "
530 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
531 " [%[c16],%[c17],%[c18],%[c19]], acc[172:173], v[156:157], "
532 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
533 " [%[c16],%[c17],%[c18],%[c19]], acc[174:175], v[158:159], "
534 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
535 " [%[c20],%[c21],%[c22],%[c23]], acc[160:161], v[208:209], "
536 "[%[c20],%[c21],%[c22],%[c23]] \n"
537 " buffer_load_dwordx4 acc[40:43], %[v_os_b2], s[12:15], 0 offen "
538 "offset:2048 \n" _UK_MFMA_
539 " [%[c20],%[c21],%[c22],%[c23]], acc[162:163], v[210:211], "
540 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
541 " [%[c20],%[c21],%[c22],%[c23]], acc[164:165], v[212:213], "
542 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
543 " [%[c20],%[c21],%[c22],%[c23]], acc[166:167], v[214:215], "
544 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
545 " [%[c20],%[c21],%[c22],%[c23]], acc[168:169], v[216:217], "
546 "[%[c20],%[c21],%[c22],%[c23]] \n"
547 " buffer_load_dwordx4 acc[44:47], %[v_os_b2], s[12:15], 0 offen "
548 "offset:3072 \n" _UK_MFMA_
549 " [%[c20],%[c21],%[c22],%[c23]], acc[170:171], v[218:219], "
550 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
551 " [%[c20],%[c21],%[c22],%[c23]], acc[172:173], v[220:221], "
552 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
553 " [%[c20],%[c21],%[c22],%[c23]], acc[174:175], v[222:223], "
554 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
555 " [%[c24],%[c25],%[c26],%[c27]], acc[176:177], v[144:145], "
556 "[%[c24],%[c25],%[c26],%[c27]] \n"
557 " buffer_load_dwordx4 acc[48:51], %[v_os_b3], s[12:15], 0 offen "
558 "\n" _UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[178:179], "
559 "v[146:147], [%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
560 " [%[c24],%[c25],%[c26],%[c27]], acc[180:181], v[148:149], "
561 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
562 " [%[c24],%[c25],%[c26],%[c27]], acc[182:183], v[150:151], "
563 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
564 " [%[c24],%[c25],%[c26],%[c27]], acc[184:185], v[152:153], "
565 "[%[c24],%[c25],%[c26],%[c27]] \n"
566 " buffer_load_dwordx4 acc[52:55], %[v_os_b3], s[12:15], 0 offen "
567 "offset:1024 \n" _UK_MFMA_
568 " [%[c24],%[c25],%[c26],%[c27]], acc[186:187], v[154:155], "
569 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
570 " [%[c24],%[c25],%[c26],%[c27]], acc[188:189], v[156:157], "
571 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
572 " [%[c24],%[c25],%[c26],%[c27]], acc[190:191], v[158:159], "
573 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
574 " [%[c28],%[c29],%[c30],%[c31]], acc[176:177], v[208:209], "
575 "[%[c28],%[c29],%[c30],%[c31]] \n"
576 " buffer_load_dwordx4 acc[56:59], %[v_os_b3], s[12:15], 0 offen "
577 "offset:2048 \n" _UK_MFMA_
578 " [%[c28],%[c29],%[c30],%[c31]], acc[178:179], v[210:211], "
579 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
580 " [%[c28],%[c29],%[c30],%[c31]], acc[180:181], v[212:213], "
581 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
582 " [%[c28],%[c29],%[c30],%[c31]], acc[182:183], v[214:215], "
583 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
584 " [%[c28],%[c29],%[c30],%[c31]], acc[184:185], v[216:217], "
585 "[%[c28],%[c29],%[c30],%[c31]] \n"
586 " buffer_load_dwordx4 acc[60:63], %[v_os_b3], s[12:15], 0 offen "
587 "offset:3072 \n" _UK_MFMA_
588 " [%[c28],%[c29],%[c30],%[c31]], acc[186:187], v[218:219], "
589 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
590 " [%[c28],%[c29],%[c30],%[c31]], acc[188:189], v[220:221], "
591 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
592 " [%[c28],%[c29],%[c30],%[c31]], acc[190:191], v[222:223], "
593 "[%[c28],%[c29],%[c30],%[c31]] \n"
594 " s_waitcnt vmcnt(32) \n" _UK_MFMA_
595 " [%[c16],%[c17],%[c18],%[c19]], acc[192:193], v[160:161], "
596 "[%[c16],%[c17],%[c18],%[c19]] \n"
597 " buffer_load_dwordx4 acc[64:67], %[v_os_b4], s[12:15], 0 offen "
598 "\n" _UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[194:195], "
599 "v[162:163], [%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
600 " [%[c16],%[c17],%[c18],%[c19]], acc[196:197], v[164:165], "
601 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
602 " [%[c16],%[c17],%[c18],%[c19]], acc[198:199], v[166:167], "
603 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
604 " [%[c16],%[c17],%[c18],%[c19]], acc[200:201], v[168:169], "
605 "[%[c16],%[c17],%[c18],%[c19]] \n"
606 " buffer_load_dwordx4 acc[68:71], %[v_os_b4], s[12:15], 0 offen "
607 "offset:1024 \n" _UK_MFMA_
608 " [%[c16],%[c17],%[c18],%[c19]], acc[202:203], v[170:171], "
609 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
610 " [%[c16],%[c17],%[c18],%[c19]], acc[204:205], v[172:173], "
611 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
612 " [%[c16],%[c17],%[c18],%[c19]], acc[206:207], v[174:175], "
613 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
614 " [%[c20],%[c21],%[c22],%[c23]], acc[192:193], v[224:225], "
615 "[%[c20],%[c21],%[c22],%[c23]] \n"
616 " buffer_load_dwordx4 acc[72:75], %[v_os_b4], s[12:15], 0 offen "
617 "offset:2048 \n" _UK_MFMA_
618 " [%[c20],%[c21],%[c22],%[c23]], acc[194:195], v[226:227], "
619 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
620 " [%[c20],%[c21],%[c22],%[c23]], acc[196:197], v[228:229], "
621 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
622 " [%[c20],%[c21],%[c22],%[c23]], acc[198:199], v[230:231], "
623 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
624 " [%[c20],%[c21],%[c22],%[c23]], acc[200:201], v[232:233], "
625 "[%[c20],%[c21],%[c22],%[c23]] \n"
626 " buffer_load_dwordx4 acc[76:79], %[v_os_b4], s[12:15], 0 offen "
627 "offset:3072 \n" _UK_MFMA_
628 " [%[c20],%[c21],%[c22],%[c23]], acc[202:203], v[234:235], "
629 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
630 " [%[c20],%[c21],%[c22],%[c23]], acc[204:205], v[236:237], "
631 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
632 " [%[c20],%[c21],%[c22],%[c23]], acc[206:207], v[238:239], "
633 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
634 " [%[c24],%[c25],%[c26],%[c27]], acc[208:209], v[160:161], "
635 "[%[c24],%[c25],%[c26],%[c27]] \n"
636 " buffer_load_dwordx4 acc[80:83], %[v_os_b5], s[12:15], 0 offen "
637 "\n" _UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[210:211], "
638 "v[162:163], [%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
639 " [%[c24],%[c25],%[c26],%[c27]], acc[212:213], v[164:165], "
640 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
641 " [%[c24],%[c25],%[c26],%[c27]], acc[214:215], v[166:167], "
642 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
643 " [%[c24],%[c25],%[c26],%[c27]], acc[216:217], v[168:169], "
644 "[%[c24],%[c25],%[c26],%[c27]] \n"
645 " buffer_load_dwordx4 acc[84:87], %[v_os_b5], s[12:15], 0 offen "
646 "offset:1024 \n" _UK_MFMA_
647 " [%[c24],%[c25],%[c26],%[c27]], acc[218:219], v[170:171], "
648 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
649 " [%[c24],%[c25],%[c26],%[c27]], acc[220:221], v[172:173], "
650 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
651 " [%[c24],%[c25],%[c26],%[c27]], acc[222:223], v[174:175], "
652 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
653 " [%[c28],%[c29],%[c30],%[c31]], acc[208:209], v[224:225], "
654 "[%[c28],%[c29],%[c30],%[c31]] \n"
655 " buffer_load_dwordx4 acc[88:91], %[v_os_b5], s[12:15], 0 offen "
656 "offset:2048 \n" _UK_MFMA_
657 " [%[c28],%[c29],%[c30],%[c31]], acc[210:211], v[226:227], "
658 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
659 " [%[c28],%[c29],%[c30],%[c31]], acc[212:213], v[228:229], "
660 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
661 " [%[c28],%[c29],%[c30],%[c31]], acc[214:215], v[230:231], "
662 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
663 " [%[c28],%[c29],%[c30],%[c31]], acc[216:217], v[232:233], "
664 "[%[c28],%[c29],%[c30],%[c31]] \n"
665 " buffer_load_dwordx4 acc[92:95], %[v_os_b5], s[12:15], 0 offen "
666 "offset:3072 \n" _UK_MFMA_
667 " [%[c28],%[c29],%[c30],%[c31]], acc[218:219], v[234:235], "
668 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
669 " [%[c28],%[c29],%[c30],%[c31]], acc[220:221], v[236:237], "
670 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
671 " [%[c28],%[c29],%[c30],%[c31]], acc[222:223], v[238:239], "
672 "[%[c28],%[c29],%[c30],%[c31]] \n"
673 " s_waitcnt vmcnt(32) \n" _UK_MFMA_
674 " [%[c16],%[c17],%[c18],%[c19]], acc[224:225], v[176:177], "
675 "[%[c16],%[c17],%[c18],%[c19]] \n"
676 " buffer_load_dwordx4 acc[96:99], %[v_os_b6], s[12:15], 0 offen "
677 "\n" _UK_MFMA_ " [%[c16],%[c17],%[c18],%[c19]], acc[226:227], "
678 "v[178:179], [%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
679 " [%[c16],%[c17],%[c18],%[c19]], acc[228:229], v[180:181], "
680 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
681 " [%[c16],%[c17],%[c18],%[c19]], acc[230:231], v[182:183], "
682 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
683 " [%[c16],%[c17],%[c18],%[c19]], acc[232:233], v[184:185], "
684 "[%[c16],%[c17],%[c18],%[c19]] \n"
685 " buffer_load_dwordx4 acc[100:103], %[v_os_b6], s[12:15], 0 offen "
686 "offset:1024 \n" _UK_MFMA_
687 " [%[c16],%[c17],%[c18],%[c19]], acc[234:235], v[186:187], "
688 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
689 " [%[c16],%[c17],%[c18],%[c19]], acc[236:237], v[188:189], "
690 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
691 " [%[c16],%[c17],%[c18],%[c19]], acc[238:239], v[190:191], "
692 "[%[c16],%[c17],%[c18],%[c19]] \n" _UK_MFMA_
693 " [%[c20],%[c21],%[c22],%[c23]], acc[224:225], v[240:241], "
694 "[%[c20],%[c21],%[c22],%[c23]] \n"
695 " buffer_load_dwordx4 acc[104:107], %[v_os_b6], s[12:15], 0 offen "
696 "offset:2048 \n" _UK_MFMA_
697 " [%[c20],%[c21],%[c22],%[c23]], acc[226:227], v[242:243], "
698 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
699 " [%[c20],%[c21],%[c22],%[c23]], acc[228:229], v[244:245], "
700 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
701 " [%[c20],%[c21],%[c22],%[c23]], acc[230:231], v[246:247], "
702 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
703 " [%[c20],%[c21],%[c22],%[c23]], acc[232:233], v[248:249], "
704 "[%[c20],%[c21],%[c22],%[c23]] \n"
705 " buffer_load_dwordx4 acc[108:111], %[v_os_b6], s[12:15], 0 offen "
706 "offset:3072 \n" _UK_MFMA_
707 " [%[c20],%[c21],%[c22],%[c23]], acc[234:235], v[250:251], "
708 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
709 " [%[c20],%[c21],%[c22],%[c23]], acc[236:237], v[252:253], "
710 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
711 " [%[c20],%[c21],%[c22],%[c23]], acc[238:239], v[254:255], "
712 "[%[c20],%[c21],%[c22],%[c23]] \n" _UK_MFMA_
713 " [%[c24],%[c25],%[c26],%[c27]], acc[240:241], v[176:177], "
714 "[%[c24],%[c25],%[c26],%[c27]] \n"
715 " buffer_load_dwordx4 acc[112:115], %[v_os_b7], s[12:15], 0 offen "
716 "\n" _UK_MFMA_ " [%[c24],%[c25],%[c26],%[c27]], acc[242:243], "
717 "v[178:179], [%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
718 " [%[c24],%[c25],%[c26],%[c27]], acc[244:245], v[180:181], "
719 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
720 " [%[c24],%[c25],%[c26],%[c27]], acc[246:247], v[182:183], "
721 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
722 " [%[c24],%[c25],%[c26],%[c27]], acc[248:249], v[184:185], "
723 "[%[c24],%[c25],%[c26],%[c27]] \n"
724 " buffer_load_dwordx4 acc[116:119], %[v_os_b7], s[12:15], 0 offen "
725 "offset:1024 \n" _UK_MFMA_
726 " [%[c24],%[c25],%[c26],%[c27]], acc[250:251], v[186:187], "
727 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
728 " [%[c24],%[c25],%[c26],%[c27]], acc[252:253], v[188:189], "
729 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
730 " [%[c24],%[c25],%[c26],%[c27]], acc[254:255], v[190:191], "
731 "[%[c24],%[c25],%[c26],%[c27]] \n" _UK_MFMA_
732 " [%[c28],%[c29],%[c30],%[c31]], acc[240:241], v[240:241], "
733 "[%[c28],%[c29],%[c30],%[c31]] \n"
734 " buffer_load_dwordx4 acc[120:123], %[v_os_b7], s[12:15], 0 offen "
735 "offset:2048 \n" _UK_MFMA_
736 " [%[c28],%[c29],%[c30],%[c31]], acc[242:243], v[242:243], "
737 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
738 " [%[c28],%[c29],%[c30],%[c31]], acc[244:245], v[244:245], "
739 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
740 " [%[c28],%[c29],%[c30],%[c31]], acc[246:247], v[246:247], "
741 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
742 " [%[c28],%[c29],%[c30],%[c31]], acc[248:249], v[248:249], "
743 "[%[c28],%[c29],%[c30],%[c31]] \n"
744 " buffer_load_dwordx4 acc[124:127], %[v_os_b7], s[12:15], 0 offen "
745 "offset:3072 \n" _UK_MFMA_
746 " [%[c28],%[c29],%[c30],%[c31]], acc[250:251], v[250:251], "
747 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
748 " [%[c28],%[c29],%[c30],%[c31]], acc[252:253], v[252:253], "
749 "[%[c28],%[c29],%[c30],%[c31]] \n" _UK_MFMA_
750 " [%[c28],%[c29],%[c30],%[c31]], acc[254:255], v[254:255], "
751 "[%[c28],%[c29],%[c30],%[c31]]\n"
752 " v_mul_f32 %[c16], %[scale_0], %[c16] \n"
753 " v_mul_f32 %[c17], %[scale_0], %[c17] \n"
754 " v_mul_f32 %[c18], %[scale_0], %[c18] \n"
755 " v_mul_f32 %[c19], %[scale_0], %[c19] \n"
756 " v_mul_f32 %[c20], %[scale_1], %[c20] \n"
757 " v_mul_f32 %[c21], %[scale_1], %[c21] \n"
758 " v_mul_f32 %[c22], %[scale_1], %[c22] \n"
759 " v_mul_f32 %[c23], %[scale_1], %[c23] \n"
760 " v_mul_f32 %[c24], %[scale_0], %[c24] \n"
761 " v_mul_f32 %[c25], %[scale_0], %[c25] \n"
762 " v_mul_f32 %[c26], %[scale_0], %[c26] \n"
763 " v_mul_f32 %[c27], %[scale_0], %[c27] \n"
764 " v_mul_f32 %[c28], %[scale_1], %[c28] \n"
765 " v_mul_f32 %[c29], %[scale_1], %[c29] \n"
766 " v_mul_f32 %[c30], %[scale_1], %[c30] \n"
767 " v_mul_f32 %[c31], %[scale_1], %[c31] \n"
768
769 _UK_PK_CVT_("%[c16]", "%[c17]", "%[c16]") _UK_PK_CVT_("%[c18]", "%[c19]", "%[c17]") _UK_PK_CVT_(
770 "%[c20]", "%[c21]", "%[c18]") _UK_PK_CVT_("%[c22]", "%[c23]", "%[c19]")
771 _UK_PK_CVT_("%[c24]", "%[c25]", "%[c20]") _UK_PK_CVT_(
772 "%[c26]", "%[c27]", "%[c21]") _UK_PK_CVT_("%[c28]",
773 "%[c29]",
774 "%[c22]") _UK_PK_CVT_("%[c30]",
775 "%[c31]",
776 "%[c23]")
777
778 " ;------------------------------ \n"
779 " ds_write_b64 %[v_sfl_sst], [%[c16],%[c17]] offset:0 + %[shfl_base] \n"
780 " ds_write_b64 %[v_sfl_sst], [%[c18],%[c19]] offset:4352 + %[shfl_base] \n"
781 " ds_write_b64 %[v_sfl_sst], [%[c20],%[c21]] offset:2176 + %[shfl_base] \n"
782 " ds_write_b64 %[v_sfl_sst], [%[c22],%[c23]] offset:6528 + %[shfl_base] \n"
783 " s_waitcnt lgkmcnt(0) \n"
784 " s_barrier \n"
785 " ds_read_b32 %[c16], %[v_sfl_sld] offset:0 + %[shfl_base] \n"
786 " ds_read_b32 %[c17], %[v_sfl_sld] offset:32 + %[shfl_base] \n"
787 " ds_read_b32 %[c18], %[v_sfl_sld] offset:64 + %[shfl_base] \n"
788 " ds_read_b32 %[c19], %[v_sfl_sld] offset:96 + %[shfl_base] \n"
789 " ds_read_b32 %[c20], %[v_sfl_sld] offset:4352 + %[shfl_base] \n"
790 " ds_read_b32 %[c21], %[v_sfl_sld] offset:4384 + %[shfl_base] \n"
791 " ds_read_b32 %[c22], %[v_sfl_sld] offset:4416 + %[shfl_base] \n"
792 " ds_read_b32 %[c23], %[v_sfl_sld] offset:4448 + %[shfl_base] \n"
793 " s_waitcnt lgkmcnt(0) \n"
794 " s_mov_b64 exec, %[s_execflag_0] \n" _UK_ATOMIC_ADD_
795 " %[v_os_o0], %[c16], s[8:9] \n"
796 " s_mov_b64 exec, %[s_execflag_1] \n" _UK_ATOMIC_ADD_
797 " %[v_os_o1], %[c17], s[8:9] \n"
798 " s_mov_b64 exec, %[s_execflag_2] \n" _UK_ATOMIC_ADD_
799 " %[v_os_o2], %[c18], s[8:9] \n"
800 " s_mov_b64 exec, %[s_execflag_3] \n" _UK_ATOMIC_ADD_
801 " %[v_os_o3], %[c19], s[8:9] \n"
802 " s_mov_b64 exec, %[s_execflag_4] \n" _UK_ATOMIC_ADD_
803 " %[v_os_o4], %[c20], s[8:9] \n"
804 " s_mov_b64 exec, %[s_execflag_5] \n" _UK_ATOMIC_ADD_
805 " %[v_os_o5], %[c21], s[8:9] \n"
806 " s_mov_b64 exec, %[s_execflag_6] \n" _UK_ATOMIC_ADD_
807 " %[v_os_o6], %[c22], s[8:9] \n"
808 " s_mov_b64 exec, %[s_execflag_7] \n" _UK_ATOMIC_ADD_
809 " %[v_os_o7], %[c23], s[8:9] \n"
810 " s_mov_b64 exec, s[38:39] \n"
811 " s_sub_i32 %[s_loop_cnt], %[s_loop_cnt], 1 ; k-- \n"
812 " s_cmp_gt_i32 %[s_loop_cnt] 0 \n"
813 " s_cbranch_scc0 L_end%= \n"
814 " s_cmp_gt_i32 %[s_loop_cnt] 1 ; move b with cond \n"
815 " s_cselect_b32 s86, %[s_tile_os_b], 0 \n"
816 " s_add_u32 s12, s86, s12 \n"
817 " s_addc_u32 s13, 0, s13 \n"
818 " s_add_u32 s8, %[s_tile_os_o], s8 \n"
819 " s_addc_u32 s9, 0, s9 \n"
820 " s_branch L_start%= \n"
821 "L_end%=: \n"
822
// Tear down every helper macro this include introduced (and the variant
// selector it consumed) so the file can be included again with a different
// CK_TILE_FLATMM_UK_MFMA value and nothing leaks into the including code.
#undef _UK_MFMA_
#undef _UK_PK_CVT_
#undef _UK_ATOMIC_ADD_
#undef CK_TILE_FLATMM_UK_MFMA
// The macros must remain undefined past this point: re-defining them here
// (even with empty bodies) would clash with the #define lines at the top of
// this file on the next inclusion.
// clang-format on