...
 * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex
 */

-/* HW (GFX9) source code for CWSR trap handler */
-/* Version 18 + multiple trap handler */
-
-// this performance-optimal version was originally from Seven Xu at SRDC
-
-// Revison #18 --...
-/* Rev History
-** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV)
-** #4. SR Memory Layout:
-** 1. VGPR-SGPR-HWREG-{LDS}
-** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern..
-** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
-** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
-** #7. Update: 1. don't barrier if noLDS
-** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
-** 2. Fix SQ issue by s_sleep 2
-** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
-** 2. optimize s_buffer save by burst 16sgprs...
-** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs.
-** #11. Update 1. Add 2 more timestamp for debug version
-** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance
-** #13. Integ 1. Always use MUBUF for PV trap shader...
-** #14. Update 1. s_buffer_store soft clause...
-** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
-** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree
-** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part]
-** 2. PERF - Save LDS before save VGPR to cover LDS save long latency...
-** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32
-** 2. FUNC - Handle non-CWSR traps
-*/
-
-var G8SR_WDMEM_HWREG_OFFSET = 0
-var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes
-
-// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore.
-
-var G8SR_DEBUG_TIMESTAMP = 0
-var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
-var s_g8sr_ts_save_s = s[34:35] // save start
-var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi
-var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ
-var s_g8sr_ts_save_d = s[40:41] // save end
-var s_g8sr_ts_restore_s = s[42:43] // restore start
-var s_g8sr_ts_restore_d = s[44:45] // restore end
-
-var G8SR_VGPR_SR_IN_DWX4 = 0
-var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes
-var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
-
-
-/*************************************************************************/
-/* control on how to run the shader */
-/*************************************************************************/
-//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
-var EMU_RUN_HACK = 0
-var EMU_RUN_HACK_RESTORE_NORMAL = 0
-var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
-var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
-var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
-var SAVE_LDS = 1
-var WG_BASE_ADDR_LO = 0x9000a000
-var WG_BASE_ADDR_HI = 0x0
-var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
-var CTX_SAVE_CONTROL = 0x0
-var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
-var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
-var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
-var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
-var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
 var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency
+var SAVE_AFTER_XNACK_ERROR = 1 //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger
+var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised

 /**************************************************************************/
 /* variables */
...
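The two flags added above gate fallback paths that appear later in this patch. check_if_tcp_store_ok (defined near the end of the patch) evaluates the condition named in SAVE_AFTER_XNACK_ERROR's comment; a minimal C sketch of that predicate, using the mask values declared in this file (the helper itself is illustrative, not part of the handler):

    #include <stdbool.h>
    #include <stdint.h>

    #define SQ_WAVE_STATUS_ALLOW_REPLAY_MASK  0x400000u
    #define SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK  0x10000000u

    /* TCP (vector L1) stores fail only when replay is disabled and an XNACK
     * error is already pending; the handler then saves through SQC instead. */
    static bool tcp_store_ok(uint32_t status, uint32_t trapsts)
    {
        return (status & SQ_WAVE_STATUS_ALLOW_REPLAY_MASK) != 0 ||
               (trapsts & SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK) == 0;
    }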
---|
 var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1
 var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3
 var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29
+var SQ_WAVE_STATUS_ALLOW_REPLAY_MASK = 0x400000

 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
...
 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
 var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
+var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK = 0x10000000

 var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
 var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000
 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
+
+var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800

 var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
 var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
...
 var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
 var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26

-var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
-var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
+var S_SAVE_PC_HI_RCNT_SHIFT = 27 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
+var S_SAVE_PC_HI_RCNT_MASK = 0xF8000000 //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 26 //FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x04000000 //FIXME

 var s_save_spi_init_lo = exec_lo
 var s_save_spi_init_hi = exec_hi
...
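The rewritten constants above widen the RCNT stash from four bits (mask 0xF0000000) to five (0xF8000000), matching the 5-bit RCNT in IB_STS (SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000 covers FIRST_REPLAY at bit 15 plus RCNT at bits 20:16), and move FIRST_REPLAY down to bit 26. A hedged C sketch of how the fields repack into the spare high bits of the saved PC_HI (PC[47:32] occupies bits 15:0); the helper is illustrative only:

    #include <stdint.h>

    #define SQ_WAVE_IB_STS_RCNT_SHIFT          16  /* bits 20:16, 5 bits */
    #define SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT  15  /* bit 15 */
    #define S_SAVE_PC_HI_RCNT_SHIFT            27  /* bits 31:27 */
    #define S_SAVE_PC_HI_FIRST_REPLAY_SHIFT    26  /* bit 26 */

    /* PC[47:32] sits in PC_HI[15:0], leaving the top bits free to stash
     * the replay state across the save. */
    static uint32_t pack_pc_hi(uint32_t pc_hi, uint32_t ib_sts)
    {
        uint32_t rcnt   = (ib_sts >> SQ_WAVE_IB_STS_RCNT_SHIFT) & 0x1f;
        uint32_t replay = (ib_sts >> SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) & 0x1;

        return pc_hi | (rcnt << S_SAVE_PC_HI_RCNT_SHIFT)
                     | (replay << S_SAVE_PC_HI_FIRST_REPLAY_SHIFT);
    }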
---|
 var s_save_pc_hi = ttmp1
 var s_save_exec_lo = ttmp2
 var s_save_exec_hi = ttmp3
-var s_save_tmp = ttmp4
-var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
+var s_save_tmp = ttmp14
+var s_save_trapsts = ttmp15 //not really used until the end of the SAVE routine
 var s_save_xnack_mask_lo = ttmp6
 var s_save_xnack_mask_hi = ttmp7
 var s_save_buf_rsrc0 = ttmp8
...
 var s_save_buf_rsrc2 = ttmp10
 var s_save_buf_rsrc3 = ttmp11
 var s_save_status = ttmp12
-var s_save_mem_offset = ttmp14
+var s_save_mem_offset = ttmp4
 var s_save_alloc_size = s_save_trapsts //conflict
-var s_save_m0 = ttmp15
+var s_save_m0 = ttmp5
 var s_save_ttmps_lo = s_save_tmp //no conflict
 var s_save_ttmps_hi = s_save_trapsts //no conflict

...
 var s_restore_spi_init_hi = exec_hi

 var s_restore_mem_offset = ttmp12
+var s_restore_accvgpr_offset = ttmp13
 var s_restore_alloc_size = ttmp3
 var s_restore_tmp = ttmp2
 var s_restore_mem_offset_save = s_restore_tmp //no conflict
+var s_restore_accvgpr_offset_save = ttmp7

 var s_restore_m0 = s_restore_alloc_size //no conflict

-var s_restore_mode = ttmp7
+var s_restore_mode = s_restore_accvgpr_offset_save

 var s_restore_pc_lo = ttmp0
 var s_restore_pc_hi = ttmp1
-var s_restore_exec_lo = ttmp14
-var s_restore_exec_hi = ttmp15
-var s_restore_status = ttmp4
-var s_restore_trapsts = ttmp5
+var s_restore_exec_lo = ttmp4
+var s_restore_exec_hi = ttmp5
+var s_restore_status = ttmp14
+var s_restore_trapsts = ttmp15
 var s_restore_xnack_mask_lo = xnack_mask_lo
 var s_restore_xnack_mask_hi = xnack_mask_hi
 var s_restore_buf_rsrc0 = ttmp8
...
 /* Shader Main*/

 shader main
- asic(GFX9)
+ asic(DEFAULT)
 type(CS)


- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
- //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
- s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
- s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
- s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
- //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
- s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
- else
 s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
- end

 L_JUMP_TO_RESTORE:
 s_branch L_RESTORE //restore
...

 s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
 s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save
+
+if SINGLE_STEP_MISSED_WORKAROUND
+ // No single step exceptions if MODE.DEBUG_EN=0.
+ s_getreg_b32 ttmp2, hwreg(HW_REG_MODE)
+ s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK
+ s_cbranch_scc0 L_NO_SINGLE_STEP_WORKAROUND
+
+ // Second-level trap already handled exception if STATUS.HALT=1.
+ s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
+
+ // Prioritize single step exception over context save.
+ // Second-level trap will halt wave and RFE, re-entering for SAVECTX.
+ s_cbranch_scc0 L_FETCH_2ND_TRAP
+
+L_NO_SINGLE_STEP_WORKAROUND:
+end
+
 s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
 s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
 s_cbranch_scc1 L_SAVE //this is the operation for save

 // ********* Handle non-CWSR traps *******************
-if (!EMU_RUN_HACK)
+
 // Illegal instruction is a non-maskable exception which blocks context save.
 // Halt the wavefront and return from the trap.
 s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
...
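The SINGLE_STEP_MISSED_WORKAROUND block above boils down to a two-condition branch. The same decision in C, for reference; SQ_WAVE_STATUS_HALT_MASK's value is assumed here (it is defined elsewhere in the file, outside this section), and the function is illustrative only:

    #include <stdint.h>

    #define SQ_WAVE_MODE_DEBUG_EN_MASK  0x800u
    #define SQ_WAVE_STATUS_HALT_MASK    0x2000u  /* assumed value; not shown in this section */

    enum trap_action { FETCH_2ND_TRAP, CHECK_SAVECTX };

    /* Prioritize a missed single-step exception (MODE.DEBUG_EN=1, wave not
     * yet halted by the second-level handler) over the context save; the
     * second-level trap halts the wave and RFEs, and the wave re-enters
     * this handler to take the SAVECTX path. */
    static enum trap_action single_step_missed(uint32_t mode, uint32_t status)
    {
        if ((mode & SQ_WAVE_MODE_DEBUG_EN_MASK) &&
            !(status & SQ_WAVE_STATUS_HALT_MASK))
            return FETCH_2ND_TRAP;
        return CHECK_SAVECTX;
    }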
---|

 L_HALT_WAVE:
 // If STATUS.HALT is set then this fault must come from SQC instruction fetch.
- // We cannot prevent further faults so just terminate the wavefront.
+ // We cannot prevent further faults. Spin wait until context saved.
 s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
 s_cbranch_scc0 L_NOT_ALREADY_HALTED
- s_endpgm
+
+L_WAIT_CTX_SAVE:
+ s_sleep 0x10
+ s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS)
+ s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_SAVECTX_MASK
+ s_cbranch_scc0 L_WAIT_CTX_SAVE
+
 L_NOT_ALREADY_HALTED:
 s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK

...
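An already-halted wave that faulted on instruction fetch now parks in L_WAIT_CTX_SAVE instead of executing s_endpgm, so its context can still be saved. The loop shape in C; read_trapsts and sleep_clocks are hypothetical stand-ins for the hardware accesses, and the SAVECTX mask value is assumed from its definition elsewhere in the file:

    #include <stdint.h>

    #define SQ_WAVE_TRAPSTS_SAVECTX_MASK  0x400u  /* assumed value */

    extern uint32_t read_trapsts(void);      /* stand-in for s_getreg_b32 hwreg(HW_REG_TRAPSTS) */
    extern void     sleep_clocks(unsigned);  /* stand-in for s_sleep */

    static void wait_for_ctx_save(void)
    {
        /* s_sleep 0x10 idles roughly 0x10 * 64 clocks per poll. */
        do {
            sleep_clocks(0x10 * 64);
        } while (!(read_trapsts() & SQ_WAVE_TRAPSTS_SAVECTX_MASK));
    }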
---|
 // Read second-level TBA/TMA from first-level TMA and jump if available.
 // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
 // ttmp12 holds SQ_WAVE_STATUS
- s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO)
- s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI)
- s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8
- s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA
+ s_getreg_b32 ttmp14, hwreg(HW_REG_SQ_SHADER_TMA_LO)
+ s_getreg_b32 ttmp15, hwreg(HW_REG_SQ_SHADER_TMA_HI)
+ s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
+ s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA
 s_waitcnt lgkmcnt(0)
- s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA
+ s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA
 s_waitcnt lgkmcnt(0)
 s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
 s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set
...
 set_status_without_spi_prio(s_save_status, ttmp2)

 s_rfe_b64 [ttmp0, ttmp1]
-end
+
 // ********* End handling of non-CWSR traps *******************

 /**************************************************************************/
...
 /**************************************************************************/

 L_SAVE:
-
-if G8SR_DEBUG_TIMESTAMP
- s_memrealtime s_g8sr_ts_save_s
- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
-end
-
 s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]

 s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
...
 s_mov_b32 s_save_exec_hi, exec_hi
 s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive

-if G8SR_DEBUG_TIMESTAMP
- s_memrealtime s_g8sr_ts_sq_save_msg
- s_waitcnt lgkmcnt(0)
-end
-
- if (EMU_RUN_HACK)
-
- else
 s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
- end

 // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
 s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
...
 L_SLEEP:
 s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0

- if (EMU_RUN_HACK)
-
- else
 s_cbranch_execz L_SLEEP
- end

-if G8SR_DEBUG_TIMESTAMP
- s_memrealtime s_g8sr_ts_spi_wrexec
- s_waitcnt lgkmcnt(0)
-end
-
- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
- //calculate wd_addr using absolute thread id
- v_readlane_b32 s_save_tmp, v9, 0
- s_lshr_b32 s_save_tmp, s_save_tmp, 6
- s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
- else
- end
- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
- else
- end
-
- // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
+ // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
 // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
 get_vgpr_size_bytes(s_save_ttmps_lo)
 get_sgpr_size_bytes(s_save_ttmps_hi)
...
 s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
 s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0
 s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF
- s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1
+ s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1
 ack_sqc_store_workaround()
- s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1
+ s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1
 ack_sqc_store_workaround()
- s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1
- ack_sqc_store_workaround()
- s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1
+ s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1
 ack_sqc_store_workaround()

 /* setup Resource Contants */
...
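The new store offsets are consistent with giving each ttmp register a fixed dword slot at 0x40 + 4*N from the start of the hwreg area, which is why 0x50, 0x60 and 0x74 line up with ttmp4, ttmp8 and ttmp13. A worked check in C (the slot formula is an inference from the three offsets above, not stated in the patch):

    #include <assert.h>
    #include <stdint.h>

    /* Slot for ttmpN, relative to size(VGPR)+size(SGPR): 0x40 + 4*N. */
    static uint32_t ttmp_slot(unsigned n)
    {
        return 0x40 + 4 * n;
    }

    int main(void)
    {
        assert(ttmp_slot(4)  == 0x50);  /* s_store_dwordx4 ttmp[4:7]  */
        assert(ttmp_slot(8)  == 0x60);  /* s_store_dwordx4 ttmp[8:11] */
        assert(ttmp_slot(13) == 0x74);  /* s_store_dword   ttmp13     */
        return 0;
    }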
---|


 s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
- if (SWIZZLE_EN)
- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
- else
 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- end


 write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
-
- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
- end
-
 write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
 write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
 write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
...
 s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)

- if (SGPR_SAVE_USE_SQC)
 s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
- else
- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
- end

- if (SWIZZLE_EN)
- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
- else
 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- end


 // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
...
 s_mov_b32 xnack_mask_lo, 0x0
 s_mov_b32 xnack_mask_hi, 0x0

- if (SWIZZLE_EN)
- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
- else
 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- end


 // VGPR Allocated in 4-GPR granularity

-if G8SR_VGPR_SR_IN_DWX4
- // the const stride for DWx4 is 4*4 bytes
- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+if SAVE_AFTER_XNACK_ERROR
+ check_if_tcp_store_ok()
+ s_cbranch_scc1 L_SAVE_FIRST_VGPRS_WITH_TCP

- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
+ s_branch L_SAVE_LDS

- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
-else
+L_SAVE_FIRST_VGPRS_WITH_TCP:
+end
+
 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
-end



...
 s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()


- if (SWIZZLE_EN)
- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
- else
 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- end

 s_mov_b32 m0, 0x0 //lds_offset initial value = 0


-var LDS_DMA_ENABLE = 0
-var UNROLL = 0
-if UNROLL==0 && LDS_DMA_ENABLE==1
- s_mov_b32 s3, 256*2
- s_nop 0
- s_nop 0
- s_nop 0
- L_SAVE_LDS_LOOP:
- //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.???
- if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity
- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
- end
-
- s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
- s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
- s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
- s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
-
-elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss
- // store from higest LDS address to lowest
- s_mov_b32 s3, 256*2
- s_sub_u32 m0, s_save_alloc_size, s3
- s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
- s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks...
- s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest
- s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction
- s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc
- s_nop 0
- s_nop 0
- s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes
- s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
- s_add_u32 s0, s0,s_save_alloc_size
- s_addc_u32 s1, s1, 0
- s_setpc_b64 s[0:1]
-
-
- for var i =0; i< 128; i++
- // be careful to make here a 64Byte aligned address, which could improve performance...
- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
-
- if i!=127
- s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline
- s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
- end
- end
-
-else // BUFFER_STORE
 v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
 v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
+
+if SAVE_AFTER_XNACK_ERROR
+ check_if_tcp_store_ok()
+ s_cbranch_scc1 L_SAVE_LDS_WITH_TCP
+
+ v_lshlrev_b32 v2, 2, v3
+L_SAVE_LDS_LOOP_SQC:
+ ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40
+ s_waitcnt lgkmcnt(0)
+
+ write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset)
+
+ v_add_u32 v2, 0x200, v2
+ v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
+ s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC
+
+ s_branch L_SAVE_LDS_DONE
+
+L_SAVE_LDS_WITH_TCP:
+end
+
 v_mul_i32_i24 v2, v3, 8 // tid*8
 v_mov_b32 v3, 256*2
 s_mov_b32 m0, 0x10000
...
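Coverage arithmetic for the SQC LDS fallback above: ds_read2_b32 with offset1:0x40 fetches a second dword 0x40 dwords (0x100 bytes) beyond the first, so one pass of the loop moves 64 lanes * 2 dwords * 4 bytes = 0x200 bytes, matching the v_add_u32 v2, 0x200, v2 stride. A quick C check of those numbers:

    #include <assert.h>

    int main(void)
    {
        const unsigned lanes = 64, dwords_per_lane = 2, dword_bytes = 4;
        const unsigned second_read_offset = 0x40 * dword_bytes;  /* offset1:0x40 dwords */

        assert(second_read_offset == 0x100);
        assert(lanes * dwords_per_lane * dword_bytes == 0x200);  /* loop stride */
        return 0;
    }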
---|
 // restore rsrc3
 s_mov_b32 s_save_buf_rsrc3, s0

-end
-
 L_SAVE_LDS_DONE:


...
 s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
 s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
- if (SWIZZLE_EN)
- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
- else
 s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- end


- // VGPR Allocated in 4-GPR granularity
-
-if G8SR_VGPR_SR_IN_DWX4
- // the const stride for DWx4 is 4*4 bytes
- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
-
- s_mov_b32 m0, 4 // skip first 4 VGPRs
- s_cmp_lt_u32 m0, s_save_alloc_size
- s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs
-
- s_set_gpr_idx_on m0, 0x1 // This will change M0
- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0
-L_SAVE_VGPR_LOOP:
- v_mov_b32 v0, v0 // v0 = v[0+m0]
- v_mov_b32 v1, v1
- v_mov_b32 v2, v2
- v_mov_b32 v3, v3
-
-
- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
- s_add_u32 m0, m0, 4
- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
- s_cmp_lt_u32 m0, s_save_alloc_size
- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
- s_set_gpr_idx_off
-L_SAVE_VGPR_LOOP_END:
-
- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
-else
 // VGPR store using dw burst
 s_mov_b32 m0, 0x4 //VGPR initial index value =0
 s_cmp_lt_u32 m0, s_save_alloc_size
...
 s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
 s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later

+if SAVE_AFTER_XNACK_ERROR
+ check_if_tcp_store_ok()
+ s_cbranch_scc1 L_SAVE_VGPR_LOOP
+
+L_SAVE_VGPR_LOOP_SQC:
+ write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
+
+ s_add_u32 m0, m0, 4
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC
+
+ s_set_gpr_idx_off
+ s_branch L_SAVE_VGPR_END
+end
+
 L_SAVE_VGPR_LOOP:
 v_mov_b32 v0, v0 //v0 = v[0+m0]
 v_mov_b32 v1, v1 //v0 = v[0+m0]
 v_mov_b32 v2, v2 //v0 = v[0+m0]
 v_mov_b32 v3, v3 //v0 = v[0+m0]

- if(USE_MTBUF_INSTEAD_OF_MUBUF)
- tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
- else
 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
- end

 s_add_u32 m0, m0, 4 //next vgpr index
 s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
 s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
 s_set_gpr_idx_off
-end

 L_SAVE_VGPR_END:

+if ASIC_TARGET_ARCTURUS
+ // Save ACC VGPRs
+ s_mov_b32 m0, 0x0 //VGPR initial index value =0
+ s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1

+if SAVE_AFTER_XNACK_ERROR
+ check_if_tcp_store_ok()
+ s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP

-
-
-
- /* S_PGM_END_SAVED */ //FIXME graphics ONLY
- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
- s_rfe_b64 s_save_pc_lo //Return to the main shader program
- else
+L_SAVE_ACCVGPR_LOOP_SQC:
+ for var vgpr = 0; vgpr < 4; ++ vgpr
+ v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0]
 end

-// Save Done timestamp
-if G8SR_DEBUG_TIMESTAMP
- s_memrealtime s_g8sr_ts_save_d
- // SGPR SR memory offset : size(VGPR)
- get_vgpr_size_bytes(s_save_mem_offset)
- s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
- // Need reset rsrc2??
- s_mov_b32 m0, s_save_mem_offset
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
+ write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
+
+ s_add_u32 m0, m0, 4
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP_SQC
+
+ s_set_gpr_idx_off
+ s_branch L_SAVE_ACCVGPR_END
 end

+L_SAVE_ACCVGPR_LOOP:
+ for var vgpr = 0; vgpr < 4; ++ vgpr
+ v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0]
+ end
+
+ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+
+ s_add_u32 m0, m0, 4
+ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
+ s_cmp_lt_u32 m0, s_save_alloc_size
+ s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP
+ s_set_gpr_idx_off
+
+L_SAVE_ACCVGPR_END:
+end

 s_branch L_END_PGM

...
---|

 L_RESTORE:
 /* Setup Resource Contants */
- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
- //calculate wd_addr using absolute thread id
- v_readlane_b32 s_restore_tmp, v9, 0
- s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
- s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
- s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
- s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
- s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
- else
- end
-
-if G8SR_DEBUG_TIMESTAMP
- s_memrealtime s_g8sr_ts_restore_s
- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
- // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case...
- s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
- s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored..
-end
-
-
-
 s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
 s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
 s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
...
 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow???


- if (SWIZZLE_EN)
- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
- else
 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- end
 s_mov_b32 m0, 0x0 //lds_offset initial value = 0

 L_RESTORE_LDS_LOOP:
- if (SAVE_LDS)
 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
- end
 s_add_u32 m0, m0, 256*2 // 128 DW
 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
 s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
...
 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
 s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
- if (SWIZZLE_EN)
- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
- else
+
+if ASIC_TARGET_ARCTURUS
+ s_mov_b32 s_restore_accvgpr_offset, s_restore_buf_rsrc2 //ACC VGPRs at end of VGPRs
+end
+
 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- end

-if G8SR_VGPR_SR_IN_DWX4
- get_vgpr_size_bytes(s_restore_mem_offset)
- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
-
- // the const stride for DWx4 is 4*4 bytes
- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
-
- s_mov_b32 m0, s_restore_alloc_size
- s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0
-
-L_RESTORE_VGPR_LOOP:
- buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
- s_waitcnt vmcnt(0)
- s_sub_u32 m0, m0, 4
- v_mov_b32 v0, v0 // v[0+m0] = v0
- v_mov_b32 v1, v1
- v_mov_b32 v2, v2
- v_mov_b32 v3, v3
- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
- s_cmp_eq_u32 m0, 0x8000
- s_cbranch_scc0 L_RESTORE_VGPR_LOOP
- s_set_gpr_idx_off
-
- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes
-
-else
 // VGPR load using dw burst
 s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
 s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+if ASIC_TARGET_ARCTURUS
+ s_mov_b32 s_restore_accvgpr_offset_save, s_restore_accvgpr_offset
+ s_add_u32 s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4
+end
 s_mov_b32 m0, 4 //VGPR initial index value = 1
 s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later

 L_RESTORE_VGPR_LOOP:
- if(USE_MTBUF_INSTEAD_OF_MUBUF)
- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
- else
+
+if ASIC_TARGET_ARCTURUS
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1
+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256
+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*2
+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*3
+ s_add_u32 s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4
+ s_waitcnt vmcnt(0)
+
+ for var vgpr = 0; vgpr < 4; ++ vgpr
+ v_accvgpr_write acc[vgpr], v[vgpr]
+ end
+end
+
 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
- end
 s_waitcnt vmcnt(0) //ensure data ready
 v_mov_b32 v0, v0 //v[0+m0] = v0
 v_mov_b32 v1, v1
...
 s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
 s_set_gpr_idx_off
 /* VGPR restore on v0 */
- if(USE_MTBUF_INSTEAD_OF_MUBUF)
- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
- else
+if ASIC_TARGET_ARCTURUS
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1
+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256
+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*2
+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*3
+ s_waitcnt vmcnt(0)
+
+ for var vgpr = 0; vgpr < 4; ++ vgpr
+ v_accvgpr_write acc[vgpr], v[vgpr]
+ end
+end
+
 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
-
-end

 /* restore SGPRs */
 //////////////////////////////
...
 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)

- if (SGPR_SAVE_USE_SQC)
 s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
- else
- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
- end
- if (SWIZZLE_EN)
- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
- else
 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- end

 s_mov_b32 m0, s_restore_alloc_size

...
 L_RESTORE_HWREG:


-if G8SR_DEBUG_TIMESTAMP
- s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
- s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
-end
-
 // HWREG SR memory offset : size(VGPR)+size(SGPR)
 get_vgpr_size_bytes(s_restore_mem_offset)
 get_sgpr_size_bytes(s_restore_tmp)
...


 s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
- if (SWIZZLE_EN)
- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
- else
 s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
- end

 read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0
 read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC
...

 s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS

- //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
- end
- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
- end
-
 s_mov_b32 m0, s_restore_m0
 s_mov_b32 exec_lo, s_restore_exec_lo
 s_mov_b32 exec_hi, s_restore_exec_hi
...
 //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
 s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode

- // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
+ // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
 // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
 get_vgpr_size_bytes(s_restore_ttmps_lo)
 get_sgpr_size_bytes(s_restore_ttmps_hi)
...
 s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
 s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
 s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
- s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1
- s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1
- s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1
- s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1
+ s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1
+ s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1
+ s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1
 s_waitcnt lgkmcnt(0)

 //reuse s_restore_m0 as a temp register
...
 set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which is changed by previous salu

 s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
-
-if G8SR_DEBUG_TIMESTAMP
- s_memrealtime s_g8sr_ts_restore_d
- s_waitcnt lgkmcnt(0)
-end

 // s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
 s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
...
 s_sub_u32 s_mem_offset, s_mem_offset, 4*16
 end

+function check_if_tcp_store_ok
+ // If STATUS.ALLOW_REPLAY=0 and TRAPSTS.XNACK_ERROR=1 then TCP stores will fail.
+ s_and_b32 s_save_tmp, s_save_status, SQ_WAVE_STATUS_ALLOW_REPLAY_MASK
+ s_cbranch_scc1 L_TCP_STORE_CHECK_DONE

+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS)
+ s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp
+
+L_TCP_STORE_CHECK_DONE:
+end
+
+function write_vgpr_to_mem_with_sqc(v, s_rsrc, s_mem_offset)
+ s_mov_b32 s4, 0
+
+L_WRITE_VGPR_LANE_LOOP:
+ for var lane = 0; lane < 4; ++ lane
+ v_readlane_b32 s[lane], v, s4
+ s_add_u32 s4, s4, 1
+ end
+
+ s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1
+ ack_sqc_store_workaround()
+
+ s_add_u32 s_mem_offset, s_mem_offset, 0x10
+ s_cmp_eq_u32 s4, 0x40
+ s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP
+end
+
+function write_vgprs_to_mem_with_sqc(v, n_vgprs, s_rsrc, s_mem_offset)
+ for var vgpr = 0; vgpr < n_vgprs; ++ vgpr
+ write_vgpr_to_mem_with_sqc(v[vgpr], s_rsrc, s_mem_offset)
+ end
+end
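write_vgpr_to_mem_with_sqc drains one VGPR (64 lanes * 4 bytes = 256 bytes) through the scalar path as sixteen 16-byte s_buffer_store_dwordx4 operations, with s4 counting lanes up to 0x40. The accounting, checked in C:

    #include <assert.h>

    int main(void)
    {
        unsigned lane = 0, bytes = 0, stores = 0;

        while (lane != 0x40) {   /* s_cmp_eq_u32 s4, 0x40 */
            lane  += 4;          /* four v_readlane_b32 per pass */
            bytes += 0x10;       /* one s_buffer_store_dwordx4 per pass */
            stores++;
        }
        assert(bytes == 256 && stores == 16);  /* one full 64-lane VGPR */
        return 0;
    }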
---|

 function get_lds_size_bytes(s_lds_size_byte)
 // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
...
 s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
 s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
 s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible
+
+if ASIC_TARGET_ARCTURUS
+ s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, 1 // Double size for ACC VGPRs
+end
 end
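The added shift doubles the computed VGPR save size on Arcturus because every architectural VGPR has an ACC VGPR saved alongside it. The size formula from the comments above, in C (illustrative helper):

    #include <stdint.h>

    /* Bytes of VGPR save space per wave:
     *   (vgpr_size_field + 1) * 4 registers, each 64 lanes * 4 bytes = 256B,
     * doubled on Arcturus for the ACC VGPRs. */
    static uint32_t vgpr_save_bytes(uint32_t vgpr_size_field, int arcturus)
    {
        uint32_t bytes = (vgpr_size_field + 1) << (2 + 8);
        return arcturus ? bytes << 1 : bytes;
    }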
---|

 function get_sgpr_size_bytes(s_sgpr_size_byte)
---|