| .. | .. |
|---|
| 24 | 24 | * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex |
|---|
| 25 | 25 | */ |
|---|
| 26 | 26 | |
|---|
| 27 | | -/* HW (GFX9) source code for CWSR trap handler */ |
|---|
| 28 | | -/* Version 18 + multiple trap handler */ |
|---|
| 29 | | - |
|---|
| 30 | | -// this performance-optimal version was originally from Seven Xu at SRDC |
|---|
| 31 | | - |
|---|
| 32 | | -// Revison #18 --... |
|---|
| 33 | | -/* Rev History |
|---|
| 34 | | -** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) |
|---|
| 35 | | -** #4. SR Memory Layout: |
|---|
| 36 | | -** 1. VGPR-SGPR-HWREG-{LDS} |
|---|
| 37 | | -** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. |
|---|
| 38 | | -** #5. Update: 1. Accurate g8sr_ts_save_d timestamp |
|---|
| 39 | | -** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) |
|---|
| 40 | | -** #7. Update: 1. don't barrier if noLDS |
|---|
| 41 | | -** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version |
|---|
| 42 | | -** 2. Fix SQ issue by s_sleep 2 |
|---|
| 43 | | -** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last |
|---|
| 44 | | -** 2. optimize s_buffer save by burst 16sgprs... |
|---|
| 45 | | -** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. |
|---|
| 46 | | -** #11. Update 1. Add 2 more timestamp for debug version |
|---|
| 47 | | -** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance |
|---|
| 48 | | -** #13. Integ 1. Always use MUBUF for PV trap shader... |
|---|
| 49 | | -** #14. Update 1. s_buffer_store soft clause... |
|---|
| 50 | | -** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. |
|---|
| 51 | | -** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree |
|---|
| 52 | | -** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] |
|---|
| 53 | | -** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... |
|---|
| 54 | | -** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 |
|---|
| 55 | | -** 2. FUNC - Handle non-CWSR traps |
|---|
| 56 | | -*/ |
|---|
| 57 | | - |
|---|
| 58 | | -var G8SR_WDMEM_HWREG_OFFSET = 0 |
|---|
| 59 | | -var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes |
|---|
| 60 | | - |
|---|
| 61 | | -// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. |
|---|
| 62 | | - |
|---|
| 63 | | -var G8SR_DEBUG_TIMESTAMP = 0 |
|---|
| 64 | | -var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset |
|---|
| 65 | | -var s_g8sr_ts_save_s = s[34:35] // save start |
|---|
| 66 | | -var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi |
|---|
| 67 | | -var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ |
|---|
| 68 | | -var s_g8sr_ts_save_d = s[40:41] // save end |
|---|
| 69 | | -var s_g8sr_ts_restore_s = s[42:43] // restore start |
|---|
| 70 | | -var s_g8sr_ts_restore_d = s[44:45] // restore end |
|---|
| 71 | | - |
|---|
| 72 | | -var G8SR_VGPR_SR_IN_DWX4 = 0 |
|---|
| 73 | | -var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes |
|---|
| 74 | | -var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 |
|---|
| 75 | | - |
|---|
| 76 | | - |
|---|
| 77 | | -/*************************************************************************/ |
|---|
| 78 | | -/* control on how to run the shader */ |
|---|
| 79 | | -/*************************************************************************/ |
|---|
| 80 | | -//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) |
|---|
| 81 | | -var EMU_RUN_HACK = 0 |
|---|
| 82 | | -var EMU_RUN_HACK_RESTORE_NORMAL = 0 |
|---|
| 83 | | -var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 |
|---|
| 84 | | -var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 |
|---|
| 85 | | -var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK |
|---|
| 86 | | -var SAVE_LDS = 1 |
|---|
| 87 | | -var WG_BASE_ADDR_LO = 0x9000a000 |
|---|
| 88 | | -var WG_BASE_ADDR_HI = 0x0 |
|---|
| 89 | | -var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem |
|---|
| 90 | | -var CTX_SAVE_CONTROL = 0x0 |
|---|
| 91 | | -var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL |
|---|
| 92 | | -var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) |
|---|
| 93 | | -var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write |
|---|
| 94 | | -var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes |
|---|
| 95 | | -var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing |
|---|
| 96 | 27 | var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency |
|---|
| 28 | +var SAVE_AFTER_XNACK_ERROR = 1 //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger |
|---|
| 29 | +var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised |
|---|
| 97 | 30 | |
|---|
| 98 | 31 | /**************************************************************************/ |
|---|
| 99 | 32 | /* variables */ |
|---|
| .. | .. |
|---|
| 107 | 40 | var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1 |
|---|
| 108 | 41 | var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3 |
|---|
| 109 | 42 | var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29 |
|---|
| 43 | +var SQ_WAVE_STATUS_ALLOW_REPLAY_MASK = 0x400000 |
|---|
| 110 | 44 | |
|---|
| 111 | 45 | var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 |
|---|
| 112 | 46 | var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 |
|---|
| .. | .. |
|---|
| 127 | 61 | var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 |
|---|
| 128 | 62 | var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 |
|---|
| 129 | 63 | var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 |
|---|
| 64 | +var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK = 0x10000000 |
|---|
| 130 | 65 | |
|---|
| 131 | 66 | var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME |
|---|
| 132 | 67 | var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME |
|---|
| 133 | 68 | var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000 |
|---|
| 134 | 69 | var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME |
|---|
| 70 | + |
|---|
| 71 | +var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800 |
|---|
| 135 | 72 | |
|---|
| 136 | 73 | var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 |
|---|
| 137 | 74 | var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 |
|---|
| .. | .. |
|---|
| 150 | 87 | var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG |
|---|
| 151 | 88 | var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 |
|---|
| 152 | 89 | |
|---|
| 153 | | -var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used |
|---|
| 154 | | -var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME |
|---|
| 155 | | -var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME |
|---|
| 156 | | -var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME |
|---|
| 90 | +var S_SAVE_PC_HI_RCNT_SHIFT = 27 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used |
|---|
| 91 | +var S_SAVE_PC_HI_RCNT_MASK = 0xF8000000 //FIXME |
|---|
| 92 | +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 26 //FIXME |
|---|
| 93 | +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x04000000 //FIXME |
|---|
| 157 | 94 | |
|---|
| 158 | 95 | var s_save_spi_init_lo = exec_lo |
|---|
| 159 | 96 | var s_save_spi_init_hi = exec_hi |
|---|
| .. | .. |
|---|
| 162 | 99 | var s_save_pc_hi = ttmp1 |
|---|
| 163 | 100 | var s_save_exec_lo = ttmp2 |
|---|
| 164 | 101 | var s_save_exec_hi = ttmp3 |
|---|
| 165 | | -var s_save_tmp = ttmp4 |
|---|
| 166 | | -var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine |
|---|
| 102 | +var s_save_tmp = ttmp14 |
|---|
| 103 | +var s_save_trapsts = ttmp15 //not really used until the end of the SAVE routine |
|---|
| 167 | 104 | var s_save_xnack_mask_lo = ttmp6 |
|---|
| 168 | 105 | var s_save_xnack_mask_hi = ttmp7 |
|---|
| 169 | 106 | var s_save_buf_rsrc0 = ttmp8 |
|---|
| .. | .. |
|---|
| 171 | 108 | var s_save_buf_rsrc2 = ttmp10 |
|---|
| 172 | 109 | var s_save_buf_rsrc3 = ttmp11 |
|---|
| 173 | 110 | var s_save_status = ttmp12 |
|---|
| 174 | | -var s_save_mem_offset = ttmp14 |
|---|
| 111 | +var s_save_mem_offset = ttmp4 |
|---|
| 175 | 112 | var s_save_alloc_size = s_save_trapsts //conflict |
|---|
| 176 | | -var s_save_m0 = ttmp15 |
|---|
| 113 | +var s_save_m0 = ttmp5 |
|---|
| 177 | 114 | var s_save_ttmps_lo = s_save_tmp //no conflict |
|---|
| 178 | 115 | var s_save_ttmps_hi = s_save_trapsts //no conflict |
|---|
| 179 | 116 | |
|---|
| .. | .. |
|---|
| 197 | 134 | var s_restore_spi_init_hi = exec_hi |
|---|
| 198 | 135 | |
|---|
| 199 | 136 | var s_restore_mem_offset = ttmp12 |
|---|
| 137 | +var s_restore_accvgpr_offset = ttmp13 |
|---|
| 200 | 138 | var s_restore_alloc_size = ttmp3 |
|---|
| 201 | 139 | var s_restore_tmp = ttmp2 |
|---|
| 202 | 140 | var s_restore_mem_offset_save = s_restore_tmp //no conflict |
|---|
| 141 | +var s_restore_accvgpr_offset_save = ttmp7 |
|---|
| 203 | 142 | |
|---|
| 204 | 143 | var s_restore_m0 = s_restore_alloc_size //no conflict |
|---|
| 205 | 144 | |
|---|
| 206 | | -var s_restore_mode = ttmp7 |
|---|
| 145 | +var s_restore_mode = s_restore_accvgpr_offset_save |
|---|
| 207 | 146 | |
|---|
| 208 | 147 | var s_restore_pc_lo = ttmp0 |
|---|
| 209 | 148 | var s_restore_pc_hi = ttmp1 |
|---|
| 210 | | -var s_restore_exec_lo = ttmp14 |
|---|
| 211 | | -var s_restore_exec_hi = ttmp15 |
|---|
| 212 | | -var s_restore_status = ttmp4 |
|---|
| 213 | | -var s_restore_trapsts = ttmp5 |
|---|
| 149 | +var s_restore_exec_lo = ttmp4 |
|---|
| 150 | +var s_restore_exec_hi = ttmp5 |
|---|
| 151 | +var s_restore_status = ttmp14 |
|---|
| 152 | +var s_restore_trapsts = ttmp15 |
|---|
| 214 | 153 | var s_restore_xnack_mask_lo = xnack_mask_lo |
|---|
| 215 | 154 | var s_restore_xnack_mask_hi = xnack_mask_hi |
|---|
| 216 | 155 | var s_restore_buf_rsrc0 = ttmp8 |
|---|
| .. | .. |
|---|
| 226 | 165 | /* Shader Main*/ |
|---|
| 227 | 166 | |
|---|
| 228 | 167 | shader main |
|---|
| 229 | | - asic(GFX9) |
|---|
| 168 | + asic(DEFAULT) |
|---|
| 230 | 169 | type(CS) |
|---|
| 231 | 170 | |
|---|
| 232 | 171 | |
|---|
| 233 | | - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore |
|---|
| 234 | | - //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC |
|---|
| 235 | | - s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC |
|---|
| 236 | | - s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. |
|---|
| 237 | | - s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE |
|---|
| 238 | | - //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE |
|---|
| 239 | | - s_branch L_SKIP_RESTORE //NOT restore, SAVE actually |
|---|
| 240 | | - else |
|---|
| 241 | 172 | s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save |
|---|
| 242 | | - end |
|---|
| 243 | 173 | |
|---|
| 244 | 174 | L_JUMP_TO_RESTORE: |
|---|
| 245 | 175 | s_branch L_RESTORE //restore |
|---|
| .. | .. |
|---|
| 248 | 178 | |
|---|
| 249 | 179 | s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC |
|---|
| 250 | 180 | s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save |
|---|
| 181 | + |
|---|
| 182 | +if SINGLE_STEP_MISSED_WORKAROUND |
|---|
| 183 | +	// No single-step exceptions are raised if MODE.DEBUG_EN=0. |
|---|
| 184 | + s_getreg_b32 ttmp2, hwreg(HW_REG_MODE) |
|---|
| 185 | + s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK |
|---|
| 186 | + s_cbranch_scc0 L_NO_SINGLE_STEP_WORKAROUND |
|---|
| 187 | + |
|---|
| 188 | + // Second-level trap already handled exception if STATUS.HALT=1. |
|---|
| 189 | + s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK |
|---|
| 190 | + |
|---|
| 191 | + // Prioritize single step exception over context save. |
|---|
| 192 | + // Second-level trap will halt wave and RFE, re-entering for SAVECTX. |
|---|
| 193 | + s_cbranch_scc0 L_FETCH_2ND_TRAP |
|---|
| 194 | + |
|---|
| 195 | +L_NO_SINGLE_STEP_WORKAROUND: |
|---|
| 196 | +end |
|---|
| 197 | + |
|---|
| 251 | 198 | s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) |
|---|
| 252 | 199 | s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save |
|---|
| 253 | 200 | s_cbranch_scc1 L_SAVE //this is the operation for save |
|---|
| 254 | 201 | |
|---|
| 255 | 202 | // ********* Handle non-CWSR traps ******************* |
|---|
| 256 | | -if (!EMU_RUN_HACK) |
|---|
| 203 | + |
|---|
| 257 | 204 | // Illegal instruction is a non-maskable exception which blocks context save. |
|---|
| 258 | 205 | // Halt the wavefront and return from the trap. |
|---|
| 259 | 206 | s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK |
|---|
| .. | .. |
|---|
| 266 | 213 | |
|---|
| 267 | 214 | L_HALT_WAVE: |
|---|
| 268 | 215 | // If STATUS.HALT is set then this fault must come from SQC instruction fetch. |
|---|
| 269 | | - // We cannot prevent further faults so just terminate the wavefront. |
|---|
| 216 | +	// We cannot prevent further faults. Spin-wait until the context is saved. |
|---|
| 270 | 217 | s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK |
|---|
| 271 | 218 | s_cbranch_scc0 L_NOT_ALREADY_HALTED |
|---|
| 272 | | - s_endpgm |
|---|
| 219 | + |
|---|
| 220 | +L_WAIT_CTX_SAVE: |
|---|
| 221 | + s_sleep 0x10 |
|---|
| 222 | + s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) |
|---|
| 223 | + s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_SAVECTX_MASK |
|---|
| 224 | + s_cbranch_scc0 L_WAIT_CTX_SAVE |
|---|
| 225 | + |
|---|
| 273 | 226 | L_NOT_ALREADY_HALTED: |
|---|
| 274 | 227 | s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK |
|---|
| 275 | 228 | |
|---|
| .. | .. |
|---|
| 293 | 246 | // Read second-level TBA/TMA from first-level TMA and jump if available. |
|---|
| 294 | 247 | // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) |
|---|
| 295 | 248 | // ttmp12 holds SQ_WAVE_STATUS |
|---|
| 296 | | - s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO) |
|---|
| 297 | | - s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI) |
|---|
| 298 | | - s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 |
|---|
| 299 | | - s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA |
|---|
| 249 | + s_getreg_b32 ttmp14, hwreg(HW_REG_SQ_SHADER_TMA_LO) |
|---|
| 250 | + s_getreg_b32 ttmp15, hwreg(HW_REG_SQ_SHADER_TMA_HI) |
|---|
| 251 | + s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 |
|---|
| 252 | + s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA |
|---|
| 300 | 253 | s_waitcnt lgkmcnt(0) |
|---|
| 301 | | - s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA |
|---|
| 254 | + s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA |
|---|
| 302 | 255 | s_waitcnt lgkmcnt(0) |
|---|
| 303 | 256 | s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] |
|---|
| 304 | 257 | s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set |
|---|
| .. | .. |
|---|
| 324 | 277 | set_status_without_spi_prio(s_save_status, ttmp2) |
|---|
| 325 | 278 | |
|---|
| 326 | 279 | s_rfe_b64 [ttmp0, ttmp1] |
|---|
| 327 | | -end |
|---|
| 280 | + |
|---|
| 328 | 281 | // ********* End handling of non-CWSR traps ******************* |
|---|
| 329 | 282 | |
|---|
| 330 | 283 | /**************************************************************************/ |
|---|
| .. | .. |
|---|
| 332 | 285 | /**************************************************************************/ |
|---|
| 333 | 286 | |
|---|
| 334 | 287 | L_SAVE: |
|---|
| 335 | | - |
|---|
| 336 | | -if G8SR_DEBUG_TIMESTAMP |
|---|
| 337 | | - s_memrealtime s_g8sr_ts_save_s |
|---|
| 338 | | - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? |
|---|
| 339 | | -end |
|---|
| 340 | | - |
|---|
| 341 | 288 | s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] |
|---|
| 342 | 289 | |
|---|
| 343 | 290 | s_mov_b32 s_save_tmp, 0 //clear saveCtx bit |
|---|
| .. | .. |
|---|
| 359 | 306 | s_mov_b32 s_save_exec_hi, exec_hi |
|---|
| 360 | 307 | s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive |
|---|
| 361 | 308 | |
|---|
| 362 | | -if G8SR_DEBUG_TIMESTAMP |
|---|
| 363 | | - s_memrealtime s_g8sr_ts_sq_save_msg |
|---|
| 364 | | - s_waitcnt lgkmcnt(0) |
|---|
| 365 | | -end |
|---|
| 366 | | - |
|---|
| 367 | | - if (EMU_RUN_HACK) |
|---|
| 368 | | - |
|---|
| 369 | | - else |
|---|
| 370 | 309 | s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC |
|---|
| 371 | | - end |
|---|
| 372 | 310 | |
|---|
| 373 | 311 | // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. |
|---|
| 374 | 312 | s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) |
|---|
| .. | .. |
|---|
| 377 | 315 | L_SLEEP: |
|---|
| 378 | 316 |	s_sleep 0x2		// sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7th/8th wave could not get arbitrated to execute instructions while the other waves are stuck in the sleep loop waiting for wrexec!=0 |
|---|
| 379 | 317 | |
|---|
| 380 | | - if (EMU_RUN_HACK) |
|---|
| 381 | | - |
|---|
| 382 | | - else |
|---|
| 383 | 318 | s_cbranch_execz L_SLEEP |
|---|
| 384 | | - end |
|---|
| 385 | 319 | |
|---|
| 386 | | -if G8SR_DEBUG_TIMESTAMP |
|---|
| 387 | | - s_memrealtime s_g8sr_ts_spi_wrexec |
|---|
| 388 | | - s_waitcnt lgkmcnt(0) |
|---|
| 389 | | -end |
|---|
| 390 | | - |
|---|
| 391 | | - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) |
|---|
| 392 | | - //calculate wd_addr using absolute thread id |
|---|
| 393 | | - v_readlane_b32 s_save_tmp, v9, 0 |
|---|
| 394 | | - s_lshr_b32 s_save_tmp, s_save_tmp, 6 |
|---|
| 395 | | - s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE |
|---|
| 396 | | - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO |
|---|
| 397 | | - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI |
|---|
| 398 | | - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL |
|---|
| 399 | | - else |
|---|
| 400 | | - end |
|---|
| 401 | | - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) |
|---|
| 402 | | - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO |
|---|
| 403 | | - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI |
|---|
| 404 | | - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL |
|---|
| 405 | | - else |
|---|
| 406 | | - end |
|---|
| 407 | | - |
|---|
| 408 | | - // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic |
|---|
| 320 | + // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic |
|---|
| 409 | 321 | // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 |
|---|
| 410 | 322 | get_vgpr_size_bytes(s_save_ttmps_lo) |
|---|
| 411 | 323 | get_sgpr_size_bytes(s_save_ttmps_hi) |
|---|
| .. | .. |
|---|
| 413 | 325 | s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo |
|---|
| 414 | 326 | s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0 |
|---|
| 415 | 327 | s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF |
|---|
| 416 | | - s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1 |
|---|
| 328 | + s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1 |
|---|
| 417 | 329 | ack_sqc_store_workaround() |
|---|
| 418 | | - s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1 |
|---|
| 330 | + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1 |
|---|
| 419 | 331 | ack_sqc_store_workaround() |
|---|
| 420 | | - s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1 |
|---|
| 421 | | - ack_sqc_store_workaround() |
|---|
| 422 | | - s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1 |
|---|
| 332 | + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1 |
|---|
| 423 | 333 | ack_sqc_store_workaround() |
|---|
| 424 | 334 | |
|---|
| 425 | 335 | /* setup Resource Contants */ |
|---|
| .. | .. |
|---|
| 455 | 365 | |
|---|
| 456 | 366 | |
|---|
| 457 | 367 | s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes |
|---|
| 458 | | - if (SWIZZLE_EN) |
|---|
| 459 | | - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? |
|---|
| 460 | | - else |
|---|
| 461 | 368 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes |
|---|
| 462 | | - end |
|---|
| 463 | 369 | |
|---|
| 464 | 370 | |
|---|
| 465 | 371 | write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 |
|---|
| 466 | | - |
|---|
| 467 | | - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) |
|---|
| 468 | | - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 |
|---|
| 469 | | - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over |
|---|
| 470 | | - end |
|---|
| 471 | | - |
|---|
| 472 | 372 | write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC |
|---|
| 473 | 373 | write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) |
|---|
| 474 | 374 | write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC |
|---|
| .. | .. |
|---|
| 506 | 406 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 |
|---|
| 507 | 407 | s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) |
|---|
| 508 | 408 | |
|---|
| 509 | | - if (SGPR_SAVE_USE_SQC) |
|---|
| 510 | 409 | s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes |
|---|
| 511 | | - else |
|---|
| 512 | | - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) |
|---|
| 513 | | - end |
|---|
| 514 | 410 | |
|---|
| 515 | | - if (SWIZZLE_EN) |
|---|
| 516 | | - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? |
|---|
| 517 | | - else |
|---|
| 518 | 411 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes |
|---|
| 519 | | - end |
|---|
| 520 | 412 | |
|---|
| 521 | 413 | |
|---|
| 522 | 414 | // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 |
|---|
| .. | .. |
|---|
| 559 | 451 | s_mov_b32 xnack_mask_lo, 0x0 |
|---|
| 560 | 452 | s_mov_b32 xnack_mask_hi, 0x0 |
|---|
| 561 | 453 | |
|---|
| 562 | | - if (SWIZZLE_EN) |
|---|
| 563 | | - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? |
|---|
| 564 | | - else |
|---|
| 565 | 454 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes |
|---|
| 566 | | - end |
|---|
| 567 | 455 | |
|---|
| 568 | 456 | |
|---|
| 569 | 457 | // VGPR Allocated in 4-GPR granularity |
|---|
| 570 | 458 | |
|---|
| 571 | | -if G8SR_VGPR_SR_IN_DWX4 |
|---|
| 572 | | - // the const stride for DWx4 is 4*4 bytes |
|---|
| 573 | | - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 |
|---|
| 574 | | - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes |
|---|
| 459 | +if SAVE_AFTER_XNACK_ERROR |
|---|
| 460 | + check_if_tcp_store_ok() |
|---|
| 461 | + s_cbranch_scc1 L_SAVE_FIRST_VGPRS_WITH_TCP |
|---|
| 575 | 462 | |
|---|
| 576 | | - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 |
|---|
| 463 | + write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) |
|---|
| 464 | + s_branch L_SAVE_LDS |
|---|
| 577 | 465 | |
|---|
| 578 | | - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 |
|---|
| 579 | | - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes |
|---|
| 580 | | -else |
|---|
| 466 | +L_SAVE_FIRST_VGPRS_WITH_TCP: |
|---|
| 467 | +end |
|---|
| 468 | + |
|---|
| 581 | 469 | buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 |
|---|
| 582 | 470 | buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 |
|---|
| 583 | 471 | buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 |
|---|
| 584 | 472 | buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 |
|---|
| 585 | | -end |
|---|
| 586 | 473 | |
|---|
| 587 | 474 | |
|---|
| 588 | 475 | |
|---|
| .. | .. |
|---|
| 617 | 504 | s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() |
|---|
| 618 | 505 | |
|---|
| 619 | 506 | |
|---|
| 620 | | - if (SWIZZLE_EN) |
|---|
| 621 | | - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? |
|---|
| 622 | | - else |
|---|
| 623 | 507 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes |
|---|
| 624 | | - end |
|---|
| 625 | 508 | |
|---|
| 626 | 509 | s_mov_b32 m0, 0x0 //lds_offset initial value = 0 |
|---|
| 627 | 510 | |
|---|
| 628 | 511 | |
|---|
| 629 | | -var LDS_DMA_ENABLE = 0 |
|---|
| 630 | | -var UNROLL = 0 |
|---|
| 631 | | -if UNROLL==0 && LDS_DMA_ENABLE==1 |
|---|
| 632 | | - s_mov_b32 s3, 256*2 |
|---|
| 633 | | - s_nop 0 |
|---|
| 634 | | - s_nop 0 |
|---|
| 635 | | - s_nop 0 |
|---|
| 636 | | - L_SAVE_LDS_LOOP: |
|---|
| 637 | | - //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? |
|---|
| 638 | | - if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity |
|---|
| 639 | | - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW |
|---|
| 640 | | - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW |
|---|
| 641 | | - end |
|---|
| 642 | | - |
|---|
| 643 | | - s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes |
|---|
| 644 | | - s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes |
|---|
| 645 | | - s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 |
|---|
| 646 | | - s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? |
|---|
| 647 | | - |
|---|
| 648 | | -elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss |
|---|
| 649 | | - // store from higest LDS address to lowest |
|---|
| 650 | | - s_mov_b32 s3, 256*2 |
|---|
| 651 | | - s_sub_u32 m0, s_save_alloc_size, s3 |
|---|
| 652 | | - s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 |
|---|
| 653 | | - s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... |
|---|
| 654 | | - s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest |
|---|
| 655 | | - s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction |
|---|
| 656 | | - s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc |
|---|
| 657 | | - s_nop 0 |
|---|
| 658 | | - s_nop 0 |
|---|
| 659 | | - s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes |
|---|
| 660 | | - s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved |
|---|
| 661 | | - s_add_u32 s0, s0,s_save_alloc_size |
|---|
| 662 | | - s_addc_u32 s1, s1, 0 |
|---|
| 663 | | - s_setpc_b64 s[0:1] |
|---|
| 664 | | - |
|---|
| 665 | | - |
|---|
| 666 | | - for var i =0; i< 128; i++ |
|---|
| 667 | | - // be careful to make here a 64Byte aligned address, which could improve performance... |
|---|
| 668 | | - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW |
|---|
| 669 | | - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW |
|---|
| 670 | | - |
|---|
| 671 | | - if i!=127 |
|---|
| 672 | | - s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline |
|---|
| 673 | | - s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 |
|---|
| 674 | | - end |
|---|
| 675 | | - end |
|---|
| 676 | | - |
|---|
| 677 | | -else // BUFFER_STORE |
|---|
| 678 | 512 | v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 |
|---|
| 679 | 513 | v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid |
|---|
| 514 | + |
|---|
| 515 | +if SAVE_AFTER_XNACK_ERROR |
|---|
| 516 | + check_if_tcp_store_ok() |
|---|
| 517 | + s_cbranch_scc1 L_SAVE_LDS_WITH_TCP |
|---|
| 518 | + |
|---|
| 519 | + v_lshlrev_b32 v2, 2, v3 |
|---|
| 520 | +L_SAVE_LDS_LOOP_SQC: |
|---|
| 521 | + ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40 |
|---|
| 522 | + s_waitcnt lgkmcnt(0) |
|---|
| 523 | + |
|---|
| 524 | + write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset) |
|---|
| 525 | + |
|---|
| 526 | + v_add_u32 v2, 0x200, v2 |
|---|
| 527 | + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size |
|---|
| 528 | + s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC |
|---|
| 529 | + |
|---|
| 530 | + s_branch L_SAVE_LDS_DONE |
|---|
| 531 | + |
|---|
| 532 | +L_SAVE_LDS_WITH_TCP: |
|---|
| 533 | +end |
|---|
| 534 | + |
|---|
| 680 | 535 | v_mul_i32_i24 v2, v3, 8 // tid*8 |
|---|
| 681 | 536 | v_mov_b32 v3, 256*2 |
|---|
| 682 | 537 | s_mov_b32 m0, 0x10000 |
|---|
| .. | .. |
|---|
| 697 | 552 | // restore rsrc3 |
|---|
| 698 | 553 | s_mov_b32 s_save_buf_rsrc3, s0 |
|---|
| 699 | 554 | |
|---|
| 700 | | -end |
|---|
| 701 | | - |
|---|
| 702 | 555 | L_SAVE_LDS_DONE: |
|---|
| 703 | 556 | |
|---|
| 704 | 557 | |
|---|
| .. | .. |
|---|
| 716 | 569 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 |
|---|
| 717 | 570 | s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible |
|---|
| 718 | 571 | s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) |
|---|
| 719 | | - if (SWIZZLE_EN) |
|---|
| 720 | | - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? |
|---|
| 721 | | - else |
|---|
| 722 | 572 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes |
|---|
| 723 | | - end |
|---|
| 724 | 573 | |
|---|
| 725 | 574 | |
|---|
| 726 | | - // VGPR Allocated in 4-GPR granularity |
|---|
| 727 | | - |
|---|
| 728 | | -if G8SR_VGPR_SR_IN_DWX4 |
|---|
| 729 | | - // the const stride for DWx4 is 4*4 bytes |
|---|
| 730 | | - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 |
|---|
| 731 | | - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes |
|---|
| 732 | | - |
|---|
| 733 | | - s_mov_b32 m0, 4 // skip first 4 VGPRs |
|---|
| 734 | | - s_cmp_lt_u32 m0, s_save_alloc_size |
|---|
| 735 | | - s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs |
|---|
| 736 | | - |
|---|
| 737 | | - s_set_gpr_idx_on m0, 0x1 // This will change M0 |
|---|
| 738 | | - s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 |
|---|
| 739 | | -L_SAVE_VGPR_LOOP: |
|---|
| 740 | | - v_mov_b32 v0, v0 // v0 = v[0+m0] |
|---|
| 741 | | - v_mov_b32 v1, v1 |
|---|
| 742 | | - v_mov_b32 v2, v2 |
|---|
| 743 | | - v_mov_b32 v3, v3 |
|---|
| 744 | | - |
|---|
| 745 | | - |
|---|
| 746 | | - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 |
|---|
| 747 | | - s_add_u32 m0, m0, 4 |
|---|
| 748 | | - s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 |
|---|
| 749 | | - s_cmp_lt_u32 m0, s_save_alloc_size |
|---|
| 750 | | - s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? |
|---|
| 751 | | - s_set_gpr_idx_off |
|---|
| 752 | | -L_SAVE_VGPR_LOOP_END: |
|---|
| 753 | | - |
|---|
| 754 | | - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 |
|---|
| 755 | | - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes |
|---|
| 756 | | -else |
|---|
| 757 | 575 | // VGPR store using dw burst |
|---|
| 758 | 576 | s_mov_b32 m0, 0x4 //VGPR initial index value = 4 |
|---|
| 759 | 577 | s_cmp_lt_u32 m0, s_save_alloc_size |
|---|
| .. | .. |
|---|
| 763 | 581 | s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 |
|---|
| 764 | 582 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later |
|---|
| 765 | 583 | |
|---|
| 584 | +if SAVE_AFTER_XNACK_ERROR |
|---|
| 585 | + check_if_tcp_store_ok() |
|---|
| 586 | + s_cbranch_scc1 L_SAVE_VGPR_LOOP |
|---|
| 587 | + |
|---|
| 588 | +L_SAVE_VGPR_LOOP_SQC: |
|---|
| 589 | + write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) |
|---|
| 590 | + |
|---|
| 591 | + s_add_u32 m0, m0, 4 |
|---|
| 592 | + s_cmp_lt_u32 m0, s_save_alloc_size |
|---|
| 593 | + s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC |
|---|
| 594 | + |
|---|
| 595 | + s_set_gpr_idx_off |
|---|
| 596 | + s_branch L_SAVE_VGPR_END |
|---|
| 597 | +end |
|---|
| 598 | + |
|---|
| 766 | 599 | L_SAVE_VGPR_LOOP: |
|---|
| 767 | 600 | v_mov_b32 v0, v0 //v0 = v[0+m0] |
|---|
| 768 | 601 | v_mov_b32 v1, v1 //v1 = v[1+m0] |
|---|
| 769 | 602 | v_mov_b32 v2, v2 //v2 = v[2+m0] |
|---|
| 770 | 603 | v_mov_b32 v3, v3 //v3 = v[3+m0] |
|---|
| 771 | 604 | |
|---|
| 772 | | - if(USE_MTBUF_INSTEAD_OF_MUBUF) |
|---|
| 773 | | - tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 |
|---|
| 774 | | - else |
|---|
| 775 | 605 | buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 |
|---|
| 776 | 606 | buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 |
|---|
| 777 | 607 | buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 |
|---|
| 778 | 608 | buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 |
|---|
| 779 | | - end |
|---|
| 780 | 609 | |
|---|
| 781 | 610 | s_add_u32 m0, m0, 4 //next vgpr index |
|---|
| 782 | 611 | s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes |
|---|
| 783 | 612 | s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 |
|---|
| 784 | 613 | s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? |
|---|
| 785 | 614 | s_set_gpr_idx_off |
|---|
| 786 | | -end |
|---|
| 787 | 615 | |
|---|
| 788 | 616 | L_SAVE_VGPR_END: |
|---|
| 789 | 617 | |
|---|
| 618 | +if ASIC_TARGET_ARCTURUS |
|---|
| 619 | + // Save ACC VGPRs |
|---|
| 620 | + s_mov_b32 m0, 0x0 //VGPR initial index value =0 |
|---|
| 621 | + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 |
|---|
| 790 | 622 | |
|---|
| 623 | +if SAVE_AFTER_XNACK_ERROR |
|---|
| 624 | + check_if_tcp_store_ok() |
|---|
| 625 | + s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP |
|---|
| 791 | 626 | |
|---|
| 792 | | - |
|---|
| 793 | | - |
|---|
| 794 | | - |
|---|
| 795 | | - /* S_PGM_END_SAVED */ //FIXME graphics ONLY |
|---|
| 796 | | - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) |
|---|
| 797 | | - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] |
|---|
| 798 | | - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 |
|---|
| 799 | | - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over |
|---|
| 800 | | - s_rfe_b64 s_save_pc_lo //Return to the main shader program |
|---|
| 801 | | - else |
|---|
| 627 | +L_SAVE_ACCVGPR_LOOP_SQC: |
|---|
| 628 | + for var vgpr = 0; vgpr < 4; ++ vgpr |
|---|
| 629 | + v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0] |
|---|
| 802 | 630 | end |
|---|
| 803 | 631 | |
|---|
| 804 | | -// Save Done timestamp |
|---|
| 805 | | -if G8SR_DEBUG_TIMESTAMP |
|---|
| 806 | | - s_memrealtime s_g8sr_ts_save_d |
|---|
| 807 | | - // SGPR SR memory offset : size(VGPR) |
|---|
| 808 | | - get_vgpr_size_bytes(s_save_mem_offset) |
|---|
| 809 | | - s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET |
|---|
| 810 | | - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? |
|---|
| 811 | | - // Need reset rsrc2?? |
|---|
| 812 | | - s_mov_b32 m0, s_save_mem_offset |
|---|
| 813 | | - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes |
|---|
| 814 | | - s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 |
|---|
| 632 | + write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) |
|---|
| 633 | + |
|---|
| 634 | + s_add_u32 m0, m0, 4 |
|---|
| 635 | + s_cmp_lt_u32 m0, s_save_alloc_size |
|---|
| 636 | + s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP_SQC |
|---|
| 637 | + |
|---|
| 638 | + s_set_gpr_idx_off |
|---|
| 639 | + s_branch L_SAVE_ACCVGPR_END |
|---|
| 815 | 640 | end |
|---|
| 816 | 641 | |
|---|
| 642 | +L_SAVE_ACCVGPR_LOOP: |
|---|
| 643 | + for var vgpr = 0; vgpr < 4; ++ vgpr |
|---|
| 644 | + v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0] |
|---|
| 645 | + end |
|---|
| 646 | + |
|---|
| 647 | + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 |
|---|
| 648 | + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 |
|---|
| 649 | + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 |
|---|
| 650 | + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 |
|---|
| 651 | + |
|---|
| 652 | + s_add_u32 m0, m0, 4 |
|---|
| 653 | + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 |
|---|
| 654 | + s_cmp_lt_u32 m0, s_save_alloc_size |
|---|
| 655 | + s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP |
|---|
| 656 | + s_set_gpr_idx_off |
|---|
| 657 | + |
|---|
| 658 | +L_SAVE_ACCVGPR_END: |
|---|
| 659 | +end |
|---|
| 817 | 660 | |
|---|
| 818 | 661 | s_branch L_END_PGM |
|---|
| 819 | 662 | |
|---|
| .. | .. |
|---|
| 825 | 668 | |
|---|
| 826 | 669 | L_RESTORE: |
|---|
| 827 | 670 | /* Setup Resource Contants */ |
|---|
| 828 | | - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) |
|---|
| 829 | | - //calculate wd_addr using absolute thread id |
|---|
| 830 | | - v_readlane_b32 s_restore_tmp, v9, 0 |
|---|
| 831 | | - s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 |
|---|
| 832 | | - s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE |
|---|
| 833 | | - s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO |
|---|
| 834 | | - s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI |
|---|
| 835 | | - s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL |
|---|
| 836 | | - else |
|---|
| 837 | | - end |
|---|
| 838 | | - |
|---|
| 839 | | -if G8SR_DEBUG_TIMESTAMP |
|---|
| 840 | | - s_memrealtime s_g8sr_ts_restore_s |
|---|
| 841 | | - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? |
|---|
| 842 | | - // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... |
|---|
| 843 | | - s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] |
|---|
| 844 | | - s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. |
|---|
| 845 | | -end |
|---|
| 846 | | - |
|---|
| 847 | | - |
|---|
| 848 | | - |
|---|
| 849 | 671 | s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo |
|---|
| 850 | 672 | s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi |
|---|
| 851 | 673 | s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE |
|---|
| .. | .. |
|---|
| 887 | 709 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? |
|---|
| 888 | 710 | |
|---|
| 889 | 711 | |
|---|
| 890 | | - if (SWIZZLE_EN) |
|---|
| 891 | | - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? |
|---|
| 892 | | - else |
|---|
| 893 | 712 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes |
|---|
| 894 | | - end |
|---|
| 895 | 713 | s_mov_b32 m0, 0x0 //lds_offset initial value = 0 |
|---|
| 896 | 714 | |
|---|
| 897 | 715 | L_RESTORE_LDS_LOOP: |
|---|
| 898 | | - if (SAVE_LDS) |
|---|
| 899 | 716 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW |
|---|
| 900 | 717 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW |
|---|
| 901 | | - end |
|---|
| 902 | 718 | s_add_u32 m0, m0, 256*2 // 128 DW |
|---|
| 903 | 719 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW |
|---|
| 904 | 720 | s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 |
|---|
| .. | .. |
|---|
| 917 | 733 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 |
|---|
| 918 | 734 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) |
|---|
| 919 | 735 | s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) |
|---|
| 920 | | - if (SWIZZLE_EN) |
|---|
| 921 | | - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? |
|---|
| 922 | | - else |
|---|
| 736 | + |
|---|
| 737 | +if ASIC_TARGET_ARCTURUS |
|---|
| 738 | + s_mov_b32 s_restore_accvgpr_offset, s_restore_buf_rsrc2 //ACC VGPRs at end of VGPRs |
|---|
| 739 | +end |
|---|
| 740 | + |
|---|
| 923 | 741 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes |
|---|
| 924 | | - end |
|---|
| 925 | 742 | |
|---|
| 926 | | -if G8SR_VGPR_SR_IN_DWX4 |
|---|
| 927 | | - get_vgpr_size_bytes(s_restore_mem_offset) |
|---|
| 928 | | - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 |
|---|
| 929 | | - |
|---|
| 930 | | - // the const stride for DWx4 is 4*4 bytes |
|---|
| 931 | | - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 |
|---|
| 932 | | - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes |
|---|
| 933 | | - |
|---|
| 934 | | - s_mov_b32 m0, s_restore_alloc_size |
|---|
| 935 | | - s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 |
|---|
| 936 | | - |
|---|
| 937 | | -L_RESTORE_VGPR_LOOP: |
|---|
| 938 | | - buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 |
|---|
| 939 | | - s_waitcnt vmcnt(0) |
|---|
| 940 | | - s_sub_u32 m0, m0, 4 |
|---|
| 941 | | - v_mov_b32 v0, v0 // v[0+m0] = v0 |
|---|
| 942 | | - v_mov_b32 v1, v1 |
|---|
| 943 | | - v_mov_b32 v2, v2 |
|---|
| 944 | | - v_mov_b32 v3, v3 |
|---|
| 945 | | - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 |
|---|
| 946 | | - s_cmp_eq_u32 m0, 0x8000 |
|---|
| 947 | | - s_cbranch_scc0 L_RESTORE_VGPR_LOOP |
|---|
| 948 | | - s_set_gpr_idx_off |
|---|
| 949 | | - |
|---|
| 950 | | - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 |
|---|
| 951 | | - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes |
|---|
| 952 | | - |
|---|
| 953 | | -else |
|---|
| 954 | 743 | // VGPR load using dw burst |
|---|
| 955 | 744 | s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last |
|---|
| 956 | 745 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 |
|---|
| 746 | +if ASIC_TARGET_ARCTURUS |
|---|
| 747 | + s_mov_b32 s_restore_accvgpr_offset_save, s_restore_accvgpr_offset |
|---|
| 748 | + s_add_u32 s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4 |
|---|
| 749 | +end |
|---|
| 957 | 750 | s_mov_b32 m0, 4 //VGPR initial index value = 4 |
|---|
| 958 | 751 | s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 |
|---|
| 959 | 752 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later |
|---|
| 960 | 753 | |
|---|
| 961 | 754 | L_RESTORE_VGPR_LOOP: |
|---|
| 962 | | - if(USE_MTBUF_INSTEAD_OF_MUBUF) |
|---|
| 963 | | - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 |
|---|
| 964 | | - else |
|---|
| 755 | + |
|---|
| 756 | +if ASIC_TARGET_ARCTURUS |
|---|
| 757 | + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 |
|---|
| 758 | + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256 |
|---|
| 759 | + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*2 |
|---|
| 760 | + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*3 |
|---|
| 761 | + s_add_u32 s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4 |
|---|
| 762 | + s_waitcnt vmcnt(0) |
|---|
| 763 | + |
|---|
| 764 | + for var vgpr = 0; vgpr < 4; ++ vgpr |
|---|
| 765 | + v_accvgpr_write acc[vgpr], v[vgpr] |
|---|
| 766 | + end |
|---|
| 767 | +end |
|---|
| 768 | + |
|---|
| 965 | 769 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 |
|---|
| 966 | 770 | buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 |
|---|
| 967 | 771 | buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 |
|---|
| 968 | 772 | buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 |
|---|
| 969 | | - end |
|---|
| 970 | 773 | s_waitcnt vmcnt(0) //ensure data ready |
|---|
| 971 | 774 | v_mov_b32 v0, v0 //v[0+m0] = v0 |
|---|
| 972 | 775 | v_mov_b32 v1, v1 |
|---|
| .. | .. |
|---|
| 978 | 781 | s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? |
|---|
| 979 | 782 | s_set_gpr_idx_off |
|---|
| 980 | 783 | /* VGPR restore on v0 */ |
|---|
| 981 | | - if(USE_MTBUF_INSTEAD_OF_MUBUF) |
|---|
| 982 | | - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 |
|---|
| 983 | | - else |
|---|
| 784 | +if ASIC_TARGET_ARCTURUS |
|---|
| 785 | + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 |
|---|
| 786 | + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256 |
|---|
| 787 | + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*2 |
|---|
| 788 | + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*3 |
|---|
| 789 | + s_waitcnt vmcnt(0) |
|---|
| 790 | + |
|---|
| 791 | + for var vgpr = 0; vgpr < 4; ++ vgpr |
|---|
| 792 | + v_accvgpr_write acc[vgpr], v[vgpr] |
|---|
| 793 | + end |
|---|
| 794 | +end |
|---|
| 795 | + |
|---|
| 984 | 796 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 |
|---|
| 985 | 797 | buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 |
|---|
| 986 | 798 | buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 |
|---|
| 987 | 799 | buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 |
|---|
| 988 | | - end |
|---|
| 989 | | - |
|---|
| 990 | | -end |
|---|
| 991 | 800 | |
|---|
| 992 | 801 | /* restore SGPRs */ |
|---|
| 993 | 802 | ////////////////////////////// |
|---|
| .. | .. |
|---|
| 1003 | 812 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 |
|---|
| 1004 | 813 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) |
|---|
| 1005 | 814 | |
|---|
| 1006 | | - if (SGPR_SAVE_USE_SQC) |
|---|
| 1007 | 815 | s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes |
|---|
| 1008 | | - else |
|---|
| 1009 | | - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) |
|---|
| 1010 | | - end |
|---|
| 1011 | | - if (SWIZZLE_EN) |
|---|
| 1012 | | - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? |
|---|
| 1013 | | - else |
|---|
| 1014 | 816 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes |
|---|
| 1015 | | - end |
|---|
| 1016 | 817 | |
|---|
| 1017 | 818 | s_mov_b32 m0, s_restore_alloc_size |
|---|
| 1018 | 819 | |
|---|
| .. | .. |
|---|
| 1040 | 841 | L_RESTORE_HWREG: |
|---|
| 1041 | 842 | |
|---|
| 1042 | 843 | |
|---|
| 1043 | | -if G8SR_DEBUG_TIMESTAMP |
|---|
| 1044 | | - s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo |
|---|
| 1045 | | - s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi |
|---|
| 1046 | | -end |
|---|
| 1047 | | - |
|---|
| 1048 | 844 | // HWREG SR memory offset : size(VGPR)+size(SGPR) |
|---|
| 1049 | 845 | get_vgpr_size_bytes(s_restore_mem_offset) |
|---|
| 1050 | 846 | get_sgpr_size_bytes(s_restore_tmp) |
|---|
| .. | .. |
|---|
| 1052 | 848 | |
|---|
| 1053 | 849 | |
|---|
| 1054 | 850 | s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes |
|---|
| 1055 | | - if (SWIZZLE_EN) |
|---|
| 1056 | | - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? |
|---|
| 1057 | | - else |
|---|
| 1058 | 851 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes |
|---|
| 1059 | | - end |
|---|
| 1060 | 852 | |
|---|
| 1061 | 853 | read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 |
|---|
| 1062 | 854 | read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC |
|---|
| .. | .. |
|---|
| 1071 | 863 | |
|---|
| 1072 | 864 | s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS |
|---|
| 1073 | 865 | |
|---|
| 1074 | | - //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: |
|---|
| 1075 | | - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) |
|---|
| 1076 | | - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) |
|---|
| 1077 | | - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over |
|---|
| 1078 | | - end |
|---|
| 1079 | | - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) |
|---|
| 1080 | | - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal |
|---|
| 1081 | | - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over |
|---|
| 1082 | | - end |
|---|
| 1083 | | - |
|---|
| 1084 | 866 | s_mov_b32 m0, s_restore_m0 |
|---|
| 1085 | 867 | s_mov_b32 exec_lo, s_restore_exec_lo |
|---|
| 1086 | 868 | s_mov_b32 exec_hi, s_restore_exec_hi |
|---|
| .. | .. |
|---|
| 1093 | 875 | //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore |
|---|
| 1094 | 876 | s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode |
|---|
| 1095 | 877 | |
|---|
| 1096 | | - // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic |
|---|
| 878 | + // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic |
|---|
| 1097 | 879 | // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 |
|---|
| 1098 | 880 | get_vgpr_size_bytes(s_restore_ttmps_lo) |
|---|
| 1099 | 881 | get_sgpr_size_bytes(s_restore_ttmps_hi) |
|---|
| .. | .. |
|---|
| 1101 | 883 | s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 |
|---|
| 1102 | 884 | s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 |
|---|
| 1103 | 885 | s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF |
|---|
| 1104 | | - s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1 |
|---|
| 1105 | | - s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1 |
|---|
| 1106 | | - s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1 |
|---|
| 1107 | | - s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1 |
|---|
| 886 | + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1 |
|---|
| 887 | + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1 |
|---|
| 888 | + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1 |
|---|
| 1108 | 889 | s_waitcnt lgkmcnt(0) |
|---|
| 1109 | 890 | |
|---|
| 1110 | 891 | //reuse s_restore_m0 as a temp register |
|---|
| .. | .. |
|---|
| 1127 | 908 | set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which is changed by previous salu |
|---|
| 1128 | 909 | |
|---|
| 1129 | 910 | s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time |
|---|
| 1130 | | - |
|---|
| 1131 | | -if G8SR_DEBUG_TIMESTAMP |
|---|
| 1132 | | - s_memrealtime s_g8sr_ts_restore_d |
|---|
| 1133 | | - s_waitcnt lgkmcnt(0) |
|---|
| 1134 | | -end |
|---|
| 1135 | 911 | |
|---|
| 1136 | 912 | // s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution |
|---|
| 1137 | 913 | s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc |
|---|
| .. | .. |
|---|
| 1187 | 963 | s_sub_u32 s_mem_offset, s_mem_offset, 4*16 |
|---|
| 1188 | 964 | end |
|---|
| 1189 | 965 | |
|---|
| 966 | +function check_if_tcp_store_ok |
|---|
| 967 | + // If STATUS.ALLOW_REPLAY=0 and TRAPSTS.XNACK_ERROR=1 then TCP stores will fail. |
|---|
| 968 | + s_and_b32 s_save_tmp, s_save_status, SQ_WAVE_STATUS_ALLOW_REPLAY_MASK |
|---|
| 969 | + s_cbranch_scc1 L_TCP_STORE_CHECK_DONE |
|---|
| 1190 | 970 | |
|---|
| 971 | + s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) |
|---|
| 972 | + s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp |
|---|
| 973 | + |
|---|
| 974 | +L_TCP_STORE_CHECK_DONE: |
|---|
| 975 | +end |
|---|
| 976 | + |
|---|
| 977 | +function write_vgpr_to_mem_with_sqc(v, s_rsrc, s_mem_offset) |
|---|
| 978 | + s_mov_b32 s4, 0 |
|---|
| 979 | + |
|---|
| 980 | +L_WRITE_VGPR_LANE_LOOP: |
|---|
| 981 | + for var lane = 0; lane < 4; ++ lane |
|---|
| 982 | + v_readlane_b32 s[lane], v, s4 |
|---|
| 983 | + s_add_u32 s4, s4, 1 |
|---|
| 984 | + end |
|---|
| 985 | + |
|---|
| 986 | + s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1 |
|---|
| 987 | + ack_sqc_store_workaround() |
|---|
| 988 | + |
|---|
| 989 | + s_add_u32 s_mem_offset, s_mem_offset, 0x10 |
|---|
| 990 | + s_cmp_eq_u32 s4, 0x40 |
|---|
| 991 | + s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP |
|---|
| 992 | +end |
|---|
| 993 | + |
|---|
| 994 | +function write_vgprs_to_mem_with_sqc(v, n_vgprs, s_rsrc, s_mem_offset) |
|---|
| 995 | + for var vgpr = 0; vgpr < n_vgprs; ++ vgpr |
|---|
| 996 | + write_vgpr_to_mem_with_sqc(v[vgpr], s_rsrc, s_mem_offset) |
|---|
| 997 | + end |
|---|
| 998 | +end |
|---|
| 1191 | 999 | |
|---|
| 1192 | 1000 | function get_lds_size_bytes(s_lds_size_byte) |
|---|
| 1193 | 1001 | // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW |
|---|
| .. | .. |
|---|
| 1199 | 1007 | s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size |
|---|
| 1200 | 1008 | s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 |
|---|
| 1201 | 1009 | s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible |
|---|
| 1010 | + |
|---|
| 1011 | +if ASIC_TARGET_ARCTURUS |
|---|
| 1012 | + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, 1 // Double size for ACC VGPRs |
|---|
| 1013 | +end |
|---|
| 1202 | 1014 | end |
|---|
| 1203 | 1015 | |
|---|
| 1204 | 1016 | function get_sgpr_size_bytes(s_sgpr_size_byte) |
|---|