2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
@@ -24,78 +24,6 @@
  * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex
  */

-/* HW (VI) source code for CWSR trap handler */
-/* Version 18 + multiple trap handler */
-
-// this performance-optimal version was originally from Seven Xu at SRDC
-
-// Revison #18 --...
-/* Rev History
-** #1. Branch from gc dv.   //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV)
-** #4. SR Memory Layout:
-**             1. VGPR-SGPR-HWREG-{LDS}
-**             2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern..
-** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
-** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
-** #7. Update: 1. don't barrier if noLDS
-** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
-**             2. Fix SQ issue by s_sleep 2
-** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
-**             2. optimize s_buffer save by burst 16sgprs...
-** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs.
-** #11. Update 1. Add 2 more timestamp for debug version
-** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance
-** #13. Integ  1. Always use MUBUF for PV trap shader...
-** #14. Update 1. s_buffer_store soft clause...
-** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
-** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree
-** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part]
-**             2. PERF - Save LDS before save VGPR to cover LDS save long latency...
-** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32
-**             2. FUNC - Handle non-CWSR traps
-*/
-
-var G8SR_WDMEM_HWREG_OFFSET = 0
-var G8SR_WDMEM_SGPR_OFFSET  = 128               // in bytes
-
-// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore.
-
-var G8SR_DEBUG_TIMESTAMP = 0
-var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4          // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
-var s_g8sr_ts_save_s      = s[34:35]            // save start
-var s_g8sr_ts_sq_save_msg = s[36:37]            // The save shader send SAVEWAVE msg to spi
-var s_g8sr_ts_spi_wrexec  = s[38:39]            // the SPI write the sr address to SQ
-var s_g8sr_ts_save_d      = s[40:41]            // save end
-var s_g8sr_ts_restore_s   = s[42:43]            // restore start
-var s_g8sr_ts_restore_d   = s[44:45]            // restore end
-
-var G8SR_VGPR_SR_IN_DWX4 = 0
-var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000           // DWx4 stride is 4*4Bytes
-var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
-
-
-/*************************************************************************/
-/* control on how to run the shader */
-/*************************************************************************/
-//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
-var EMU_RUN_HACK = 0
-var EMU_RUN_HACK_RESTORE_NORMAL = 0
-var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
-var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
-var EMU_RUN_HACK_SAVE_FIRST_TIME = 0            //for interrupted restore in which the first save is through EMU_RUN_HACK
-var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0     //for interrupted restore in which the first save is through EMU_RUN_HACK
-var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0     //for interrupted restore in which the first save is through EMU_RUN_HACK
-var SAVE_LDS = 1
-var WG_BASE_ADDR_LO = 0x9000a000
-var WG_BASE_ADDR_HI = 0x0
-var WAVE_SPACE = 0x5000                         //memory size that each wave occupies in workgroup state mem
-var CTX_SAVE_CONTROL = 0x0
-var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
-var SIM_RUN_HACK = 0                            //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
-var SGPR_SAVE_USE_SQC = 1                       //use SQC D$ to do the write
-var USE_MTBUF_INSTEAD_OF_MUBUF = 0              //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
-var SWIZZLE_EN = 0                              //whether we use swizzled buffer addressing
-
 /**************************************************************************/
 /* variables */
 /**************************************************************************/
@@ -226,16 +154,7 @@
     type(CS)


-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))       //hack to use trap_id for determining save/restore
-        //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS)    //save STATUS since we will change SCC
-        s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000          //change SCC
-        s_cmp_eq_u32 s_save_tmp, 0x007e0000                     //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
-        s_cbranch_scc0 L_JUMP_TO_RESTORE                        //do not need to recover STATUS here since we are going to RESTORE
-        //FIXME  s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status    //need to recover STATUS since we are going to SAVE
-        s_branch L_SKIP_RESTORE                                 //NOT restore, SAVE actually
-    else
         s_branch L_SKIP_RESTORE                                 //NOT restore. might be a regular trap or save
-    end

 L_JUMP_TO_RESTORE:
     s_branch L_RESTORE                                          //restore
@@ -249,7 +168,7 @@
     s_cbranch_scc1 L_SAVE                                       //this is the operation for save

 // *********    Handle non-CWSR traps    *******************
-if (!EMU_RUN_HACK)
+
 /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
     s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0
     s_waitcnt lgkmcnt(0)
@@ -268,7 +187,7 @@
     s_and_b32 ttmp1, ttmp1, 0xFFFF
     set_status_without_spi_prio(s_save_status, ttmp2)           //restore HW status(SCC)
     s_rfe_b64 [ttmp0, ttmp1]
-end
+
 // *********    End handling of non-CWSR traps    *******************

 /**************************************************************************/
@@ -276,25 +195,6 @@
 /**************************************************************************/

 L_SAVE:
-
-if G8SR_DEBUG_TIMESTAMP
-    s_memrealtime s_g8sr_ts_save_s
-    s_waitcnt lgkmcnt(0)                                        //FIXME, will cause xnack??
-end
-
-    //check whether there is mem_viol
-    s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
-    s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
-    s_cbranch_scc0 L_NO_PC_REWIND
-
-    //if so, need rewind PC assuming GDS operation gets NACKed
-    s_mov_b32 s_save_tmp, 0                                     //clear mem_viol bit
-    s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp    //clear mem_viol bit
-    s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff            //pc[47:32]
-    s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8                     //pc[31:0]-8
-    s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0                  // -scc
-
-L_NO_PC_REWIND:
     s_mov_b32 s_save_tmp, 0                                     //clear saveCtx bit
     s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp    //clear saveCtx bit

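For orientation, the PC-rewind block deleted above adjusted a 48-bit program counter held as a 32-bit register pair: `s_sub_u32` subtracts 8 from pc[31:0] and sets SCC on borrow, and `s_subb_u32` then propagates that borrow into pc[47:32]. A minimal C sketch of the same arithmetic, with `pc_lo`/`pc_hi` as hypothetical stand-ins for the SGPR pair:

```c
#include <stdint.h>

/* Sketch: rewind a 48-bit PC stored as two 32-bit halves by 8 bytes,
 * mirroring s_sub_u32 (sets SCC on borrow) + s_subb_u32 (consumes it). */
static void pc_rewind_8(uint32_t *pc_lo, uint32_t *pc_hi)
{
    uint32_t borrow = (*pc_lo < 8) ? 1 : 0;   /* SCC after s_sub_u32 */
    *pc_lo -= 8;                              /* pc[31:0] - 8 */
    *pc_hi = (*pc_hi - borrow) & 0x0000ffff;  /* pc[47:32] - scc */
}
```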
@@ -316,16 +216,7 @@
     s_mov_b32 s_save_exec_hi, exec_hi
     s_mov_b64 exec, 0x0                                         //clear EXEC to get ready to receive

-if G8SR_DEBUG_TIMESTAMP
-    s_memrealtime s_g8sr_ts_sq_save_msg
-    s_waitcnt lgkmcnt(0)
-end
-
-    if (EMU_RUN_HACK)
-
-    else
         s_sendmsg sendmsg(MSG_SAVEWAVE)                         //send SPI a message and wait for SPI's write to EXEC
-    end

     // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
     s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
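The save entry here is one half of a handshake with the SPI: the wave backs up and clears EXEC, sends MSG_SAVEWAVE, then (in the next hunk) spins in an `s_sleep`/`s_cbranch_execz` loop until the SPI writes a nonzero wrexec value into EXEC. A schematic C rendering of that protocol, with `exec`, `send_savewave()`, and `sleep64clk()` as assumed stand-ins for the register and instructions:

```c
#include <stdint.h>

/* Sketch: the SAVEWAVE handshake. *exec is set nonzero by the SPI
 * (modeled elsewhere); sleep64clk() models s_sleep. */
static void savewave_handshake(volatile uint64_t *exec,
                               void (*send_savewave)(void),
                               void (*sleep64clk)(void))
{
    *exec = 0;              /* clear EXEC to get ready to receive */
    send_savewave();        /* s_sendmsg sendmsg(MSG_SAVEWAVE) */
    while (*exec == 0)      /* s_cbranch_execz L_SLEEP */
        sleep64clk();       /* s_sleep 0x2: yield so waiting waves can issue */
}
```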
@@ -334,36 +225,9 @@
 L_SLEEP:
     s_sleep 0x2    // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0

-    if (EMU_RUN_HACK)
-
-    else
         s_cbranch_execz L_SLEEP
-    end
-
-if G8SR_DEBUG_TIMESTAMP
-    s_memrealtime s_g8sr_ts_spi_wrexec
-    s_waitcnt lgkmcnt(0)
-end

 /* setup Resource Contants */
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
-        //calculate wd_addr using absolute thread id
-        v_readlane_b32 s_save_tmp, v9, 0
-        s_lshr_b32 s_save_tmp, s_save_tmp, 6
-        s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
-        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
-    else
-    end
-    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
-        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
-    else
-    end
-
-
     s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo              //base_addr_lo
     s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF  //base_addr_hi
     s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
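A note on the descriptor setup in this hunk: the save and restore paths address memory through a 128-bit buffer resource constant, and the deleted DWx4 variant worked by rewriting the stride field of word 1 (the `0x00100000` constant corresponds to a 16-byte, i.e. four-DWORD, stride). A hedged C sketch of that field packing, assuming the GFX8 layout where word 1 holds base_addr[47:32] in bits [15:0] and the stride in bits [29:16]:

```c
#include <stdint.h>

/* Sketch: pack the stride field of buffer-resource word 1, as the
 * deleted DWx4 path did with s_and_b32/s_or_b32 on s_save_buf_rsrc1.
 * Assumed V# layout: word1 = base_hi[15:0] | stride << 16. */
static uint32_t rsrc_word1_set_stride(uint32_t word1, uint32_t stride_bytes)
{
    word1 &= 0x0000FFFFu;                      /* keep base_addr[47:32], clear stride */
    word1 |= (stride_bytes & 0x3FFFu) << 16;   /* 14-bit stride field */
    return word1;
}

/* A 16-byte (DWx4) stride reproduces the diff's constant:
 * rsrc_word1_set_stride(base_hi, 16) == (base_hi & 0xFFFF) | 0x00100000 */
```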
@@ -396,22 +260,10 @@


     s_mov_b32 s_save_buf_rsrc2, 0x4                             //NUM_RECORDS in bytes
-    if (SWIZZLE_EN)
-        s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32 s_save_buf_rsrc2, 0x1000000                   //NUM_RECORDS in bytes
-    end


     write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)      //M0
-
-    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
-        s_add_u32 s_save_pc_lo, s_save_pc_lo, 4                 //pc[31:0]+4
-        s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0              //carry bit over
-        s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
-        s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
-    end
-
     write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)   //PC
     write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
     write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
@@ -453,18 +305,8 @@
     s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
     s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4          //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)

-    if (SGPR_SAVE_USE_SQC)
         s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2       //NUM_RECORDS in bytes
-    else
-        s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8       //NUM_RECORDS in bytes (64 threads)
-    end
-
-    if (SWIZZLE_EN)
-        s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32 s_save_buf_rsrc2, 0x1000000                   //NUM_RECORDS in bytes
-    end

     // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
     //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
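The SGPR bookkeeping here packs two steps into shifts: the SGPR_SIZE field read from the hardware register is granularity-encoded, so the actual register count is `(field + 1) * 16`, and the scalar (SQC) store path sizes NUM_RECORDS in plain bytes (`count * 4`), whereas the deleted per-lane path multiplied by 4 bytes times 64 lanes (the retained code then overwrites NUM_RECORDS with the fixed `0x1000000` used for unswizzled addressing anyway). A small C sketch of the three computations, for orientation only:

```c
#include <stdint.h>

/* Sketch: SGPR count and NUM_RECORDS sizing as done by the shifts above. */
static uint32_t sgpr_count(uint32_t sgpr_size_field)
{
    return (sgpr_size_field + 1) << 4;  /* (sgpr_size + 1) * 16 */
}

static uint32_t num_records_sqc(uint32_t count)
{
    return count << 2;                  /* scalar writes: 4 bytes per SGPR */
}

static uint32_t num_records_per_lane(uint32_t count)
{
    return count << 8;                  /* deleted path: 4 bytes * 64 lanes */
}
```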
@@ -503,30 +345,14 @@
     s_mov_b32 exec_lo, 0xFFFFFFFF                               //need every thread from now on
     s_mov_b32 exec_hi, 0xFFFFFFFF

-    if (SWIZZLE_EN)
-        s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32 s_save_buf_rsrc2, 0x1000000                   //NUM_RECORDS in bytes
-    end

     // VGPR Allocated in 4-GPR granularity

-if G8SR_VGPR_SR_IN_DWX4
-    // the const stride for DWx4 is 4*4 bytes
-    s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF    // reset const stride to 0
-    s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4    // const stride to 4*4 bytes
-
-    buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-
-    s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF    // reset const stride to 0
-    s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE    // reset const stride to 4 bytes
-else
         buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
         buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
         buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
         buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
-end



@@ -562,64 +388,10 @@
     s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()


-    if (SWIZZLE_EN)
-        s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32 s_save_buf_rsrc2, 0x1000000                   //NUM_RECORDS in bytes
-    end
-
     s_mov_b32 m0, 0x0                                           //lds_offset initial value = 0


-var LDS_DMA_ENABLE = 0
-var UNROLL = 0
-if UNROLL==0 && LDS_DMA_ENABLE==1
-    s_mov_b32 s3, 256*2
-    s_nop 0
-    s_nop 0
-    s_nop 0
-    L_SAVE_LDS_LOOP:
-        //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.???
-        if (SAVE_LDS)                                           //SPI always alloc LDS space in 128DW granularity
-            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1               // first 64DW
-            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256    // second 64DW
-        end
-
-        s_add_u32 m0, m0, s3                                    //every buffer_store_lds does 256 bytes
-        s_add_u32 s_save_mem_offset, s_save_mem_offset, s3      //mem offset increased by 256 bytes
-        s_cmp_lt_u32 m0, s_save_alloc_size                      //scc=(m0 < s_save_alloc_size) ? 1 : 0
-        s_cbranch_scc1 L_SAVE_LDS_LOOP                          //LDS save is complete?
-
-elsif LDS_DMA_ENABLE==1 && UNROLL==1                            // UNROOL , has ichace miss
-    // store from higest LDS address to lowest
-    s_mov_b32 s3, 256*2
-    s_sub_u32 m0, s_save_alloc_size, s3
-    s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
-    s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9          // how many 128 trunks...
-    s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size         // store from higheset addr to lowest
-    s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4         // PC offset increment, each LDS save block cost 6*4 Bytes instruction
-    s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4         //2is the below 2 inst...//s_addc and s_setpc
-    s_nop 0
-    s_nop 0
-    s_nop 0                                                     //pad 3 dw to let LDS_DMA align with 64Bytes
-    s_getpc_b64 s[0:1]                                          // reuse s[0:1], since s[0:1] already saved
-    s_add_u32 s0, s0,s_save_alloc_size
-    s_addc_u32 s1, s1, 0
-    s_setpc_b64 s[0:1]
-
-
-    for var i =0; i< 128; i++
-        // be careful to make here a 64Byte aligned address, which could improve performance...
-        buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0      // first 64DW
-        buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256    // second 64DW
-
-        if i!=127
-            s_sub_u32 m0, m0, s3    // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline
-            s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
-        end
-    end
-
-else   // BUFFER_STORE
     v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
     v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2                       // tid
     v_mul_i32_i24 v2, v3, 8                                     // tid*8
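The surviving BUFFER_STORE path derives each lane's index with the `v_mbcnt` pair: counting the set bits below the lane's position in an all-ones mask yields the lane id 0..63, which is then scaled to a byte offset (tid*8, two DWORDs per lane). A sketch of the same index math, assuming a 64-wide wave and the GCC/Clang popcount builtin:

```c
#include <stdint.h>

/* Sketch: per-lane index as computed by v_mbcnt_lo/v_mbcnt_hi with an
 * all-ones mask: the number of lanes below this one, i.e. the lane id. */
static uint32_t lane_index(unsigned lane /* 0..63 */)
{
    uint64_t mask_below = (lane == 0) ? 0 : (~0ULL >> (64 - lane));
    return (uint32_t)__builtin_popcountll(mask_below);
}

/* Byte offset used by the LDS save path: two DWORDs (8 bytes) per lane,
 * matching v_mul_i32_i24 v2, v3, 8. */
static uint32_t lds_byte_offset(unsigned lane)
{
    return lane_index(lane) * 8;
}
```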
@@ -641,8 +413,6 @@
     // restore rsrc3
     s_mov_b32 s_save_buf_rsrc3, s0

-end
-
 L_SAVE_LDS_DONE:


@@ -660,44 +430,8 @@
     s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
     s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2          //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)    //FIXME for GFX, zero is possible
     s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8           //NUM_RECORDS in bytes (64 threads*4)
-    if (SWIZZLE_EN)
-        s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32 s_save_buf_rsrc2, 0x1000000                   //NUM_RECORDS in bytes
-    end

-
-    // VGPR Allocated in 4-GPR granularity
-
-if G8SR_VGPR_SR_IN_DWX4
-    // the const stride for DWx4 is 4*4 bytes
-    s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF    // reset const stride to 0
-    s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4    // const stride to 4*4 bytes
-
-    s_mov_b32 m0, 4                                             // skip first 4 VGPRs
-    s_cmp_lt_u32 m0, s_save_alloc_size
-    s_cbranch_scc0 L_SAVE_VGPR_LOOP_END                         // no more vgprs
-
-    s_set_gpr_idx_on m0, 0x1                                    // This will change M0
-    s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000      // because above inst change m0
-L_SAVE_VGPR_LOOP:
-    v_mov_b32 v0, v0                                            // v0 = v[0+m0]
-    v_mov_b32 v1, v1
-    v_mov_b32 v2, v2
-    v_mov_b32 v3, v3
-
-
-    buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-    s_add_u32 m0, m0, 4
-    s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
-    s_cmp_lt_u32 m0, s_save_alloc_size
-    s_cbranch_scc1 L_SAVE_VGPR_LOOP                             //VGPR save is complete?
-    s_set_gpr_idx_off
-L_SAVE_VGPR_LOOP_END:
-
-    s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF    // reset const stride to 0
-    s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE    // reset const stride to 4 bytes
-else
     // VGPR store using dw burst
     s_mov_b32 m0, 0x4                                           //VGPR initial index value =0
     s_cmp_lt_u32 m0, s_save_alloc_size
713447 v_mov_b32 v2, v2 //v0 = v[0+m0]
714448 v_mov_b32 v3, v3 //v0 = v[0+m0]
715449
716
- if(USE_MTBUF_INSTEAD_OF_MUBUF)
717
- tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
718
- else
719450 buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
720451 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
721452 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
722453 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
723
- end
724454
725455 s_add_u32 m0, m0, 4 //next vgpr index
726456 s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
727457 s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
728458 s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
729459 s_set_gpr_idx_off
730
-end
731460
732461 L_SAVE_VGPR_END:
733
-
734
-
735
-
736
-
737
-
738
-
739
- /* S_PGM_END_SAVED */ //FIXME graphics ONLY
740
- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
741
- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
742
- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
743
- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
744
- s_rfe_b64 s_save_pc_lo //Return to the main shader program
745
- else
746
- end
747
-
748
-// Save Done timestamp
749
-if G8SR_DEBUG_TIMESTAMP
750
- s_memrealtime s_g8sr_ts_save_d
751
- // SGPR SR memory offset : size(VGPR)
752
- get_vgpr_size_bytes(s_save_mem_offset)
753
- s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
754
- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
755
- // Need reset rsrc2??
756
- s_mov_b32 m0, s_save_mem_offset
757
- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
758
- s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
759
-end
760
-
761
-
762462 s_branch L_END_PGM
763463
764464
@@ -769,27 +469,6 @@

 L_RESTORE:
 /* Setup Resource Contants */
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
-        //calculate wd_addr using absolute thread id
-        v_readlane_b32 s_restore_tmp, v9, 0
-        s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
-        s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
-        s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
-        s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
-        s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
-    else
-    end
-
-if G8SR_DEBUG_TIMESTAMP
-    s_memrealtime s_g8sr_ts_restore_s
-    s_waitcnt lgkmcnt(0)                                        //FIXME, will cause xnack??
-    // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case...
-    s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
-    s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1]           //backup ts to ttmp0/1, sicne exec will be finally restored..
-end
-
-
-
     s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo        //base_addr_lo
     s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF    //base_addr_hi
     s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
@@ -831,18 +510,12 @@
     s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()    //FIXME, Check if offset overflow???


-    if (SWIZZLE_EN)
-        s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0    //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32 s_restore_buf_rsrc2, 0x1000000                //NUM_RECORDS in bytes
-    end
     s_mov_b32 m0, 0x0                                           //lds_offset initial value = 0

 L_RESTORE_LDS_LOOP:
-    if (SAVE_LDS)
         buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1               // first 64DW
         buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256    // second 64DW
-    end
     s_add_u32 m0, m0, 256*2                                     // 128 DW
     s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
     s_cmp_lt_u32 m0, s_restore_alloc_size                       //scc=(m0 < s_restore_alloc_size) ? 1 : 0
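The LDS restore loop above streams 128 DWORDs per iteration: each `buffer_load_dword ... lds:1` moves one DWORD per lane (64 DWORDs, 256 bytes) from the save buffer straight into LDS at the offset held in M0, so two loads plus a 512-byte step cover the whole allocation. A compact C model of the loop control, with `lds_dma()` as an assumed placeholder for the buffer-to-LDS load:

```c
#include <stdint.h>

/* Sketch: control flow of L_RESTORE_LDS_LOOP. lds_dma() stands in for
 * buffer_load_dword ... lds:1, which moves 256 bytes (64 lanes x 4B). */
static void restore_lds(uint32_t lds_size_bytes, uint32_t mem_offset,
                        void (*lds_dma)(uint32_t lds_off, uint32_t mem_off))
{
    for (uint32_t m0 = 0; m0 < lds_size_bytes; m0 += 256 * 2) {
        lds_dma(m0, mem_offset);          /* first 64 DW */
        lds_dma(m0, mem_offset + 256);    /* second 64 DW (offset:256) */
        mem_offset += 256 * 2;            /* source advances by 128 DW */
    }
}
```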
@@ -861,40 +534,8 @@
     s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
     s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2    //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
     s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8     //NUM_RECORDS in bytes (64 threads*4)
-    if (SWIZZLE_EN)
-        s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32 s_restore_buf_rsrc2, 0x1000000                //NUM_RECORDS in bytes
-    end

-if G8SR_VGPR_SR_IN_DWX4
-    get_vgpr_size_bytes(s_restore_mem_offset)
-    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
-
-    // the const stride for DWx4 is 4*4 bytes
-    s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF    // reset const stride to 0
-    s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4    // const stride to 4*4 bytes
-
-    s_mov_b32 m0, s_restore_alloc_size
-    s_set_gpr_idx_on m0, 0x8                                    // Note.. This will change m0
-
-L_RESTORE_VGPR_LOOP:
-    buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
-    s_waitcnt vmcnt(0)
-    s_sub_u32 m0, m0, 4
-    v_mov_b32 v0, v0                                            // v[0+m0] = v0
-    v_mov_b32 v1, v1
-    v_mov_b32 v2, v2
-    v_mov_b32 v3, v3
-    s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
-    s_cmp_eq_u32 m0, 0x8000
-    s_cbranch_scc0 L_RESTORE_VGPR_LOOP
-    s_set_gpr_idx_off
-
-    s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF    // reset const stride to 0
-    s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE    // const stride to 4*4 bytes
-
-else
     // VGPR load using dw burst
     s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset   // restore start with v1, v0 will be the last
     s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
@@ -903,14 +544,10 @@
     s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000    //add 0x8000 since we compare m0 against it later

 L_RESTORE_VGPR_LOOP:
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)
-        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
         buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
         buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
         buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
         buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
-    end
     s_waitcnt vmcnt(0)                                          //ensure data ready
     v_mov_b32 v0, v0                                            //v[0+m0] = v0
     v_mov_b32 v1, v1
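One detail worth a note: the loop bound gains `0x8000` here (and the deleted DWx4 save path added `0x1000`) because `s_set_gpr_idx_on m0, SIMM4` appears to pack its 4-bit mode nibble into M0[15:12], so any later compare against M0 has to carry the same nibble; `0x8 << 12 = 0x8000` and `0x1 << 12 = 0x1000` match the constants in the diff. A hedged C sketch of that assumed encoding, inferred from the constants rather than taken from a spec:

```c
#include <stdint.h>

/* Sketch (assumed encoding): M0 after s_set_gpr_idx_on m0, simm4.
 * The index lands in the low bits and the mode nibble in M0[15:12]. */
static uint32_t m0_after_gpr_idx_on(uint32_t index, uint32_t simm4)
{
    return (index & 0xFFu) | ((simm4 & 0xFu) << 12);
}

/* simm4 = 0x8 gives the loop-end compare value 0x8000 used at restore;
 * simm4 = 0x1 gives the 0x1000 bias from the deleted DWx4 save loop. */
```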
@@ -922,16 +559,10 @@
     s_cbranch_scc1 L_RESTORE_VGPR_LOOP                          //VGPR restore (except v0) is complete?
     s_set_gpr_idx_off
     /* VGPR restore on v0 */
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)
-        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
         buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
         buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
         buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
         buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
-    end
-
-end

 /* restore SGPRs */
 //////////////////////////////
@@ -947,16 +578,8 @@
     s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
     s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4    //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)

-    if (SGPR_SAVE_USE_SQC)
         s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
-    else
-        s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
-    end
-    if (SWIZZLE_EN)
-        s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32 s_restore_buf_rsrc2, 0x1000000                //NUM_RECORDS in bytes
-    end

 /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111),
 However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG
@@ -985,12 +608,6 @@
 //////////////////////////////
 L_RESTORE_HWREG:

-
-if G8SR_DEBUG_TIMESTAMP
-    s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
-    s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
-end
-
 // HWREG SR memory offset : size(VGPR)+size(SGPR)
     get_vgpr_size_bytes(s_restore_mem_offset)
     get_sgpr_size_bytes(s_restore_tmp)
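The offset composition here follows the save-area layout documented in the removed header (VGPR, then SGPR, then HWREG, then LDS): the HWREG block's offset is simply the sum of the VGPR and SGPR region sizes. A trivial C sketch of that layout arithmetic, with the region sizes as assumed inputs rather than the actual macro bodies:

```c
#include <stdint.h>

/* Sketch: save-area layout offsets (VGPR | SGPR | HWREG | LDS), matching
 * the get_vgpr_size_bytes() + get_sgpr_size_bytes() composition above. */
struct sr_layout { uint32_t vgpr_bytes, sgpr_bytes; };

static uint32_t hwreg_offset(const struct sr_layout *l)
{
    return l->vgpr_bytes + l->sgpr_bytes;   /* HWREG follows VGPR + SGPR */
}
```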
@@ -998,11 +615,7 @@


     s_mov_b32 s_restore_buf_rsrc2, 0x4                          //NUM_RECORDS in bytes
-    if (SWIZZLE_EN)
-        s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32 s_restore_buf_rsrc2, 0x1000000                //NUM_RECORDS in bytes
-    end

     read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)       //M0
     read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)    //PC
@@ -1018,16 +631,6 @@
     read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset)             //TBA_HI

     s_waitcnt lgkmcnt(0)                                        //from now on, it is safe to restore STATUS and IB_STS
-
-    //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
-        s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8           //pc[31:0]+8    //two back-to-back s_trap are used (first for save and second for restore)
-        s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
-    end
-    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
-        s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4           //pc[31:0]+4    // save is hack through s_trap but restore is normal
-        s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
-    end

     s_mov_b32 m0, s_restore_m0
     s_mov_b32 exec_lo, s_restore_exec_lo
@@ -1060,11 +663,6 @@
     set_status_without_spi_prio(s_restore_status, s_restore_tmp)    // SCC is included, which is changed by previous salu

     s_barrier    //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG    //FIXME not performance-optimal at this time
-
-if G8SR_DEBUG_TIMESTAMP
-    s_memrealtime s_g8sr_ts_restore_d
-    s_waitcnt lgkmcnt(0)
-end

 //  s_rfe_b64 s_restore_pc_lo                                   //Return to the main shader program and resume execution
     s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0             // s_restore_m0[0] is used to set STATUS.inst_atc