2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -19,18 +19,8 @@
  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  * OTHER DEALINGS IN THE SOFTWARE.
  */
-
-#define pr_fmt(fmt) "kfd2kgd: " fmt
-
-#include <linux/module.h>
-#include <linux/fdtable.h>
-#include <linux/uaccess.h>
-#include <linux/firmware.h>
-#include <drm/drmP.h>
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
-#include "amdgpu_ucode.h"
-#include "soc15_hw_ip.h"
 #include "gc/gc_9_0_offset.h"
 #include "gc/gc_9_0_sh_mask.h"
 #include "vega10_enum.h"
@@ -46,181 +36,13 @@
 #include "v9_structs.h"
 #include "soc15.h"
 #include "soc15d.h"
-
-/* HACK: MMHUB and GC both have VM-related register with the same
- * names but different offsets. Define the MMHUB register we need here
- * with a prefix. A proper solution would be to move the functions
- * programming these registers into gfx_v9_0.c and mmhub_v1_0.c
- * respectively.
- */
-#define mmMMHUB_VM_INVALIDATE_ENG16_REQ			0x06f3
-#define mmMMHUB_VM_INVALIDATE_ENG16_REQ_BASE_IDX	0
-
-#define mmMMHUB_VM_INVALIDATE_ENG16_ACK			0x0705
-#define mmMMHUB_VM_INVALIDATE_ENG16_ACK_BASE_IDX	0
-
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32		0x072b
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32_BASE_IDX	0
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32		0x072c
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32_BASE_IDX	0
-
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32		0x074b
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32_BASE_IDX	0
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32		0x074c
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32_BASE_IDX	0
-
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32		0x076b
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32_BASE_IDX	0
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32		0x076c
-#define mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32_BASE_IDX	0
-
-#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32		0x0727
-#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32_BASE_IDX	0
-#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32		0x0728
-#define mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32_BASE_IDX	0
-
-#define V9_PIPE_PER_MEC		(4)
-#define V9_QUEUES_PER_PIPE_MEC	(8)
+#include "gfx_v9_0.h"
 
 enum hqd_dequeue_request_type {
 	NO_ACTION = 0,
 	DRAIN_PIPE,
 	RESET_WAVES
 };
-
-/*
- * Register access functions
- */
-
-static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
-		uint32_t sh_mem_config,
-		uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
-		uint32_t sh_mem_bases);
-static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
-		unsigned int vmid);
-static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
-static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
-			uint32_t queue_id, uint32_t __user *wptr,
-			uint32_t wptr_shift, uint32_t wptr_mask,
-			struct mm_struct *mm);
-static int kgd_hqd_dump(struct kgd_dev *kgd,
-			uint32_t pipe_id, uint32_t queue_id,
-			uint32_t (**dump)[2], uint32_t *n_regs);
-static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
-			     uint32_t __user *wptr, struct mm_struct *mm);
-static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
-			     uint32_t engine_id, uint32_t queue_id,
-			     uint32_t (**dump)[2], uint32_t *n_regs);
-static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
-		uint32_t pipe_id, uint32_t queue_id);
-static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
-static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
-				enum kfd_preempt_type reset_type,
-				unsigned int utimeout, uint32_t pipe_id,
-				uint32_t queue_id);
-static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
-				unsigned int utimeout);
-static int kgd_address_watch_disable(struct kgd_dev *kgd);
-static int kgd_address_watch_execute(struct kgd_dev *kgd,
-					unsigned int watch_point_id,
-					uint32_t cntl_val,
-					uint32_t addr_hi,
-					uint32_t addr_lo);
-static int kgd_wave_control_execute(struct kgd_dev *kgd,
-					uint32_t gfx_index_val,
-					uint32_t sq_cmd);
-static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
-					unsigned int watch_point_id,
-					unsigned int reg_offset);
-
-static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
-		uint8_t vmid);
-static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
-		uint8_t vmid);
-static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
-		uint32_t page_table_base);
-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type);
-static void set_scratch_backing_va(struct kgd_dev *kgd,
-					uint64_t va, uint32_t vmid);
-static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid);
-static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid);
-
-/* Because of REG_GET_FIELD() being used, we put this function in the
- * asic specific file.
- */
-static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
-		struct tile_config *config)
-{
-	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-
-	config->gb_addr_config = adev->gfx.config.gb_addr_config;
-
-	config->tile_config_ptr = adev->gfx.config.tile_mode_array;
-	config->num_tile_configs =
-			ARRAY_SIZE(adev->gfx.config.tile_mode_array);
-	config->macro_tile_config_ptr =
-			adev->gfx.config.macrotile_mode_array;
-	config->num_macro_tile_configs =
-			ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
-
-	return 0;
-}
-
-static const struct kfd2kgd_calls kfd2kgd = {
-	.init_gtt_mem_allocation = alloc_gtt_mem,
-	.free_gtt_mem = free_gtt_mem,
-	.get_local_mem_info = get_local_mem_info,
-	.get_gpu_clock_counter = get_gpu_clock_counter,
-	.get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
-	.alloc_pasid = amdgpu_pasid_alloc,
-	.free_pasid = amdgpu_pasid_free,
-	.program_sh_mem_settings = kgd_program_sh_mem_settings,
-	.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
-	.init_interrupts = kgd_init_interrupts,
-	.hqd_load = kgd_hqd_load,
-	.hqd_sdma_load = kgd_hqd_sdma_load,
-	.hqd_dump = kgd_hqd_dump,
-	.hqd_sdma_dump = kgd_hqd_sdma_dump,
-	.hqd_is_occupied = kgd_hqd_is_occupied,
-	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
-	.hqd_destroy = kgd_hqd_destroy,
-	.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
-	.address_watch_disable = kgd_address_watch_disable,
-	.address_watch_execute = kgd_address_watch_execute,
-	.wave_control_execute = kgd_wave_control_execute,
-	.address_watch_get_offset = kgd_address_watch_get_offset,
-	.get_atc_vmid_pasid_mapping_pasid =
-			get_atc_vmid_pasid_mapping_pasid,
-	.get_atc_vmid_pasid_mapping_valid =
-			get_atc_vmid_pasid_mapping_valid,
-	.get_fw_version = get_fw_version,
-	.set_scratch_backing_va = set_scratch_backing_va,
-	.get_tile_config = amdgpu_amdkfd_get_tile_config,
-	.get_cu_info = get_cu_info,
-	.get_vram_usage = amdgpu_amdkfd_get_vram_usage,
-	.create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
-	.acquire_process_vm = amdgpu_amdkfd_gpuvm_acquire_process_vm,
-	.destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
-	.get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
-	.set_vm_context_page_table_base = set_vm_context_page_table_base,
-	.alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
-	.free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
-	.map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
-	.unmap_memory_to_gpu = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu,
-	.sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
-	.map_gtt_bo_to_kernel = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel,
-	.restore_process_bos = amdgpu_amdkfd_gpuvm_restore_process_bos,
-	.invalidate_tlbs = invalidate_tlbs,
-	.invalidate_tlbs_vmid = invalidate_tlbs_vmid,
-	.submit_ib = amdgpu_amdkfd_submit_ib,
-	.gpu_recover = amdgpu_amdkfd_gpu_reset,
-	.set_compute_idle = amdgpu_amdkfd_set_compute_idle
-};
-
-struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void)
-{
-	return (struct kfd2kgd_calls *)&kfd2kgd;
-}
 
 static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
 {
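
NOTE: the entry points below lose their static qualifier and these forward declarations: with the per-file kfd2kgd table and amdgpu_amdkfd_gfx_9_0_get_functions() gone, the renamed kgd_gfx_v9_* functions are shared with other GFX9-derived ASIC files through a common header. A minimal sketch of the declarations such a header would carry, matching the signatures in this patch (the header name is an assumption; it is not shown in this diff):

    /* amdgpu_amdkfd_gfx_v9.h -- name assumed for illustration */
    void kgd_gfx_v9_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
    		uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
    		uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
    int kgd_gfx_v9_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
    bool kgd_gfx_v9_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
    		uint32_t pipe_id, uint32_t queue_id);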
@@ -255,13 +77,13 @@
 	lock_srbm(kgd, mec, pipe, queue_id, 0);
 }
 
-static uint32_t get_queue_mask(struct amdgpu_device *adev,
+static uint64_t get_queue_mask(struct amdgpu_device *adev,
 			       uint32_t pipe_id, uint32_t queue_id)
 {
-	unsigned int bit = (pipe_id * adev->gfx.mec.num_queue_per_pipe +
-			    queue_id) & 31;
+	unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe +
+			queue_id;
 
-	return ((uint32_t)1) << bit;
+	return 1ull << bit;
 }
 
 static void release_queue(struct kgd_dev *kgd)
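
NOTE: get_queue_mask() now builds a 64-bit mask and drops the "& 31" truncation, so a queue whose global bit index exceeds 31 no longer aliases onto a low bit. A worked example with hypothetical values (8 queues per pipe):

    unsigned int bit = 4 * 8 + 1;          /* pipe 4, queue 1 -> bit 33 */
    uint32_t old_mask = 1u << (bit & 31);  /* aliased to bit 1: wrong queue */
    uint64_t new_mask = 1ull << bit;       /* bit 33 preserved */

The one consumer that genuinely needs 32 bits, the CP_PQ_WPTR_POLL_CNTL1 write in kgd_gfx_v9_hqd_load() below, now truncates explicitly with a (uint32_t) cast.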
@@ -269,7 +91,7 @@
 	unlock_srbm(kgd);
 }
 
-static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
+void kgd_gfx_v9_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
 					uint32_t sh_mem_config,
 					uint32_t sh_mem_ape1_base,
 					uint32_t sh_mem_ape1_limit,
@@ -279,14 +101,14 @@
 
 	lock_srbm(kgd, 0, 0, 0, vmid);
 
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
+	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
+	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
 	/* APE1 no longer exists on GFX9 */
 
 	unlock_srbm(kgd);
 }
 
-static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
+int kgd_gfx_v9_set_pasid_vmid_mapping(struct kgd_dev *kgd, u32 pasid,
 		unsigned int vmid)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
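
NOTE: the WREG32() -> WREG32_RLC() conversions here and in the HQD load/destroy paths below route writes through the RLC-safe path for registers that the RLC firmware may own (e.g. under SR-IOV); on bare metal they degenerate to a plain MMIO write. Conceptual sketch only, not the real macro body:

    #define WREG32_RLC(reg, v)				\
    	(amdgpu_sriov_vf(adev) ?			\
    		rlc_safe_write(adev, (reg), (v)) :	/* hypothetical helper */ \
    		WREG32((reg), (v)))

The WREG32_FIELD15_RLC and WREG32_SOC15_RLC_SHADOW changes further down serve the same purpose.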
@@ -347,7 +169,7 @@
  * but still works
  */
 
-static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
+int kgd_gfx_v9_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
 	uint32_t mec;
@@ -367,24 +189,36 @@
 	return 0;
 }
 
-static uint32_t get_sdma_base_addr(struct amdgpu_device *adev,
+static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
 				unsigned int engine_id,
 				unsigned int queue_id)
 {
-	uint32_t base[2] = {
-		SOC15_REG_OFFSET(SDMA0, 0,
-				 mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL,
-		SOC15_REG_OFFSET(SDMA1, 0,
-				 mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL
-	};
-	uint32_t retval;
+	uint32_t sdma_engine_reg_base = 0;
+	uint32_t sdma_rlc_reg_offset;
 
-	retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL -
-					       mmSDMA0_RLC0_RB_CNTL);
+	switch (engine_id) {
+	default:
+		dev_warn(adev->dev,
+			 "Invalid sdma engine id (%d), using engine id 0\n",
+			 engine_id);
+		fallthrough;
+	case 0:
+		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0,
+				mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
+		break;
+	case 1:
+		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA1, 0,
+				mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
+		break;
+	}
 
-	pr_debug("sdma base address: 0x%x\n", retval);
+	sdma_rlc_reg_offset = sdma_engine_reg_base
+		+ queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL);
 
-	return retval;
+	pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
+		 queue_id, sdma_rlc_reg_offset);
+
+	return sdma_rlc_reg_offset;
 }
 
 static inline struct v9_mqd *get_mqd(void *mqd)
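
NOTE: the rewritten helper composes the offset as engine base plus a per-queue stride, normalized so that callers always index with the mmSDMA0_RLC0_* names (which is why the SDMA1 case now subtracts mmSDMA0_RLC0_RB_CNTL rather than mmSDMA1_RLC0_RB_CNTL). Usage sketch with illustrative values:

    /* stride of one RLC queue's register block */
    uint32_t stride = mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL;
    /* SDMA1, queue 2: base(SDMA1) + 2 * stride */
    uint32_t off = get_sdma_rlc_reg_offset(adev, 1, 2);
    /* pick any queue register by its SDMA0-relative name */
    uint32_t rptr = RREG32(off + mmSDMA0_RLC0_RB_RPTR);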
@@ -397,7 +231,7 @@
 	return (struct v9_sdma_mqd *)mqd;
 }
 
-static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
+int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
 			uint32_t queue_id, uint32_t __user *wptr,
 			uint32_t wptr_shift, uint32_t wptr_mask,
 			struct mm_struct *mm)
@@ -411,34 +245,19 @@
 
 	acquire_queue(kgd, pipe_id, queue_id);
 
-	/* HIQ is set during driver init period with vmid set to 0*/
-	if (m->cp_hqd_vmid == 0) {
-		uint32_t value, mec, pipe;
-
-		mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
-		pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
-
-		pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
-			mec, pipe, queue_id);
-		value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS));
-		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1,
-			((mec << 5) | (pipe << 3) | queue_id | 0x80));
-		WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value);
-	}
-
 	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
 	mqd_hqd = &m->cp_mqd_base_addr_lo;
 	hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
 
 	for (reg = hqd_base;
 	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
-		WREG32(reg, mqd_hqd[reg - hqd_base]);
+		WREG32_RLC(reg, mqd_hqd[reg - hqd_base]);
 
 
 	/* Activate doorbell logic before triggering WPTR poll. */
 	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
 			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);
+	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);
 
 	if (wptr) {
 		/* Don't read wptr with get_user because the user
467286 guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
468287 guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;
469288
470
- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
289
+ WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
471290 lower_32_bits(guessed_wptr));
472
- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
291
+ WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
473292 upper_32_bits(guessed_wptr));
474
- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
293
+ WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
475294 lower_32_bits((uintptr_t)wptr));
476
- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
295
+ WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
477296 upper_32_bits((uintptr_t)wptr));
478297 WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1),
479
- get_queue_mask(adev, pipe_id, queue_id));
298
+ (uint32_t)get_queue_mask(adev, pipe_id, queue_id));
480299 }
481300
482301 /* Start the EOP fetcher */
483
- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
302
+ WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
484303 REG_SET_FIELD(m->cp_hqd_eop_rptr,
485304 CP_HQD_EOP_RPTR, INIT_FETCHER, 1));
486305
487306 data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
488
- WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);
307
+ WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);
489308
490309 release_queue(kgd);
491310
492311 return 0;
493312 }
494313
495
-static int kgd_hqd_dump(struct kgd_dev *kgd,
314
+int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
315
+ uint32_t pipe_id, uint32_t queue_id,
316
+ uint32_t doorbell_off)
317
+{
318
+ struct amdgpu_device *adev = get_amdgpu_device(kgd);
319
+ struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
320
+ struct v9_mqd *m;
321
+ uint32_t mec, pipe;
322
+ int r;
323
+
324
+ m = get_mqd(mqd);
325
+
326
+ acquire_queue(kgd, pipe_id, queue_id);
327
+
328
+ mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
329
+ pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
330
+
331
+ pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
332
+ mec, pipe, queue_id);
333
+
334
+ spin_lock(&adev->gfx.kiq.ring_lock);
335
+ r = amdgpu_ring_alloc(kiq_ring, 7);
336
+ if (r) {
337
+ pr_err("Failed to alloc KIQ (%d).\n", r);
338
+ goto out_unlock;
339
+ }
340
+
341
+ amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
342
+ amdgpu_ring_write(kiq_ring,
343
+ PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
344
+ PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */
345
+ PACKET3_MAP_QUEUES_QUEUE(queue_id) |
346
+ PACKET3_MAP_QUEUES_PIPE(pipe) |
347
+ PACKET3_MAP_QUEUES_ME((mec - 1)) |
348
+ PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */
349
+ PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
350
+ PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */
351
+ PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
352
+ amdgpu_ring_write(kiq_ring,
353
+ PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off));
354
+ amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo);
355
+ amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi);
356
+ amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo);
357
+ amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
358
+ amdgpu_ring_commit(kiq_ring);
359
+
360
+out_unlock:
361
+ spin_unlock(&adev->gfx.kiq.ring_lock);
362
+ release_queue(kgd);
363
+
364
+ return r;
365
+}
366
+
367
+int kgd_gfx_v9_hqd_dump(struct kgd_dev *kgd,
496368 uint32_t pipe_id, uint32_t queue_id,
497369 uint32_t (**dump)[2], uint32_t *n_regs)
498370 {
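
NOTE: HIQ setup moves out of the generic HQD path (the block deleted from kgd_hqd_load() above) into kgd_gfx_v9_hiq_mqd_load(), which maps the queue by submitting a MAP_QUEUES packet on the KIQ ring instead of poking RLC_CP_SCHEDULERS over MMIO. The ring reservation of 7 DWORDs matches the packet size:

    /* PACKET3(op, count) encodes count = body DWORDs - 1, so
     * PACKET3(PACKET3_MAP_QUEUES, 5) is 1 header + 6 body DWORDs = 7,
     * matching amdgpu_ring_alloc(kiq_ring, 7) above. */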
@@ -529,71 +401,67 @@
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
 	struct v9_sdma_mqd *m;
-	uint32_t sdma_base_addr, sdmax_gfx_context_cntl;
+	uint32_t sdma_rlc_reg_offset;
 	unsigned long end_jiffies;
 	uint32_t data;
 	uint64_t data64;
 	uint64_t __user *wptr64 = (uint64_t __user *)wptr;
 
 	m = get_sdma_mqd(mqd);
-	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
+	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
 					    m->sdma_queue_id);
-	sdmax_gfx_context_cntl = m->sdma_engine_id ?
-		SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) :
-		SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL);
 
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
 	       m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));
 
 	end_jiffies = msecs_to_jiffies(2000) + jiffies;
 	while (true) {
-		data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
+		data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
 		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
 			break;
-		if (time_after(jiffies, end_jiffies))
+		if (time_after(jiffies, end_jiffies)) {
+			pr_err("SDMA RLC not idle in %s\n", __func__);
 			return -ETIME;
+		}
 		usleep_range(500, 1000);
 	}
-	data = RREG32(sdmax_gfx_context_cntl);
-	data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL,
-			     RESUME_CTX, 0);
-	WREG32(sdmax_gfx_context_cntl, data);
 
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET,
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET,
 	       m->sdmax_rlcx_doorbell_offset);
 
 	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
 			     ENABLE, 1);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI,
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data);
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR,
+	       m->sdmax_rlcx_rb_rptr);
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI,
 	       m->sdmax_rlcx_rb_rptr_hi);
 
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
 	if (read_user_wptr(mm, wptr64, data64)) {
-		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
+		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
 		       lower_32_bits(data64));
-		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
+		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
 		       upper_32_bits(data64));
 	} else {
-		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR,
+		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
 		       m->sdmax_rlcx_rb_rptr);
-		WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI,
+		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
 		       m->sdmax_rlcx_rb_rptr_hi);
 	}
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);
 
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI,
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI,
 	       m->sdmax_rlcx_rb_base_hi);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
 	       m->sdmax_rlcx_rb_rptr_addr_lo);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
 	       m->sdmax_rlcx_rb_rptr_addr_hi);
 
 	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
 			     RB_ENABLE, 1);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data);
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data);
 
 	return 0;
 }
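
NOTE: besides the sdma_base_addr -> sdma_rlc_reg_offset rename, this hunk stops clearing RESUME_CTX in SDMAx_GFX_CONTEXT_CNTL (that register is no longer touched from this path) and logs before returning -ETIME. The REG_SET_FIELD() used throughout is amdgpu's read-modify-write helper; the RB_ENABLE line expands to approximately:

    data = (m->sdmax_rlcx_rb_cntl & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) |
           (1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT);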
@@ -603,7 +471,8 @@
 			     uint32_t (**dump)[2], uint32_t *n_regs)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
-	uint32_t sdma_base_addr = get_sdma_base_addr(adev, engine_id, queue_id);
+	uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
+			engine_id, queue_id);
 	uint32_t i = 0, reg;
 #undef HQD_N_REGS
 #define HQD_N_REGS (19+6+7+10)
@@ -613,15 +482,15 @@
 		return -ENOMEM;
 
 	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
-		DUMP_REG(sdma_base_addr + reg);
+		DUMP_REG(sdma_rlc_reg_offset + reg);
 	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
-		DUMP_REG(sdma_base_addr + reg);
+		DUMP_REG(sdma_rlc_reg_offset + reg);
 	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
 	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
-		DUMP_REG(sdma_base_addr + reg);
+		DUMP_REG(sdma_rlc_reg_offset + reg);
 	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
 	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
-		DUMP_REG(sdma_base_addr + reg);
+		DUMP_REG(sdma_rlc_reg_offset + reg);
 
 	WARN_ON_ONCE(i != HQD_N_REGS);
 	*n_regs = i;
631500
632
-static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
501
+bool kgd_gfx_v9_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
633502 uint32_t pipe_id, uint32_t queue_id)
634503 {
635504 struct amdgpu_device *adev = get_amdgpu_device(kgd);
@@ -655,14 +524,14 @@
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
 	struct v9_sdma_mqd *m;
-	uint32_t sdma_base_addr;
+	uint32_t sdma_rlc_reg_offset;
 	uint32_t sdma_rlc_rb_cntl;
 
 	m = get_sdma_mqd(mqd);
-	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
+	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
 					    m->sdma_queue_id);
 
-	sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);
+	sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);
 
 	if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
 		return true;
672541
673
-static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
542
+int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
674543 enum kfd_preempt_type reset_type,
675544 unsigned int utimeout, uint32_t pipe_id,
676545 uint32_t queue_id)
@@ -681,13 +550,13 @@
 	uint32_t temp;
 	struct v9_mqd *m = get_mqd(mqd);
 
-	if (adev->in_gpu_reset)
+	if (amdgpu_in_reset(adev))
 		return -EIO;
 
 	acquire_queue(kgd, pipe_id, queue_id);
 
 	if (m->cp_hqd_vmid == 0)
-		WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);
+		WREG32_FIELD15_RLC(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);
 
 	switch (reset_type) {
 	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
@@ -701,7 +570,7 @@
 		break;
 	}
 
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);
+	WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);
 
 	end_jiffies = (utimeout * HZ / 1000) + jiffies;
 	while (true) {
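
NOTE: the value written to CP_HQD_DEQUEUE_REQUEST is one of the hqd_dequeue_request_type values declared near the top of this file, selected by the (unchanged) switch on reset_type just above:

    /* from the enum at the top of amdgpu_amdkfd_gfx_v9.c */
    NO_ACTION = 0,	/* leave the queue as is */
    DRAIN_PIPE,		/* = 1: let outstanding work drain before dequeueing */
    RESET_WAVES		/* = 2: reset in-flight waves and dequeue */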
@@ -725,192 +594,60 @@
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
 	struct v9_sdma_mqd *m;
-	uint32_t sdma_base_addr;
+	uint32_t sdma_rlc_reg_offset;
 	uint32_t temp;
 	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
 
 	m = get_sdma_mqd(mqd);
-	sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id,
+	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
 					    m->sdma_queue_id);
 
-	temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL);
+	temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);
 	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp);
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp);
 
 	while (true) {
-		temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
+		temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
 		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
 			break;
-		if (time_after(jiffies, end_jiffies))
+		if (time_after(jiffies, end_jiffies)) {
+			pr_err("SDMA RLC not idle in %s\n", __func__);
 			return -ETIME;
+		}
 		usleep_range(500, 1000);
 	}
 
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0);
-	WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL,
-		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) |
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0);
+	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
+		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) |
 		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);
 
-	m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR);
+	m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR);
 	m->sdmax_rlcx_rb_rptr_hi =
-		RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI);
+		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI);
 
 	return 0;
 }
 
-static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd,
-		uint8_t vmid)
+bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct kgd_dev *kgd,
+		uint8_t vmid, uint16_t *p_pasid)
 {
-	uint32_t reg;
+	uint32_t value;
 	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
 
-	reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
+	value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
 		     + vmid);
-	return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK;
+	*p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK;
+
+	return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK);
 }
 
-static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd,
-		uint8_t vmid)
-{
-	uint32_t reg;
-	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-
-	reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
-		     + vmid);
-	return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK;
-}
-
-static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid)
-{
-	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-	uint32_t req = (1 << vmid) |
-		(0 << VM_INVALIDATE_ENG16_REQ__FLUSH_TYPE__SHIFT) | /* legacy */
-		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PTES_MASK |
-		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE0_MASK |
-		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE1_MASK |
-		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L2_PDE2_MASK |
-		VM_INVALIDATE_ENG16_REQ__INVALIDATE_L1_PTES_MASK;
-
-	mutex_lock(&adev->srbm_mutex);
-
-	/* Use legacy mode tlb invalidation.
-	 *
-	 * Currently on Raven the code below is broken for anything but
-	 * legacy mode due to a MMHUB power gating problem. A workaround
-	 * is for MMHUB to wait until the condition PER_VMID_INVALIDATE_REQ
-	 * == PER_VMID_INVALIDATE_ACK instead of simply waiting for the ack
-	 * bit.
-	 *
-	 * TODO 1: agree on the right set of invalidation registers for
-	 * KFD use. Use the last one for now. Invalidate both GC and
-	 * MMHUB.
-	 *
-	 * TODO 2: support range-based invalidation, requires kfg2kgd
-	 * interface change
-	 */
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
-				0xffffffff);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
-				0x0000001f);
-
-	WREG32(SOC15_REG_OFFSET(MMHUB, 0,
-				mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_LO32),
-				0xffffffff);
-	WREG32(SOC15_REG_OFFSET(MMHUB, 0,
-				mmMMHUB_VM_INVALIDATE_ENG16_ADDR_RANGE_HI32),
-				0x0000001f);
-
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_REQ), req);
-
-	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_INVALIDATE_ENG16_REQ),
-				req);
-
-	while (!(RREG32(SOC15_REG_OFFSET(GC, 0, mmVM_INVALIDATE_ENG16_ACK)) &
-					(1 << vmid)))
-		cpu_relax();
-
-	while (!(RREG32(SOC15_REG_OFFSET(MMHUB, 0,
-					mmMMHUB_VM_INVALIDATE_ENG16_ACK)) &
-					(1 << vmid)))
-		cpu_relax();
-
-	mutex_unlock(&adev->srbm_mutex);
-
-}
-
-static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid)
-{
-	signed long r;
-	uint32_t seq;
-	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
-
-	spin_lock(&adev->gfx.kiq.ring_lock);
-	amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/
-	amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0));
-	amdgpu_ring_write(ring,
-			PACKET3_INVALIDATE_TLBS_DST_SEL(1) |
-			PACKET3_INVALIDATE_TLBS_ALL_HUB(1) |
-			PACKET3_INVALIDATE_TLBS_PASID(pasid) |
-			PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(0)); /* legacy */
-	amdgpu_fence_emit_polling(ring, &seq);
-	amdgpu_ring_commit(ring);
-	spin_unlock(&adev->gfx.kiq.ring_lock);
-
-	r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
-	if (r < 1) {
-		DRM_ERROR("wait for kiq fence error: %ld.\n", r);
-		return -ETIME;
-	}
-
-	return 0;
-}
-
-static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid)
-{
-	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-	int vmid;
-	struct amdgpu_ring *ring = &adev->gfx.kiq.ring;
-
-	if (adev->in_gpu_reset)
-		return -EIO;
-
-	if (ring->ready)
-		return invalidate_tlbs_with_kiq(adev, pasid);
-
-	for (vmid = 0; vmid < 16; vmid++) {
-		if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid))
-			continue;
-		if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) {
-			if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid)
-				== pasid) {
-				write_vmid_invalidate_request(kgd, vmid);
-				break;
-			}
-		}
-	}
-
-	return 0;
-}
-
-static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid)
-{
-	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-
-	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
-		pr_err("non kfd vmid %d\n", vmid);
-		return 0;
-	}
-
-	write_vmid_invalidate_request(kgd, vmid);
-	return 0;
-}
-
-static int kgd_address_watch_disable(struct kgd_dev *kgd)
+int kgd_gfx_v9_address_watch_disable(struct kgd_dev *kgd)
 {
 	return 0;
 }
 
-static int kgd_address_watch_execute(struct kgd_dev *kgd,
+int kgd_gfx_v9_address_watch_execute(struct kgd_dev *kgd,
 					unsigned int watch_point_id,
 					uint32_t cntl_val,
 					uint32_t addr_hi,
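
NOTE: the two single-purpose ATC queries (..._mapping_valid and ..._mapping_pasid) collapse into kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(), which reads ATC_VMID0_PASID_MAPPING once and returns both facts, removing a pair of racy back-to-back reads. Hypothetical caller sketch:

    uint16_t pasid;

    if (kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(kgd, vmid, &pasid) &&
        pasid == target_pasid) {
    	/* vmid currently maps target_pasid */
    }

The deleted TLB-invalidation paths here, together with the rewritten kgd_gfx_v9_set_vm_context_page_table_base() below, were the only users of the MMHUB register offsets hard-coded at the top of the old file, which is why that HACK block could be dropped as well.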
@@ -919,7 +656,7 @@
 	return 0;
 }
 
-static int kgd_wave_control_execute(struct kgd_dev *kgd,
+int kgd_gfx_v9_wave_control_execute(struct kgd_dev *kgd,
 					uint32_t gfx_index_val,
 					uint32_t sq_cmd)
 {
@@ -928,7 +665,7 @@
 
 	mutex_lock(&adev->grbm_idx_mutex);
 
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val);
+	WREG32_SOC15_RLC_SHADOW(GC, 0, mmGRBM_GFX_INDEX, gfx_index_val);
 	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd);
 
 	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
@@ -938,84 +675,23 @@
 	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
 		SE_BROADCAST_WRITES, 1);
 
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
+	WREG32_SOC15_RLC_SHADOW(GC, 0, mmGRBM_GFX_INDEX, data);
 	mutex_unlock(&adev->grbm_idx_mutex);
 
 	return 0;
 }
 
-static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
+uint32_t kgd_gfx_v9_address_watch_get_offset(struct kgd_dev *kgd,
 					unsigned int watch_point_id,
 					unsigned int reg_offset)
 {
 	return 0;
 }
 
-static void set_scratch_backing_va(struct kgd_dev *kgd,
-					uint64_t va, uint32_t vmid)
-{
-	/* No longer needed on GFXv9. The scratch base address is
-	 * passed to the shader by the CP. It's the user mode driver's
-	 * responsibility.
-	 */
-}
-
-/* FIXME: Does this need to be ASIC-specific code? */
-static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
-{
-	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;
-	const union amdgpu_firmware_header *hdr;
-
-	switch (type) {
-	case KGD_ENGINE_PFP:
-		hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data;
-		break;
-
-	case KGD_ENGINE_ME:
-		hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data;
-		break;
-
-	case KGD_ENGINE_CE:
-		hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data;
-		break;
-
-	case KGD_ENGINE_MEC1:
-		hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data;
-		break;
-
-	case KGD_ENGINE_MEC2:
-		hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data;
-		break;
-
-	case KGD_ENGINE_RLC:
-		hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data;
-		break;
-
-	case KGD_ENGINE_SDMA1:
-		hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data;
-		break;
-
-	case KGD_ENGINE_SDMA2:
-		hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data;
-		break;
-
-	default:
-		return 0;
-	}
-
-	if (hdr == NULL)
-		return 0;
-
-	/* Only 12 bit in use*/
-	return hdr->common.ucode_version;
-}
-
-static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
-		uint32_t page_table_base)
+void kgd_gfx_v9_set_vm_context_page_table_base(struct kgd_dev *kgd,
+		uint32_t vmid, uint64_t page_table_base)
 {
 	struct amdgpu_device *adev = get_amdgpu_device(kgd);
-	uint64_t base = (uint64_t)page_table_base << PAGE_SHIFT |
-		AMDGPU_PTE_VALID;
 
 	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
 		pr_err("trying to set page table base for wrong VMID %u\n",
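
NOTE: the page-table-base interface widens from a 32-bit page frame number to a full 64-bit value, and the per-hub register programming moves behind the mmhub/gfxhub setup_vm_pt_regs() callbacks. The removed local computation was:

    /* old: caller passed a PFN, this file built the register value */
    uint64_t base = ((uint64_t)page_table_base << PAGE_SHIFT) |
    		AMDGPU_PTE_VALID;

With the new signature the caller passes the already-encoded base, and that encoding, plus the twelve MMHUB/GC WREG32()s deleted in the next hunk, lives inside the hub implementations.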
@@ -1023,29 +699,203 @@
 		return;
 	}
 
-	/* TODO: take advantage of per-process address space size. For
-	 * now, all processes share the same address space size, like
-	 * on GFX8 and older.
-	 */
-	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
-	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);
+	adev->mmhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
 
-	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2),
-			lower_32_bits(adev->vm_manager.max_pfn - 1));
-	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2),
-			upper_32_bits(adev->vm_manager.max_pfn - 1));
-
-	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
-	WREG32(SOC15_REG_OFFSET(MMHUB, 0, mmMMHUB_VM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));
-
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32) + (vmid*2), 0);
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32) + (vmid*2), 0);
-
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32) + (vmid*2),
-			lower_32_bits(adev->vm_manager.max_pfn - 1));
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32) + (vmid*2),
-			upper_32_bits(adev->vm_manager.max_pfn - 1));
-
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32) + (vmid*2), lower_32_bits(base));
-	WREG32(SOC15_REG_OFFSET(GC, 0, mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32) + (vmid*2), upper_32_bits(base));
+	adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
 }
+
+static void lock_spi_csq_mutexes(struct amdgpu_device *adev)
+{
+	mutex_lock(&adev->srbm_mutex);
+	mutex_lock(&adev->grbm_idx_mutex);
+
+}
+
+static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
+{
+	mutex_unlock(&adev->grbm_idx_mutex);
+	mutex_unlock(&adev->srbm_mutex);
+}
+
+/**
+ * @get_wave_count: Read device registers to get number of waves in flight for
+ * a particular queue. The method also returns the VMID associated with the
+ * queue.
+ *
+ * @adev: Handle of device whose registers are to be read
+ * @queue_idx: Index of queue in the queue-map bit-field
+ * @wave_cnt: Output parameter updated with number of waves in flight
+ * @vmid: Output parameter updated with VMID of queue whose wave count
+ * is being collected
+ */
+static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
+		int *wave_cnt, int *vmid)
+{
+	int pipe_idx;
+	int queue_slot;
+	unsigned int reg_val;
+
+	/*
+	 * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
+	 * parameters to read out waves in flight. Get VMID if there are
+	 * non-zero waves in flight.
+	 */
+	*vmid = 0xFF;
+	*wave_cnt = 0;
+	pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
+	queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
+	soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0);
+	reg_val = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
+			 queue_slot);
+	*wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
+	if (*wave_cnt != 0)
+		*vmid = (RREG32_SOC15(GC, 0, mmCP_HQD_VMID) &
+			 CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
+}
+
+/**
+ * @kgd_gfx_v9_get_cu_occupancy: Reads relevant registers associated with each
+ * shader engine and aggregates the number of waves that are in flight for the
+ * process whose pasid is provided as a parameter. The process could have ZERO
+ * or more queues running and submitting waves to compute units.
+ *
+ * @kgd: Handle of device from which to get number of waves in flight
+ * @pasid: Identifies the process for which this query call is invoked
+ * @wave_cnt: Output parameter updated with number of waves in flight that
+ * belong to process with given pasid
+ * @max_waves_per_cu: Output parameter updated with maximum number of waves
+ * possible per Compute Unit
+ *
+ * @note: It's possible that the device has too many queues (oversubscription)
+ * in which case a VMID could be remapped to a different PASID. This could lead
+ * to an inaccurate wave count. Following is a high-level sequence:
+ * Time T1: vmid = getVmid(); vmid is associated with Pasid P1
+ * Time T2: passId = getPasId(vmid); vmid is associated with Pasid P2
+ * In the sequence above wave count obtained from time T1 will be incorrectly
+ * lost or added to total wave count.
+ *
+ * The registers that provide the waves in flight are:
+ *
+ * SPI_CSQ_WF_ACTIVE_STATUS - bit-map of queues per pipe. The bit is ON if a
+ * queue is slotted, OFF if there is no queue. A process could have ZERO or
+ * more queues slotted and submitting waves to be run on compute units. Even
+ * when there is a queue it is possible there could be zero wave fronts, this
+ * can happen when queue is waiting on top-of-pipe events - e.g. waitRegMem
+ * command
+ *
+ * For each bit that is ON from above:
+ *
+ * Read (SPI_CSQ_WF_ACTIVE_COUNT_0 + queue_idx) register. It provides the
+ * number of waves that are in flight for the queue at specified index. The
+ * index ranges from 0 to 7.
+ *
+ * If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID
+ * of the wave(s).
+ *
+ * Determine if VMID from above step maps to pasid provided as parameter. If
+ * it matches aggregate the wave count. That the VMID will not match pasid is
+ * a normal condition i.e. a device is expected to support multiple queues
+ * from multiple processes.
+ *
+ * Reading registers referenced above involves programming GRBM appropriately
+ */
+static void kgd_gfx_v9_get_cu_occupancy(struct kgd_dev *kgd, int pasid,
+		int *pasid_wave_cnt, int *max_waves_per_cu)
+{
+	int qidx;
+	int vmid;
+	int se_idx;
+	int sh_idx;
+	int se_cnt;
+	int sh_cnt;
+	int wave_cnt;
+	int queue_map;
+	int pasid_tmp;
+	int max_queue_cnt;
+	int vmid_wave_cnt = 0;
+	struct amdgpu_device *adev;
+	DECLARE_BITMAP(cp_queue_bitmap, KGD_MAX_QUEUES);
+
+	adev = get_amdgpu_device(kgd);
+	lock_spi_csq_mutexes(adev);
+	soc15_grbm_select(adev, 1, 0, 0, 0);
+
+	/*
+	 * Iterate through the shader engines and arrays of the device
+	 * to get number of waves in flight
+	 */
+	bitmap_complement(cp_queue_bitmap, adev->gfx.mec.queue_bitmap,
+			  KGD_MAX_QUEUES);
+	max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
+			adev->gfx.mec.num_queue_per_pipe;
+	sh_cnt = adev->gfx.config.max_sh_per_se;
+	se_cnt = adev->gfx.config.max_shader_engines;
+	for (se_idx = 0; se_idx < se_cnt; se_idx++) {
+		for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {
+
+			gfx_v9_0_select_se_sh(adev, se_idx, sh_idx, 0xffffffff);
+			queue_map = RREG32(SOC15_REG_OFFSET(GC, 0,
+					   mmSPI_CSQ_WF_ACTIVE_STATUS));
+
+			/*
+			 * Assumption: queue map encodes following schema: four
+			 * pipes per each micro-engine, with each pipe mapping
+			 * eight queues. This schema is true for GFX9 devices
+			 * and must be verified for newer device families
+			 */
+			for (qidx = 0; qidx < max_queue_cnt; qidx++) {
+
+				/* Skip queues that are not associated with
+				 * compute functions
+				 */
+				if (!test_bit(qidx, cp_queue_bitmap))
+					continue;
+
+				if (!(queue_map & (1 << qidx)))
+					continue;
+
+				/* Get number of waves in flight and aggregate them */
+				get_wave_count(adev, qidx, &wave_cnt, &vmid);
+				if (wave_cnt != 0) {
+					pasid_tmp =
+					  RREG32(SOC15_REG_OFFSET(OSSSYS, 0,
+						 mmIH_VMID_0_LUT) + vmid);
+					if (pasid_tmp == pasid)
+						vmid_wave_cnt += wave_cnt;
+				}
+			}
+		}
+	}
+
+	gfx_v9_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+	soc15_grbm_select(adev, 0, 0, 0, 0);
+	unlock_spi_csq_mutexes(adev);
+
+	/* Update the output parameters and return */
+	*pasid_wave_cnt = vmid_wave_cnt;
+	*max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
+			    adev->gfx.cu_info.max_waves_per_simd;
+}
+
+const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
+	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
+	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
+	.init_interrupts = kgd_gfx_v9_init_interrupts,
+	.hqd_load = kgd_gfx_v9_hqd_load,
+	.hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load,
+	.hqd_sdma_load = kgd_hqd_sdma_load,
+	.hqd_dump = kgd_gfx_v9_hqd_dump,
+	.hqd_sdma_dump = kgd_hqd_sdma_dump,
+	.hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied,
+	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
+	.hqd_destroy = kgd_gfx_v9_hqd_destroy,
+	.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
+	.address_watch_disable = kgd_gfx_v9_address_watch_disable,
+	.address_watch_execute = kgd_gfx_v9_address_watch_execute,
+	.wave_control_execute = kgd_gfx_v9_wave_control_execute,
+	.address_watch_get_offset = kgd_gfx_v9_address_watch_get_offset,
+	.get_atc_vmid_pasid_mapping_info =
+			kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
+	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
+	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
+};
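
NOTE: get_cu_occupancy is the one genuinely new entry in this table. A hypothetical caller sketch showing how its two outputs combine into a percentage (cu_cnt stands for the device's active CU count, which the caller is assumed to know):

    int wave_cnt, max_waves_per_cu;

    gfx_v9_kfd2kgd.get_cu_occupancy(kgd, pasid, &wave_cnt, &max_waves_per_cu);
    /* GFX9 example: 4 SIMDs per CU * 10 waves per SIMD = 40 waves/CU max */
    int occupancy_pct = wave_cnt * 100 / (cu_cnt * max_waves_per_cu);

Relative to the old table at the top of the file, every callback not listed here (GTT allocation, clock queries, GPUVM process and memory management, TLB invalidation, get_fw_version, ...) has either moved to common amdgpu_amdkfd code or been dropped from the kfd2kgd interface.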