hc
2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
....@@ -26,6 +26,7 @@
2626 #include "nbio/nbio_6_1_sh_mask.h"
2727 #include "gc/gc_9_0_offset.h"
2828 #include "gc/gc_9_0_sh_mask.h"
29
+#include "mp/mp_9_0_offset.h"
2930 #include "soc15.h"
3031 #include "vega10_ih.h"
3132 #include "soc15_common.h"
....@@ -237,19 +238,15 @@
237238 struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
238239 struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
239240 int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
240
- int locked;
241241
242242 /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
243243 * otherwise the mailbox msg will be ruined/reset by
244244 * the VF FLR.
245
- *
246
- * we can unlock the lock_reset to allow "amdgpu_job_timedout"
247
- * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
248
- * which means host side had finished this VF's FLR.
249245 */
250
- locked = mutex_trylock(&adev->lock_reset);
251
- if (locked)
252
- adev->in_gpu_reset = 1;
246
+ if (!down_read_trylock(&adev->reset_sem))
247
+ return;
248
+
249
+ atomic_set(&adev->in_gpu_reset, 1);
253250
254251 do {
255252 if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
....@@ -260,14 +257,14 @@
260257 } while (timeout > 1);
261258
262259 flr_done:
263
- if (locked) {
264
- adev->in_gpu_reset = 0;
265
- mutex_unlock(&adev->lock_reset);
266
- }
260
+ atomic_set(&adev->in_gpu_reset, 0);
261
+ up_read(&adev->reset_sem);
267262
268263 /* Trigger recovery for world switch failure if no TDR */
269
- if (amdgpu_lockup_timeout == 0)
270
- amdgpu_device_gpu_recover(adev, NULL, true);
264
+ if (amdgpu_device_should_recover_gpu(adev)
265
+ && (!amdgpu_device_has_job_running(adev) ||
266
+ adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
267
+ amdgpu_device_gpu_recover(adev, NULL);
271268 }
272269
273270 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
....@@ -295,6 +292,9 @@
295292 if (amdgpu_sriov_runtime(adev))
296293 schedule_work(&adev->virt.flr_work);
297294 break;
295
+ case IDH_QUERY_ALIVE:
296
+ xgpu_ai_mailbox_send_ack(adev);
297
+ break;
298298 /* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
299299 * it for now since that polling thread will handle it,
300300 * other msg like flr complete is not handled here.