@@ -26,6 +26,7 @@
 #include "nbio/nbio_6_1_sh_mask.h"
 #include "gc/gc_9_0_offset.h"
 #include "gc/gc_9_0_sh_mask.h"
+#include "mp/mp_9_0_offset.h"
 #include "soc15.h"
 #include "vega10_ih.h"
 #include "soc15_common.h"
@@ -237,19 +238,15 @@
 	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-	int locked;
 
 	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 	 * otherwise the mailbox msg will be ruined/reseted by
 	 * the VF FLR.
-	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
-	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
-	 * which means host side had finished this VF's FLR.
 	 */
-	locked = mutex_trylock(&adev->lock_reset);
-	if (locked)
-		adev->in_gpu_reset = 1;
+	if (!down_read_trylock(&adev->reset_sem))
+		return;
+
+	atomic_set(&adev->in_gpu_reset, 1);
 
 	do {
 		if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
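
Note on the hunk above: the FLR worker used to grab adev->lock_reset with mutex_trylock and remember whether it got the lock; it now takes adev->reset_sem for read and simply bails out when a reset already holds the write side, while in_gpu_reset becomes an atomic_t so other paths can read it without holding the lock. A minimal sketch of the reader-side pattern, with hypothetical names (my_dev, my_flr_work) standing in for the driver's own:

#include <linux/rwsem.h>
#include <linux/atomic.h>

struct my_dev {
	struct rw_semaphore reset_sem;	/* init_rwsem() at device init */
	atomic_t in_gpu_reset;
};

static void my_flr_work(struct my_dev *dev)
{
	/* A full GPU reset takes reset_sem for write; if one is in
	 * flight, give up instead of blocking the workqueue.
	 */
	if (!down_read_trylock(&dev->reset_sem))
		return;

	atomic_set(&dev->in_gpu_reset, 1);

	/* ... poll the mailbox for FLR completion here ... */

	atomic_set(&dev->in_gpu_reset, 0);
	up_read(&dev->reset_sem);
}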
@@ -260,14 +257,14 @@
 	} while (timeout > 1);
 
 flr_done:
-	if (locked) {
-		adev->in_gpu_reset = 0;
-		mutex_unlock(&adev->lock_reset);
-	}
+	atomic_set(&adev->in_gpu_reset, 0);
+	up_read(&adev->reset_sem);
 
 	/* Trigger recovery for world switch failure if no TDR */
-	if (amdgpu_lockup_timeout == 0)
-		amdgpu_device_gpu_recover(adev, NULL, true);
+	if (amdgpu_device_should_recover_gpu(adev)
+	    && (!amdgpu_device_has_job_running(adev) ||
+		adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
+		amdgpu_device_gpu_recover(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
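
This hunk also replaces the old "only recover when TDR is disabled" check (amdgpu_lockup_timeout == 0) with a policy helper plus a scheduler-state check, and amdgpu_device_gpu_recover drops its third argument. Restated as a hypothetical predicate (the helper name below is illustrative, not from the driver):

/* Illustrative restatement of the new gate: recover from the FLR
 * path only when driver policy allows it and the scheduler's own
 * timeout (TDR) cannot be trusted to catch the hang, i.e. when no
 * job is in flight or the SDMA timeout is infinite.
 */
static bool flr_should_trigger_recovery(struct amdgpu_device *adev)
{
	if (!amdgpu_device_should_recover_gpu(adev))
		return false;

	return !amdgpu_device_has_job_running(adev) ||
	       adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT;
}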
@@ -295,6 +292,9 @@
 		if (amdgpu_sriov_runtime(adev))
 			schedule_work(&adev->virt.flr_work);
 		break;
+	case IDH_QUERY_ALIVE:
+		xgpu_ai_mailbox_send_ack(adev);
+		break;
 	/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
 	 * it byfar since that polling thread will handle it,
 	 * other msg like flr complete is not handled here.