@@ -26,6 +26,7 @@
 #include "nbio/nbio_6_1_sh_mask.h"
 #include "gc/gc_9_0_offset.h"
 #include "gc/gc_9_0_sh_mask.h"
+#include "mp/mp_9_0_offset.h"
 #include "soc15.h"
 #include "vega10_ih.h"
 #include "soc15_common.h"
@@ -237,19 +238,15 @@
 	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-	int locked;
 
 	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 	 * otherwise the mailbox msg will be ruined/reseted by
 	 * the VF FLR.
-	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
-	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
-	 * which means host side had finished this VF's FLR.
 	 */
-	locked = mutex_trylock(&adev->lock_reset);
-	if (locked)
-		adev->in_gpu_reset = 1;
+	if (!down_read_trylock(&adev->reset_sem))
+		return;
+
+	atomic_set(&adev->in_gpu_reset, 1);
 
 	do {
 		if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
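This hunk swaps the exclusive mutex_trylock(&adev->lock_reset) for a shared down_read_trylock(&adev->reset_sem), so the FLR worker only backs off when a full GPU reset holds the semaphore on the write side, and it publishes its state through an atomic instead of a plain int. Below is a minimal userspace sketch of the same try-acquire-read-lock-plus-atomic-flag pattern; pthread rwlocks and C11 atomics stand in for the kernel primitives, and every name is illustrative, not the amdgpu API:

/* Userspace analogue (NOT the kernel code): a worker takes a shared
 * (read) lock non-blockingly and flags its activity atomically,
 * mirroring the down_read_trylock()/atomic_set() pattern above.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t reset_sem = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int in_gpu_reset;

static void flr_work(void)
{
	/* Bail out if an exclusive (write-side) reset already holds the
	 * lock; the reset path will take care of recovery itself. */
	if (pthread_rwlock_tryrdlock(&reset_sem) != 0)
		return;

	atomic_store(&in_gpu_reset, 1);

	/* ... poll the mailbox for FLR completion here ... */

	atomic_store(&in_gpu_reset, 0);
	pthread_rwlock_unlock(&reset_sem);
}

int main(void)
{
	flr_work();
	printf("in_gpu_reset=%d\n", atomic_load(&in_gpu_reset));
	return 0;
}

Compile with -pthread. The read-side trylock is the design point: unlike the old mutex, it does not serialize FLR handling against other readers, only against an in-flight exclusive reset.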
@@ -260,14 +257,14 @@
 	} while (timeout > 1);
 
 flr_done:
-	if (locked) {
-		adev->in_gpu_reset = 0;
-		mutex_unlock(&adev->lock_reset);
-	}
+	atomic_set(&adev->in_gpu_reset, 0);
+	up_read(&adev->reset_sem);
 
 	/* Trigger recovery for world switch failure if no TDR */
-	if (amdgpu_lockup_timeout == 0)
-		amdgpu_device_gpu_recover(adev, NULL, true);
+	if (amdgpu_device_should_recover_gpu(adev)
+		&& (!amdgpu_device_has_job_running(adev) ||
+		adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
+		amdgpu_device_gpu_recover(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
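The recovery trigger is no longer keyed off amdgpu_lockup_timeout alone: recovery must be enabled, and either no job is in flight or job timeouts are effectively disabled (MAX_SCHEDULE_TIMEOUT), in which case the TDR path would never fire and the FLR worker has to recover the GPU itself. A standalone sketch of that decision follows; the stub functions stand in for the amdgpu helpers and are not the real driver API:

/* Sketch of the recovery decision above, with stubbed-out inputs. */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_SCHEDULE_TIMEOUT LONG_MAX	/* same idea as the kernel macro */

static bool should_recover_gpu(void) { return true; }	/* stub */
static bool has_job_running(void)    { return false; }	/* stub */
static long sdma_timeout = MAX_SCHEDULE_TIMEOUT;	/* stub field */

static bool need_flr_recovery(void)
{
	/* Recover only if recovery is enabled AND the TDR path cannot
	 * be relied on: nothing is in flight, or timeouts are off. */
	return should_recover_gpu() &&
	       (!has_job_running() || sdma_timeout == MAX_SCHEDULE_TIMEOUT);
}

int main(void)
{
	printf("trigger recovery: %s\n", need_flr_recovery() ? "yes" : "no");
	return 0;
}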
@@ -295,6 +292,9 @@
 		if (amdgpu_sriov_runtime(adev))
 			schedule_work(&adev->virt.flr_work);
 		break;
+	case IDH_QUERY_ALIVE:
+		xgpu_ai_mailbox_send_ack(adev);
+		break;
 	/* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
 	 * it byfar since that polling thread will handle it,
 	 * other msg like flr complete is not handled here.
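The new IDH_QUERY_ALIVE case lets the host poll whether the guest driver is still responsive; the handler simply acknowledges the mailbox message. Below is a compact userspace analogue of the event dispatch; the enum values mirror the events named in the diff, but the handler bodies are illustrative stubs rather than the real driver code:

/* Simplified analogue of the mailbox IRQ dispatch above. */
#include <stdio.h>

enum idh_event {
	IDH_FLR_NOTIFICATION,
	IDH_FLR_NOTIFICATION_CMPL,
	IDH_QUERY_ALIVE,
	IDH_READY_TO_ACCESS_GPU,
};

static void send_ack(void)     { puts("ack sent"); }	/* stub */
static void schedule_flr(void) { puts("flr queued"); }	/* stub */

static void rcv_irq(enum idh_event event)
{
	switch (event) {
	case IDH_FLR_NOTIFICATION:
		schedule_flr();
		break;
	case IDH_QUERY_ALIVE:
		/* host is checking liveness: just acknowledge */
		send_ack();
		break;
	default:
		/* READY_TO_ACCESS_GPU is consumed by a polling thread;
		 * FLR-complete is picked up by peek in the worker. */
		break;
	}
}

int main(void)
{
	rcv_irq(IDH_QUERY_ALIVE);
	return 0;
}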