From 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Mon, 13 May 2024 10:30:14 +0000 Subject: [PATCH] modify sin led gpio --- kernel/drivers/gpu/drm/amd/amdkfd/kfd_events.c | 61 +++++++++++++++++++----------- 1 files changed, 38 insertions(+), 23 deletions(-) diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_events.c index e9f0e0a..2c19b37 100644 --- a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -187,7 +187,7 @@ if (p->signal_mapped_size && p->signal_event_count == p->signal_mapped_size / 8) { if (!p->signal_event_limit_reached) { - pr_warn("Signal event wasn't created because limit was reached\n"); + pr_debug("Signal event wasn't created because limit was reached\n"); p->signal_event_limit_reached = true; } return -ENOSPC; @@ -346,7 +346,6 @@ ret = create_signal_event(devkfd, p, ev); if (!ret) { *event_page_offset = KFD_MMAP_TYPE_EVENTS; - *event_page_offset <<= PAGE_SHIFT; *event_slot_index = ev->event_id; } break; @@ -461,7 +460,7 @@ } } -void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, +void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, uint32_t valid_id_bits) { struct kfd_event *ev = NULL; @@ -529,14 +528,13 @@ struct kfd_event_waiter *event_waiters; uint32_t i; - event_waiters = kmalloc_array(num_events, - sizeof(struct kfd_event_waiter), - GFP_KERNEL); + event_waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter), + GFP_KERNEL); + if (!event_waiters) + return NULL; - for (i = 0; (event_waiters) && (i < num_events) ; i++) { + for (i = 0; i < num_events; i++) init_wait(&event_waiters[i].wait); - event_waiters[i].activated = false; - } return event_waiters; } @@ -852,8 +850,8 @@ if (type == KFD_EVENT_TYPE_MEMORY) { dev_warn(kfd_device, - "Sending SIGSEGV to HSA Process with PID %d ", - p->lead_thread->pid); + "Sending SIGSEGV to process %d (pasid 0x%x)", + p->lead_thread->pid, p->pasid); send_sig(SIGSEGV, p->lead_thread, 0); } @@ -861,19 +859,19 @@ if (send_signal) { if (send_sigterm) { dev_warn(kfd_device, - "Sending SIGTERM to HSA Process with PID %d ", - p->lead_thread->pid); + "Sending SIGTERM to process %d (pasid 0x%x)", + p->lead_thread->pid, p->pasid); send_sig(SIGTERM, p->lead_thread, 0); } else { dev_err(kfd_device, - "HSA Process (PID %d) got unhandled exception", - p->lead_thread->pid); + "Process %d (pasid 0x%x) got unhandled exception", + p->lead_thread->pid, p->pasid); } } } #ifdef KFD_SUPPORT_IOMMU_V2 -void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, +void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid, unsigned long address, bool is_write_requested, bool is_execute_requested) { @@ -902,7 +900,7 @@ memset(&memory_exception_data, 0, sizeof(memory_exception_data)); - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, address); memory_exception_data.gpu_id = dev->id; @@ -925,7 +923,7 @@ memory_exception_data.failure.NoExecute = 0; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmput(mm); pr_debug("notpresent %d, noexecute %d, readonly %d\n", @@ -936,7 +934,8 @@ /* Workaround on Raven to not kill the process when memory is freed * before IOMMU is able to finish processing all the excessive PPRs */ - if (dev->device_info->asic_family != CHIP_RAVEN) { + if (dev->device_info->asic_family != CHIP_RAVEN && + dev->device_info->asic_family != CHIP_RENOIR) { mutex_lock(&p->event_mutex); /* Lookup events by type and signal them */ @@ -950,7 +949,7 @@ } #endif /* KFD_SUPPORT_IOMMU_V2 */ -void kfd_signal_hw_exception_event(unsigned int pasid) +void kfd_signal_hw_exception_event(u32 pasid) { /* * Because we are called from arbitrary context (workqueue) as opposed @@ -971,7 +970,7 @@ kfd_unref_process(p); } -void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, +void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid, struct kfd_vm_fault_info *info) { struct kfd_event *ev; @@ -983,7 +982,7 @@ return; /* Presumably process exited. */ memset(&memory_exception_data, 0, sizeof(memory_exception_data)); memory_exception_data.gpu_id = dev->id; - memory_exception_data.failure.imprecise = 1; + memory_exception_data.failure.imprecise = true; /* Set failure reason */ if (info) { memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; @@ -1011,25 +1010,41 @@ void kfd_signal_reset_event(struct kfd_dev *dev) { struct kfd_hsa_hw_exception_data hw_exception_data; + struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_process *p; struct kfd_event *ev; unsigned int temp; uint32_t id, idx; + int reset_cause = atomic_read(&dev->sram_ecc_flag) ? + KFD_HW_EXCEPTION_ECC : + KFD_HW_EXCEPTION_GPU_HANG; /* Whole gpu reset caused by GPU hang and memory is lost */ memset(&hw_exception_data, 0, sizeof(hw_exception_data)); hw_exception_data.gpu_id = dev->id; hw_exception_data.memory_lost = 1; + hw_exception_data.reset_cause = reset_cause; + + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); + memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC; + memory_exception_data.gpu_id = dev->id; + memory_exception_data.failure.imprecise = true; idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { mutex_lock(&p->event_mutex); id = KFD_FIRST_NONSIGNAL_EVENT_ID; - idr_for_each_entry_continue(&p->event_idr, ev, id) + idr_for_each_entry_continue(&p->event_idr, ev, id) { if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { ev->hw_exception_data = hw_exception_data; set_event(ev); } + if (ev->type == KFD_EVENT_TYPE_MEMORY && + reset_cause == KFD_HW_EXCEPTION_ECC) { + ev->memory_exception_data = memory_exception_data; + set_event(ev); + } + } mutex_unlock(&p->event_mutex); } srcu_read_unlock(&kfd_processes_srcu, idx); -- Gitblit v1.6.2