forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/drivers/gpu/drm/amd/amdkfd/kfd_events.c
....@@ -187,7 +187,7 @@
187187 if (p->signal_mapped_size &&
188188 p->signal_event_count == p->signal_mapped_size / 8) {
189189 if (!p->signal_event_limit_reached) {
190
- pr_warn("Signal event wasn't created because limit was reached\n");
190
+ pr_debug("Signal event wasn't created because limit was reached\n");
191191 p->signal_event_limit_reached = true;
192192 }
193193 return -ENOSPC;
....@@ -346,7 +346,6 @@
346346 ret = create_signal_event(devkfd, p, ev);
347347 if (!ret) {
348348 *event_page_offset = KFD_MMAP_TYPE_EVENTS;
349
- *event_page_offset <<= PAGE_SHIFT;
350349 *event_slot_index = ev->event_id;
351350 }
352351 break;
....@@ -461,7 +460,7 @@
461460 }
462461 }
463462
464
-void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
463
+void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
465464 uint32_t valid_id_bits)
466465 {
467466 struct kfd_event *ev = NULL;
....@@ -532,6 +531,8 @@
532531 event_waiters = kmalloc_array(num_events,
533532 sizeof(struct kfd_event_waiter),
534533 GFP_KERNEL);
534
+ if (!event_waiters)
535
+ return NULL;
535536
536537 for (i = 0; (event_waiters) && (i < num_events) ; i++) {
537538 init_wait(&event_waiters[i].wait);
....@@ -852,8 +853,8 @@
852853
853854 if (type == KFD_EVENT_TYPE_MEMORY) {
854855 dev_warn(kfd_device,
855
- "Sending SIGSEGV to HSA Process with PID %d ",
856
- p->lead_thread->pid);
856
+ "Sending SIGSEGV to process %d (pasid 0x%x)",
857
+ p->lead_thread->pid, p->pasid);
857858 send_sig(SIGSEGV, p->lead_thread, 0);
858859 }
859860
....@@ -861,19 +862,19 @@
861862 if (send_signal) {
862863 if (send_sigterm) {
863864 dev_warn(kfd_device,
864
- "Sending SIGTERM to HSA Process with PID %d ",
865
- p->lead_thread->pid);
865
+ "Sending SIGTERM to process %d (pasid 0x%x)",
866
+ p->lead_thread->pid, p->pasid);
866867 send_sig(SIGTERM, p->lead_thread, 0);
867868 } else {
868869 dev_err(kfd_device,
869
- "HSA Process (PID %d) got unhandled exception",
870
- p->lead_thread->pid);
870
+ "Process %d (pasid 0x%x) got unhandled exception",
871
+ p->lead_thread->pid, p->pasid);
871872 }
872873 }
873874 }
874875
875876 #ifdef KFD_SUPPORT_IOMMU_V2
876
-void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
877
+void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid,
877878 unsigned long address, bool is_write_requested,
878879 bool is_execute_requested)
879880 {
....@@ -902,7 +903,7 @@
902903
903904 memset(&memory_exception_data, 0, sizeof(memory_exception_data));
904905
905
- down_read(&mm->mmap_sem);
906
+ mmap_read_lock(mm);
906907 vma = find_vma(mm, address);
907908
908909 memory_exception_data.gpu_id = dev->id;
....@@ -925,7 +926,7 @@
925926 memory_exception_data.failure.NoExecute = 0;
926927 }
927928
928
- up_read(&mm->mmap_sem);
929
+ mmap_read_unlock(mm);
929930 mmput(mm);
930931
931932 pr_debug("notpresent %d, noexecute %d, readonly %d\n",
....@@ -936,7 +937,8 @@
936937 /* Workaround on Raven to not kill the process when memory is freed
937938 * before IOMMU is able to finish processing all the excessive PPRs
938939 */
939
- if (dev->device_info->asic_family != CHIP_RAVEN) {
940
+ if (dev->device_info->asic_family != CHIP_RAVEN &&
941
+ dev->device_info->asic_family != CHIP_RENOIR) {
940942 mutex_lock(&p->event_mutex);
941943
942944 /* Lookup events by type and signal them */
....@@ -950,7 +952,7 @@
950952 }
951953 #endif /* KFD_SUPPORT_IOMMU_V2 */
952954
953
-void kfd_signal_hw_exception_event(unsigned int pasid)
955
+void kfd_signal_hw_exception_event(u32 pasid)
954956 {
955957 /*
956958 * Because we are called from arbitrary context (workqueue) as opposed
....@@ -971,7 +973,7 @@
971973 kfd_unref_process(p);
972974 }
973975
974
-void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
976
+void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
975977 struct kfd_vm_fault_info *info)
976978 {
977979 struct kfd_event *ev;
....@@ -983,7 +985,7 @@
983985 return; /* Presumably process exited. */
984986 memset(&memory_exception_data, 0, sizeof(memory_exception_data));
985987 memory_exception_data.gpu_id = dev->id;
986
- memory_exception_data.failure.imprecise = 1;
988
+ memory_exception_data.failure.imprecise = true;
987989 /* Set failure reason */
988990 if (info) {
989991 memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
....@@ -1011,25 +1013,41 @@
10111013 void kfd_signal_reset_event(struct kfd_dev *dev)
10121014 {
10131015 struct kfd_hsa_hw_exception_data hw_exception_data;
1016
+ struct kfd_hsa_memory_exception_data memory_exception_data;
10141017 struct kfd_process *p;
10151018 struct kfd_event *ev;
10161019 unsigned int temp;
10171020 uint32_t id, idx;
1021
+ int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
1022
+ KFD_HW_EXCEPTION_ECC :
1023
+ KFD_HW_EXCEPTION_GPU_HANG;
10181024
10191025 /* Whole gpu reset caused by GPU hang and memory is lost */
10201026 memset(&hw_exception_data, 0, sizeof(hw_exception_data));
10211027 hw_exception_data.gpu_id = dev->id;
10221028 hw_exception_data.memory_lost = 1;
1029
+ hw_exception_data.reset_cause = reset_cause;
1030
+
1031
+ memset(&memory_exception_data, 0, sizeof(memory_exception_data));
1032
+ memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
1033
+ memory_exception_data.gpu_id = dev->id;
1034
+ memory_exception_data.failure.imprecise = true;
10231035
10241036 idx = srcu_read_lock(&kfd_processes_srcu);
10251037 hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
10261038 mutex_lock(&p->event_mutex);
10271039 id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1028
- idr_for_each_entry_continue(&p->event_idr, ev, id)
1040
+ idr_for_each_entry_continue(&p->event_idr, ev, id) {
10291041 if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
10301042 ev->hw_exception_data = hw_exception_data;
10311043 set_event(ev);
10321044 }
1045
+ if (ev->type == KFD_EVENT_TYPE_MEMORY &&
1046
+ reset_cause == KFD_HW_EXCEPTION_ECC) {
1047
+ ev->memory_exception_data = memory_exception_data;
1048
+ set_event(ev);
1049
+ }
1050
+ }
10331051 mutex_unlock(&p->event_mutex);
10341052 }
10351053 srcu_read_unlock(&kfd_processes_srcu, idx);