forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-13 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e
kernel/drivers/gpu/drm/amd/amdkfd/kfd_events.c
....@@ -187,7 +187,7 @@
187187 if (p->signal_mapped_size &&
188188 p->signal_event_count == p->signal_mapped_size / 8) {
189189 if (!p->signal_event_limit_reached) {
190
- pr_warn("Signal event wasn't created because limit was reached\n");
190
+ pr_debug("Signal event wasn't created because limit was reached\n");
191191 p->signal_event_limit_reached = true;
192192 }
193193 return -ENOSPC;
....@@ -346,7 +346,6 @@
346346 ret = create_signal_event(devkfd, p, ev);
347347 if (!ret) {
348348 *event_page_offset = KFD_MMAP_TYPE_EVENTS;
349
- *event_page_offset <<= PAGE_SHIFT;
350349 *event_slot_index = ev->event_id;
351350 }
352351 break;
....@@ -461,7 +460,7 @@
461460 }
462461 }
463462
464
-void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
463
+void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
465464 uint32_t valid_id_bits)
466465 {
467466 struct kfd_event *ev = NULL;
....@@ -529,14 +528,13 @@
529528 struct kfd_event_waiter *event_waiters;
530529 uint32_t i;
531530
532
- event_waiters = kmalloc_array(num_events,
533
- sizeof(struct kfd_event_waiter),
534
- GFP_KERNEL);
531
+ event_waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter),
532
+ GFP_KERNEL);
533
+ if (!event_waiters)
534
+ return NULL;
535535
536
- for (i = 0; (event_waiters) && (i < num_events) ; i++) {
536
+ for (i = 0; i < num_events; i++)
537537 init_wait(&event_waiters[i].wait);
538
- event_waiters[i].activated = false;
539
- }
540538
541539 return event_waiters;
542540 }
....@@ -852,8 +850,8 @@
852850
853851 if (type == KFD_EVENT_TYPE_MEMORY) {
854852 dev_warn(kfd_device,
855
- "Sending SIGSEGV to HSA Process with PID %d ",
856
- p->lead_thread->pid);
853
+ "Sending SIGSEGV to process %d (pasid 0x%x)",
854
+ p->lead_thread->pid, p->pasid);
857855 send_sig(SIGSEGV, p->lead_thread, 0);
858856 }
859857
....@@ -861,19 +859,19 @@
861859 if (send_signal) {
862860 if (send_sigterm) {
863861 dev_warn(kfd_device,
864
- "Sending SIGTERM to HSA Process with PID %d ",
865
- p->lead_thread->pid);
862
+ "Sending SIGTERM to process %d (pasid 0x%x)",
863
+ p->lead_thread->pid, p->pasid);
866864 send_sig(SIGTERM, p->lead_thread, 0);
867865 } else {
868866 dev_err(kfd_device,
869
- "HSA Process (PID %d) got unhandled exception",
870
- p->lead_thread->pid);
867
+ "Process %d (pasid 0x%x) got unhandled exception",
868
+ p->lead_thread->pid, p->pasid);
871869 }
872870 }
873871 }
874872
875873 #ifdef KFD_SUPPORT_IOMMU_V2
876
-void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
874
+void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid,
877875 unsigned long address, bool is_write_requested,
878876 bool is_execute_requested)
879877 {
....@@ -902,7 +900,7 @@
902900
903901 memset(&memory_exception_data, 0, sizeof(memory_exception_data));
904902
905
- down_read(&mm->mmap_sem);
903
+ mmap_read_lock(mm);
906904 vma = find_vma(mm, address);
907905
908906 memory_exception_data.gpu_id = dev->id;
....@@ -925,7 +923,7 @@
925923 memory_exception_data.failure.NoExecute = 0;
926924 }
927925
928
- up_read(&mm->mmap_sem);
926
+ mmap_read_unlock(mm);
929927 mmput(mm);
930928
931929 pr_debug("notpresent %d, noexecute %d, readonly %d\n",
....@@ -936,7 +934,8 @@
936934 /* Workaround on Raven to not kill the process when memory is freed
937935 * before IOMMU is able to finish processing all the excessive PPRs
938936 */
939
- if (dev->device_info->asic_family != CHIP_RAVEN) {
937
+ if (dev->device_info->asic_family != CHIP_RAVEN &&
938
+ dev->device_info->asic_family != CHIP_RENOIR) {
940939 mutex_lock(&p->event_mutex);
941940
942941 /* Lookup events by type and signal them */
....@@ -950,7 +949,7 @@
950949 }
951950 #endif /* KFD_SUPPORT_IOMMU_V2 */
952951
953
-void kfd_signal_hw_exception_event(unsigned int pasid)
952
+void kfd_signal_hw_exception_event(u32 pasid)
954953 {
955954 /*
956955 * Because we are called from arbitrary context (workqueue) as opposed
....@@ -971,7 +970,7 @@
971970 kfd_unref_process(p);
972971 }
973972
974
-void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
973
+void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
975974 struct kfd_vm_fault_info *info)
976975 {
977976 struct kfd_event *ev;
....@@ -983,7 +982,7 @@
983982 return; /* Presumably process exited. */
984983 memset(&memory_exception_data, 0, sizeof(memory_exception_data));
985984 memory_exception_data.gpu_id = dev->id;
986
- memory_exception_data.failure.imprecise = 1;
985
+ memory_exception_data.failure.imprecise = true;
987986 /* Set failure reason */
988987 if (info) {
989988 memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
....@@ -1011,25 +1010,41 @@
10111010 void kfd_signal_reset_event(struct kfd_dev *dev)
10121011 {
10131012 struct kfd_hsa_hw_exception_data hw_exception_data;
1013
+ struct kfd_hsa_memory_exception_data memory_exception_data;
10141014 struct kfd_process *p;
10151015 struct kfd_event *ev;
10161016 unsigned int temp;
10171017 uint32_t id, idx;
1018
+ int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
1019
+ KFD_HW_EXCEPTION_ECC :
1020
+ KFD_HW_EXCEPTION_GPU_HANG;
10181021
10191022 /* Whole gpu reset caused by GPU hang and memory is lost */
10201023 memset(&hw_exception_data, 0, sizeof(hw_exception_data));
10211024 hw_exception_data.gpu_id = dev->id;
10221025 hw_exception_data.memory_lost = 1;
1026
+ hw_exception_data.reset_cause = reset_cause;
1027
+
1028
+ memset(&memory_exception_data, 0, sizeof(memory_exception_data));
1029
+ memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
1030
+ memory_exception_data.gpu_id = dev->id;
1031
+ memory_exception_data.failure.imprecise = true;
10231032
10241033 idx = srcu_read_lock(&kfd_processes_srcu);
10251034 hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
10261035 mutex_lock(&p->event_mutex);
10271036 id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1028
- idr_for_each_entry_continue(&p->event_idr, ev, id)
1037
+ idr_for_each_entry_continue(&p->event_idr, ev, id) {
10291038 if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
10301039 ev->hw_exception_data = hw_exception_data;
10311040 set_event(ev);
10321041 }
1042
+ if (ev->type == KFD_EVENT_TYPE_MEMORY &&
1043
+ reset_cause == KFD_HW_EXCEPTION_ECC) {
1044
+ ev->memory_exception_data = memory_exception_data;
1045
+ set_event(ev);
1046
+ }
1047
+ }
10331048 mutex_unlock(&p->event_mutex);
10341049 }
10351050 srcu_read_unlock(&kfd_processes_srcu, idx);