.. | .. |
---|
187 | 187 | if (p->signal_mapped_size && |
---|
188 | 188 | p->signal_event_count == p->signal_mapped_size / 8) { |
---|
189 | 189 | if (!p->signal_event_limit_reached) { |
---|
190 | | - pr_warn("Signal event wasn't created because limit was reached\n"); |
---|
| 190 | + pr_debug("Signal event wasn't created because limit was reached\n"); |
---|
191 | 191 | p->signal_event_limit_reached = true; |
---|
192 | 192 | } |
---|
193 | 193 | return -ENOSPC; |
---|
.. | .. |
---|
346 | 346 | ret = create_signal_event(devkfd, p, ev); |
---|
347 | 347 | if (!ret) { |
---|
348 | 348 | *event_page_offset = KFD_MMAP_TYPE_EVENTS; |
---|
349 | | - *event_page_offset <<= PAGE_SHIFT; |
---|
350 | 349 | *event_slot_index = ev->event_id; |
---|
351 | 350 | } |
---|
352 | 351 | break; |
---|
.. | .. |
---|
461 | 460 | } |
---|
462 | 461 | } |
---|
463 | 462 | |
---|
464 | | -void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, |
---|
| 463 | +void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, |
---|
465 | 464 | uint32_t valid_id_bits) |
---|
466 | 465 | { |
---|
467 | 466 | struct kfd_event *ev = NULL; |
---|
.. | .. |
---|
529 | 528 | struct kfd_event_waiter *event_waiters; |
---|
530 | 529 | uint32_t i; |
---|
531 | 530 | |
---|
532 | | - event_waiters = kmalloc_array(num_events, |
---|
533 | | - sizeof(struct kfd_event_waiter), |
---|
534 | | - GFP_KERNEL); |
---|
| 531 | + event_waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter), |
---|
| 532 | + GFP_KERNEL); |
---|
| 533 | + if (!event_waiters) |
---|
| 534 | + return NULL; |
---|
535 | 535 | |
---|
536 | | - for (i = 0; (event_waiters) && (i < num_events) ; i++) { |
---|
| 536 | + for (i = 0; i < num_events; i++) |
---|
537 | 537 | init_wait(&event_waiters[i].wait); |
---|
538 | | - event_waiters[i].activated = false; |
---|
539 | | - } |
---|
540 | 538 | |
---|
541 | 539 | return event_waiters; |
---|
542 | 540 | } |
---|
.. | .. |
---|
852 | 850 | |
---|
853 | 851 | if (type == KFD_EVENT_TYPE_MEMORY) { |
---|
854 | 852 | dev_warn(kfd_device, |
---|
855 | | - "Sending SIGSEGV to HSA Process with PID %d ", |
---|
856 | | - p->lead_thread->pid); |
---|
| 853 | + "Sending SIGSEGV to process %d (pasid 0x%x)", |
---|
| 854 | + p->lead_thread->pid, p->pasid); |
---|
857 | 855 | send_sig(SIGSEGV, p->lead_thread, 0); |
---|
858 | 856 | } |
---|
859 | 857 | |
---|
.. | .. |
---|
861 | 859 | if (send_signal) { |
---|
862 | 860 | if (send_sigterm) { |
---|
863 | 861 | dev_warn(kfd_device, |
---|
864 | | - "Sending SIGTERM to HSA Process with PID %d ", |
---|
865 | | - p->lead_thread->pid); |
---|
| 862 | + "Sending SIGTERM to process %d (pasid 0x%x)", |
---|
| 863 | + p->lead_thread->pid, p->pasid); |
---|
866 | 864 | send_sig(SIGTERM, p->lead_thread, 0); |
---|
867 | 865 | } else { |
---|
868 | 866 | dev_err(kfd_device, |
---|
869 | | - "HSA Process (PID %d) got unhandled exception", |
---|
870 | | - p->lead_thread->pid); |
---|
| 867 | + "Process %d (pasid 0x%x) got unhandled exception", |
---|
| 868 | + p->lead_thread->pid, p->pasid); |
---|
871 | 869 | } |
---|
872 | 870 | } |
---|
873 | 871 | } |
---|
874 | 872 | |
---|
875 | 873 | #ifdef KFD_SUPPORT_IOMMU_V2 |
---|
876 | | -void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, |
---|
| 874 | +void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid, |
---|
877 | 875 | unsigned long address, bool is_write_requested, |
---|
878 | 876 | bool is_execute_requested) |
---|
879 | 877 | { |
---|
.. | .. |
---|
902 | 900 | |
---|
903 | 901 | memset(&memory_exception_data, 0, sizeof(memory_exception_data)); |
---|
904 | 902 | |
---|
905 | | - down_read(&mm->mmap_sem); |
---|
| 903 | + mmap_read_lock(mm); |
---|
906 | 904 | vma = find_vma(mm, address); |
---|
907 | 905 | |
---|
908 | 906 | memory_exception_data.gpu_id = dev->id; |
---|
.. | .. |
---|
925 | 923 | memory_exception_data.failure.NoExecute = 0; |
---|
926 | 924 | } |
---|
927 | 925 | |
---|
928 | | - up_read(&mm->mmap_sem); |
---|
| 926 | + mmap_read_unlock(mm); |
---|
929 | 927 | mmput(mm); |
---|
930 | 928 | |
---|
931 | 929 | pr_debug("notpresent %d, noexecute %d, readonly %d\n", |
---|
.. | .. |
---|
936 | 934 | /* Workaround on Raven to not kill the process when memory is freed |
---|
937 | 935 | * before IOMMU is able to finish processing all the excessive PPRs |
---|
938 | 936 | */ |
---|
939 | | - if (dev->device_info->asic_family != CHIP_RAVEN) { |
---|
| 937 | + if (dev->device_info->asic_family != CHIP_RAVEN && |
---|
| 938 | + dev->device_info->asic_family != CHIP_RENOIR) { |
---|
940 | 939 | mutex_lock(&p->event_mutex); |
---|
941 | 940 | |
---|
942 | 941 | /* Lookup events by type and signal them */ |
---|
.. | .. |
---|
950 | 949 | } |
---|
951 | 950 | #endif /* KFD_SUPPORT_IOMMU_V2 */ |
---|
952 | 951 | |
---|
953 | | -void kfd_signal_hw_exception_event(unsigned int pasid) |
---|
| 952 | +void kfd_signal_hw_exception_event(u32 pasid) |
---|
954 | 953 | { |
---|
955 | 954 | /* |
---|
956 | 955 | * Because we are called from arbitrary context (workqueue) as opposed |
---|
.. | .. |
---|
971 | 970 | kfd_unref_process(p); |
---|
972 | 971 | } |
---|
973 | 972 | |
---|
974 | | -void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, |
---|
| 973 | +void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid, |
---|
975 | 974 | struct kfd_vm_fault_info *info) |
---|
976 | 975 | { |
---|
977 | 976 | struct kfd_event *ev; |
---|
.. | .. |
---|
983 | 982 | return; /* Presumably process exited. */ |
---|
984 | 983 | memset(&memory_exception_data, 0, sizeof(memory_exception_data)); |
---|
985 | 984 | memory_exception_data.gpu_id = dev->id; |
---|
986 | | - memory_exception_data.failure.imprecise = 1; |
---|
| 985 | + memory_exception_data.failure.imprecise = true; |
---|
987 | 986 | /* Set failure reason */ |
---|
988 | 987 | if (info) { |
---|
989 | 988 | memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; |
---|
.. | .. |
---|
1011 | 1010 | void kfd_signal_reset_event(struct kfd_dev *dev) |
---|
1012 | 1011 | { |
---|
1013 | 1012 | struct kfd_hsa_hw_exception_data hw_exception_data; |
---|
| 1013 | + struct kfd_hsa_memory_exception_data memory_exception_data; |
---|
1014 | 1014 | struct kfd_process *p; |
---|
1015 | 1015 | struct kfd_event *ev; |
---|
1016 | 1016 | unsigned int temp; |
---|
1017 | 1017 | uint32_t id, idx; |
---|
| 1018 | + int reset_cause = atomic_read(&dev->sram_ecc_flag) ? |
---|
| 1019 | + KFD_HW_EXCEPTION_ECC : |
---|
| 1020 | + KFD_HW_EXCEPTION_GPU_HANG; |
---|
1018 | 1021 | |
---|
1019 | 1022 | /* Whole gpu reset caused by GPU hang and memory is lost */ |
---|
1020 | 1023 | memset(&hw_exception_data, 0, sizeof(hw_exception_data)); |
---|
1021 | 1024 | hw_exception_data.gpu_id = dev->id; |
---|
1022 | 1025 | hw_exception_data.memory_lost = 1; |
---|
| 1026 | + hw_exception_data.reset_cause = reset_cause; |
---|
| 1027 | + |
---|
| 1028 | + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); |
---|
| 1029 | + memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC; |
---|
| 1030 | + memory_exception_data.gpu_id = dev->id; |
---|
| 1031 | + memory_exception_data.failure.imprecise = true; |
---|
1023 | 1032 | |
---|
1024 | 1033 | idx = srcu_read_lock(&kfd_processes_srcu); |
---|
1025 | 1034 | hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { |
---|
1026 | 1035 | mutex_lock(&p->event_mutex); |
---|
1027 | 1036 | id = KFD_FIRST_NONSIGNAL_EVENT_ID; |
---|
1028 | | - idr_for_each_entry_continue(&p->event_idr, ev, id) |
---|
| 1037 | + idr_for_each_entry_continue(&p->event_idr, ev, id) { |
---|
1029 | 1038 | if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { |
---|
1030 | 1039 | ev->hw_exception_data = hw_exception_data; |
---|
1031 | 1040 | set_event(ev); |
---|
1032 | 1041 | } |
---|
| 1042 | + if (ev->type == KFD_EVENT_TYPE_MEMORY && |
---|
| 1043 | + reset_cause == KFD_HW_EXCEPTION_ECC) { |
---|
| 1044 | + ev->memory_exception_data = memory_exception_data; |
---|
| 1045 | + set_event(ev); |
---|
| 1046 | + } |
---|
| 1047 | + } |
---|
1033 | 1048 | mutex_unlock(&p->event_mutex); |
---|
1034 | 1049 | } |
---|
1035 | 1050 | srcu_read_unlock(&kfd_processes_srcu, idx); |
---|