| .. | .. |
|---|
| 187 | 187 | if (p->signal_mapped_size && |
|---|
| 188 | 188 | p->signal_event_count == p->signal_mapped_size / 8) { |
|---|
| 189 | 189 | if (!p->signal_event_limit_reached) { |
|---|
| 190 | | - pr_warn("Signal event wasn't created because limit was reached\n"); |
|---|
| 190 | + pr_debug("Signal event wasn't created because limit was reached\n"); |
|---|
| 191 | 191 | p->signal_event_limit_reached = true; |
|---|
| 192 | 192 | } |
|---|
| 193 | 193 | return -ENOSPC; |
|---|
| .. | .. |
|---|
| 346 | 346 | ret = create_signal_event(devkfd, p, ev); |
|---|
| 347 | 347 | if (!ret) { |
|---|
| 348 | 348 | *event_page_offset = KFD_MMAP_TYPE_EVENTS; |
|---|
| 349 | | - *event_page_offset <<= PAGE_SHIFT; |
|---|
| 350 | 349 | *event_slot_index = ev->event_id; |
|---|
| 351 | 350 | } |
|---|
| 352 | 351 | break; |
|---|
| .. | .. |
|---|
| 461 | 460 | } |
|---|
| 462 | 461 | } |
|---|
| 463 | 462 | |
|---|
| 464 | | -void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, |
|---|
| 463 | +void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, |
|---|
| 465 | 464 | uint32_t valid_id_bits) |
|---|
| 466 | 465 | { |
|---|
| 467 | 466 | struct kfd_event *ev = NULL; |
|---|
| .. | .. |
|---|
| 532 | 531 | event_waiters = kmalloc_array(num_events, |
|---|
| 533 | 532 | sizeof(struct kfd_event_waiter), |
|---|
| 534 | 533 | GFP_KERNEL); |
|---|
| 534 | + if (!event_waiters) |
|---|
| 535 | + return NULL; |
|---|
| 535 | 536 | |
|---|
| 536 | 537 | for (i = 0; (event_waiters) && (i < num_events) ; i++) { |
|---|
| 537 | 538 | init_wait(&event_waiters[i].wait); |
|---|
| .. | .. |
|---|
| 852 | 853 | |
|---|
| 853 | 854 | if (type == KFD_EVENT_TYPE_MEMORY) { |
|---|
| 854 | 855 | dev_warn(kfd_device, |
|---|
| 855 | | - "Sending SIGSEGV to HSA Process with PID %d ", |
|---|
| 856 | | - p->lead_thread->pid); |
|---|
| 856 | + "Sending SIGSEGV to process %d (pasid 0x%x)", |
|---|
| 857 | + p->lead_thread->pid, p->pasid); |
|---|
| 857 | 858 | send_sig(SIGSEGV, p->lead_thread, 0); |
|---|
| 858 | 859 | } |
|---|
| 859 | 860 | |
|---|
| .. | .. |
|---|
| 861 | 862 | if (send_signal) { |
|---|
| 862 | 863 | if (send_sigterm) { |
|---|
| 863 | 864 | dev_warn(kfd_device, |
|---|
| 864 | | - "Sending SIGTERM to HSA Process with PID %d ", |
|---|
| 865 | | - p->lead_thread->pid); |
|---|
| 865 | + "Sending SIGTERM to process %d (pasid 0x%x)", |
|---|
| 866 | + p->lead_thread->pid, p->pasid); |
|---|
| 866 | 867 | send_sig(SIGTERM, p->lead_thread, 0); |
|---|
| 867 | 868 | } else { |
|---|
| 868 | 869 | dev_err(kfd_device, |
|---|
| 869 | | - "HSA Process (PID %d) got unhandled exception", |
|---|
| 870 | | - p->lead_thread->pid); |
|---|
| 870 | + "Process %d (pasid 0x%x) got unhandled exception", |
|---|
| 871 | + p->lead_thread->pid, p->pasid); |
|---|
| 871 | 872 | } |
|---|
| 872 | 873 | } |
|---|
| 873 | 874 | } |
|---|
| 874 | 875 | |
|---|
| 875 | 876 | #ifdef KFD_SUPPORT_IOMMU_V2 |
|---|
| 876 | | -void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, |
|---|
| 877 | +void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid, |
|---|
| 877 | 878 | unsigned long address, bool is_write_requested, |
|---|
| 878 | 879 | bool is_execute_requested) |
|---|
| 879 | 880 | { |
|---|
| .. | .. |
|---|
| 902 | 903 | |
|---|
| 903 | 904 | memset(&memory_exception_data, 0, sizeof(memory_exception_data)); |
|---|
| 904 | 905 | |
|---|
| 905 | | - down_read(&mm->mmap_sem); |
|---|
| 906 | + mmap_read_lock(mm); |
|---|
| 906 | 907 | vma = find_vma(mm, address); |
|---|
| 907 | 908 | |
|---|
| 908 | 909 | memory_exception_data.gpu_id = dev->id; |
|---|
| .. | .. |
|---|
| 925 | 926 | memory_exception_data.failure.NoExecute = 0; |
|---|
| 926 | 927 | } |
|---|
| 927 | 928 | |
|---|
| 928 | | - up_read(&mm->mmap_sem); |
|---|
| 929 | + mmap_read_unlock(mm); |
|---|
| 929 | 930 | mmput(mm); |
|---|
| 930 | 931 | |
|---|
| 931 | 932 | pr_debug("notpresent %d, noexecute %d, readonly %d\n", |
|---|
| .. | .. |
|---|
| 936 | 937 | /* Workaround on Raven to not kill the process when memory is freed |
|---|
| 937 | 938 | * before IOMMU is able to finish processing all the excessive PPRs |
|---|
| 938 | 939 | */ |
|---|
| 939 | | - if (dev->device_info->asic_family != CHIP_RAVEN) { |
|---|
| 940 | + if (dev->device_info->asic_family != CHIP_RAVEN && |
|---|
| 941 | + dev->device_info->asic_family != CHIP_RENOIR) { |
|---|
| 940 | 942 | mutex_lock(&p->event_mutex); |
|---|
| 941 | 943 | |
|---|
| 942 | 944 | /* Lookup events by type and signal them */ |
|---|
| .. | .. |
|---|
| 950 | 952 | } |
|---|
| 951 | 953 | #endif /* KFD_SUPPORT_IOMMU_V2 */ |
|---|
| 952 | 954 | |
|---|
| 953 | | -void kfd_signal_hw_exception_event(unsigned int pasid) |
|---|
| 955 | +void kfd_signal_hw_exception_event(u32 pasid) |
|---|
| 954 | 956 | { |
|---|
| 955 | 957 | /* |
|---|
| 956 | 958 | * Because we are called from arbitrary context (workqueue) as opposed |
|---|
| .. | .. |
|---|
| 971 | 973 | kfd_unref_process(p); |
|---|
| 972 | 974 | } |
|---|
| 973 | 975 | |
|---|
| 974 | | -void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, |
|---|
| 976 | +void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid, |
|---|
| 975 | 977 | struct kfd_vm_fault_info *info) |
|---|
| 976 | 978 | { |
|---|
| 977 | 979 | struct kfd_event *ev; |
|---|
| .. | .. |
|---|
| 983 | 985 | return; /* Presumably process exited. */ |
|---|
| 984 | 986 | memset(&memory_exception_data, 0, sizeof(memory_exception_data)); |
|---|
| 985 | 987 | memory_exception_data.gpu_id = dev->id; |
|---|
| 986 | | - memory_exception_data.failure.imprecise = 1; |
|---|
| 988 | + memory_exception_data.failure.imprecise = true; |
|---|
| 987 | 989 | /* Set failure reason */ |
|---|
| 988 | 990 | if (info) { |
|---|
| 989 | 991 | memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; |
|---|
| .. | .. |
|---|
| 1011 | 1013 | void kfd_signal_reset_event(struct kfd_dev *dev) |
|---|
| 1012 | 1014 | { |
|---|
| 1013 | 1015 | struct kfd_hsa_hw_exception_data hw_exception_data; |
|---|
| 1016 | + struct kfd_hsa_memory_exception_data memory_exception_data; |
|---|
| 1014 | 1017 | struct kfd_process *p; |
|---|
| 1015 | 1018 | struct kfd_event *ev; |
|---|
| 1016 | 1019 | unsigned int temp; |
|---|
| 1017 | 1020 | uint32_t id, idx; |
|---|
| 1021 | + int reset_cause = atomic_read(&dev->sram_ecc_flag) ? |
|---|
| 1022 | + KFD_HW_EXCEPTION_ECC : |
|---|
| 1023 | + KFD_HW_EXCEPTION_GPU_HANG; |
|---|
| 1018 | 1024 | |
|---|
| 1019 | 1025 | /* Whole gpu reset caused by GPU hang and memory is lost */ |
|---|
| 1020 | 1026 | memset(&hw_exception_data, 0, sizeof(hw_exception_data)); |
|---|
| 1021 | 1027 | hw_exception_data.gpu_id = dev->id; |
|---|
| 1022 | 1028 | hw_exception_data.memory_lost = 1; |
|---|
| 1029 | + hw_exception_data.reset_cause = reset_cause; |
|---|
| 1030 | + |
|---|
| 1031 | + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); |
|---|
| 1032 | + memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC; |
|---|
| 1033 | + memory_exception_data.gpu_id = dev->id; |
|---|
| 1034 | + memory_exception_data.failure.imprecise = true; |
|---|
| 1023 | 1035 | |
|---|
| 1024 | 1036 | idx = srcu_read_lock(&kfd_processes_srcu); |
|---|
| 1025 | 1037 | hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { |
|---|
| 1026 | 1038 | mutex_lock(&p->event_mutex); |
|---|
| 1027 | 1039 | id = KFD_FIRST_NONSIGNAL_EVENT_ID; |
|---|
| 1028 | | - idr_for_each_entry_continue(&p->event_idr, ev, id) |
|---|
| 1040 | + idr_for_each_entry_continue(&p->event_idr, ev, id) { |
|---|
| 1029 | 1041 | if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { |
|---|
| 1030 | 1042 | ev->hw_exception_data = hw_exception_data; |
|---|
| 1031 | 1043 | set_event(ev); |
|---|
| 1032 | 1044 | } |
|---|
| 1045 | + if (ev->type == KFD_EVENT_TYPE_MEMORY && |
|---|
| 1046 | + reset_cause == KFD_HW_EXCEPTION_ECC) { |
|---|
| 1047 | + ev->memory_exception_data = memory_exception_data; |
|---|
| 1048 | + set_event(ev); |
|---|
| 1049 | + } |
|---|
| 1050 | + } |
|---|
| 1033 | 1051 | mutex_unlock(&p->event_mutex); |
|---|
| 1034 | 1052 | } |
|---|
| 1035 | 1053 | srcu_read_unlock(&kfd_processes_srcu, idx); |
|---|