| .. | .. |
|---|
| 187 | 187 | if (p->signal_mapped_size && |
|---|
| 188 | 188 | p->signal_event_count == p->signal_mapped_size / 8) { |
|---|
| 189 | 189 | if (!p->signal_event_limit_reached) { |
|---|
| 190 | | - pr_warn("Signal event wasn't created because limit was reached\n"); |
|---|
| 190 | + pr_debug("Signal event wasn't created because limit was reached\n"); |
|---|
| 191 | 191 | p->signal_event_limit_reached = true; |
|---|
| 192 | 192 | } |
|---|
| 193 | 193 | return -ENOSPC; |
|---|
| .. | .. |
|---|
| 346 | 346 | ret = create_signal_event(devkfd, p, ev); |
|---|
| 347 | 347 | if (!ret) { |
|---|
| 348 | 348 | *event_page_offset = KFD_MMAP_TYPE_EVENTS; |
|---|
| 349 | | - *event_page_offset <<= PAGE_SHIFT; |
|---|
| 350 | 349 | *event_slot_index = ev->event_id; |
|---|
| 351 | 350 | } |
|---|
| 352 | 351 | break; |
|---|
| .. | .. |
|---|
| 461 | 460 | } |
|---|
| 462 | 461 | } |
|---|
| 463 | 462 | |
|---|
| 464 | | -void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, |
|---|
| 463 | +void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, |
|---|
| 465 | 464 | uint32_t valid_id_bits) |
|---|
| 466 | 465 | { |
|---|
| 467 | 466 | struct kfd_event *ev = NULL; |
|---|
| .. | .. |
|---|
| 529 | 528 | struct kfd_event_waiter *event_waiters; |
|---|
| 530 | 529 | uint32_t i; |
|---|
| 531 | 530 | |
|---|
| 532 | | - event_waiters = kmalloc_array(num_events, |
|---|
| 533 | | - sizeof(struct kfd_event_waiter), |
|---|
| 534 | | - GFP_KERNEL); |
|---|
| 531 | + event_waiters = kcalloc(num_events, sizeof(struct kfd_event_waiter), |
|---|
| 532 | + GFP_KERNEL); |
|---|
| 533 | + if (!event_waiters) |
|---|
| 534 | + return NULL; |
|---|
| 535 | 535 | |
|---|
| 536 | | - for (i = 0; (event_waiters) && (i < num_events) ; i++) { |
|---|
| 536 | + for (i = 0; i < num_events; i++) |
|---|
| 537 | 537 | init_wait(&event_waiters[i].wait); |
|---|
| 538 | | - event_waiters[i].activated = false; |
|---|
| 539 | | - } |
|---|
| 540 | 538 | |
|---|
| 541 | 539 | return event_waiters; |
|---|
| 542 | 540 | } |
|---|
| .. | .. |
|---|
| 852 | 850 | |
|---|
| 853 | 851 | if (type == KFD_EVENT_TYPE_MEMORY) { |
|---|
| 854 | 852 | dev_warn(kfd_device, |
|---|
| 855 | | - "Sending SIGSEGV to HSA Process with PID %d ", |
|---|
| 856 | | - p->lead_thread->pid); |
|---|
| 853 | + "Sending SIGSEGV to process %d (pasid 0x%x)", |
|---|
| 854 | + p->lead_thread->pid, p->pasid); |
|---|
| 857 | 855 | send_sig(SIGSEGV, p->lead_thread, 0); |
|---|
| 858 | 856 | } |
|---|
| 859 | 857 | |
|---|
| .. | .. |
|---|
| 861 | 859 | if (send_signal) { |
|---|
| 862 | 860 | if (send_sigterm) { |
|---|
| 863 | 861 | dev_warn(kfd_device, |
|---|
| 864 | | - "Sending SIGTERM to HSA Process with PID %d ", |
|---|
| 865 | | - p->lead_thread->pid); |
|---|
| 862 | + "Sending SIGTERM to process %d (pasid 0x%x)", |
|---|
| 863 | + p->lead_thread->pid, p->pasid); |
|---|
| 866 | 864 | send_sig(SIGTERM, p->lead_thread, 0); |
|---|
| 867 | 865 | } else { |
|---|
| 868 | 866 | dev_err(kfd_device, |
|---|
| 869 | | - "HSA Process (PID %d) got unhandled exception", |
|---|
| 870 | | - p->lead_thread->pid); |
|---|
| 867 | + "Process %d (pasid 0x%x) got unhandled exception", |
|---|
| 868 | + p->lead_thread->pid, p->pasid); |
|---|
| 871 | 869 | } |
|---|
| 872 | 870 | } |
|---|
| 873 | 871 | } |
|---|
| 874 | 872 | |
|---|
| 875 | 873 | #ifdef KFD_SUPPORT_IOMMU_V2 |
|---|
| 876 | | -void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, |
|---|
| 874 | +void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid, |
|---|
| 877 | 875 | unsigned long address, bool is_write_requested, |
|---|
| 878 | 876 | bool is_execute_requested) |
|---|
| 879 | 877 | { |
|---|
| .. | .. |
|---|
| 902 | 900 | |
|---|
| 903 | 901 | memset(&memory_exception_data, 0, sizeof(memory_exception_data)); |
|---|
| 904 | 902 | |
|---|
| 905 | | - down_read(&mm->mmap_sem); |
|---|
| 903 | + mmap_read_lock(mm); |
|---|
| 906 | 904 | vma = find_vma(mm, address); |
|---|
| 907 | 905 | |
|---|
| 908 | 906 | memory_exception_data.gpu_id = dev->id; |
|---|
| .. | .. |
|---|
| 925 | 923 | memory_exception_data.failure.NoExecute = 0; |
|---|
| 926 | 924 | } |
|---|
| 927 | 925 | |
|---|
| 928 | | - up_read(&mm->mmap_sem); |
|---|
| 926 | + mmap_read_unlock(mm); |
|---|
| 929 | 927 | mmput(mm); |
|---|
| 930 | 928 | |
|---|
| 931 | 929 | pr_debug("notpresent %d, noexecute %d, readonly %d\n", |
|---|
| .. | .. |
|---|
| 936 | 934 | /* Workaround on Raven to not kill the process when memory is freed |
|---|
| 937 | 935 | * before IOMMU is able to finish processing all the excessive PPRs |
|---|
| 938 | 936 | */ |
|---|
| 939 | | - if (dev->device_info->asic_family != CHIP_RAVEN) { |
|---|
| 937 | + if (dev->device_info->asic_family != CHIP_RAVEN && |
|---|
| 938 | + dev->device_info->asic_family != CHIP_RENOIR) { |
|---|
| 940 | 939 | mutex_lock(&p->event_mutex); |
|---|
| 941 | 940 | |
|---|
| 942 | 941 | /* Lookup events by type and signal them */ |
|---|
| .. | .. |
|---|
| 950 | 949 | } |
|---|
| 951 | 950 | #endif /* KFD_SUPPORT_IOMMU_V2 */ |
|---|
| 952 | 951 | |
|---|
| 953 | | -void kfd_signal_hw_exception_event(unsigned int pasid) |
|---|
| 952 | +void kfd_signal_hw_exception_event(u32 pasid) |
|---|
| 954 | 953 | { |
|---|
| 955 | 954 | /* |
|---|
| 956 | 955 | * Because we are called from arbitrary context (workqueue) as opposed |
|---|
| .. | .. |
|---|
| 971 | 970 | kfd_unref_process(p); |
|---|
| 972 | 971 | } |
|---|
| 973 | 972 | |
|---|
| 974 | | -void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, |
|---|
| 973 | +void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid, |
|---|
| 975 | 974 | struct kfd_vm_fault_info *info) |
|---|
| 976 | 975 | { |
|---|
| 977 | 976 | struct kfd_event *ev; |
|---|
| .. | .. |
|---|
| 983 | 982 | return; /* Presumably process exited. */ |
|---|
| 984 | 983 | memset(&memory_exception_data, 0, sizeof(memory_exception_data)); |
|---|
| 985 | 984 | memory_exception_data.gpu_id = dev->id; |
|---|
| 986 | | - memory_exception_data.failure.imprecise = 1; |
|---|
| 985 | + memory_exception_data.failure.imprecise = true; |
|---|
| 987 | 986 | /* Set failure reason */ |
|---|
| 988 | 987 | if (info) { |
|---|
| 989 | 988 | memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; |
|---|
| .. | .. |
|---|
| 1011 | 1010 | void kfd_signal_reset_event(struct kfd_dev *dev) |
|---|
| 1012 | 1011 | { |
|---|
| 1013 | 1012 | struct kfd_hsa_hw_exception_data hw_exception_data; |
|---|
| 1013 | + struct kfd_hsa_memory_exception_data memory_exception_data; |
|---|
| 1014 | 1014 | struct kfd_process *p; |
|---|
| 1015 | 1015 | struct kfd_event *ev; |
|---|
| 1016 | 1016 | unsigned int temp; |
|---|
| 1017 | 1017 | uint32_t id, idx; |
|---|
| 1018 | + int reset_cause = atomic_read(&dev->sram_ecc_flag) ? |
|---|
| 1019 | + KFD_HW_EXCEPTION_ECC : |
|---|
| 1020 | + KFD_HW_EXCEPTION_GPU_HANG; |
|---|
| 1018 | 1021 | |
|---|
| 1019 | 1022 | /* Whole gpu reset caused by GPU hang and memory is lost */ |
|---|
| 1020 | 1023 | memset(&hw_exception_data, 0, sizeof(hw_exception_data)); |
|---|
| 1021 | 1024 | hw_exception_data.gpu_id = dev->id; |
|---|
| 1022 | 1025 | hw_exception_data.memory_lost = 1; |
|---|
| 1026 | + hw_exception_data.reset_cause = reset_cause; |
|---|
| 1027 | + |
|---|
| 1028 | + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); |
|---|
| 1029 | + memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC; |
|---|
| 1030 | + memory_exception_data.gpu_id = dev->id; |
|---|
| 1031 | + memory_exception_data.failure.imprecise = true; |
|---|
| 1023 | 1032 | |
|---|
| 1024 | 1033 | idx = srcu_read_lock(&kfd_processes_srcu); |
|---|
| 1025 | 1034 | hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { |
|---|
| 1026 | 1035 | mutex_lock(&p->event_mutex); |
|---|
| 1027 | 1036 | id = KFD_FIRST_NONSIGNAL_EVENT_ID; |
|---|
| 1028 | | - idr_for_each_entry_continue(&p->event_idr, ev, id) |
|---|
| 1037 | + idr_for_each_entry_continue(&p->event_idr, ev, id) { |
|---|
| 1029 | 1038 | if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { |
|---|
| 1030 | 1039 | ev->hw_exception_data = hw_exception_data; |
|---|
| 1031 | 1040 | set_event(ev); |
|---|
| 1032 | 1041 | } |
|---|
| 1042 | + if (ev->type == KFD_EVENT_TYPE_MEMORY && |
|---|
| 1043 | + reset_cause == KFD_HW_EXCEPTION_ECC) { |
|---|
| 1044 | + ev->memory_exception_data = memory_exception_data; |
|---|
| 1045 | + set_event(ev); |
|---|
| 1046 | + } |
|---|
| 1047 | + } |
|---|
| 1033 | 1048 | mutex_unlock(&p->event_mutex); |
|---|
| 1034 | 1049 | } |
|---|
| 1035 | 1050 | srcu_read_unlock(&kfd_processes_srcu, idx); |
|---|