From 151fecfb72a0d602dfe79790602ef64b4e241574 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 19 Feb 2024 01:51:07 +0000
Subject: [PATCH] export RK_PA3
---
kernel/arch/powerpc/platforms/powernv/npu-dma.c | 1167 +++++++++++++++++++++------------------------------------
1 files changed, 430 insertions(+), 737 deletions(-)
diff --git a/kernel/arch/powerpc/platforms/powernv/npu-dma.c b/kernel/arch/powerpc/platforms/powernv/npu-dma.c
index fd8166f..b711dc3 100644
--- a/kernel/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/kernel/arch/powerpc/platforms/powernv/npu-dma.c
@@ -1,58 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This file implements the DMA operations for NVLink devices. The NPU
* devices all point to the same iommu table as the parent PCI device.
*
* Copyright Alistair Popple, IBM Corporation 2015.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
-#include <linux/slab.h>
#include <linux/mmu_notifier.h>
#include <linux/mmu_context.h>
#include <linux/of.h>
-#include <linux/export.h>
#include <linux/pci.h>
#include <linux/memblock.h>
-#include <linux/iommu.h>
-#include <linux/debugfs.h>
+#include <linux/sizes.h>
#include <asm/debugfs.h>
-#include <asm/tlb.h>
#include <asm/powernv.h>
-#include <asm/reg.h>
-#include <asm/opal.h>
-#include <asm/io.h>
-#include <asm/iommu.h>
-#include <asm/pnv-pci.h>
-#include <asm/msi_bitmap.h>
+#include <asm/ppc-pci.h>
#include <asm/opal.h>
-#include "powernv.h"
#include "pci.h"
-#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
-
-/*
- * spinlock to protect initialisation of an npu_context for a particular
- * mm_struct.
- */
-static DEFINE_SPINLOCK(npu_context_lock);
-
-/*
- * When an address shootdown range exceeds this threshold we invalidate the
- * entire TLB on the GPU for the given PID rather than each specific address in
- * the range.
- */
-static uint64_t atsd_threshold = 2 * 1024 * 1024;
-static struct dentry *atsd_threshold_dentry;
-
-/*
- * Other types of TCE cache invalidation are not functional in the
- * hardware.
- */
static struct pci_dev *get_pci_dev(struct device_node *dn)
{
struct pci_dn *pdn = PCI_DN(dn);
@@ -123,63 +90,7 @@
}
EXPORT_SYMBOL(pnv_pci_get_npu_dev);
-#define NPU_DMA_OP_UNSUPPORTED() \
- dev_err_once(dev, "%s operation unsupported for NVLink devices\n", \
- __func__)
-
-static void *dma_npu_alloc(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag,
- unsigned long attrs)
-{
- NPU_DMA_OP_UNSUPPORTED();
- return NULL;
-}
-
-static void dma_npu_free(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle,
- unsigned long attrs)
-{
- NPU_DMA_OP_UNSUPPORTED();
-}
-
-static dma_addr_t dma_npu_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction direction,
- unsigned long attrs)
-{
- NPU_DMA_OP_UNSUPPORTED();
- return 0;
-}
-
-static int dma_npu_map_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction direction,
- unsigned long attrs)
-{
- NPU_DMA_OP_UNSUPPORTED();
- return 0;
-}
-
-static int dma_npu_dma_supported(struct device *dev, u64 mask)
-{
- NPU_DMA_OP_UNSUPPORTED();
- return 0;
-}
-
-static u64 dma_npu_get_required_mask(struct device *dev)
-{
- NPU_DMA_OP_UNSUPPORTED();
- return 0;
-}
-
-static const struct dma_map_ops dma_npu_ops = {
- .map_page = dma_npu_map_page,
- .map_sg = dma_npu_map_sg,
- .alloc = dma_npu_alloc,
- .free = dma_npu_free,
- .dma_supported = dma_npu_dma_supported,
- .get_required_mask = dma_npu_get_required_mask,
-};
-
+#ifdef CONFIG_IOMMU_API
/*
* Returns the PE assoicated with the PCI device of the given
* NPU. Returns the linked pci device if pci_dev != NULL.
@@ -211,15 +122,25 @@
return pe;
}
-long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
+static long pnv_npu_unset_window(struct iommu_table_group *table_group,
+ int num);
+
+static long pnv_npu_set_window(struct iommu_table_group *table_group, int num,
struct iommu_table *tbl)
{
+ struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+ table_group);
struct pnv_phb *phb = npe->phb;
int64_t rc;
const unsigned long size = tbl->it_indirect_levels ?
tbl->it_level_size : tbl->it_size;
const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+ int num2 = (num == 0) ? 1 : 0;
+
+ /* NPU has just one TVE so if there is another table, remove it first */
+ if (npe->table_group.tables[num2])
+ pnv_npu_unset_window(&npe->table_group, num2);
pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
start_addr, start_addr + win_size - 1,
@@ -245,10 +166,15 @@
return 0;
}
-long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
+static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
{
+ struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+ table_group);
struct pnv_phb *phb = npe->phb;
int64_t rc;
+
+ if (!npe->table_group.tables[num])
+ return 0;
pe_info(npe, "Removing DMA window\n");
@@ -268,108 +194,14 @@
return 0;
}
-/*
- * Enables 32 bit DMA on NPU.
- */
-static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
-{
- struct pci_dev *gpdev;
- struct pnv_ioda_pe *gpe;
- int64_t rc;
-
- /*
- * Find the assoicated PCI devices and get the dma window
- * information from there.
- */
- if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
- return;
-
- gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
- if (!gpe)
- return;
-
- rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
-
- /*
- * We don't initialise npu_pe->tce32_table as we always use
- * dma_npu_ops which are nops.
- */
- set_dma_ops(&npe->pdev->dev, &dma_npu_ops);
-}
-
-/*
- * Enables bypass mode on the NPU. The NPU only supports one
- * window per link, so bypass needs to be explicitly enabled or
- * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
- * active at the same time.
- */
-static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
-{
- struct pnv_phb *phb = npe->phb;
- int64_t rc = 0;
- phys_addr_t top = memblock_end_of_DRAM();
-
- if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
- return -EINVAL;
-
- rc = pnv_npu_unset_window(npe, 0);
- if (rc != OPAL_SUCCESS)
- return rc;
-
- /* Enable the bypass window */
-
- top = roundup_pow_of_two(top);
- dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
- npe->pe_number);
- rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
- npe->pe_number, npe->pe_number,
- 0 /* bypass base */, top);
-
- if (rc == OPAL_SUCCESS)
- pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
- return rc;
-}
-
-void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
-{
- int i;
- struct pnv_phb *phb;
- struct pci_dn *pdn;
- struct pnv_ioda_pe *npe;
- struct pci_dev *npdev;
-
- for (i = 0; ; ++i) {
- npdev = pnv_pci_get_npu_dev(gpdev, i);
-
- if (!npdev)
- break;
-
- pdn = pci_get_pdn(npdev);
- if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
- return;
-
- phb = pci_bus_to_host(npdev->bus)->private_data;
-
- /* We only do bypass if it's enabled on the linked device */
- npe = &phb->ioda.pe_array[pdn->pe_number];
-
- if (bypass) {
- dev_info(&npdev->dev,
- "Using 64-bit DMA iommu bypass\n");
- pnv_npu_dma_set_bypass(npe);
- } else {
- dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
- pnv_npu_dma_set_32(npe);
- }
- }
-}
-
/* Switch ownership from platform code to external user (e.g. VFIO) */
-void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
+static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
{
+ struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+ table_group);
struct pnv_phb *phb = npe->phb;
int64_t rc;
+ struct pci_dev *gpdev = NULL;
/*
* Note: NPU has just a single TVE in the hardware which means that
@@ -378,7 +210,7 @@
* if it was enabled at the moment of ownership change.
*/
if (npe->table_group.tables[0]) {
- pnv_npu_unset_window(npe, 0);
+ pnv_npu_unset_window(&npe->table_group, 0);
return;
}
@@ -391,622 +223,483 @@
return;
}
pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
+
+ get_gpu_pci_dev_and_pe(npe, &gpdev);
+ if (gpdev)
+ pnv_npu2_unmap_lpar_dev(gpdev);
}
-struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
+static void pnv_npu_release_ownership(struct iommu_table_group *table_group)
{
- struct pnv_phb *phb = npe->phb;
- struct pci_bus *pbus = phb->hose->bus;
- struct pci_dev *npdev, *gpdev = NULL, *gptmp;
- struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
+ struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+ table_group);
+ struct pci_dev *gpdev = NULL;
- if (!gpe || !gpdev)
- return NULL;
-
- list_for_each_entry(npdev, &pbus->devices, bus_list) {
- gptmp = pnv_pci_get_gpu_dev(npdev);
-
- if (gptmp != gpdev)
- continue;
-
- pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
- iommu_group_add_device(gpe->table_group.group, &npdev->dev);
- }
-
- return gpe;
+ get_gpu_pci_dev_and_pe(npe, &gpdev);
+ if (gpdev)
+ pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV);
}
-/* Maximum number of nvlinks per npu */
-#define NV_MAX_LINKS 6
-
-/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
-static int max_npu2_index;
-
-struct npu_context {
- struct mm_struct *mm;
- struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
- struct mmu_notifier mn;
- struct kref kref;
- bool nmmu_flush;
-
- /* Callback to stop translation requests on a given GPU */
- void (*release_cb)(struct npu_context *context, void *priv);
-
- /*
- * Private pointer passed to the above callback for usage by
- * device drivers.
- */
- void *priv;
+static struct iommu_table_group_ops pnv_pci_npu_ops = {
+ .set_window = pnv_npu_set_window,
+ .unset_window = pnv_npu_unset_window,
+ .take_ownership = pnv_npu_take_ownership,
+ .release_ownership = pnv_npu_release_ownership,
};
-
-struct mmio_atsd_reg {
- struct npu *npu;
- int reg;
-};
+#endif /* !CONFIG_IOMMU_API */
/*
- * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
- * if none are available.
+ * NPU2 ATS
*/
-static int get_mmio_atsd_reg(struct npu *npu)
-{
- int i;
-
- for (i = 0; i < npu->mmio_atsd_count; i++) {
- if (!test_bit(i, &npu->mmio_atsd_usage))
- if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
- return i;
- }
-
- return -ENOSPC;
-}
-
-static void put_mmio_atsd_reg(struct npu *npu, int reg)
-{
- clear_bit_unlock(reg, &npu->mmio_atsd_usage);
-}
-
-/* MMIO ATSD register offsets */
-#define XTS_ATSD_AVA 1
-#define XTS_ATSD_STAT 2
-
-static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
- unsigned long launch, unsigned long va)
-{
- struct npu *npu = mmio_atsd_reg->npu;
- int reg = mmio_atsd_reg->reg;
-
- __raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
- eieio();
- __raw_writeq_be(launch, npu->mmio_atsd_regs[reg]);
-}
-
-static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
- unsigned long pid, bool flush)
-{
- int i;
- unsigned long launch;
-
- for (i = 0; i <= max_npu2_index; i++) {
- if (mmio_atsd_reg[i].reg < 0)
- continue;
-
- /* IS set to invalidate matching PID */
- launch = PPC_BIT(12);
-
- /* PRS set to process-scoped */
- launch |= PPC_BIT(13);
-
- /* AP */
- launch |= (u64)
- mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
-
- /* PID */
- launch |= pid << PPC_BITLSHIFT(38);
-
- /* No flush */
- launch |= !flush << PPC_BITLSHIFT(39);
-
- /* Invalidating the entire process doesn't use a va */
- mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
- }
-}
-
-static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
- unsigned long va, unsigned long pid, bool flush)
-{
- int i;
- unsigned long launch;
-
- for (i = 0; i <= max_npu2_index; i++) {
- if (mmio_atsd_reg[i].reg < 0)
- continue;
-
- /* IS set to invalidate target VA */
- launch = 0;
-
- /* PRS set to process scoped */
- launch |= PPC_BIT(13);
-
- /* AP */
- launch |= (u64)
- mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
-
- /* PID */
- launch |= pid << PPC_BITLSHIFT(38);
-
- /* No flush */
- launch |= !flush << PPC_BITLSHIFT(39);
-
- mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
- }
-}
-
-#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
-
-static void mmio_invalidate_wait(
- struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
-{
- struct npu *npu;
- int i, reg;
-
- /* Wait for all invalidations to complete */
- for (i = 0; i <= max_npu2_index; i++) {
- if (mmio_atsd_reg[i].reg < 0)
- continue;
-
- /* Wait for completion */
- npu = mmio_atsd_reg[i].npu;
- reg = mmio_atsd_reg[i].reg;
- while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
- cpu_relax();
- }
-}
+/* Maximum possible number of ATSD MMIO registers per NPU */
+#define NV_NMMU_ATSD_REGS 8
+#define NV_NPU_MAX_PE_NUM 16
/*
- * Acquires all the address translation shootdown (ATSD) registers required to
- * launch an ATSD on all links this npu_context is active on.
+ * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
+ * up to 3 x (GPU + 2xNPUs) (POWER9).
*/
-static void acquire_atsd_reg(struct npu_context *npu_context,
- struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
+struct npu_comp {
+ struct iommu_table_group table_group;
+ int pe_num;
+ struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
+};
+
+/* An NPU descriptor, valid for POWER9 only */
+struct npu {
+ int index;
+ struct npu_comp npucomp;
+};
+
+#ifdef CONFIG_IOMMU_API
+static long pnv_npu_peers_create_table_userspace(
+ struct iommu_table_group *table_group,
+ int num, __u32 page_shift, __u64 window_size, __u32 levels,
+ struct iommu_table **ptbl)
+{
+ struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+ table_group);
+
+ if (!npucomp->pe_num || !npucomp->pe[0] ||
+ !npucomp->pe[0]->table_group.ops ||
+ !npucomp->pe[0]->table_group.ops->create_table)
+ return -EFAULT;
+
+ return npucomp->pe[0]->table_group.ops->create_table(
+ &npucomp->pe[0]->table_group, num, page_shift,
+ window_size, levels, ptbl);
+}
+
+static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
+ int num, struct iommu_table *tbl)
{
int i, j;
- struct npu *npu;
- struct pci_dev *npdev;
- struct pnv_phb *nphb;
+ long ret = 0;
+ struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+ table_group);
- for (i = 0; i <= max_npu2_index; i++) {
- mmio_atsd_reg[i].reg = -1;
- for (j = 0; j < NV_MAX_LINKS; j++) {
- /*
- * There are no ordering requirements with respect to
- * the setup of struct npu_context, but to ensure
- * consistent behaviour we need to ensure npdev[][] is
- * only read once.
- */
- npdev = READ_ONCE(npu_context->npdev[i][j]);
- if (!npdev)
+ for (i = 0; i < npucomp->pe_num; ++i) {
+ struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+ if (!pe->table_group.ops->set_window)
+ continue;
+
+ ret = pe->table_group.ops->set_window(&pe->table_group,
+ num, tbl);
+ if (ret)
+ break;
+ }
+
+ if (ret) {
+ for (j = 0; j < i; ++j) {
+ struct pnv_ioda_pe *pe = npucomp->pe[j];
+
+ if (!pe->table_group.ops->unset_window)
continue;
- nphb = pci_bus_to_host(npdev->bus)->private_data;
- npu = &nphb->npu;
- mmio_atsd_reg[i].npu = npu;
- mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
- while (mmio_atsd_reg[i].reg < 0) {
- mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
- cpu_relax();
- }
- break;
+ ret = pe->table_group.ops->unset_window(
+ &pe->table_group, num);
+ if (ret)
+ break;
}
+ } else {
+ table_group->tables[num] = iommu_tce_table_get(tbl);
}
+
+ return ret;
}
-/*
- * Release previously acquired ATSD registers. To avoid deadlocks the registers
- * must be released in the same order they were acquired above in
- * acquire_atsd_reg.
- */
-static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
+static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group,
+ int num)
+{
+ int i, j;
+ long ret = 0;
+ struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+ table_group);
+
+ for (i = 0; i < npucomp->pe_num; ++i) {
+ struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+ WARN_ON(npucomp->table_group.tables[num] !=
+ table_group->tables[num]);
+ if (!npucomp->table_group.tables[num])
+ continue;
+
+ if (!pe->table_group.ops->unset_window)
+ continue;
+
+ ret = pe->table_group.ops->unset_window(&pe->table_group, num);
+ if (ret)
+ break;
+ }
+
+ if (ret) {
+ for (j = 0; j < i; ++j) {
+ struct pnv_ioda_pe *pe = npucomp->pe[j];
+
+ if (!npucomp->table_group.tables[num])
+ continue;
+
+ if (!pe->table_group.ops->set_window)
+ continue;
+
+ ret = pe->table_group.ops->set_window(&pe->table_group,
+ num, table_group->tables[num]);
+ if (ret)
+ break;
+ }
+ } else if (table_group->tables[num]) {
+ iommu_tce_table_put(table_group->tables[num]);
+ table_group->tables[num] = NULL;
+ }
+
+ return ret;
+}
+
+static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group)
{
int i;
+ struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+ table_group);
- for (i = 0; i <= max_npu2_index; i++) {
- /*
- * We can't rely on npu_context->npdev[][] being the same here
- * as when acquire_atsd_reg() was called, hence we use the
- * values stored in mmio_atsd_reg during the acquire phase
- * rather than re-reading npdev[][].
- */
- if (mmio_atsd_reg[i].reg < 0)
+ for (i = 0; i < npucomp->pe_num; ++i) {
+ struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+ if (!pe->table_group.ops ||
+ !pe->table_group.ops->take_ownership)
continue;
-
- put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
+ pe->table_group.ops->take_ownership(&pe->table_group);
}
}
-/*
- * Invalidate either a single address or an entire PID depending on
- * the value of va.
- */
-static void mmio_invalidate(struct npu_context *npu_context, int va,
- unsigned long address, bool flush)
+static void pnv_npu_peers_release_ownership(
+ struct iommu_table_group *table_group)
{
- struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
- unsigned long pid = npu_context->mm->context.id;
+ int i;
+ struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+ table_group);
- if (npu_context->nmmu_flush)
- /*
- * Unfortunately the nest mmu does not support flushing specific
- * addresses so we have to flush the whole mm once before
- * shooting down the GPU translation.
- */
- flush_all_mm(npu_context->mm);
+ for (i = 0; i < npucomp->pe_num; ++i) {
+ struct pnv_ioda_pe *pe = npucomp->pe[i];
- /*
- * Loop over all the NPUs this process is active on and launch
- * an invalidate.
- */
- acquire_atsd_reg(npu_context, mmio_atsd_reg);
- if (va)
- mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
- else
- mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
-
- mmio_invalidate_wait(mmio_atsd_reg);
- if (flush) {
- /*
- * The GPU requires two flush ATSDs to ensure all entries have
- * been flushed. We use PID 0 as it will never be used for a
- * process on the GPU.
- */
- mmio_invalidate_pid(mmio_atsd_reg, 0, true);
- mmio_invalidate_wait(mmio_atsd_reg);
- mmio_invalidate_pid(mmio_atsd_reg, 0, true);
- mmio_invalidate_wait(mmio_atsd_reg);
- }
- release_atsd_reg(mmio_atsd_reg);
-}
-
-static void pnv_npu2_mn_release(struct mmu_notifier *mn,
- struct mm_struct *mm)
-{
- struct npu_context *npu_context = mn_to_npu_context(mn);
-
- /* Call into device driver to stop requests to the NMMU */
- if (npu_context->release_cb)
- npu_context->release_cb(npu_context, npu_context->priv);
-
- /*
- * There should be no more translation requests for this PID, but we
- * need to ensure any entries for it are removed from the TLB.
- */
- mmio_invalidate(npu_context, 0, 0, true);
-}
-
-static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long address,
- pte_t pte)
-{
- struct npu_context *npu_context = mn_to_npu_context(mn);
-
- mmio_invalidate(npu_context, 1, address, true);
-}
-
-static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- struct npu_context *npu_context = mn_to_npu_context(mn);
- unsigned long address;
-
- if (end - start > atsd_threshold) {
- /*
- * Just invalidate the entire PID if the address range is too
- * large.
- */
- mmio_invalidate(npu_context, 0, 0, true);
- } else {
- for (address = start; address < end; address += PAGE_SIZE)
- mmio_invalidate(npu_context, 1, address, false);
-
- /* Do the flush only on the final addess == end */
- mmio_invalidate(npu_context, 1, address, true);
+ if (!pe->table_group.ops ||
+ !pe->table_group.ops->release_ownership)
+ continue;
+ pe->table_group.ops->release_ownership(&pe->table_group);
}
}
-static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
- .release = pnv_npu2_mn_release,
- .change_pte = pnv_npu2_mn_change_pte,
- .invalidate_range = pnv_npu2_mn_invalidate_range,
+static struct iommu_table_group_ops pnv_npu_peers_ops = {
+ .get_table_size = pnv_pci_ioda2_get_table_size,
+ .create_table = pnv_npu_peers_create_table_userspace,
+ .set_window = pnv_npu_peers_set_window,
+ .unset_window = pnv_npu_peers_unset_window,
+ .take_ownership = pnv_npu_peers_take_ownership,
+ .release_ownership = pnv_npu_peers_release_ownership,
};
-/*
- * Call into OPAL to setup the nmmu context for the current task in
- * the NPU. This must be called to setup the context tables before the
- * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
- *
- * A release callback should be registered to allow a device driver to
- * be notified that it should not launch any new translation requests
- * as the final TLB invalidate is about to occur.
- *
- * Returns an error if there no contexts are currently available or a
- * npu_context which should be passed to pnv_npu2_handle_fault().
- *
- * mmap_sem must be held in write mode and must not be called from interrupt
- * context.
- */
-struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
- unsigned long flags,
- void (*cb)(struct npu_context *, void *),
- void *priv)
+static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
+ struct pnv_ioda_pe *pe)
{
- int rc;
- u32 nvlink_index;
- struct device_node *nvlink_dn;
- struct mm_struct *mm = current->mm;
- struct pnv_phb *nphb;
- struct npu *npu;
- struct npu_context *npu_context;
+ if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
+ return;
- /*
- * At present we don't support GPUs connected to multiple NPUs and I'm
- * not sure the hardware does either.
- */
- struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+ npucomp->pe[npucomp->pe_num] = pe;
+ ++npucomp->pe_num;
+}
- if (!firmware_has_feature(FW_FEATURE_OPAL))
- return ERR_PTR(-ENODEV);
+static struct iommu_table_group *
+ pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
+{
+ struct iommu_table_group *compound_group;
+ struct npu_comp *npucomp;
+ struct pci_dev *gpdev = NULL;
+ struct pci_controller *hose;
+ struct pci_dev *npdev = NULL;
+
+ list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
+ npdev = pnv_pci_get_npu_dev(gpdev, 0);
+ if (npdev)
+ break;
+ }
if (!npdev)
- /* No nvlink associated with this GPU device */
- return ERR_PTR(-ENODEV);
+ /* It is not an NPU attached device, skip */
+ return NULL;
- nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
- if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
- &nvlink_index)))
- return ERR_PTR(-ENODEV);
+ hose = pci_bus_to_host(npdev->bus);
- if (!mm || mm->context.id == 0) {
- /*
- * Kernel thread contexts are not supported and context id 0 is
- * reserved on the GPU.
- */
- return ERR_PTR(-EINVAL);
+ if (hose->npu) {
+ /* P9 case: compound group is per-NPU (all gpus, all links) */
+ npucomp = &hose->npu->npucomp;
+ } else {
+ /* P8 case: Compound group is per-GPU (1 gpu, 2 links) */
+ npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL);
}
- nphb = pci_bus_to_host(npdev->bus)->private_data;
- npu = &nphb->npu;
+ compound_group = &npucomp->table_group;
+ if (!compound_group->group) {
+ compound_group->ops = &pnv_npu_peers_ops;
+ iommu_register_group(compound_group, hose->global_number,
+ pe->pe_number);
- /*
- * Setup the NPU context table for a particular GPU. These need to be
- * per-GPU as we need the tables to filter ATSDs when there are no
- * active contexts on a particular GPU. It is safe for these to be
- * called concurrently with destroy as the OPAL call takes appropriate
- * locks and refcounts on init/destroy.
- */
- rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
- PCI_DEVID(gpdev->bus->number, gpdev->devfn));
- if (rc < 0)
- return ERR_PTR(-ENOSPC);
-
- /*
- * We store the npu pci device so we can more easily get at the
- * associated npus.
- */
- spin_lock(&npu_context_lock);
- npu_context = mm->context.npu_context;
- if (npu_context) {
- if (npu_context->release_cb != cb ||
- npu_context->priv != priv) {
- spin_unlock(&npu_context_lock);
- opal_npu_destroy_context(nphb->opal_id, mm->context.id,
- PCI_DEVID(gpdev->bus->number,
- gpdev->devfn));
- return ERR_PTR(-EINVAL);
- }
-
- WARN_ON(!kref_get_unless_zero(&npu_context->kref));
- }
- spin_unlock(&npu_context_lock);
-
- if (!npu_context) {
- /*
- * We can set up these fields without holding the
- * npu_context_lock as the npu_context hasn't been returned to
- * the caller meaning it can't be destroyed. Parallel allocation
- * is protected against by mmap_sem.
- */
- rc = -ENOMEM;
- npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
- if (npu_context) {
- kref_init(&npu_context->kref);
- npu_context->mm = mm;
- npu_context->mn.ops = &nv_nmmu_notifier_ops;
- rc = __mmu_notifier_register(&npu_context->mn, mm);
- }
-
- if (rc) {
- kfree(npu_context);
- opal_npu_destroy_context(nphb->opal_id, mm->context.id,
- PCI_DEVID(gpdev->bus->number,
- gpdev->devfn));
- return ERR_PTR(rc);
- }
-
- mm->context.npu_context = npu_context;
+ /* Steal capabilities from a GPU PE */
+ compound_group->max_dynamic_windows_supported =
+ pe->table_group.max_dynamic_windows_supported;
+ compound_group->tce32_start = pe->table_group.tce32_start;
+ compound_group->tce32_size = pe->table_group.tce32_size;
+ compound_group->max_levels = pe->table_group.max_levels;
+ if (!compound_group->pgsizes)
+ compound_group->pgsizes = pe->table_group.pgsizes;
}
- npu_context->release_cb = cb;
- npu_context->priv = priv;
-
/*
- * npdev is a pci_dev pointer setup by the PCI code. We assign it to
- * npdev[][] to indicate to the mmu notifiers that an invalidation
- * should also be sent over this nvlink. The notifiers don't use any
- * other fields in npu_context, so we just need to ensure that when they
- * deference npu_context->npdev[][] it is either a valid pointer or
- * NULL.
+ * The gpu would have been added to the iommu group that's created
+ * for the PE. Pull it out now.
*/
- WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
+ iommu_del_device(&gpdev->dev);
- if (!nphb->npu.nmmu_flush) {
- /*
- * If we're not explicitly flushing ourselves we need to mark
- * the thread for global flushes
- */
- npu_context->nmmu_flush = false;
- mm_context_add_copro(mm);
- } else
- npu_context->nmmu_flush = true;
+ /*
+ * I'm not sure this is strictly required, but it's probably a good idea
+ * since the table_group for the PE is going to be attached to the
+ * compound table group. If we leave the PE's iommu group active then
+ * we might have the same table_group being modifiable via two sepeate
+ * iommu groups.
+ */
+ iommu_group_put(pe->table_group.group);
- return npu_context;
-}
-EXPORT_SYMBOL(pnv_npu2_init_context);
+ /* now put the GPU into the compound group */
+ pnv_comp_attach_table_group(npucomp, pe);
+ iommu_add_device(compound_group, &gpdev->dev);
-static void pnv_npu2_release_context(struct kref *kref)
-{
- struct npu_context *npu_context =
- container_of(kref, struct npu_context, kref);
-
- if (!npu_context->nmmu_flush)
- mm_context_remove_copro(npu_context->mm);
-
- npu_context->mm->context.npu_context = NULL;
+ return compound_group;
}
-/*
- * Destroy a context on the given GPU. May free the npu_context if it is no
- * longer active on any GPUs. Must not be called from interrupt context.
- */
-void pnv_npu2_destroy_context(struct npu_context *npu_context,
- struct pci_dev *gpdev)
+static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
{
- int removed;
- struct pnv_phb *nphb;
+ struct iommu_table_group *table_group;
+ struct npu_comp *npucomp;
+ struct pci_dev *gpdev = NULL;
+ struct pci_dev *npdev;
+ struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev);
+
+ WARN_ON(!(pe->flags & PNV_IODA_PE_DEV));
+ if (!gpe)
+ return NULL;
+
+ /*
+ * IODA2 bridges get this set up from pci_controller_ops::setup_bridge
+ * but NPU bridges do not have this hook defined so we do it here.
+ * We do not setup other table group parameters as they won't be used
+ * anyway - NVLink bridges are subordinate PEs.
+ */
+ pe->table_group.ops = &pnv_pci_npu_ops;
+
+ table_group = iommu_group_get_iommudata(
+ iommu_group_get(&gpdev->dev));
+
+ /*
+ * On P9 NPU PHB and PCI PHB support different page sizes,
+ * keep only matching. We expect here that NVLink bridge PE pgsizes is
+ * initialized by the caller.
+ */
+ table_group->pgsizes &= pe->table_group.pgsizes;
+ npucomp = container_of(table_group, struct npu_comp, table_group);
+ pnv_comp_attach_table_group(npucomp, pe);
+
+ list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) {
+ struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev);
+
+ if (gpdevtmp != gpdev)
+ continue;
+
+ iommu_add_device(table_group, &npdev->dev);
+ }
+
+ return table_group;
+}
+
+void pnv_pci_npu_setup_iommu_groups(void)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
+
+ /*
+ * For non-nvlink devices the IOMMU group is registered when the PE is
+ * configured and devices are added to the group when the per-device
+ * DMA setup is run. That's done in hose->ops.dma_dev_setup() which is
+ * only initialise for "normal" IODA PHBs.
+ *
+ * For NVLink devices we need to ensure the NVLinks and the GPU end up
+ * in the same IOMMU group, so that's handled here.
+ */
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb = hose->private_data;
+
+ if (phb->type == PNV_PHB_IODA2)
+ list_for_each_entry(pe, &phb->ioda.pe_list, list)
+ pnv_try_setup_npu_table_group(pe);
+ }
+
+ /*
+ * Now we have all PHBs discovered, time to add NPU devices to
+ * the corresponding IOMMU groups.
+ */
+ list_for_each_entry(hose, &hose_list, list_node) {
+ unsigned long pgsizes;
+
+ phb = hose->private_data;
+
+ if (phb->type != PNV_PHB_NPU_NVLINK)
+ continue;
+
+ pgsizes = pnv_ioda_parse_tce_sizes(phb);
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ /*
+ * IODA2 bridges get this set up from
+ * pci_controller_ops::setup_bridge but NPU bridges
+ * do not have this hook defined so we do it here.
+ */
+ pe->table_group.pgsizes = pgsizes;
+ pnv_npu_compound_attach(pe);
+ }
+ }
+}
+#endif /* CONFIG_IOMMU_API */
+
+int pnv_npu2_init(struct pci_controller *hose)
+{
+ static int npu_index;
struct npu *npu;
- struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
- struct device_node *nvlink_dn;
- u32 nvlink_index;
+ int ret;
- if (WARN_ON(!npdev))
- return;
+ npu = kzalloc(sizeof(*npu), GFP_KERNEL);
+ if (!npu)
+ return -ENOMEM;
- if (!firmware_has_feature(FW_FEATURE_OPAL))
- return;
-
- nphb = pci_bus_to_host(npdev->bus)->private_data;
- npu = &nphb->npu;
- nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
- if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
- &nvlink_index)))
- return;
- WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
- opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
- PCI_DEVID(gpdev->bus->number, gpdev->devfn));
- spin_lock(&npu_context_lock);
- removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
- spin_unlock(&npu_context_lock);
-
- /*
- * We need to do this outside of pnv_npu2_release_context so that it is
- * outside the spinlock as mmu_notifier_destroy uses SRCU.
- */
- if (removed) {
- mmu_notifier_unregister(&npu_context->mn,
- npu_context->mm);
-
- kfree(npu_context);
+ npu_index++;
+ if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
+ ret = -ENOSPC;
+ goto fail_exit;
}
+ npu->index = npu_index;
+ hose->npu = npu;
+ return 0;
+
+fail_exit:
+ kfree(npu);
+ return ret;
}
-EXPORT_SYMBOL(pnv_npu2_destroy_context);
-/*
- * Assumes mmap_sem is held for the contexts associated mm.
- */
-int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
- unsigned long *flags, unsigned long *status, int count)
+int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
+ unsigned long msr)
{
- u64 rc = 0, result = 0;
- int i, is_write;
- struct page *page[1];
+ int ret;
+ struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+ struct pci_controller *hose;
+ struct pnv_phb *nphb;
- /* mmap_sem should be held so the struct_mm must be present */
- struct mm_struct *mm = context->mm;
-
- if (!firmware_has_feature(FW_FEATURE_OPAL))
+ if (!npdev)
return -ENODEV;
- WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
-
- for (i = 0; i < count; i++) {
- is_write = flags[i] & NPU2_WRITE;
- rc = get_user_pages_remote(NULL, mm, ea[i], 1,
- is_write ? FOLL_WRITE : 0,
- page, NULL, NULL);
-
- /*
- * To support virtualised environments we will have to do an
- * access to the page to ensure it gets faulted into the
- * hypervisor. For the moment virtualisation is not supported in
- * other areas so leave the access out.
- */
- if (rc != 1) {
- status[i] = rc;
- result = -EFAULT;
- continue;
- }
-
- status[i] = 0;
- put_page(page[0]);
+ hose = pci_bus_to_host(npdev->bus);
+ if (hose->npu == NULL) {
+ dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
+ return 0;
}
- return result;
-}
-EXPORT_SYMBOL(pnv_npu2_handle_fault);
+ nphb = hose->private_data;
-int pnv_npu2_init(struct pnv_phb *phb)
-{
- unsigned int i;
- u64 mmio_atsd;
- struct device_node *dn;
- struct pci_dev *gpdev;
- static int npu_index;
- uint64_t rc = 0;
-
- if (!atsd_threshold_dentry) {
- atsd_threshold_dentry = debugfs_create_x64("atsd_threshold",
- 0600, powerpc_debugfs_root, &atsd_threshold);
+ dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n",
+ nphb->opal_id, lparid);
+ /*
+ * Currently we only support radix and non-zero LPCR only makes sense
+ * for hash tables so skiboot expects the LPCR parameter to be a zero.
+ */
+ ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), lparid,
+ 0 /* LPCR bits */);
+ if (ret) {
+ dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
+ return ret;
}
- phb->npu.nmmu_flush =
- of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
- for_each_child_of_node(phb->hose->dn, dn) {
- gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
- if (gpdev) {
- rc = opal_npu_map_lpar(phb->opal_id,
- PCI_DEVID(gpdev->bus->number, gpdev->devfn),
- 0, 0);
- if (rc)
- dev_err(&gpdev->dev,
- "Error %lld mapping device to LPAR\n",
- rc);
- }
- }
-
- for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
- i, &mmio_atsd); i++)
- phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
-
- pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i);
- phb->npu.mmio_atsd_count = i;
- phb->npu.mmio_atsd_usage = 0;
- npu_index++;
- if (WARN_ON(npu_index >= NV_MAX_NPUS))
- return -ENOSPC;
- max_npu2_index = npu_index;
- phb->npu.index = npu_index;
+ dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n",
+ nphb->opal_id, msr);
+ ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr,
+ pci_dev_id(gpdev));
+ if (ret < 0)
+ dev_err(&gpdev->dev, "Failed to init context: %d\n", ret);
+ else
+ ret = 0;
return 0;
}
+EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev);
+
+void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr)
+{
+ struct pci_dev *gpdev;
+
+ list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list)
+ pnv_npu2_map_lpar_dev(gpdev, 0, msr);
+}
+
+int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev)
+{
+ int ret;
+ struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+ struct pci_controller *hose;
+ struct pnv_phb *nphb;
+
+ if (!npdev)
+ return -ENODEV;
+
+ hose = pci_bus_to_host(npdev->bus);
+ if (hose->npu == NULL) {
+ dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
+ return 0;
+ }
+
+ nphb = hose->private_data;
+
+ dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n",
+ nphb->opal_id);
+ ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/,
+ pci_dev_id(gpdev));
+ if (ret < 0) {
+ dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret);
+ return ret;
+ }
+
+ /* Set LPID to 0 anyway, just to be safe */
+ dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id);
+ ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), 0 /*LPID*/,
+ 0 /* LPCR bits */);
+ if (ret)
+ dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev);
--
Gitblit v1.6.2