.. | .. |
| 1 | +// SPDX-License-Identifier: GPL-2.0-only
1 | 2 | /*
2 | 3 | * This file implements the DMA operations for NVLink devices. The NPU
3 | 4 | * devices all point to the same iommu table as the parent PCI device.
4 | 5 | *
5 | 6 | * Copyright Alistair Popple, IBM Corporation 2015.
6 | | - *
7 | | - * This program is free software; you can redistribute it and/or
8 | | - * modify it under the terms of version 2 of the GNU General Public
9 | | - * License as published by the Free Software Foundation.
10 | 7 | */
11 | 8 |
12 | | -#include <linux/slab.h>
13 | 9 | #include <linux/mmu_notifier.h>
14 | 10 | #include <linux/mmu_context.h>
15 | 11 | #include <linux/of.h>
16 | | -#include <linux/export.h>
17 | 12 | #include <linux/pci.h>
18 | 13 | #include <linux/memblock.h>
19 | | -#include <linux/iommu.h>
20 | | -#include <linux/debugfs.h>
| 14 | +#include <linux/sizes.h>
21 | 15 |
22 | 16 | #include <asm/debugfs.h>
23 | | -#include <asm/tlb.h>
24 | 17 | #include <asm/powernv.h>
25 | | -#include <asm/reg.h>
26 | | -#include <asm/opal.h>
27 | | -#include <asm/io.h>
28 | | -#include <asm/iommu.h>
29 | | -#include <asm/pnv-pci.h>
30 | | -#include <asm/msi_bitmap.h>
| 18 | +#include <asm/ppc-pci.h>
31 | 19 | #include <asm/opal.h>
32 | 20 |
33 | | -#include "powernv.h"
34 | 21 | #include "pci.h"
35 | 22 |
36 | | -#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
37 | | -
38 | | -/*
39 | | - * spinlock to protect initialisation of an npu_context for a particular
40 | | - * mm_struct.
41 | | - */
42 | | -static DEFINE_SPINLOCK(npu_context_lock);
43 | | -
44 | | -/*
45 | | - * When an address shootdown range exceeds this threshold we invalidate the
46 | | - * entire TLB on the GPU for the given PID rather than each specific address in
47 | | - * the range.
48 | | - */
49 | | -static uint64_t atsd_threshold = 2 * 1024 * 1024;
50 | | -static struct dentry *atsd_threshold_dentry;
51 | | -
52 | | -/*
53 | | - * Other types of TCE cache invalidation are not functional in the
54 | | - * hardware.
55 | | - */
56 | 23 | static struct pci_dev *get_pci_dev(struct device_node *dn)
57 | 24 | {
58 | 25 | struct pci_dn *pdn = PCI_DN(dn);
.. | .. |
123 | 90 | }
124 | 91 | EXPORT_SYMBOL(pnv_pci_get_npu_dev);
125 | 92 |
126 | | -#define NPU_DMA_OP_UNSUPPORTED() \
127 | | - dev_err_once(dev, "%s operation unsupported for NVLink devices\n", \
128 | | - __func__)
129 | | -
130 | | -static void *dma_npu_alloc(struct device *dev, size_t size,
131 | | - dma_addr_t *dma_handle, gfp_t flag,
132 | | - unsigned long attrs)
133 | | -{
134 | | - NPU_DMA_OP_UNSUPPORTED();
135 | | - return NULL;
136 | | -}
137 | | -
138 | | -static void dma_npu_free(struct device *dev, size_t size,
139 | | - void *vaddr, dma_addr_t dma_handle,
140 | | - unsigned long attrs)
141 | | -{
142 | | - NPU_DMA_OP_UNSUPPORTED();
143 | | -}
144 | | -
145 | | -static dma_addr_t dma_npu_map_page(struct device *dev, struct page *page,
146 | | - unsigned long offset, size_t size,
147 | | - enum dma_data_direction direction,
148 | | - unsigned long attrs)
149 | | -{
150 | | - NPU_DMA_OP_UNSUPPORTED();
151 | | - return 0;
152 | | -}
153 | | -
154 | | -static int dma_npu_map_sg(struct device *dev, struct scatterlist *sglist,
155 | | - int nelems, enum dma_data_direction direction,
156 | | - unsigned long attrs)
157 | | -{
158 | | - NPU_DMA_OP_UNSUPPORTED();
159 | | - return 0;
160 | | -}
161 | | -
162 | | -static int dma_npu_dma_supported(struct device *dev, u64 mask)
163 | | -{
164 | | - NPU_DMA_OP_UNSUPPORTED();
165 | | - return 0;
166 | | -}
167 | | -
168 | | -static u64 dma_npu_get_required_mask(struct device *dev)
169 | | -{
170 | | - NPU_DMA_OP_UNSUPPORTED();
171 | | - return 0;
172 | | -}
173 | | -
174 | | -static const struct dma_map_ops dma_npu_ops = {
175 | | - .map_page = dma_npu_map_page,
176 | | - .map_sg = dma_npu_map_sg,
177 | | - .alloc = dma_npu_alloc,
178 | | - .free = dma_npu_free,
179 | | - .dma_supported = dma_npu_dma_supported,
180 | | - .get_required_mask = dma_npu_get_required_mask,
181 | | -};
182 | | -
| 93 | +#ifdef CONFIG_IOMMU_API
183 | 94 | /*
184 | 95 | * Returns the PE assoicated with the PCI device of the given
185 | 96 | * NPU. Returns the linked pci device if pci_dev != NULL.
.. | .. |
211 | 122 | return pe;
212 | 123 | }
213 | 124 |
214 | | -long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
| 125 | +static long pnv_npu_unset_window(struct iommu_table_group *table_group,
| 126 | + int num);
| 127 | +
| 128 | +static long pnv_npu_set_window(struct iommu_table_group *table_group, int num,
215 | 129 | struct iommu_table *tbl)
216 | 130 | {
| 131 | + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
| 132 | + table_group);
217 | 133 | struct pnv_phb *phb = npe->phb;
218 | 134 | int64_t rc;
219 | 135 | const unsigned long size = tbl->it_indirect_levels ?
220 | 136 | tbl->it_level_size : tbl->it_size;
221 | 137 | const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
222 | 138 | const __u64 win_size = tbl->it_size << tbl->it_page_shift;
| 139 | + int num2 = (num == 0) ? 1 : 0;
| 140 | +
| 141 | + /* NPU has just one TVE so if there is another table, remove it first */
| 142 | + if (npe->table_group.tables[num2])
| 143 | + pnv_npu_unset_window(&npe->table_group, num2);
223 | 144 |
224 | 145 | pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
225 | 146 | start_addr, start_addr + win_size - 1,
.. | .. |
245 | 166 | return 0;
246 | 167 | }
247 | 168 |
248 | | -long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
| 169 | +static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
249 | 170 | {
| 171 | + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
| 172 | + table_group);
250 | 173 | struct pnv_phb *phb = npe->phb;
251 | 174 | int64_t rc;
| 175 | +
| 176 | + if (!npe->table_group.tables[num])
| 177 | + return 0;
252 | 178 |
253 | 179 | pe_info(npe, "Removing DMA window\n");
254 | 180 |
.. | .. |
268 | 194 | return 0;
269 | 195 | }
270 | 196 |
271 | | -/*
272 | | - * Enables 32 bit DMA on NPU.
273 | | - */
274 | | -static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
275 | | -{
276 | | - struct pci_dev *gpdev;
277 | | - struct pnv_ioda_pe *gpe;
278 | | - int64_t rc;
279 | | -
280 | | - /*
281 | | - * Find the assoicated PCI devices and get the dma window
282 | | - * information from there.
283 | | - */
284 | | - if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
285 | | - return;
286 | | -
287 | | - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
288 | | - if (!gpe)
289 | | - return;
290 | | -
291 | | - rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
292 | | -
293 | | - /*
294 | | - * We don't initialise npu_pe->tce32_table as we always use
295 | | - * dma_npu_ops which are nops.
296 | | - */
297 | | - set_dma_ops(&npe->pdev->dev, &dma_npu_ops);
298 | | -}
299 | | -
300 | | -/*
301 | | - * Enables bypass mode on the NPU. The NPU only supports one
302 | | - * window per link, so bypass needs to be explicitly enabled or
303 | | - * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
304 | | - * active at the same time.
305 | | - */
306 | | -static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
307 | | -{
308 | | - struct pnv_phb *phb = npe->phb;
309 | | - int64_t rc = 0;
310 | | - phys_addr_t top = memblock_end_of_DRAM();
311 | | -
312 | | - if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
313 | | - return -EINVAL;
314 | | -
315 | | - rc = pnv_npu_unset_window(npe, 0);
316 | | - if (rc != OPAL_SUCCESS)
317 | | - return rc;
318 | | -
319 | | - /* Enable the bypass window */
320 | | -
321 | | - top = roundup_pow_of_two(top);
322 | | - dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
323 | | - npe->pe_number);
324 | | - rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
325 | | - npe->pe_number, npe->pe_number,
326 | | - 0 /* bypass base */, top);
327 | | -
328 | | - if (rc == OPAL_SUCCESS)
329 | | - pnv_pci_ioda2_tce_invalidate_entire(phb, false);
330 | | -
331 | | - return rc;
332 | | -}
333 | | -
334 | | -void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
335 | | -{
336 | | - int i;
337 | | - struct pnv_phb *phb;
338 | | - struct pci_dn *pdn;
339 | | - struct pnv_ioda_pe *npe;
340 | | - struct pci_dev *npdev;
341 | | -
342 | | - for (i = 0; ; ++i) {
343 | | - npdev = pnv_pci_get_npu_dev(gpdev, i);
344 | | -
345 | | - if (!npdev)
346 | | - break;
347 | | -
348 | | - pdn = pci_get_pdn(npdev);
349 | | - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
350 | | - return;
351 | | -
352 | | - phb = pci_bus_to_host(npdev->bus)->private_data;
353 | | -
354 | | - /* We only do bypass if it's enabled on the linked device */
355 | | - npe = &phb->ioda.pe_array[pdn->pe_number];
356 | | -
357 | | - if (bypass) {
358 | | - dev_info(&npdev->dev,
359 | | - "Using 64-bit DMA iommu bypass\n");
360 | | - pnv_npu_dma_set_bypass(npe);
361 | | - } else {
362 | | - dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
363 | | - pnv_npu_dma_set_32(npe);
364 | | - }
365 | | - }
366 | | -}
367 | | -
368 | 197 | /* Switch ownership from platform code to external user (e.g. VFIO) */
369 | | -void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
| 198 | +static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
370 | 199 | {
| 200 | + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
| 201 | + table_group);
371 | 202 | struct pnv_phb *phb = npe->phb;
372 | 203 | int64_t rc;
| 204 | + struct pci_dev *gpdev = NULL;
373 | 205 |
374 | 206 | /*
375 | 207 | * Note: NPU has just a single TVE in the hardware which means that
.. | .. |
378 | 210 | * if it was enabled at the moment of ownership change.
379 | 211 | */
380 | 212 | if (npe->table_group.tables[0]) {
381 | | - pnv_npu_unset_window(npe, 0);
| 213 | + pnv_npu_unset_window(&npe->table_group, 0);
382 | 214 | return;
383 | 215 | }
384 | 216 |
.. | .. |
391 | 223 | return;
392 | 224 | }
393 | 225 | pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
| 226 | +
| 227 | + get_gpu_pci_dev_and_pe(npe, &gpdev);
| 228 | + if (gpdev)
| 229 | + pnv_npu2_unmap_lpar_dev(gpdev);
394 | 230 | }
395 | 231 |
396 | | -struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
| 232 | +static void pnv_npu_release_ownership(struct iommu_table_group *table_group)
397 | 233 | {
398 | | - struct pnv_phb *phb = npe->phb;
399 | | - struct pci_bus *pbus = phb->hose->bus;
400 | | - struct pci_dev *npdev, *gpdev = NULL, *gptmp;
401 | | - struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
| 234 | + struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
| 235 | + table_group);
| 236 | + struct pci_dev *gpdev = NULL;
402 | 237 |
403 | | - if (!gpe || !gpdev)
404 | | - return NULL;
405 | | -
406 | | - list_for_each_entry(npdev, &pbus->devices, bus_list) {
407 | | - gptmp = pnv_pci_get_gpu_dev(npdev);
408 | | -
409 | | - if (gptmp != gpdev)
410 | | - continue;
411 | | -
412 | | - pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
413 | | - iommu_group_add_device(gpe->table_group.group, &npdev->dev);
414 | | - }
415 | | -
416 | | - return gpe;
| 238 | + get_gpu_pci_dev_and_pe(npe, &gpdev);
| 239 | + if (gpdev)
| 240 | + pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV);
417 | 241 | }
418 | 242 |
419 | | -/* Maximum number of nvlinks per npu */
420 | | -#define NV_MAX_LINKS 6
421 | | -
422 | | -/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
423 | | -static int max_npu2_index;
424 | | -
425 | | -struct npu_context {
426 | | - struct mm_struct *mm;
427 | | - struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
428 | | - struct mmu_notifier mn;
429 | | - struct kref kref;
430 | | - bool nmmu_flush;
431 | | -
432 | | - /* Callback to stop translation requests on a given GPU */
433 | | - void (*release_cb)(struct npu_context *context, void *priv);
434 | | -
435 | | - /*
436 | | - * Private pointer passed to the above callback for usage by
437 | | - * device drivers.
438 | | - */
439 | | - void *priv;
| 243 | +static struct iommu_table_group_ops pnv_pci_npu_ops = {
| 244 | + .set_window = pnv_npu_set_window,
| 245 | + .unset_window = pnv_npu_unset_window,
| 246 | + .take_ownership = pnv_npu_take_ownership,
| 247 | + .release_ownership = pnv_npu_release_ownership,
440 | 248 | };
441 | | -
442 | | -struct mmio_atsd_reg {
443 | | - struct npu *npu;
444 | | - int reg;
445 | | -};
| 249 | +#endif /* !CONFIG_IOMMU_API */
446 | 250 |
447 | 251 | /*
448 | | - * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
449 | | - * if none are available.
| 252 | + * NPU2 ATS
450 | 253 | */
451 | | -static int get_mmio_atsd_reg(struct npu *npu)
452 | | -{
453 | | - int i;
454 | | -
455 | | - for (i = 0; i < npu->mmio_atsd_count; i++) {
456 | | - if (!test_bit(i, &npu->mmio_atsd_usage))
457 | | - if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
458 | | - return i;
459 | | - }
460 | | -
461 | | - return -ENOSPC;
462 | | -}
463 | | -
464 | | -static void put_mmio_atsd_reg(struct npu *npu, int reg)
465 | | -{
466 | | - clear_bit_unlock(reg, &npu->mmio_atsd_usage);
467 | | -}
468 | | -
469 | | -/* MMIO ATSD register offsets */
470 | | -#define XTS_ATSD_AVA 1
471 | | -#define XTS_ATSD_STAT 2
472 | | -
473 | | -static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
474 | | - unsigned long launch, unsigned long va)
475 | | -{
476 | | - struct npu *npu = mmio_atsd_reg->npu;
477 | | - int reg = mmio_atsd_reg->reg;
478 | | -
479 | | - __raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
480 | | - eieio();
481 | | - __raw_writeq_be(launch, npu->mmio_atsd_regs[reg]);
482 | | -}
483 | | -
484 | | -static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
485 | | - unsigned long pid, bool flush)
486 | | -{
487 | | - int i;
488 | | - unsigned long launch;
489 | | -
490 | | - for (i = 0; i <= max_npu2_index; i++) {
491 | | - if (mmio_atsd_reg[i].reg < 0)
492 | | - continue;
493 | | -
494 | | - /* IS set to invalidate matching PID */
495 | | - launch = PPC_BIT(12);
496 | | -
497 | | - /* PRS set to process-scoped */
498 | | - launch |= PPC_BIT(13);
499 | | -
500 | | - /* AP */
501 | | - launch |= (u64)
502 | | - mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
503 | | -
504 | | - /* PID */
505 | | - launch |= pid << PPC_BITLSHIFT(38);
506 | | -
507 | | - /* No flush */
508 | | - launch |= !flush << PPC_BITLSHIFT(39);
509 | | -
510 | | - /* Invalidating the entire process doesn't use a va */
511 | | - mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
512 | | - }
513 | | -}
514 | | -
515 | | -static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
516 | | - unsigned long va, unsigned long pid, bool flush)
517 | | -{
518 | | - int i;
519 | | - unsigned long launch;
520 | | -
521 | | - for (i = 0; i <= max_npu2_index; i++) {
522 | | - if (mmio_atsd_reg[i].reg < 0)
523 | | - continue;
524 | | -
525 | | - /* IS set to invalidate target VA */
526 | | - launch = 0;
527 | | -
528 | | - /* PRS set to process scoped */
529 | | - launch |= PPC_BIT(13);
530 | | -
531 | | - /* AP */
532 | | - launch |= (u64)
533 | | - mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
534 | | -
535 | | - /* PID */
536 | | - launch |= pid << PPC_BITLSHIFT(38);
537 | | -
538 | | - /* No flush */
539 | | - launch |= !flush << PPC_BITLSHIFT(39);
540 | | -
541 | | - mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
542 | | - }
543 | | -}
544 | | -
545 | | -#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
546 | | -
547 | | -static void mmio_invalidate_wait(
548 | | - struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
549 | | -{
550 | | - struct npu *npu;
551 | | - int i, reg;
552 | | -
553 | | - /* Wait for all invalidations to complete */
554 | | - for (i = 0; i <= max_npu2_index; i++) {
555 | | - if (mmio_atsd_reg[i].reg < 0)
556 | | - continue;
557 | | -
558 | | - /* Wait for completion */
559 | | - npu = mmio_atsd_reg[i].npu;
560 | | - reg = mmio_atsd_reg[i].reg;
561 | | - while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
562 | | - cpu_relax();
563 | | - }
564 | | -}
| 254 | +/* Maximum possible number of ATSD MMIO registers per NPU */
| 255 | +#define NV_NMMU_ATSD_REGS 8
| 256 | +#define NV_NPU_MAX_PE_NUM 16
565 | 257 |
566 | 258 | /*
567 | | - * Acquires all the address translation shootdown (ATSD) registers required to
568 | | - * launch an ATSD on all links this npu_context is active on.
| 259 | + * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
| 260 | + * up to 3 x (GPU + 2xNPUs) (POWER9).
569 | 261 | */
570 | | -static void acquire_atsd_reg(struct npu_context *npu_context,
571 | | - struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
| 262 | +struct npu_comp {
| 263 | + struct iommu_table_group table_group;
| 264 | + int pe_num;
| 265 | + struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
| 266 | +};
| 267 | +
| 268 | +/* An NPU descriptor, valid for POWER9 only */
| 269 | +struct npu {
| 270 | + int index;
| 271 | + struct npu_comp npucomp;
| 272 | +};
| 273 | +
| 274 | +#ifdef CONFIG_IOMMU_API
| 275 | +static long pnv_npu_peers_create_table_userspace(
| 276 | + struct iommu_table_group *table_group,
| 277 | + int num, __u32 page_shift, __u64 window_size, __u32 levels,
| 278 | + struct iommu_table **ptbl)
| 279 | +{
| 280 | + struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
| 281 | + table_group);
| 282 | +
| 283 | + if (!npucomp->pe_num || !npucomp->pe[0] ||
| 284 | + !npucomp->pe[0]->table_group.ops ||
| 285 | + !npucomp->pe[0]->table_group.ops->create_table)
| 286 | + return -EFAULT;
| 287 | +
| 288 | + return npucomp->pe[0]->table_group.ops->create_table(
| 289 | + &npucomp->pe[0]->table_group, num, page_shift,
| 290 | + window_size, levels, ptbl);
| 291 | +}
| 292 | +
| 293 | +static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
| 294 | + int num, struct iommu_table *tbl)
572 | 295 | {
573 | 296 | int i, j;
574 | | - struct npu *npu;
575 | | - struct pci_dev *npdev;
576 | | - struct pnv_phb *nphb;
| 297 | + long ret = 0;
| 298 | + struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
| 299 | + table_group);
577 | 300 |
578 | | - for (i = 0; i <= max_npu2_index; i++) {
579 | | - mmio_atsd_reg[i].reg = -1;
580 | | - for (j = 0; j < NV_MAX_LINKS; j++) {
581 | | - /*
582 | | - * There are no ordering requirements with respect to
583 | | - * the setup of struct npu_context, but to ensure
584 | | - * consistent behaviour we need to ensure npdev[][] is
585 | | - * only read once.
586 | | - */
587 | | - npdev = READ_ONCE(npu_context->npdev[i][j]);
588 | | - if (!npdev)
| 301 | + for (i = 0; i < npucomp->pe_num; ++i) {
| 302 | + struct pnv_ioda_pe *pe = npucomp->pe[i];
| 303 | +
| 304 | + if (!pe->table_group.ops->set_window)
| 305 | + continue;
| 306 | +
| 307 | + ret = pe->table_group.ops->set_window(&pe->table_group,
| 308 | + num, tbl);
| 309 | + if (ret)
| 310 | + break;
| 311 | + }
| 312 | +
| 313 | + if (ret) {
| 314 | + for (j = 0; j < i; ++j) {
| 315 | + struct pnv_ioda_pe *pe = npucomp->pe[j];
| 316 | +
| 317 | + if (!pe->table_group.ops->unset_window)
589 | 318 | continue;
590 | 319 |
591 | | - nphb = pci_bus_to_host(npdev->bus)->private_data;
592 | | - npu = &nphb->npu;
593 | | - mmio_atsd_reg[i].npu = npu;
594 | | - mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
595 | | - while (mmio_atsd_reg[i].reg < 0) {
596 | | - mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
597 | | - cpu_relax();
598 | | - }
599 | | - break;
| 320 | + ret = pe->table_group.ops->unset_window(
| 321 | + &pe->table_group, num);
| 322 | + if (ret)
| 323 | + break;
600 | 324 | }
| 325 | + } else {
| 326 | + table_group->tables[num] = iommu_tce_table_get(tbl);
601 | 327 | }
| 328 | +
| 329 | + return ret;
602 | 330 | }
603 | 331 |
604 | | -/*
605 | | - * Release previously acquired ATSD registers. To avoid deadlocks the registers
606 | | - * must be released in the same order they were acquired above in
607 | | - * acquire_atsd_reg.
608 | | - */
609 | | -static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
| 332 | +static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group,
| 333 | + int num)
| 334 | +{
| 335 | + int i, j;
| 336 | + long ret = 0;
| 337 | + struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
| 338 | + table_group);
| 339 | +
| 340 | + for (i = 0; i < npucomp->pe_num; ++i) {
| 341 | + struct pnv_ioda_pe *pe = npucomp->pe[i];
| 342 | +
| 343 | + WARN_ON(npucomp->table_group.tables[num] !=
| 344 | + table_group->tables[num]);
| 345 | + if (!npucomp->table_group.tables[num])
| 346 | + continue;
| 347 | +
| 348 | + if (!pe->table_group.ops->unset_window)
| 349 | + continue;
| 350 | +
| 351 | + ret = pe->table_group.ops->unset_window(&pe->table_group, num);
| 352 | + if (ret)
| 353 | + break;
| 354 | + }
| 355 | +
| 356 | + if (ret) {
| 357 | + for (j = 0; j < i; ++j) {
| 358 | + struct pnv_ioda_pe *pe = npucomp->pe[j];
| 359 | +
| 360 | + if (!npucomp->table_group.tables[num])
| 361 | + continue;
| 362 | +
| 363 | + if (!pe->table_group.ops->set_window)
| 364 | + continue;
| 365 | +
| 366 | + ret = pe->table_group.ops->set_window(&pe->table_group,
| 367 | + num, table_group->tables[num]);
| 368 | + if (ret)
| 369 | + break;
| 370 | + }
| 371 | + } else if (table_group->tables[num]) {
| 372 | + iommu_tce_table_put(table_group->tables[num]);
| 373 | + table_group->tables[num] = NULL;
| 374 | + }
| 375 | +
| 376 | + return ret;
| 377 | +}
| 378 | +
| 379 | +static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group)
610 | 380 | {
611 | 381 | int i;
| 382 | + struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
| 383 | + table_group);
612 | 384 |
613 | | - for (i = 0; i <= max_npu2_index; i++) {
614 | | - /*
615 | | - * We can't rely on npu_context->npdev[][] being the same here
616 | | - * as when acquire_atsd_reg() was called, hence we use the
617 | | - * values stored in mmio_atsd_reg during the acquire phase
618 | | - * rather than re-reading npdev[][].
619 | | - */
620 | | - if (mmio_atsd_reg[i].reg < 0)
| 385 | + for (i = 0; i < npucomp->pe_num; ++i) {
| 386 | + struct pnv_ioda_pe *pe = npucomp->pe[i];
| 387 | +
| 388 | + if (!pe->table_group.ops ||
| 389 | + !pe->table_group.ops->take_ownership)
621 | 390 | continue;
622 | | -
623 | | - put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
| 391 | + pe->table_group.ops->take_ownership(&pe->table_group);
624 | 392 | }
625 | 393 | }
626 | 394 |
627 | | -/*
628 | | - * Invalidate either a single address or an entire PID depending on
629 | | - * the value of va.
630 | | - */
631 | | -static void mmio_invalidate(struct npu_context *npu_context, int va,
632 | | - unsigned long address, bool flush)
| 395 | +static void pnv_npu_peers_release_ownership(
| 396 | + struct iommu_table_group *table_group)
633 | 397 | {
634 | | - struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
635 | | - unsigned long pid = npu_context->mm->context.id;
| 398 | + int i;
| 399 | + struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
| 400 | + table_group);
636 | 401 |
637 | | - if (npu_context->nmmu_flush)
638 | | - /*
639 | | - * Unfortunately the nest mmu does not support flushing specific
640 | | - * addresses so we have to flush the whole mm once before
641 | | - * shooting down the GPU translation.
642 | | - */
643 | | - flush_all_mm(npu_context->mm);
| 402 | + for (i = 0; i < npucomp->pe_num; ++i) {
| 403 | + struct pnv_ioda_pe *pe = npucomp->pe[i];
644 | 404 |
645 | | - /*
646 | | - * Loop over all the NPUs this process is active on and launch
647 | | - * an invalidate.
648 | | - */
649 | | - acquire_atsd_reg(npu_context, mmio_atsd_reg);
650 | | - if (va)
651 | | - mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
652 | | - else
653 | | - mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
654 | | -
655 | | - mmio_invalidate_wait(mmio_atsd_reg);
656 | | - if (flush) {
657 | | - /*
658 | | - * The GPU requires two flush ATSDs to ensure all entries have
659 | | - * been flushed. We use PID 0 as it will never be used for a
660 | | - * process on the GPU.
661 | | - */
662 | | - mmio_invalidate_pid(mmio_atsd_reg, 0, true);
663 | | - mmio_invalidate_wait(mmio_atsd_reg);
664 | | - mmio_invalidate_pid(mmio_atsd_reg, 0, true);
665 | | - mmio_invalidate_wait(mmio_atsd_reg);
666 | | - }
667 | | - release_atsd_reg(mmio_atsd_reg);
668 | | -}
669 | | -
670 | | -static void pnv_npu2_mn_release(struct mmu_notifier *mn,
671 | | - struct mm_struct *mm)
672 | | -{
673 | | - struct npu_context *npu_context = mn_to_npu_context(mn);
674 | | -
675 | | - /* Call into device driver to stop requests to the NMMU */
676 | | - if (npu_context->release_cb)
677 | | - npu_context->release_cb(npu_context, npu_context->priv);
678 | | -
679 | | - /*
680 | | - * There should be no more translation requests for this PID, but we
681 | | - * need to ensure any entries for it are removed from the TLB.
682 | | - */
683 | | - mmio_invalidate(npu_context, 0, 0, true);
684 | | -}
685 | | -
686 | | -static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
687 | | - struct mm_struct *mm,
688 | | - unsigned long address,
689 | | - pte_t pte)
690 | | -{
691 | | - struct npu_context *npu_context = mn_to_npu_context(mn);
692 | | -
693 | | - mmio_invalidate(npu_context, 1, address, true);
694 | | -}
695 | | -
696 | | -static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
697 | | - struct mm_struct *mm,
698 | | - unsigned long start, unsigned long end)
699 | | -{
700 | | - struct npu_context *npu_context = mn_to_npu_context(mn);
701 | | - unsigned long address;
702 | | -
703 | | - if (end - start > atsd_threshold) {
704 | | - /*
705 | | - * Just invalidate the entire PID if the address range is too
706 | | - * large.
707 | | - */
708 | | - mmio_invalidate(npu_context, 0, 0, true);
709 | | - } else {
710 | | - for (address = start; address < end; address += PAGE_SIZE)
711 | | - mmio_invalidate(npu_context, 1, address, false);
712 | | -
713 | | - /* Do the flush only on the final addess == end */
714 | | - mmio_invalidate(npu_context, 1, address, true);
| 405 | + if (!pe->table_group.ops ||
| 406 | + !pe->table_group.ops->release_ownership)
| 407 | + continue;
| 408 | + pe->table_group.ops->release_ownership(&pe->table_group);
715 | 409 | }
716 | 410 | }
717 | 411 |
718 | | -static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
719 | | - .release = pnv_npu2_mn_release,
720 | | - .change_pte = pnv_npu2_mn_change_pte,
721 | | - .invalidate_range = pnv_npu2_mn_invalidate_range,
| 412 | +static struct iommu_table_group_ops pnv_npu_peers_ops = {
| 413 | + .get_table_size = pnv_pci_ioda2_get_table_size,
| 414 | + .create_table = pnv_npu_peers_create_table_userspace,
| 415 | + .set_window = pnv_npu_peers_set_window,
| 416 | + .unset_window = pnv_npu_peers_unset_window,
| 417 | + .take_ownership = pnv_npu_peers_take_ownership,
| 418 | + .release_ownership = pnv_npu_peers_release_ownership,
722 | 419 | };
723 | 420 |
724 | | -/*
725 | | - * Call into OPAL to setup the nmmu context for the current task in
726 | | - * the NPU. This must be called to setup the context tables before the
727 | | - * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
728 | | - *
729 | | - * A release callback should be registered to allow a device driver to
730 | | - * be notified that it should not launch any new translation requests
731 | | - * as the final TLB invalidate is about to occur.
732 | | - *
733 | | - * Returns an error if there no contexts are currently available or a
734 | | - * npu_context which should be passed to pnv_npu2_handle_fault().
735 | | - *
736 | | - * mmap_sem must be held in write mode and must not be called from interrupt
737 | | - * context.
738 | | - */
739 | | -struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
740 | | - unsigned long flags,
741 | | - void (*cb)(struct npu_context *, void *),
742 | | - void *priv)
| 421 | +static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
| 422 | + struct pnv_ioda_pe *pe)
743 | 423 | {
744 | | - int rc;
745 | | - u32 nvlink_index;
746 | | - struct device_node *nvlink_dn;
747 | | - struct mm_struct *mm = current->mm;
748 | | - struct pnv_phb *nphb;
749 | | - struct npu *npu;
750 | | - struct npu_context *npu_context;
| 424 | + if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
| 425 | + return;
751 | 426 |
752 | | - /*
753 | | - * At present we don't support GPUs connected to multiple NPUs and I'm
754 | | - * not sure the hardware does either.
755 | | - */
756 | | - struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
| 427 | + npucomp->pe[npucomp->pe_num] = pe;
| 428 | + ++npucomp->pe_num;
| 429 | +}
757 | 430 |
758 | | - if (!firmware_has_feature(FW_FEATURE_OPAL))
759 | | - return ERR_PTR(-ENODEV);
| 431 | +static struct iommu_table_group *
| 432 | + pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
| 433 | +{
| 434 | + struct iommu_table_group *compound_group;
| 435 | + struct npu_comp *npucomp;
| 436 | + struct pci_dev *gpdev = NULL;
| 437 | + struct pci_controller *hose;
| 438 | + struct pci_dev *npdev = NULL;
| 439 | +
| 440 | + list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
| 441 | + npdev = pnv_pci_get_npu_dev(gpdev, 0);
| 442 | + if (npdev)
| 443 | + break;
| 444 | + }
760 | 445 |
761 | 446 | if (!npdev)
762 | | - /* No nvlink associated with this GPU device */
763 | | - return ERR_PTR(-ENODEV);
| 447 | + /* It is not an NPU attached device, skip */
| 448 | + return NULL;
764 | 449 |
765 | | - nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
766 | | - if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
767 | | - &nvlink_index)))
768 | | - return ERR_PTR(-ENODEV);
| 450 | + hose = pci_bus_to_host(npdev->bus);
769 | 451 |
770 | | - if (!mm || mm->context.id == 0) {
771 | | - /*
772 | | - * Kernel thread contexts are not supported and context id 0 is
773 | | - * reserved on the GPU.
774 | | - */
775 | | - return ERR_PTR(-EINVAL);
| 452 | + if (hose->npu) {
| 453 | + /* P9 case: compound group is per-NPU (all gpus, all links) */
| 454 | + npucomp = &hose->npu->npucomp;
| 455 | + } else {
| 456 | + /* P8 case: Compound group is per-GPU (1 gpu, 2 links) */
| 457 | + npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL);
776 | 458 | }
777 | 459 |
778 | | - nphb = pci_bus_to_host(npdev->bus)->private_data;
779 | | - npu = &nphb->npu;
| 460 | + compound_group = &npucomp->table_group;
| 461 | + if (!compound_group->group) {
| 462 | + compound_group->ops = &pnv_npu_peers_ops;
| 463 | + iommu_register_group(compound_group, hose->global_number,
| 464 | + pe->pe_number);
780 | 465 |
781 | | - /*
782 | | - * Setup the NPU context table for a particular GPU. These need to be
783 | | - * per-GPU as we need the tables to filter ATSDs when there are no
784 | | - * active contexts on a particular GPU. It is safe for these to be
785 | | - * called concurrently with destroy as the OPAL call takes appropriate
786 | | - * locks and refcounts on init/destroy.
787 | | - */
788 | | - rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
789 | | - PCI_DEVID(gpdev->bus->number, gpdev->devfn));
790 | | - if (rc < 0)
791 | | - return ERR_PTR(-ENOSPC);
792 | | -
793 | | - /*
794 | | - * We store the npu pci device so we can more easily get at the
795 | | - * associated npus.
796 | | - */
797 | | - spin_lock(&npu_context_lock);
798 | | - npu_context = mm->context.npu_context;
799 | | - if (npu_context) {
800 | | - if (npu_context->release_cb != cb ||
801 | | - npu_context->priv != priv) {
802 | | - spin_unlock(&npu_context_lock);
803 | | - opal_npu_destroy_context(nphb->opal_id, mm->context.id,
804 | | - PCI_DEVID(gpdev->bus->number,
805 | | - gpdev->devfn));
806 | | - return ERR_PTR(-EINVAL);
807 | | - }
808 | | -
809 | | - WARN_ON(!kref_get_unless_zero(&npu_context->kref));
810 | | - }
811 | | - spin_unlock(&npu_context_lock);
812 | | -
813 | | - if (!npu_context) {
814 | | - /*
815 | | - * We can set up these fields without holding the
816 | | - * npu_context_lock as the npu_context hasn't been returned to
817 | | - * the caller meaning it can't be destroyed. Parallel allocation
818 | | - * is protected against by mmap_sem.
819 | | - */
820 | | - rc = -ENOMEM;
821 | | - npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
822 | | - if (npu_context) {
823 | | - kref_init(&npu_context->kref);
824 | | - npu_context->mm = mm;
825 | | - npu_context->mn.ops = &nv_nmmu_notifier_ops;
826 | | - rc = __mmu_notifier_register(&npu_context->mn, mm);
827 | | - }
828 | | -
829 | | - if (rc) {
830 | | - kfree(npu_context);
831 | | - opal_npu_destroy_context(nphb->opal_id, mm->context.id,
832 | | - PCI_DEVID(gpdev->bus->number,
833 | | - gpdev->devfn));
834 | | - return ERR_PTR(rc);
835 | | - }
836 | | -
837 | | - mm->context.npu_context = npu_context;
| 466 | + /* Steal capabilities from a GPU PE */
| 467 | + compound_group->max_dynamic_windows_supported =
| 468 | + pe->table_group.max_dynamic_windows_supported;
| 469 | + compound_group->tce32_start = pe->table_group.tce32_start;
| 470 | + compound_group->tce32_size = pe->table_group.tce32_size;
| 471 | + compound_group->max_levels = pe->table_group.max_levels;
| 472 | + if (!compound_group->pgsizes)
| 473 | + compound_group->pgsizes = pe->table_group.pgsizes;
838 | 474 | }
839 | 475 |
840 | | - npu_context->release_cb = cb;
841 | | - npu_context->priv = priv;
842 | | -
843 | 476 | /*
844 | | - * npdev is a pci_dev pointer setup by the PCI code. We assign it to
845 | | - * npdev[][] to indicate to the mmu notifiers that an invalidation
846 | | - * should also be sent over this nvlink. The notifiers don't use any
847 | | - * other fields in npu_context, so we just need to ensure that when they
848 | | - * deference npu_context->npdev[][] it is either a valid pointer or
849 | | - * NULL.
| 477 | + * The gpu would have been added to the iommu group that's created
| 478 | + * for the PE. Pull it out now.
850 | 479 | */
851 | | - WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
| 480 | + iommu_del_device(&gpdev->dev);
852 | 481 |
853 | | - if (!nphb->npu.nmmu_flush) {
854 | | - /*
855 | | - * If we're not explicitly flushing ourselves we need to mark
856 | | - * the thread for global flushes
857 | | - */
858 | | - npu_context->nmmu_flush = false;
859 | | - mm_context_add_copro(mm);
860 | | - } else
861 | | - npu_context->nmmu_flush = true;
| 482 | + /*
| 483 | + * I'm not sure this is strictly required, but it's probably a good idea
| 484 | + * since the table_group for the PE is going to be attached to the
| 485 | + * compound table group. If we leave the PE's iommu group active then
| 486 | + * we might have the same table_group being modifiable via two sepeate
| 487 | + * iommu groups.
| 488 | + */
| 489 | + iommu_group_put(pe->table_group.group);
862 | 490 |
863 | | - return npu_context;
864 | | -}
865 | | -EXPORT_SYMBOL(pnv_npu2_init_context);
| 491 | + /* now put the GPU into the compound group */
| 492 | + pnv_comp_attach_table_group(npucomp, pe);
| 493 | + iommu_add_device(compound_group, &gpdev->dev);
866 | 494 |
867 | | -static void pnv_npu2_release_context(struct kref *kref)
868 | | -{
869 | | - struct npu_context *npu_context =
870 | | - container_of(kref, struct npu_context, kref);
871 | | -
872 | | - if (!npu_context->nmmu_flush)
873 | | - mm_context_remove_copro(npu_context->mm);
874 | | -
875 | | - npu_context->mm->context.npu_context = NULL;
| 495 | + return compound_group;
876 | 496 | }
877 | 497 |
878 | | -/*
879 | | - * Destroy a context on the given GPU. May free the npu_context if it is no
880 | | - * longer active on any GPUs. Must not be called from interrupt context.
881 | | - */
882 | | -void pnv_npu2_destroy_context(struct npu_context *npu_context,
883 | | - struct pci_dev *gpdev)
| 498 | +static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
884 | 499 | {
885 | | - int removed;
886 | | - struct pnv_phb *nphb;
| 500 | + struct iommu_table_group *table_group;
| 501 | + struct npu_comp *npucomp;
| 502 | + struct pci_dev *gpdev = NULL;
| 503 | + struct pci_dev *npdev;
| 504 | + struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev);
| 505 | +
| 506 | + WARN_ON(!(pe->flags & PNV_IODA_PE_DEV));
| 507 | + if (!gpe)
| 508 | + return NULL;
| 509 | +
| 510 | + /*
| 511 | + * IODA2 bridges get this set up from pci_controller_ops::setup_bridge
| 512 | + * but NPU bridges do not have this hook defined so we do it here.
| 513 | + * We do not setup other table group parameters as they won't be used
| 514 | + * anyway - NVLink bridges are subordinate PEs.
| 515 | + */
| 516 | + pe->table_group.ops = &pnv_pci_npu_ops;
| 517 | +
| 518 | + table_group = iommu_group_get_iommudata(
| 519 | + iommu_group_get(&gpdev->dev));
| 520 | +
| 521 | + /*
| 522 | + * On P9 NPU PHB and PCI PHB support different page sizes,
| 523 | + * keep only matching. We expect here that NVLink bridge PE pgsizes is
| 524 | + * initialized by the caller.
| 525 | + */
| 526 | + table_group->pgsizes &= pe->table_group.pgsizes;
| 527 | + npucomp = container_of(table_group, struct npu_comp, table_group);
| 528 | + pnv_comp_attach_table_group(npucomp, pe);
| 529 | +
| 530 | + list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) {
| 531 | + struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev);
| 532 | +
| 533 | + if (gpdevtmp != gpdev)
| 534 | + continue;
| 535 | +
| 536 | + iommu_add_device(table_group, &npdev->dev);
| 537 | + }
| 538 | +
| 539 | + return table_group;
| 540 | +}
| 541 | +
| 542 | +void pnv_pci_npu_setup_iommu_groups(void)
| 543 | +{
| 544 | + struct pci_controller *hose;
| 545 | + struct pnv_phb *phb;
| 546 | + struct pnv_ioda_pe *pe;
| 547 | +
| 548 | + /*
| 549 | + * For non-nvlink devices the IOMMU group is registered when the PE is
| 550 | + * configured and devices are added to the group when the per-device
| 551 | + * DMA setup is run. That's done in hose->ops.dma_dev_setup() which is
| 552 | + * only initialise for "normal" IODA PHBs.
| 553 | + *
| 554 | + * For NVLink devices we need to ensure the NVLinks and the GPU end up
| 555 | + * in the same IOMMU group, so that's handled here.
| 556 | + */
| 557 | + list_for_each_entry(hose, &hose_list, list_node) {
| 558 | + phb = hose->private_data;
| 559 | +
| 560 | + if (phb->type == PNV_PHB_IODA2)
| 561 | + list_for_each_entry(pe, &phb->ioda.pe_list, list)
| 562 | + pnv_try_setup_npu_table_group(pe);
| 563 | + }
| 564 | +
| 565 | + /*
| 566 | + * Now we have all PHBs discovered, time to add NPU devices to
| 567 | + * the corresponding IOMMU groups.
| 568 | + */
| 569 | + list_for_each_entry(hose, &hose_list, list_node) {
| 570 | + unsigned long pgsizes;
| 571 | +
| 572 | + phb = hose->private_data;
| 573 | +
| 574 | + if (phb->type != PNV_PHB_NPU_NVLINK)
| 575 | + continue;
| 576 | +
| 577 | + pgsizes = pnv_ioda_parse_tce_sizes(phb);
| 578 | + list_for_each_entry(pe, &phb->ioda.pe_list, list) {
| 579 | + /*
| 580 | + * IODA2 bridges get this set up from
| 581 | + * pci_controller_ops::setup_bridge but NPU bridges
| 582 | + * do not have this hook defined so we do it here.
| 583 | + */
| 584 | + pe->table_group.pgsizes = pgsizes;
| 585 | + pnv_npu_compound_attach(pe);
| 586 | + }
| 587 | + }
| 588 | +}
| 589 | +#endif /* CONFIG_IOMMU_API */
| 590 | +
| 591 | +int pnv_npu2_init(struct pci_controller *hose)
| 592 | +{
| 593 | + static int npu_index;
887 | 594 | struct npu *npu;
888 | | - struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
889 | | - struct device_node *nvlink_dn;
890 | | - u32 nvlink_index;
| 595 | + int ret;
891 | 596 |
892 | | - if (WARN_ON(!npdev))
893 | | - return;
| 597 | + npu = kzalloc(sizeof(*npu), GFP_KERNEL);
| 598 | + if (!npu)
| 599 | + return -ENOMEM;
894 | 600 |
895 | | - if (!firmware_has_feature(FW_FEATURE_OPAL))
896 | | - return;
897 | | -
898 | | - nphb = pci_bus_to_host(npdev->bus)->private_data;
899 | | - npu = &nphb->npu;
900 | | - nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
901 | | - if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
902 | | - &nvlink_index)))
903 | | - return;
904 | | - WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
905 | | - opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
906 | | - PCI_DEVID(gpdev->bus->number, gpdev->devfn));
907 | | - spin_lock(&npu_context_lock);
908 | | - removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
909 | | - spin_unlock(&npu_context_lock);
910 | | -
911 | | - /*
912 | | - * We need to do this outside of pnv_npu2_release_context so that it is
913 | | - * outside the spinlock as mmu_notifier_destroy uses SRCU.
914 | | - */
915 | | - if (removed) {
916 | | - mmu_notifier_unregister(&npu_context->mn,
917 | | - npu_context->mm);
918 | | -
919 | | - kfree(npu_context);
| 601 | + npu_index++;
| 602 | + if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
| 603 | + ret = -ENOSPC;
| 604 | + goto fail_exit;
920 | 605 | }
| 606 | + npu->index = npu_index;
| 607 | + hose->npu = npu;
921 | 608 |
| 609 | + return 0;
| 610 | +
| 611 | +fail_exit:
| 612 | + kfree(npu);
| 613 | + return ret;
922 | 614 | }
923 | | -EXPORT_SYMBOL(pnv_npu2_destroy_context);
924 | 615 |
925 | | -/*
926 | | - * Assumes mmap_sem is held for the contexts associated mm.
927 | | - */
928 | | -int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
929 | | - unsigned long *flags, unsigned long *status, int count)
| 616 | +int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
| 617 | + unsigned long msr)
930 | 618 | {
931 | | - u64 rc = 0, result = 0;
932 | | - int i, is_write;
933 | | - struct page *page[1];
| 619 | + int ret;
| 620 | + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
| 621 | + struct pci_controller *hose;
| 622 | + struct pnv_phb *nphb;
934 | 623 |
935 | | - /* mmap_sem should be held so the struct_mm must be present */
936 | | - struct mm_struct *mm = context->mm;
937 | | -
938 | | - if (!firmware_has_feature(FW_FEATURE_OPAL))
| 624 | + if (!npdev)
939 | 625 | return -ENODEV;
940 | 626 |
941 | | - WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
942 | | -
943 | | - for (i = 0; i < count; i++) {
944 | | - is_write = flags[i] & NPU2_WRITE;
945 | | - rc = get_user_pages_remote(NULL, mm, ea[i], 1,
946 | | - is_write ? FOLL_WRITE : 0,
947 | | - page, NULL, NULL);
948 | | -
949 | | - /*
950 | | - * To support virtualised environments we will have to do an
951 | | - * access to the page to ensure it gets faulted into the
952 | | - * hypervisor. For the moment virtualisation is not supported in
953 | | - * other areas so leave the access out.
954 | | - */
955 | | - if (rc != 1) {
956 | | - status[i] = rc;
957 | | - result = -EFAULT;
958 | | - continue;
959 | | - }
960 | | -
961 | | - status[i] = 0;
962 | | - put_page(page[0]);
| 627 | + hose = pci_bus_to_host(npdev->bus);
| 628 | + if (hose->npu == NULL) {
| 629 | + dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
| 630 | + return 0;
963 | 631 | }
964 | 632 |
965 | | - return result;
966 | | -}
967 | | -EXPORT_SYMBOL(pnv_npu2_handle_fault);
| 633 | + nphb = hose->private_data;
968 | 634 |
969 | | -int pnv_npu2_init(struct pnv_phb *phb)
970 | | -{
971 | | - unsigned int i;
972 | | - u64 mmio_atsd;
973 | | - struct device_node *dn;
974 | | - struct pci_dev *gpdev;
975 | | - static int npu_index;
976 | | - uint64_t rc = 0;
977 | | -
978 | | - if (!atsd_threshold_dentry) {
979 | | - atsd_threshold_dentry = debugfs_create_x64("atsd_threshold",
980 | | - 0600, powerpc_debugfs_root, &atsd_threshold);
| 635 | + dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n",
| 636 | + nphb->opal_id, lparid);
| 637 | + /*
| 638 | + * Currently we only support radix and non-zero LPCR only makes sense
| 639 | + * for hash tables so skiboot expects the LPCR parameter to be a zero.
| 640 | + */
| 641 | + ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), lparid,
| 642 | + 0 /* LPCR bits */);
| 643 | + if (ret) {
| 644 | + dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
| 645 | + return ret;
981 | 646 | }
982 | 647 |
983 | | - phb->npu.nmmu_flush =
984 | | - of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
985 | | - for_each_child_of_node(phb->hose->dn, dn) {
986 | | - gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
987 | | - if (gpdev) {
988 | | - rc = opal_npu_map_lpar(phb->opal_id,
989 | | - PCI_DEVID(gpdev->bus->number, gpdev->devfn),
990 | | - 0, 0);
991 | | - if (rc)
992 | | - dev_err(&gpdev->dev,
993 | | - "Error %lld mapping device to LPAR\n",
994 | | - rc);
995 | | - }
996 | | - }
997 | | -
998 | | - for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
999 | | - i, &mmio_atsd); i++)
1000 | | - phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
1001 | | -
1002 | | - pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i);
1003 | | - phb->npu.mmio_atsd_count = i;
1004 | | - phb->npu.mmio_atsd_usage = 0;
1005 | | - npu_index++;
1006 | | - if (WARN_ON(npu_index >= NV_MAX_NPUS))
1007 | | - return -ENOSPC;
1008 | | - max_npu2_index = npu_index;
1009 | | - phb->npu.index = npu_index;
| 648 | + dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n",
| 649 | + nphb->opal_id, msr);
| 650 | + ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr,
| 651 | + pci_dev_id(gpdev));
| 652 | + if (ret < 0)
| 653 | + dev_err(&gpdev->dev, "Failed to init context: %d\n", ret);
| 654 | + else
| 655 | + ret = 0;
1010 | 656 |
1011 | 657 | return 0;
1012 | 658 | }
| 659 | +EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev);
| 660 | +
| 661 | +void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr)
| 662 | +{
| 663 | + struct pci_dev *gpdev;
| 664 | +
| 665 | + list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list)
| 666 | + pnv_npu2_map_lpar_dev(gpdev, 0, msr);
| 667 | +}
| 668 | +
| 669 | +int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev)
| 670 | +{
| 671 | + int ret;
| 672 | + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
| 673 | + struct pci_controller *hose;
| 674 | + struct pnv_phb *nphb;
| 675 | +
| 676 | + if (!npdev)
| 677 | + return -ENODEV;
| 678 | +
| 679 | + hose = pci_bus_to_host(npdev->bus);
| 680 | + if (hose->npu == NULL) {
| 681 | + dev_info_once(&npdev->dev, "Nvlink1 does not support contexts");
| 682 | + return 0;
| 683 | + }
| 684 | +
| 685 | + nphb = hose->private_data;
| 686 | +
| 687 | + dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n",
| 688 | + nphb->opal_id);
| 689 | + ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/,
| 690 | + pci_dev_id(gpdev));
| 691 | + if (ret < 0) {
| 692 | + dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret);
| 693 | + return ret;
| 694 | + }
| 695 | +
| 696 | + /* Set LPID to 0 anyway, just to be safe */
| 697 | + dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id);
| 698 | + ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), 0 /*LPID*/,
| 699 | + 0 /* LPCR bits */);
| 700 | + if (ret)
| 701 | + dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
| 702 | +
| 703 | + return ret;
| 704 | +}
| 705 | +EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev);
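
The new table_group callbacks introduced above (pnv_npu_set_window(), pnv_npu_unset_window(), pnv_npu_take_ownership() and the pnv_npu_peers_*() helpers) all recover their owning pnv_ioda_pe or npu_comp from the iommu_table_group pointer they are handed, using container_of() on the embedded table_group member. Below is a minimal, self-contained user-space sketch of that pattern; the structure names are simplified stand-ins for illustration only, not the real pnv_ioda_pe or iommu_table_group definitions.

#include <stddef.h>
#include <stdio.h>

/*
 * container_of(): recover a pointer to the enclosing structure from a
 * pointer to one of its members, as the table_group callbacks in the
 * patch do for pnv_ioda_pe and npu_comp.
 */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct table_group {			/* stand-in for iommu_table_group */
	int id;
};

struct pe {				/* stand-in for pnv_ioda_pe */
	int pe_number;
	struct table_group table_group;	/* embedded, not a pointer */
};

/* A callback that is only handed the embedded member... */
static void take_ownership(struct table_group *table_group)
{
	/* ...and gets the owning PE back without any global lookup. */
	struct pe *pe = container_of(table_group, struct pe, table_group);

	printf("taking ownership of PE %d\n", pe->pe_number);
}

int main(void)
{
	struct pe pe = { .pe_number = 4 };

	take_ownership(&pe.table_group);	/* prints "taking ownership of PE 4" */
	return 0;
}

Keeping the ops signatures in terms of iommu_table_group alone (rather than pnv_ioda_pe) is what lets the same iommu_table_group_ops interface serve both the per-NVLink-bridge PEs and the compound npu_comp group added by this patch.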