forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-10 cde9070d9970eef1f7ec2360586c802a16230ad8
kernel/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Support PCI/PCIe on PowerNV platforms
  *
  * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
  */
 
 #undef DEBUG
@@ -17,11 +13,10 @@
 #include <linux/delay.h>
 #include <linux/string.h>
 #include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
-#include <linux/memblock.h>
 #include <linux/iommu.h>
 #include <linux/rculist.h>
 #include <linux/sizes.h>
@@ -54,6 +49,9 @@
 
 static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
					      "NPU_OCAPI" };
+
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+static void pnv_pci_configure_bus(struct pci_bus *bus);
 
 void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
			const char *fmt, ...)
@@ -117,32 +115,13 @@
 
 early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
 
-static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
-{
-	/*
-	 * WARNING: We cannot rely on the resource flags. The Linux PCI
-	 * allocation code sometimes decides to put a 64-bit prefetchable
-	 * BAR in the 32-bit window, so we have to compare the addresses.
-	 *
-	 * For simplicity we only test resource start.
-	 */
-	return (r->start >= phb->ioda.m64_base &&
-		r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
-}
-
-static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
-{
-	unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
-
-	return (resource_flags & flags) == flags;
-}
-
 static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
 {
	s64 rc;
 
	phb->ioda.pe_array[pe_no].phb = phb;
	phb->ioda.pe_array[pe_no].pe_number = pe_no;
+	phb->ioda.pe_array[pe_no].dma_setup_done = false;
 
	/*
	 * Clear the PE frozen state as it might be put into frozen state
@@ -166,34 +145,60 @@
		return;
	}
 
+	mutex_lock(&phb->ioda.pe_alloc_mutex);
	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
		pr_debug("%s: PE %x was reserved on PHB#%x\n",
			 __func__, pe_no, phb->hose->global_number);
+	mutex_unlock(&phb->ioda.pe_alloc_mutex);
 
	pnv_ioda_init_pe(phb, pe_no);
 }
 
-static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
+struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count)
 {
-	long pe;
+	struct pnv_ioda_pe *ret = NULL;
+	int run = 0, pe, i;
 
+	mutex_lock(&phb->ioda.pe_alloc_mutex);
+
+	/* scan backwards for a run of @count cleared bits */
	for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
-		if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
-			return pnv_ioda_init_pe(phb, pe);
-	}
+		if (test_bit(pe, phb->ioda.pe_alloc)) {
+			run = 0;
+			continue;
+		}
 
-	return NULL;
+		run++;
+		if (run == count)
+			break;
+	}
+	if (run != count)
+		goto out;
+
+	for (i = pe; i < pe + count; i++) {
+		set_bit(i, phb->ioda.pe_alloc);
+		pnv_ioda_init_pe(phb, i);
+	}
+	ret = &phb->ioda.pe_array[pe];
+
+out:
+	mutex_unlock(&phb->ioda.pe_alloc_mutex);
+	return ret;
 }
 
-static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
+void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
 {
	struct pnv_phb *phb = pe->phb;
	unsigned int pe_num = pe->pe_number;
 
	WARN_ON(pe->pdev);
-
+	WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */
+	kfree(pe->npucomp);
	memset(pe, 0, sizeof(struct pnv_ioda_pe));
+
+	mutex_lock(&phb->ioda.pe_alloc_mutex);
	clear_bit(pe_num, phb->ioda.pe_alloc);
+	mutex_unlock(&phb->ioda.pe_alloc_mutex);
 }
 
 /* The default M64 BAR is shared by all PEs */
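The reworked pnv_ioda_alloc_pe() above scans the PE allocation bitmap from the top down for a run of @count clear bits, so a caller can reserve several consecutive PE numbers under one mutex. A minimal userspace sketch of that scan, with an invented TOTAL_PE and a plain bool array standing in for the kernel bitmap:

#include <stdbool.h>
#include <stdio.h>

#define TOTAL_PE 64

static bool pe_alloc[TOTAL_PE];	/* true = PE number already in use */

/* Returns the first PE of the run, or -1 if no run of @count is free. */
static int alloc_pe_run(int count)
{
	int run = 0, pe, i;

	for (pe = TOTAL_PE - 1; pe >= 0; pe--) {
		if (pe_alloc[pe]) {	/* run broken by an allocated bit */
			run = 0;
			continue;
		}
		if (++run == count)	/* pe is now the low end of the run */
			break;
	}
	if (run != count)
		return -1;

	for (i = pe; i < pe + count; i++)	/* claim the whole run */
		pe_alloc[i] = true;
	return pe;
}

int main(void)
{
	pe_alloc[62] = true;	/* fragment the top of the map */
	printf("run of 4 starts at PE#%d\n", alloc_pe_run(4));	/* prints 58 */
	return 0;
}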
@@ -253,8 +258,7 @@
 static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
					unsigned long *pe_bitmap)
 {
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
	struct resource *r;
	resource_size_t base, sgsz, start, end;
	int segno, i;
@@ -266,8 +270,8 @@
		if (!r->parent || !pnv_pci_is_m64(phb, r))
			continue;
 
-		start = _ALIGN_DOWN(r->start - base, sgsz);
-		end = _ALIGN_UP(r->end - base, sgsz);
+		start = ALIGN_DOWN(r->start - base, sgsz);
+		end = ALIGN(r->end - base, sgsz);
		for (segno = start / sgsz; segno < end / sgsz; segno++) {
			if (pe_bitmap)
				set_bit(segno, pe_bitmap);
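The ALIGN_DOWN/ALIGN change above rounds a BAR's offsets inside the M64 window out to segment granularity before reserving each covered segment number. A small standalone sketch of that arithmetic, with invented window and BAR values:

#include <stdio.h>

/* power-of-two align helpers, in the spirit of the kernel's <linux/align.h> */
#define ALIGN_DOWN(x, a)	((x) & ~((unsigned long long)(a) - 1))
#define ALIGN(x, a)		ALIGN_DOWN((x) + (a) - 1, a)

int main(void)
{
	unsigned long long base = 0x100000000ULL;		/* M64 window base */
	unsigned long long sgsz = 0x2000000ULL;			/* 32M per segment */
	unsigned long long r_start = base + 0x04c00000ULL;	/* example BAR start */
	unsigned long long r_end = base + 0x08400000ULL;	/* example BAR end */

	unsigned long long start = ALIGN_DOWN(r_start - base, sgsz);
	unsigned long long end = ALIGN(r_end - base, sgsz);

	/* every segment the BAR touches gets its bit reserved */
	for (unsigned long long segno = start / sgsz; segno < end / sgsz; segno++)
		printf("reserve segment/PE %llu\n", segno);	/* 2, 3, 4 */
	return 0;
}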
@@ -312,6 +316,28 @@
		}
	}
 
+	for (index = 0; index < phb->ioda.total_pe_num; index++) {
+		int64_t rc;
+
+		/*
+		 * P7IOC supports M64DT, which helps mapping M64 segment
+		 * to one particular PE#. However, PHB3 has fixed mapping
+		 * between M64 segment and PE#. In order to have same logic
+		 * for P7IOC and PHB3, we enforce fixed mapping between M64
+		 * segment and PE# on P7IOC.
+		 */
+		rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+				index, OPAL_M64_WINDOW_TYPE,
+				index / PNV_IODA1_M64_SEGS,
+				index % PNV_IODA1_M64_SEGS);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
+				__func__, rc, phb->hose->global_number,
+				index);
+			goto fail;
+		}
+	}
+
	/*
	 * Exclude the segments for reserved and root bus PE, which
	 * are first or last two PEs.
@@ -352,8 +378,7 @@
 
 static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
 {
-	struct pci_controller *hose = pci_bus_to_host(bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
	struct pnv_ioda_pe *master_pe, *pe;
	unsigned long size, *pe_alloc;
	int i;
@@ -363,7 +388,7 @@
		return NULL;
 
	/* Allocate bitmap */
-	size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
+	size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
	pe_alloc = kzalloc(size, GFP_KERNEL);
	if (!pe_alloc) {
		pr_warn("%s: Out of memory !\n",
@@ -403,26 +428,6 @@
			pe->flags |= PNV_IODA_PE_SLAVE;
			pe->master = master_pe;
			list_add_tail(&pe->list, &master_pe->slaves);
-		}
-
-		/*
-		 * P7IOC supports M64DT, which helps mapping M64 segment
-		 * to one particular PE#. However, PHB3 has fixed mapping
-		 * between M64 segment and PE#. In order to have same logic
-		 * for P7IOC and PHB3, we enforce fixed mapping between M64
-		 * segment and PE# on P7IOC.
-		 */
-		if (phb->type == PNV_PHB_IODA1) {
-			int64_t rc;
-
-			rc = opal_pci_map_pe_mmio_window(phb->opal_id,
-					pe->pe_number, OPAL_M64_WINDOW_TYPE,
-					pe->pe_number / PNV_IODA1_M64_SEGS,
-					pe->pe_number % PNV_IODA1_M64_SEGS);
-			if (rc != OPAL_SUCCESS)
-				pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
-					__func__, rc, phb->hose->global_number,
-					pe->pe_number);
		}
	}
 
@@ -518,8 +523,6 @@
		phb->init_m64 = pnv_ioda1_init_m64;
	else
		phb->init_m64 = pnv_ioda2_init_m64;
-	phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe;
-	phb->pick_m64_pe = pnv_ioda_pick_m64_pe;
 }
 
 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
@@ -664,14 +667,19 @@
	return state;
 }
 
-/* Currently those 2 are only used when MSIs are enabled, this will change
- * but in the meantime, we need to protect them to avoid warnings
- */
-#ifdef CONFIG_PCI_MSI
+struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn)
+{
+	int pe_number = phb->ioda.pe_rmap[bdfn];
+
+	if (pe_number == IODA_INVALID_PE)
+		return NULL;
+
+	return &phb->ioda.pe_array[pe_number];
+}
+
 struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
 {
-	struct pci_controller *hose = pci_bus_to_host(dev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
	struct pci_dn *pdn = pci_get_pdn(dev);
 
	if (!pdn)
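pnv_pci_bdfn_to_pe() added above turns a Routing ID (bus << 8 | devfn) into a PE with a single read of the per-PHB reverse map. A trimmed userspace model of that lookup; the structure layout and sizes here are stand-ins, not the kernel's:

#include <stdio.h>

#define IODA_INVALID_PE	(-1)

struct pe_model { int pe_number; };

static struct {
	int pe_rmap[65536];		/* one slot per 16-bit RID */
	struct pe_model pe_array[512];
} phb;

static struct pe_model *bdfn_to_pe(unsigned short bdfn)
{
	int pe_number = phb.pe_rmap[bdfn];

	if (pe_number == IODA_INVALID_PE)
		return NULL;
	return &phb.pe_array[pe_number];
}

int main(void)
{
	for (int i = 0; i < 65536; i++)
		phb.pe_rmap[i] = IODA_INVALID_PE;

	/* associate bus 1, devfn 0x08 with PE#5, as configure_pe would */
	phb.pe_rmap[(1 << 8) | 0x08] = 5;
	phb.pe_array[5].pe_number = 5;

	struct pe_model *pe = bdfn_to_pe((1 << 8) | 0x08);
	printf("PE#%d\n", pe ? pe->pe_number : -1);	/* PE#5 */
	return 0;
}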
@@ -680,7 +688,6 @@
		return NULL;
	return &phb->ioda.pe_array[pdn->pe_number];
 }
-#endif /* CONFIG_PCI_MSI */
 
 static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
				  struct pnv_ioda_pe *parent,
@@ -786,7 +793,35 @@
	return 0;
 }
 
-static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+static void pnv_ioda_unset_peltv(struct pnv_phb *phb,
+				 struct pnv_ioda_pe *pe,
+				 struct pci_dev *parent)
+{
+	int64_t rc;
+
+	while (parent) {
+		struct pci_dn *pdn = pci_get_pdn(parent);
+
+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+						pe->pe_number,
+						OPAL_REMOVE_PE_FROM_DOMAIN);
+			/* XXX What to do in case of error ? */
+		}
+		parent = parent->bus->self;
+	}
+
+	opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
+				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+	/* Disassociate PE in PELT */
+	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
+				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+	if (rc)
+		pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
+}
+
+int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 {
	struct pci_dev *parent;
	uint8_t bcomp, dcomp, fcomp;
@@ -801,7 +836,7 @@
	fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
	parent = pe->pbus->self;
	if (pe->flags & PNV_IODA_PE_BUS_ALL)
-		count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+		count = resource_size(&pe->pbus->busn_res);
	else
		count = 1;
 
@@ -836,29 +871,17 @@
	for (rid = pe->rid; rid < rid_end; rid++)
		phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
 
-	/* Release from all parents PELT-V */
-	while (parent) {
-		struct pci_dn *pdn = pci_get_pdn(parent);
-		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
-			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
-				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
-			/* XXX What to do in case of error ? */
-		}
-		parent = parent->bus->self;
-	}
+	/*
+	 * Release from all parents PELT-V. NPUs don't have a PELTV
+	 * table
+	 */
+	if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
+		pnv_ioda_unset_peltv(phb, pe, parent);
 
-	opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
-				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
-
-	/* Disassociate PE in PELT */
-	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
-				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
-	if (rc)
-		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
	if (rc)
-		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+		pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc);
 
	pe->pbus = NULL;
	pe->pdev = NULL;
@@ -869,9 +892,8 @@
	return 0;
 }
 
-static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 {
-	struct pci_dev *parent;
	uint8_t bcomp, dcomp, fcomp;
	long rc, rid_end, rid;
 
@@ -881,9 +903,8 @@
 
	dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
	fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
-	parent = pe->pbus->self;
	if (pe->flags & PNV_IODA_PE_BUS_ALL)
-		count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+		count = resource_size(&pe->pbus->busn_res);
	else
		count = 1;
 
@@ -902,12 +923,6 @@
		}
		rid_end = pe->rid + (count << 8);
	} else {
-#ifdef CONFIG_PCI_IOV
-		if (pe->flags & PNV_IODA_PE_VF)
-			parent = pe->parent_dev;
-		else
-#endif /* CONFIG_PCI_IOV */
-			parent = pe->pdev->bus->self;
		bcomp = OpalPciBusAll;
		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
@@ -964,95 +979,9 @@
	return 0;
 }
 
-#ifdef CONFIG_PCI_IOV
-static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
-{
-	struct pci_dn *pdn = pci_get_pdn(dev);
-	int i;
-	struct resource *res, res2;
-	resource_size_t size;
-	u16 num_vfs;
-
-	if (!dev->is_physfn)
-		return -EINVAL;
-
-	/*
-	 * "offset" is in VFs. The M64 windows are sized so that when they
-	 * are segmented, each segment is the same size as the IOV BAR.
-	 * Each segment is in a separate PE, and the high order bits of the
-	 * address are the PE number. Therefore, each VF's BAR is in a
-	 * separate PE, and changing the IOV BAR start address changes the
-	 * range of PEs the VFs are in.
-	 */
-	num_vfs = pdn->num_vfs;
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
-		res = &dev->resource[i + PCI_IOV_RESOURCES];
-		if (!res->flags || !res->parent)
-			continue;
-
-		/*
-		 * The actual IOV BAR range is determined by the start address
-		 * and the actual size for num_vfs VFs BAR. This check is to
-		 * make sure that after shifting, the range will not overlap
-		 * with another device.
-		 */
-		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
-		res2.flags = res->flags;
-		res2.start = res->start + (size * offset);
-		res2.end = res2.start + (size * num_vfs) - 1;
-
-		if (res2.end > res->end) {
-			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
-				i, &res2, res, num_vfs, offset);
-			return -EBUSY;
-		}
-	}
-
-	/*
-	 * Since M64 BAR shares segments among all possible 256 PEs,
-	 * we have to shift the beginning of PF IOV BAR to make it start from
-	 * the segment which belongs to the PE number assigned to the first VF.
-	 * This creates a "hole" in the /proc/iomem which could be used for
-	 * allocating other resources so we reserve this area below and
-	 * release when IOV is released.
-	 */
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
-		res = &dev->resource[i + PCI_IOV_RESOURCES];
-		if (!res->flags || !res->parent)
-			continue;
-
-		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
-		res2 = *res;
-		res->start += size * offset;
-
-		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
-			 i, &res2, res, (offset > 0) ? "En" : "Dis",
-			 num_vfs, offset);
-
-		if (offset < 0) {
-			devm_release_resource(&dev->dev, &pdn->holes[i]);
-			memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
-		}
-
-		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
-
-		if (offset > 0) {
-			pdn->holes[i].start = res2.start;
-			pdn->holes[i].end = res2.start + size * offset - 1;
-			pdn->holes[i].flags = IORESOURCE_BUS;
-			pdn->holes[i].name = "pnv_iov_reserved";
-			devm_request_resource(&dev->dev, res->parent,
-					&pdn->holes[i]);
-		}
-	}
-	return 0;
-}
-#endif /* CONFIG_PCI_IOV */
-
 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 {
-	struct pci_controller *hose = pci_bus_to_host(dev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
	struct pci_dn *pdn = pci_get_pdn(dev);
	struct pnv_ioda_pe *pe;
 
@@ -1064,27 +993,27 @@
	if (pdn->pe_number != IODA_INVALID_PE)
		return NULL;
 
-	pe = pnv_ioda_alloc_pe(phb);
+	pe = pnv_ioda_alloc_pe(phb, 1);
	if (!pe) {
		pr_warn("%s: Not enough PE# available, disabling device\n",
			pci_name(dev));
		return NULL;
	}
 
-	/* NOTE: We get only one ref to the pci_dev for the pdn, not for the
-	 * pointer in the PE data structure, both should be destroyed at the
-	 * same time. However, this needs to be looked at more closely again
-	 * once we actually start removing things (Hotplug, SR-IOV, ...)
+	/* NOTE: We don't get a reference for the pointer in the PE
	 * data structure, both the device and PE structures should be
	 * destroyed at the same time. However, removing nvlink
	 * devices will need some work.
	 *
	 * At some point we want to remove the PDN completely anyways
	 */
-	pci_dev_get(dev);
	pdn->pe_number = pe->pe_number;
	pe->flags = PNV_IODA_PE_DEV;
	pe->pdev = dev;
	pe->pbus = NULL;
	pe->mve_number = -1;
	pe->rid = dev->bus->number << 8 | pdn->devfn;
+	pe->device_count++;
 
	pe_info(pe, "Associated device to PE\n");
 
@@ -1093,42 +1022,14 @@
		pnv_ioda_free_pe(pe);
		pdn->pe_number = IODA_INVALID_PE;
		pe->pdev = NULL;
-		pci_dev_put(dev);
		return NULL;
	}
 
	/* Put PE to the list */
+	mutex_lock(&phb->ioda.pe_list_mutex);
	list_add_tail(&pe->list, &phb->ioda.pe_list);
-
+	mutex_unlock(&phb->ioda.pe_list_mutex);
	return pe;
-}
-
-static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
-{
-	struct pci_dev *dev;
-
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		struct pci_dn *pdn = pci_get_pdn(dev);
-
-		if (pdn == NULL) {
-			pr_warn("%s: No device node associated with device !\n",
-				pci_name(dev));
-			continue;
-		}
-
-		/*
-		 * In partial hotplug case, the PCI device might be still
-		 * associated with the PE and needn't attach it to the PE
-		 * again.
-		 */
-		if (pdn->pe_number != IODA_INVALID_PE)
-			continue;
-
-		pe->device_count++;
-		pdn->pe_number = pe->pe_number;
-		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
-			pnv_ioda_setup_same_PE(dev->subordinate, pe);
-	}
 }
 
 /*
@@ -1139,8 +1040,7 @@
  */
 static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
 {
-	struct pci_controller *hose = pci_bus_to_host(bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
	struct pnv_ioda_pe *pe = NULL;
	unsigned int pe_num;
 
@@ -1149,24 +1049,22 @@
	 * We should reuse it instead of allocating a new one.
	 */
	pe_num = phb->ioda.pe_rmap[bus->number << 8];
-	if (pe_num != IODA_INVALID_PE) {
+	if (WARN_ON(pe_num != IODA_INVALID_PE)) {
		pe = &phb->ioda.pe_array[pe_num];
-		pnv_ioda_setup_same_PE(bus, pe);
		return NULL;
	}
 
	/* PE number for root bus should have been reserved */
-	if (pci_is_root_bus(bus) &&
-	    phb->ioda.root_pe_idx != IODA_INVALID_PE)
+	if (pci_is_root_bus(bus))
		pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
 
	/* Check if PE is determined by M64 */
-	if (!pe && phb->pick_m64_pe)
-		pe = phb->pick_m64_pe(bus, all);
+	if (!pe)
+		pe = pnv_ioda_pick_m64_pe(bus, all);
 
	/* The PE number isn't pinned by M64 */
	if (!pe)
-		pe = pnv_ioda_alloc_pe(phb);
+		pe = pnv_ioda_alloc_pe(phb, 1);
 
	if (!pe) {
		pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
@@ -1181,11 +1079,12 @@
	pe->rid = bus->busn_res.start << 8;
 
	if (all)
-		pe_info(pe, "Secondary bus %d..%d associated with PE#%x\n",
-			bus->busn_res.start, bus->busn_res.end, pe->pe_number);
+		pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n",
+			&bus->busn_res.start, &bus->busn_res.end,
+			pe->pe_number);
	else
-		pe_info(pe, "Secondary bus %d associated with PE#%x\n",
-			bus->busn_res.start, pe->pe_number);
+		pe_info(pe, "Secondary bus %pad associated with PE#%x\n",
+			&bus->busn_res.start, pe->pe_number);
 
	if (pnv_ioda_configure_pe(phb, pe)) {
		/* XXX What do we do here ? */
@@ -1193,9 +1092,6 @@
		pe->pbus = NULL;
		return NULL;
	}
-
-	/* Associate it with all child devices */
-	pnv_ioda_setup_same_PE(bus, pe);
 
	/* Put PE to the list */
	list_add_tail(&pe->list, &phb->ioda.pe_list);
@@ -1210,8 +1106,15 @@
	struct pnv_ioda_pe *pe;
	struct pci_dev *gpu_pdev;
	struct pci_dn *npu_pdn;
-	struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus);
+
+	/*
+	 * Intentionally leak a reference on the npu device (for
+	 * nvlink only; this is not an opencapi path) to make sure it
+	 * never goes away, as it's been the case all along and some
+	 * work is needed otherwise.
+	 */
+	pci_dev_get(npu_pdev);
 
	/*
	 * Due to a hardware errata PE#0 on the NPU is reserved for
@@ -1236,11 +1139,11 @@
	 */
	dev_info(&npu_pdev->dev,
		"Associating to existing PE %x\n", pe_num);
-	pci_dev_get(npu_pdev);
	npu_pdn = pci_get_pdn(npu_pdev);
	rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
	npu_pdn->pe_number = pe_num;
	phb->ioda.pe_rmap[rid] = pe->pe_number;
+	pe->device_count++;
 
	/* Map the PE to this link */
	rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
12461149 rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
....@@ -1272,514 +1175,97 @@
12721175 pnv_ioda_setup_npu_PE(pdev);
12731176 }
12741177
1275
-static void pnv_pci_ioda_setup_PEs(void)
1178
+static void pnv_pci_ioda_setup_nvlink(void)
12761179 {
1277
- struct pci_controller *hose, *tmp;
1180
+ struct pci_controller *hose;
12781181 struct pnv_phb *phb;
1279
- struct pci_bus *bus;
1280
- struct pci_dev *pdev;
1182
+ struct pnv_ioda_pe *pe;
12811183
1282
- list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
1184
+ list_for_each_entry(hose, &hose_list, list_node) {
12831185 phb = hose->private_data;
12841186 if (phb->type == PNV_PHB_NPU_NVLINK) {
12851187 /* PE#0 is needed for error reporting */
12861188 pnv_ioda_reserve_pe(phb, 0);
12871189 pnv_ioda_setup_npu_PEs(hose->bus);
12881190 if (phb->model == PNV_PHB_MODEL_NPU2)
1289
- pnv_npu2_init(phb);
1290
- }
1291
- if (phb->type == PNV_PHB_NPU_OCAPI) {
1292
- bus = hose->bus;
1293
- list_for_each_entry(pdev, &bus->devices, bus_list)
1294
- pnv_ioda_setup_dev_PE(pdev);
1191
+ WARN_ON_ONCE(pnv_npu2_init(hose));
12951192 }
12961193 }
1297
-}
1298
-
1299
-#ifdef CONFIG_PCI_IOV
1300
-static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
1301
-{
1302
- struct pci_bus *bus;
1303
- struct pci_controller *hose;
1304
- struct pnv_phb *phb;
1305
- struct pci_dn *pdn;
1306
- int i, j;
1307
- int m64_bars;
1308
-
1309
- bus = pdev->bus;
1310
- hose = pci_bus_to_host(bus);
1311
- phb = hose->private_data;
1312
- pdn = pci_get_pdn(pdev);
1313
-
1314
- if (pdn->m64_single_mode)
1315
- m64_bars = num_vfs;
1316
- else
1317
- m64_bars = 1;
1318
-
1319
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
1320
- for (j = 0; j < m64_bars; j++) {
1321
- if (pdn->m64_map[j][i] == IODA_INVALID_M64)
1322
- continue;
1323
- opal_pci_phb_mmio_enable(phb->opal_id,
1324
- OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
1325
- clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
1326
- pdn->m64_map[j][i] = IODA_INVALID_M64;
1327
- }
1328
-
1329
- kfree(pdn->m64_map);
1330
- return 0;
1331
-}
1332
-
1333
-static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
1334
-{
1335
- struct pci_bus *bus;
1336
- struct pci_controller *hose;
1337
- struct pnv_phb *phb;
1338
- struct pci_dn *pdn;
1339
- unsigned int win;
1340
- struct resource *res;
1341
- int i, j;
1342
- int64_t rc;
1343
- int total_vfs;
1344
- resource_size_t size, start;
1345
- int pe_num;
1346
- int m64_bars;
1347
-
1348
- bus = pdev->bus;
1349
- hose = pci_bus_to_host(bus);
1350
- phb = hose->private_data;
1351
- pdn = pci_get_pdn(pdev);
1352
- total_vfs = pci_sriov_get_totalvfs(pdev);
1353
-
1354
- if (pdn->m64_single_mode)
1355
- m64_bars = num_vfs;
1356
- else
1357
- m64_bars = 1;
1358
-
1359
- pdn->m64_map = kmalloc_array(m64_bars,
1360
- sizeof(*pdn->m64_map),
1361
- GFP_KERNEL);
1362
- if (!pdn->m64_map)
1363
- return -ENOMEM;
1364
- /* Initialize the m64_map to IODA_INVALID_M64 */
1365
- for (i = 0; i < m64_bars ; i++)
1366
- for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
1367
- pdn->m64_map[i][j] = IODA_INVALID_M64;
1368
-
1369
-
1370
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
1371
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
1372
- if (!res->flags || !res->parent)
1194
+ list_for_each_entry(hose, &hose_list, list_node) {
1195
+ phb = hose->private_data;
1196
+ if (phb->type != PNV_PHB_IODA2)
13731197 continue;
13741198
1375
- for (j = 0; j < m64_bars; j++) {
1376
- do {
1377
- win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
1378
- phb->ioda.m64_bar_idx + 1, 0);
1379
-
1380
- if (win >= phb->ioda.m64_bar_idx + 1)
1381
- goto m64_failed;
1382
- } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
1383
-
1384
- pdn->m64_map[j][i] = win;
1385
-
1386
- if (pdn->m64_single_mode) {
1387
- size = pci_iov_resource_size(pdev,
1388
- PCI_IOV_RESOURCES + i);
1389
- start = res->start + size * j;
1390
- } else {
1391
- size = resource_size(res);
1392
- start = res->start;
1393
- }
1394
-
1395
- /* Map the M64 here */
1396
- if (pdn->m64_single_mode) {
1397
- pe_num = pdn->pe_num_map[j];
1398
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
1399
- pe_num, OPAL_M64_WINDOW_TYPE,
1400
- pdn->m64_map[j][i], 0);
1401
- }
1402
-
1403
- rc = opal_pci_set_phb_mem_window(phb->opal_id,
1404
- OPAL_M64_WINDOW_TYPE,
1405
- pdn->m64_map[j][i],
1406
- start,
1407
- 0, /* unused */
1408
- size);
1409
-
1410
-
1411
- if (rc != OPAL_SUCCESS) {
1412
- dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
1413
- win, rc);
1414
- goto m64_failed;
1415
- }
1416
-
1417
- if (pdn->m64_single_mode)
1418
- rc = opal_pci_phb_mmio_enable(phb->opal_id,
1419
- OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
1420
- else
1421
- rc = opal_pci_phb_mmio_enable(phb->opal_id,
1422
- OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
1423
-
1424
- if (rc != OPAL_SUCCESS) {
1425
- dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
1426
- win, rc);
1427
- goto m64_failed;
1428
- }
1429
- }
1199
+ list_for_each_entry(pe, &phb->ioda.pe_list, list)
1200
+ pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
14301201 }
1431
- return 0;
14321202
1433
-m64_failed:
1434
- pnv_pci_vf_release_m64(pdev, num_vfs);
1435
- return -EBUSY;
1203
+#ifdef CONFIG_IOMMU_API
1204
+ /* setup iommu groups so we can do nvlink pass-thru */
1205
+ pnv_pci_npu_setup_iommu_groups();
1206
+#endif
14361207 }
14371208
1438
-static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
1439
- int num);
1440
-
1441
-static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
1442
-{
1443
- struct iommu_table *tbl;
1444
- int64_t rc;
1445
-
1446
- tbl = pe->table_group.tables[0];
1447
- rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
1448
- if (rc)
1449
- pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
1450
-
1451
- pnv_pci_ioda2_set_bypass(pe, false);
1452
- if (pe->table_group.group) {
1453
- iommu_group_put(pe->table_group.group);
1454
- BUG_ON(pe->table_group.group);
1455
- }
1456
- iommu_tce_table_put(tbl);
1457
-}
1458
-
1459
-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
1460
-{
1461
- struct pci_bus *bus;
1462
- struct pci_controller *hose;
1463
- struct pnv_phb *phb;
1464
- struct pnv_ioda_pe *pe, *pe_n;
1465
- struct pci_dn *pdn;
1466
-
1467
- bus = pdev->bus;
1468
- hose = pci_bus_to_host(bus);
1469
- phb = hose->private_data;
1470
- pdn = pci_get_pdn(pdev);
1471
-
1472
- if (!pdev->is_physfn)
1473
- return;
1474
-
1475
- list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
1476
- if (pe->parent_dev != pdev)
1477
- continue;
1478
-
1479
- pnv_pci_ioda2_release_dma_pe(pdev, pe);
1480
-
1481
- /* Remove from list */
1482
- mutex_lock(&phb->ioda.pe_list_mutex);
1483
- list_del(&pe->list);
1484
- mutex_unlock(&phb->ioda.pe_list_mutex);
1485
-
1486
- pnv_ioda_deconfigure_pe(phb, pe);
1487
-
1488
- pnv_ioda_free_pe(pe);
1489
- }
1490
-}
1491
-
1492
-void pnv_pci_sriov_disable(struct pci_dev *pdev)
1493
-{
1494
- struct pci_bus *bus;
1495
- struct pci_controller *hose;
1496
- struct pnv_phb *phb;
1497
- struct pnv_ioda_pe *pe;
1498
- struct pci_dn *pdn;
1499
- u16 num_vfs, i;
1500
-
1501
- bus = pdev->bus;
1502
- hose = pci_bus_to_host(bus);
1503
- phb = hose->private_data;
1504
- pdn = pci_get_pdn(pdev);
1505
- num_vfs = pdn->num_vfs;
1506
-
1507
- /* Release VF PEs */
1508
- pnv_ioda_release_vf_PE(pdev);
1509
-
1510
- if (phb->type == PNV_PHB_IODA2) {
1511
- if (!pdn->m64_single_mode)
1512
- pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
1513
-
1514
- /* Release M64 windows */
1515
- pnv_pci_vf_release_m64(pdev, num_vfs);
1516
-
1517
- /* Release PE numbers */
1518
- if (pdn->m64_single_mode) {
1519
- for (i = 0; i < num_vfs; i++) {
1520
- if (pdn->pe_num_map[i] == IODA_INVALID_PE)
1521
- continue;
1522
-
1523
- pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
1524
- pnv_ioda_free_pe(pe);
1525
- }
1526
- } else
1527
- bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1528
- /* Releasing pe_num_map */
1529
- kfree(pdn->pe_num_map);
1530
- }
1531
-}
1532
-
1533
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
1209
+static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
15341210 struct pnv_ioda_pe *pe);
1535
-static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
1211
+
1212
+static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
15361213 {
1537
- struct pci_bus *bus;
1538
- struct pci_controller *hose;
1539
- struct pnv_phb *phb;
1540
- struct pnv_ioda_pe *pe;
1541
- int pe_num;
1542
- u16 vf_index;
1543
- struct pci_dn *pdn;
1544
-
1545
- bus = pdev->bus;
1546
- hose = pci_bus_to_host(bus);
1547
- phb = hose->private_data;
1548
- pdn = pci_get_pdn(pdev);
1549
-
1550
- if (!pdev->is_physfn)
1551
- return;
1552
-
1553
- /* Reserve PE for each VF */
1554
- for (vf_index = 0; vf_index < num_vfs; vf_index++) {
1555
- int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
1556
- int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
1557
- struct pci_dn *vf_pdn;
1558
-
1559
- if (pdn->m64_single_mode)
1560
- pe_num = pdn->pe_num_map[vf_index];
1561
- else
1562
- pe_num = *pdn->pe_num_map + vf_index;
1563
-
1564
- pe = &phb->ioda.pe_array[pe_num];
1565
- pe->pe_number = pe_num;
1566
- pe->phb = phb;
1567
- pe->flags = PNV_IODA_PE_VF;
1568
- pe->pbus = NULL;
1569
- pe->parent_dev = pdev;
1570
- pe->mve_number = -1;
1571
- pe->rid = (vf_bus << 8) | vf_devfn;
1572
-
1573
- pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
1574
- hose->global_number, pdev->bus->number,
1575
- PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
1576
-
1577
- if (pnv_ioda_configure_pe(phb, pe)) {
1578
- /* XXX What do we do here ? */
1579
- pnv_ioda_free_pe(pe);
1580
- pe->pdev = NULL;
1581
- continue;
1582
- }
1583
-
1584
- /* Put PE to the list */
1585
- mutex_lock(&phb->ioda.pe_list_mutex);
1586
- list_add_tail(&pe->list, &phb->ioda.pe_list);
1587
- mutex_unlock(&phb->ioda.pe_list_mutex);
1588
-
1589
- /* associate this pe to it's pdn */
1590
- list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
1591
- if (vf_pdn->busno == vf_bus &&
1592
- vf_pdn->devfn == vf_devfn) {
1593
- vf_pdn->pe_number = pe_num;
1594
- break;
1595
- }
1596
- }
1597
-
1598
- pnv_pci_ioda2_setup_dma_pe(phb, pe);
1599
- }
1600
-}
1601
-
1602
-int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1603
-{
1604
- struct pci_bus *bus;
1605
- struct pci_controller *hose;
1606
- struct pnv_phb *phb;
1607
- struct pnv_ioda_pe *pe;
1608
- struct pci_dn *pdn;
1609
- int ret;
1610
- u16 i;
1611
-
1612
- bus = pdev->bus;
1613
- hose = pci_bus_to_host(bus);
1614
- phb = hose->private_data;
1615
- pdn = pci_get_pdn(pdev);
1616
-
1617
- if (phb->type == PNV_PHB_IODA2) {
1618
- if (!pdn->vfs_expanded) {
1619
- dev_info(&pdev->dev, "don't support this SRIOV device"
1620
- " with non 64bit-prefetchable IOV BAR\n");
1621
- return -ENOSPC;
1622
- }
1623
-
1624
- /*
1625
- * When M64 BARs functions in Single PE mode, the number of VFs
1626
- * could be enabled must be less than the number of M64 BARs.
1627
- */
1628
- if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
1629
- dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
1630
- return -EBUSY;
1631
- }
1632
-
1633
- /* Allocating pe_num_map */
1634
- if (pdn->m64_single_mode)
1635
- pdn->pe_num_map = kmalloc_array(num_vfs,
1636
- sizeof(*pdn->pe_num_map),
1637
- GFP_KERNEL);
1638
- else
1639
- pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
1640
-
1641
- if (!pdn->pe_num_map)
1642
- return -ENOMEM;
1643
-
1644
- if (pdn->m64_single_mode)
1645
- for (i = 0; i < num_vfs; i++)
1646
- pdn->pe_num_map[i] = IODA_INVALID_PE;
1647
-
1648
- /* Calculate available PE for required VFs */
1649
- if (pdn->m64_single_mode) {
1650
- for (i = 0; i < num_vfs; i++) {
1651
- pe = pnv_ioda_alloc_pe(phb);
1652
- if (!pe) {
1653
- ret = -EBUSY;
1654
- goto m64_failed;
1655
- }
1656
-
1657
- pdn->pe_num_map[i] = pe->pe_number;
1658
- }
1659
- } else {
1660
- mutex_lock(&phb->ioda.pe_alloc_mutex);
1661
- *pdn->pe_num_map = bitmap_find_next_zero_area(
1662
- phb->ioda.pe_alloc, phb->ioda.total_pe_num,
1663
- 0, num_vfs, 0);
1664
- if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
1665
- mutex_unlock(&phb->ioda.pe_alloc_mutex);
1666
- dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
1667
- kfree(pdn->pe_num_map);
1668
- return -EBUSY;
1669
- }
1670
- bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1671
- mutex_unlock(&phb->ioda.pe_alloc_mutex);
1672
- }
1673
- pdn->num_vfs = num_vfs;
1674
-
1675
- /* Assign M64 window accordingly */
1676
- ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
1677
- if (ret) {
1678
- dev_info(&pdev->dev, "Not enough M64 window resources\n");
1679
- goto m64_failed;
1680
- }
1681
-
1682
- /*
1683
- * When using one M64 BAR to map one IOV BAR, we need to shift
1684
- * the IOV BAR according to the PE# allocated to the VFs.
1685
- * Otherwise, the PE# for the VF will conflict with others.
1686
- */
1687
- if (!pdn->m64_single_mode) {
1688
- ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
1689
- if (ret)
1690
- goto m64_failed;
1691
- }
1692
- }
1693
-
1694
- /* Setup VF PEs */
1695
- pnv_ioda_setup_vf_PE(pdev, num_vfs);
1696
-
1697
- return 0;
1698
-
1699
-m64_failed:
1700
- if (pdn->m64_single_mode) {
1701
- for (i = 0; i < num_vfs; i++) {
1702
- if (pdn->pe_num_map[i] == IODA_INVALID_PE)
1703
- continue;
1704
-
1705
- pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
1706
- pnv_ioda_free_pe(pe);
1707
- }
1708
- } else
1709
- bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1710
-
1711
- /* Releasing pe_num_map */
1712
- kfree(pdn->pe_num_map);
1713
-
1714
- return ret;
1715
-}
1716
-
1717
-int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
1718
-{
1719
- pnv_pci_sriov_disable(pdev);
1720
-
1721
- /* Release PCI data */
1722
- remove_dev_pci_data(pdev);
1723
- return 0;
1724
-}
1725
-
1726
-int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1727
-{
1728
- /* Allocate PCI data */
1729
- add_dev_pci_data(pdev);
1730
-
1731
- return pnv_pci_sriov_enable(pdev, num_vfs);
1732
-}
1733
-#endif /* CONFIG_PCI_IOV */
1734
-
1735
-static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
1736
-{
1214
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
17371215 struct pci_dn *pdn = pci_get_pdn(pdev);
17381216 struct pnv_ioda_pe *pe;
17391217
1740
- /*
1741
- * The function can be called while the PE#
1742
- * hasn't been assigned. Do nothing for the
1743
- * case.
1744
- */
1745
- if (!pdn || pdn->pe_number == IODA_INVALID_PE)
1746
- return;
1218
+ /* Check if the BDFN for this device is associated with a PE yet */
1219
+ pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
1220
+ if (!pe) {
1221
+ /* VF PEs should be pre-configured in pnv_pci_sriov_enable() */
1222
+ if (WARN_ON(pdev->is_virtfn))
1223
+ return;
17471224
1748
- pe = &phb->ioda.pe_array[pdn->pe_number];
1749
- WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
1750
- set_dma_offset(&pdev->dev, pe->tce_bypass_base);
1751
- set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
1752
- /*
1753
- * Note: iommu_add_device() will fail here as
1754
- * for physical PE: the device is already added by now;
1755
- * for virtual PE: sysfs entries are not ready yet and
1756
- * tce_iommu_bus_notifier will add the device to a group later.
1757
- */
1758
-}
1225
+ pnv_pci_configure_bus(pdev->bus);
1226
+ pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
1227
+ pci_info(pdev, "Configured PE#%x\n", pe ? pe->pe_number : 0xfffff);
17591228
1760
-static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
1761
-{
1762
- unsigned short vendor = 0;
1763
- struct pci_dev *pdev;
17641229
1765
- if (pe->device_count == 1)
1766
- return true;
1767
-
1768
- /* pe->pdev should be set if it's a single device, pe->pbus if not */
1769
- if (!pe->pbus)
1770
- return true;
1771
-
1772
- list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
1773
- if (!vendor) {
1774
- vendor = pdev->vendor;
1775
- continue;
1776
- }
1777
-
1778
- if (pdev->vendor != vendor)
1779
- return false;
1230
+ /*
1231
+ * If we can't setup the IODA PE something has gone horribly
1232
+ * wrong and we can't enable DMA for the device.
1233
+ */
1234
+ if (WARN_ON(!pe))
1235
+ return;
1236
+ } else {
1237
+ pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number);
17801238 }
17811239
1782
- return true;
1240
+ /*
1241
+ * We assume that bridges *probably* don't need to do any DMA so we can
1242
+ * skip allocating a TCE table, etc unless we get a non-bridge device.
1243
+ */
1244
+ if (!pe->dma_setup_done && !pci_is_bridge(pdev)) {
1245
+ switch (phb->type) {
1246
+ case PNV_PHB_IODA1:
1247
+ pnv_pci_ioda1_setup_dma_pe(phb, pe);
1248
+ break;
1249
+ case PNV_PHB_IODA2:
1250
+ pnv_pci_ioda2_setup_dma_pe(phb, pe);
1251
+ break;
1252
+ default:
1253
+ pr_warn("%s: No DMA for PHB#%x (type %d)\n",
1254
+ __func__, phb->hose->global_number, phb->type);
1255
+ }
1256
+ }
1257
+
1258
+ if (pdn)
1259
+ pdn->pe_number = pe->pe_number;
1260
+ pe->device_count++;
1261
+
1262
+ WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
1263
+ pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
1264
+ set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
1265
+
1266
+ /* PEs with a DMA weight of zero won't have a group */
1267
+ if (pe->table_group.group)
1268
+ iommu_add_device(&pe->table_group, &pdev->dev);
17831269 }
17841270
17851271 /*
@@ -1851,106 +1337,44 @@
	return -EIO;
 }
 
-static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
+		u64 dma_mask)
 {
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
+	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
	struct pci_dn *pdn = pci_get_pdn(pdev);
	struct pnv_ioda_pe *pe;
-	uint64_t top;
-	bool bypass = false;
-	s64 rc;
 
	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-		return -ENODEV;
+		return false;
 
	pe = &phb->ioda.pe_array[pdn->pe_number];
	if (pe->tce_bypass_enabled) {
-		top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
-		bypass = (dma_mask >= top);
+		u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+		if (dma_mask >= top)
+			return true;
	}
 
-	if (bypass) {
-		dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
-		set_dma_ops(&pdev->dev, &dma_nommu_ops);
-	} else {
-		/*
-		 * If the device can't set the TCE bypass bit but still wants
-		 * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
-		 * bypass the 32-bit region and be usable for 64-bit DMAs.
-		 * The device needs to be able to address all of this space.
-		 */
-		if (dma_mask >> 32 &&
-		    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
-		    pnv_pci_ioda_pe_single_vendor(pe) &&
-		    phb->model == PNV_PHB_MODEL_PHB3) {
-			/* Configure the bypass mode */
-			rc = pnv_pci_ioda_dma_64bit_bypass(pe);
-			if (rc)
-				return rc;
-			/* 4GB offset bypasses 32-bit space */
-			set_dma_offset(&pdev->dev, (1ULL << 32));
-			set_dma_ops(&pdev->dev, &dma_nommu_ops);
-		} else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
-			/*
-			 * Fail the request if a DMA mask between 32 and 64 bits
-			 * was requested but couldn't be fulfilled. Ideally we
-			 * would do this for 64-bits but historically we have
-			 * always fallen back to 32-bits.
-			 */
-			return -ENOMEM;
-		} else {
-			dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
-			set_dma_ops(&pdev->dev, &dma_iommu_ops);
-		}
+	/*
	 * If the device can't set the TCE bypass bit but still wants
	 * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
	 * bypass the 32-bit region and be usable for 64-bit DMAs.
	 * The device needs to be able to address all of this space.
	 */
+	if (dma_mask >> 32 &&
+	    dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
+	    /* pe->pdev should be set if it's a single device, pe->pbus if not */
+	    (pe->device_count == 1 || !pe->pbus) &&
+	    phb->model == PNV_PHB_MODEL_PHB3) {
+		/* Configure the bypass mode */
+		s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
+		if (rc)
+			return false;
+		/* 4GB offset bypasses 32-bit space */
+		pdev->dev.archdata.dma_offset = (1ULL << 32);
+		return true;
	}
-	*pdev->dev.dma_mask = dma_mask;
 
-	/* Update peer npu devices */
-	pnv_npu_try_dma_set_bypass(pdev, bypass);
-
-	return 0;
-}
-
-static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev)
-{
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
-	struct pci_dn *pdn = pci_get_pdn(pdev);
-	struct pnv_ioda_pe *pe;
-	u64 end, mask;
-
-	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-		return 0;
-
-	pe = &phb->ioda.pe_array[pdn->pe_number];
-	if (!pe->tce_bypass_enabled)
-		return __dma_get_required_mask(&pdev->dev);
-
-
-	end = pe->tce_bypass_base + memblock_end_of_DRAM();
-	mask = 1ULL << (fls64(end) - 1);
-	mask += mask - 1;
-
-	return mask;
-}
-
-static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
-				   struct pci_bus *bus,
-				   bool add_to_group)
-{
-	struct pci_dev *dev;
-
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
-		set_dma_offset(&dev->dev, pe->tce_bypass_base);
-		if (add_to_group)
-			iommu_add_device(&dev->dev);
-
-		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
-			pnv_ioda_setup_bus_dma(pe, dev->subordinate,
-					add_to_group);
-	}
+	return false;
 }
 
 static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
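pnv_pci_ioda_iommu_bypass_supported() above reduces the old dma_set_mask hook to a yes/no answer: direct (untranslated) DMA is possible only when the device's mask covers the bypass window base plus the top of RAM. A standalone sketch of that comparison; the 1ULL << 59 base is an assumption modeled on IODA2's bypass window, and the memory size is invented:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool bypass_supported(uint64_t dma_mask, uint64_t tce_bypass_base,
			     uint64_t end_of_dram)
{
	/* mirrors: top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1 */
	uint64_t top = tce_bypass_base + end_of_dram - 1;

	return dma_mask >= top;
}

int main(void)
{
	uint64_t base = 1ULL << 59;	/* assumed IODA2 bypass window base */
	uint64_t dram = 64ULL << 30;	/* pretend 64 GiB of RAM */

	printf("64-bit mask -> %d\n",
	       bypass_supported(UINT64_MAX, base, dram));	/* 1 */
	printf("48-bit mask -> %d\n",
	       bypass_supported((1ULL << 48) - 1, base, dram));	/* 0 */
	return 0;
}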
@@ -2012,26 +1436,12 @@
 }
 
 #ifdef CONFIG_IOMMU_API
-static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
+/* Common for IODA1 and IODA2 */
+static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction,
+		bool realmode)
 {
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
-	if (!ret)
-		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
-
-	return ret;
-}
-
-static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
-{
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
-	if (!ret)
-		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
-
-	return ret;
+	return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
 }
 #endif
 
@@ -2046,8 +1456,8 @@
 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
	.set = pnv_ioda1_tce_build,
 #ifdef CONFIG_IOMMU_API
-	.exchange = pnv_ioda1_tce_xchg,
-	.exchange_rm = pnv_ioda1_tce_xchg_rm,
+	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+	.tce_kill = pnv_pci_p7ioc_tce_invalidate,
	.useraddrptr = pnv_tce_useraddrptr,
 #endif
	.clear = pnv_ioda1_tce_free,
@@ -2176,30 +1586,6 @@
	return ret;
 }
 
-#ifdef CONFIG_IOMMU_API
-static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
-{
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
-	if (!ret)
-		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
-
-	return ret;
-}
-
-static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
-		unsigned long *hpa, enum dma_data_direction *direction)
-{
-	long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
-	if (!ret)
-		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
-
-	return ret;
-}
-#endif
-
 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
		long npages)
 {
@@ -2211,8 +1597,8 @@
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
	.set = pnv_ioda2_tce_build,
 #ifdef CONFIG_IOMMU_API
-	.exchange = pnv_ioda2_tce_xchg,
-	.exchange_rm = pnv_ioda2_tce_xchg_rm,
+	.xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+	.tce_kill = pnv_pci_ioda2_tce_invalidate,
	.useraddrptr = pnv_tce_useraddrptr,
 #endif
	.clear = pnv_ioda2_tce_free,
@@ -2358,8 +1744,8 @@
					      __pa(addr) + tce32_segsz * i,
					      tce32_segsz, IOMMU_PAGE_SIZE_4K);
		if (rc) {
-			pe_err(pe, " Failed to configure 32-bit TCE table,"
-			       " err %ld\n", rc);
+			pe_err(pe, " Failed to configure 32-bit TCE table, err %lld\n",
+			       rc);
			goto fail;
		}
	}
@@ -2376,19 +1762,9 @@
	tbl->it_ops = &pnv_ioda1_iommu_ops;
	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
-	iommu_init_table(tbl, phb->hose->node);
+	iommu_init_table(tbl, phb->hose->node, 0, 0);
 
-	if (pe->flags & PNV_IODA_PE_DEV) {
-		/*
-		 * Setting table base here only for carrying iommu_group
-		 * further down to let iommu_add_device() do the job.
-		 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
-		 */
-		set_iommu_table_base(&pe->pdev->dev, tbl);
-		iommu_add_device(&pe->pdev->dev);
-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
-		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
-
+	pe->dma_setup_done = true;
	return;
 fail:
	/* XXX Failure: Try to fallback to 64-bit only ? */
@@ -2412,9 +1788,9 @@
	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
 
-	pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num,
-			start_addr, start_addr + win_size - 1,
-			IOMMU_PAGE_SIZE(tbl));
+	pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n",
+		num, start_addr, start_addr + win_size - 1,
+		IOMMU_PAGE_SIZE(tbl));
 
	/*
	 * Map TCE table through TVT. The TVE index is the PE number
@@ -2428,7 +1804,7 @@
			size << 3,
			IOMMU_PAGE_SIZE(tbl));
	if (rc) {
-		pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
+		pe_err(pe, "Failed to configure TCE table, err %lld\n", rc);
		return rc;
	}
 
@@ -2439,7 +1815,7 @@
	return 0;
 }
 
-void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
	uint16_t window_id = (pe->pe_number << 1 ) + 1;
	int64_t rc;
@@ -2501,6 +1877,7 @@
 {
	struct iommu_table *tbl = NULL;
	long rc;
+	unsigned long res_start, res_end;
 
	/*
	 * crashkernel= specifies the kdump kernel's maximum memory at
@@ -2514,19 +1891,46 @@
	 * DMA window can be larger than available memory, which will
	 * cause errors later.
	 */
-	const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
+	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
 
-	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
-			IOMMU_PAGE_SHIFT_4K,
-			window_size,
-			POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
+	/*
	 * We create the default window as big as we can. The constraint is
	 * the max order of allocation possible. The TCE table is likely to
	 * end up being multilevel and with on-demand allocation in place,
	 * the initial use is not going to be huge as the default window aims
	 * to support crippled devices (i.e. not fully 64bit DMAble) only.
	 */
+	/* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
+	const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
+	/* Each TCE level cannot exceed maxblock so go multilevel if needed */
+	unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
+	unsigned long tcelevel_order = ilog2(maxblock >> 3);
+	unsigned int levels = tces_order / tcelevel_order;
+
+	if (tces_order % tcelevel_order)
+		levels += 1;
+	/*
	 * We try to stick to default levels (which is >1 at the moment) in
	 * order to save memory by relying on on-demain TCE level allocation.
	 */
+	levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
+
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
+			window_size, levels, false, &tbl);
	if (rc) {
		pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
				rc);
		return rc;
	}
 
-	iommu_init_table(tbl, pe->phb->hose->node);
+	/* We use top part of 32bit space for MMIO so exclude it from DMA */
+	res_start = 0;
+	res_end = 0;
+	if (window_size > pe->phb->ioda.m32_pci_base) {
+		res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
+		res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
+	}
+	iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
 
	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
	if (rc) {
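The new default-window sizing above caps the window by the largest physically contiguous allocation (maxblock) and goes multilevel once a single TCE level would outgrow that block. The same arithmetic as a standalone program; PAGE_SHIFT, MAX_ORDER and the memory size are assumed example values, and the kernel additionally clamps levels up to POWERNV_IOMMU_DEFAULT_LEVELS:

#include <stdio.h>

#define PAGE_SHIFT	16	/* 64K pages, typical for ppc64 */
#define MAX_ORDER	11	/* assumed buddy allocator limit */

static unsigned long ilog2ul(unsigned long long v)
{
	unsigned long r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long long maxblock = 1ULL << (PAGE_SHIFT + MAX_ORDER - 1);
	unsigned long long max_memory = 1ULL << 40;	/* pretend 1 TiB */
	/* iommu_table::it_map spends 1 bit per IOMMU page, hence the 8 */
	unsigned long long window_size = (maxblock * 8) << PAGE_SHIFT;

	if (window_size > max_memory)
		window_size = max_memory;

	/* each TCE is 8 bytes, so one level holds maxblock >> 3 entries */
	unsigned long tces_order = ilog2ul(window_size >> PAGE_SHIFT);
	unsigned long tcelevel_order = ilog2ul(maxblock >> 3);
	unsigned int levels = tces_order / tcelevel_order;

	if (tces_order % tcelevel_order)
		levels += 1;

	printf("window = %llu GiB, levels = %u\n",
	       window_size >> 30, levels);	/* 1024 GiB, 2 levels */
	return 0;
}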
....@@ -2540,17 +1944,16 @@
25401944 pnv_pci_ioda2_set_bypass(pe, true);
25411945
25421946 /*
2543
- * Setting table base here only for carrying iommu_group
2544
- * further down to let iommu_add_device() do the job.
2545
- * pnv_pci_ioda_dma_dev_setup will override it later anyway.
1947
+ * Set table base for the case of IOMMU DMA use. Usually this is done
1948
+ * from dma_dev_setup() which is not called when a device is returned
1949
+ * from VFIO so do it here.
25461950 */
2547
- if (pe->flags & PNV_IODA_PE_DEV)
1951
+ if (pe->pdev)
25481952 set_iommu_table_base(&pe->pdev->dev, tbl);
25491953
25501954 return 0;
25511955 }
25521956
2553
-#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
25541957 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
25551958 int num)
25561959 {
....@@ -2574,10 +1977,9 @@
25741977
25751978 return ret;
25761979 }
2577
-#endif
25781980
25791981 #ifdef CONFIG_IOMMU_API
2580
-static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
1982
+unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
25811983 __u64 window_size, __u32 levels)
25821984 {
25831985 unsigned long bytes = 0;
....@@ -2598,7 +2000,7 @@
25982000 direct_table_size = 1UL << table_shift;
25992001
26002002 for ( ; levels; --levels) {
2601
- bytes += _ALIGN_UP(tce_table_size, direct_table_size);
2003
+ bytes += ALIGN(tce_table_size, direct_table_size);
26022004
26032005 tce_table_size /= direct_table_size;
26042006 tce_table_size <<= 3;
....@@ -2623,6 +2025,19 @@
26232025 return ret;
26242026 }
26252027
2028
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
2029
+{
2030
+ struct pci_dev *dev;
2031
+
2032
+ list_for_each_entry(dev, &bus->devices, bus_list) {
2033
+ set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
2034
+ dev->dev.archdata.dma_offset = pe->tce_bypass_base;
2035
+
2036
+ if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
2037
+ pnv_ioda_setup_bus_dma(pe, dev->subordinate);
2038
+ }
2039
+}
2040
+
26262041 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
26272042 {
26282043 struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
....@@ -2633,7 +2048,9 @@
26332048 pnv_pci_ioda2_set_bypass(pe, false);
26342049 pnv_pci_ioda2_unset_window(&pe->table_group, 0);
26352050 if (pe->pbus)
2636
- pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
2051
+ pnv_ioda_setup_bus_dma(pe, pe->pbus);
2052
+ else if (pe->pdev)
2053
+ set_iommu_table_base(&pe->pdev->dev, NULL);
26372054 iommu_tce_table_put(tbl);
26382055 }
26392056
....@@ -2644,7 +2061,7 @@
26442061
26452062 pnv_pci_ioda2_setup_default_config(pe);
26462063 if (pe->pbus)
2647
- pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
2064
+ pnv_ioda_setup_bus_dma(pe, pe->pbus);
26482065 }
26492066
26502067 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
....@@ -2655,180 +2072,15 @@
26552072 .take_ownership = pnv_ioda2_take_ownership,
26562073 .release_ownership = pnv_ioda2_release_ownership,
26572074 };
2658
-
2659
-static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
2660
-{
2661
- struct pci_controller *hose;
2662
- struct pnv_phb *phb;
2663
- struct pnv_ioda_pe **ptmppe = opaque;
2664
- struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
2665
- struct pci_dn *pdn = pci_get_pdn(pdev);
2666
-
2667
- if (!pdn || pdn->pe_number == IODA_INVALID_PE)
2668
- return 0;
2669
-
2670
- hose = pci_bus_to_host(pdev->bus);
2671
- phb = hose->private_data;
2672
- if (phb->type != PNV_PHB_NPU_NVLINK)
2673
- return 0;
2674
-
2675
- *ptmppe = &phb->ioda.pe_array[pdn->pe_number];
2676
-
2677
- return 1;
2678
-}
2679
-
2680
-/*
2681
- * This returns PE of associated NPU.
2682
- * This assumes that NPU is in the same IOMMU group with GPU and there is
2683
- * no other PEs.
2684
- */
2685
-static struct pnv_ioda_pe *gpe_table_group_to_npe(
2686
- struct iommu_table_group *table_group)
2687
-{
2688
- struct pnv_ioda_pe *npe = NULL;
2689
- int ret = iommu_group_for_each_dev(table_group->group, &npe,
2690
- gpe_table_group_to_npe_cb);
2691
-
2692
- BUG_ON(!ret || !npe);
2693
-
2694
- return npe;
2695
-}
2696
-
2697
-static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
2698
- int num, struct iommu_table *tbl)
2699
-{
2700
- struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
2701
- int num2 = (num == 0) ? 1 : 0;
2702
- long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
2703
-
2704
- if (ret)
2705
- return ret;
2706
-
2707
- if (table_group->tables[num2])
2708
- pnv_npu_unset_window(npe, num2);
2709
-
2710
- ret = pnv_npu_set_window(npe, num, tbl);
2711
- if (ret) {
2712
- pnv_pci_ioda2_unset_window(table_group, num);
2713
- if (table_group->tables[num2])
2714
- pnv_npu_set_window(npe, num2,
2715
- table_group->tables[num2]);
2716
- }
2717
-
2718
- return ret;
2719
-}
2720
-
2721
-static long pnv_pci_ioda2_npu_unset_window(
2722
- struct iommu_table_group *table_group,
2723
- int num)
2724
-{
2725
- struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
2726
- int num2 = (num == 0) ? 1 : 0;
2727
- long ret = pnv_pci_ioda2_unset_window(table_group, num);
2728
-
2729
- if (ret)
2730
- return ret;
2731
-
2732
- if (!npe->table_group.tables[num])
2733
- return 0;
2734
-
2735
- ret = pnv_npu_unset_window(npe, num);
2736
- if (ret)
2737
- return ret;
2738
-
2739
- if (table_group->tables[num2])
2740
- ret = pnv_npu_set_window(npe, num2, table_group->tables[num2]);
2741
-
2742
- return ret;
2743
-}
2744
-
2745
-static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
2746
-{
2747
- /*
2748
- * Detach NPU first as pnv_ioda2_take_ownership() will destroy
2749
- * the iommu_table if 32bit DMA is enabled.
2750
- */
2751
- pnv_npu_take_ownership(gpe_table_group_to_npe(table_group));
2752
- pnv_ioda2_take_ownership(table_group);
2753
-}
2754
-
2755
-static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
2756
- .get_table_size = pnv_pci_ioda2_get_table_size,
2757
- .create_table = pnv_pci_ioda2_create_table_userspace,
2758
- .set_window = pnv_pci_ioda2_npu_set_window,
2759
- .unset_window = pnv_pci_ioda2_npu_unset_window,
2760
- .take_ownership = pnv_ioda2_npu_take_ownership,
2761
- .release_ownership = pnv_ioda2_release_ownership,
2762
-};
2763
-
2764
-static void pnv_pci_ioda_setup_iommu_api(void)
2765
-{
2766
- struct pci_controller *hose, *tmp;
2767
- struct pnv_phb *phb;
2768
- struct pnv_ioda_pe *pe, *gpe;
2769
-
2770
- /*
2771
- * Now that all PHBs are discovered, it is time to add NPU devices to
2772
- * the corresponding IOMMU groups.
2773
- */
2774
- list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
2775
- phb = hose->private_data;
2776
-
2777
- if (phb->type != PNV_PHB_NPU_NVLINK)
2778
- continue;
2779
-
2780
- list_for_each_entry(pe, &phb->ioda.pe_list, list) {
2781
- gpe = pnv_pci_npu_setup_iommu(pe);
2782
- if (gpe)
2783
- gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
2784
- }
2785
- }
2786
-}
2787
-#else /* !CONFIG_IOMMU_API */
2788
-static void pnv_pci_ioda_setup_iommu_api(void) { };
27892075 #endif
27902076
2791
-static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
2792
-{
2793
- struct pci_controller *hose = phb->hose;
2794
- struct device_node *dn = hose->dn;
2795
- unsigned long mask = 0;
2796
- int i, rc, count;
2797
- u32 val;
2798
-
2799
- count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
2800
- if (count <= 0) {
2801
- mask = SZ_4K | SZ_64K;
2802
- /* Add 16M and 256M for POWER8 by default */
2803
- if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
2804
- !cpu_has_feature(CPU_FTR_ARCH_300))
2805
- mask |= SZ_16M | SZ_256M;
2806
- return mask;
2807
- }
2808
-
2809
- for (i = 0; i < count; i++) {
2810
- rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
2811
- i, &val);
2812
- if (rc == 0)
2813
- mask |= 1ULL << val;
2814
- }
2815
-
2816
- return mask;
2817
-}
2818
-
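A sketch of the mask construction in the removed pnv_ioda_parse_tce_sizes() above: each u32 in "ibm,supported-tce-sizes" is a page shift and becomes one bit of the supported-page-size mask. The property contents used here are assumed example values:

#include <stdio.h>

int main(void)
{
	/* assumed example property contents: 4K, 64K, 16M, 256M pages */
	unsigned int shifts[] = { 12, 16, 24, 28 };
	unsigned long mask = 0;
	unsigned int i;

	for (i = 0; i < sizeof(shifts) / sizeof(shifts[0]); i++)
		mask |= 1UL << shifts[i];	/* one bit per supported shift */

	printf("pgsizes mask: 0x%lx\n", mask);	/* 0x11011000 */
	return 0;
}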
2819
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
2820
- struct pnv_ioda_pe *pe)
2077
+void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
2078
+ struct pnv_ioda_pe *pe)
28212079 {
28222080 int64_t rc;
28232081
2824
- if (!pnv_pci_ioda_pe_dma_weight(pe))
2825
- return;
2826
-
28272082 /* TVE #1 is selected by PCI address bit 59 */
28282083 pe->tce_bypass_base = 1ull << 59;
2829
-
2830
- iommu_register_group(&pe->table_group, phb->hose->global_number,
2831
- pe->pe_number);
28322084
28332085 /* The PE will reserve all possible 32-bits space */
28342086 pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
....@@ -2841,19 +2093,19 @@
28412093 IOMMU_TABLE_GROUP_MAX_TABLES;
28422094 pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
28432095 pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
2844
-#ifdef CONFIG_IOMMU_API
2845
- pe->table_group.ops = &pnv_pci_ioda2_ops;
2846
-#endif
28472096
28482097 rc = pnv_pci_ioda2_setup_default_config(pe);
28492098 if (rc)
28502099 return;
28512100
2852
- if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2853
- pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
2101
+#ifdef CONFIG_IOMMU_API
2102
+ pe->table_group.ops = &pnv_pci_ioda2_ops;
2103
+ iommu_register_group(&pe->table_group, phb->hose->global_number,
2104
+ pe->pe_number);
2105
+#endif
2106
+ pe->dma_setup_done = true;
28542107 }
28552108
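The bypass base set above places the TVE #1 select in PCI address bit 59. A small sketch (with a made-up physical address) of how a device-visible DMA address is formed once archdata.dma_offset carries tce_bypass_base:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t tce_bypass_base = 1ull << 59;	/* TVE #1 select bit */
	uint64_t phys = 0x200000000ull;			/* assumed example address */
	uint64_t dma_addr = tce_bypass_base + phys;	/* what dma_offset adds */

	printf("bus address 0x%016llx, bit 59 %s\n",
	       (unsigned long long)dma_addr,
	       (dma_addr >> 59) & 1 ? "set (routed via TVE #1 bypass)" : "clear");
	return 0;
}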
2856
-#ifdef CONFIG_PCI_MSI
28572109 int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
28582110 {
28592111 struct pnv_phb *phb = container_of(chip, struct pnv_phb,
....@@ -2999,121 +2251,6 @@
29992251 pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
30002252 count, phb->msi_base);
30012253 }
3002
-#else
3003
-static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
3004
-#endif /* CONFIG_PCI_MSI */
3005
-
3006
-#ifdef CONFIG_PCI_IOV
3007
-static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
3008
-{
3009
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
3010
- struct pnv_phb *phb = hose->private_data;
3011
- const resource_size_t gate = phb->ioda.m64_segsize >> 2;
3012
- struct resource *res;
3013
- int i;
3014
- resource_size_t size, total_vf_bar_sz;
3015
- struct pci_dn *pdn;
3016
- int mul, total_vfs;
3017
-
3018
- pdn = pci_get_pdn(pdev);
3019
- pdn->vfs_expanded = 0;
3020
- pdn->m64_single_mode = false;
3021
-
3022
- total_vfs = pci_sriov_get_totalvfs(pdev);
3023
- mul = phb->ioda.total_pe_num;
3024
- total_vf_bar_sz = 0;
3025
-
3026
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
3027
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
3028
- if (!res->flags || res->parent)
3029
- continue;
3030
- if (!pnv_pci_is_m64_flags(res->flags)) {
3031
- dev_warn(&pdev->dev, "Don't support SR-IOV with"
3032
- " non M64 VF BAR%d: %pR. \n",
3033
- i, res);
3034
- goto truncate_iov;
3035
- }
3036
-
3037
- total_vf_bar_sz += pci_iov_resource_size(pdev,
3038
- i + PCI_IOV_RESOURCES);
3039
-
3040
- /*
3041
- * If bigger than a quarter of the M64 segment size, just round up
3042
- * to a power of two.
3043
- *
3044
- * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
3045
- * with other devices, IOV BAR size is expanded to be
3046
- * (total_pe * VF_BAR_size). When VF_BAR_size is half of M64
3047
- * segment size, the expanded size would equal half of the
3048
- * whole M64 space size, which will exhaust the M64 space and
3049
- * limit the system flexibility. This is a design decision to
3050
- * set the boundary to a quarter of the M64 segment size.
3051
- */
3052
- if (total_vf_bar_sz > gate) {
3053
- mul = roundup_pow_of_two(total_vfs);
3054
- dev_info(&pdev->dev,
3055
- "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
3056
- total_vf_bar_sz, gate, mul);
3057
- pdn->m64_single_mode = true;
3058
- break;
3059
- }
3060
- }
3061
-
3062
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
3063
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
3064
- if (!res->flags || res->parent)
3065
- continue;
3066
-
3067
- size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
3068
- /*
3069
- * On PHB3, the minimum size alignment of M64 BAR in single
3070
- * mode is 32MB.
3071
- */
3072
- if (pdn->m64_single_mode && (size < SZ_32M))
3073
- goto truncate_iov;
3074
- dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
3075
- res->end = res->start + size * mul - 1;
3076
- dev_dbg(&pdev->dev, " %pR\n", res);
3077
- dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
3078
- i, res, mul);
3079
- }
3080
- pdn->vfs_expanded = mul;
3081
-
3082
- return;
3083
-
3084
-truncate_iov:
3085
- /* To save MMIO space, IOV BAR is truncated. */
3086
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
3087
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
3088
- res->flags = 0;
3089
- res->end = res->start - 1;
3090
- }
3091
-}
3092
-
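Worked numbers for the quarter-segment gate in the removed code above (the segment and BAR sizes are invented for illustration): once the summed VF BAR space exceeds gate = m64_segsize / 4, the expansion multiplier drops from total_pe_num to roundup_pow_of_two(total_vfs) and single PE mode is flagged:

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned long m64_segsize = 256UL << 20;	/* assumed 256 MB segment */
	unsigned long gate = m64_segsize >> 2;		/* 64 MB */
	unsigned long total_vf_bar_sz = 128UL << 20;	/* assumed VF BAR total */
	int total_vfs = 33, total_pe_num = 256;
	int mul = total_pe_num;

	if (total_vf_bar_sz > gate)
		mul = roundup_pow_of_two(total_vfs);	/* 33 -> 64 */

	printf("gate %lu MB, multiplier %d\n", gate >> 20, mul);
	return 0;
}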
3093
-static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
3094
-{
3095
- if (WARN_ON(pci_dev_is_added(pdev)))
3096
- return;
3097
-
3098
- if (pdev->is_virtfn) {
3099
- struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
3100
-
3101
- /*
3102
- * VF PEs are single-device PEs so their pdev pointer needs to
3103
- * be set. The pdev doesn't exist when the PE is allocated (in
3104
- * pcibios_sriov_enable()) so we fix it up here.
3105
- */
3106
- pe->pdev = pdev;
3107
- WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
3108
- } else if (pdev->is_physfn) {
3109
- /*
3110
- * For PFs, adjust their allocated IOV resources to match what
3111
- * the PHB can support using its M64 BAR table.
3112
- */
3113
- pnv_pci_ioda_fixup_iov_resources(pdev);
3114
- }
3115
-}
3116
-#endif /* CONFIG_PCI_IOV */
31172254
31182255 static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
31192256 struct resource *res)
....@@ -3123,7 +2260,8 @@
31232260 int index;
31242261 int64_t rc;
31252262
3126
- if (!res || !res->flags || res->start > res->end)
2263
+ if (!res || !res->flags || res->start > res->end ||
2264
+ res->flags & IORESOURCE_UNSET)
31272265 return;
31282266
31292267 if (res->flags & IORESOURCE_IO) {
....@@ -3209,18 +2347,8 @@
32092347 #ifdef CONFIG_DEBUG_FS
32102348 static int pnv_pci_diag_data_set(void *data, u64 val)
32112349 {
3212
- struct pci_controller *hose;
3213
- struct pnv_phb *phb;
2350
+ struct pnv_phb *phb = data;
32142351 s64 ret;
3215
-
3216
- if (val != 1ULL)
3217
- return -EINVAL;
3218
-
3219
- hose = (struct pci_controller *)data;
3220
- if (!hose || !hose->private_data)
3221
- return -ENODEV;
3222
-
3223
- phb = hose->private_data;
32242352
32252353 /* Retrieve the diag data from firmware */
32262354 ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
....@@ -3233,8 +2361,35 @@
32332361 return 0;
32342362 }
32352363
3236
-DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL,
3237
- pnv_pci_diag_data_set, "%llu\n");
2364
+DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, pnv_pci_diag_data_set,
2365
+ "%llu\n");
2366
+
2367
+static int pnv_pci_ioda_pe_dump(void *data, u64 val)
2368
+{
2369
+ struct pnv_phb *phb = data;
2370
+ int pe_num;
2371
+
2372
+ for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
2373
+ struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_num];
2374
+
2375
+ if (!test_bit(pe_num, phb->ioda.pe_alloc))
2376
+ continue;
2377
+
2378
+ pe_warn(pe, "rid: %04x dev count: %2d flags: %s%s%s%s%s%s\n",
2379
+ pe->rid, pe->device_count,
2380
+ (pe->flags & PNV_IODA_PE_DEV) ? "dev " : "",
2381
+ (pe->flags & PNV_IODA_PE_BUS) ? "bus " : "",
2382
+ (pe->flags & PNV_IODA_PE_BUS_ALL) ? "all " : "",
2383
+ (pe->flags & PNV_IODA_PE_MASTER) ? "master " : "",
2384
+ (pe->flags & PNV_IODA_PE_SLAVE) ? "slave " : "",
2385
+ (pe->flags & PNV_IODA_PE_VF) ? "vf " : "");
2386
+ }
2387
+
2388
+ return 0;
2389
+}
2390
+
2391
+DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_ioda_pe_dump_fops, NULL,
2392
+ pnv_pci_ioda_pe_dump, "%llu\n");
32382393
32392394 #endif /* CONFIG_DEBUG_FS */
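DEFINE_DEBUGFS_ATTRIBUTE wires a write-only file to the set() callbacks above: the written text is parsed with the "%llu\n" format and handed to the callback together with the private data pointer (here the phb). A userspace mock of that plumbing, with invented struct and function names rather than the kernel API:

#include <stdio.h>
#include <stdlib.h>

struct pnv_phb_mock { int global_number; };

static int pe_dump_set(void *data, unsigned long long val)
{
	struct pnv_phb_mock *phb = data;

	printf("PCI%04x: PE state dump requested (value %llu)\n",
	       phb->global_number, val);
	return 0;
}

int main(void)
{
	struct pnv_phb_mock phb = { .global_number = 0 };
	const char *written = "1\n";	/* models: echo 1 > dump_ioda_pe_state */

	return pe_dump_set(&phb, strtoull(written, NULL, 0));
}

Assuming PHB 0, the userspace trigger would be something like echo 1 > /sys/kernel/debug/powerpc/PCI0000/dump_ioda_pe_state, with the directory name coming from the "PCI%04x" sprintf below.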
32402395
....@@ -3253,14 +2408,11 @@
32532408
32542409 sprintf(name, "PCI%04x", hose->global_number);
32552410 phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
3256
- if (!phb->dbgfs) {
3257
- pr_warn("%s: Error on creating debugfs on PHB#%x\n",
3258
- __func__, hose->global_number);
3259
- continue;
3260
- }
32612411
3262
- debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose,
3263
- &pnv_pci_diag_data_fops);
2412
+ debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs,
2413
+ phb, &pnv_pci_diag_data_fops);
2414
+ debugfs_create_file_unsafe("dump_ioda_pe_state", 0200, phb->dbgfs,
2415
+ phb, &pnv_pci_ioda_pe_dump_fops);
32642416 }
32652417 #endif /* CONFIG_DEBUG_FS */
32662418 }
....@@ -3302,8 +2454,7 @@
33022454
33032455 static void pnv_pci_ioda_fixup(void)
33042456 {
3305
- pnv_pci_ioda_setup_PEs();
3306
- pnv_pci_ioda_setup_iommu_api();
2457
+ pnv_pci_ioda_setup_nvlink();
33072458 pnv_pci_ioda_create_dbgfs();
33082459
33092460 pnv_pci_enable_bridges();
....@@ -3328,10 +2479,9 @@
33282479 static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
33292480 unsigned long type)
33302481 {
3331
- struct pci_dev *bridge;
3332
- struct pci_controller *hose = pci_bus_to_host(bus);
3333
- struct pnv_phb *phb = hose->private_data;
2482
+ struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
33342483 int num_pci_bridges = 0;
2484
+ struct pci_dev *bridge;
33352485
33362486 bridge = bus->self;
33372487 while (bridge) {
....@@ -3415,33 +2565,20 @@
34152565 }
34162566 }
34172567
3418
-static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
2568
+static void pnv_pci_configure_bus(struct pci_bus *bus)
34192569 {
3420
- struct pci_controller *hose = pci_bus_to_host(bus);
3421
- struct pnv_phb *phb = hose->private_data;
34222570 struct pci_dev *bridge = bus->self;
34232571 struct pnv_ioda_pe *pe;
3424
- bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
2572
+ bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
34252573
3426
- /* Extend bridge's windows if necessary */
3427
- pnv_pci_fixup_bridge_resources(bus, type);
3428
-
3429
- /* The PE for root bus should be realized before any one else */
3430
- if (!phb->ioda.root_pe_populated) {
3431
- pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false);
3432
- if (pe) {
3433
- phb->ioda.root_pe_idx = pe->pe_number;
3434
- phb->ioda.root_pe_populated = true;
3435
- }
3436
- }
2574
+ dev_info(&bus->dev, "Configuring PE for bus\n");
34372575
34382576 /* Don't assign a PE to a PCI bus that has no subordinate devices */
3439
- if (list_empty(&bus->devices))
2577
+ if (WARN_ON(list_empty(&bus->devices)))
34402578 return;
34412579
34422580 /* Reserve PEs according to used M64 resources */
3443
- if (phb->reserve_m64_pe)
3444
- phb->reserve_m64_pe(bus, NULL, all);
2581
+ pnv_ioda_reserve_m64_pe(bus, NULL, all);
34452582
34462583 /*
34472584 * Assign PE. We might run here because of partial hotplug.
....@@ -3453,17 +2590,6 @@
34532590 return;
34542591
34552592 pnv_ioda_setup_pe_seg(pe);
3456
- switch (phb->type) {
3457
- case PNV_PHB_IODA1:
3458
- pnv_pci_ioda1_setup_dma_pe(phb, pe);
3459
- break;
3460
- case PNV_PHB_IODA2:
3461
- pnv_pci_ioda2_setup_dma_pe(phb, pe);
3462
- break;
3463
- default:
3464
- pr_warn("%s: No DMA for PHB#%x (type %d)\n",
3465
- __func__, phb->hose->global_number, phb->type);
3466
- }
34672593 }
34682594
34692595 static resource_size_t pnv_pci_default_alignment(void)
....@@ -3471,49 +2597,12 @@
34712597 return PAGE_SIZE;
34722598 }
34732599
3474
-#ifdef CONFIG_PCI_IOV
3475
-static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
3476
- int resno)
3477
-{
3478
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
3479
- struct pnv_phb *phb = hose->private_data;
3480
- struct pci_dn *pdn = pci_get_pdn(pdev);
3481
- resource_size_t align;
3482
-
3483
- /*
3484
- * On the PowerNV platform, the IOV BAR is mapped by an M64 BAR to enable
3485
- * SR-IOV. From the hardware perspective, the range mapped by an M64
3486
- * BAR should be size-aligned.
3487
- *
3488
- * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
3489
- * powernv-specific hardware restriction is gone. But if we just used the
3490
- * VF BAR size as the alignment, the PF BAR / VF BAR may be allocated
3491
- * within one segment of M64 #15, which introduces a PE conflict between
3492
- * PF and VF. Based on this, the minimum alignment of an IOV BAR is
3493
- * m64_segsize.
3494
- *
3495
- * This function returns the total IOV BAR size if M64 BAR is in
3496
- * Shared PE mode or just VF BAR size if not.
3497
- * If the M64 BAR is in Single PE mode, return the VF BAR size or
3498
- * M64 segment size if IOV BAR size is less.
3499
- */
3500
- align = pci_iov_resource_size(pdev, resno);
3501
- if (!pdn->vfs_expanded)
3502
- return align;
3503
- if (pdn->m64_single_mode)
3504
- return max(align, (resource_size_t)phb->ioda.m64_segsize);
3505
-
3506
- return pdn->vfs_expanded * align;
3507
-}
3508
-#endif /* CONFIG_PCI_IOV */
3509
-
35102600 /* Prevent enabling devices for which we couldn't properly
35112601 * assign a PE
35122602 */
35132603 static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
35142604 {
3515
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
3516
- struct pnv_phb *phb = hose->private_data;
2605
+ struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
35172606 struct pci_dn *pdn;
35182607
35192608 /* The function is probably called while the PEs have
....@@ -3528,6 +2617,28 @@
35282617 if (!pdn || pdn->pe_number == IODA_INVALID_PE)
35292618 return false;
35302619
2620
+ return true;
2621
+}
2622
+
2623
+static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev)
2624
+{
2625
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
2626
+ struct pnv_phb *phb = hose->private_data;
2627
+ struct pci_dn *pdn;
2628
+ struct pnv_ioda_pe *pe;
2629
+
2630
+ if (!phb->initialized)
2631
+ return true;
2632
+
2633
+ pdn = pci_get_pdn(dev);
2634
+ if (!pdn)
2635
+ return false;
2636
+
2637
+ if (pdn->pe_number == IODA_INVALID_PE) {
2638
+ pe = pnv_ioda_setup_dev_PE(dev);
2639
+ if (!pe)
2640
+ return false;
2641
+ }
35312642 return true;
35322643 }
35332644
....@@ -3562,11 +2673,10 @@
35622673
35632674 static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
35642675 {
3565
- unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
35662676 struct iommu_table *tbl = pe->table_group.tables[0];
35672677 int64_t rc;
35682678
3569
- if (!weight)
2679
+ if (!pe->dma_setup_done)
35702680 return;
35712681
35722682 rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
....@@ -3583,22 +2693,17 @@
35832693 iommu_tce_table_put(tbl);
35842694 }
35852695
3586
-static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
2696
+void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
35872697 {
35882698 struct iommu_table *tbl = pe->table_group.tables[0];
3589
- unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
3590
-#ifdef CONFIG_IOMMU_API
35912699 int64_t rc;
3592
-#endif
35932700
3594
- if (!weight)
2701
+ if (!pe->dma_setup_done)
35952702 return;
35962703
3597
-#ifdef CONFIG_IOMMU_API
35982704 rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
35992705 if (rc)
3600
- pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
3601
-#endif
2706
+ pe_warn(pe, "OPAL error %lld release DMA window\n", rc);
36022707
36032708 pnv_pci_ioda2_set_bypass(pe, false);
36042709 if (pe->table_group.group) {
....@@ -3621,17 +2726,11 @@
36212726 if (map[idx] != pe->pe_number)
36222727 continue;
36232728
3624
- if (win == OPAL_M64_WINDOW_TYPE)
3625
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3626
- phb->ioda.reserved_pe_idx, win,
3627
- idx / PNV_IODA1_M64_SEGS,
3628
- idx % PNV_IODA1_M64_SEGS);
3629
- else
3630
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3631
- phb->ioda.reserved_pe_idx, win, 0, idx);
2729
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
2730
+ phb->ioda.reserved_pe_idx, win, 0, idx);
36322731
36332732 if (rc != OPAL_SUCCESS)
3634
- pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
2733
+ pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n",
36352734 rc, win, idx);
36362735
36372736 map[idx] = IODA_INVALID_PE;
....@@ -3647,8 +2746,7 @@
36472746 phb->ioda.io_segmap);
36482747 pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
36492748 phb->ioda.m32_segmap);
3650
- pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE,
3651
- phb->ioda.m64_segmap);
2749
+ /* M64 is pre-configured by pnv_ioda1_init_m64() */
36522750 } else if (phb->type == PNV_PHB_IODA2) {
36532751 pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
36542752 phb->ioda.m32_segmap);
....@@ -3660,13 +2758,20 @@
36602758 struct pnv_phb *phb = pe->phb;
36612759 struct pnv_ioda_pe *slave, *tmp;
36622760
2761
+ pe_info(pe, "Releasing PE\n");
2762
+
2763
+ mutex_lock(&phb->ioda.pe_list_mutex);
36632764 list_del(&pe->list);
2765
+ mutex_unlock(&phb->ioda.pe_list_mutex);
2766
+
36642767 switch (phb->type) {
36652768 case PNV_PHB_IODA1:
36662769 pnv_pci_ioda1_release_pe_dma(pe);
36672770 break;
36682771 case PNV_PHB_IODA2:
36692772 pnv_pci_ioda2_release_pe_dma(pe);
2773
+ break;
2774
+ case PNV_PHB_NPU_OCAPI:
36702775 break;
36712776 default:
36722777 WARN_ON(1);
....@@ -3689,25 +2794,34 @@
36892794 * that it can be populated again in PCI hot add path. The PE
36902795 * shouldn't be destroyed as it's the global reserved resource.
36912796 */
3692
- if (phb->ioda.root_pe_populated &&
3693
- phb->ioda.root_pe_idx == pe->pe_number)
3694
- phb->ioda.root_pe_populated = false;
3695
- else
3696
- pnv_ioda_free_pe(pe);
2797
+ if (phb->ioda.root_pe_idx == pe->pe_number)
2798
+ return;
2799
+
2800
+ pnv_ioda_free_pe(pe);
36972801 }
36982802
36992803 static void pnv_pci_release_device(struct pci_dev *pdev)
37002804 {
3701
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
3702
- struct pnv_phb *phb = hose->private_data;
2805
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
37032806 struct pci_dn *pdn = pci_get_pdn(pdev);
37042807 struct pnv_ioda_pe *pe;
37052808
2809
+ /* The VF PE state is torn down when sriov_disable() is called */
37062810 if (pdev->is_virtfn)
37072811 return;
37082812
37092813 if (!pdn || pdn->pe_number == IODA_INVALID_PE)
37102814 return;
2815
+
2816
+#ifdef CONFIG_PCI_IOV
2817
+ /*
2818
+ * FIXME: Try move this to sriov_disable(). It's here since we allocate
2819
+ * the iov state at probe time since we need to fiddle with the IOV
2820
+ * resources.
2821
+ */
2822
+ if (pdev->is_physfn)
2823
+ kfree(pdev->dev.archdata.iov_data);
2824
+#endif
37112825
37122826 /*
37132827 * PCI hotplug can happen as part of EEH error recovery. The @pdn
....@@ -3725,6 +2839,15 @@
37252839 pnv_ioda_release_pe(pe);
37262840 }
37272841
2842
+static void pnv_npu_disable_device(struct pci_dev *pdev)
2843
+{
2844
+ struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
2845
+ struct eeh_pe *eehpe = edev ? edev->pe : NULL;
2846
+
2847
+ if (eehpe && eeh_ops && eeh_ops->reset)
2848
+ eeh_ops->reset(eehpe, EEH_RESET_HOT);
2849
+}
2850
+
37282851 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
37292852 {
37302853 struct pnv_phb *phb = hose->private_data;
....@@ -3733,46 +2856,52 @@
37332856 OPAL_ASSERT_RESET);
37342857 }
37352858
2859
+static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus)
2860
+{
2861
+ struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
2862
+ struct pnv_ioda_pe *pe;
2863
+
2864
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
2865
+ if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
2866
+ continue;
2867
+
2868
+ if (!pe->pbus)
2869
+ continue;
2870
+
2871
+ if (bus->number == ((pe->rid >> 8) & 0xFF)) {
2872
+ pe->pbus = bus;
2873
+ break;
2874
+ }
2875
+ }
2876
+}
2877
+
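A sketch of the RID match in pnv_pci_ioda_dma_bus_setup() above: the PE's RID packs the bus number in bits 15:8 and devfn in bits 7:0, so the comparison extracts bits 15:8. The example RID is invented:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t rid = 0x0a10;		/* assumed example RID */
	uint8_t bus = (rid >> 8) & 0xFF;	/* what the code compares */
	uint8_t devfn = rid & 0xFF;

	printf("bus %02x device %02x function %x\n",
	       bus, devfn >> 3, devfn & 0x7);
	return 0;
}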
37362878 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
3737
- .dma_dev_setup = pnv_pci_dma_dev_setup,
3738
- .dma_bus_setup = pnv_pci_dma_bus_setup,
3739
-#ifdef CONFIG_PCI_MSI
2879
+ .dma_dev_setup = pnv_pci_ioda_dma_dev_setup,
2880
+ .dma_bus_setup = pnv_pci_ioda_dma_bus_setup,
2881
+ .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
37402882 .setup_msi_irqs = pnv_setup_msi_irqs,
37412883 .teardown_msi_irqs = pnv_teardown_msi_irqs,
3742
-#endif
37432884 .enable_device_hook = pnv_pci_enable_device_hook,
37442885 .release_device = pnv_pci_release_device,
37452886 .window_alignment = pnv_pci_window_alignment,
3746
- .setup_bridge = pnv_pci_setup_bridge,
2887
+ .setup_bridge = pnv_pci_fixup_bridge_resources,
37472888 .reset_secondary_bus = pnv_pci_reset_secondary_bus,
3748
- .dma_set_mask = pnv_pci_ioda_dma_set_mask,
3749
- .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask,
37502889 .shutdown = pnv_pci_ioda_shutdown,
37512890 };
37522891
3753
-static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
3754
-{
3755
- dev_err_once(&npdev->dev,
3756
- "%s operation unsupported for NVLink devices\n",
3757
- __func__);
3758
- return -EPERM;
3759
-}
3760
-
37612892 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
3762
- .dma_dev_setup = pnv_pci_dma_dev_setup,
3763
-#ifdef CONFIG_PCI_MSI
37642893 .setup_msi_irqs = pnv_setup_msi_irqs,
37652894 .teardown_msi_irqs = pnv_teardown_msi_irqs,
3766
-#endif
37672895 .enable_device_hook = pnv_pci_enable_device_hook,
37682896 .window_alignment = pnv_pci_window_alignment,
37692897 .reset_secondary_bus = pnv_pci_reset_secondary_bus,
3770
- .dma_set_mask = pnv_npu_dma_set_mask,
37712898 .shutdown = pnv_pci_ioda_shutdown,
2899
+ .disable_device = pnv_npu_disable_device,
37722900 };
37732901
37742902 static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
3775
- .enable_device_hook = pnv_pci_enable_device_hook,
2903
+ .enable_device_hook = pnv_ocapi_enable_device_hook,
2904
+ .release_device = pnv_pci_release_device,
37762905 .window_alignment = pnv_pci_window_alignment,
37772906 .reset_secondary_bus = pnv_pci_reset_secondary_bus,
37782907 .shutdown = pnv_pci_ioda_shutdown,
....@@ -3785,6 +2914,7 @@
37852914 struct pnv_phb *phb;
37862915 unsigned long size, m64map_off, m32map_off, pemap_off;
37872916 unsigned long iomap_off = 0, dma32map_off = 0;
2917
+ struct pnv_ioda_pe *root_pe;
37882918 struct resource r;
37892919 const __be64 *prop64;
37902920 const __be32 *prop32;
....@@ -3807,7 +2937,10 @@
38072937 phb_id = be64_to_cpup(prop64);
38082938 pr_debug(" PHB-ID : 0x%016llx\n", phb_id);
38092939
3810
- phb = memblock_virt_alloc(sizeof(*phb), 0);
2940
+ phb = memblock_alloc(sizeof(*phb), SMP_CACHE_BYTES);
2941
+ if (!phb)
2942
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
2943
+ sizeof(*phb));
38112944
38122945 /* Allocate PCI controller */
38132946 phb->hose = hose = pcibios_alloc_controller(np);
....@@ -3853,7 +2986,10 @@
38532986 else
38542987 phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
38552988
3856
- phb->diag_data = memblock_virt_alloc(phb->diag_data_size, 0);
2989
+ phb->diag_data = memblock_alloc(phb->diag_data_size, SMP_CACHE_BYTES);
2990
+ if (!phb->diag_data)
2991
+ panic("%s: Failed to allocate %u bytes\n", __func__,
2992
+ phb->diag_data_size);
38572993
38582994 /* Parse 32-bit and IO ranges (if any) */
38592995 pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
....@@ -3897,7 +3033,7 @@
38973033 PNV_IODA1_DMA32_SEGSIZE;
38983034
38993035 /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
3900
- size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
3036
+ size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
39013037 sizeof(unsigned long));
39023038 m64map_off = size;
39033039 size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
....@@ -3912,7 +3048,9 @@
39123048 }
39133049 pemap_off = size;
39143050 size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
3915
- aux = memblock_virt_alloc(size, 0);
3051
+ aux = memblock_alloc(size, SMP_CACHE_BYTES);
3052
+ if (!aux)
3053
+ panic("%s: Failed to allocate %lu bytes\n", __func__, size);
39163054 phb->ioda.pe_alloc = aux;
39173055 phb->ioda.m64_segmap = aux + m64map_off;
39183056 phb->ioda.m32_segmap = aux + m32map_off;
....@@ -3944,7 +3082,9 @@
39443082 phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
39453083 pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
39463084 } else {
3947
- phb->ioda.root_pe_idx = IODA_INVALID_PE;
3085
+ /* otherwise just allocate one */
3086
+ root_pe = pnv_ioda_alloc_pe(phb, 1);
3087
+ phb->ioda.root_pe_idx = root_pe->pe_number;
39483088 }
39493089
39503090 INIT_LIST_HEAD(&phb->ioda.pe_list);
....@@ -3999,7 +3139,6 @@
39993139 hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
40003140 break;
40013141 default:
4002
- phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
40033142 hose->controller_ops = pnv_pci_ioda_controller_ops;
40043143 }
40053144
....@@ -4024,9 +3163,12 @@
40243163 * shutdown PCI devices correctly. We already got IODA table
40253164 * cleaned out. So we have to issue PHB reset to stop all PCI
40263165 * transactions from previous kernel. The ppc_pci_reset_phbs
4027
- * kernel parameter will force this reset too.
3166
+ * kernel parameter will force this reset too. Additionally,
3167
+ * if the IODA reset above failed then use a bigger hammer.
3168
+ * This can happen if we get a PHB fatal error in very early
3169
+ * boot.
40283170 */
4029
- if (is_kdump_kernel() || pci_reset_phbs) {
3171
+ if (is_kdump_kernel() || pci_reset_phbs || rc) {
40303172 pr_info(" Issue PHB reset ...\n");
40313173 pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
40323174 pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
....@@ -4054,8 +3196,7 @@
40543196
40553197 static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
40563198 {
4057
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
4058
- struct pnv_phb *phb = hose->private_data;
3199
+ struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
40593200
40603201 if (!machine_is(powernv))
40613202 return;