2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
kernel/drivers/vfio/pci/vfio_pci.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
  * Author: Alex Williamson <alex.williamson@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  *
  * Derived from original vfio:
  * Copyright 2010 Cisco Systems, Inc. All rights reserved.
@@ -57,7 +54,15 @@
 MODULE_PARM_DESC(disable_idle_d3,
 		 "Disable using the PCI D3 low power state for idle, unused devices");

-static DEFINE_MUTEX(driver_lock);
+static bool enable_sriov;
+#ifdef CONFIG_PCI_IOV
+module_param(enable_sriov, bool, 0644);
+MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF.");
+#endif
+
+static bool disable_denylist;
+module_param(disable_denylist, bool, 0444);
+MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users.");

 static inline bool vfio_vga_disabled(void)
 {
@@ -66,6 +71,44 @@
 #else
 	return true;
 #endif
+}
+
+static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
+{
+	switch (pdev->vendor) {
+	case PCI_VENDOR_ID_INTEL:
+		switch (pdev->device) {
+		case PCI_DEVICE_ID_INTEL_QAT_C3XXX:
+		case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF:
+		case PCI_DEVICE_ID_INTEL_QAT_C62X:
+		case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
+		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
+		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
+			return true;
+		default:
+			return false;
+		}
+	}
+
+	return false;
+}
+
+static bool vfio_pci_is_denylisted(struct pci_dev *pdev)
+{
+	if (!vfio_pci_dev_in_denylist(pdev))
+		return false;
+
+	if (disable_denylist) {
+		pci_warn(pdev,
+			 "device denylist disabled - allowing device %04x:%04x.\n",
+			 pdev->vendor, pdev->device);
+		return false;
+	}
+
+	pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n",
+		 pdev->vendor, pdev->device);
+
+	return true;
 }

 /*
@@ -115,11 +158,13 @@
 static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
 {
 	struct resource *res;
-	int bar;
+	int i;
 	struct vfio_pci_dummy_resource *dummy_res;

-	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
-		res = vdev->pdev->resource + bar;
+	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+		int bar = i + PCI_STD_RESOURCES;
+
+		res = &vdev->pdev->resource[bar];

 		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
 			goto no_mmap;
@@ -202,6 +247,8 @@
 		case 0x1580 ... 0x1581:
 		case 0x1583 ... 0x158b:
 		case 0x37d0 ... 0x37d2:
+		/* X550 */
+		case 0x1563:
 			return true;
 		default:
 			return false;
@@ -211,6 +258,57 @@
 	return false;
 }

+static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u16 pmcsr;
+
+	if (!pdev->pm_cap)
+		return;
+
+	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
+
+	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
+}
+
+/*
+ * pci_set_power_state() wrapper handling devices which perform a soft reset on
+ * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev,
+ * restore when returned to D0. Saved separately from pci_saved_state for use
+ * by PM capability emulation and separately from pci_dev internal saved state
+ * to avoid it being overwritten and consumed around other resets.
+ */
+int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	bool needs_restore = false, needs_save = false;
+	int ret;
+
+	if (vdev->needs_pm_restore) {
+		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
+			pci_save_state(pdev);
+			needs_save = true;
+		}
+
+		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
+			needs_restore = true;
+	}
+
+	ret = pci_set_power_state(pdev, state);
+
+	if (!ret) {
+		/* D3 might be unsupported via quirk, skip unless in D3 */
+		if (needs_save && pdev->current_state >= PCI_D3hot) {
+			vdev->pm_save = pci_store_saved_state(pdev);
+		} else if (needs_restore) {
+			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
+			pci_restore_state(pdev);
+		}
+	}
+
+	return ret;
+}
+
 static int vfio_pci_enable(struct vfio_pci_device *vdev)
 {
 	struct pci_dev *pdev = vdev->pdev;
@@ -218,7 +316,7 @@
 	u16 cmd;
 	u8 msix_pos;

-	pci_set_power_state(pdev, PCI_D0);
+	vfio_pci_set_power_state(vdev, PCI_D0);

 	/* Don't allow our initial saved state to include busmaster */
 	pci_clear_master(pdev);
@@ -238,12 +336,11 @@
 	pci_save_state(pdev);
 	vdev->pci_saved_state = pci_store_saved_state(pdev);
 	if (!vdev->pci_saved_state)
-		pr_debug("%s: Couldn't store %s saved state\n",
-			 __func__, dev_name(&pdev->dev));
+		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);

 	if (likely(!nointxmask)) {
 		if (vfio_pci_nointx(pdev)) {
-			dev_info(&pdev->dev, "Masking broken INTx support\n");
+			pci_info(pdev, "Masking broken INTx support\n");
 			vdev->nointx = true;
 			pci_intx(pdev, 0);
 		} else
@@ -286,17 +383,37 @@
 	    pdev->vendor == PCI_VENDOR_ID_INTEL &&
 	    IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
 		ret = vfio_pci_igd_init(vdev);
-		if (ret) {
-			dev_warn(&vdev->pdev->dev,
-				 "Failed to setup Intel IGD regions\n");
-			vfio_pci_disable(vdev);
-			return ret;
+		if (ret && ret != -ENODEV) {
+			pci_warn(pdev, "Failed to setup Intel IGD regions\n");
+			goto disable_exit;
+		}
+	}
+
+	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+		ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");
+			goto disable_exit;
+		}
+	}
+
+	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+		ret = vfio_pci_ibm_npu2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");
+			goto disable_exit;
 		}
 	}

 	vfio_pci_probe_mmaps(vdev);

 	return 0;
+
+disable_exit:
+	vfio_pci_disable(vdev);
+	return ret;
 }

 static void vfio_pci_disable(struct vfio_pci_device *vdev)
@@ -333,7 +450,8 @@

 	vfio_config_free(vdev);

-	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
+	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+		bar = i + PCI_STD_RESOURCES;
 		if (!vdev->barmap[bar])
 			continue;
 		pci_iounmap(pdev, vdev->barmap[bar]);
@@ -357,8 +475,7 @@
 	 * is just busy work.
 	 */
 	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
-		pr_info("%s: Couldn't reload %s saved state\n",
-			__func__, dev_name(&pdev->dev));
+		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);

 		if (!vdev->reset_works)
 			goto out;
@@ -395,26 +512,63 @@
 	vfio_pci_try_bus_reset(vdev);

 	if (!disable_idle_d3)
-		pci_set_power_state(pdev, PCI_D3hot);
+		vfio_pci_set_power_state(vdev, PCI_D3hot);
+}
+
+static struct pci_driver vfio_pci_driver;
+
+static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev,
+					   struct vfio_device **pf_dev)
+{
+	struct pci_dev *physfn = pci_physfn(vdev->pdev);
+
+	if (!vdev->pdev->is_virtfn)
+		return NULL;
+
+	*pf_dev = vfio_device_get_from_dev(&physfn->dev);
+	if (!*pf_dev)
+		return NULL;
+
+	if (pci_dev_driver(physfn) != &vfio_pci_driver) {
+		vfio_device_put(*pf_dev);
+		return NULL;
+	}
+
+	return vfio_device_data(*pf_dev);
+}
+
+static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val)
+{
+	struct vfio_device *pf_dev;
+	struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);
+
+	if (!pf_vdev)
+		return;
+
+	mutex_lock(&pf_vdev->vf_token->lock);
+	pf_vdev->vf_token->users += val;
+	WARN_ON(pf_vdev->vf_token->users < 0);
+	mutex_unlock(&pf_vdev->vf_token->lock);
+
+	vfio_device_put(pf_dev);
 }

 static void vfio_pci_release(void *device_data)
 {
 	struct vfio_pci_device *vdev = device_data;

-	mutex_lock(&driver_lock);
+	mutex_lock(&vdev->reflck->lock);

 	if (!(--vdev->refcnt)) {
+		vfio_pci_vf_token_user_add(vdev, -1);
 		vfio_spapr_pci_eeh_release(vdev->pdev);
 		vfio_pci_disable(vdev);
+
 		mutex_lock(&vdev->igate);
 		if (vdev->err_trigger) {
 			eventfd_ctx_put(vdev->err_trigger);
 			vdev->err_trigger = NULL;
 		}
-		mutex_unlock(&vdev->igate);
-
-		mutex_lock(&vdev->igate);
 		if (vdev->req_trigger) {
 			eventfd_ctx_put(vdev->req_trigger);
 			vdev->req_trigger = NULL;
@@ -422,7 +576,7 @@
 		mutex_unlock(&vdev->igate);
 	}

-	mutex_unlock(&driver_lock);
+	mutex_unlock(&vdev->reflck->lock);

 	module_put(THIS_MODULE);
 }
@@ -435,7 +589,7 @@
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;

-	mutex_lock(&driver_lock);
+	mutex_lock(&vdev->reflck->lock);

 	if (!vdev->refcnt) {
 		ret = vfio_pci_enable(vdev);
@@ -443,10 +597,11 @@
 			goto error;

 		vfio_spapr_pci_eeh_open(vdev->pdev);
+		vfio_pci_vf_token_user_add(vdev, 1);
 	}
 	vdev->refcnt++;
 error:
-	mutex_unlock(&driver_lock);
+	mutex_unlock(&vdev->reflck->lock);
 	if (ret)
 		module_put(THIS_MODULE);
 	return ret;
@@ -650,14 +805,24 @@

 	if (cmd == VFIO_DEVICE_GET_INFO) {
 		struct vfio_device_info info;
+		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+		unsigned long capsz;

 		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		/* For backward compatibility, cannot require this */
+		capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);

 		if (copy_from_user(&info, (void __user *)arg, minsz))
 			return -EFAULT;

 		if (info.argsz < minsz)
 			return -EINVAL;
+
+		if (info.argsz >= capsz) {
+			minsz = capsz;
+			info.cap_offset = 0;
+		}

 		info.flags = VFIO_DEVICE_FLAGS_PCI;

@@ -666,6 +831,33 @@

 		info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
 		info.num_irqs = VFIO_PCI_NUM_IRQS;
+
+		if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) {
+			int ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
+
+			if (ret && ret != -ENODEV) {
+				pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
+				return ret;
+			}
+		}
+
+		if (caps.size) {
+			info.flags |= VFIO_DEVICE_FLAGS_CAPS;
+			if (info.argsz < sizeof(info) + caps.size) {
+				info.argsz = sizeof(info) + caps.size;
+			} else {
+				vfio_info_cap_shift(&caps, sizeof(info));
+				if (copy_to_user((void __user *)arg +
+						 sizeof(info), caps.buf,
+						 caps.size)) {
+					kfree(caps.buf);
+					return -EFAULT;
+				}
+				info.cap_offset = sizeof(info);
+			}
+
+			kfree(caps.buf);
+		}

 		return copy_to_user((void __user *)arg, &info, minsz) ?
 			-EFAULT : 0;
@@ -784,6 +976,12 @@
 			if (ret)
 				return ret;

+			if (vdev->region[i].ops->add_capability) {
+				ret = vdev->region[i].ops->add_capability(vdev,
+						&vdev->region[i], &caps);
+				if (ret)
+					return ret;
+			}
 		}
 	}

@@ -827,7 +1025,7 @@
 		case VFIO_PCI_ERR_IRQ_INDEX:
 			if (pci_is_pcie(vdev->pdev))
 				break;
-			/* fall through */
+			fallthrough;
 		default:
 			return -EINVAL;
 		}
@@ -1073,7 +1271,7 @@

 		/*
 		 * We need to get memory_lock for each device, but devices
-		 * can share mmap_sem, therefore we need to zap and hold
+		 * can share mmap_lock, therefore we need to zap and hold
 		 * the vma_lock for each device, and only then get each
 		 * memory_lock.
 		 */
@@ -1142,6 +1340,65 @@

 		return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
 					  ioeventfd.data, count, ioeventfd.fd);
+	} else if (cmd == VFIO_DEVICE_FEATURE) {
+		struct vfio_device_feature feature;
+		uuid_t uuid;
+
+		minsz = offsetofend(struct vfio_device_feature, flags);
+
+		if (copy_from_user(&feature, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (feature.argsz < minsz)
+			return -EINVAL;
+
+		/* Check unknown flags */
+		if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK |
+				      VFIO_DEVICE_FEATURE_SET |
+				      VFIO_DEVICE_FEATURE_GET |
+				      VFIO_DEVICE_FEATURE_PROBE))
+			return -EINVAL;
+
+		/* GET & SET are mutually exclusive except with PROBE */
+		if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
+		    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
+		    (feature.flags & VFIO_DEVICE_FEATURE_GET))
+			return -EINVAL;
+
+		switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
+		case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
+			if (!vdev->vf_token)
+				return -ENOTTY;
+
+			/*
+			 * We do not support GET of the VF Token UUID as this
+			 * could expose the token of the previous device user.
+			 */
+			if (feature.flags & VFIO_DEVICE_FEATURE_GET)
+				return -EINVAL;
+
+			if (feature.flags & VFIO_DEVICE_FEATURE_PROBE)
+				return 0;
+
+			/* Don't SET unless told to do so */
+			if (!(feature.flags & VFIO_DEVICE_FEATURE_SET))
+				return -EINVAL;
+
+			if (feature.argsz < minsz + sizeof(uuid))
+				return -EINVAL;
+
+			if (copy_from_user(&uuid, (void __user *)(arg + minsz),
+					   sizeof(uuid)))
+				return -EFAULT;
+
+			mutex_lock(&vdev->vf_token->lock);
+			uuid_copy(&vdev->vf_token->uuid, &uuid);
+			mutex_unlock(&vdev->vf_token->lock);
+
+			return 0;
+		default:
+			return -ENOTTY;
+		}
 	}

 	return -ENOTTY;
@@ -1204,26 +1461,26 @@

 	/*
 	 * Lock ordering:
-	 * vma_lock is nested under mmap_sem for vm_ops callback paths.
+	 * vma_lock is nested under mmap_lock for vm_ops callback paths.
 	 * The memory_lock semaphore is used by both code paths calling
 	 * into this function to zap vmas and the vm_ops.fault callback
 	 * to protect the memory enable state of the device.
 	 *
-	 * When zapping vmas we need to maintain the mmap_sem => vma_lock
+	 * When zapping vmas we need to maintain the mmap_lock => vma_lock
 	 * ordering, which requires using vma_lock to walk vma_list to
-	 * acquire an mm, then dropping vma_lock to get the mmap_sem and
+	 * acquire an mm, then dropping vma_lock to get the mmap_lock and
 	 * reacquiring vma_lock. This logic is derived from similar
 	 * requirements in uverbs_user_mmap_disassociate().
 	 *
-	 * mmap_sem must always be the top-level lock when it is taken.
+	 * mmap_lock must always be the top-level lock when it is taken.
 	 * Therefore we can only hold the memory_lock write lock when
-	 * vma_list is empty, as we'd need to take mmap_sem to clear
+	 * vma_list is empty, as we'd need to take mmap_lock to clear
 	 * entries. vma_list can only be guaranteed empty when holding
 	 * vma_lock, thus memory_lock is nested under vma_lock.
 	 *
 	 * This enables the vm_ops.fault callback to acquire vma_lock,
 	 * followed by memory_lock read lock, while already holding
-	 * mmap_sem without risk of deadlock.
+	 * mmap_lock without risk of deadlock.
 	 */
 	while (1) {
 		struct mm_struct *mm = NULL;
@@ -1251,39 +1508,37 @@
 		mutex_unlock(&vdev->vma_lock);

 		if (try) {
-			if (!down_read_trylock(&mm->mmap_sem)) {
+			if (!mmap_read_trylock(mm)) {
 				mmput(mm);
 				return 0;
 			}
 		} else {
-			down_read(&mm->mmap_sem);
+			mmap_read_lock(mm);
 		}
-		if (mmget_still_valid(mm)) {
-			if (try) {
-				if (!mutex_trylock(&vdev->vma_lock)) {
-					up_read(&mm->mmap_sem);
-					mmput(mm);
-					return 0;
-				}
-			} else {
-				mutex_lock(&vdev->vma_lock);
+		if (try) {
+			if (!mutex_trylock(&vdev->vma_lock)) {
+				mmap_read_unlock(mm);
+				mmput(mm);
+				return 0;
 			}
-			list_for_each_entry_safe(mmap_vma, tmp,
-						 &vdev->vma_list, vma_next) {
-				struct vm_area_struct *vma = mmap_vma->vma;
-
-				if (vma->vm_mm != mm)
-					continue;
-
-				list_del(&mmap_vma->vma_next);
-				kfree(mmap_vma);
-
-				zap_vma_ptes(vma, vma->vm_start,
-					     vma->vm_end - vma->vm_start);
-			}
-			mutex_unlock(&vdev->vma_lock);
+		} else {
+			mutex_lock(&vdev->vma_lock);
 		}
-		up_read(&mm->mmap_sem);
+		list_for_each_entry_safe(mmap_vma, tmp,
+					 &vdev->vma_list, vma_next) {
+			struct vm_area_struct *vma = mmap_vma->vma;
+
+			if (vma->vm_mm != mm)
+				continue;
+
+			list_del(&mmap_vma->vma_next);
+			kfree(mmap_vma);
+
+			zap_vma_ptes(vma, vma->vm_start,
+				     vma->vm_end - vma->vm_start);
+		}
+		mutex_unlock(&vdev->vma_lock);
+		mmap_read_unlock(mm);
 		mmput(mm);
 	}
 }
@@ -1416,10 +1671,21 @@

 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

+	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
+		return -EINVAL;
 	if (vma->vm_end < vma->vm_start)
 		return -EINVAL;
 	if ((vma->vm_flags & VM_SHARED) == 0)
 		return -EINVAL;
+	if (index >= VFIO_PCI_NUM_REGIONS) {
+		int regnum = index - VFIO_PCI_NUM_REGIONS;
+		struct vfio_pci_region *region = vdev->region + regnum;
+
+		if (region->ops && region->ops->mmap &&
+		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+			return region->ops->mmap(vdev, region, vma);
+		return -EINVAL;
+	}
 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
 		return -EINVAL;
 	if (!vdev->bar_mmap_supported[index])
@@ -1468,21 +1734,166 @@
 static void vfio_pci_request(void *device_data, unsigned int count)
 {
 	struct vfio_pci_device *vdev = device_data;
+	struct pci_dev *pdev = vdev->pdev;

 	mutex_lock(&vdev->igate);

 	if (vdev->req_trigger) {
 		if (!(count % 10))
-			dev_notice_ratelimited(&vdev->pdev->dev,
+			pci_notice_ratelimited(pdev,
 				"Relaying device request to user (#%u)\n",
 				count);
 		eventfd_signal(vdev->req_trigger, 1);
 	} else if (count == 0) {
-		dev_warn(&vdev->pdev->dev,
+		pci_warn(pdev,
 			"No device request channel registered, blocked until released by user\n");
 	}

 	mutex_unlock(&vdev->igate);
+}
+
+static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev,
+				      bool vf_token, uuid_t *uuid)
+{
+	/*
+	 * There's always some degree of trust or collaboration between SR-IOV
+	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
+	 * can disrupt VFs with a reset, but often the PF has more explicit
+	 * access to deny service to the VF or access data passed through the
+	 * VF. We therefore require an opt-in via a shared VF token (UUID) to
+	 * represent this trust. This both prevents that a VF driver might
+	 * assume the PF driver is a trusted, in-kernel driver, and also that
+	 * a PF driver might be replaced with a rogue driver, unknown to in-use
+	 * VF drivers.
+	 *
+	 * Therefore when presented with a VF, if the PF is a vfio device and
+	 * it is bound to the vfio-pci driver, the user needs to provide a VF
+	 * token to access the device, in the form of appending a vf_token to
+	 * the device name, for example:
+	 *
+	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
+	 *
+	 * When presented with a PF which has VFs in use, the user must also
+	 * provide the current VF token to prove collaboration with existing
+	 * VF users. If VFs are not in use, the VF token provided for the PF
+	 * device will act to set the VF token.
+	 *
+	 * If the VF token is provided but unused, an error is generated.
+	 */
+	if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token)
+		return 0; /* No VF token provided or required */
+
+	if (vdev->pdev->is_virtfn) {
+		struct vfio_device *pf_dev;
+		struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);
+		bool match;
+
+		if (!pf_vdev) {
+			if (!vf_token)
+				return 0; /* PF is not vfio-pci, no VF token */
+
+			pci_info_ratelimited(vdev->pdev,
+				"VF token incorrectly provided, PF not bound to vfio-pci\n");
+			return -EINVAL;
+		}
+
+		if (!vf_token) {
+			vfio_device_put(pf_dev);
+			pci_info_ratelimited(vdev->pdev,
+				"VF token required to access device\n");
+			return -EACCES;
+		}
+
+		mutex_lock(&pf_vdev->vf_token->lock);
+		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
+		mutex_unlock(&pf_vdev->vf_token->lock);
+
+		vfio_device_put(pf_dev);
+
+		if (!match) {
+			pci_info_ratelimited(vdev->pdev,
+				"Incorrect VF token provided for device\n");
+			return -EACCES;
+		}
+	} else if (vdev->vf_token) {
+		mutex_lock(&vdev->vf_token->lock);
+		if (vdev->vf_token->users) {
+			if (!vf_token) {
+				mutex_unlock(&vdev->vf_token->lock);
+				pci_info_ratelimited(vdev->pdev,
+					"VF token required to access device\n");
+				return -EACCES;
+			}
+
+			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
+				mutex_unlock(&vdev->vf_token->lock);
+				pci_info_ratelimited(vdev->pdev,
+					"Incorrect VF token provided for device\n");
+				return -EACCES;
+			}
+		} else if (vf_token) {
+			uuid_copy(&vdev->vf_token->uuid, uuid);
+		}
+
+		mutex_unlock(&vdev->vf_token->lock);
+	} else if (vf_token) {
+		pci_info_ratelimited(vdev->pdev,
+			"VF token incorrectly provided, not a PF or VF\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#define VF_TOKEN_ARG "vf_token="
+
+static int vfio_pci_match(void *device_data, char *buf)
+{
+	struct vfio_pci_device *vdev = device_data;
+	bool vf_token = false;
+	uuid_t uuid;
+	int ret;
+
+	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
+		return 0; /* No match */
+
+	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
+		buf += strlen(pci_name(vdev->pdev));
+
+		if (*buf != ' ')
+			return 0; /* No match: non-whitespace after name */
+
+		while (*buf) {
+			if (*buf == ' ') {
+				buf++;
+				continue;
+			}
+
+			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
						  strlen(VF_TOKEN_ARG))) {
+				buf += strlen(VF_TOKEN_ARG);
+
+				if (strlen(buf) < UUID_STRING_LEN)
+					return -EINVAL;
+
+				ret = uuid_parse(buf, &uuid);
+				if (ret)
+					return ret;
+
+				vf_token = true;
+				buf += UUID_STRING_LEN;
+			} else {
+				/* Unknown/duplicate option */
+				return -EINVAL;
+			}
+		}
+	}
+
+	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
+	if (ret)
+		return ret;
+
+	return 1; /* Match */
 }

 static const struct vfio_device_ops vfio_pci_ops = {
@@ -1494,7 +1905,101 @@
 	.write		= vfio_pci_write,
 	.mmap		= vfio_pci_mmap,
 	.request	= vfio_pci_request,
+	.match		= vfio_pci_match,
 };
+
+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
+
+static int vfio_pci_bus_notifier(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct vfio_pci_device *vdev = container_of(nb,
+						    struct vfio_pci_device, nb);
+	struct device *dev = data;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_dev *physfn = pci_physfn(pdev);
+
+	if (action == BUS_NOTIFY_ADD_DEVICE &&
+	    pdev->is_virtfn && physfn == vdev->pdev) {
+		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
+			 pci_name(pdev));
+		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
+						  vfio_pci_ops.name);
+	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
+		   pdev->is_virtfn && physfn == vdev->pdev) {
+		struct pci_driver *drv = pci_dev_driver(pdev);
+
+		if (drv && drv != &vfio_pci_driver)
+			pci_warn(vdev->pdev,
+				 "VF %s bound to driver %s while PF bound to vfio-pci\n",
+				 pci_name(pdev), drv->name);
+	}
+
+	return 0;
+}
+
+static int vfio_pci_vf_init(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int ret;
+
+	if (!pdev->is_physfn)
+		return 0;
+
+	vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
+	if (!vdev->vf_token)
+		return -ENOMEM;
+
+	mutex_init(&vdev->vf_token->lock);
+	uuid_gen(&vdev->vf_token->uuid);
+
+	vdev->nb.notifier_call = vfio_pci_bus_notifier;
+	ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
+	if (ret) {
+		kfree(vdev->vf_token);
+		return ret;
+	}
+	return 0;
+}
+
+static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev)
+{
+	if (!vdev->vf_token)
+		return;
+
+	bus_unregister_notifier(&pci_bus_type, &vdev->nb);
+	WARN_ON(vdev->vf_token->users);
+	mutex_destroy(&vdev->vf_token->lock);
+	kfree(vdev->vf_token);
+}
+
+static int vfio_pci_vga_init(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int ret;
+
+	if (!vfio_pci_is_vga(pdev))
+		return 0;
+
+	ret = vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
+	if (ret)
+		return ret;
+	vga_set_legacy_decoding(pdev, vfio_pci_set_vga_decode(vdev, false));
+	return 0;
+}
+
+static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+
+	if (!vfio_pci_is_vga(pdev))
+		return;
+	vga_client_register(pdev, NULL, NULL, NULL);
+	vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
+					VGA_RSRC_LEGACY_IO |
+					VGA_RSRC_LEGACY_MEM);
+}

 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
@@ -1502,16 +2007,19 @@
 	struct iommu_group *group;
 	int ret;

+	if (vfio_pci_is_denylisted(pdev))
+		return -EINVAL;
+
 	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 		return -EINVAL;

 	/*
-	 * Prevent binding to PFs with VFs enabled, this too easily allows
-	 * userspace instance with VFs and PFs from the same device, which
-	 * cannot work. Disabling SR-IOV here would initiate removing the
-	 * VFs, which would unbind the driver, which is prone to blocking
-	 * if that VF is also in use by vfio-pci. Just reject these PFs
-	 * and let the user sort it out.
+	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
+	 * by the host or other users. We cannot capture the VFs if they
+	 * already exist, nor can we track VF users. Disabling SR-IOV here
+	 * would initiate removing the VFs, which would unbind the driver,
+	 * which is prone to blocking if that VF is also in use by vfio-pci.
+	 * Just reject these PFs and let the user sort it out.
 	 */
 	if (pci_num_vf(pdev)) {
 		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
@@ -1524,8 +2032,8 @@

 	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
 	if (!vdev) {
-		vfio_iommu_group_put(group, &pdev->dev);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_group_put;
 	}

 	vdev->pdev = pdev;
@@ -1539,18 +2047,17 @@
 	INIT_LIST_HEAD(&vdev->vma_list);
 	init_rwsem(&vdev->memory_lock);

-	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
-	if (ret) {
-		vfio_iommu_group_put(group, &pdev->dev);
-		kfree(vdev);
-		return ret;
-	}
+	ret = vfio_pci_reflck_attach(vdev);
+	if (ret)
+		goto out_free;
+	ret = vfio_pci_vf_init(vdev);
+	if (ret)
+		goto out_reflck;
+	ret = vfio_pci_vga_init(vdev);
+	if (ret)
+		goto out_vf;

-	if (vfio_pci_is_vga(pdev)) {
-		vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
-		vga_set_legacy_decoding(pdev,
-					vfio_pci_set_vga_decode(vdev, false));
-	}
+	vfio_pci_probe_power_state(vdev);

 	if (!disable_idle_d3) {
 		/*
@@ -1562,10 +2069,27 @@
 		 * be able to get to D3. Therefore first do a D0 transition
 		 * before going to D3.
 		 */
-		pci_set_power_state(pdev, PCI_D0);
-		pci_set_power_state(pdev, PCI_D3hot);
+		vfio_pci_set_power_state(vdev, PCI_D0);
+		vfio_pci_set_power_state(vdev, PCI_D3hot);
 	}

+	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
+	if (ret)
+		goto out_power;
+	return 0;
+
+out_power:
+	if (!disable_idle_d3)
+		vfio_pci_set_power_state(vdev, PCI_D0);
+out_vf:
+	vfio_pci_vf_uninit(vdev);
+out_reflck:
+	vfio_pci_reflck_put(vdev->reflck);
+out_free:
+	kfree(vdev->pm_save);
+	kfree(vdev);
+out_group_put:
+	vfio_iommu_group_put(group, &pdev->dev);
 	return ret;
 }

@@ -1573,24 +2097,25 @@
 {
 	struct vfio_pci_device *vdev;

+	pci_disable_sriov(pdev);
+
 	vdev = vfio_del_group_dev(&pdev->dev);
 	if (!vdev)
 		return;

-	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
-	kfree(vdev->region);
-	mutex_destroy(&vdev->ioeventfds_lock);
-	kfree(vdev);
+	vfio_pci_vf_uninit(vdev);
+	vfio_pci_reflck_put(vdev->reflck);
+	vfio_pci_vga_uninit(vdev);

-	if (vfio_pci_is_vga(pdev)) {
-		vga_client_register(pdev, NULL, NULL, NULL);
-		vga_set_legacy_decoding(pdev,
-				VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
-				VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
-	}
+	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);

 	if (!disable_idle_d3)
-		pci_set_power_state(pdev, PCI_D0);
+		vfio_pci_set_power_state(vdev, PCI_D0);
+
+	mutex_destroy(&vdev->ioeventfds_lock);
+	kfree(vdev->region);
+	kfree(vdev->pm_save);
+	kfree(vdev);
 }

 static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
@@ -1621,22 +2146,135 @@
 	return PCI_ERS_RESULT_CAN_RECOVER;
 }

+static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
+{
+	struct vfio_pci_device *vdev;
+	struct vfio_device *device;
+	int ret = 0;
+
+	might_sleep();
+
+	if (!enable_sriov)
+		return -ENOENT;
+
+	device = vfio_device_get_from_dev(&pdev->dev);
+	if (!device)
+		return -ENODEV;
+
+	vdev = vfio_device_data(device);
+	if (!vdev) {
+		vfio_device_put(device);
+		return -ENODEV;
+	}
+
+	if (nr_virtfn == 0)
+		pci_disable_sriov(pdev);
+	else
+		ret = pci_enable_sriov(pdev, nr_virtfn);
+
+	vfio_device_put(device);
+
+	return ret < 0 ? ret : nr_virtfn;
+}
+
 static const struct pci_error_handlers vfio_err_handlers = {
 	.error_detected = vfio_pci_aer_err_detected,
 };

 static struct pci_driver vfio_pci_driver = {
-	.name		= "vfio-pci",
-	.id_table	= NULL, /* only dynamic ids */
-	.probe		= vfio_pci_probe,
-	.remove		= vfio_pci_remove,
-	.err_handler	= &vfio_err_handlers,
+	.name			= "vfio-pci",
+	.id_table		= NULL, /* only dynamic ids */
+	.probe			= vfio_pci_probe,
+	.remove			= vfio_pci_remove,
+	.sriov_configure	= vfio_pci_sriov_configure,
+	.err_handler		= &vfio_err_handlers,
 };

-static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
+static DEFINE_MUTEX(reflck_lock);
+
+static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
+{
+	struct vfio_pci_reflck *reflck;
+
+	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
+	if (!reflck)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&reflck->kref);
+	mutex_init(&reflck->lock);
+
+	return reflck;
+}
+
+static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
+{
+	kref_get(&reflck->kref);
+}
+
+static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
+{
+	struct vfio_pci_reflck **preflck = data;
+	struct vfio_device *device;
+	struct vfio_pci_device *vdev;
+
+	device = vfio_device_get_from_dev(&pdev->dev);
+	if (!device)
+		return 0;
+
+	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
+		vfio_device_put(device);
+		return 0;
+	}
+
+	vdev = vfio_device_data(device);
+
+	if (vdev->reflck) {
+		vfio_pci_reflck_get(vdev->reflck);
+		*preflck = vdev->reflck;
+		vfio_device_put(device);
+		return 1;
+	}
+
+	vfio_device_put(device);
+	return 0;
+}
+
+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
+{
+	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
+
+	mutex_lock(&reflck_lock);
+
+	if (pci_is_root_bus(vdev->pdev->bus) ||
+	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
+					  &vdev->reflck, slot) <= 0)
+		vdev->reflck = vfio_pci_reflck_alloc();
+
+	mutex_unlock(&reflck_lock);
+
+	return PTR_ERR_OR_ZERO(vdev->reflck);
+}
+
+static void vfio_pci_reflck_release(struct kref *kref)
+{
+	struct vfio_pci_reflck *reflck = container_of(kref,
+						      struct vfio_pci_reflck,
+						      kref);
+
+	kfree(reflck);
+	mutex_unlock(&reflck_lock);
+}
+
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
+{
+	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
+}
+
+static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
 {
 	struct vfio_devices *devs = data;
 	struct vfio_device *device;
+	struct vfio_pci_device *vdev;

 	if (devs->cur_index == devs->max_index)
 		return -ENOSPC;
@@ -1646,6 +2284,14 @@
 		return -EINVAL;

 	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
+		vfio_device_put(device);
+		return -EBUSY;
+	}
+
+	vdev = vfio_device_data(device);
+
+	/* Fault if the device is not unused */
+	if (vdev->refcnt) {
 		vfio_device_put(device);
 		return -EBUSY;
 	}
@@ -1688,11 +2334,15 @@
 }

 /*
- * Attempt to do a bus/slot reset if there are devices affected by a reset for
- * this device that are needs_reset and all of the affected devices are unused
- * (!refcnt). Callers are required to hold driver_lock when calling this to
- * prevent device opens and concurrent bus reset attempts. We prevent device
- * unbinds by acquiring and holding a reference to the vfio_device.
+ * If a bus or slot reset is available for the provided device and:
+ *  - All of the devices affected by that bus or slot reset are unused
+ *    (!refcnt)
+ *  - At least one of the affected devices is marked dirty via
+ *    needs_reset (such as by lack of FLR support)
+ * Then attempt to perform that bus or slot reset. Callers are required
+ * to hold vdev->reflck->lock, protecting the bus/slot reset group from
+ * concurrent opens. A vfio_device reference is acquired for each device
+ * to prevent unbinds during the reset operation.
 *
 * NB: vfio-core considers a group to be viable even if some devices are
 * bound to drivers like pci-stub or pcieport. Here we require all devices
@@ -1703,7 +2353,7 @@
 {
 	struct vfio_devices devs = { .cur_index = 0 };
 	int i = 0, ret = -EINVAL;
-	bool needs_reset = false, slot = false;
+	bool slot = false;
 	struct vfio_pci_device *tmp;

 	if (!pci_probe_reset_slot(vdev->pdev->slot))
@@ -1721,28 +2371,36 @@
 		return;

 	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
-					  vfio_pci_get_devs, &devs, slot))
+					  vfio_pci_get_unused_devs,
+					  &devs, slot))
 		goto put_devs;

+	/* Does at least one need a reset? */
 	for (i = 0; i < devs.cur_index; i++) {
 		tmp = vfio_device_data(devs.devices[i]);
-		if (tmp->needs_reset)
-			needs_reset = true;
-		if (tmp->refcnt)
-			goto put_devs;
+		if (tmp->needs_reset) {
+			ret = pci_reset_bus(vdev->pdev);
+			break;
+		}
 	}
-
-	if (needs_reset)
-		ret = pci_reset_bus(vdev->pdev);

 put_devs:
 	for (i = 0; i < devs.cur_index; i++) {
 		tmp = vfio_device_data(devs.devices[i]);
-		if (!ret)
+
+		/*
+		 * If reset was successful, affected devices no longer need
+		 * a reset and we should return all the collateral devices
+		 * to low power. If not successful, we either didn't reset
+		 * the bus or timed out waiting for it, so let's not touch
+		 * the power state.
+		 */
+		if (!ret) {
 			tmp->needs_reset = false;

-		if (!tmp->refcnt && !disable_idle_d3)
-			pci_set_power_state(tmp->pdev, PCI_D3hot);
+			if (tmp != vdev && !disable_idle_d3)
+				vfio_pci_set_power_state(tmp, PCI_D3hot);
+		}

 		vfio_device_put(devs.devices[i]);
 	}
@@ -1813,6 +2471,9 @@

 	vfio_pci_fill_ids();

+	if (disable_denylist)
+		pr_warn("device denylist disabled.\n");
+
 	return 0;

 out_driver:
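
For context, and not part of the commit above: the vf_token option parsed by the new vfio_pci_match() is appended to the device name that userspace passes to VFIO_GROUP_GET_DEVICE_FD, as the in-code comment's example string shows. A minimal userspace sketch of that path follows; the IOMMU group number is a made-up placeholder, the BDF/UUID are the example values quoted in the commit's comment, and error handling is trimmed.

/*
 * Hypothetical example only: open a VF whose PF is bound to vfio-pci,
 * supplying the VF token that vfio_pci_match()/vfio_pci_validate_vf_token()
 * check. Group "26" and the BDF/UUID below are placeholders.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
	struct vfio_group_status status = { .argsz = sizeof(status) };
	int container, group, device;

	container = open("/dev/vfio/vfio", O_RDWR);
	group = open("/dev/vfio/26", O_RDWR);	/* assumed IOMMU group */

	/* Group must be viable (all devices bound to vfio or benign drivers) */
	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
		return 1;

	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);

	/* Device name plus the vf_token= option handled by vfio_pci_match() */
	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD,
		       "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3");
	if (device < 0) {
		perror("VFIO_GROUP_GET_DEVICE_FD");
		return 1;
	}

	printf("device fd: %d\n", device);
	return 0;
}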