@@ ... @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
  * Author: Alex Williamson <alex.williamson@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  *
  * Derived from original vfio:
  * Copyright 2010 Cisco Systems, Inc. All rights reserved.
@@ ... @@
 MODULE_PARM_DESC(disable_idle_d3,
		  "Disable using the PCI D3 low power state for idle, unused devices");

-static DEFINE_MUTEX(driver_lock);
+static bool enable_sriov;
+#ifdef CONFIG_PCI_IOV
+module_param(enable_sriov, bool, 0644);
+MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF.");
+#endif
+
+static bool disable_denylist;
+module_param(disable_denylist, bool, 0444);
+MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users.");

 static inline bool vfio_vga_disabled(void)
 {
@@ ... @@
 #else
	return true;
 #endif
+}
+
+static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
+{
+	switch (pdev->vendor) {
+	case PCI_VENDOR_ID_INTEL:
+		switch (pdev->device) {
+		case PCI_DEVICE_ID_INTEL_QAT_C3XXX:
+		case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF:
+		case PCI_DEVICE_ID_INTEL_QAT_C62X:
+		case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
+		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
+		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
+			return true;
+		default:
+			return false;
+		}
+	}
+
+	return false;
+}
+
+static bool vfio_pci_is_denylisted(struct pci_dev *pdev)
+{
+	if (!vfio_pci_dev_in_denylist(pdev))
+		return false;
+
+	if (disable_denylist) {
+		pci_warn(pdev,
+			 "device denylist disabled - allowing device %04x:%04x.\n",
+			 pdev->vendor, pdev->device);
+		return false;
+	}
+
+	pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n",
+		 pdev->vendor, pdev->device);
+
+	return true;
 }

 /*
@@ ... @@
 static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
 {
	struct resource *res;
-	int bar;
+	int i;
	struct vfio_pci_dummy_resource *dummy_res;

-	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
-		res = vdev->pdev->resource + bar;
+	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+		int bar = i + PCI_STD_RESOURCES;
+
+		res = &vdev->pdev->resource[bar];

		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
			goto no_mmap;
@@ ... @@
	case 0x1580 ... 0x1581:
	case 0x1583 ... 0x158b:
	case 0x37d0 ... 0x37d2:
+	/* X550 */
+	case 0x1563:
		return true;
	default:
		return false;
@@ ... @@
	return false;
 }

+static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	u16 pmcsr;
+
+	if (!pdev->pm_cap)
+		return;
+
+	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
+
+	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
+}
+
+/*
+ * pci_set_power_state() wrapper handling devices which perform a soft reset on
+ * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev,
+ * restore when returned to D0. Saved separately from pci_saved_state for use
+ * by PM capability emulation and separately from pci_dev internal saved state
+ * to avoid it being overwritten and consumed around other resets.
+ */
+int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	bool needs_restore = false, needs_save = false;
+	int ret;
+
+	if (vdev->needs_pm_restore) {
+		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
+			pci_save_state(pdev);
+			needs_save = true;
+		}
+
+		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
+			needs_restore = true;
+	}
+
+	ret = pci_set_power_state(pdev, state);
+
+	if (!ret) {
+		/* D3 might be unsupported via quirk, skip unless in D3 */
+		if (needs_save && pdev->current_state >= PCI_D3hot) {
+			vdev->pm_save = pci_store_saved_state(pdev);
+		} else if (needs_restore) {
+			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
+			pci_restore_state(pdev);
+		}
+	}
+
+	return ret;
+}
+
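The wrapper pairs up across the rest of the patch: the D3hot entry points (idle devices, vfio_pci_disable()) save state, and the D0 transition in vfio_pci_enable() restores it. A minimal kernel-side sketch of the intended pairing, assuming a vdev whose device was flagged by vfio_pci_probe_power_state():

	/* Entering low power: config space is saved into vdev->pm_save */
	vfio_pci_set_power_state(vdev, PCI_D3hot);
	/* ... device sits idle in D3hot ... */
	/* Back to D0: pm_save is reloaded and config space restored */
	vfio_pci_set_power_state(vdev, PCI_D0);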
 static int vfio_pci_enable(struct vfio_pci_device *vdev)
 {
	struct pci_dev *pdev = vdev->pdev;
@@ ... @@
	u16 cmd;
	u8 msix_pos;

-	pci_set_power_state(pdev, PCI_D0);
+	vfio_pci_set_power_state(vdev, PCI_D0);

	/* Don't allow our initial saved state to include busmaster */
	pci_clear_master(pdev);
@@ ... @@
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
-		pr_debug("%s: Couldn't store %s saved state\n",
-			 __func__, dev_name(&pdev->dev));
+		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);

	if (likely(!nointxmask)) {
		if (vfio_pci_nointx(pdev)) {
-			dev_info(&pdev->dev, "Masking broken INTx support\n");
+			pci_info(pdev, "Masking broken INTx support\n");
			vdev->nointx = true;
			pci_intx(pdev, 0);
		} else
@@ ... @@
	    pdev->vendor == PCI_VENDOR_ID_INTEL &&
	    IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
		ret = vfio_pci_igd_init(vdev);
-		if (ret) {
-			dev_warn(&vdev->pdev->dev,
-				 "Failed to setup Intel IGD regions\n");
-			vfio_pci_disable(vdev);
-			return ret;
+		if (ret && ret != -ENODEV) {
+			pci_warn(pdev, "Failed to setup Intel IGD regions\n");
+			goto disable_exit;
+		}
+	}
+
+	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+		ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");
+			goto disable_exit;
+		}
+	}
+
+	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+		ret = vfio_pci_ibm_npu2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");
+			goto disable_exit;
		}
	}

	vfio_pci_probe_mmaps(vdev);

	return 0;
+
+disable_exit:
+	vfio_pci_disable(vdev);
+	return ret;
 }

 static void vfio_pci_disable(struct vfio_pci_device *vdev)
@@ ... @@

	vfio_config_free(vdev);

-	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
+	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+		bar = i + PCI_STD_RESOURCES;
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(pdev, vdev->barmap[bar]);
@@ ... @@
	 * is just busy work.
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
-		pr_info("%s: Couldn't reload %s saved state\n",
-			__func__, dev_name(&pdev->dev));
+		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);

		if (!vdev->reset_works)
			goto out;
@@ ... @@
	vfio_pci_try_bus_reset(vdev);

	if (!disable_idle_d3)
-		pci_set_power_state(pdev, PCI_D3hot);
+		vfio_pci_set_power_state(vdev, PCI_D3hot);
+}
+
+static struct pci_driver vfio_pci_driver;
+
+static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev,
+					   struct vfio_device **pf_dev)
+{
+	struct pci_dev *physfn = pci_physfn(vdev->pdev);
+
+	if (!vdev->pdev->is_virtfn)
+		return NULL;
+
+	*pf_dev = vfio_device_get_from_dev(&physfn->dev);
+	if (!*pf_dev)
+		return NULL;
+
+	if (pci_dev_driver(physfn) != &vfio_pci_driver) {
+		vfio_device_put(*pf_dev);
+		return NULL;
+	}
+
+	return vfio_device_data(*pf_dev);
+}
+
+static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val)
+{
+	struct vfio_device *pf_dev;
+	struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);
+
+	if (!pf_vdev)
+		return;
+
+	mutex_lock(&pf_vdev->vf_token->lock);
+	pf_vdev->vf_token->users += val;
+	WARN_ON(pf_vdev->vf_token->users < 0);
+	mutex_unlock(&pf_vdev->vf_token->lock);
+
+	vfio_device_put(pf_dev);
 }

 static void vfio_pci_release(void *device_data)
 {
	struct vfio_pci_device *vdev = device_data;

-	mutex_lock(&driver_lock);
+	mutex_lock(&vdev->reflck->lock);

	if (!(--vdev->refcnt)) {
+		vfio_pci_vf_token_user_add(vdev, -1);
		vfio_spapr_pci_eeh_release(vdev->pdev);
		vfio_pci_disable(vdev);
+
		mutex_lock(&vdev->igate);
		if (vdev->err_trigger) {
			eventfd_ctx_put(vdev->err_trigger);
			vdev->err_trigger = NULL;
		}
-		mutex_unlock(&vdev->igate);
-
-		mutex_lock(&vdev->igate);
		if (vdev->req_trigger) {
			eventfd_ctx_put(vdev->req_trigger);
			vdev->req_trigger = NULL;
@@ ... @@
		mutex_unlock(&vdev->igate);
	}

-	mutex_unlock(&driver_lock);
+	mutex_unlock(&vdev->reflck->lock);

	module_put(THIS_MODULE);
 }
@@ ... @@
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

-	mutex_lock(&driver_lock);
+	mutex_lock(&vdev->reflck->lock);

	if (!vdev->refcnt) {
		ret = vfio_pci_enable(vdev);
@@ ... @@
			goto error;

		vfio_spapr_pci_eeh_open(vdev->pdev);
+		vfio_pci_vf_token_user_add(vdev, 1);
	}
	vdev->refcnt++;
error:
-	mutex_unlock(&driver_lock);
+	mutex_unlock(&vdev->reflck->lock);
	if (ret)
		module_put(THIS_MODULE);
	return ret;
@@ ... @@

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;
+		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+		unsigned long capsz;

		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		/* For backward compatibility, cannot require this */
+		capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;
+
+		if (info.argsz >= capsz) {
+			minsz = capsz;
+			info.cap_offset = 0;
+		}

		info.flags = VFIO_DEVICE_FLAGS_PCI;
@@ ... @@

		info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;
+
+		if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) {
+			int ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
+
+			if (ret && ret != -ENODEV) {
+				pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
+				return ret;
+			}
+		}
+
+		if (caps.size) {
+			info.flags |= VFIO_DEVICE_FLAGS_CAPS;
+			if (info.argsz < sizeof(info) + caps.size) {
+				info.argsz = sizeof(info) + caps.size;
+			} else {
+				vfio_info_cap_shift(&caps, sizeof(info));
+				if (copy_to_user((void __user *)arg +
+						 sizeof(info), caps.buf,
+						 caps.size)) {
+					kfree(caps.buf);
+					return -EFAULT;
+				}
+				info.cap_offset = sizeof(info);
+			}
+
+			kfree(caps.buf);
+		}

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
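For context, a userspace consumer of this interface sizes its buffer and then walks the capability chain flagged by VFIO_DEVICE_FLAGS_CAPS. The following is a hedged sketch against the uapi in <linux/vfio.h>, not part of the patch:

	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Query VFIO_DEVICE_GET_INFO twice (size, then data) and walk the caps */
	static void walk_device_caps(int device_fd)
	{
		struct vfio_device_info hdr = { .argsz = sizeof(hdr) };
		struct vfio_device_info *info;
		struct vfio_info_cap_header *cap;

		if (ioctl(device_fd, VFIO_DEVICE_GET_INFO, &hdr))
			return;
		info = calloc(1, hdr.argsz);	/* argsz was enlarged if caps exist */
		if (!info)
			return;
		info->argsz = hdr.argsz;
		if (ioctl(device_fd, VFIO_DEVICE_GET_INFO, info) == 0 &&
		    (info->flags & VFIO_DEVICE_FLAGS_CAPS) && info->cap_offset) {
			for (cap = (void *)info + info->cap_offset; ;
			     cap = (void *)info + cap->next) {
				/* cap->id selects the capability, e.g. the zPCI caps */
				if (!cap->next)
					break;
			}
		}
		free(info);
	}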
@@ ... @@
			if (ret)
				return ret;

+			if (vdev->region[i].ops->add_capability) {
+				ret = vdev->region[i].ops->add_capability(vdev,
+						&vdev->region[i], &caps);
+				if (ret)
+					return ret;
+			}
		}
	}

@@ ... @@
		case VFIO_PCI_ERR_IRQ_INDEX:
			if (pci_is_pcie(vdev->pdev))
				break;
-			/* fall through */
+			fallthrough;
		default:
			return -EINVAL;
		}
@@ ... @@

	/*
	 * We need to get memory_lock for each device, but devices
-	 * can share mmap_sem, therefore we need to zap and hold
+	 * can share mmap_lock, therefore we need to zap and hold
	 * the vma_lock for each device, and only then get each
	 * memory_lock.
	 */
@@ ... @@

		return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
					  ioeventfd.data, count, ioeventfd.fd);
+	} else if (cmd == VFIO_DEVICE_FEATURE) {
+		struct vfio_device_feature feature;
+		uuid_t uuid;
+
+		minsz = offsetofend(struct vfio_device_feature, flags);
+
+		if (copy_from_user(&feature, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (feature.argsz < minsz)
+			return -EINVAL;
+
+		/* Check unknown flags */
+		if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK |
+				      VFIO_DEVICE_FEATURE_SET |
+				      VFIO_DEVICE_FEATURE_GET |
+				      VFIO_DEVICE_FEATURE_PROBE))
+			return -EINVAL;
+
+		/* GET & SET are mutually exclusive except with PROBE */
+		if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
+		    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
+		    (feature.flags & VFIO_DEVICE_FEATURE_GET))
+			return -EINVAL;
+
+		switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
+		case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
+			if (!vdev->vf_token)
+				return -ENOTTY;
+
+			/*
+			 * We do not support GET of the VF Token UUID as this
+			 * could expose the token of the previous device user.
+			 */
+			if (feature.flags & VFIO_DEVICE_FEATURE_GET)
+				return -EINVAL;
+
+			if (feature.flags & VFIO_DEVICE_FEATURE_PROBE)
+				return 0;
+
+			/* Don't SET unless told to do so */
+			if (!(feature.flags & VFIO_DEVICE_FEATURE_SET))
+				return -EINVAL;
+
+			if (feature.argsz < minsz + sizeof(uuid))
+				return -EINVAL;
+
+			if (copy_from_user(&uuid, (void __user *)(arg + minsz),
+					   sizeof(uuid)))
+				return -EFAULT;
+
+			mutex_lock(&vdev->vf_token->lock);
+			uuid_copy(&vdev->vf_token->uuid, &uuid);
+			mutex_unlock(&vdev->vf_token->lock);
+
+			return 0;
+		default:
+			return -ENOTTY;
+		}
	}

	return -ENOTTY;
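A hedged sketch of the corresponding userspace call, setting the token on an owned PF device fd (struct vfio_device_feature carries the 16-byte UUID in its trailing data[]; names per <linux/vfio.h>):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* SET the VF token UUID; uuid_bytes would come from e.g. libuuid's
	 * uuid_parse(). The kernel returns -ENOTTY if the device has no
	 * vf_token, i.e. is not an SR-IOV PF. */
	static int set_vf_token(int device_fd, const unsigned char uuid_bytes[16])
	{
		char buf[sizeof(struct vfio_device_feature) + 16];
		struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

		feature->argsz = sizeof(buf);
		feature->flags = VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_PCI_VF_TOKEN;
		memcpy(feature->data, uuid_bytes, 16);

		return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
	}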
@@ ... @@

	/*
	 * Lock ordering:
-	 * vma_lock is nested under mmap_sem for vm_ops callback paths.
+	 * vma_lock is nested under mmap_lock for vm_ops callback paths.
	 * The memory_lock semaphore is used by both code paths calling
	 * into this function to zap vmas and the vm_ops.fault callback
	 * to protect the memory enable state of the device.
	 *
-	 * When zapping vmas we need to maintain the mmap_sem => vma_lock
+	 * When zapping vmas we need to maintain the mmap_lock => vma_lock
	 * ordering, which requires using vma_lock to walk vma_list to
-	 * acquire an mm, then dropping vma_lock to get the mmap_sem and
+	 * acquire an mm, then dropping vma_lock to get the mmap_lock and
	 * reacquiring vma_lock. This logic is derived from similar
	 * requirements in uverbs_user_mmap_disassociate().
	 *
-	 * mmap_sem must always be the top-level lock when it is taken.
+	 * mmap_lock must always be the top-level lock when it is taken.
	 * Therefore we can only hold the memory_lock write lock when
-	 * vma_list is empty, as we'd need to take mmap_sem to clear
+	 * vma_list is empty, as we'd need to take mmap_lock to clear
	 * entries. vma_list can only be guaranteed empty when holding
	 * vma_lock, thus memory_lock is nested under vma_lock.
	 *
	 * This enables the vm_ops.fault callback to acquire vma_lock,
	 * followed by memory_lock read lock, while already holding
-	 * mmap_sem without risk of deadlock.
+	 * mmap_lock without risk of deadlock.
	 */
	while (1) {
		struct mm_struct *mm = NULL;
@@ ... @@
		mutex_unlock(&vdev->vma_lock);

		if (try) {
-			if (!down_read_trylock(&mm->mmap_sem)) {
+			if (!mmap_read_trylock(mm)) {
				mmput(mm);
				return 0;
			}
		} else {
-			down_read(&mm->mmap_sem);
+			mmap_read_lock(mm);
		}
-		if (mmget_still_valid(mm)) {
-			if (try) {
-				if (!mutex_trylock(&vdev->vma_lock)) {
-					up_read(&mm->mmap_sem);
-					mmput(mm);
-					return 0;
-				}
-			} else {
-				mutex_lock(&vdev->vma_lock);
+		if (try) {
+			if (!mutex_trylock(&vdev->vma_lock)) {
+				mmap_read_unlock(mm);
+				mmput(mm);
+				return 0;
			}
-			list_for_each_entry_safe(mmap_vma, tmp,
-						 &vdev->vma_list, vma_next) {
-				struct vm_area_struct *vma = mmap_vma->vma;
-
-				if (vma->vm_mm != mm)
-					continue;
-
-				list_del(&mmap_vma->vma_next);
-				kfree(mmap_vma);
-
-				zap_vma_ptes(vma, vma->vm_start,
-					     vma->vm_end - vma->vm_start);
-			}
-			mutex_unlock(&vdev->vma_lock);
+		} else {
+			mutex_lock(&vdev->vma_lock);
		}
-		up_read(&mm->mmap_sem);
+		list_for_each_entry_safe(mmap_vma, tmp,
+					 &vdev->vma_list, vma_next) {
+			struct vm_area_struct *vma = mmap_vma->vma;
+
+			if (vma->vm_mm != mm)
+				continue;
+
+			list_del(&mmap_vma->vma_next);
+			kfree(mmap_vma);
+
+			zap_vma_ptes(vma, vma->vm_start,
+				     vma->vm_end - vma->vm_start);
+		}
+		mutex_unlock(&vdev->vma_lock);
+		mmap_read_unlock(mm);
		mmput(mm);
	}
@@ ... @@

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

+	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
+		return -EINVAL;
	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
+	if (index >= VFIO_PCI_NUM_REGIONS) {
+		int regnum = index - VFIO_PCI_NUM_REGIONS;
+		struct vfio_pci_region *region = vdev->region + regnum;
+
+		if (region->ops && region->ops->mmap &&
+		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+			return region->ops->mmap(vdev, region, vma);
+		return -EINVAL;
+	}
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!vdev->bar_mmap_supported[index])
@@ ... @@
 static void vfio_pci_request(void *device_data, unsigned int count)
 {
	struct vfio_pci_device *vdev = device_data;
+	struct pci_dev *pdev = vdev->pdev;

	mutex_lock(&vdev->igate);

	if (vdev->req_trigger) {
		if (!(count % 10))
-			dev_notice_ratelimited(&vdev->pdev->dev,
+			pci_notice_ratelimited(pdev,
					       "Relaying device request to user (#%u)\n",
					       count);
		eventfd_signal(vdev->req_trigger, 1);
	} else if (count == 0) {
-		dev_warn(&vdev->pdev->dev,
+		pci_warn(pdev,
			 "No device request channel registered, blocked until released by user\n");
	}

	mutex_unlock(&vdev->igate);
+}
+
+static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev,
+				      bool vf_token, uuid_t *uuid)
+{
+	/*
+	 * There's always some degree of trust or collaboration between SR-IOV
+	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
+	 * can disrupt VFs with a reset, but often the PF has more explicit
+	 * access to deny service to the VF or access data passed through the
+	 * VF. We therefore require an opt-in via a shared VF token (UUID) to
+	 * represent this trust. This both prevents that a VF driver might
+	 * assume the PF driver is a trusted, in-kernel driver, and also that
+	 * a PF driver might be replaced with a rogue driver, unknown to in-use
+	 * VF drivers.
+	 *
+	 * Therefore when presented with a VF, if the PF is a vfio device and
+	 * it is bound to the vfio-pci driver, the user needs to provide a VF
+	 * token to access the device, in the form of appending a vf_token to
+	 * the device name, for example:
+	 *
+	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
+	 *
+	 * When presented with a PF which has VFs in use, the user must also
+	 * provide the current VF token to prove collaboration with existing
+	 * VF users. If VFs are not in use, the VF token provided for the PF
+	 * device will act to set the VF token.
+	 *
+	 * If the VF token is provided but unused, an error is generated.
+	 */
+	if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token)
+		return 0; /* No VF token provided or required */
+
+	if (vdev->pdev->is_virtfn) {
+		struct vfio_device *pf_dev;
+		struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);
+		bool match;
+
+		if (!pf_vdev) {
+			if (!vf_token)
+				return 0; /* PF is not vfio-pci, no VF token */
+
+			pci_info_ratelimited(vdev->pdev,
+				"VF token incorrectly provided, PF not bound to vfio-pci\n");
+			return -EINVAL;
+		}
+
+		if (!vf_token) {
+			vfio_device_put(pf_dev);
+			pci_info_ratelimited(vdev->pdev,
+				"VF token required to access device\n");
+			return -EACCES;
+		}
+
+		mutex_lock(&pf_vdev->vf_token->lock);
+		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
+		mutex_unlock(&pf_vdev->vf_token->lock);
+
+		vfio_device_put(pf_dev);
+
+		if (!match) {
+			pci_info_ratelimited(vdev->pdev,
+				"Incorrect VF token provided for device\n");
+			return -EACCES;
+		}
+	} else if (vdev->vf_token) {
+		mutex_lock(&vdev->vf_token->lock);
+		if (vdev->vf_token->users) {
+			if (!vf_token) {
+				mutex_unlock(&vdev->vf_token->lock);
+				pci_info_ratelimited(vdev->pdev,
+					"VF token required to access device\n");
+				return -EACCES;
+			}
+
+			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
+				mutex_unlock(&vdev->vf_token->lock);
+				pci_info_ratelimited(vdev->pdev,
+					"Incorrect VF token provided for device\n");
+				return -EACCES;
+			}
+		} else if (vf_token) {
+			uuid_copy(&vdev->vf_token->uuid, uuid);
+		}
+
+		mutex_unlock(&vdev->vf_token->lock);
+	} else if (vf_token) {
+		pci_info_ratelimited(vdev->pdev,
+			"VF token incorrectly provided, not a PF or VF\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#define VF_TOKEN_ARG "vf_token="
+
+static int vfio_pci_match(void *device_data, char *buf)
+{
+	struct vfio_pci_device *vdev = device_data;
+	bool vf_token = false;
+	uuid_t uuid;
+	int ret;
+
+	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
+		return 0; /* No match */
+
+	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
+		buf += strlen(pci_name(vdev->pdev));
+
+		if (*buf != ' ')
+			return 0; /* No match: non-whitespace after name */
+
+		while (*buf) {
+			if (*buf == ' ') {
+				buf++;
+				continue;
+			}
+
+			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
+						  strlen(VF_TOKEN_ARG))) {
+				buf += strlen(VF_TOKEN_ARG);
+
+				if (strlen(buf) < UUID_STRING_LEN)
+					return -EINVAL;
+
+				ret = uuid_parse(buf, &uuid);
+				if (ret)
+					return ret;
+
+				vf_token = true;
+				buf += UUID_STRING_LEN;
+			} else {
+				/* Unknown/duplicate option */
+				return -EINVAL;
+			}
+		}
+	}
+
+	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
+	if (ret)
+		return ret;
+
+	return 1; /* Match */
 }
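End to end, a VF consumer passes the token as part of the device-name string it hands to VFIO_GROUP_GET_DEVICE_FD, matching the format parsed above (the UUID is the example from the comment):

	/* Sketch: request the VF's device fd from its group, appending the token */
	int vf_fd = ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD,
			  "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3");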

 static const struct vfio_device_ops vfio_pci_ops = {
@@ ... @@
	.write		= vfio_pci_write,
	.mmap		= vfio_pci_mmap,
	.request	= vfio_pci_request,
+	.match		= vfio_pci_match,
 };
+
+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
+
+static int vfio_pci_bus_notifier(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct vfio_pci_device *vdev = container_of(nb,
+						    struct vfio_pci_device, nb);
+	struct device *dev = data;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_dev *physfn = pci_physfn(pdev);
+
+	if (action == BUS_NOTIFY_ADD_DEVICE &&
+	    pdev->is_virtfn && physfn == vdev->pdev) {
+		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
+			 pci_name(pdev));
+		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
+						  vfio_pci_ops.name);
+	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
+		   pdev->is_virtfn && physfn == vdev->pdev) {
+		struct pci_driver *drv = pci_dev_driver(pdev);
+
+		if (drv && drv != &vfio_pci_driver)
+			pci_warn(vdev->pdev,
+				 "VF %s bound to driver %s while PF bound to vfio-pci\n",
+				 pci_name(pdev), drv->name);
+	}
+
+	return 0;
+}
+
+static int vfio_pci_vf_init(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int ret;
+
+	if (!pdev->is_physfn)
+		return 0;
+
+	vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
+	if (!vdev->vf_token)
+		return -ENOMEM;
+
+	mutex_init(&vdev->vf_token->lock);
+	uuid_gen(&vdev->vf_token->uuid);
+
+	vdev->nb.notifier_call = vfio_pci_bus_notifier;
+	ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
+	if (ret) {
+		kfree(vdev->vf_token);
+		return ret;
+	}
+	return 0;
+}
+
+static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev)
+{
+	if (!vdev->vf_token)
+		return;
+
+	bus_unregister_notifier(&pci_bus_type, &vdev->nb);
+	WARN_ON(vdev->vf_token->users);
+	mutex_destroy(&vdev->vf_token->lock);
+	kfree(vdev->vf_token);
+}
+
+static int vfio_pci_vga_init(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+	int ret;
+
+	if (!vfio_pci_is_vga(pdev))
+		return 0;
+
+	ret = vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
+	if (ret)
+		return ret;
+	vga_set_legacy_decoding(pdev, vfio_pci_set_vga_decode(vdev, false));
+	return 0;
+}
+
+static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *pdev = vdev->pdev;
+
+	if (!vfio_pci_is_vga(pdev))
+		return;
+	vga_client_register(pdev, NULL, NULL, NULL);
+	vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
+					VGA_RSRC_LEGACY_IO |
+					VGA_RSRC_LEGACY_MEM);
+}

 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
@@ ... @@
	struct iommu_group *group;
	int ret;

+	if (vfio_pci_is_denylisted(pdev))
+		return -EINVAL;
+
	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	/*
-	 * Prevent binding to PFs with VFs enabled, this too easily allows
-	 * userspace instance with VFs and PFs from the same device, which
-	 * cannot work.  Disabling SR-IOV here would initiate removing the
-	 * VFs, which would unbind the driver, which is prone to blocking
-	 * if that VF is also in use by vfio-pci.  Just reject these PFs
-	 * and let the user sort it out.
+	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
+	 * by the host or other users. We cannot capture the VFs if they
+	 * already exist, nor can we track VF users. Disabling SR-IOV here
+	 * would initiate removing the VFs, which would unbind the driver,
+	 * which is prone to blocking if that VF is also in use by vfio-pci.
+	 * Just reject these PFs and let the user sort it out.
	 */
	if (pci_num_vf(pdev)) {
		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
@@ ... @@

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
-		vfio_iommu_group_put(group, &pdev->dev);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_group_put;
	}

	vdev->pdev = pdev;
@@ ... @@
	INIT_LIST_HEAD(&vdev->vma_list);
	init_rwsem(&vdev->memory_lock);

-	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
-	if (ret) {
-		vfio_iommu_group_put(group, &pdev->dev);
-		kfree(vdev);
-		return ret;
-	}
+	ret = vfio_pci_reflck_attach(vdev);
+	if (ret)
+		goto out_free;
+	ret = vfio_pci_vf_init(vdev);
+	if (ret)
+		goto out_reflck;
+	ret = vfio_pci_vga_init(vdev);
+	if (ret)
+		goto out_vf;

-	if (vfio_pci_is_vga(pdev)) {
-		vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
-		vga_set_legacy_decoding(pdev,
-					vfio_pci_set_vga_decode(vdev, false));
-	}
+	vfio_pci_probe_power_state(vdev);

	if (!disable_idle_d3) {
		/*
@@ ... @@
		 * be able to get to D3. Therefore first do a D0 transition
		 * before going to D3.
		 */
-		pci_set_power_state(pdev, PCI_D0);
-		pci_set_power_state(pdev, PCI_D3hot);
+		vfio_pci_set_power_state(vdev, PCI_D0);
+		vfio_pci_set_power_state(vdev, PCI_D3hot);
	}

+	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
+	if (ret)
+		goto out_power;
+	return 0;
+
+out_power:
+	if (!disable_idle_d3)
+		vfio_pci_set_power_state(vdev, PCI_D0);
+out_vf:
+	vfio_pci_vf_uninit(vdev);
+out_reflck:
+	vfio_pci_reflck_put(vdev->reflck);
+out_free:
+	kfree(vdev->pm_save);
+	kfree(vdev);
+out_group_put:
+	vfio_iommu_group_put(group, &pdev->dev);
	return ret;
 }

@@ ... @@
 {
	struct vfio_pci_device *vdev;

+	pci_disable_sriov(pdev);
+
	vdev = vfio_del_group_dev(&pdev->dev);
	if (!vdev)
		return;

-	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
-	kfree(vdev->region);
-	mutex_destroy(&vdev->ioeventfds_lock);
-	kfree(vdev);
+	vfio_pci_vf_uninit(vdev);
+	vfio_pci_reflck_put(vdev->reflck);
+	vfio_pci_vga_uninit(vdev);

-	if (vfio_pci_is_vga(pdev)) {
-		vga_client_register(pdev, NULL, NULL, NULL);
-		vga_set_legacy_decoding(pdev,
-					VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
-					VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
-	}
+	vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);

	if (!disable_idle_d3)
-		pci_set_power_state(pdev, PCI_D0);
+		vfio_pci_set_power_state(vdev, PCI_D0);
+
+	mutex_destroy(&vdev->ioeventfds_lock);
+	kfree(vdev->region);
+	kfree(vdev->pm_save);
+	kfree(vdev);
 }

 static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
@@ ... @@
	return PCI_ERS_RESULT_CAN_RECOVER;
 }

+static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
+{
+	struct vfio_pci_device *vdev;
+	struct vfio_device *device;
+	int ret = 0;
+
+	might_sleep();
+
+	if (!enable_sriov)
+		return -ENOENT;
+
+	device = vfio_device_get_from_dev(&pdev->dev);
+	if (!device)
+		return -ENODEV;
+
+	vdev = vfio_device_data(device);
+	if (!vdev) {
+		vfio_device_put(device);
+		return -ENODEV;
+	}
+
+	if (nr_virtfn == 0)
+		pci_disable_sriov(pdev);
+	else
+		ret = pci_enable_sriov(pdev, nr_virtfn);
+
+	vfio_device_put(device);
+
+	return ret < 0 ? ret : nr_virtfn;
+}
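With enable_sriov set, VFs are created through the standard sriov_numvfs sysfs attribute, which is what invokes the .sriov_configure callback above. A hedged userspace sketch (the PCI address is an example):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Request two VFs on a vfio-pci-bound PF; writing "0" disables them. */
	static void enable_two_vfs(void)
	{
		int fd = open("/sys/bus/pci/devices/0000:04:00.0/sriov_numvfs",
			      O_WRONLY);

		if (fd < 0)
			return;
		if (write(fd, "2", 1) != 1)
			perror("sriov_numvfs");
		close(fd);
	}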
+
 static const struct pci_error_handlers vfio_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,
 };

 static struct pci_driver vfio_pci_driver = {
-	.name		= "vfio-pci",
-	.id_table	= NULL, /* only dynamic ids */
-	.probe		= vfio_pci_probe,
-	.remove		= vfio_pci_remove,
-	.err_handler	= &vfio_err_handlers,
+	.name			= "vfio-pci",
+	.id_table		= NULL, /* only dynamic ids */
+	.probe			= vfio_pci_probe,
+	.remove			= vfio_pci_remove,
+	.sriov_configure	= vfio_pci_sriov_configure,
+	.err_handler		= &vfio_err_handlers,
 };

-static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
+static DEFINE_MUTEX(reflck_lock);
+
+static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
+{
+	struct vfio_pci_reflck *reflck;
+
+	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
+	if (!reflck)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&reflck->kref);
+	mutex_init(&reflck->lock);
+
+	return reflck;
+}
+
+static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
+{
+	kref_get(&reflck->kref);
+}
+
+static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
+{
+	struct vfio_pci_reflck **preflck = data;
+	struct vfio_device *device;
+	struct vfio_pci_device *vdev;
+
+	device = vfio_device_get_from_dev(&pdev->dev);
+	if (!device)
+		return 0;
+
+	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
+		vfio_device_put(device);
+		return 0;
+	}
+
+	vdev = vfio_device_data(device);
+
+	if (vdev->reflck) {
+		vfio_pci_reflck_get(vdev->reflck);
+		*preflck = vdev->reflck;
+		vfio_device_put(device);
+		return 1;
+	}
+
+	vfio_device_put(device);
+	return 0;
+}
+
+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
+{
+	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
+
+	mutex_lock(&reflck_lock);
+
+	if (pci_is_root_bus(vdev->pdev->bus) ||
+	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
+					  &vdev->reflck, slot) <= 0)
+		vdev->reflck = vfio_pci_reflck_alloc();
+
+	mutex_unlock(&reflck_lock);
+
+	return PTR_ERR_OR_ZERO(vdev->reflck);
+}
+
+static void vfio_pci_reflck_release(struct kref *kref)
+{
+	struct vfio_pci_reflck *reflck = container_of(kref,
+						      struct vfio_pci_reflck,
+						      kref);
+
+	kfree(reflck);
+	mutex_unlock(&reflck_lock);
+}
+
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
+{
+	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
+}
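Note the locking idiom in the put path: kref_put_mutex() acquires reflck_lock only when the reference count is about to reach zero and calls vfio_pci_reflck_release() with the lock held, which is why the release callback itself drops reflck_lock after freeing the object. This closes the race against vfio_pci_reflck_find() taking a new reference under the same lock.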
+
+static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
 {
	struct vfio_devices *devs = data;
	struct vfio_device *device;
+	struct vfio_pci_device *vdev;

	if (devs->cur_index == devs->max_index)
		return -ENOSPC;
@@ ... @@
		return -EINVAL;

	if (pci_dev_driver(pdev) != &vfio_pci_driver) {
+		vfio_device_put(device);
+		return -EBUSY;
+	}
+
+	vdev = vfio_device_data(device);
+
+	/* Fault if the device is not unused */
+	if (vdev->refcnt) {
		vfio_device_put(device);
		return -EBUSY;
	}
@@ ... @@
 }

 /*
- * Attempt to do a bus/slot reset if there are devices affected by a reset for
- * this device that are needs_reset and all of the affected devices are unused
- * (!refcnt).  Callers are required to hold driver_lock when calling this to
- * prevent device opens and concurrent bus reset attempts.  We prevent device
- * unbinds by acquiring and holding a reference to the vfio_device.
+ * If a bus or slot reset is available for the provided device and:
+ *  - All of the devices affected by that bus or slot reset are unused
+ *    (!refcnt)
+ *  - At least one of the affected devices is marked dirty via
+ *    needs_reset (such as by lack of FLR support)
+ * Then attempt to perform that bus or slot reset. Callers are required
+ * to hold vdev->reflck->lock, protecting the bus/slot reset group from
+ * concurrent opens. A vfio_device reference is acquired for each device
+ * to prevent unbinds during the reset operation.
  *
  * NB: vfio-core considers a group to be viable even if some devices are
  * bound to drivers like pci-stub or pcieport. Here we require all devices
@@ ... @@
 {
	struct vfio_devices devs = { .cur_index = 0 };
	int i = 0, ret = -EINVAL;
-	bool needs_reset = false, slot = false;
+	bool slot = false;
	struct vfio_pci_device *tmp;

	if (!pci_probe_reset_slot(vdev->pdev->slot))
@@ ... @@
		return;

	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
-					  vfio_pci_get_devs, &devs, slot))
+					  vfio_pci_get_unused_devs,
+					  &devs, slot))
		goto put_devs;

+	/* Does at least one need a reset? */
	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);
-		if (tmp->needs_reset)
-			needs_reset = true;
-		if (tmp->refcnt)
-			goto put_devs;
+		if (tmp->needs_reset) {
+			ret = pci_reset_bus(vdev->pdev);
+			break;
+		}
	}
-
-	if (needs_reset)
-		ret = pci_reset_bus(vdev->pdev);

put_devs:
	for (i = 0; i < devs.cur_index; i++) {
		tmp = vfio_device_data(devs.devices[i]);
-		if (!ret)
+
+		/*
+		 * If reset was successful, affected devices no longer need
+		 * a reset and we should return all the collateral devices
+		 * to low power. If not successful, we either didn't reset
+		 * the bus or timed out waiting for it, so let's not touch
+		 * the power state.
+		 */
+		if (!ret) {
			tmp->needs_reset = false;

-			if (!tmp->refcnt && !disable_idle_d3)
-				pci_set_power_state(tmp->pdev, PCI_D3hot);
+			if (tmp != vdev && !disable_idle_d3)
+				vfio_pci_set_power_state(tmp, PCI_D3hot);
+		}

		vfio_device_put(devs.devices[i]);
	}
@@ ... @@

	vfio_pci_fill_ids();

+	if (disable_denylist)
+		pr_warn("device denylist disabled.\n");
+
	return 0;

out_driver: