hc
2024-05-14 bedbef8ad3e75a304af6361af235302bcc61d06b
kernel/include/uapi/linux/vfio.h
....@@ -200,8 +200,12 @@
200200 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */
201201 #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */
202202 #define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */
203
+#define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */
204
+#define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */
205
+#define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */
203206 __u32 num_regions; /* Max region index + 1 */
204207 __u32 num_irqs; /* Max IRQ index + 1 */
208
+ __u32 cap_offset; /* Offset within info struct of first cap */
205209 };
206210 #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)
207211
....@@ -215,6 +219,16 @@
215219 #define VFIO_DEVICE_API_PLATFORM_STRING "vfio-platform"
216220 #define VFIO_DEVICE_API_AMBA_STRING "vfio-amba"
217221 #define VFIO_DEVICE_API_CCW_STRING "vfio-ccw"
222
+#define VFIO_DEVICE_API_AP_STRING "vfio-ap"
223
+
224
+/*
225
+ * The following capabilities are unique to s390 zPCI devices. Their contents
226
+ * are further-defined in vfio_zdev.h
227
+ */
228
+#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE 1
229
+#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP 2
230
+#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3
231
+#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4
218232
219233 /**
220234 * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
....@@ -293,13 +307,319 @@
293307 __u32 subtype; /* type specific */
294308 };
295309
310
+/*
311
+ * List of region types, global per bus driver.
312
+ * If you introduce a new type, please add it here.
313
+ */
314
+
315
+/* PCI region type containing a PCI vendor part */
296316 #define VFIO_REGION_TYPE_PCI_VENDOR_TYPE (1 << 31)
297317 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff)
318
+#define VFIO_REGION_TYPE_GFX (1)
319
+#define VFIO_REGION_TYPE_CCW (2)
320
+#define VFIO_REGION_TYPE_MIGRATION (3)
298321
299
-/* 8086 Vendor sub-types */
322
+/* sub-types for VFIO_REGION_TYPE_PCI_* */
323
+
324
+/* 8086 vendor PCI sub-types */
300325 #define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1)
301326 #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
302327 #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3)
328
+
329
+/* 10de vendor PCI sub-types */
330
+/*
331
+ * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space.
332
+ */
333
+#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1)
334
+
335
+/* 1014 vendor PCI sub-types */
336
+/*
337
+ * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU
338
+ * to do TLB invalidation on a GPU.
339
+ */
340
+#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1)
341
+
342
+/* sub-types for VFIO_REGION_TYPE_GFX */
343
+#define VFIO_REGION_SUBTYPE_GFX_EDID (1)
344
+
345
+/**
346
+ * struct vfio_region_gfx_edid - EDID region layout.
347
+ *
348
+ * Set display link state and EDID blob.
349
+ *
350
+ * The EDID blob has monitor information such as brand, name, serial
351
+ * number, physical size, supported video modes and more.
352
+ *
353
+ * This special region allows userspace (typically qemu) set a virtual
354
+ * EDID for the virtual monitor, which allows a flexible display
355
+ * configuration.
356
+ *
357
+ * For the edid blob spec look here:
358
+ * https://en.wikipedia.org/wiki/Extended_Display_Identification_Data
359
+ *
360
+ * On linux systems you can find the EDID blob in sysfs:
361
+ * /sys/class/drm/${card}/${connector}/edid
362
+ *
363
 + * You can use the edid-decode utility (comes with xorg-x11-utils) to
364
+ * decode the EDID blob.
365
+ *
366
+ * @edid_offset: location of the edid blob, relative to the
367
+ * start of the region (readonly).
368
+ * @edid_max_size: max size of the edid blob (readonly).
369
+ * @edid_size: actual edid size (read/write).
370
+ * @link_state: display link state (read/write).
371
+ * VFIO_DEVICE_GFX_LINK_STATE_UP: Monitor is turned on.
372
+ * VFIO_DEVICE_GFX_LINK_STATE_DOWN: Monitor is turned off.
373
+ * @max_xres: max display width (0 == no limitation, readonly).
374
+ * @max_yres: max display height (0 == no limitation, readonly).
375
+ *
376
+ * EDID update protocol:
377
+ * (1) set link-state to down.
378
+ * (2) update edid blob and size.
379
+ * (3) set link-state to up.
380
+ */
381
+struct vfio_region_gfx_edid {
382
+ __u32 edid_offset;
383
+ __u32 edid_max_size;
384
+ __u32 edid_size;
385
+ __u32 max_xres;
386
+ __u32 max_yres;
387
+ __u32 link_state;
388
+#define VFIO_DEVICE_GFX_LINK_STATE_UP 1
389
+#define VFIO_DEVICE_GFX_LINK_STATE_DOWN 2
390
+};
391
+
392
+/* sub-types for VFIO_REGION_TYPE_CCW */
393
+#define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1)
394
+#define VFIO_REGION_SUBTYPE_CCW_SCHIB (2)
395
+#define VFIO_REGION_SUBTYPE_CCW_CRW (3)
396
+
397
+/* sub-types for VFIO_REGION_TYPE_MIGRATION */
398
+#define VFIO_REGION_SUBTYPE_MIGRATION (1)
399
+
400
+/*
401
+ * The structure vfio_device_migration_info is placed at the 0th offset of
402
+ * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
403
+ * migration information. Field accesses from this structure are only supported
404
+ * at their native width and alignment. Otherwise, the result is undefined and
405
+ * vendor drivers should return an error.
406
+ *
407
+ * device_state: (read/write)
408
+ * - The user application writes to this field to inform the vendor driver
409
+ * about the device state to be transitioned to.
410
+ * - The vendor driver should take the necessary actions to change the
411
+ * device state. After successful transition to a given state, the
412
+ * vendor driver should return success on write(device_state, state)
413
+ * system call. If the device state transition fails, the vendor driver
414
+ * should return an appropriate -errno for the fault condition.
415
+ * - On the user application side, if the device state transition fails,
416
+ * that is, if write(device_state, state) returns an error, read
417
+ * device_state again to determine the current state of the device from
418
+ * the vendor driver.
419
+ * - The vendor driver should return previous state of the device unless
420
+ * the vendor driver has encountered an internal error, in which case
421
+ * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR.
422
+ * - The user application must use the device reset ioctl to recover the
423
+ * device from VFIO_DEVICE_STATE_ERROR state. If the device is
424
+ * indicated to be in a valid device state by reading device_state, the
425
+ * user application may attempt to transition the device to any valid
426
+ * state reachable from the current state or terminate itself.
427
+ *
428
+ * device_state consists of 3 bits:
429
+ * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
430
+ * it indicates the _STOP state. When the device state is changed to
431
+ * _STOP, driver should stop the device before write() returns.
432
+ * - If bit 1 is set, it indicates the _SAVING state, which means that the
433
+ * driver should start gathering device state information that will be
434
+ * provided to the VFIO user application to save the device's state.
435
+ * - If bit 2 is set, it indicates the _RESUMING state, which means that
436
+ * the driver should prepare to resume the device. Data provided through
437
+ * the migration region should be used to resume the device.
438
+ * Bits 3 - 31 are reserved for future use. To preserve them, the user
439
+ * application should perform a read-modify-write operation on this
440
+ * field when modifying the specified bits.
441
+ *
442
+ * +------- _RESUMING
443
+ * |+------ _SAVING
444
+ * ||+----- _RUNNING
445
+ * |||
446
+ * 000b => Device Stopped, not saving or resuming
447
+ * 001b => Device running, which is the default state
448
+ * 010b => Stop the device & save the device state, stop-and-copy state
449
+ * 011b => Device running and save the device state, pre-copy state
450
+ * 100b => Device stopped and the device state is resuming
451
+ * 101b => Invalid state
452
+ * 110b => Error state
453
+ * 111b => Invalid state
454
+ *
455
+ * State transitions:
456
+ *
457
+ * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP
458
+ * (100b) (001b) (011b) (010b) (000b)
459
+ * 0. Running or default state
460
+ * |
461
+ *
462
+ * 1. Normal Shutdown (optional)
463
+ * |------------------------------------->|
464
+ *
465
+ * 2. Save the state or suspend
466
+ * |------------------------->|---------->|
467
+ *
468
+ * 3. Save the state during live migration
469
+ * |----------->|------------>|---------->|
470
+ *
471
+ * 4. Resuming
472
+ * |<---------|
473
+ *
474
+ * 5. Resumed
475
+ * |--------->|
476
+ *
477
+ * 0. Default state of VFIO device is _RUNNING when the user application starts.
478
+ * 1. During normal shutdown of the user application, the user application may
479
+ * optionally change the VFIO device state from _RUNNING to _STOP. This
480
+ * transition is optional. The vendor driver must support this transition but
481
+ * must not require it.
482
+ * 2. When the user application saves state or suspends the application, the
483
+ * device state transitions from _RUNNING to stop-and-copy and then to _STOP.
484
+ * On state transition from _RUNNING to stop-and-copy, driver must stop the
485
+ * device, save the device state and send it to the application through the
486
+ * migration region. The sequence to be followed for such transition is given
487
+ * below.
488
+ * 3. In live migration of user application, the state transitions from _RUNNING
489
+ * to pre-copy, to stop-and-copy, and to _STOP.
490
+ * On state transition from _RUNNING to pre-copy, the driver should start
491
+ * gathering the device state while the application is still running and send
492
+ * the device state data to application through the migration region.
493
+ * On state transition from pre-copy to stop-and-copy, the driver must stop
494
+ * the device, save the device state and send it to the user application
495
+ * through the migration region.
496
+ * Vendor drivers must support the pre-copy state even for implementations
497
+ * where no data is provided to the user before the stop-and-copy state. The
498
+ * user must not be required to consume all migration data before the device
499
+ * transitions to a new state, including the stop-and-copy state.
500
+ * The sequence to be followed for above two transitions is given below.
501
+ * 4. To start the resuming phase, the device state should be transitioned from
502
+ * the _RUNNING to the _RESUMING state.
503
+ * In the _RESUMING state, the driver should use the device state data
504
+ * received through the migration region to resume the device.
505
+ * 5. After providing saved device data to the driver, the application should
506
+ * change the state from _RESUMING to _RUNNING.
507
+ *
508
+ * reserved:
509
+ * Reads on this field return zero and writes are ignored.
510
+ *
511
+ * pending_bytes: (read only)
512
+ * The number of pending bytes still to be migrated from the vendor driver.
513
+ *
514
+ * data_offset: (read only)
515
+ * The user application should read data_offset field from the migration
516
+ * region. The user application should read the device data from this
517
+ * offset within the migration region during the _SAVING state or write
518
+ * the device data during the _RESUMING state. See below for details of
519
+ * sequence to be followed.
520
+ *
521
+ * data_size: (read/write)
522
+ * The user application should read data_size to get the size in bytes of
523
+ * the data copied in the migration region during the _SAVING state and
524
+ * write the size in bytes of the data copied in the migration region
525
+ * during the _RESUMING state.
526
+ *
527
+ * The format of the migration region is as follows:
528
+ * ------------------------------------------------------------------
529
+ * |vfio_device_migration_info| data section |
530
+ * | | /////////////////////////////// |
531
+ * ------------------------------------------------------------------
532
+ * ^ ^
533
+ * offset 0-trapped part data_offset
534
+ *
535
+ * The structure vfio_device_migration_info is always followed by the data
536
+ * section in the region, so data_offset will always be nonzero. The offset
537
+ * from where the data is copied is decided by the kernel driver. The data
538
+ * section can be trapped, mmapped, or partitioned, depending on how the kernel
539
+ * driver defines the data section. The data section partition can be defined
540
+ * as mapped by the sparse mmap capability. If mmapped, data_offset must be
541
+ * page aligned, whereas initial section which contains the
542
+ * vfio_device_migration_info structure, might not end at the offset, which is
543
+ * page aligned. The user is not required to access through mmap regardless
544
+ * of the capabilities of the region mmap.
545
+ * The vendor driver should determine whether and how to partition the data
546
+ * section. The vendor driver should return data_offset accordingly.
547
+ *
548
+ * The sequence to be followed while in pre-copy state and stop-and-copy state
549
+ * is as follows:
550
+ * a. Read pending_bytes, indicating the start of a new iteration to get device
551
+ * data. Repeated read on pending_bytes at this stage should have no side
552
+ * effects.
553
+ * If pending_bytes == 0, the user application should not iterate to get data
554
+ * for that device.
555
+ * If pending_bytes > 0, perform the following steps.
556
+ * b. Read data_offset, indicating that the vendor driver should make data
557
+ * available through the data section. The vendor driver should return this
558
+ * read operation only after data is available from (region + data_offset)
559
+ * to (region + data_offset + data_size).
560
+ * c. Read data_size, which is the amount of data in bytes available through
561
+ * the migration region.
562
+ * Read on data_offset and data_size should return the offset and size of
563
+ * the current buffer if the user application reads data_offset and
564
+ * data_size more than once here.
565
+ * d. Read data_size bytes of data from (region + data_offset) from the
566
+ * migration region.
567
+ * e. Process the data.
568
+ * f. Read pending_bytes, which indicates that the data from the previous
569
+ * iteration has been read. If pending_bytes > 0, go to step b.
570
+ *
571
+ * The user application can transition from the _SAVING|_RUNNING
572
+ * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
573
+ * number of pending bytes. The user application should iterate in _SAVING
574
+ * (stop-and-copy) until pending_bytes is 0.
575
+ *
576
+ * The sequence to be followed while _RESUMING device state is as follows:
577
+ * While data for this device is available, repeat the following steps:
578
+ * a. Read data_offset from where the user application should write data.
579
+ * b. Write migration data starting at the migration region + data_offset for
580
+ * the length determined by data_size from the migration source.
581
+ * c. Write data_size, which indicates to the vendor driver that data is
582
+ * written in the migration region. Vendor driver must return this write
583
+ * operations on consuming data. Vendor driver should apply the
584
+ * user-provided migration region data to the device resume state.
585
+ *
586
+ * If an error occurs during the above sequences, the vendor driver can return
587
+ * an error code for next read() or write() operation, which will terminate the
588
+ * loop. The user application should then take the next necessary action, for
589
+ * example, failing migration or terminating the user application.
590
+ *
591
+ * For the user application, data is opaque. The user application should write
592
+ * data in the same order as the data is received and the data should be of
593
+ * same transaction size at the source.
594
+ */
595
+
596
+struct vfio_device_migration_info {
597
+ __u32 device_state; /* VFIO device state */
598
+#define VFIO_DEVICE_STATE_STOP (0)
599
+#define VFIO_DEVICE_STATE_RUNNING (1 << 0)
600
+#define VFIO_DEVICE_STATE_SAVING (1 << 1)
601
+#define VFIO_DEVICE_STATE_RESUMING (1 << 2)
602
+#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \
603
+ VFIO_DEVICE_STATE_SAVING | \
604
+ VFIO_DEVICE_STATE_RESUMING)
605
+
606
+#define VFIO_DEVICE_STATE_VALID(state) \
607
+ (state & VFIO_DEVICE_STATE_RESUMING ? \
608
+ (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
609
+
610
+#define VFIO_DEVICE_STATE_IS_ERROR(state) \
611
+ ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
612
+ VFIO_DEVICE_STATE_RESUMING))
613
+
614
+#define VFIO_DEVICE_STATE_SET_ERROR(state) \
615
 + ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_SAVING | \
616
+ VFIO_DEVICE_STATE_RESUMING)
617
+
618
+ __u32 reserved;
619
+ __u64 pending_bytes;
620
+ __u64 data_offset;
621
+ __u64 data_size;
622
+};
303623
304624 /*
305625 * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
....@@ -310,6 +630,33 @@
310630 * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration.
311631 */
312632 #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3
633
+
634
+/*
635
+ * Capability with compressed real address (aka SSA - small system address)
636
+ * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing
637
+ * and by the userspace to associate a NVLink bridge with a GPU.
638
+ */
639
+#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT 4
640
+
641
+struct vfio_region_info_cap_nvlink2_ssatgt {
642
+ struct vfio_info_cap_header header;
643
+ __u64 tgt;
644
+};
645
+
646
+/*
647
+ * Capability with an NVLink link speed. The value is read by
648
+ * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed"
649
+ * property in the device tree. The value is fixed in the hardware
650
+ * and failing to provide the correct value results in the link
651
+ * not working with no indication from the driver why.
652
+ */
653
+#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD 5
654
+
655
+struct vfio_region_info_cap_nvlink2_lnkspd {
656
+ struct vfio_info_cap_header header;
657
+ __u32 link_speed;
658
+ __u32 __pad;
659
+};
313660
314661 /**
315662 * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
....@@ -472,6 +819,7 @@
472819
473820 enum {
474821 VFIO_CCW_IO_IRQ_INDEX,
822
+ VFIO_CCW_CRW_IRQ_INDEX,
475823 VFIO_CCW_NUM_IRQS
476824 };
477825
....@@ -602,6 +950,43 @@
602950
603951 #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16)
604952
953
+/**
954
 + * VFIO_DEVICE_FEATURE - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
955
+ * struct vfio_device_feature)
956
+ *
957
+ * Get, set, or probe feature data of the device. The feature is selected
958
+ * using the FEATURE_MASK portion of the flags field. Support for a feature
959
+ * can be probed by setting both the FEATURE_MASK and PROBE bits. A probe
960
+ * may optionally include the GET and/or SET bits to determine read vs write
961
+ * access of the feature respectively. Probing a feature will return success
962
+ * if the feature is supported and all of the optionally indicated GET/SET
963
+ * methods are supported. The format of the data portion of the structure is
964
+ * specific to the given feature. The data portion is not required for
965
+ * probing. GET and SET are mutually exclusive, except for use with PROBE.
966
+ *
967
+ * Return 0 on success, -errno on failure.
968
+ */
969
+struct vfio_device_feature {
970
+ __u32 argsz;
971
+ __u32 flags;
972
+#define VFIO_DEVICE_FEATURE_MASK (0xffff) /* 16-bit feature index */
973
+#define VFIO_DEVICE_FEATURE_GET (1 << 16) /* Get feature into data[] */
974
+#define VFIO_DEVICE_FEATURE_SET (1 << 17) /* Set feature from data[] */
975
+#define VFIO_DEVICE_FEATURE_PROBE (1 << 18) /* Probe feature support */
976
+ __u8 data[];
977
+};
978
+
979
+#define VFIO_DEVICE_FEATURE _IO(VFIO_TYPE, VFIO_BASE + 17)
980
+
981
+/*
982
+ * Provide support for setting a PCI VF Token, which is used as a shared
983
+ * secret between PF and VF drivers. This feature may only be set on a
984
+ * PCI SR-IOV PF when SR-IOV is enabled on the PF and there are no existing
985
+ * open VFs. Data provided when setting this feature is a 16-byte array
986
+ * (__u8 b[16]), representing a UUID.
987
+ */
988
+#define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0)
989
+
605990 /* -------- API for Type1 VFIO IOMMU -------- */
606991
607992 /**
....@@ -616,7 +1001,69 @@
6161001 __u32 argsz;
6171002 __u32 flags;
6181003 #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */
619
- __u64 iova_pgsizes; /* Bitmap of supported page sizes */
1004
+#define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
1005
+ __u64 iova_pgsizes; /* Bitmap of supported page sizes */
1006
+ __u32 cap_offset; /* Offset within info struct of first cap */
1007
+};
1008
+
1009
+/*
1010
+ * The IOVA capability allows to report the valid IOVA range(s)
1011
+ * excluding any non-relaxable reserved regions exposed by
1012
+ * devices attached to the container. Any DMA map attempt
1013
+ * outside the valid iova range will return error.
1014
+ *
1015
+ * The structures below define version 1 of this capability.
1016
+ */
1017
+#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE 1
1018
+
1019
+struct vfio_iova_range {
1020
+ __u64 start;
1021
+ __u64 end;
1022
+};
1023
+
1024
+struct vfio_iommu_type1_info_cap_iova_range {
1025
+ struct vfio_info_cap_header header;
1026
+ __u32 nr_iovas;
1027
+ __u32 reserved;
1028
+ struct vfio_iova_range iova_ranges[];
1029
+};
1030
+
1031
+/*
1032
+ * The migration capability allows to report supported features for migration.
1033
+ *
1034
+ * The structures below define version 1 of this capability.
1035
+ *
1036
+ * The existence of this capability indicates that IOMMU kernel driver supports
1037
+ * dirty page logging.
1038
+ *
1039
+ * pgsize_bitmap: Kernel driver returns bitmap of supported page sizes for dirty
1040
+ * page logging.
1041
+ * max_dirty_bitmap_size: Kernel driver returns maximum supported dirty bitmap
1042
+ * size in bytes that can be used by user applications when getting the dirty
1043
+ * bitmap.
1044
+ */
1045
+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 2
1046
+
1047
+struct vfio_iommu_type1_info_cap_migration {
1048
+ struct vfio_info_cap_header header;
1049
+ __u32 flags;
1050
+ __u64 pgsize_bitmap;
1051
+ __u64 max_dirty_bitmap_size; /* in bytes */
1052
+};
1053
+
1054
+/*
1055
+ * The DMA available capability allows to report the current number of
1056
+ * simultaneously outstanding DMA mappings that are allowed.
1057
+ *
1058
+ * The structure below defines version 1 of this capability.
1059
+ *
1060
+ * avail: specifies the current number of outstanding DMA mappings allowed.
1061
+ */
1062
+#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL 3
1063
+
1064
+struct vfio_iommu_type1_info_dma_avail {
1065
+ struct vfio_info_cap_header header;
1066
+ __u32 avail;
6201067 };
6211068
6221069 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
....@@ -639,6 +1086,12 @@
6391086
6401087 #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
6411088
1089
+struct vfio_bitmap {
1090
+ __u64 pgsize; /* page size for bitmap in bytes */
1091
+ __u64 size; /* in bytes */
1092
+ __u64 __user *data; /* one bit per page */
1093
+};
1094
+
6421095 /**
6431096 * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
6441097 * struct vfio_dma_unmap)
....@@ -648,12 +1101,23 @@
6481101 * field. No guarantee is made to the user that arbitrary unmaps of iova
6491102 * or size different from those used in the original mapping call will
6501103 * succeed.
1104
+ * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
1105
+ * before unmapping IO virtual addresses. When this flag is set, the user must
1106
+ * provide a struct vfio_bitmap in data[]. User must provide zero-allocated
1107
+ * memory via vfio_bitmap.data and its size in the vfio_bitmap.size field.
1108
+ * A bit in the bitmap represents one page, of user provided page size in
1109
+ * vfio_bitmap.pgsize field, consecutively starting from iova offset. Bit set
1110
+ * indicates that the page at that offset from iova is dirty. A Bitmap of the
1111
+ * pages in the range of unmapped size is returned in the user-provided
1112
+ * vfio_bitmap.data.
6511113 */
6521114 struct vfio_iommu_type1_dma_unmap {
6531115 __u32 argsz;
6541116 __u32 flags;
1117
+#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
6551118 __u64 iova; /* IO virtual address */
6561119 __u64 size; /* Size of mapping (bytes) */
1120
+ __u8 data[];
6571121 };
6581122
6591123 #define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
....@@ -665,6 +1129,57 @@
6651129 #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15)
6661130 #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
6671131
1132
+/**
1133
+ * VFIO_IOMMU_DIRTY_PAGES - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
1134
+ * struct vfio_iommu_type1_dirty_bitmap)
1135
+ * IOCTL is used for dirty pages logging.
1136
+ * Caller should set flag depending on which operation to perform, details as
1137
+ * below:
1138
+ *
1139
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_START flag set, instructs
1140
+ * the IOMMU driver to log pages that are dirtied or potentially dirtied by
1141
+ * the device; designed to be used when a migration is in progress. Dirty pages
1142
+ * are logged until logging is disabled by user application by calling the IOCTL
1143
+ * with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag.
1144
+ *
1145
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP flag set, instructs
1146
+ * the IOMMU driver to stop logging dirtied pages.
1147
+ *
1148
+ * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP flag set
1149
+ * returns the dirty pages bitmap for IOMMU container for a given IOVA range.
1150
+ * The user must specify the IOVA range and the pgsize through the structure
1151
+ * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface
1152
+ * supports getting a bitmap of the smallest supported pgsize only and can be
1153
+ * modified in future to get a bitmap of any specified supported pgsize. The
1154
+ * user must provide a zeroed memory area for the bitmap memory and specify its
1155
+ * size in bitmap.size. One bit is used to represent one page consecutively
1156
+ * starting from iova offset. The user should provide page size in bitmap.pgsize
1157
+ * field. A bit set in the bitmap indicates that the page at that offset from
1158
+ * iova is dirty. The caller must set argsz to a value including the size of
1159
+ * structure vfio_iommu_type1_dirty_bitmap_get, but excluding the size of the
1160
+ * actual bitmap. If dirty pages logging is not enabled, an error will be
1161
+ * returned.
1162
+ *
1163
+ * Only one of the flags _START, _STOP and _GET may be specified at a time.
1164
+ *
1165
+ */
1166
+struct vfio_iommu_type1_dirty_bitmap {
1167
+ __u32 argsz;
1168
+ __u32 flags;
1169
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0)
1170
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1)
1171
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
1172
+ __u8 data[];
1173
+};
1174
+
1175
+struct vfio_iommu_type1_dirty_bitmap_get {
1176
+ __u64 iova; /* IO virtual address */
1177
+ __u64 size; /* Size of iova range */
1178
+ struct vfio_bitmap bitmap;
1179
+};
1180
+
1181
+#define VFIO_IOMMU_DIRTY_PAGES _IO(VFIO_TYPE, VFIO_BASE + 17)
1182
+
6681183 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
6691184
6701185 /*