hc
2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/drivers/pci/controller/pci-hyperv.c
....@@ -63,6 +63,7 @@
6363 enum pci_protocol_version_t {
6464 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */
6565 PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */
66
+ PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3), /* Vibranium */
6667 };
6768
6869 #define CPU_AFFINITY_ALL -1ULL
....@@ -72,14 +73,10 @@
7273 * first.
7374 */
7475 static enum pci_protocol_version_t pci_protocol_versions[] = {
76
+ PCI_PROTOCOL_VERSION_1_3,
7577 PCI_PROTOCOL_VERSION_1_2,
7678 PCI_PROTOCOL_VERSION_1_1,
7779 };
78
-
79
-/*
80
- * Protocol version negotiated by hv_pci_protocol_negotiation().
81
- */
82
-static enum pci_protocol_version_t pci_protocol_version;
8380
8481 #define PCI_CONFIG_MMIO_LENGTH 0x2000
8582 #define CFG_PAGE_OFFSET 0x1000
....@@ -124,6 +121,7 @@
124121 PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16,
125122 PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17,
126123 PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */
124
+ PCI_BUS_RELATIONS2 = PCI_MESSAGE_BASE + 0x19,
127125 PCI_MESSAGE_MAXIMUM
128126 };
129127
....@@ -167,6 +165,26 @@
167165 u32 subsystem_id;
168166 union win_slot_encoding win_slot;
169167 u32 ser; /* serial number */
168
+} __packed;
169
+
170
+enum pci_device_description_flags {
171
+ HV_PCI_DEVICE_FLAG_NONE = 0x0,
172
+ HV_PCI_DEVICE_FLAG_NUMA_AFFINITY = 0x1,
173
+};
174
+
175
+struct pci_function_description2 {
176
+ u16 v_id; /* vendor ID */
177
+ u16 d_id; /* device ID */
178
+ u8 rev;
179
+ u8 prog_intf;
180
+ u8 subclass;
181
+ u8 base_class;
182
+ u32 subsystem_id;
183
+ union win_slot_encoding win_slot;
184
+ u32 ser; /* serial number */
185
+ u32 flags;
186
+ u16 virtual_numa_node;
187
+ u16 reserved;
170188 } __packed;
171189
172190 /**
....@@ -265,7 +283,7 @@
265283 int resp_packet_size);
266284 void *compl_ctxt;
267285
268
- struct pci_message message[0];
286
+ struct pci_message message[];
269287 };
270288
271289 /*
....@@ -301,13 +319,19 @@
301319 struct pci_bus_relations {
302320 struct pci_incoming_message incoming;
303321 u32 device_count;
304
- struct pci_function_description func[0];
322
+ struct pci_function_description func[];
323
+} __packed;
324
+
325
+struct pci_bus_relations2 {
326
+ struct pci_incoming_message incoming;
327
+ u32 device_count;
328
+ struct pci_function_description2 func[];
305329 } __packed;
306330
307331 struct pci_q_res_req_response {
308332 struct vmpacket_descriptor hdr;
309333 s32 status; /* negative values are failures */
310
- u32 probed_bar[6];
334
+ u32 probed_bar[PCI_STD_NUM_BARS];
311335 } __packed;
312336
313337 struct pci_set_power {
....@@ -365,6 +389,39 @@
365389 struct tran_int_desc int_desc;
366390 } __packed;
367391
392
+/*
393
+ * Note: the VM must pass a valid block id, wslot and bytes_requested.
394
+ */
395
+struct pci_read_block {
396
+ struct pci_message message_type;
397
+ u32 block_id;
398
+ union win_slot_encoding wslot;
399
+ u32 bytes_requested;
400
+} __packed;
401
+
402
+struct pci_read_block_response {
403
+ struct vmpacket_descriptor hdr;
404
+ u32 status;
405
+ u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
406
+} __packed;
407
+
408
+/*
409
+ * Note: the VM must pass a valid block id, wslot and byte_count.
410
+ */
411
+struct pci_write_block {
412
+ struct pci_message message_type;
413
+ u32 block_id;
414
+ union win_slot_encoding wslot;
415
+ u32 byte_count;
416
+ u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
417
+} __packed;
418
+
419
+struct pci_dev_inval_block {
420
+ struct pci_incoming_message incoming;
421
+ union win_slot_encoding wslot;
422
+ u64 block_mask;
423
+} __packed;
424
+
368425 struct pci_dev_incoming {
369426 struct pci_incoming_message incoming;
370427 union win_slot_encoding wslot;
....@@ -379,50 +436,6 @@
379436 static int pci_ring_size = (4 * PAGE_SIZE);
380437
381438 /*
382
- * Definitions or interrupt steering hypercall.
383
- */
384
-#define HV_PARTITION_ID_SELF ((u64)-1)
385
-#define HVCALL_RETARGET_INTERRUPT 0x7e
386
-
387
-struct hv_interrupt_entry {
388
- u32 source; /* 1 for MSI(-X) */
389
- u32 reserved1;
390
- u32 address;
391
- u32 data;
392
-};
393
-
394
-#define HV_VP_SET_BANK_COUNT_MAX 5 /* current implementation limit */
395
-
396
-struct hv_vp_set {
397
- u64 format; /* 0 (HvGenericSetSparse4k) */
398
- u64 valid_banks;
399
- u64 masks[HV_VP_SET_BANK_COUNT_MAX];
400
-};
401
-
402
-/*
403
- * flags for hv_device_interrupt_target.flags
404
- */
405
-#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1
406
-#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2
407
-
408
-struct hv_device_interrupt_target {
409
- u32 vector;
410
- u32 flags;
411
- union {
412
- u64 vp_mask;
413
- struct hv_vp_set vp_set;
414
- };
415
-};
416
-
417
-struct retarget_msi_interrupt {
418
- u64 partition_id; /* use "self" */
419
- u64 device_id;
420
- struct hv_interrupt_entry int_entry;
421
- u64 reserved2;
422
- struct hv_device_interrupt_target int_target;
423
-} __packed;
424
-
425
-/*
426439 * Driver specific state.
427440 */
428441
....@@ -430,12 +443,14 @@
430443 hv_pcibus_init = 0,
431444 hv_pcibus_probed,
432445 hv_pcibus_installed,
433
- hv_pcibus_removed,
446
+ hv_pcibus_removing,
434447 hv_pcibus_maximum
435448 };
436449
437450 struct hv_pcibus_device {
438451 struct pci_sysdata sysdata;
452
+ /* Protocol version negotiated with the host */
453
+ enum pci_protocol_version_t protocol_version;
439454 enum hv_pcibus_state state;
440455 refcount_t remove_lock;
441456 struct hv_device *hdev;
....@@ -460,12 +475,19 @@
460475 struct msi_controller msi_chip;
461476 struct irq_domain *irq_domain;
462477
463
- /* hypercall arg, must not cross page boundary */
464
- struct retarget_msi_interrupt retarget_msi_interrupt_params;
465
-
466478 spinlock_t retarget_msi_interrupt_lock;
467479
468480 struct workqueue_struct *wq;
481
+
482
+ /* Highest slot of child device with resources allocated */
483
+ int wslot_res_allocated;
484
+
485
+ /* hypercall arg, must not cross page boundary */
486
+ struct hv_retarget_device_interrupt retarget_msi_interrupt_params;
487
+
488
+ /*
489
+ * Don't put anything here: retarget_msi_interrupt_params must be last
490
+ */
469491 };
470492
471493 /*
....@@ -478,10 +500,24 @@
478500 struct hv_pcibus_device *bus;
479501 };
480502
503
+struct hv_pcidev_description {
504
+ u16 v_id; /* vendor ID */
505
+ u16 d_id; /* device ID */
506
+ u8 rev;
507
+ u8 prog_intf;
508
+ u8 subclass;
509
+ u8 base_class;
510
+ u32 subsystem_id;
511
+ union win_slot_encoding win_slot;
512
+ u32 ser; /* serial number */
513
+ u32 flags;
514
+ u16 virtual_numa_node;
515
+};
516
+
481517 struct hv_dr_state {
482518 struct list_head list_entry;
483519 u32 device_count;
484
- struct pci_function_description func[0];
520
+ struct hv_pcidev_description func[];
485521 };
486522
487523 enum hv_pcichild_state {
....@@ -498,16 +534,19 @@
498534 refcount_t refs;
499535 enum hv_pcichild_state state;
500536 struct pci_slot *pci_slot;
501
- struct pci_function_description desc;
537
+ struct hv_pcidev_description desc;
502538 bool reported_missing;
503539 struct hv_pcibus_device *hbus;
504540 struct work_struct wrk;
541
+
542
+ void (*block_invalidate)(void *context, u64 block_mask);
543
+ void *invalidate_context;
505544
506545 /*
507546 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
508547 * read it back, for each of the BAR offsets within config space.
509548 */
510
- u32 probed_bar[6];
549
+ u32 probed_bar[PCI_STD_NUM_BARS];
511550 };
512551
513552 struct hv_pci_compl {
....@@ -821,6 +860,254 @@
821860 .write = hv_pcifront_write_config,
822861 };
823862
863
+/*
864
+ * Paravirtual backchannel
865
+ *
866
+ * Hyper-V SR-IOV provides a backchannel mechanism in software for
867
+ * communication between a VF driver and a PF driver. These
868
+ * "configuration blocks" are similar in concept to PCI configuration space,
869
+ * but instead of doing reads and writes in 32-bit chunks through a very slow
870
+ * path, packets of up to 128 bytes can be sent or received asynchronously.
871
+ *
872
+ * Nearly every SR-IOV device contains just such a communications channel in
873
+ * hardware, so using this one in software is usually optional. Using the
874
+ * software channel, however, allows driver implementers to leverage software
875
+ * tools that fuzz the communications channel looking for vulnerabilities.
876
+ *
877
+ * The usage model for these packets puts the responsibility for reading or
878
+ * writing on the VF driver. The VF driver sends a read or a write packet,
879
+ * indicating which "block" is being referred to by number.
880
+ *
881
+ * If the PF driver wishes to initiate communication, it can "invalidate" one or
882
+ * more of the first 64 blocks. This invalidation is delivered via a callback
883
+ * supplied by the VF driver to this driver.
884
+ *
885
+ * No protocol is implied, except that supplied by the PF and VF drivers.
886
+ */
887
+
888
+struct hv_read_config_compl {
889
+ struct hv_pci_compl comp_pkt;
890
+ void *buf;
891
+ unsigned int len;
892
+ unsigned int bytes_returned;
893
+};
894
+
895
+/**
896
+ * hv_pci_read_config_compl() - Invoked when a response packet
897
+ * for a read config block operation arrives.
898
+ * @context: Identifies the read config operation
899
+ * @resp: The response packet itself
900
+ * @resp_packet_size: Size in bytes of the response packet
901
+ */
902
+static void hv_pci_read_config_compl(void *context, struct pci_response *resp,
903
+ int resp_packet_size)
904
+{
905
+ struct hv_read_config_compl *comp = context;
906
+ struct pci_read_block_response *read_resp =
907
+ (struct pci_read_block_response *)resp;
908
+ unsigned int data_len, hdr_len;
909
+
910
+ hdr_len = offsetof(struct pci_read_block_response, bytes);
911
+ if (resp_packet_size < hdr_len) {
912
+ comp->comp_pkt.completion_status = -1;
913
+ goto out;
914
+ }
915
+
916
+ data_len = resp_packet_size - hdr_len;
917
+ if (data_len > 0 && read_resp->status == 0) {
918
+ comp->bytes_returned = min(comp->len, data_len);
919
+ memcpy(comp->buf, read_resp->bytes, comp->bytes_returned);
920
+ } else {
921
+ comp->bytes_returned = 0;
922
+ }
923
+
924
+ comp->comp_pkt.completion_status = read_resp->status;
925
+out:
926
+ complete(&comp->comp_pkt.host_event);
927
+}
928
+
929
+/**
930
+ * hv_read_config_block() - Sends a read config block request to
931
+ * the back-end driver running in the Hyper-V parent partition.
932
+ * @pdev: The PCI driver's representation for this device.
933
+ * @buf: Buffer into which the config block will be copied.
934
+ * @len: Size in bytes of buf.
935
+ * @block_id: Identifies the config block which has been requested.
936
+ * @bytes_returned: Size which came back from the back-end driver.
937
+ *
938
+ * Return: 0 on success, -errno on failure
939
+ */
940
+static int hv_read_config_block(struct pci_dev *pdev, void *buf,
941
+ unsigned int len, unsigned int block_id,
942
+ unsigned int *bytes_returned)
943
+{
944
+ struct hv_pcibus_device *hbus =
945
+ container_of(pdev->bus->sysdata, struct hv_pcibus_device,
946
+ sysdata);
947
+ struct {
948
+ struct pci_packet pkt;
949
+ char buf[sizeof(struct pci_read_block)];
950
+ } pkt;
951
+ struct hv_read_config_compl comp_pkt;
952
+ struct pci_read_block *read_blk;
953
+ int ret;
954
+
955
+ if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
956
+ return -EINVAL;
957
+
958
+ init_completion(&comp_pkt.comp_pkt.host_event);
959
+ comp_pkt.buf = buf;
960
+ comp_pkt.len = len;
961
+
962
+ memset(&pkt, 0, sizeof(pkt));
963
+ pkt.pkt.completion_func = hv_pci_read_config_compl;
964
+ pkt.pkt.compl_ctxt = &comp_pkt;
965
+ read_blk = (struct pci_read_block *)&pkt.pkt.message;
966
+ read_blk->message_type.type = PCI_READ_BLOCK;
967
+ read_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
968
+ read_blk->block_id = block_id;
969
+ read_blk->bytes_requested = len;
970
+
971
+ ret = vmbus_sendpacket(hbus->hdev->channel, read_blk,
972
+ sizeof(*read_blk), (unsigned long)&pkt.pkt,
973
+ VM_PKT_DATA_INBAND,
974
+ VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
975
+ if (ret)
976
+ return ret;
977
+
978
+ ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event);
979
+ if (ret)
980
+ return ret;
981
+
982
+ if (comp_pkt.comp_pkt.completion_status != 0 ||
983
+ comp_pkt.bytes_returned == 0) {
984
+ dev_err(&hbus->hdev->device,
985
+ "Read Config Block failed: 0x%x, bytes_returned=%d\n",
986
+ comp_pkt.comp_pkt.completion_status,
987
+ comp_pkt.bytes_returned);
988
+ return -EIO;
989
+ }
990
+
991
+ *bytes_returned = comp_pkt.bytes_returned;
992
+ return 0;
993
+}
994
+
995
+/**
996
+ * hv_pci_write_config_compl() - Invoked when a response packet for a write
997
+ * config block operation arrives.
998
+ * @context: Identifies the write config operation
999
+ * @resp: The response packet itself
1000
+ * @resp_packet_size: Size in bytes of the response packet
1001
+ */
1002
+static void hv_pci_write_config_compl(void *context, struct pci_response *resp,
1003
+ int resp_packet_size)
1004
+{
1005
+ struct hv_pci_compl *comp_pkt = context;
1006
+
1007
+ comp_pkt->completion_status = resp->status;
1008
+ complete(&comp_pkt->host_event);
1009
+}
1010
+
1011
+/**
1012
+ * hv_write_config_block() - Sends a write config block request to the
1013
+ * back-end driver running in the Hyper-V parent partition.
1014
+ * @pdev: The PCI driver's representation for this device.
1015
+ * @buf: Buffer from which the config block will be copied.
1016
+ * @len: Size in bytes of buf.
1017
+ * @block_id: Identifies the config block which is being written.
1018
+ *
1019
+ * Return: 0 on success, -errno on failure
1020
+ */
1021
+static int hv_write_config_block(struct pci_dev *pdev, void *buf,
1022
+ unsigned int len, unsigned int block_id)
1023
+{
1024
+ struct hv_pcibus_device *hbus =
1025
+ container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1026
+ sysdata);
1027
+ struct {
1028
+ struct pci_packet pkt;
1029
+ char buf[sizeof(struct pci_write_block)];
1030
+ u32 reserved;
1031
+ } pkt;
1032
+ struct hv_pci_compl comp_pkt;
1033
+ struct pci_write_block *write_blk;
1034
+ u32 pkt_size;
1035
+ int ret;
1036
+
1037
+ if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
1038
+ return -EINVAL;
1039
+
1040
+ init_completion(&comp_pkt.host_event);
1041
+
1042
+ memset(&pkt, 0, sizeof(pkt));
1043
+ pkt.pkt.completion_func = hv_pci_write_config_compl;
1044
+ pkt.pkt.compl_ctxt = &comp_pkt;
1045
+ write_blk = (struct pci_write_block *)&pkt.pkt.message;
1046
+ write_blk->message_type.type = PCI_WRITE_BLOCK;
1047
+ write_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
1048
+ write_blk->block_id = block_id;
1049
+ write_blk->byte_count = len;
1050
+ memcpy(write_blk->bytes, buf, len);
1051
+ pkt_size = offsetof(struct pci_write_block, bytes) + len;
1052
+ /*
1053
+ * This quirk is required on some hosts shipped around 2018, because
1054
+ * these hosts don't check the pkt_size correctly (new hosts have been
1055
+ * fixed since early 2019). The quirk is also safe on very old hosts
1056
+ * and new hosts, because, on them, what really matters is the length
1057
+ * specified in write_blk->byte_count.
1058
+ */
1059
+ pkt_size += sizeof(pkt.reserved);
1060
+
1061
+ ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size,
1062
+ (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND,
1063
+ VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1064
+ if (ret)
1065
+ return ret;
1066
+
1067
+ ret = wait_for_response(hbus->hdev, &comp_pkt.host_event);
1068
+ if (ret)
1069
+ return ret;
1070
+
1071
+ if (comp_pkt.completion_status != 0) {
1072
+ dev_err(&hbus->hdev->device,
1073
+ "Write Config Block failed: 0x%x\n",
1074
+ comp_pkt.completion_status);
1075
+ return -EIO;
1076
+ }
1077
+
1078
+ return 0;
1079
+}
1080
+
1081
+/**
1082
+ * hv_register_block_invalidate() - Invoked when a config block invalidation
1083
+ * arrives from the back-end driver.
1084
+ * @pdev: The PCI driver's representation for this device.
1085
+ * @context: Identifies the device.
1086
+ * @block_invalidate: Identifies all of the blocks being invalidated.
1087
+ *
1088
+ * Return: 0 on success, -errno on failure
1089
+ */
1090
+static int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
1091
+ void (*block_invalidate)(void *context,
1092
+ u64 block_mask))
1093
+{
1094
+ struct hv_pcibus_device *hbus =
1095
+ container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1096
+ sysdata);
1097
+ struct hv_pci_dev *hpdev;
1098
+
1099
+ hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1100
+ if (!hpdev)
1101
+ return -ENODEV;
1102
+
1103
+ hpdev->block_invalidate = block_invalidate;
1104
+ hpdev->invalidate_context = context;
1105
+
1106
+ put_pcichild(hpdev);
1107
+ return 0;
1108
+
1109
+}
1110
+
8241111 /* Interrupt management hooks */
8251112 static void hv_int_desc_free(struct hv_pci_dev *hpdev,
8261113 struct tran_int_desc *int_desc)
....@@ -831,6 +1118,10 @@
8311118 u8 buffer[sizeof(struct pci_delete_interrupt)];
8321119 } ctxt;
8331120
1121
+ if (!int_desc->vector_count) {
1122
+ kfree(int_desc);
1123
+ return;
1124
+ }
8341125 memset(&ctxt, 0, sizeof(ctxt));
8351126 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
8361127 int_pkt->message_type.type =
....@@ -893,6 +1184,28 @@
8931184 pci_msi_mask_irq(data);
8941185 }
8951186
1187
+static unsigned int hv_msi_get_int_vector(struct irq_data *data)
1188
+{
1189
+ struct irq_cfg *cfg = irqd_cfg(data);
1190
+
1191
+ return cfg->vector;
1192
+}
1193
+
1194
+static int hv_msi_prepare(struct irq_domain *domain, struct device *dev,
1195
+ int nvec, msi_alloc_info_t *info)
1196
+{
1197
+ int ret = pci_msi_prepare(domain, dev, nvec, info);
1198
+
1199
+ /*
1200
+ * By using the interrupt remapper in the hypervisor IOMMU, contiguous
1201
+ * CPU vectors are not needed for multi-MSI
1202
+ */
1203
+ if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI)
1204
+ info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
1205
+
1206
+ return ret;
1207
+}
1208
+
8961209 /**
8971210 * hv_irq_unmask() - "Unmask" the IRQ by setting its current
8981211 * affinity.
....@@ -907,21 +1220,23 @@
9071220 {
9081221 struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
9091222 struct irq_cfg *cfg = irqd_cfg(data);
910
- struct retarget_msi_interrupt *params;
1223
+ struct hv_retarget_device_interrupt *params;
1224
+ struct tran_int_desc *int_desc;
9111225 struct hv_pcibus_device *hbus;
9121226 struct cpumask *dest;
1227
+ cpumask_var_t tmp;
9131228 struct pci_bus *pbus;
9141229 struct pci_dev *pdev;
9151230 unsigned long flags;
9161231 u32 var_size = 0;
917
- int cpu_vmbus;
918
- int cpu;
1232
+ int cpu, nr_bank;
9191233 u64 res;
9201234
9211235 dest = irq_data_get_effective_affinity_mask(data);
9221236 pdev = msi_desc_to_pci_dev(msi_desc);
9231237 pbus = pdev->bus;
9241238 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
1239
+ int_desc = data->chip_data;
9251240
9261241 spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
9271242
....@@ -929,8 +1244,8 @@
9291244 memset(params, 0, sizeof(*params));
9301245 params->partition_id = HV_PARTITION_ID_SELF;
9311246 params->int_entry.source = 1; /* MSI(-X) */
932
- params->int_entry.address = msi_desc->msg.address_lo;
933
- params->int_entry.data = msi_desc->msg.data;
1247
+ params->int_entry.msi_entry.address = int_desc->address & 0xffffffff;
1248
+ params->int_entry.msi_entry.data = int_desc->data;
9341249 params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
9351250 (hbus->hdev->dev_instance.b[4] << 16) |
9361251 (hbus->hdev->dev_instance.b[7] << 8) |
....@@ -945,7 +1260,7 @@
9451260 * negative effect (yet?).
9461261 */
9471262
948
- if (pci_protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
1263
+ if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
9491264 /*
9501265 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
9511266 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
....@@ -955,28 +1270,27 @@
9551270 */
9561271 params->int_target.flags |=
9571272 HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
958
- params->int_target.vp_set.valid_banks =
959
- (1ull << HV_VP_SET_BANK_COUNT_MAX) - 1;
1273
+
1274
+ if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
1275
+ res = 1;
1276
+ goto exit_unlock;
1277
+ }
1278
+
1279
+ cpumask_and(tmp, dest, cpu_online_mask);
1280
+ nr_bank = cpumask_to_vpset(&params->int_target.vp_set, tmp);
1281
+ free_cpumask_var(tmp);
1282
+
1283
+ if (nr_bank <= 0) {
1284
+ res = 1;
1285
+ goto exit_unlock;
1286
+ }
9601287
9611288 /*
9621289 * var-sized hypercall, var-size starts after vp_mask (thus
963
- * vp_set.format does not count, but vp_set.valid_banks does).
1290
+ * vp_set.format does not count, but vp_set.valid_bank_mask
1291
+ * does).
9641292 */
965
- var_size = 1 + HV_VP_SET_BANK_COUNT_MAX;
966
-
967
- for_each_cpu_and(cpu, dest, cpu_online_mask) {
968
- cpu_vmbus = hv_cpu_number_to_vp_number(cpu);
969
-
970
- if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) {
971
- dev_err(&hbus->hdev->device,
972
- "too high CPU %d", cpu_vmbus);
973
- res = 1;
974
- goto exit_unlock;
975
- }
976
-
977
- params->int_target.vp_set.masks[cpu_vmbus / 64] |=
978
- (1ULL << (cpu_vmbus & 63));
979
- }
1293
+ var_size = 1 + nr_bank;
9801294 } else {
9811295 for_each_cpu_and(cpu, dest, cpu_online_mask) {
9821296 params->int_target.vp_mask |=
....@@ -990,11 +1304,25 @@
9901304 exit_unlock:
9911305 spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
9921306
993
- if (res) {
1307
+ /*
1308
+ * During hibernation, when a CPU is offlined, the kernel tries
1309
+ * to move the interrupt to the remaining CPUs that haven't
1310
+ * been offlined yet. In this case, the below hv_do_hypercall()
1311
+ * always fails since the vmbus channel has been closed:
1312
+ * refer to cpu_disable_common() -> fixup_irqs() ->
1313
+ * irq_migrate_all_off_this_cpu() -> migrate_one_irq().
1314
+ *
1315
+ * Suppress the error message for hibernation because the failure
1316
+ * during hibernation does not matter (at this time all the devices
1317
+ * have been frozen). Note: the correct affinity info is still updated
1318
+ * into the irqdata data structure in migrate_one_irq() ->
1319
+ * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM
1320
+ * resumes, hv_pci_restore_msi_state() is able to correctly restore
1321
+ * the interrupt with the correct affinity.
1322
+ */
1323
+ if (res && hbus->state != hv_pcibus_removing)
9941324 dev_err(&hbus->hdev->device,
9951325 "%s() failed: %#llx", __func__, res);
996
- return;
997
- }
9981326
9991327 pci_msi_unmask_irq(data);
10001328 }
....@@ -1018,12 +1346,12 @@
10181346
10191347 static u32 hv_compose_msi_req_v1(
10201348 struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
1021
- u32 slot, u8 vector)
1349
+ u32 slot, u8 vector, u8 vector_count)
10221350 {
10231351 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
10241352 int_pkt->wslot.slot = slot;
10251353 int_pkt->int_desc.vector = vector;
1026
- int_pkt->int_desc.vector_count = 1;
1354
+ int_pkt->int_desc.vector_count = vector_count;
10271355 int_pkt->int_desc.delivery_mode = dest_Fixed;
10281356
10291357 /*
....@@ -1037,14 +1365,14 @@
10371365
10381366 static u32 hv_compose_msi_req_v2(
10391367 struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
1040
- u32 slot, u8 vector)
1368
+ u32 slot, u8 vector, u8 vector_count)
10411369 {
10421370 int cpu;
10431371
10441372 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
10451373 int_pkt->wslot.slot = slot;
10461374 int_pkt->int_desc.vector = vector;
1047
- int_pkt->int_desc.vector_count = 1;
1375
+ int_pkt->int_desc.vector_count = vector_count;
10481376 int_pkt->int_desc.delivery_mode = dest_Fixed;
10491377
10501378 /*
....@@ -1072,15 +1400,16 @@
10721400 */
10731401 static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
10741402 {
1075
- struct irq_cfg *cfg = irqd_cfg(data);
10761403 struct hv_pcibus_device *hbus;
1404
+ struct vmbus_channel *channel;
10771405 struct hv_pci_dev *hpdev;
10781406 struct pci_bus *pbus;
10791407 struct pci_dev *pdev;
10801408 struct cpumask *dest;
1081
- unsigned long flags;
10821409 struct compose_comp_ctxt comp;
10831410 struct tran_int_desc *int_desc;
1411
+ struct msi_desc *msi_desc;
1412
+ u8 vector, vector_count;
10841413 struct {
10851414 struct pci_packet pci_pkt;
10861415 union {
....@@ -1092,43 +1421,80 @@
10921421 u32 size;
10931422 int ret;
10941423
1095
- pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
1424
+ /* Reuse the previous allocation */
1425
+ if (data->chip_data) {
1426
+ int_desc = data->chip_data;
1427
+ msg->address_hi = int_desc->address >> 32;
1428
+ msg->address_lo = int_desc->address & 0xffffffff;
1429
+ msg->data = int_desc->data;
1430
+ return;
1431
+ }
1432
+
1433
+ msi_desc = irq_data_get_msi_desc(data);
1434
+ pdev = msi_desc_to_pci_dev(msi_desc);
10961435 dest = irq_data_get_effective_affinity_mask(data);
10971436 pbus = pdev->bus;
10981437 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
1438
+ channel = hbus->hdev->channel;
10991439 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
11001440 if (!hpdev)
11011441 goto return_null_message;
11021442
1103
- /* Free any previous message that might have already been composed. */
1104
- if (data->chip_data) {
1105
- int_desc = data->chip_data;
1106
- data->chip_data = NULL;
1107
- hv_int_desc_free(hpdev, int_desc);
1108
- }
1109
-
11101443 int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
11111444 if (!int_desc)
11121445 goto drop_reference;
1446
+
1447
+ if (!msi_desc->msi_attrib.is_msix && msi_desc->nvec_used > 1) {
1448
+ /*
1449
+ * If this is not the first MSI of Multi MSI, we already have
1450
+ * a mapping. Can exit early.
1451
+ */
1452
+ if (msi_desc->irq != data->irq) {
1453
+ data->chip_data = int_desc;
1454
+ int_desc->address = msi_desc->msg.address_lo |
1455
+ (u64)msi_desc->msg.address_hi << 32;
1456
+ int_desc->data = msi_desc->msg.data +
1457
+ (data->irq - msi_desc->irq);
1458
+ msg->address_hi = msi_desc->msg.address_hi;
1459
+ msg->address_lo = msi_desc->msg.address_lo;
1460
+ msg->data = int_desc->data;
1461
+ put_pcichild(hpdev);
1462
+ return;
1463
+ }
1464
+ /*
1465
+ * The vector we select here is a dummy value. The correct
1466
+ * value gets sent to the hypervisor in unmask(). This needs
1467
+ * to be aligned with the count, and also not zero. Multi-msi
1468
+ * is powers of 2 up to 32, so 32 will always work here.
1469
+ */
1470
+ vector = 32;
1471
+ vector_count = msi_desc->nvec_used;
1472
+ } else {
1473
+ vector = hv_msi_get_int_vector(data);
1474
+ vector_count = 1;
1475
+ }
11131476
11141477 memset(&ctxt, 0, sizeof(ctxt));
11151478 init_completion(&comp.comp_pkt.host_event);
11161479 ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
11171480 ctxt.pci_pkt.compl_ctxt = &comp;
11181481
1119
- switch (pci_protocol_version) {
1482
+ switch (hbus->protocol_version) {
11201483 case PCI_PROTOCOL_VERSION_1_1:
11211484 size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
11221485 dest,
11231486 hpdev->desc.win_slot.slot,
1124
- cfg->vector);
1487
+ vector,
1488
+ vector_count);
11251489 break;
11261490
11271491 case PCI_PROTOCOL_VERSION_1_2:
1492
+ case PCI_PROTOCOL_VERSION_1_3:
11281493 size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
11291494 dest,
11301495 hpdev->desc.win_slot.slot,
1131
- cfg->vector);
1496
+ vector,
1497
+ vector_count);
11321498 break;
11331499
11341500 default:
....@@ -1153,41 +1519,50 @@
11531519 }
11541520
11551521 /*
1522
+ * Prevents hv_pci_onchannelcallback() from running concurrently
1523
+ * in the tasklet.
1524
+ */
1525
+ tasklet_disable(&channel->callback_event);
1526
+
1527
+ /*
11561528 * Since this function is called with IRQ locks held, can't
11571529 * do normal wait for completion; instead poll.
11581530 */
11591531 while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
1532
+ unsigned long flags;
1533
+
11601534 /* 0xFFFF means an invalid PCI VENDOR ID. */
11611535 if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
11621536 dev_err_once(&hbus->hdev->device,
11631537 "the device has gone\n");
1164
- goto free_int_desc;
1538
+ goto enable_tasklet;
11651539 }
11661540
11671541 /*
1168
- * When the higher level interrupt code calls us with
1169
- * interrupt disabled, we must poll the channel by calling
1170
- * the channel callback directly when channel->target_cpu is
1171
- * the current CPU. When the higher level interrupt code
1172
- * calls us with interrupt enabled, let's add the
1173
- * local_irq_save()/restore() to avoid race:
1174
- * hv_pci_onchannelcallback() can also run in tasklet.
1542
+ * Make sure that the ring buffer data structure doesn't get
1543
+ * freed while we dereference the ring buffer pointer. Test
1544
+ * for the channel's onchannel_callback being NULL within a
1545
+ * sched_lock critical section. See also the inline comments
1546
+ * in vmbus_reset_channel_cb().
11751547 */
1176
- local_irq_save(flags);
1177
-
1178
- if (hbus->hdev->channel->target_cpu == smp_processor_id())
1179
- hv_pci_onchannelcallback(hbus);
1180
-
1181
- local_irq_restore(flags);
1548
+ spin_lock_irqsave(&channel->sched_lock, flags);
1549
+ if (unlikely(channel->onchannel_callback == NULL)) {
1550
+ spin_unlock_irqrestore(&channel->sched_lock, flags);
1551
+ goto enable_tasklet;
1552
+ }
1553
+ hv_pci_onchannelcallback(hbus);
1554
+ spin_unlock_irqrestore(&channel->sched_lock, flags);
11821555
11831556 if (hpdev->state == hv_pcichild_ejecting) {
11841557 dev_err_once(&hbus->hdev->device,
11851558 "the device is being ejected\n");
1186
- goto free_int_desc;
1559
+ goto enable_tasklet;
11871560 }
11881561
11891562 udelay(100);
11901563 }
1564
+
1565
+ tasklet_enable(&channel->callback_event);
11911566
11921567 if (comp.comp_pkt.completion_status < 0) {
11931568 dev_err(&hbus->hdev->device,
....@@ -1212,6 +1587,8 @@
12121587 put_pcichild(hpdev);
12131588 return;
12141589
1590
+enable_tasklet:
1591
+ tasklet_enable(&channel->callback_event);
12151592 free_int_desc:
12161593 kfree(int_desc);
12171594 drop_reference:
....@@ -1232,16 +1609,8 @@
12321609 .irq_unmask = hv_irq_unmask,
12331610 };
12341611
1235
-static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
1236
- msi_alloc_info_t *arg)
1237
-{
1238
- return arg->msi_hwirq;
1239
-}
1240
-
12411612 static struct msi_domain_ops hv_msi_ops = {
1242
- .get_hwirq = hv_msi_domain_ops_get_hwirq,
1243
- .msi_prepare = pci_msi_prepare,
1244
- .set_desc = pci_msi_set_desc,
1613
+ .msi_prepare = hv_msi_prepare,
12451614 .msi_free = hv_msi_free,
12461615 };
12471616
....@@ -1332,7 +1701,7 @@
13321701 * so it's sufficient to just add them up without tracking alignment.
13331702 */
13341703 list_for_each_entry(hpdev, &hbus->children, list_entry) {
1335
- for (i = 0; i < 6; i++) {
1704
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
13361705 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
13371706 dev_err(&hbus->hdev->device,
13381707 "There's an I/O BAR in this list!\n");
....@@ -1403,10 +1772,27 @@
14031772
14041773 spin_lock_irqsave(&hbus->device_list_lock, flags);
14051774
1775
+ /*
1776
+ * Clear the memory enable bit, in case it's already set. This occurs
1777
+ * in the suspend path of hibernation, where the device is suspended,
1778
+ * resumed and suspended again: see hibernation_snapshot() and
1779
+ * hibernation_platform_enter().
1780
+ *
1781
+ * If the memory enable bit is already set, Hyper-V silently ignores
1782
+ * the below BAR updates, and the related PCI device driver can not
1783
+ * work, because reading from the device register(s) always returns
1784
+ * 0xFFFFFFFF.
1785
+ */
1786
+ list_for_each_entry(hpdev, &hbus->children, list_entry) {
1787
+ _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command);
1788
+ command &= ~PCI_COMMAND_MEMORY;
1789
+ _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command);
1790
+ }
1791
+
14061792 /* Pick addresses for the BARs. */
14071793 do {
14081794 list_for_each_entry(hpdev, &hbus->children, list_entry) {
1409
- for (i = 0; i < 6; i++) {
1795
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
14101796 bar_val = hpdev->probed_bar[i];
14111797 if (bar_val == 0)
14121798 continue;
....@@ -1506,6 +1892,36 @@
15061892 }
15071893 }
15081894
1895
+/*
1896
+ * Set NUMA node for the devices on the bus
1897
+ */
1898
+static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus)
1899
+{
1900
+ struct pci_dev *dev;
1901
+ struct pci_bus *bus = hbus->pci_bus;
1902
+ struct hv_pci_dev *hv_dev;
1903
+
1904
+ list_for_each_entry(dev, &bus->devices, bus_list) {
1905
+ hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn));
1906
+ if (!hv_dev)
1907
+ continue;
1908
+
1909
+ if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY &&
1910
+ hv_dev->desc.virtual_numa_node < num_possible_nodes())
1911
+ /*
1912
+ * The kernel may boot with some NUMA nodes offline
1913
+ * (e.g. in a KDUMP kernel) or with NUMA disabled via
1914
+ * "numa=off". In those cases, adjust the host provided
1915
+ * NUMA node to a valid NUMA node used by the kernel.
1916
+ */
1917
+ set_dev_node(&dev->dev,
1918
+ numa_map_to_online_node(
1919
+ hv_dev->desc.virtual_numa_node));
1920
+
1921
+ put_pcichild(hv_dev);
1922
+ }
1923
+}
1924
+
15091925 /**
15101926 * create_root_hv_pci_bus() - Expose a new root PCI bus
15111927 * @hbus: Root PCI bus, as understood by this driver
....@@ -1528,6 +1944,7 @@
15281944
15291945 pci_lock_rescan_remove();
15301946 pci_scan_child_bus(hbus->pci_bus);
1947
+ hv_pci_assign_numa_node(hbus);
15311948 pci_bus_assign_resources(hbus->pci_bus);
15321949 hv_pci_assign_slots(hbus);
15331950 pci_bus_add_devices(hbus->pci_bus);
....@@ -1563,7 +1980,7 @@
15631980 "query resource requirements failed: %x\n",
15641981 resp->status);
15651982 } else {
1566
- for (i = 0; i < 6; i++) {
1983
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
15671984 completion->hpdev->probed_bar[i] =
15681985 q_res_req->probed_bar[i];
15691986 }
....@@ -1584,7 +2001,7 @@
15842001 * Return: Pointer to the new tracking struct
15852002 */
15862003 static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
1587
- struct pci_function_description *desc)
2004
+ struct hv_pcidev_description *desc)
15882005 {
15892006 struct hv_pci_dev *hpdev;
15902007 struct pci_child_message *res_req;
....@@ -1695,7 +2112,7 @@
16952112 {
16962113 u32 child_no;
16972114 bool found;
1698
- struct pci_function_description *new_desc;
2115
+ struct hv_pcidev_description *new_desc;
16992116 struct hv_pci_dev *hpdev;
17002117 struct hv_pcibus_device *hbus;
17012118 struct list_head removed;
....@@ -1796,6 +2213,7 @@
17962213 */
17972214 pci_lock_rescan_remove();
17982215 pci_scan_child_bus(hbus->pci_bus);
2216
+ hv_pci_assign_numa_node(hbus);
17992217 hv_pci_assign_slots(hbus);
18002218 pci_unlock_rescan_remove();
18012219 break;
....@@ -1814,41 +2232,31 @@
18142232 }
18152233
18162234 /**
1817
- * hv_pci_devices_present() - Handles list of new children
2235
+ * hv_pci_start_relations_work() - Queue work to start device discovery
18182236 * @hbus: Root PCI bus, as understood by this driver
1819
- * @relations: Packet from host listing children
2237
+ * @dr: The list of children returned from host
18202238 *
1821
- * This function is invoked whenever a new list of devices for
1822
- * this bus appears.
2239
+ * Return: 0 on success, -errno on failure
18232240 */
1824
-static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
1825
- struct pci_bus_relations *relations)
2241
+static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus,
2242
+ struct hv_dr_state *dr)
18262243 {
1827
- struct hv_dr_state *dr;
18282244 struct hv_dr_work *dr_wrk;
18292245 unsigned long flags;
18302246 bool pending_dr;
18312247
2248
+ if (hbus->state == hv_pcibus_removing) {
2249
+ dev_info(&hbus->hdev->device,
2250
+ "PCI VMBus BUS_RELATIONS: ignored\n");
2251
+ return -ENOENT;
2252
+ }
2253
+
18322254 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
18332255 if (!dr_wrk)
1834
- return;
1835
-
1836
- dr = kzalloc(offsetof(struct hv_dr_state, func) +
1837
- (sizeof(struct pci_function_description) *
1838
- (relations->device_count)), GFP_NOWAIT);
1839
- if (!dr) {
1840
- kfree(dr_wrk);
1841
- return;
1842
- }
2256
+ return -ENOMEM;
18432257
18442258 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
18452259 dr_wrk->bus = hbus;
1846
- dr->device_count = relations->device_count;
1847
- if (dr->device_count != 0) {
1848
- memcpy(dr->func, relations->func,
1849
- sizeof(struct pci_function_description) *
1850
- dr->device_count);
1851
- }
18522260
18532261 spin_lock_irqsave(&hbus->device_list_lock, flags);
18542262 /*
....@@ -1866,6 +2274,83 @@
18662274 get_hvpcibus(hbus);
18672275 queue_work(hbus->wq, &dr_wrk->wrk);
18682276 }
2277
+
2278
+ return 0;
2279
+}
2280
+
2281
+/**
2282
+ * hv_pci_devices_present() - Handle list of new children
2283
+ * @hbus: Root PCI bus, as understood by this driver
2284
+ * @relations: Packet from host listing children
2285
+ *
2286
+ * Process a new list of devices on the bus. The list of devices is
2287
+ * discovered by VSP and sent to us via VSP message PCI_BUS_RELATIONS,
2288
+ * whenever a new list of devices for this bus appears.
2289
+ */
2290
+static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
2291
+ struct pci_bus_relations *relations)
2292
+{
2293
+ struct hv_dr_state *dr;
2294
+ int i;
2295
+
2296
+ dr = kzalloc(struct_size(dr, func, relations->device_count),
2297
+ GFP_NOWAIT);
2298
+ if (!dr)
2299
+ return;
2300
+
2301
+ dr->device_count = relations->device_count;
2302
+ for (i = 0; i < dr->device_count; i++) {
2303
+ dr->func[i].v_id = relations->func[i].v_id;
2304
+ dr->func[i].d_id = relations->func[i].d_id;
2305
+ dr->func[i].rev = relations->func[i].rev;
2306
+ dr->func[i].prog_intf = relations->func[i].prog_intf;
2307
+ dr->func[i].subclass = relations->func[i].subclass;
2308
+ dr->func[i].base_class = relations->func[i].base_class;
2309
+ dr->func[i].subsystem_id = relations->func[i].subsystem_id;
2310
+ dr->func[i].win_slot = relations->func[i].win_slot;
2311
+ dr->func[i].ser = relations->func[i].ser;
2312
+ }
2313
+
2314
+ if (hv_pci_start_relations_work(hbus, dr))
2315
+ kfree(dr);
2316
+}
2317
+
2318
+/**
2319
+ * hv_pci_devices_present2() - Handle list of new children
2320
+ * @hbus: Root PCI bus, as understood by this driver
2321
+ * @relations: Packet from host listing children
2322
+ *
2323
+ * This function is the v2 version of hv_pci_devices_present()
2324
+ */
2325
+static void hv_pci_devices_present2(struct hv_pcibus_device *hbus,
2326
+ struct pci_bus_relations2 *relations)
2327
+{
2328
+ struct hv_dr_state *dr;
2329
+ int i;
2330
+
2331
+ dr = kzalloc(struct_size(dr, func, relations->device_count),
2332
+ GFP_NOWAIT);
2333
+ if (!dr)
2334
+ return;
2335
+
2336
+ dr->device_count = relations->device_count;
2337
+ for (i = 0; i < dr->device_count; i++) {
2338
+ dr->func[i].v_id = relations->func[i].v_id;
2339
+ dr->func[i].d_id = relations->func[i].d_id;
2340
+ dr->func[i].rev = relations->func[i].rev;
2341
+ dr->func[i].prog_intf = relations->func[i].prog_intf;
2342
+ dr->func[i].subclass = relations->func[i].subclass;
2343
+ dr->func[i].base_class = relations->func[i].base_class;
2344
+ dr->func[i].subsystem_id = relations->func[i].subsystem_id;
2345
+ dr->func[i].win_slot = relations->func[i].win_slot;
2346
+ dr->func[i].ser = relations->func[i].ser;
2347
+ dr->func[i].flags = relations->func[i].flags;
2348
+ dr->func[i].virtual_numa_node =
2349
+ relations->func[i].virtual_numa_node;
2350
+ }
2351
+
2352
+ if (hv_pci_start_relations_work(hbus, dr))
2353
+ kfree(dr);
18692354 }
18702355
18712356 /**
....@@ -1945,11 +2430,19 @@
19452430 */
19462431 static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
19472432 {
2433
+ struct hv_pcibus_device *hbus = hpdev->hbus;
2434
+ struct hv_device *hdev = hbus->hdev;
2435
+
2436
+ if (hbus->state == hv_pcibus_removing) {
2437
+ dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n");
2438
+ return;
2439
+ }
2440
+
19482441 hpdev->state = hv_pcichild_ejecting;
19492442 get_pcichild(hpdev);
19502443 INIT_WORK(&hpdev->wrk, hv_eject_device_work);
1951
- get_hvpcibus(hpdev->hbus);
1952
- queue_work(hpdev->hbus->wq, &hpdev->wrk);
2444
+ get_hvpcibus(hbus);
2445
+ queue_work(hbus->wq, &hpdev->wrk);
19532446 }
19542447
19552448 /**
....@@ -1973,6 +2466,8 @@
19732466 struct pci_response *response;
19742467 struct pci_incoming_message *new_message;
19752468 struct pci_bus_relations *bus_rel;
2469
+ struct pci_bus_relations2 *bus_rel2;
2470
+ struct pci_dev_inval_block *inval;
19762471 struct pci_dev_incoming *dev_message;
19772472 struct hv_pci_dev *hpdev;
19782473
....@@ -2028,15 +2523,28 @@
20282523
20292524 bus_rel = (struct pci_bus_relations *)buffer;
20302525 if (bytes_recvd <
2031
- offsetof(struct pci_bus_relations, func) +
2032
- (sizeof(struct pci_function_description) *
2033
- (bus_rel->device_count))) {
2526
+ struct_size(bus_rel, func,
2527
+ bus_rel->device_count)) {
20342528 dev_err(&hbus->hdev->device,
20352529 "bus relations too small\n");
20362530 break;
20372531 }
20382532
20392533 hv_pci_devices_present(hbus, bus_rel);
2534
+ break;
2535
+
2536
+ case PCI_BUS_RELATIONS2:
2537
+
2538
+ bus_rel2 = (struct pci_bus_relations2 *)buffer;
2539
+ if (bytes_recvd <
2540
+ struct_size(bus_rel2, func,
2541
+ bus_rel2->device_count)) {
2542
+ dev_err(&hbus->hdev->device,
2543
+ "bus relations v2 too small\n");
2544
+ break;
2545
+ }
2546
+
2547
+ hv_pci_devices_present2(hbus, bus_rel2);
20402548 break;
20412549
20422550 case PCI_EJECT:
....@@ -2046,6 +2554,21 @@
20462554 dev_message->wslot.slot);
20472555 if (hpdev) {
20482556 hv_pci_eject_device(hpdev);
2557
+ put_pcichild(hpdev);
2558
+ }
2559
+ break;
2560
+
2561
+ case PCI_INVALIDATE_BLOCK:
2562
+
2563
+ inval = (struct pci_dev_inval_block *)buffer;
2564
+ hpdev = get_pcichild_wslot(hbus,
2565
+ inval->wslot.slot);
2566
+ if (hpdev) {
2567
+ if (hpdev->block_invalidate) {
2568
+ hpdev->block_invalidate(
2569
+ hpdev->invalidate_context,
2570
+ inval->block_mask);
2571
+ }
20492572 put_pcichild(hpdev);
20502573 }
20512574 break;
....@@ -2071,7 +2594,10 @@
20712594
20722595 /**
20732596 * hv_pci_protocol_negotiation() - Set up protocol
2074
- * @hdev: VMBus's tracking struct for this root PCI bus
2597
+ * @hdev: VMBus's tracking struct for this root PCI bus.
2598
+ * @version: Array of supported channel protocol versions in
2599
+ * the order of probing - highest go first.
2600
+ * @num_version: Number of elements in the version array.
20752601 *
20762602 * This driver is intended to support running on Windows 10
20772603 * (server) and later versions. It will not run on earlier
....@@ -2085,8 +2611,11 @@
20852611 * failing if the host doesn't support the necessary protocol
20862612 * level.
20872613 */
2088
-static int hv_pci_protocol_negotiation(struct hv_device *hdev)
2614
+static int hv_pci_protocol_negotiation(struct hv_device *hdev,
2615
+ enum pci_protocol_version_t version[],
2616
+ int num_version)
20892617 {
2618
+ struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
20902619 struct pci_version_request *version_req;
20912620 struct hv_pci_compl comp_pkt;
20922621 struct pci_packet *pkt;
....@@ -2109,8 +2638,8 @@
21092638 version_req = (struct pci_version_request *)&pkt->message;
21102639 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
21112640
2112
- for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) {
2113
- version_req->protocol_version = pci_protocol_versions[i];
2641
+ for (i = 0; i < num_version; i++) {
2642
+ version_req->protocol_version = version[i];
21142643 ret = vmbus_sendpacket(hdev->channel, version_req,
21152644 sizeof(struct pci_version_request),
21162645 (unsigned long)pkt, VM_PKT_DATA_INBAND,
....@@ -2126,10 +2655,10 @@
21262655 }
21272656
21282657 if (comp_pkt.completion_status >= 0) {
2129
- pci_protocol_version = pci_protocol_versions[i];
2658
+ hbus->protocol_version = version[i];
21302659 dev_info(&hdev->device,
21312660 "PCI VMBus probing: Using version %#x\n",
2132
- pci_protocol_version);
2661
+ hbus->protocol_version);
21332662 goto exit;
21342663 }
21352664
....@@ -2299,6 +2828,8 @@
22992828 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
23002829 }
23012830
2831
+static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs);
2832
+
23022833 /**
23032834 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
23042835 * @hdev: VMBus's tracking struct for this root PCI bus
....@@ -2410,10 +2941,10 @@
24102941 struct hv_pci_dev *hpdev;
24112942 struct pci_packet *pkt;
24122943 size_t size_res;
2413
- u32 wslot;
2944
+ int wslot;
24142945 int ret;
24152946
2416
- size_res = (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2)
2947
+ size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2)
24172948 ? sizeof(*res_assigned) : sizeof(*res_assigned2);
24182949
24192950 pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
....@@ -2432,7 +2963,7 @@
24322963 pkt->completion_func = hv_pci_generic_compl;
24332964 pkt->compl_ctxt = &comp_pkt;
24342965
2435
- if (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2) {
2966
+ if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) {
24362967 res_assigned =
24372968 (struct pci_resources_assigned *)&pkt->message;
24382969 res_assigned->message_type.type =
....@@ -2463,6 +2994,8 @@
24632994 comp_pkt.completion_status);
24642995 break;
24652996 }
2997
+
2998
+ hbus->wslot_res_allocated = wslot;
24662999 }
24673000
24683001 kfree(pkt);
....@@ -2481,10 +3014,10 @@
24813014 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
24823015 struct pci_child_message pkt;
24833016 struct hv_pci_dev *hpdev;
2484
- u32 wslot;
3017
+ int wslot;
24853018 int ret;
24863019
2487
- for (wslot = 0; wslot < 256; wslot++) {
3020
+ for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) {
24883021 hpdev = get_pcichild_wslot(hbus, wslot);
24893022 if (!hpdev)
24903023 continue;
....@@ -2499,7 +3032,11 @@
24993032 VM_PKT_DATA_INBAND, 0);
25003033 if (ret)
25013034 return ret;
3035
+
3036
+ hbus->wslot_res_allocated = wslot - 1;
25023037 }
3038
+
3039
+ hbus->wslot_res_allocated = -1;
25033040
25043041 return 0;
25053042 }
....@@ -2515,6 +3052,48 @@
25153052 complete(&hbus->remove_event);
25163053 }
25173054
3055
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
3056
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
3057
+
3058
+/*
3059
+ * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
3060
+ * as invalid for passthrough PCI devices of this driver.
3061
+ */
3062
+#define HVPCI_DOM_INVALID 0
3063
+
3064
+/**
3065
+ * hv_get_dom_num() - Get a valid PCI domain number
3066
+ * Check if the PCI domain number is in use, and return another number if
3067
+ * it is in use.
3068
+ *
3069
+ * @dom: Requested domain number
3070
+ *
3071
+ * return: domain number on success, HVPCI_DOM_INVALID on failure
3072
+ */
3073
+static u16 hv_get_dom_num(u16 dom)
3074
+{
3075
+ unsigned int i;
3076
+
3077
+ if (test_and_set_bit(dom, hvpci_dom_map) == 0)
3078
+ return dom;
3079
+
3080
+ for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
3081
+ if (test_and_set_bit(i, hvpci_dom_map) == 0)
3082
+ return i;
3083
+ }
3084
+
3085
+ return HVPCI_DOM_INVALID;
3086
+}
3087
+
3088
+/**
3089
+ * hv_put_dom_num() - Mark the PCI domain number as free
3090
+ * @dom: Domain number to be freed
3091
+ */
3092
+static void hv_put_dom_num(u16 dom)
3093
+{
3094
+ clear_bit(dom, hvpci_dom_map);
3095
+}
3096
+
25183097 /**
25193098 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
25203099 * @hdev: VMBus's tracking struct for this root PCI bus
....@@ -2526,33 +3105,70 @@
25263105 const struct hv_vmbus_device_id *dev_id)
25273106 {
25283107 struct hv_pcibus_device *hbus;
3108
+ u16 dom_req, dom;
3109
+ char *name;
3110
+ bool enter_d0_retry = true;
25293111 int ret;
25303112
25313113 /*
25323114 * hv_pcibus_device contains the hypercall arguments for retargeting in
25333115 * hv_irq_unmask(). Those must not cross a page boundary.
25343116 */
2535
- BUILD_BUG_ON(sizeof(*hbus) > PAGE_SIZE);
3117
+ BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE);
25363118
2537
- hbus = (struct hv_pcibus_device *)get_zeroed_page(GFP_KERNEL);
3119
+ /*
3120
+ * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural
3121
+ * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate
3122
+ * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and
3123
+ * alignment of hbus is important because hbus's field
3124
+ * retarget_msi_interrupt_params must not cross a 4KB page boundary.
3125
+ *
3126
+ * Here we prefer kzalloc to get_zeroed_page(), because a buffer
3127
+ * allocated by the latter is not tracked and scanned by kmemleak, and
3128
+ * hence kmemleak reports the pointer contained in the hbus buffer
3129
+ * (i.e. the hpdev struct, which is created in new_pcichild_device() and
3130
+ * is tracked by hbus->children) as memory leak (false positive).
3131
+ *
3132
+ * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be
3133
+ * used to allocate the hbus buffer and we can avoid the kmemleak false
3134
+ * positive by using kmemleak_alloc() and kmemleak_free() to ask
3135
+ * kmemleak to track and scan the hbus buffer.
3136
+ */
3137
+ hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
25383138 if (!hbus)
25393139 return -ENOMEM;
25403140 hbus->state = hv_pcibus_init;
3141
+ hbus->wslot_res_allocated = -1;
25413142
25423143 /*
2543
- * The PCI bus "domain" is what is called "segment" in ACPI and
2544
- * other specs. Pull it from the instance ID, to get something
2545
- * unique. Bytes 8 and 9 are what is used in Windows guests, so
2546
- * do the same thing for consistency. Note that, since this code
2547
- * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
2548
- * that (1) the only domain in use for something that looks like
2549
- * a physical PCI bus (which is actually emulated by the
2550
- * hypervisor) is domain 0 and (2) there will be no overlap
2551
- * between domains derived from these instance IDs in the same
2552
- * VM.
3144
+ * The PCI bus "domain" is what is called "segment" in ACPI and other
3145
+ * specs. Pull it from the instance ID, to get something usually
3146
+ * unique. In rare cases of collision, we will find out another number
3147
+ * not in use.
3148
+ *
3149
+ * Note that, since this code only runs in a Hyper-V VM, Hyper-V
3150
+ * together with this guest driver can guarantee that (1) The only
3151
+ * domain used by Gen1 VMs for something that looks like a physical
3152
+ * PCI bus (which is actually emulated by the hypervisor) is domain 0.
3153
+ * (2) There will be no overlap between domains (after fixing possible
3154
+ * collisions) in the same VM.
25533155 */
2554
- hbus->sysdata.domain = hdev->dev_instance.b[9] |
2555
- hdev->dev_instance.b[8] << 8;
3156
+ dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
3157
+ dom = hv_get_dom_num(dom_req);
3158
+
3159
+ if (dom == HVPCI_DOM_INVALID) {
3160
+ dev_err(&hdev->device,
3161
+ "Unable to use dom# 0x%hx or other numbers", dom_req);
3162
+ ret = -EINVAL;
3163
+ goto free_bus;
3164
+ }
3165
+
3166
+ if (dom != dom_req)
3167
+ dev_info(&hdev->device,
3168
+ "PCI dom# 0x%hx has collision, using 0x%hx",
3169
+ dom_req, dom);
3170
+
3171
+ hbus->sysdata.domain = dom;
25563172
25573173 hbus->hdev = hdev;
25583174 refcount_set(&hbus->remove_lock, 1);
....@@ -2567,7 +3183,7 @@
25673183 hbus->sysdata.domain);
25683184 if (!hbus->wq) {
25693185 ret = -ENOMEM;
2570
- goto free_bus;
3186
+ goto free_dom;
25713187 }
25723188
25733189 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
....@@ -2577,7 +3193,8 @@
25773193
25783194 hv_set_drvdata(hdev, hbus);
25793195
2580
- ret = hv_pci_protocol_negotiation(hdev);
3196
+ ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions,
3197
+ ARRAY_SIZE(pci_protocol_versions));
25813198 if (ret)
25823199 goto close;
25833200
....@@ -2594,7 +3211,14 @@
25943211 goto free_config;
25953212 }
25963213
2597
- hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus);
3214
+ name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance);
3215
+ if (!name) {
3216
+ ret = -ENOMEM;
3217
+ goto unmap;
3218
+ }
3219
+
3220
+ hbus->sysdata.fwnode = irq_domain_alloc_named_fwnode(name);
3221
+ kfree(name);
25983222 if (!hbus->sysdata.fwnode) {
25993223 ret = -ENOMEM;
26003224 goto unmap;
....@@ -2604,17 +3228,53 @@
26043228 if (ret)
26053229 goto free_fwnode;
26063230
3231
+retry:
26073232 ret = hv_pci_query_relations(hdev);
26083233 if (ret)
26093234 goto free_irq_domain;
26103235
26113236 ret = hv_pci_enter_d0(hdev);
3237
+ /*
3238
+ * In certain case (Kdump) the pci device of interest was
3239
+ * not cleanly shut down and resource is still held on host
3240
+ * side, the host could return invalid device status.
3241
+ * We need to explicitly request host to release the resource
3242
+ * and try to enter D0 again.
3243
+ * Since the hv_pci_bus_exit() call releases structures
3244
+ * of all its child devices, we need to start the retry from
3245
+ * hv_pci_query_relations() call, requesting host to send
3246
+ * the synchronous child device relations message before this
3247
+ * information is needed in hv_send_resources_allocated()
3248
+ * call later.
3249
+ */
3250
+ if (ret == -EPROTO && enter_d0_retry) {
3251
+ enter_d0_retry = false;
3252
+
3253
+ dev_err(&hdev->device, "Retrying D0 Entry\n");
3254
+
3255
+ /*
3256
+ * Hv_pci_bus_exit() calls hv_send_resources_released()
3257
+ * to free up resources of its child devices.
3258
+ * In the kdump kernel we need to set the
3259
+ * wslot_res_allocated to 255 so it scans all child
3260
+ * devices to release resources allocated in the
3261
+ * normal kernel before panic happened.
3262
+ */
3263
+ hbus->wslot_res_allocated = 255;
3264
+ ret = hv_pci_bus_exit(hdev, true);
3265
+
3266
+ if (ret == 0)
3267
+ goto retry;
3268
+
3269
+ dev_err(&hdev->device,
3270
+ "Retrying D0 failed with ret %d\n", ret);
3271
+ }
26123272 if (ret)
26133273 goto free_irq_domain;
26143274
26153275 ret = hv_pci_allocate_bridge_windows(hbus);
26163276 if (ret)
2617
- goto free_irq_domain;
3277
+ goto exit_d0;
26183278
26193279 ret = hv_send_resources_allocated(hdev);
26203280 if (ret)
....@@ -2632,6 +3292,8 @@
26323292
26333293 free_windows:
26343294 hv_pci_free_bridge_windows(hbus);
3295
+exit_d0:
3296
+ (void) hv_pci_bus_exit(hdev, true);
26353297 free_irq_domain:
26363298 irq_domain_remove(hbus->irq_domain);
26373299 free_fwnode:
....@@ -2644,20 +3306,23 @@
26443306 vmbus_close(hdev->channel);
26453307 destroy_wq:
26463308 destroy_workqueue(hbus->wq);
3309
+free_dom:
3310
+ hv_put_dom_num(hbus->sysdata.domain);
26473311 free_bus:
2648
- free_page((unsigned long)hbus);
3312
+ kfree(hbus);
26493313 return ret;
26503314 }
26513315
2652
-static void hv_pci_bus_exit(struct hv_device *hdev)
3316
+static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs)
26533317 {
26543318 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
26553319 struct {
26563320 struct pci_packet teardown_packet;
26573321 u8 buffer[sizeof(struct pci_message)];
26583322 } pkt;
2659
- struct pci_bus_relations relations;
26603323 struct hv_pci_compl comp_pkt;
3324
+ struct hv_pci_dev *hpdev, *tmp;
3325
+ unsigned long flags;
26613326 int ret;
26623327
26633328 /*
....@@ -2665,16 +3330,35 @@
26653330 * access the per-channel ringbuffer any longer.
26663331 */
26673332 if (hdev->channel->rescind)
2668
- return;
3333
+ return 0;
26693334
2670
- /* Delete any children which might still exist. */
2671
- memset(&relations, 0, sizeof(relations));
2672
- hv_pci_devices_present(hbus, &relations);
3335
+ if (!keep_devs) {
3336
+ struct list_head removed;
3337
+
3338
+ /* Move all present children to the list on stack */
3339
+ INIT_LIST_HEAD(&removed);
3340
+ spin_lock_irqsave(&hbus->device_list_lock, flags);
3341
+ list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry)
3342
+ list_move_tail(&hpdev->list_entry, &removed);
3343
+ spin_unlock_irqrestore(&hbus->device_list_lock, flags);
3344
+
3345
+ /* Remove all children in the list */
3346
+ list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) {
3347
+ list_del(&hpdev->list_entry);
3348
+ if (hpdev->pci_slot)
3349
+ pci_destroy_slot(hpdev->pci_slot);
3350
+ /* For the two refs got in new_pcichild_device() */
3351
+ put_pcichild(hpdev);
3352
+ put_pcichild(hpdev);
3353
+ }
3354
+ }
26733355
26743356 ret = hv_send_resources_released(hdev);
2675
- if (ret)
3357
+ if (ret) {
26763358 dev_err(&hdev->device,
26773359 "Couldn't send resources released packet(s)\n");
3360
+ return ret;
3361
+ }
26783362
26793363 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
26803364 init_completion(&comp_pkt.host_event);
....@@ -2687,8 +3371,13 @@
26873371 (unsigned long)&pkt.teardown_packet,
26883372 VM_PKT_DATA_INBAND,
26893373 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2690
- if (!ret)
2691
- wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ);
3374
+ if (ret)
3375
+ return ret;
3376
+
3377
+ if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0)
3378
+ return -ETIMEDOUT;
3379
+
3380
+ return 0;
26923381 }
26933382
26943383 /**
....@@ -2700,19 +3389,30 @@
27003389 static int hv_pci_remove(struct hv_device *hdev)
27013390 {
27023391 struct hv_pcibus_device *hbus;
3392
+ int ret;
27033393
27043394 hbus = hv_get_drvdata(hdev);
27053395 if (hbus->state == hv_pcibus_installed) {
3396
+ tasklet_disable(&hdev->channel->callback_event);
3397
+ hbus->state = hv_pcibus_removing;
3398
+ tasklet_enable(&hdev->channel->callback_event);
3399
+ destroy_workqueue(hbus->wq);
3400
+ hbus->wq = NULL;
3401
+ /*
3402
+ * At this point, no work is running or can be scheduled
3403
 + * on hbus->wq. We can't race with hv_pci_devices_present()
3404
+ * or hv_pci_eject_device(), it's safe to proceed.
3405
+ */
3406
+
27063407 /* Remove the bus from PCI's point of view. */
27073408 pci_lock_rescan_remove();
27083409 pci_stop_root_bus(hbus->pci_bus);
27093410 hv_pci_remove_slots(hbus);
27103411 pci_remove_root_bus(hbus->pci_bus);
27113412 pci_unlock_rescan_remove();
2712
- hbus->state = hv_pcibus_removed;
27133413 }
27143414
2715
- hv_pci_bus_exit(hdev);
3415
+ ret = hv_pci_bus_exit(hdev, false);
27163416
27173417 vmbus_close(hdev->channel);
27183418
....@@ -2724,9 +3424,128 @@
27243424 irq_domain_free_fwnode(hbus->sysdata.fwnode);
27253425 put_hvpcibus(hbus);
27263426 wait_for_completion(&hbus->remove_event);
2727
- destroy_workqueue(hbus->wq);
2728
- free_page((unsigned long)hbus);
3427
+
3428
+ hv_put_dom_num(hbus->sysdata.domain);
3429
+
3430
+ kfree(hbus);
3431
+ return ret;
3432
+}
3433
+
3434
+static int hv_pci_suspend(struct hv_device *hdev)
3435
+{
3436
+ struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3437
+ enum hv_pcibus_state old_state;
3438
+ int ret;
3439
+
3440
+ /*
3441
+ * hv_pci_suspend() must make sure there are no pending work items
3442
+ * before calling vmbus_close(), since it runs in a process context
3443
+ * as a callback in dpm_suspend(). When it starts to run, the channel
3444
+ * callback hv_pci_onchannelcallback(), which runs in a tasklet
3445
+ * context, can be still running concurrently and scheduling new work
3446
+ * items onto hbus->wq in hv_pci_devices_present() and
3447
+ * hv_pci_eject_device(), and the work item handlers can access the
3448
+ * vmbus channel, which can be being closed by hv_pci_suspend(), e.g.
3449
+ * the work item handler pci_devices_present_work() ->
3450
+ * new_pcichild_device() writes to the vmbus channel.
3451
+ *
3452
+ * To eliminate the race, hv_pci_suspend() disables the channel
3453
+ * callback tasklet, sets hbus->state to hv_pcibus_removing, and
3454
+ * re-enables the tasklet. This way, when hv_pci_suspend() proceeds,
3455
+ * it knows that no new work item can be scheduled, and then it flushes
3456
+ * hbus->wq and safely closes the vmbus channel.
3457
+ */
3458
+ tasklet_disable(&hdev->channel->callback_event);
3459
+
3460
+ /* Change the hbus state to prevent new work items. */
3461
+ old_state = hbus->state;
3462
+ if (hbus->state == hv_pcibus_installed)
3463
+ hbus->state = hv_pcibus_removing;
3464
+
3465
+ tasklet_enable(&hdev->channel->callback_event);
3466
+
3467
+ if (old_state != hv_pcibus_installed)
3468
+ return -EINVAL;
3469
+
3470
+ flush_workqueue(hbus->wq);
3471
+
3472
+ ret = hv_pci_bus_exit(hdev, true);
3473
+ if (ret)
3474
+ return ret;
3475
+
3476
+ vmbus_close(hdev->channel);
3477
+
27293478 return 0;
3479
+}
3480
+
3481
+static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg)
3482
+{
3483
+ struct msi_desc *entry;
3484
+ struct irq_data *irq_data;
3485
+
3486
+ for_each_pci_msi_entry(entry, pdev) {
3487
+ irq_data = irq_get_irq_data(entry->irq);
3488
+ if (WARN_ON_ONCE(!irq_data))
3489
+ return -EINVAL;
3490
+
3491
+ hv_compose_msi_msg(irq_data, &entry->msg);
3492
+ }
3493
+
3494
+ return 0;
3495
+}
3496
+
3497
+/*
3498
+ * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg()
3499
+ * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V
3500
+ * doesn't trap and emulate the MMIO accesses, here hv_compose_msi_msg()
3501
+ * must be used to ask Hyper-V to re-create the IOMMU Interrupt Remapping
3502
+ * Table entries.
3503
+ */
3504
+static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus)
3505
+{
3506
+ pci_walk_bus(hbus->pci_bus, hv_pci_restore_msi_msg, NULL);
3507
+}
3508
+
3509
+static int hv_pci_resume(struct hv_device *hdev)
3510
+{
3511
+ struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3512
+ enum pci_protocol_version_t version[1];
3513
+ int ret;
3514
+
3515
+ hbus->state = hv_pcibus_init;
3516
+
3517
+ ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
3518
+ hv_pci_onchannelcallback, hbus);
3519
+ if (ret)
3520
+ return ret;
3521
+
3522
+ /* Only use the version that was in use before hibernation. */
3523
+ version[0] = hbus->protocol_version;
3524
+ ret = hv_pci_protocol_negotiation(hdev, version, 1);
3525
+ if (ret)
3526
+ goto out;
3527
+
3528
+ ret = hv_pci_query_relations(hdev);
3529
+ if (ret)
3530
+ goto out;
3531
+
3532
+ ret = hv_pci_enter_d0(hdev);
3533
+ if (ret)
3534
+ goto out;
3535
+
3536
+ ret = hv_send_resources_allocated(hdev);
3537
+ if (ret)
3538
+ goto out;
3539
+
3540
+ prepopulate_bars(hbus);
3541
+
3542
+ hv_pci_restore_msi_state(hbus);
3543
+
3544
+ hbus->state = hv_pcibus_installed;
3545
+ return 0;
3546
+out:
3547
+ vmbus_close(hdev->channel);
3548
+ return ret;
27303549 }
27313550
27323551 static const struct hv_vmbus_device_id hv_pci_id_table[] = {
....@@ -2743,15 +3562,32 @@
27433562 .id_table = hv_pci_id_table,
27443563 .probe = hv_pci_probe,
27453564 .remove = hv_pci_remove,
3565
+ .suspend = hv_pci_suspend,
3566
+ .resume = hv_pci_resume,
27463567 };
27473568
27483569 static void __exit exit_hv_pci_drv(void)
27493570 {
27503571 vmbus_driver_unregister(&hv_pci_drv);
3572
+
3573
+ hvpci_block_ops.read_block = NULL;
3574
+ hvpci_block_ops.write_block = NULL;
3575
+ hvpci_block_ops.reg_blk_invalidate = NULL;
27513576 }
27523577
27533578 static int __init init_hv_pci_drv(void)
27543579 {
3580
+ if (!hv_is_hyperv_initialized())
3581
+ return -ENODEV;
3582
+
3583
+ /* Set the invalid domain number's bit, so it will not be used */
3584
+ set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
3585
+
3586
+ /* Initialize PCI block r/w interface */
3587
+ hvpci_block_ops.read_block = hv_read_config_block;
3588
+ hvpci_block_ops.write_block = hv_write_config_block;
3589
+ hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate;
3590
+
27553591 return vmbus_driver_register(&hv_pci_drv);
27563592 }
27573593