hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/drivers/pci/controller/pci-hyperv.c
....@@ -63,6 +63,7 @@
6363 enum pci_protocol_version_t {
6464 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */
6565 PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */
66
+ PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3), /* Vibranium */
6667 };
6768
6869 #define CPU_AFFINITY_ALL -1ULL
....@@ -72,14 +73,10 @@
7273 * first.
7374 */
7475 static enum pci_protocol_version_t pci_protocol_versions[] = {
76
+ PCI_PROTOCOL_VERSION_1_3,
7577 PCI_PROTOCOL_VERSION_1_2,
7678 PCI_PROTOCOL_VERSION_1_1,
7779 };
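For reference, PCI_MAKE_VERSION (defined earlier in this file) packs the major/minor pair into a single u32, so protocol versions compare numerically; a sketch of the existing macro and the ordering it produces:

#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))

/* 1.1 == 0x00010001 < 1.2 == 0x00010002 < 1.3 == 0x00010003, so checks
 * such as "protocol_version >= PCI_PROTOCOL_VERSION_1_2" below also
 * cover the new 1.3, and this array probes the newest version first.
 */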
78
-
79
-/*
80
- * Protocol version negotiated by hv_pci_protocol_negotiation().
81
- */
82
-static enum pci_protocol_version_t pci_protocol_version;
8380
8481 #define PCI_CONFIG_MMIO_LENGTH 0x2000
8582 #define CFG_PAGE_OFFSET 0x1000
....@@ -124,6 +121,7 @@
124121 PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16,
125122 PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17,
126123 PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */
124
+ PCI_BUS_RELATIONS2 = PCI_MESSAGE_BASE + 0x19,
127125 PCI_MESSAGE_MAXIMUM
128126 };
129127
....@@ -167,6 +165,26 @@
167165 u32 subsystem_id;
168166 union win_slot_encoding win_slot;
169167 u32 ser; /* serial number */
168
+} __packed;
169
+
170
+enum pci_device_description_flags {
171
+ HV_PCI_DEVICE_FLAG_NONE = 0x0,
172
+ HV_PCI_DEVICE_FLAG_NUMA_AFFINITY = 0x1,
173
+};
174
+
175
+struct pci_function_description2 {
176
+ u16 v_id; /* vendor ID */
177
+ u16 d_id; /* device ID */
178
+ u8 rev;
179
+ u8 prog_intf;
180
+ u8 subclass;
181
+ u8 base_class;
182
+ u32 subsystem_id;
183
+ union win_slot_encoding win_slot;
184
+ u32 ser; /* serial number */
185
+ u32 flags;
186
+ u16 virtual_numa_node;
187
+ u16 reserved;
170188 } __packed;
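The two new fields are only meaningful when the host sets the NUMA-affinity flag; a sketch of the intended consumption, mirroring hv_pci_assign_numa_node() added later in this patch (desc is a hypothetical struct pci_function_description2 pointer):

/* Only trust virtual_numa_node when the host says it is valid. */
if (desc->flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY)
	node = desc->virtual_numa_node;	/* vNUMA node chosen by the host */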
171189
172190 /**
....@@ -265,7 +283,7 @@
265283 int resp_packet_size);
266284 void *compl_ctxt;
267285
268
- struct pci_message message[0];
286
+ struct pci_message message[];
269287 };
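The [0] → [] change converts a GNU zero-length array into a C99 flexible array member; callers keep sizing the trailing payload explicitly. A sketch of the allocation pattern used later in this file:

struct pci_packet *pkt;

/* Room for the header plus one trailing message-sized payload. */
pkt = kzalloc(sizeof(*pkt) + sizeof(struct pci_version_request),
	      GFP_KERNEL);
if (!pkt)
	return -ENOMEM;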
270288
271289 /*
....@@ -301,13 +319,19 @@
301319 struct pci_bus_relations {
302320 struct pci_incoming_message incoming;
303321 u32 device_count;
304
- struct pci_function_description func[0];
322
+ struct pci_function_description func[];
323
+} __packed;
324
+
325
+struct pci_bus_relations2 {
326
+ struct pci_incoming_message incoming;
327
+ u32 device_count;
328
+ struct pci_function_description2 func[];
305329 } __packed;
306330
307331 struct pci_q_res_req_response {
308332 struct vmpacket_descriptor hdr;
309333 s32 status; /* negative values are failures */
310
- u32 probed_bar[6];
334
+ u32 probed_bar[PCI_STD_NUM_BARS];
311335 } __packed;
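PCI_STD_NUM_BARS replaces the magic number 6 and comes from include/uapi/linux/pci_regs.h:

#define PCI_STD_NUM_BARS	6	/* Number of standard BARs */

so the on-wire layout of this response is unchanged; only the constant is named.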
312336
313337 struct pci_set_power {
....@@ -365,6 +389,39 @@
365389 struct tran_int_desc int_desc;
366390 } __packed;
367391
392
+/*
393
+ * Note: the VM must pass a valid block id, wslot and bytes_requested.
394
+ */
395
+struct pci_read_block {
396
+ struct pci_message message_type;
397
+ u32 block_id;
398
+ union win_slot_encoding wslot;
399
+ u32 bytes_requested;
400
+} __packed;
401
+
402
+struct pci_read_block_response {
403
+ struct vmpacket_descriptor hdr;
404
+ u32 status;
405
+ u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
406
+} __packed;
407
+
408
+/*
409
+ * Note: the VM must pass a valid block id, wslot and byte_count.
410
+ */
411
+struct pci_write_block {
412
+ struct pci_message message_type;
413
+ u32 block_id;
414
+ union win_slot_encoding wslot;
415
+ u32 byte_count;
416
+ u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
417
+} __packed;
418
+
419
+struct pci_dev_inval_block {
420
+ struct pci_incoming_message incoming;
421
+ union win_slot_encoding wslot;
422
+ u64 block_mask;
423
+} __packed;
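For reference, HV_CONFIG_BLOCK_SIZE_MAX is defined in include/linux/hyperv.h and matches the 128-byte limit described in the backchannel comment below:

#define HV_CONFIG_BLOCK_SIZE_MAX 128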
424
+
368425 struct pci_dev_incoming {
369426 struct pci_incoming_message incoming;
370427 union win_slot_encoding wslot;
....@@ -379,50 +436,6 @@
379436 static int pci_ring_size = (4 * PAGE_SIZE);
380437
381438 /*
382
- * Definitions or interrupt steering hypercall.
383
- */
384
-#define HV_PARTITION_ID_SELF ((u64)-1)
385
-#define HVCALL_RETARGET_INTERRUPT 0x7e
386
-
387
-struct hv_interrupt_entry {
388
- u32 source; /* 1 for MSI(-X) */
389
- u32 reserved1;
390
- u32 address;
391
- u32 data;
392
-};
393
-
394
-#define HV_VP_SET_BANK_COUNT_MAX 5 /* current implementation limit */
395
-
396
-struct hv_vp_set {
397
- u64 format; /* 0 (HvGenericSetSparse4k) */
398
- u64 valid_banks;
399
- u64 masks[HV_VP_SET_BANK_COUNT_MAX];
400
-};
401
-
402
-/*
403
- * flags for hv_device_interrupt_target.flags
404
- */
405
-#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1
406
-#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2
407
-
408
-struct hv_device_interrupt_target {
409
- u32 vector;
410
- u32 flags;
411
- union {
412
- u64 vp_mask;
413
- struct hv_vp_set vp_set;
414
- };
415
-};
416
-
417
-struct retarget_msi_interrupt {
418
- u64 partition_id; /* use "self" */
419
- u64 device_id;
420
- struct hv_interrupt_entry int_entry;
421
- u64 reserved2;
422
- struct hv_device_interrupt_target int_target;
423
-} __packed;
424
-
425
-/*
426439 * Driver specific state.
427440 */
428441
....@@ -430,12 +443,14 @@
430443 hv_pcibus_init = 0,
431444 hv_pcibus_probed,
432445 hv_pcibus_installed,
433
- hv_pcibus_removed,
446
+ hv_pcibus_removing,
434447 hv_pcibus_maximum
435448 };
436449
437450 struct hv_pcibus_device {
438451 struct pci_sysdata sysdata;
452
+ /* Protocol version negotiated with the host */
453
+ enum pci_protocol_version_t protocol_version;
439454 enum hv_pcibus_state state;
440455 refcount_t remove_lock;
441456 struct hv_device *hdev;
....@@ -460,12 +475,19 @@
460475 struct msi_controller msi_chip;
461476 struct irq_domain *irq_domain;
462477
463
- /* hypercall arg, must not cross page boundary */
464
- struct retarget_msi_interrupt retarget_msi_interrupt_params;
465
-
466478 spinlock_t retarget_msi_interrupt_lock;
467479
468480 struct workqueue_struct *wq;
481
+
482
+ /* Highest slot of child device with resources allocated */
483
+ int wslot_res_allocated;
484
+
485
+ /* hypercall arg, must not cross page boundary */
486
+ struct hv_retarget_device_interrupt retarget_msi_interrupt_params;
487
+
488
+ /*
489
+ * Don't put anything here: retarget_msi_interrupt_params must be last
490
+ */
469491 };
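The "must be last" rule works together with the allocation in hv_pci_probe(); a sketch of the invariant being relied on:

/* hv_pci_probe() allocates hbus as a single, naturally aligned
 * HV_HYP_PAGE_SIZE buffer and asserts
 * BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE), so keeping
 * retarget_msi_interrupt_params at the end keeps the whole hypercall
 * argument inside one page, as the hypervisor requires.
 */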
470492
471493 /*
....@@ -478,36 +500,44 @@
478500 struct hv_pcibus_device *bus;
479501 };
480502
503
+struct hv_pcidev_description {
504
+ u16 v_id; /* vendor ID */
505
+ u16 d_id; /* device ID */
506
+ u8 rev;
507
+ u8 prog_intf;
508
+ u8 subclass;
509
+ u8 base_class;
510
+ u32 subsystem_id;
511
+ union win_slot_encoding win_slot;
512
+ u32 ser; /* serial number */
513
+ u32 flags;
514
+ u16 virtual_numa_node;
515
+};
516
+
481517 struct hv_dr_state {
482518 struct list_head list_entry;
483519 u32 device_count;
484
- struct pci_function_description func[0];
485
-};
486
-
487
-enum hv_pcichild_state {
488
- hv_pcichild_init = 0,
489
- hv_pcichild_requirements,
490
- hv_pcichild_resourced,
491
- hv_pcichild_ejecting,
492
- hv_pcichild_maximum
520
+ struct hv_pcidev_description func[];
493521 };
494522
495523 struct hv_pci_dev {
496524 /* List protected by pci_rescan_remove_lock */
497525 struct list_head list_entry;
498526 refcount_t refs;
499
- enum hv_pcichild_state state;
500527 struct pci_slot *pci_slot;
501
- struct pci_function_description desc;
528
+ struct hv_pcidev_description desc;
502529 bool reported_missing;
503530 struct hv_pcibus_device *hbus;
504531 struct work_struct wrk;
532
+
533
+ void (*block_invalidate)(void *context, u64 block_mask);
534
+ void *invalidate_context;
505535
506536 /*
507537 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
508538 * read it back, for each of the BAR offsets within config space.
509539 */
510
- u32 probed_bar[6];
540
+ u32 probed_bar[PCI_STD_NUM_BARS];
511541 };
512542
513543 struct hv_pci_compl {
....@@ -821,6 +851,254 @@
821851 .write = hv_pcifront_write_config,
822852 };
823853
854
+/*
855
+ * Paravirtual backchannel
856
+ *
857
+ * Hyper-V SR-IOV provides a backchannel mechanism in software for
858
+ * communication between a VF driver and a PF driver. These
859
+ * "configuration blocks" are similar in concept to PCI configuration space,
860
+ * but instead of doing reads and writes in 32-bit chunks through a very slow
861
+ * path, packets of up to 128 bytes can be sent or received asynchronously.
862
+ *
863
+ * Nearly every SR-IOV device contains just such a communications channel in
864
+ * hardware, so using this one in software is usually optional. Using the
865
+ * software channel, however, allows driver implementers to leverage software
866
+ * tools that fuzz the communications channel looking for vulnerabilities.
867
+ *
868
+ * The usage model for these packets puts the responsibility for reading or
869
+ * writing on the VF driver. The VF driver sends a read or a write packet,
870
+ * indicating which "block" is being referred to by number.
871
+ *
872
+ * If the PF driver wishes to initiate communication, it can "invalidate" one or
873
+ * more of the first 64 blocks. This invalidation is delivered via a callback
874
+ * supplied to this driver by the VF driver.

875
+ *
876
+ * No protocol is implied, except that supplied by the PF and VF drivers.
877
+ */
878
+
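To make the usage model concrete, here is a sketch of the VF driver's side of the exchange (the block ID and buffer are made up; hv_read_config_block()/hv_write_config_block() are the helpers added below, however they end up being exposed to VF drivers):

u8 buf[HV_CONFIG_BLOCK_SIZE_MAX];
unsigned int bytes_returned;
int ret;

/* Ask the PF driver for backchannel block 1 (an ID the PF/VF pair
 * agreed on out of band).
 */
ret = hv_read_config_block(pdev, buf, sizeof(buf), 1, &bytes_returned);
if (ret)
	return ret;

/* Push an updated block back to the PF driver. */
ret = hv_write_config_block(pdev, buf, bytes_returned, 1);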
879
+struct hv_read_config_compl {
880
+ struct hv_pci_compl comp_pkt;
881
+ void *buf;
882
+ unsigned int len;
883
+ unsigned int bytes_returned;
884
+};
885
+
886
+/**
887
+ * hv_pci_read_config_compl() - Invoked when a response packet
888
+ * for a read config block operation arrives.
889
+ * @context: Identifies the read config operation
890
+ * @resp: The response packet itself
891
+ * @resp_packet_size: Size in bytes of the response packet
892
+ */
893
+static void hv_pci_read_config_compl(void *context, struct pci_response *resp,
894
+ int resp_packet_size)
895
+{
896
+ struct hv_read_config_compl *comp = context;
897
+ struct pci_read_block_response *read_resp =
898
+ (struct pci_read_block_response *)resp;
899
+ unsigned int data_len, hdr_len;
900
+
901
+ hdr_len = offsetof(struct pci_read_block_response, bytes);
902
+ if (resp_packet_size < hdr_len) {
903
+ comp->comp_pkt.completion_status = -1;
904
+ goto out;
905
+ }
906
+
907
+ data_len = resp_packet_size - hdr_len;
908
+ if (data_len > 0 && read_resp->status == 0) {
909
+ comp->bytes_returned = min(comp->len, data_len);
910
+ memcpy(comp->buf, read_resp->bytes, comp->bytes_returned);
911
+ } else {
912
+ comp->bytes_returned = 0;
913
+ }
914
+
915
+ comp->comp_pkt.completion_status = read_resp->status;
916
+out:
917
+ complete(&comp->comp_pkt.host_event);
918
+}
919
+
920
+/**
921
+ * hv_read_config_block() - Sends a read config block request to
922
+ * the back-end driver running in the Hyper-V parent partition.
923
+ * @pdev: The PCI driver's representation for this device.
924
+ * @buf: Buffer into which the config block will be copied.
925
+ * @len: Size in bytes of buf.
926
+ * @block_id: Identifies the config block which has been requested.
927
+ * @bytes_returned: Size which came back from the back-end driver.
928
+ *
929
+ * Return: 0 on success, -errno on failure
930
+ */
931
+static int hv_read_config_block(struct pci_dev *pdev, void *buf,
932
+ unsigned int len, unsigned int block_id,
933
+ unsigned int *bytes_returned)
934
+{
935
+ struct hv_pcibus_device *hbus =
936
+ container_of(pdev->bus->sysdata, struct hv_pcibus_device,
937
+ sysdata);
938
+ struct {
939
+ struct pci_packet pkt;
940
+ char buf[sizeof(struct pci_read_block)];
941
+ } pkt;
942
+ struct hv_read_config_compl comp_pkt;
943
+ struct pci_read_block *read_blk;
944
+ int ret;
945
+
946
+ if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
947
+ return -EINVAL;
948
+
949
+ init_completion(&comp_pkt.comp_pkt.host_event);
950
+ comp_pkt.buf = buf;
951
+ comp_pkt.len = len;
952
+
953
+ memset(&pkt, 0, sizeof(pkt));
954
+ pkt.pkt.completion_func = hv_pci_read_config_compl;
955
+ pkt.pkt.compl_ctxt = &comp_pkt;
956
+ read_blk = (struct pci_read_block *)&pkt.pkt.message;
957
+ read_blk->message_type.type = PCI_READ_BLOCK;
958
+ read_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
959
+ read_blk->block_id = block_id;
960
+ read_blk->bytes_requested = len;
961
+
962
+ ret = vmbus_sendpacket(hbus->hdev->channel, read_blk,
963
+ sizeof(*read_blk), (unsigned long)&pkt.pkt,
964
+ VM_PKT_DATA_INBAND,
965
+ VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
966
+ if (ret)
967
+ return ret;
968
+
969
+ ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event);
970
+ if (ret)
971
+ return ret;
972
+
973
+ if (comp_pkt.comp_pkt.completion_status != 0 ||
974
+ comp_pkt.bytes_returned == 0) {
975
+ dev_err(&hbus->hdev->device,
976
+ "Read Config Block failed: 0x%x, bytes_returned=%d\n",
977
+ comp_pkt.comp_pkt.completion_status,
978
+ comp_pkt.bytes_returned);
979
+ return -EIO;
980
+ }
981
+
982
+ *bytes_returned = comp_pkt.bytes_returned;
983
+ return 0;
984
+}
985
+
986
+/**
987
+ * hv_pci_write_config_compl() - Invoked when a response packet for a write
988
+ * config block operation arrives.
989
+ * @context: Identifies the write config operation
990
+ * @resp: The response packet itself
991
+ * @resp_packet_size: Size in bytes of the response packet
992
+ */
993
+static void hv_pci_write_config_compl(void *context, struct pci_response *resp,
994
+ int resp_packet_size)
995
+{
996
+ struct hv_pci_compl *comp_pkt = context;
997
+
998
+ comp_pkt->completion_status = resp->status;
999
+ complete(&comp_pkt->host_event);
1000
+}
1001
+
1002
+/**
1003
+ * hv_write_config_block() - Sends a write config block request to the
1004
+ * back-end driver running in the Hyper-V parent partition.
1005
+ * @pdev: The PCI driver's representation for this device.
1006
+ * @buf: Buffer from which the config block will be copied.
1007
+ * @len: Size in bytes of buf.
1008
+ * @block_id: Identifies the config block which is being written.
1009
+ *
1010
+ * Return: 0 on success, -errno on failure
1011
+ */
1012
+static int hv_write_config_block(struct pci_dev *pdev, void *buf,
1013
+ unsigned int len, unsigned int block_id)
1014
+{
1015
+ struct hv_pcibus_device *hbus =
1016
+ container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1017
+ sysdata);
1018
+ struct {
1019
+ struct pci_packet pkt;
1020
+ char buf[sizeof(struct pci_write_block)];
1021
+ u32 reserved;
1022
+ } pkt;
1023
+ struct hv_pci_compl comp_pkt;
1024
+ struct pci_write_block *write_blk;
1025
+ u32 pkt_size;
1026
+ int ret;
1027
+
1028
+ if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
1029
+ return -EINVAL;
1030
+
1031
+ init_completion(&comp_pkt.host_event);
1032
+
1033
+ memset(&pkt, 0, sizeof(pkt));
1034
+ pkt.pkt.completion_func = hv_pci_write_config_compl;
1035
+ pkt.pkt.compl_ctxt = &comp_pkt;
1036
+ write_blk = (struct pci_write_block *)&pkt.pkt.message;
1037
+ write_blk->message_type.type = PCI_WRITE_BLOCK;
1038
+ write_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
1039
+ write_blk->block_id = block_id;
1040
+ write_blk->byte_count = len;
1041
+ memcpy(write_blk->bytes, buf, len);
1042
+ pkt_size = offsetof(struct pci_write_block, bytes) + len;
1043
+ /*
1044
+ * This quirk is required on some hosts shipped around 2018, because
1045
+ * these hosts don't check the pkt_size correctly (new hosts have been
1046
+ * fixed since early 2019). The quirk is also safe on very old hosts
1047
+ * and new hosts, because, on them, what really matters is the length
1048
+ * specified in write_blk->byte_count.
1049
+ */
1050
+ pkt_size += sizeof(pkt.reserved);
1051
+
1052
+ ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size,
1053
+ (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND,
1054
+ VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1055
+ if (ret)
1056
+ return ret;
1057
+
1058
+ ret = wait_for_response(hbus->hdev, &comp_pkt.host_event);
1059
+ if (ret)
1060
+ return ret;
1061
+
1062
+ if (comp_pkt.completion_status != 0) {
1063
+ dev_err(&hbus->hdev->device,
1064
+ "Write Config Block failed: 0x%x\n",
1065
+ comp_pkt.completion_status);
1066
+ return -EIO;
1067
+ }
1068
+
1069
+ return 0;
1070
+}
1071
+
1072
+/**
1073
+ * hv_register_block_invalidate() - Register a callback to be invoked
1074
+ * when a config block invalidation arrives from the back-end driver.
1075
+ * @pdev: The PCI driver's representation for this device.
1076
+ * @context: Identifies the device.
1077
+ * @block_invalidate: Callback invoked with a mask of the invalidated blocks.
1078
+ *
1079
+ * Return: 0 on success, -errno on failure
1080
+ */
1081
+static int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
1082
+ void (*block_invalidate)(void *context,
1083
+ u64 block_mask))
1084
+{
1085
+ struct hv_pcibus_device *hbus =
1086
+ container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1087
+ sysdata);
1088
+ struct hv_pci_dev *hpdev;
1089
+
1090
+ hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1091
+ if (!hpdev)
1092
+ return -ENODEV;
1093
+
1094
+ hpdev->block_invalidate = block_invalidate;
1095
+ hpdev->invalidate_context = context;
1096
+
1097
+ put_pcichild(hpdev);
1098
+ return 0;
1099
+
1100
+}
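A sketch of how a VF driver might hook the invalidation path (the driver type and block ID are hypothetical; the callback runs from the channel callback, so it must not sleep):

struct my_vf_dev {				/* hypothetical VF state */
	struct work_struct cfg_work;
};

static void my_vf_block_invalidate(void *context, u64 block_mask)
{
	struct my_vf_dev *vdev = context;

	/* Block 1 (made up) changed on the PF side: re-read it later. */
	if (block_mask & BIT_ULL(1))
		schedule_work(&vdev->cfg_work);
}

/* At probe time the VF driver would then call:
 *	hv_register_block_invalidate(pdev, vdev, my_vf_block_invalidate);
 */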
1101
+
8241102 /* Interrupt management hooks */
8251103 static void hv_int_desc_free(struct hv_pci_dev *hpdev,
8261104 struct tran_int_desc *int_desc)
....@@ -831,6 +1109,10 @@
8311109 u8 buffer[sizeof(struct pci_delete_interrupt)];
8321110 } ctxt;
8331111
1112
+ if (!int_desc->vector_count) {
1113
+ kfree(int_desc);
1114
+ return;
1115
+ }
8341116 memset(&ctxt, 0, sizeof(ctxt));
8351117 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
8361118 int_pkt->message_type.type =
....@@ -893,6 +1175,28 @@
8931175 pci_msi_mask_irq(data);
8941176 }
8951177
1178
+static unsigned int hv_msi_get_int_vector(struct irq_data *data)
1179
+{
1180
+ struct irq_cfg *cfg = irqd_cfg(data);
1181
+
1182
+ return cfg->vector;
1183
+}
1184
+
1185
+static int hv_msi_prepare(struct irq_domain *domain, struct device *dev,
1186
+ int nvec, msi_alloc_info_t *info)
1187
+{
1188
+ int ret = pci_msi_prepare(domain, dev, nvec, info);
1189
+
1190
+ /*
1191
+ * By using the interrupt remapper in the hypervisor IOMMU, contiguous
1192
+ * CPU vectors are not needed for multi-MSI.
1193
+ */
1194
+ if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI)
1195
+ info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
1196
+
1197
+ return ret;
1198
+}
1199
+
8961200 /**
8971201 * hv_irq_unmask() - "Unmask" the IRQ by setting its current
8981202 * affinity.
....@@ -907,21 +1211,28 @@
9071211 {
9081212 struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
9091213 struct irq_cfg *cfg = irqd_cfg(data);
910
- struct retarget_msi_interrupt *params;
1214
+ struct hv_retarget_device_interrupt *params;
1215
+ struct tran_int_desc *int_desc;
9111216 struct hv_pcibus_device *hbus;
9121217 struct cpumask *dest;
1218
+ cpumask_var_t tmp;
9131219 struct pci_bus *pbus;
9141220 struct pci_dev *pdev;
9151221 unsigned long flags;
9161222 u32 var_size = 0;
917
- int cpu_vmbus;
918
- int cpu;
1223
+ int cpu, nr_bank;
9191224 u64 res;
9201225
9211226 dest = irq_data_get_effective_affinity_mask(data);
9221227 pdev = msi_desc_to_pci_dev(msi_desc);
9231228 pbus = pdev->bus;
9241229 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
1230
+ int_desc = data->chip_data;
1231
+ if (!int_desc) {
1232
+ dev_warn(&hbus->hdev->device, "%s() cannot unmask irq %u\n",
1233
+ __func__, data->irq);
1234
+ return;
1235
+ }
9251236
9261237 spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
9271238
....@@ -929,8 +1240,8 @@
9291240 memset(params, 0, sizeof(*params));
9301241 params->partition_id = HV_PARTITION_ID_SELF;
9311242 params->int_entry.source = 1; /* MSI(-X) */
932
- params->int_entry.address = msi_desc->msg.address_lo;
933
- params->int_entry.data = msi_desc->msg.data;
1243
+ params->int_entry.msi_entry.address = int_desc->address & 0xffffffff;
1244
+ params->int_entry.msi_entry.data = int_desc->data;
9341245 params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
9351246 (hbus->hdev->dev_instance.b[4] << 16) |
9361247 (hbus->hdev->dev_instance.b[7] << 8) |
....@@ -945,7 +1256,7 @@
9451256 * negative effect (yet?).
9461257 */
9471258
948
- if (pci_protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
1259
+ if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
9491260 /*
9501261 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
9511262 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
....@@ -955,28 +1266,27 @@
9551266 */
9561267 params->int_target.flags |=
9571268 HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
958
- params->int_target.vp_set.valid_banks =
959
- (1ull << HV_VP_SET_BANK_COUNT_MAX) - 1;
1269
+
1270
+ if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
1271
+ res = 1;
1272
+ goto exit_unlock;
1273
+ }
1274
+
1275
+ cpumask_and(tmp, dest, cpu_online_mask);
1276
+ nr_bank = cpumask_to_vpset(&params->int_target.vp_set, tmp);
1277
+ free_cpumask_var(tmp);
1278
+
1279
+ if (nr_bank <= 0) {
1280
+ res = 1;
1281
+ goto exit_unlock;
1282
+ }
9601283
9611284 /*
9621285 * var-sized hypercall, var-size starts after vp_mask (thus
963
- * vp_set.format does not count, but vp_set.valid_banks does).
1286
+ * vp_set.format does not count, but vp_set.valid_bank_mask
1287
+ * does).
9641288 */
965
- var_size = 1 + HV_VP_SET_BANK_COUNT_MAX;
966
-
967
- for_each_cpu_and(cpu, dest, cpu_online_mask) {
968
- cpu_vmbus = hv_cpu_number_to_vp_number(cpu);
969
-
970
- if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) {
971
- dev_err(&hbus->hdev->device,
972
- "too high CPU %d", cpu_vmbus);
973
- res = 1;
974
- goto exit_unlock;
975
- }
976
-
977
- params->int_target.vp_set.masks[cpu_vmbus / 64] |=
978
- (1ULL << (cpu_vmbus & 63));
979
- }
1289
+ var_size = 1 + nr_bank;
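/* Worked example (VP numbers made up): VPs 3 and 130 fall into 64-bit
 * banks 0 and 2 of the sparse VP set, so cpumask_to_vpset() returns
 * nr_bank == 2 and var_size == 3 eight-byte chunks: valid_bank_mask
 * plus the two populated bank masks.
 */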
9801290 } else {
9811291 for_each_cpu_and(cpu, dest, cpu_online_mask) {
9821292 params->int_target.vp_mask |=
....@@ -990,11 +1300,25 @@
9901300 exit_unlock:
9911301 spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
9921302
993
- if (res) {
1303
+ /*
1304
+ * During hibernation, when a CPU is offlined, the kernel tries
1305
+ * to move the interrupt to the remaining CPUs that haven't
1306
+ * been offlined yet. In this case, the below hv_do_hypercall()
1307
+ * always fails since the vmbus channel has been closed:
1308
+ * refer to cpu_disable_common() -> fixup_irqs() ->
1309
+ * irq_migrate_all_off_this_cpu() -> migrate_one_irq().
1310
+ *
1311
+ * Suppress the error message for hibernation because the failure
1312
+ * during hibernation does not matter (at this time all the devices
1313
+ * have been frozen). Note: the correct affinity info is still updated
1314
+ * into the irqdata data structure in migrate_one_irq() ->
1315
+ * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM
1316
+ * resumes, hv_pci_restore_msi_state() is able to correctly restore
1317
+ * the interrupt with the correct affinity.
1318
+ */
1319
+ if (res && hbus->state != hv_pcibus_removing)
9941320 dev_err(&hbus->hdev->device,
9951321 "%s() failed: %#llx", __func__, res);
996
- return;
997
- }
9981322
9991323 pci_msi_unmask_irq(data);
10001324 }
....@@ -1018,12 +1342,12 @@
10181342
10191343 static u32 hv_compose_msi_req_v1(
10201344 struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
1021
- u32 slot, u8 vector)
1345
+ u32 slot, u8 vector, u8 vector_count)
10221346 {
10231347 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
10241348 int_pkt->wslot.slot = slot;
10251349 int_pkt->int_desc.vector = vector;
1026
- int_pkt->int_desc.vector_count = 1;
1350
+ int_pkt->int_desc.vector_count = vector_count;
10271351 int_pkt->int_desc.delivery_mode = dest_Fixed;
10281352
10291353 /*
....@@ -1037,14 +1361,14 @@
10371361
10381362 static u32 hv_compose_msi_req_v2(
10391363 struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
1040
- u32 slot, u8 vector)
1364
+ u32 slot, u8 vector, u8 vector_count)
10411365 {
10421366 int cpu;
10431367
10441368 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
10451369 int_pkt->wslot.slot = slot;
10461370 int_pkt->int_desc.vector = vector;
1047
- int_pkt->int_desc.vector_count = 1;
1371
+ int_pkt->int_desc.vector_count = vector_count;
10481372 int_pkt->int_desc.delivery_mode = dest_Fixed;
10491373
10501374 /*
....@@ -1072,15 +1396,16 @@
10721396 */
10731397 static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
10741398 {
1075
- struct irq_cfg *cfg = irqd_cfg(data);
10761399 struct hv_pcibus_device *hbus;
1400
+ struct vmbus_channel *channel;
10771401 struct hv_pci_dev *hpdev;
10781402 struct pci_bus *pbus;
10791403 struct pci_dev *pdev;
10801404 struct cpumask *dest;
1081
- unsigned long flags;
10821405 struct compose_comp_ctxt comp;
10831406 struct tran_int_desc *int_desc;
1407
+ struct msi_desc *msi_desc;
1408
+ u8 vector, vector_count;
10841409 struct {
10851410 struct pci_packet pci_pkt;
10861411 union {
....@@ -1092,43 +1417,80 @@
10921417 u32 size;
10931418 int ret;
10941419
1095
- pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
1420
+ /* Reuse the previous allocation */
1421
+ if (data->chip_data) {
1422
+ int_desc = data->chip_data;
1423
+ msg->address_hi = int_desc->address >> 32;
1424
+ msg->address_lo = int_desc->address & 0xffffffff;
1425
+ msg->data = int_desc->data;
1426
+ return;
1427
+ }
1428
+
1429
+ msi_desc = irq_data_get_msi_desc(data);
1430
+ pdev = msi_desc_to_pci_dev(msi_desc);
10961431 dest = irq_data_get_effective_affinity_mask(data);
10971432 pbus = pdev->bus;
10981433 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
1434
+ channel = hbus->hdev->channel;
10991435 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
11001436 if (!hpdev)
11011437 goto return_null_message;
11021438
1103
- /* Free any previous message that might have already been composed. */
1104
- if (data->chip_data) {
1105
- int_desc = data->chip_data;
1106
- data->chip_data = NULL;
1107
- hv_int_desc_free(hpdev, int_desc);
1108
- }
1109
-
11101439 int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
11111440 if (!int_desc)
11121441 goto drop_reference;
1442
+
1443
+ if (!msi_desc->msi_attrib.is_msix && msi_desc->nvec_used > 1) {
1444
+ /*
1445
+ * If this is not the first MSI of Multi MSI, we already have
1446
+ * a mapping. Can exit early.
1447
+ */
1448
+ if (msi_desc->irq != data->irq) {
1449
+ data->chip_data = int_desc;
1450
+ int_desc->address = msi_desc->msg.address_lo |
1451
+ (u64)msi_desc->msg.address_hi << 32;
1452
+ int_desc->data = msi_desc->msg.data +
1453
+ (data->irq - msi_desc->irq);
1454
+ msg->address_hi = msi_desc->msg.address_hi;
1455
+ msg->address_lo = msi_desc->msg.address_lo;
1456
+ msg->data = int_desc->data;
1457
+ put_pcichild(hpdev);
1458
+ return;
1459
+ }
1460
+ /*
1461
+ * The vector we select here is a dummy value. The correct
1462
+ * value gets sent to the hypervisor in unmask(). This needs
1463
+ * to be aligned with the count, and also not zero. Multi-msi
1464
+ * is powers of 2 up to 32, so 32 will always work here.
1465
+ */
1466
+ vector = 32;
1467
+ vector_count = msi_desc->nvec_used;
1468
+ } else {
1469
+ vector = hv_msi_get_int_vector(data);
1470
+ vector_count = 1;
1471
+ }
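/* Worked example (numbers made up): for a device that allocated 4
 * MSIs, nvec_used == 4, so the host is asked for vector_count == 4
 * once, on the base IRQ. If the host answers with message data 0x40,
 * the early-exit path above hands IRQ base+n the data 0x40 + n with
 * no further host round trip.
 */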
11131472
11141473 memset(&ctxt, 0, sizeof(ctxt));
11151474 init_completion(&comp.comp_pkt.host_event);
11161475 ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
11171476 ctxt.pci_pkt.compl_ctxt = &comp;
11181477
1119
- switch (pci_protocol_version) {
1478
+ switch (hbus->protocol_version) {
11201479 case PCI_PROTOCOL_VERSION_1_1:
11211480 size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
11221481 dest,
11231482 hpdev->desc.win_slot.slot,
1124
- cfg->vector);
1483
+ vector,
1484
+ vector_count);
11251485 break;
11261486
11271487 case PCI_PROTOCOL_VERSION_1_2:
1488
+ case PCI_PROTOCOL_VERSION_1_3:
11281489 size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
11291490 dest,
11301491 hpdev->desc.win_slot.slot,
1131
- cfg->vector);
1492
+ vector,
1493
+ vector_count);
11321494 break;
11331495
11341496 default:
....@@ -1153,41 +1515,44 @@
11531515 }
11541516
11551517 /*
1518
+ * Prevents hv_pci_onchannelcallback() from running concurrently
1519
+ * in the tasklet.
1520
+ */
1521
+ tasklet_disable(&channel->callback_event);
1522
+
1523
+ /*
11561524 * Since this function is called with IRQ locks held, can't
11571525 * do normal wait for completion; instead poll.
11581526 */
11591527 while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
1528
+ unsigned long flags;
1529
+
11601530 /* 0xFFFF means an invalid PCI VENDOR ID. */
11611531 if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
11621532 dev_err_once(&hbus->hdev->device,
11631533 "the device has gone\n");
1164
- goto free_int_desc;
1534
+ goto enable_tasklet;
11651535 }
11661536
11671537 /*
1168
- * When the higher level interrupt code calls us with
1169
- * interrupt disabled, we must poll the channel by calling
1170
- * the channel callback directly when channel->target_cpu is
1171
- * the current CPU. When the higher level interrupt code
1172
- * calls us with interrupt enabled, let's add the
1173
- * local_irq_save()/restore() to avoid race:
1174
- * hv_pci_onchannelcallback() can also run in tasklet.
1538
+ * Make sure that the ring buffer data structure doesn't get
1539
+ * freed while we dereference the ring buffer pointer. Test
1540
+ * for the channel's onchannel_callback being NULL within a
1541
+ * sched_lock critical section. See also the inline comments
1542
+ * in vmbus_reset_channel_cb().
11751543 */
1176
- local_irq_save(flags);
1177
-
1178
- if (hbus->hdev->channel->target_cpu == smp_processor_id())
1179
- hv_pci_onchannelcallback(hbus);
1180
-
1181
- local_irq_restore(flags);
1182
-
1183
- if (hpdev->state == hv_pcichild_ejecting) {
1184
- dev_err_once(&hbus->hdev->device,
1185
- "the device is being ejected\n");
1186
- goto free_int_desc;
1544
+ spin_lock_irqsave(&channel->sched_lock, flags);
1545
+ if (unlikely(channel->onchannel_callback == NULL)) {
1546
+ spin_unlock_irqrestore(&channel->sched_lock, flags);
1547
+ goto enable_tasklet;
11871548 }
1549
+ hv_pci_onchannelcallback(hbus);
1550
+ spin_unlock_irqrestore(&channel->sched_lock, flags);
11881551
11891552 udelay(100);
11901553 }
1554
+
1555
+ tasklet_enable(&channel->callback_event);
11911556
11921557 if (comp.comp_pkt.completion_status < 0) {
11931558 dev_err(&hbus->hdev->device,
....@@ -1212,6 +1577,8 @@
12121577 put_pcichild(hpdev);
12131578 return;
12141579
1580
+enable_tasklet:
1581
+ tasklet_enable(&channel->callback_event);
12151582 free_int_desc:
12161583 kfree(int_desc);
12171584 drop_reference:
....@@ -1232,16 +1599,8 @@
12321599 .irq_unmask = hv_irq_unmask,
12331600 };
12341601
1235
-static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
1236
- msi_alloc_info_t *arg)
1237
-{
1238
- return arg->msi_hwirq;
1239
-}
1240
-
12411602 static struct msi_domain_ops hv_msi_ops = {
1242
- .get_hwirq = hv_msi_domain_ops_get_hwirq,
1243
- .msi_prepare = pci_msi_prepare,
1244
- .set_desc = pci_msi_set_desc,
1603
+ .msi_prepare = hv_msi_prepare,
12451604 .msi_free = hv_msi_free,
12461605 };
12471606
....@@ -1332,7 +1691,7 @@
13321691 * so it's sufficient to just add them up without tracking alignment.
13331692 */
13341693 list_for_each_entry(hpdev, &hbus->children, list_entry) {
1335
- for (i = 0; i < 6; i++) {
1694
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
13361695 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
13371696 dev_err(&hbus->hdev->device,
13381697 "There's an I/O BAR in this list!\n");
....@@ -1403,10 +1762,27 @@
14031762
14041763 spin_lock_irqsave(&hbus->device_list_lock, flags);
14051764
1765
+ /*
1766
+ * Clear the memory enable bit, in case it's already set. This occurs
1767
+ * in the suspend path of hibernation, where the device is suspended,
1768
+ * resumed and suspended again: see hibernation_snapshot() and
1769
+ * hibernation_platform_enter().
1770
+ *
1771
+ * If the memory enable bit is already set, Hyper-V silently ignores
1772
+ * the below BAR updates, and the related PCI device driver cannot
1773
+ * work, because reading from the device register(s) always returns
1774
+ * 0xFFFFFFFF.
1775
+ */
1776
+ list_for_each_entry(hpdev, &hbus->children, list_entry) {
1777
+ _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command);
1778
+ command &= ~PCI_COMMAND_MEMORY;
1779
+ _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command);
1780
+ }
1781
+
14061782 /* Pick addresses for the BARs. */
14071783 do {
14081784 list_for_each_entry(hpdev, &hbus->children, list_entry) {
1409
- for (i = 0; i < 6; i++) {
1785
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
14101786 bar_val = hpdev->probed_bar[i];
14111787 if (bar_val == 0)
14121788 continue;
....@@ -1506,6 +1882,36 @@
15061882 }
15071883 }
15081884
1885
+/*
1886
+ * Set NUMA node for the devices on the bus
1887
+ */
1888
+static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus)
1889
+{
1890
+ struct pci_dev *dev;
1891
+ struct pci_bus *bus = hbus->pci_bus;
1892
+ struct hv_pci_dev *hv_dev;
1893
+
1894
+ list_for_each_entry(dev, &bus->devices, bus_list) {
1895
+ hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn));
1896
+ if (!hv_dev)
1897
+ continue;
1898
+
1899
+ if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY &&
1900
+ hv_dev->desc.virtual_numa_node < num_possible_nodes())
1901
+ /*
1902
+ * The kernel may boot with some NUMA nodes offline
1903
+ * (e.g. in a KDUMP kernel) or with NUMA disabled via
1904
+ * "numa=off". In those cases, adjust the host provided
1905
+ * NUMA node to a valid NUMA node used by the kernel.
1906
+ */
1907
+ set_dev_node(&dev->dev,
1908
+ numa_map_to_online_node(
1909
+ hv_dev->desc.virtual_numa_node));
1910
+
1911
+ put_pcichild(hv_dev);
1912
+ }
1913
+}
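/* A concrete case (node numbers made up): booted with "numa=off",
 * num_possible_nodes() == 1, so a host-provided virtual_numa_node of 2
 * fails the bounds check and the device keeps its default node; on a
 * normal boot with node 2 offline, numa_map_to_online_node(2) picks a
 * nearby online node instead.
 */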
1914
+
15091915 /**
15101916 * create_root_hv_pci_bus() - Expose a new root PCI bus
15111917 * @hbus: Root PCI bus, as understood by this driver
....@@ -1528,6 +1934,7 @@
15281934
15291935 pci_lock_rescan_remove();
15301936 pci_scan_child_bus(hbus->pci_bus);
1937
+ hv_pci_assign_numa_node(hbus);
15311938 pci_bus_assign_resources(hbus->pci_bus);
15321939 hv_pci_assign_slots(hbus);
15331940 pci_bus_add_devices(hbus->pci_bus);
....@@ -1563,7 +1970,7 @@
15631970 "query resource requirements failed: %x\n",
15641971 resp->status);
15651972 } else {
1566
- for (i = 0; i < 6; i++) {
1973
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
15671974 completion->hpdev->probed_bar[i] =
15681975 q_res_req->probed_bar[i];
15691976 }
....@@ -1584,7 +1991,7 @@
15841991 * Return: Pointer to the new tracking struct
15851992 */
15861993 static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
1587
- struct pci_function_description *desc)
1994
+ struct hv_pcidev_description *desc)
15881995 {
15891996 struct hv_pci_dev *hpdev;
15901997 struct pci_child_message *res_req;
....@@ -1695,7 +2102,7 @@
16952102 {
16962103 u32 child_no;
16972104 bool found;
1698
- struct pci_function_description *new_desc;
2105
+ struct hv_pcidev_description *new_desc;
16992106 struct hv_pci_dev *hpdev;
17002107 struct hv_pcibus_device *hbus;
17012108 struct list_head removed;
....@@ -1796,6 +2203,7 @@
17962203 */
17972204 pci_lock_rescan_remove();
17982205 pci_scan_child_bus(hbus->pci_bus);
2206
+ hv_pci_assign_numa_node(hbus);
17992207 hv_pci_assign_slots(hbus);
18002208 pci_unlock_rescan_remove();
18012209 break;
....@@ -1814,41 +2222,31 @@
18142222 }
18152223
18162224 /**
1817
- * hv_pci_devices_present() - Handles list of new children
2225
+ * hv_pci_start_relations_work() - Queue work to start device discovery
18182226 * @hbus: Root PCI bus, as understood by this driver
1819
- * @relations: Packet from host listing children
2227
+ * @dr: The list of children returned from host
18202228 *
1821
- * This function is invoked whenever a new list of devices for
1822
- * this bus appears.
2229
+ * Return: 0 on success, -errno on failure
18232230 */
1824
-static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
1825
- struct pci_bus_relations *relations)
2231
+static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus,
2232
+ struct hv_dr_state *dr)
18262233 {
1827
- struct hv_dr_state *dr;
18282234 struct hv_dr_work *dr_wrk;
18292235 unsigned long flags;
18302236 bool pending_dr;
18312237
2238
+ if (hbus->state == hv_pcibus_removing) {
2239
+ dev_info(&hbus->hdev->device,
2240
+ "PCI VMBus BUS_RELATIONS: ignored\n");
2241
+ return -ENOENT;
2242
+ }
2243
+
18322244 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
18332245 if (!dr_wrk)
1834
- return;
1835
-
1836
- dr = kzalloc(offsetof(struct hv_dr_state, func) +
1837
- (sizeof(struct pci_function_description) *
1838
- (relations->device_count)), GFP_NOWAIT);
1839
- if (!dr) {
1840
- kfree(dr_wrk);
1841
- return;
1842
- }
2246
+ return -ENOMEM;
18432247
18442248 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
18452249 dr_wrk->bus = hbus;
1846
- dr->device_count = relations->device_count;
1847
- if (dr->device_count != 0) {
1848
- memcpy(dr->func, relations->func,
1849
- sizeof(struct pci_function_description) *
1850
- dr->device_count);
1851
- }
18522250
18532251 spin_lock_irqsave(&hbus->device_list_lock, flags);
18542252 /*
....@@ -1866,6 +2264,83 @@
18662264 get_hvpcibus(hbus);
18672265 queue_work(hbus->wq, &dr_wrk->wrk);
18682266 }
2267
+
2268
+ return 0;
2269
+}
2270
+
2271
+/**
2272
+ * hv_pci_devices_present() - Handle list of new children
2273
+ * @hbus: Root PCI bus, as understood by this driver
2274
+ * @relations: Packet from host listing children
2275
+ *
2276
+ * Process a new list of devices on the bus. The list of devices is
2277
+ * discovered by VSP and sent to us via VSP message PCI_BUS_RELATIONS,
2278
+ * whenever a new list of devices for this bus appears.
2279
+ */
2280
+static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
2281
+ struct pci_bus_relations *relations)
2282
+{
2283
+ struct hv_dr_state *dr;
2284
+ int i;
2285
+
2286
+ dr = kzalloc(struct_size(dr, func, relations->device_count),
2287
+ GFP_NOWAIT);
2288
+ if (!dr)
2289
+ return;
2290
+
2291
+ dr->device_count = relations->device_count;
2292
+ for (i = 0; i < dr->device_count; i++) {
2293
+ dr->func[i].v_id = relations->func[i].v_id;
2294
+ dr->func[i].d_id = relations->func[i].d_id;
2295
+ dr->func[i].rev = relations->func[i].rev;
2296
+ dr->func[i].prog_intf = relations->func[i].prog_intf;
2297
+ dr->func[i].subclass = relations->func[i].subclass;
2298
+ dr->func[i].base_class = relations->func[i].base_class;
2299
+ dr->func[i].subsystem_id = relations->func[i].subsystem_id;
2300
+ dr->func[i].win_slot = relations->func[i].win_slot;
2301
+ dr->func[i].ser = relations->func[i].ser;
2302
+ }
2303
+
2304
+ if (hv_pci_start_relations_work(hbus, dr))
2305
+ kfree(dr);
2306
+}
2307
+
2308
+/**
2309
+ * hv_pci_devices_present2() - Handle list of new children
2310
+ * @hbus: Root PCI bus, as understood by this driver
2311
+ * @relations: Packet from host listing children
2312
+ *
2313
+ * This function is the v2 version of hv_pci_devices_present()
2314
+ */
2315
+static void hv_pci_devices_present2(struct hv_pcibus_device *hbus,
2316
+ struct pci_bus_relations2 *relations)
2317
+{
2318
+ struct hv_dr_state *dr;
2319
+ int i;
2320
+
2321
+ dr = kzalloc(struct_size(dr, func, relations->device_count),
2322
+ GFP_NOWAIT);
2323
+ if (!dr)
2324
+ return;
2325
+
2326
+ dr->device_count = relations->device_count;
2327
+ for (i = 0; i < dr->device_count; i++) {
2328
+ dr->func[i].v_id = relations->func[i].v_id;
2329
+ dr->func[i].d_id = relations->func[i].d_id;
2330
+ dr->func[i].rev = relations->func[i].rev;
2331
+ dr->func[i].prog_intf = relations->func[i].prog_intf;
2332
+ dr->func[i].subclass = relations->func[i].subclass;
2333
+ dr->func[i].base_class = relations->func[i].base_class;
2334
+ dr->func[i].subsystem_id = relations->func[i].subsystem_id;
2335
+ dr->func[i].win_slot = relations->func[i].win_slot;
2336
+ dr->func[i].ser = relations->func[i].ser;
2337
+ dr->func[i].flags = relations->func[i].flags;
2338
+ dr->func[i].virtual_numa_node =
2339
+ relations->func[i].virtual_numa_node;
2340
+ }
2341
+
2342
+ if (hv_pci_start_relations_work(hbus, dr))
2343
+ kfree(dr);
18692344 }
18702345
18712346 /**
....@@ -1892,8 +2367,6 @@
18922367
18932368 hpdev = container_of(work, struct hv_pci_dev, wrk);
18942369 hbus = hpdev->hbus;
1895
-
1896
- WARN_ON(hpdev->state != hv_pcichild_ejecting);
18972370
18982371 /*
18992372 * Ejection can come before or after the PCI bus has been set up, so
....@@ -1945,11 +2418,18 @@
19452418 */
19462419 static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
19472420 {
1948
- hpdev->state = hv_pcichild_ejecting;
2421
+ struct hv_pcibus_device *hbus = hpdev->hbus;
2422
+ struct hv_device *hdev = hbus->hdev;
2423
+
2424
+ if (hbus->state == hv_pcibus_removing) {
2425
+ dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n");
2426
+ return;
2427
+ }
2428
+
19492429 get_pcichild(hpdev);
19502430 INIT_WORK(&hpdev->wrk, hv_eject_device_work);
1951
- get_hvpcibus(hpdev->hbus);
1952
- queue_work(hpdev->hbus->wq, &hpdev->wrk);
2431
+ get_hvpcibus(hbus);
2432
+ queue_work(hbus->wq, &hpdev->wrk);
19532433 }
19542434
19552435 /**
....@@ -1973,6 +2453,8 @@
19732453 struct pci_response *response;
19742454 struct pci_incoming_message *new_message;
19752455 struct pci_bus_relations *bus_rel;
2456
+ struct pci_bus_relations2 *bus_rel2;
2457
+ struct pci_dev_inval_block *inval;
19762458 struct pci_dev_incoming *dev_message;
19772459 struct hv_pci_dev *hpdev;
19782460
....@@ -2028,15 +2510,28 @@
20282510
20292511 bus_rel = (struct pci_bus_relations *)buffer;
20302512 if (bytes_recvd <
2031
- offsetof(struct pci_bus_relations, func) +
2032
- (sizeof(struct pci_function_description) *
2033
- (bus_rel->device_count))) {
2513
+ struct_size(bus_rel, func,
2514
+ bus_rel->device_count)) {
20342515 dev_err(&hbus->hdev->device,
20352516 "bus relations too small\n");
20362517 break;
20372518 }
20382519
20392520 hv_pci_devices_present(hbus, bus_rel);
2521
+ break;
2522
+
2523
+ case PCI_BUS_RELATIONS2:
2524
+
2525
+ bus_rel2 = (struct pci_bus_relations2 *)buffer;
2526
+ if (bytes_recvd <
2527
+ struct_size(bus_rel2, func,
2528
+ bus_rel2->device_count)) {
2529
+ dev_err(&hbus->hdev->device,
2530
+ "bus relations v2 too small\n");
2531
+ break;
2532
+ }
2533
+
2534
+ hv_pci_devices_present2(hbus, bus_rel2);
20402535 break;
20412536
20422537 case PCI_EJECT:
....@@ -2046,6 +2541,21 @@
20462541 dev_message->wslot.slot);
20472542 if (hpdev) {
20482543 hv_pci_eject_device(hpdev);
2544
+ put_pcichild(hpdev);
2545
+ }
2546
+ break;
2547
+
2548
+ case PCI_INVALIDATE_BLOCK:
2549
+
2550
+ inval = (struct pci_dev_inval_block *)buffer;
2551
+ hpdev = get_pcichild_wslot(hbus,
2552
+ inval->wslot.slot);
2553
+ if (hpdev) {
2554
+ if (hpdev->block_invalidate) {
2555
+ hpdev->block_invalidate(
2556
+ hpdev->invalidate_context,
2557
+ inval->block_mask);
2558
+ }
20492559 put_pcichild(hpdev);
20502560 }
20512561 break;
....@@ -2071,7 +2581,10 @@
20712581
20722582 /**
20732583 * hv_pci_protocol_negotiation() - Set up protocol
2074
- * @hdev: VMBus's tracking struct for this root PCI bus
2584
+ * @hdev: VMBus's tracking struct for this root PCI bus.
2585
+ * @version: Array of supported channel protocol versions in
2586
+ * the order of probing - highest goes first.
2587
+ * @num_version: Number of elements in the version array.
20752588 *
20762589 * This driver is intended to support running on Windows 10
20772590 * (server) and later versions. It will not run on earlier
....@@ -2085,8 +2598,11 @@
20852598 * failing if the host doesn't support the necessary protocol
20862599 * level.
20872600 */
2088
-static int hv_pci_protocol_negotiation(struct hv_device *hdev)
2601
+static int hv_pci_protocol_negotiation(struct hv_device *hdev,
2602
+ enum pci_protocol_version_t version[],
2603
+ int num_version)
20892604 {
2605
+ struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
20902606 struct pci_version_request *version_req;
20912607 struct hv_pci_compl comp_pkt;
20922608 struct pci_packet *pkt;
....@@ -2109,8 +2625,8 @@
21092625 version_req = (struct pci_version_request *)&pkt->message;
21102626 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
21112627
2112
- for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) {
2113
- version_req->protocol_version = pci_protocol_versions[i];
2628
+ for (i = 0; i < num_version; i++) {
2629
+ version_req->protocol_version = version[i];
21142630 ret = vmbus_sendpacket(hdev->channel, version_req,
21152631 sizeof(struct pci_version_request),
21162632 (unsigned long)pkt, VM_PKT_DATA_INBAND,
....@@ -2126,10 +2642,10 @@
21262642 }
21272643
21282644 if (comp_pkt.completion_status >= 0) {
2129
- pci_protocol_version = pci_protocol_versions[i];
2645
+ hbus->protocol_version = version[i];
21302646 dev_info(&hdev->device,
21312647 "PCI VMBus probing: Using version %#x\n",
2132
- pci_protocol_version);
2648
+ hbus->protocol_version);
21332649 goto exit;
21342650 }
21352651
....@@ -2299,6 +2815,8 @@
22992815 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
23002816 }
23012817
2818
+static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs);
2819
+
23022820 /**
23032821 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
23042822 * @hdev: VMBus's tracking struct for this root PCI bus
....@@ -2311,8 +2829,10 @@
23112829 struct pci_bus_d0_entry *d0_entry;
23122830 struct hv_pci_compl comp_pkt;
23132831 struct pci_packet *pkt;
2832
+ bool retry = true;
23142833 int ret;
23152834
2835
+enter_d0_retry:
23162836 /*
23172837 * Tell the host that the bus is ready to use, and moved into the
23182838 * powered-on state. This includes telling the host which region
....@@ -2338,6 +2858,38 @@
23382858
23392859 if (ret)
23402860 goto exit;
2861
+
2862
+ /*
2863
+ * In certain cases (e.g. kdump) the PCI device of interest was
2864
+ * not cleanly shut down and its resources are still held on the
2865
+ * host side, so the host could return an invalid device status.
2866
+ * We need to explicitly request host to release the resource
2867
+ * and try to enter D0 again.
2868
+ */
2869
+ if (comp_pkt.completion_status < 0 && retry) {
2870
+ retry = false;
2871
+
2872
+ dev_err(&hdev->device, "Retrying D0 Entry\n");
2873
+
2874
+ /*
2875
+ * hv_pci_bus_exit() calls hv_send_resource_released()
2876
+ * to free up resources of its child devices.
2877
+ * In the kdump kernel we need to set the
2878
+ * wslot_res_allocated to 255 so it scans all child
2879
+ * devices to release resources allocated in the
2880
+ * normal kernel before panic happened.
2881
+ */
2882
+ hbus->wslot_res_allocated = 255;
2883
+
2884
+ ret = hv_pci_bus_exit(hdev, true);
2885
+
2886
+ if (ret == 0) {
2887
+ kfree(pkt);
2888
+ goto enter_d0_retry;
2889
+ }
2890
+ dev_err(&hdev->device,
2891
+ "Retrying D0 failed with ret %d\n", ret);
2892
+ }
23412893
23422894 if (comp_pkt.completion_status < 0) {
23432895 dev_err(&hdev->device,
....@@ -2381,6 +2933,24 @@
23812933 if (!ret)
23822934 ret = wait_for_response(hdev, &comp);
23832935
2936
+ /*
2937
+ * In the case of fast device addition/removal, it's possible that
2938
+ * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we
2939
+ * already got a PCI_BUS_RELATIONS* message from the host and the
2940
+ * channel callback already scheduled a work to hbus->wq, which can be
2941
+ * running pci_devices_present_work() -> survey_child_resources() ->
2942
+ * complete(&hbus->survey_event), even after hv_pci_query_relations()
2943
+ * exits and the stack variable 'comp' is no longer valid; as a result,
2944
+ * a hang or a page fault may happen when the complete() calls
2945
+ * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from
2946
+ * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is
2947
+ * -ENODEV, there can't be any more work item scheduled to hbus->wq
2948
+ * after the flush_workqueue(): see vmbus_onoffer_rescind() ->
2949
+ * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() ->
2950
+ * channel->rescind = true.
2951
+ */
2952
+ flush_workqueue(hbus->wq);
2953
+
23842954 return ret;
23852955 }
23862956
....@@ -2410,10 +2980,10 @@
24102980 struct hv_pci_dev *hpdev;
24112981 struct pci_packet *pkt;
24122982 size_t size_res;
2413
- u32 wslot;
2983
+ int wslot;
24142984 int ret;
24152985
2416
- size_res = (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2)
2986
+ size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2)
24172987 ? sizeof(*res_assigned) : sizeof(*res_assigned2);
24182988
24192989 pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
....@@ -2432,7 +3002,7 @@
24323002 pkt->completion_func = hv_pci_generic_compl;
24333003 pkt->compl_ctxt = &comp_pkt;
24343004
2435
- if (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2) {
3005
+ if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) {
24363006 res_assigned =
24373007 (struct pci_resources_assigned *)&pkt->message;
24383008 res_assigned->message_type.type =
....@@ -2463,6 +3033,8 @@
24633033 comp_pkt.completion_status);
24643034 break;
24653035 }
3036
+
3037
+ hbus->wslot_res_allocated = wslot;
24663038 }
24673039
24683040 kfree(pkt);
....@@ -2481,10 +3053,10 @@
24813053 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
24823054 struct pci_child_message pkt;
24833055 struct hv_pci_dev *hpdev;
2484
- u32 wslot;
3056
+ int wslot;
24853057 int ret;
24863058
2487
- for (wslot = 0; wslot < 256; wslot++) {
3059
+ for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) {
24883060 hpdev = get_pcichild_wslot(hbus, wslot);
24893061 if (!hpdev)
24903062 continue;
....@@ -2499,7 +3071,11 @@
24993071 VM_PKT_DATA_INBAND, 0);
25003072 if (ret)
25013073 return ret;
3074
+
3075
+ hbus->wslot_res_allocated = wslot - 1;
25023076 }
3077
+
3078
+ hbus->wslot_res_allocated = -1;
25033079
25043080 return 0;
25053081 }
....@@ -2515,6 +3091,48 @@
25153091 complete(&hbus->remove_event);
25163092 }
25173093
3094
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
3095
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
3096
+
3097
+/*
3098
+ * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
3099
+ * as invalid for passthrough PCI devices of this driver.
3100
+ */
3101
+#define HVPCI_DOM_INVALID 0
3102
+
3103
+/**
3104
+ * hv_get_dom_num() - Get a valid PCI domain number
3105
+ * Check if the requested PCI domain number is free, and if it is
3106
+ * already in use, find and return another free number.
3107
+ *
3108
+ * @dom: Requested domain number
3109
+ *
3110
+ * Return: domain number on success, HVPCI_DOM_INVALID on failure
3111
+ */
3112
+static u16 hv_get_dom_num(u16 dom)
3113
+{
3114
+ unsigned int i;
3115
+
3116
+ if (test_and_set_bit(dom, hvpci_dom_map) == 0)
3117
+ return dom;
3118
+
3119
+ for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
3120
+ if (test_and_set_bit(i, hvpci_dom_map) == 0)
3121
+ return i;
3122
+ }
3123
+
3124
+ return HVPCI_DOM_INVALID;
3125
+}
3126
+
3127
+/**
3128
+ * hv_put_dom_num() - Mark the PCI domain number as free
3129
+ * @dom: Domain number to be freed
3130
+ */
3131
+static void hv_put_dom_num(u16 dom)
3132
+{
3133
+ clear_bit(dom, hvpci_dom_map);
3134
+}
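A worked example of the domain assignment (GUID bytes made up): with dev_instance.b[4] == 0xd5 and b[5] == 0x06, hv_pci_probe() below requests dom_req == 0x06d5.

/* The first bus claims bit 0x06d5 in hvpci_dom_map; a second bus with
 * a colliding GUID slice falls back to the first clear bit instead,
 * and hv_put_dom_num() releases the bit again on remove.
 */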
3135
+
25183136 /**
25193137 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
25203138 * @hdev: VMBus's tracking struct for this root PCI bus
....@@ -2526,33 +3144,69 @@
25263144 const struct hv_vmbus_device_id *dev_id)
25273145 {
25283146 struct hv_pcibus_device *hbus;
3147
+ u16 dom_req, dom;
3148
+ char *name;
25293149 int ret;
25303150
25313151 /*
25323152 * hv_pcibus_device contains the hypercall arguments for retargeting in
25333153 * hv_irq_unmask(). Those must not cross a page boundary.
25343154 */
2535
- BUILD_BUG_ON(sizeof(*hbus) > PAGE_SIZE);
3155
+ BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE);
25363156
2537
- hbus = (struct hv_pcibus_device *)get_zeroed_page(GFP_KERNEL);
3157
+ /*
3158
+ * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural
3159
+ * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate
3160
+ * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and
3161
+ * alignment of hbus is important because hbus's field
3162
+ * retarget_msi_interrupt_params must not cross a 4KB page boundary.
3163
+ *
3164
+ * Here we prefer kzalloc to get_zeroed_page(), because a buffer
3165
+ * allocated by the latter is not tracked and scanned by kmemleak, and
3166
+ * hence kmemleak reports the pointer contained in the hbus buffer
3167
+ * (i.e. the hpdev struct, which is created in new_pcichild_device() and
3168
+ * is tracked by hbus->children) as memory leak (false positive).
3169
+ *
3170
+ * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be
3171
+ * used to allocate the hbus buffer and we can avoid the kmemleak false
3172
+ * positive by using kmemleak_alloc() and kmemleak_free() to ask
3173
+ * kmemleak to track and scan the hbus buffer.
3174
+ */
3175
+ hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
25383176 if (!hbus)
25393177 return -ENOMEM;
25403178 hbus->state = hv_pcibus_init;
3179
+ hbus->wslot_res_allocated = -1;
25413180
25423181 /*
2543
- * The PCI bus "domain" is what is called "segment" in ACPI and
2544
- * other specs. Pull it from the instance ID, to get something
2545
- * unique. Bytes 8 and 9 are what is used in Windows guests, so
2546
- * do the same thing for consistency. Note that, since this code
2547
- * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
2548
- * that (1) the only domain in use for something that looks like
2549
- * a physical PCI bus (which is actually emulated by the
2550
- * hypervisor) is domain 0 and (2) there will be no overlap
2551
- * between domains derived from these instance IDs in the same
2552
- * VM.
3182
+ * The PCI bus "domain" is what is called "segment" in ACPI and other
3183
+ * specs. Pull it from the instance ID, to get something usually
3184
+ * unique. In rare cases of collision, we will find another number
3185
+ * that is not in use.
3186
+ *
3187
+ * Note that, since this code only runs in a Hyper-V VM, Hyper-V
3188
+ * together with this guest driver can guarantee that (1) The only
3189
+ * domain used by Gen1 VMs for something that looks like a physical
3190
+ * PCI bus (which is actually emulated by the hypervisor) is domain 0.
3191
+ * (2) There will be no overlap between domains (after fixing possible
3192
+ * collisions) in the same VM.
25533193 */
2554
- hbus->sysdata.domain = hdev->dev_instance.b[9] |
2555
- hdev->dev_instance.b[8] << 8;
3194
+ dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
3195
+ dom = hv_get_dom_num(dom_req);
3196
+
3197
+ if (dom == HVPCI_DOM_INVALID) {
3198
+ dev_err(&hdev->device,
3199
+ "Unable to use dom# 0x%hx or other numbers", dom_req);
3200
+ ret = -EINVAL;
3201
+ goto free_bus;
3202
+ }
3203
+
3204
+ if (dom != dom_req)
3205
+ dev_info(&hdev->device,
3206
+ "PCI dom# 0x%hx has collision, using 0x%hx",
3207
+ dom_req, dom);
3208
+
3209
+ hbus->sysdata.domain = dom;
25563210
25573211 hbus->hdev = hdev;
25583212 refcount_set(&hbus->remove_lock, 1);
....@@ -2567,7 +3221,7 @@
25673221 hbus->sysdata.domain);
25683222 if (!hbus->wq) {
25693223 ret = -ENOMEM;
2570
- goto free_bus;
3224
+ goto free_dom;
25713225 }
25723226
25733227 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
....@@ -2577,7 +3231,8 @@
25773231
25783232 hv_set_drvdata(hdev, hbus);
25793233
2580
- ret = hv_pci_protocol_negotiation(hdev);
3234
+ ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions,
3235
+ ARRAY_SIZE(pci_protocol_versions));
25813236 if (ret)
25823237 goto close;
25833238
....@@ -2594,7 +3249,14 @@
25943249 goto free_config;
25953250 }
25963251
2597
- hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus);
3252
+ name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance);
3253
+ if (!name) {
3254
+ ret = -ENOMEM;
3255
+ goto unmap;
3256
+ }
3257
+
3258
+ hbus->sysdata.fwnode = irq_domain_alloc_named_fwnode(name);
3259
+ kfree(name);
25983260 if (!hbus->sysdata.fwnode) {
25993261 ret = -ENOMEM;
26003262 goto unmap;
....@@ -2614,7 +3276,7 @@
26143276
26153277 ret = hv_pci_allocate_bridge_windows(hbus);
26163278 if (ret)
2617
- goto free_irq_domain;
3279
+ goto exit_d0;
26183280
26193281 ret = hv_send_resources_allocated(hdev);
26203282 if (ret)
....@@ -2632,6 +3294,8 @@
26323294
26333295 free_windows:
26343296 hv_pci_free_bridge_windows(hbus);
3297
+exit_d0:
3298
+ (void) hv_pci_bus_exit(hdev, true);
26353299 free_irq_domain:
26363300 irq_domain_remove(hbus->irq_domain);
26373301 free_fwnode:
....@@ -2644,20 +3308,23 @@
26443308 vmbus_close(hdev->channel);
26453309 destroy_wq:
26463310 destroy_workqueue(hbus->wq);
3311
+free_dom:
3312
+ hv_put_dom_num(hbus->sysdata.domain);
26473313 free_bus:
2648
- free_page((unsigned long)hbus);
3314
+ kfree(hbus);
26493315 return ret;
26503316 }
26513317
2652
-static void hv_pci_bus_exit(struct hv_device *hdev)
3318
+static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs)
26533319 {
26543320 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
26553321 struct {
26563322 struct pci_packet teardown_packet;
26573323 u8 buffer[sizeof(struct pci_message)];
26583324 } pkt;
2659
- struct pci_bus_relations relations;
26603325 struct hv_pci_compl comp_pkt;
3326
+ struct hv_pci_dev *hpdev, *tmp;
3327
+ unsigned long flags;
26613328 int ret;
26623329
26633330 /*
....@@ -2665,16 +3332,35 @@
26653332 * access the per-channel ringbuffer any longer.
26663333 */
26673334 if (hdev->channel->rescind)
2668
- return;
3335
+ return 0;
26693336
2670
- /* Delete any children which might still exist. */
2671
- memset(&relations, 0, sizeof(relations));
2672
- hv_pci_devices_present(hbus, &relations);
3337
+ if (!keep_devs) {
3338
+ struct list_head removed;
3339
+
3340
+ /* Move present children to a local list; pci_destroy_slot() below can sleep, so it must not run under device_list_lock */
3341
+ INIT_LIST_HEAD(&removed);
3342
+ spin_lock_irqsave(&hbus->device_list_lock, flags);
3343
+ list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry)
3344
+ list_move_tail(&hpdev->list_entry, &removed);
3345
+ spin_unlock_irqrestore(&hbus->device_list_lock, flags);
3346
+
3347
+ /* Remove all children in the list */
3348
+ list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) {
3349
+ list_del(&hpdev->list_entry);
3350
+ if (hpdev->pci_slot)
3351
+ pci_destroy_slot(hpdev->pci_slot);
3352
+ /* Drop the two refs taken in new_pcichild_device() */
3353
+ put_pcichild(hpdev);
3354
+ put_pcichild(hpdev);
3355
+ }
3356
+ }
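/*
 * A sketch of the refcounting relied on above (put_pcichild() is
 * defined earlier in the file; hpdev->refs is assumed to be a
 * refcount_t): new_pcichild_device() takes one reference for the
 * caller and one for hbus->children, hence the two put_pcichild()
 * calls per removed device.
 */
static void put_pcichild(struct hv_pci_dev *hpdev)
{
	/* Free the device once the last reference is dropped. */
	if (refcount_dec_and_test(&hpdev->refs))
		kfree(hpdev);
}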
26733357
26743358 ret = hv_send_resources_released(hdev);
2675
- if (ret)
3359
+ if (ret) {
26763360 dev_err(&hdev->device,
26773361 "Couldn't send resources released packet(s)\n");
3362
+ return ret;
3363
+ }
26783364
26793365 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
26803366 init_completion(&comp_pkt.host_event);
....@@ -2687,8 +3373,13 @@
26873373 (unsigned long)&pkt.teardown_packet,
26883374 VM_PKT_DATA_INBAND,
26893375 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2690
- if (!ret)
2691
- wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ);
3376
+ if (ret)
3377
+ return ret;
3378
+
3379
+ if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0)
3380
+ return -ETIMEDOUT;
3381
+
3382
+ return 0;
26923383 }
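/*
 * Note the new keep_devs semantics: hv_pci_remove() passes false so
 * the child devices are torn down here, while hv_pci_suspend() and
 * the hv_pci_probe() error path pass true so any children are left
 * in place (for suspend, they must survive until resume).
 */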
26933384
26943385 /**
....@@ -2700,19 +3391,30 @@
27003391 static int hv_pci_remove(struct hv_device *hdev)
27013392 {
27023393 struct hv_pcibus_device *hbus;
3394
+ int ret;
27033395
27043396 hbus = hv_get_drvdata(hdev);
27053397 if (hbus->state == hv_pcibus_installed) {
3398
+ tasklet_disable(&hdev->channel->callback_event);
3399
+ hbus->state = hv_pcibus_removing;
3400
+ tasklet_enable(&hdev->channel->callback_event);
3401
+ destroy_workqueue(hbus->wq);
3402
+ hbus->wq = NULL;
3403
+ /*
3404
+ * At this point, no work is running or can be scheduled
3405
+ * on hbus->wq. We can't race with hv_pci_devices_present()
3406
+ * or hv_pci_eject_device(), so it's safe to proceed.
3407
+ */
3408
+
27063409 /* Remove the bus from PCI's point of view. */
27073410 pci_lock_rescan_remove();
27083411 pci_stop_root_bus(hbus->pci_bus);
27093412 hv_pci_remove_slots(hbus);
27103413 pci_remove_root_bus(hbus->pci_bus);
27113414 pci_unlock_rescan_remove();
2712
- hbus->state = hv_pcibus_removed;
27133415 }
27143416
2715
- hv_pci_bus_exit(hdev);
3417
+ ret = hv_pci_bus_exit(hdev, false);
27163418
27173419 vmbus_close(hdev->channel);
27183420
....@@ -2724,9 +3426,128 @@
27243426 irq_domain_free_fwnode(hbus->sysdata.fwnode);
27253427 put_hvpcibus(hbus);
27263428 wait_for_completion(&hbus->remove_event);
2727
- destroy_workqueue(hbus->wq);
2728
- free_page((unsigned long)hbus);
3429
+
3430
+ hv_put_dom_num(hbus->sysdata.domain);
3431
+
3432
+ kfree(hbus);
3433
+ return ret;
3434
+}
3435
+
3436
+static int hv_pci_suspend(struct hv_device *hdev)
3437
+{
3438
+ struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3439
+ enum hv_pcibus_state old_state;
3440
+ int ret;
3441
+
3442
+ /*
3443
+ * hv_pci_suspend() must make sure there are no pending work items
3444
+ * before calling vmbus_close(), since it runs in a process context
3445
+ * as a callback in dpm_suspend(). When it starts to run, the channel
3446
+ * callback hv_pci_onchannelcallback(), which runs in a tasklet
3447
+ * context, can be still running concurrently and scheduling new work
3448
+ * items onto hbus->wq in hv_pci_devices_present() and
3449
+ * hv_pci_eject_device(), and the work item handlers can access the
3450
+ * vmbus channel, which can be being closed by hv_pci_suspend(), e.g.
3451
+ * the work item handler pci_devices_present_work() ->
3452
+ * new_pcichild_device() writes to the vmbus channel.
3453
+ *
3454
+ * To eliminate the race, hv_pci_suspend() disables the channel
3455
+ * callback tasklet, sets hbus->state to hv_pcibus_removing, and
3456
+ * re-enables the tasklet. This way, when hv_pci_suspend() proceeds,
3457
+ * it knows that no new work item can be scheduled, and then it flushes
3458
+ * hbus->wq and safely closes the vmbus channel.
3459
+ */
3460
+ tasklet_disable(&hdev->channel->callback_event);
3461
+
3462
+ /* Change the hbus state to prevent new work items. */
3463
+ old_state = hbus->state;
3464
+ if (hbus->state == hv_pcibus_installed)
3465
+ hbus->state = hv_pcibus_removing;
3466
+
3467
+ tasklet_enable(&hdev->channel->callback_event);
3468
+
3469
+ if (old_state != hv_pcibus_installed)
3470
+ return -EINVAL;
3471
+
3472
+ flush_workqueue(hbus->wq);
3473
+
3474
+ ret = hv_pci_bus_exit(hdev, true);
3475
+ if (ret)
3476
+ return ret;
3477
+
3478
+ vmbus_close(hdev->channel);
3479
+
27293480 return 0;
3481
+}
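/*
 * Unlike hv_pci_remove(), which destroys hbus->wq outright, the
 * suspend path only flushes it: the workqueue (and the rest of hbus)
 * must stay allocated so hv_pci_resume() can reuse them after
 * hibernation.
 */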
3482
+
3483
+static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg)
3484
+{
3485
+ struct msi_desc *entry;
3486
+ struct irq_data *irq_data;
3487
+
3488
+ for_each_pci_msi_entry(entry, pdev) {
3489
+ irq_data = irq_get_irq_data(entry->irq);
3490
+ if (WARN_ON_ONCE(!irq_data))
3491
+ return -EINVAL;
3492
+
3493
+ hv_compose_msi_msg(irq_data, &entry->msg);
3494
+ }
3495
+
3496
+ return 0;
3497
+}
3498
+
3499
+/*
3500
+ * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg()
3501
+ * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V
3502
+ * doesn't trap and emulate the MMIO accesses, hv_compose_msi_msg()
3503
+ * must be used here to ask Hyper-V to re-create the IOMMU Interrupt
3504
+ * Remapping Table entries.
3505
+ */
3506
+static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus)
3507
+{
3508
+ pci_walk_bus(hbus->pci_bus, hv_pci_restore_msi_msg, NULL);
3509
+}
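/*
 * pci_walk_bus() calls hv_pci_restore_msi_msg() for every device on
 * hbus->pci_bus and its subordinate buses; a non-zero return from the
 * callback stops the walk early, which is why the helper returns
 * -EINVAL when an MSI entry has no irq_data.
 */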
3510
+
3511
+static int hv_pci_resume(struct hv_device *hdev)
3512
+{
3513
+ struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3514
+ enum pci_protocol_version_t version[1];
3515
+ int ret;
3516
+
3517
+ hbus->state = hv_pcibus_init;
3518
+
3519
+ ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
3520
+ hv_pci_onchannelcallback, hbus);
3521
+ if (ret)
3522
+ return ret;
3523
+
3524
+ /* Only use the version that was in use before hibernation. */
3525
+ version[0] = hbus->protocol_version;
3526
+ ret = hv_pci_protocol_negotiation(hdev, version, 1);
3527
+ if (ret)
3528
+ goto out;
3529
+
3530
+ ret = hv_pci_query_relations(hdev);
3531
+ if (ret)
3532
+ goto out;
3533
+
3534
+ ret = hv_pci_enter_d0(hdev);
3535
+ if (ret)
3536
+ goto out;
3537
+
3538
+ ret = hv_send_resources_allocated(hdev);
3539
+ if (ret)
3540
+ goto out;
3541
+
3542
+ prepopulate_bars(hbus);
3543
+
3544
+ hv_pci_restore_msi_state(hbus);
3545
+
3546
+ hbus->state = hv_pcibus_installed;
3547
+ return 0;
3548
+out:
3549
+ vmbus_close(hdev->channel);
3550
+ return ret;
27303551 }
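/*
 * The resume path replays the bring-up sequence of hv_pci_probe()
 * (negotiate, query relations, enter D0, report resources, probe the
 * BARs) against the hbus that survived hibernation, then rebuilds the
 * MSI/MSI-X state the host has lost.
 */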
27313552
27323553 static const struct hv_vmbus_device_id hv_pci_id_table[] = {
....@@ -2743,15 +3564,32 @@
27433564 .id_table = hv_pci_id_table,
27443565 .probe = hv_pci_probe,
27453566 .remove = hv_pci_remove,
3567
+ .suspend = hv_pci_suspend,
3568
+ .resume = hv_pci_resume,
27463569 };
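/*
 * The suspend/resume callbacks registered above are invoked by the
 * VMBus driver framework on the hibernation (Suspend-to-Disk) path,
 * as the "before hibernation" comment in hv_pci_resume() indicates.
 */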
27473570
27483571 static void __exit exit_hv_pci_drv(void)
27493572 {
27503573 vmbus_driver_unregister(&hv_pci_drv);
3574
+
3575
+ hvpci_block_ops.read_block = NULL;
3576
+ hvpci_block_ops.write_block = NULL;
3577
+ hvpci_block_ops.reg_blk_invalidate = NULL;
27513578 }
27523579
27533580 static int __init init_hv_pci_drv(void)
27543581 {
3582
+ if (!hv_is_hyperv_initialized())
3583
+ return -ENODEV;
3584
+
3585
+ /* Set the invalid domain number's bit, so it will not be used */
3586
+ set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
3587
+
3588
+ /* Initialize PCI block r/w interface */
3589
+ hvpci_block_ops.read_block = hv_read_config_block;
3590
+ hvpci_block_ops.write_block = hv_write_config_block;
3591
+ hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate;
3592
+
27553593 return vmbus_driver_register(&hv_pci_drv);
27563594 }
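/*
 * For reference, the declarations init_hv_pci_drv() relies on, as
 * presumably defined near the top of the file (the exact map size is
 * an assumption; HVPCI_DOM_INVALID == 0 matches the reservation of
 * domain 0 described in hv_pci_probe()):
 */
#define HVPCI_DOM_INVALID  0
#define HVPCI_DOM_MAP_SIZE (64 * 1024)	/* one bit per 16-bit PCI domain */
static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);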
27573595