.. | .. |
---|
63 | 63 | enum pci_protocol_version_t { |
---|
64 | 64 | PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */ |
---|
65 | 65 | PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */ |
---|
| 66 | + PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3), /* Vibranium */ |
---|
66 | 67 | }; |
---|
67 | 68 | |
---|
/*
 * All-ones unsigned long long value (-1 converted to ULL). The expansion is
 * parenthesized so the unary minus cannot combine with neighbouring tokens
 * at the point of use.
 */
#define CPU_AFFINITY_ALL	(-1ULL)
---|
.. | .. |
---|
72 | 73 | * first. |
---|
73 | 74 | */ |
---|
74 | 75 | static enum pci_protocol_version_t pci_protocol_versions[] = { |
---|
| 76 | + PCI_PROTOCOL_VERSION_1_3, |
---|
75 | 77 | PCI_PROTOCOL_VERSION_1_2, |
---|
76 | 78 | PCI_PROTOCOL_VERSION_1_1, |
---|
77 | 79 | }; |
---|
78 | | - |
---|
79 | | -/* |
---|
80 | | - * Protocol version negotiated by hv_pci_protocol_negotiation(). |
---|
81 | | - */ |
---|
82 | | -static enum pci_protocol_version_t pci_protocol_version; |
---|
83 | 80 | |
---|
84 | 81 | #define PCI_CONFIG_MMIO_LENGTH 0x2000 |
---|
85 | 82 | #define CFG_PAGE_OFFSET 0x1000 |
---|
.. | .. |
---|
124 | 121 | PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16, |
---|
125 | 122 | PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17, |
---|
126 | 123 | PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */ |
---|
| 124 | + PCI_BUS_RELATIONS2 = PCI_MESSAGE_BASE + 0x19, |
---|
127 | 125 | PCI_MESSAGE_MAXIMUM |
---|
128 | 126 | }; |
---|
129 | 127 | |
---|
.. | .. |
---|
167 | 165 | u32 subsystem_id; |
---|
168 | 166 | union win_slot_encoding win_slot; |
---|
169 | 167 | u32 ser; /* serial number */ |
---|
| 168 | +} __packed; |
---|
| 169 | + |
---|
/*
 * Per-device flag bits.
 * NOTE(review): presumably these are the values carried in
 * pci_function_description2.flags, with NUMA_AFFINITY marking
 * virtual_numa_node as meaningful - confirm at the point of use.
 */
enum pci_device_description_flags {
	HV_PCI_DEVICE_FLAG_NONE			= 0x0,
	HV_PCI_DEVICE_FLAG_NUMA_AFFINITY	= 0x1,
};
---|
| 174 | + |
---|
| 175 | +struct pci_function_description2 { |
---|
| 176 | + u16 v_id; /* vendor ID */ |
---|
| 177 | + u16 d_id; /* device ID */ |
---|
| 178 | + u8 rev; |
---|
| 179 | + u8 prog_intf; |
---|
| 180 | + u8 subclass; |
---|
| 181 | + u8 base_class; |
---|
| 182 | + u32 subsystem_id; |
---|
| 183 | + union win_slot_encoding win_slot; |
---|
| 184 | + u32 ser; /* serial number */ |
---|
| 185 | + u32 flags; |
---|
| 186 | + u16 virtual_numa_node; |
---|
| 187 | + u16 reserved; |
---|
170 | 188 | } __packed; |
---|
171 | 189 | |
---|
172 | 190 | /** |
---|
.. | .. |
---|
265 | 283 | int resp_packet_size); |
---|
266 | 284 | void *compl_ctxt; |
---|
267 | 285 | |
---|
268 | | - struct pci_message message[0]; |
---|
| 286 | + struct pci_message message[]; |
---|
269 | 287 | }; |
---|
270 | 288 | |
---|
271 | 289 | /* |
---|
.. | .. |
---|
301 | 319 | struct pci_bus_relations { |
---|
302 | 320 | struct pci_incoming_message incoming; |
---|
303 | 321 | u32 device_count; |
---|
304 | | - struct pci_function_description func[0]; |
---|
| 322 | + struct pci_function_description func[]; |
---|
| 323 | +} __packed; |
---|
| 324 | + |
---|
| 325 | +struct pci_bus_relations2 { |
---|
| 326 | + struct pci_incoming_message incoming; |
---|
| 327 | + u32 device_count; |
---|
| 328 | + struct pci_function_description2 func[]; |
---|
305 | 329 | } __packed; |
---|
306 | 330 | |
---|
307 | 331 | struct pci_q_res_req_response { |
---|
308 | 332 | struct vmpacket_descriptor hdr; |
---|
309 | 333 | s32 status; /* negative values are failures */ |
---|
310 | | - u32 probed_bar[6]; |
---|
| 334 | + u32 probed_bar[PCI_STD_NUM_BARS]; |
---|
311 | 335 | } __packed; |
---|
312 | 336 | |
---|
313 | 337 | struct pci_set_power { |
---|
.. | .. |
---|
365 | 389 | struct tran_int_desc int_desc; |
---|
366 | 390 | } __packed; |
---|
367 | 391 | |
---|
| 392 | +/* |
---|
| 393 | + * Note: the VM must pass a valid block id, wslot and bytes_requested. |
---|
| 394 | + */ |
---|
| 395 | +struct pci_read_block { |
---|
| 396 | + struct pci_message message_type; |
---|
| 397 | + u32 block_id; |
---|
| 398 | + union win_slot_encoding wslot; |
---|
| 399 | + u32 bytes_requested; |
---|
| 400 | +} __packed; |
---|
| 401 | + |
---|
| 402 | +struct pci_read_block_response { |
---|
| 403 | + struct vmpacket_descriptor hdr; |
---|
| 404 | + u32 status; |
---|
| 405 | + u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; |
---|
| 406 | +} __packed; |
---|
| 407 | + |
---|
| 408 | +/* |
---|
| 409 | + * Note: the VM must pass a valid block id, wslot and byte_count. |
---|
| 410 | + */ |
---|
| 411 | +struct pci_write_block { |
---|
| 412 | + struct pci_message message_type; |
---|
| 413 | + u32 block_id; |
---|
| 414 | + union win_slot_encoding wslot; |
---|
| 415 | + u32 byte_count; |
---|
| 416 | + u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; |
---|
| 417 | +} __packed; |
---|
| 418 | + |
---|
| 419 | +struct pci_dev_inval_block { |
---|
| 420 | + struct pci_incoming_message incoming; |
---|
| 421 | + union win_slot_encoding wslot; |
---|
| 422 | + u64 block_mask; |
---|
| 423 | +} __packed; |
---|
| 424 | + |
---|
368 | 425 | struct pci_dev_incoming { |
---|
369 | 426 | struct pci_incoming_message incoming; |
---|
370 | 427 | union win_slot_encoding wslot; |
---|
.. | .. |
---|
379 | 436 | static int pci_ring_size = (4 * PAGE_SIZE); |
---|
380 | 437 | |
---|
381 | 438 | /* |
---|
382 | | - * Definitions or interrupt steering hypercall. |
---|
383 | | - */ |
---|
384 | | -#define HV_PARTITION_ID_SELF ((u64)-1) |
---|
385 | | -#define HVCALL_RETARGET_INTERRUPT 0x7e |
---|
386 | | - |
---|
387 | | -struct hv_interrupt_entry { |
---|
388 | | - u32 source; /* 1 for MSI(-X) */ |
---|
389 | | - u32 reserved1; |
---|
390 | | - u32 address; |
---|
391 | | - u32 data; |
---|
392 | | -}; |
---|
393 | | - |
---|
394 | | -#define HV_VP_SET_BANK_COUNT_MAX 5 /* current implementation limit */ |
---|
395 | | - |
---|
396 | | -struct hv_vp_set { |
---|
397 | | - u64 format; /* 0 (HvGenericSetSparse4k) */ |
---|
398 | | - u64 valid_banks; |
---|
399 | | - u64 masks[HV_VP_SET_BANK_COUNT_MAX]; |
---|
400 | | -}; |
---|
401 | | - |
---|
402 | | -/* |
---|
403 | | - * flags for hv_device_interrupt_target.flags |
---|
404 | | - */ |
---|
405 | | -#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 |
---|
406 | | -#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 |
---|
407 | | - |
---|
408 | | -struct hv_device_interrupt_target { |
---|
409 | | - u32 vector; |
---|
410 | | - u32 flags; |
---|
411 | | - union { |
---|
412 | | - u64 vp_mask; |
---|
413 | | - struct hv_vp_set vp_set; |
---|
414 | | - }; |
---|
415 | | -}; |
---|
416 | | - |
---|
417 | | -struct retarget_msi_interrupt { |
---|
418 | | - u64 partition_id; /* use "self" */ |
---|
419 | | - u64 device_id; |
---|
420 | | - struct hv_interrupt_entry int_entry; |
---|
421 | | - u64 reserved2; |
---|
422 | | - struct hv_device_interrupt_target int_target; |
---|
423 | | -} __packed; |
---|
424 | | - |
---|
425 | | -/* |
---|
426 | 439 | * Driver specific state. |
---|
427 | 440 | */ |
---|
428 | 441 | |
---|
.. | .. |
---|
430 | 443 | hv_pcibus_init = 0, |
---|
431 | 444 | hv_pcibus_probed, |
---|
432 | 445 | hv_pcibus_installed, |
---|
433 | | - hv_pcibus_removed, |
---|
| 446 | + hv_pcibus_removing, |
---|
434 | 447 | hv_pcibus_maximum |
---|
435 | 448 | }; |
---|
436 | 449 | |
---|
437 | 450 | struct hv_pcibus_device { |
---|
438 | 451 | struct pci_sysdata sysdata; |
---|
| 452 | + /* Protocol version negotiated with the host */ |
---|
| 453 | + enum pci_protocol_version_t protocol_version; |
---|
439 | 454 | enum hv_pcibus_state state; |
---|
440 | 455 | refcount_t remove_lock; |
---|
441 | 456 | struct hv_device *hdev; |
---|
.. | .. |
---|
460 | 475 | struct msi_controller msi_chip; |
---|
461 | 476 | struct irq_domain *irq_domain; |
---|
462 | 477 | |
---|
463 | | - /* hypercall arg, must not cross page boundary */ |
---|
464 | | - struct retarget_msi_interrupt retarget_msi_interrupt_params; |
---|
465 | | - |
---|
466 | 478 | spinlock_t retarget_msi_interrupt_lock; |
---|
467 | 479 | |
---|
468 | 480 | struct workqueue_struct *wq; |
---|
| 481 | + |
---|
| 482 | + /* Highest slot of child device with resources allocated */ |
---|
| 483 | + int wslot_res_allocated; |
---|
| 484 | + |
---|
| 485 | + /* hypercall arg, must not cross page boundary */ |
---|
| 486 | + struct hv_retarget_device_interrupt retarget_msi_interrupt_params; |
---|
| 487 | + |
---|
| 488 | + /* |
---|
| 489 | + * Don't put anything here: retarget_msi_interrupt_params must be last |
---|
| 490 | + */ |
---|
469 | 491 | }; |
---|
470 | 492 | |
---|
471 | 493 | /* |
---|
.. | .. |
---|
478 | 500 | struct hv_pcibus_device *bus; |
---|
479 | 501 | }; |
---|
480 | 502 | |
---|
| 503 | +struct hv_pcidev_description { |
---|
| 504 | + u16 v_id; /* vendor ID */ |
---|
| 505 | + u16 d_id; /* device ID */ |
---|
| 506 | + u8 rev; |
---|
| 507 | + u8 prog_intf; |
---|
| 508 | + u8 subclass; |
---|
| 509 | + u8 base_class; |
---|
| 510 | + u32 subsystem_id; |
---|
| 511 | + union win_slot_encoding win_slot; |
---|
| 512 | + u32 ser; /* serial number */ |
---|
| 513 | + u32 flags; |
---|
| 514 | + u16 virtual_numa_node; |
---|
| 515 | +}; |
---|
| 516 | + |
---|
481 | 517 | struct hv_dr_state { |
---|
482 | 518 | struct list_head list_entry; |
---|
483 | 519 | u32 device_count; |
---|
484 | | - struct pci_function_description func[0]; |
---|
| 520 | + struct hv_pcidev_description func[]; |
---|
485 | 521 | }; |
---|
486 | 522 | |
---|
487 | 523 | enum hv_pcichild_state { |
---|
.. | .. |
---|
498 | 534 | refcount_t refs; |
---|
499 | 535 | enum hv_pcichild_state state; |
---|
500 | 536 | struct pci_slot *pci_slot; |
---|
501 | | - struct pci_function_description desc; |
---|
| 537 | + struct hv_pcidev_description desc; |
---|
502 | 538 | bool reported_missing; |
---|
503 | 539 | struct hv_pcibus_device *hbus; |
---|
504 | 540 | struct work_struct wrk; |
---|
| 541 | + |
---|
| 542 | + void (*block_invalidate)(void *context, u64 block_mask); |
---|
| 543 | + void *invalidate_context; |
---|
505 | 544 | |
---|
506 | 545 | /* |
---|
507 | 546 | * What would be observed if one wrote 0xFFFFFFFF to a BAR and then |
---|
508 | 547 | * read it back, for each of the BAR offsets within config space. |
---|
509 | 548 | */ |
---|
510 | | - u32 probed_bar[6]; |
---|
| 549 | + u32 probed_bar[PCI_STD_NUM_BARS]; |
---|
511 | 550 | }; |
---|
512 | 551 | |
---|
513 | 552 | struct hv_pci_compl { |
---|
.. | .. |
---|
821 | 860 | .write = hv_pcifront_write_config, |
---|
822 | 861 | }; |
---|
823 | 862 | |
---|
| 863 | +/* |
---|
| 864 | + * Paravirtual backchannel |
---|
| 865 | + * |
---|
| 866 | + * Hyper-V SR-IOV provides a backchannel mechanism in software for |
---|
| 867 | + * communication between a VF driver and a PF driver. These |
---|
| 868 | + * "configuration blocks" are similar in concept to PCI configuration space, |
---|
| 869 | + * but instead of doing reads and writes in 32-bit chunks through a very slow |
---|
| 870 | + * path, packets of up to 128 bytes can be sent or received asynchronously. |
---|
| 871 | + * |
---|
| 872 | + * Nearly every SR-IOV device contains just such a communications channel in |
---|
| 873 | + * hardware, so using this one in software is usually optional. Using the |
---|
| 874 | + * software channel, however, allows driver implementers to leverage software |
---|
| 875 | + * tools that fuzz the communications channel looking for vulnerabilities. |
---|
| 876 | + * |
---|
| 877 | + * The usage model for these packets puts the responsibility for reading or |
---|
| 878 | + * writing on the VF driver. The VF driver sends a read or a write packet, |
---|
| 879 | + * indicating which "block" is being referred to by number. |
---|
| 880 | + * |
---|
| 881 | + * If the PF driver wishes to initiate communication, it can "invalidate" one or |
---|
| 882 | + * more of the first 64 blocks. This invalidation is delivered via a callback |
---|
| 883 | + * supplied to this driver by the VF driver. |
---|
| 884 | + * |
---|
| 885 | + * No protocol is implied, except that supplied by the PF and VF drivers. |
---|
| 886 | + */ |
---|
| 887 | + |
---|
| 888 | +struct hv_read_config_compl { |
---|
| 889 | + struct hv_pci_compl comp_pkt; |
---|
| 890 | + void *buf; |
---|
| 891 | + unsigned int len; |
---|
| 892 | + unsigned int bytes_returned; |
---|
| 893 | +}; |
---|
| 894 | + |
---|
| 895 | +/** |
---|
| 896 | + * hv_pci_read_config_compl() - Invoked when a response packet |
---|
| 897 | + * for a read config block operation arrives. |
---|
| 898 | + * @context: Identifies the read config operation |
---|
| 899 | + * @resp: The response packet itself |
---|
| 900 | + * @resp_packet_size: Size in bytes of the response packet |
---|
| 901 | + */ |
---|
| 902 | +static void hv_pci_read_config_compl(void *context, struct pci_response *resp, |
---|
| 903 | + int resp_packet_size) |
---|
| 904 | +{ |
---|
| 905 | + struct hv_read_config_compl *comp = context; |
---|
| 906 | + struct pci_read_block_response *read_resp = |
---|
| 907 | + (struct pci_read_block_response *)resp; |
---|
| 908 | + unsigned int data_len, hdr_len; |
---|
| 909 | + |
---|
| 910 | + hdr_len = offsetof(struct pci_read_block_response, bytes); |
---|
| 911 | + if (resp_packet_size < hdr_len) { |
---|
| 912 | + comp->comp_pkt.completion_status = -1; |
---|
| 913 | + goto out; |
---|
| 914 | + } |
---|
| 915 | + |
---|
| 916 | + data_len = resp_packet_size - hdr_len; |
---|
| 917 | + if (data_len > 0 && read_resp->status == 0) { |
---|
| 918 | + comp->bytes_returned = min(comp->len, data_len); |
---|
| 919 | + memcpy(comp->buf, read_resp->bytes, comp->bytes_returned); |
---|
| 920 | + } else { |
---|
| 921 | + comp->bytes_returned = 0; |
---|
| 922 | + } |
---|
| 923 | + |
---|
| 924 | + comp->comp_pkt.completion_status = read_resp->status; |
---|
| 925 | +out: |
---|
| 926 | + complete(&comp->comp_pkt.host_event); |
---|
| 927 | +} |
---|
| 928 | + |
---|
| 929 | +/** |
---|
| 930 | + * hv_read_config_block() - Sends a read config block request to |
---|
| 931 | + * the back-end driver running in the Hyper-V parent partition. |
---|
| 932 | + * @pdev: The PCI driver's representation for this device. |
---|
| 933 | + * @buf: Buffer into which the config block will be copied. |
---|
| 934 | + * @len: Size in bytes of buf. |
---|
| 935 | + * @block_id: Identifies the config block which has been requested. |
---|
| 936 | + * @bytes_returned: Size which came back from the back-end driver. |
---|
| 937 | + * |
---|
| 938 | + * Return: 0 on success, -errno on failure |
---|
| 939 | + */ |
---|
| 940 | +static int hv_read_config_block(struct pci_dev *pdev, void *buf, |
---|
| 941 | + unsigned int len, unsigned int block_id, |
---|
| 942 | + unsigned int *bytes_returned) |
---|
| 943 | +{ |
---|
| 944 | + struct hv_pcibus_device *hbus = |
---|
| 945 | + container_of(pdev->bus->sysdata, struct hv_pcibus_device, |
---|
| 946 | + sysdata); |
---|
| 947 | + struct { |
---|
| 948 | + struct pci_packet pkt; |
---|
| 949 | + char buf[sizeof(struct pci_read_block)]; |
---|
| 950 | + } pkt; |
---|
| 951 | + struct hv_read_config_compl comp_pkt; |
---|
| 952 | + struct pci_read_block *read_blk; |
---|
| 953 | + int ret; |
---|
| 954 | + |
---|
| 955 | + if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) |
---|
| 956 | + return -EINVAL; |
---|
| 957 | + |
---|
| 958 | + init_completion(&comp_pkt.comp_pkt.host_event); |
---|
| 959 | + comp_pkt.buf = buf; |
---|
| 960 | + comp_pkt.len = len; |
---|
| 961 | + |
---|
| 962 | + memset(&pkt, 0, sizeof(pkt)); |
---|
| 963 | + pkt.pkt.completion_func = hv_pci_read_config_compl; |
---|
| 964 | + pkt.pkt.compl_ctxt = &comp_pkt; |
---|
| 965 | + read_blk = (struct pci_read_block *)&pkt.pkt.message; |
---|
| 966 | + read_blk->message_type.type = PCI_READ_BLOCK; |
---|
| 967 | + read_blk->wslot.slot = devfn_to_wslot(pdev->devfn); |
---|
| 968 | + read_blk->block_id = block_id; |
---|
| 969 | + read_blk->bytes_requested = len; |
---|
| 970 | + |
---|
| 971 | + ret = vmbus_sendpacket(hbus->hdev->channel, read_blk, |
---|
| 972 | + sizeof(*read_blk), (unsigned long)&pkt.pkt, |
---|
| 973 | + VM_PKT_DATA_INBAND, |
---|
| 974 | + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); |
---|
| 975 | + if (ret) |
---|
| 976 | + return ret; |
---|
| 977 | + |
---|
| 978 | + ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event); |
---|
| 979 | + if (ret) |
---|
| 980 | + return ret; |
---|
| 981 | + |
---|
| 982 | + if (comp_pkt.comp_pkt.completion_status != 0 || |
---|
| 983 | + comp_pkt.bytes_returned == 0) { |
---|
| 984 | + dev_err(&hbus->hdev->device, |
---|
| 985 | + "Read Config Block failed: 0x%x, bytes_returned=%d\n", |
---|
| 986 | + comp_pkt.comp_pkt.completion_status, |
---|
| 987 | + comp_pkt.bytes_returned); |
---|
| 988 | + return -EIO; |
---|
| 989 | + } |
---|
| 990 | + |
---|
| 991 | + *bytes_returned = comp_pkt.bytes_returned; |
---|
| 992 | + return 0; |
---|
| 993 | +} |
---|
| 994 | + |
---|
| 995 | +/** |
---|
| 996 | + * hv_pci_write_config_compl() - Invoked when a response packet for a write |
---|
| 997 | + * config block operation arrives. |
---|
| 998 | + * @context: Identifies the write config operation |
---|
| 999 | + * @resp: The response packet itself |
---|
| 1000 | + * @resp_packet_size: Size in bytes of the response packet |
---|
| 1001 | + */ |
---|
| 1002 | +static void hv_pci_write_config_compl(void *context, struct pci_response *resp, |
---|
| 1003 | + int resp_packet_size) |
---|
| 1004 | +{ |
---|
| 1005 | + struct hv_pci_compl *comp_pkt = context; |
---|
| 1006 | + |
---|
| 1007 | + comp_pkt->completion_status = resp->status; |
---|
| 1008 | + complete(&comp_pkt->host_event); |
---|
| 1009 | +} |
---|
| 1010 | + |
---|
| 1011 | +/** |
---|
| 1012 | + * hv_write_config_block() - Sends a write config block request to the |
---|
| 1013 | + * back-end driver running in the Hyper-V parent partition. |
---|
| 1014 | + * @pdev: The PCI driver's representation for this device. |
---|
| 1015 | + * @buf: Buffer from which the config block will be copied. |
---|
| 1016 | + * @len: Size in bytes of buf. |
---|
| 1017 | + * @block_id: Identifies the config block which is being written. |
---|
| 1018 | + * |
---|
| 1019 | + * Return: 0 on success, -errno on failure |
---|
| 1020 | + */ |
---|
| 1021 | +static int hv_write_config_block(struct pci_dev *pdev, void *buf, |
---|
| 1022 | + unsigned int len, unsigned int block_id) |
---|
| 1023 | +{ |
---|
| 1024 | + struct hv_pcibus_device *hbus = |
---|
| 1025 | + container_of(pdev->bus->sysdata, struct hv_pcibus_device, |
---|
| 1026 | + sysdata); |
---|
| 1027 | + struct { |
---|
| 1028 | + struct pci_packet pkt; |
---|
| 1029 | + char buf[sizeof(struct pci_write_block)]; |
---|
| 1030 | + u32 reserved; |
---|
| 1031 | + } pkt; |
---|
| 1032 | + struct hv_pci_compl comp_pkt; |
---|
| 1033 | + struct pci_write_block *write_blk; |
---|
| 1034 | + u32 pkt_size; |
---|
| 1035 | + int ret; |
---|
| 1036 | + |
---|
| 1037 | + if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) |
---|
| 1038 | + return -EINVAL; |
---|
| 1039 | + |
---|
| 1040 | + init_completion(&comp_pkt.host_event); |
---|
| 1041 | + |
---|
| 1042 | + memset(&pkt, 0, sizeof(pkt)); |
---|
| 1043 | + pkt.pkt.completion_func = hv_pci_write_config_compl; |
---|
| 1044 | + pkt.pkt.compl_ctxt = &comp_pkt; |
---|
| 1045 | + write_blk = (struct pci_write_block *)&pkt.pkt.message; |
---|
| 1046 | + write_blk->message_type.type = PCI_WRITE_BLOCK; |
---|
| 1047 | + write_blk->wslot.slot = devfn_to_wslot(pdev->devfn); |
---|
| 1048 | + write_blk->block_id = block_id; |
---|
| 1049 | + write_blk->byte_count = len; |
---|
| 1050 | + memcpy(write_blk->bytes, buf, len); |
---|
| 1051 | + pkt_size = offsetof(struct pci_write_block, bytes) + len; |
---|
| 1052 | + /* |
---|
| 1053 | + * This quirk is required on some hosts shipped around 2018, because |
---|
| 1054 | + * these hosts don't check the pkt_size correctly (new hosts have been |
---|
| 1055 | + * fixed since early 2019). The quirk is also safe on very old hosts |
---|
| 1056 | + * and new hosts, because, on them, what really matters is the length |
---|
| 1057 | + * specified in write_blk->byte_count. |
---|
| 1058 | + */ |
---|
| 1059 | + pkt_size += sizeof(pkt.reserved); |
---|
| 1060 | + |
---|
| 1061 | + ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size, |
---|
| 1062 | + (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND, |
---|
| 1063 | + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); |
---|
| 1064 | + if (ret) |
---|
| 1065 | + return ret; |
---|
| 1066 | + |
---|
| 1067 | + ret = wait_for_response(hbus->hdev, &comp_pkt.host_event); |
---|
| 1068 | + if (ret) |
---|
| 1069 | + return ret; |
---|
| 1070 | + |
---|
| 1071 | + if (comp_pkt.completion_status != 0) { |
---|
| 1072 | + dev_err(&hbus->hdev->device, |
---|
| 1073 | + "Write Config Block failed: 0x%x\n", |
---|
| 1074 | + comp_pkt.completion_status); |
---|
| 1075 | + return -EIO; |
---|
| 1076 | + } |
---|
| 1077 | + |
---|
| 1078 | + return 0; |
---|
| 1079 | +} |
---|
| 1080 | + |
---|
| 1081 | +/** |
---|
| 1082 | + * hv_register_block_invalidate() - Invoked when a config block invalidation |
---|
| 1083 | + * arrives from the back-end driver. |
---|
| 1084 | + * @pdev: The PCI driver's representation for this device. |
---|
| 1085 | + * @context: Identifies the device. |
---|
| 1086 | + * @block_invalidate: Identifies all of the blocks being invalidated. |
---|
| 1087 | + * |
---|
| 1088 | + * Return: 0 on success, -errno on failure |
---|
| 1089 | + */ |
---|
| 1090 | +static int hv_register_block_invalidate(struct pci_dev *pdev, void *context, |
---|
| 1091 | + void (*block_invalidate)(void *context, |
---|
| 1092 | + u64 block_mask)) |
---|
| 1093 | +{ |
---|
| 1094 | + struct hv_pcibus_device *hbus = |
---|
| 1095 | + container_of(pdev->bus->sysdata, struct hv_pcibus_device, |
---|
| 1096 | + sysdata); |
---|
| 1097 | + struct hv_pci_dev *hpdev; |
---|
| 1098 | + |
---|
| 1099 | + hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); |
---|
| 1100 | + if (!hpdev) |
---|
| 1101 | + return -ENODEV; |
---|
| 1102 | + |
---|
| 1103 | + hpdev->block_invalidate = block_invalidate; |
---|
| 1104 | + hpdev->invalidate_context = context; |
---|
| 1105 | + |
---|
| 1106 | + put_pcichild(hpdev); |
---|
| 1107 | + return 0; |
---|
| 1108 | + |
---|
| 1109 | +} |
---|
| 1110 | + |
---|
824 | 1111 | /* Interrupt management hooks */ |
---|
825 | 1112 | static void hv_int_desc_free(struct hv_pci_dev *hpdev, |
---|
826 | 1113 | struct tran_int_desc *int_desc) |
---|
.. | .. |
---|
831 | 1118 | u8 buffer[sizeof(struct pci_delete_interrupt)]; |
---|
832 | 1119 | } ctxt; |
---|
833 | 1120 | |
---|
| 1121 | + if (!int_desc->vector_count) { |
---|
| 1122 | + kfree(int_desc); |
---|
| 1123 | + return; |
---|
| 1124 | + } |
---|
834 | 1125 | memset(&ctxt, 0, sizeof(ctxt)); |
---|
835 | 1126 | int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; |
---|
836 | 1127 | int_pkt->message_type.type = |
---|
.. | .. |
---|
893 | 1184 | pci_msi_mask_irq(data); |
---|
894 | 1185 | } |
---|
895 | 1186 | |
---|
| 1187 | +static unsigned int hv_msi_get_int_vector(struct irq_data *data) |
---|
| 1188 | +{ |
---|
| 1189 | + struct irq_cfg *cfg = irqd_cfg(data); |
---|
| 1190 | + |
---|
| 1191 | + return cfg->vector; |
---|
| 1192 | +} |
---|
| 1193 | + |
---|
| 1194 | +static int hv_msi_prepare(struct irq_domain *domain, struct device *dev, |
---|
| 1195 | + int nvec, msi_alloc_info_t *info) |
---|
| 1196 | +{ |
---|
| 1197 | + int ret = pci_msi_prepare(domain, dev, nvec, info); |
---|
| 1198 | + |
---|
| 1199 | + /* |
---|
| 1200 | + * By using the interrupt remapper in the hypervisor IOMMU, contiguous |
---|
| 1201 | + * CPU vectors is not needed for multi-MSI |
---|
| 1202 | + */ |
---|
| 1203 | + if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) |
---|
| 1204 | + info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; |
---|
| 1205 | + |
---|
| 1206 | + return ret; |
---|
| 1207 | +} |
---|
| 1208 | + |
---|
896 | 1209 | /** |
---|
897 | 1210 | * hv_irq_unmask() - "Unmask" the IRQ by setting its current |
---|
898 | 1211 | * affinity. |
---|
.. | .. |
---|
907 | 1220 | { |
---|
908 | 1221 | struct msi_desc *msi_desc = irq_data_get_msi_desc(data); |
---|
909 | 1222 | struct irq_cfg *cfg = irqd_cfg(data); |
---|
910 | | - struct retarget_msi_interrupt *params; |
---|
| 1223 | + struct hv_retarget_device_interrupt *params; |
---|
| 1224 | + struct tran_int_desc *int_desc; |
---|
911 | 1225 | struct hv_pcibus_device *hbus; |
---|
912 | 1226 | struct cpumask *dest; |
---|
| 1227 | + cpumask_var_t tmp; |
---|
913 | 1228 | struct pci_bus *pbus; |
---|
914 | 1229 | struct pci_dev *pdev; |
---|
915 | 1230 | unsigned long flags; |
---|
916 | 1231 | u32 var_size = 0; |
---|
917 | | - int cpu_vmbus; |
---|
918 | | - int cpu; |
---|
| 1232 | + int cpu, nr_bank; |
---|
919 | 1233 | u64 res; |
---|
920 | 1234 | |
---|
921 | 1235 | dest = irq_data_get_effective_affinity_mask(data); |
---|
922 | 1236 | pdev = msi_desc_to_pci_dev(msi_desc); |
---|
923 | 1237 | pbus = pdev->bus; |
---|
924 | 1238 | hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); |
---|
| 1239 | + int_desc = data->chip_data; |
---|
925 | 1240 | |
---|
926 | 1241 | spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags); |
---|
927 | 1242 | |
---|
.. | .. |
---|
929 | 1244 | memset(params, 0, sizeof(*params)); |
---|
930 | 1245 | params->partition_id = HV_PARTITION_ID_SELF; |
---|
931 | 1246 | params->int_entry.source = 1; /* MSI(-X) */ |
---|
932 | | - params->int_entry.address = msi_desc->msg.address_lo; |
---|
933 | | - params->int_entry.data = msi_desc->msg.data; |
---|
| 1247 | + params->int_entry.msi_entry.address = int_desc->address & 0xffffffff; |
---|
| 1248 | + params->int_entry.msi_entry.data = int_desc->data; |
---|
934 | 1249 | params->device_id = (hbus->hdev->dev_instance.b[5] << 24) | |
---|
935 | 1250 | (hbus->hdev->dev_instance.b[4] << 16) | |
---|
936 | 1251 | (hbus->hdev->dev_instance.b[7] << 8) | |
---|
.. | .. |
---|
945 | 1260 | * negative effect (yet?). |
---|
946 | 1261 | */ |
---|
947 | 1262 | |
---|
948 | | - if (pci_protocol_version >= PCI_PROTOCOL_VERSION_1_2) { |
---|
| 1263 | + if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) { |
---|
949 | 1264 | /* |
---|
950 | 1265 | * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the |
---|
951 | 1266 | * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides |
---|
.. | .. |
---|
955 | 1270 | */ |
---|
956 | 1271 | params->int_target.flags |= |
---|
957 | 1272 | HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; |
---|
958 | | - params->int_target.vp_set.valid_banks = |
---|
959 | | - (1ull << HV_VP_SET_BANK_COUNT_MAX) - 1; |
---|
| 1273 | + |
---|
| 1274 | + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) { |
---|
| 1275 | + res = 1; |
---|
| 1276 | + goto exit_unlock; |
---|
| 1277 | + } |
---|
| 1278 | + |
---|
| 1279 | + cpumask_and(tmp, dest, cpu_online_mask); |
---|
| 1280 | + nr_bank = cpumask_to_vpset(¶ms->int_target.vp_set, tmp); |
---|
| 1281 | + free_cpumask_var(tmp); |
---|
| 1282 | + |
---|
| 1283 | + if (nr_bank <= 0) { |
---|
| 1284 | + res = 1; |
---|
| 1285 | + goto exit_unlock; |
---|
| 1286 | + } |
---|
960 | 1287 | |
---|
961 | 1288 | /* |
---|
962 | 1289 | * var-sized hypercall, var-size starts after vp_mask (thus |
---|
963 | | - * vp_set.format does not count, but vp_set.valid_banks does). |
---|
| 1290 | + * vp_set.format does not count, but vp_set.valid_bank_mask |
---|
| 1291 | + * does). |
---|
964 | 1292 | */ |
---|
965 | | - var_size = 1 + HV_VP_SET_BANK_COUNT_MAX; |
---|
966 | | - |
---|
967 | | - for_each_cpu_and(cpu, dest, cpu_online_mask) { |
---|
968 | | - cpu_vmbus = hv_cpu_number_to_vp_number(cpu); |
---|
969 | | - |
---|
970 | | - if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) { |
---|
971 | | - dev_err(&hbus->hdev->device, |
---|
972 | | - "too high CPU %d", cpu_vmbus); |
---|
973 | | - res = 1; |
---|
974 | | - goto exit_unlock; |
---|
975 | | - } |
---|
976 | | - |
---|
977 | | - params->int_target.vp_set.masks[cpu_vmbus / 64] |= |
---|
978 | | - (1ULL << (cpu_vmbus & 63)); |
---|
979 | | - } |
---|
| 1293 | + var_size = 1 + nr_bank; |
---|
980 | 1294 | } else { |
---|
981 | 1295 | for_each_cpu_and(cpu, dest, cpu_online_mask) { |
---|
982 | 1296 | params->int_target.vp_mask |= |
---|
.. | .. |
---|
990 | 1304 | exit_unlock: |
---|
991 | 1305 | spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags); |
---|
992 | 1306 | |
---|
993 | | - if (res) { |
---|
| 1307 | + /* |
---|
| 1308 | + * During hibernation, when a CPU is offlined, the kernel tries |
---|
| 1309 | + * to move the interrupt to the remaining CPUs that haven't |
---|
| 1310 | + * been offlined yet. In this case, the below hv_do_hypercall() |
---|
| 1311 | + * always fails since the vmbus channel has been closed: |
---|
| 1312 | + * refer to cpu_disable_common() -> fixup_irqs() -> |
---|
| 1313 | + * irq_migrate_all_off_this_cpu() -> migrate_one_irq(). |
---|
| 1314 | + * |
---|
| 1315 | + * Suppress the error message for hibernation because the failure |
---|
| 1316 | + * during hibernation does not matter (at this time all the devices |
---|
| 1317 | + * have been frozen). Note: the correct affinity info is still updated |
---|
| 1318 | + * into the irqdata data structure in migrate_one_irq() -> |
---|
| 1319 | + * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM |
---|
| 1320 | + * resumes, hv_pci_restore_msi_state() is able to correctly restore |
---|
| 1321 | + * the interrupt with the correct affinity. |
---|
| 1322 | + */ |
---|
| 1323 | + if (res && hbus->state != hv_pcibus_removing) |
---|
994 | 1324 | dev_err(&hbus->hdev->device, |
---|
995 | 1325 | "%s() failed: %#llx", __func__, res); |
---|
996 | | - return; |
---|
997 | | - } |
---|
998 | 1326 | |
---|
999 | 1327 | pci_msi_unmask_irq(data); |
---|
1000 | 1328 | } |
---|
.. | .. |
---|
1018 | 1346 | |
---|
1019 | 1347 | static u32 hv_compose_msi_req_v1( |
---|
1020 | 1348 | struct pci_create_interrupt *int_pkt, struct cpumask *affinity, |
---|
1021 | | - u32 slot, u8 vector) |
---|
| 1349 | + u32 slot, u8 vector, u8 vector_count) |
---|
1022 | 1350 | { |
---|
1023 | 1351 | int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; |
---|
1024 | 1352 | int_pkt->wslot.slot = slot; |
---|
1025 | 1353 | int_pkt->int_desc.vector = vector; |
---|
1026 | | - int_pkt->int_desc.vector_count = 1; |
---|
| 1354 | + int_pkt->int_desc.vector_count = vector_count; |
---|
1027 | 1355 | int_pkt->int_desc.delivery_mode = dest_Fixed; |
---|
1028 | 1356 | |
---|
1029 | 1357 | /* |
---|
.. | .. |
---|
1037 | 1365 | |
---|
1038 | 1366 | static u32 hv_compose_msi_req_v2( |
---|
1039 | 1367 | struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity, |
---|
1040 | | - u32 slot, u8 vector) |
---|
| 1368 | + u32 slot, u8 vector, u8 vector_count) |
---|
1041 | 1369 | { |
---|
1042 | 1370 | int cpu; |
---|
1043 | 1371 | |
---|
1044 | 1372 | int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2; |
---|
1045 | 1373 | int_pkt->wslot.slot = slot; |
---|
1046 | 1374 | int_pkt->int_desc.vector = vector; |
---|
1047 | | - int_pkt->int_desc.vector_count = 1; |
---|
| 1375 | + int_pkt->int_desc.vector_count = vector_count; |
---|
1048 | 1376 | int_pkt->int_desc.delivery_mode = dest_Fixed; |
---|
1049 | 1377 | |
---|
1050 | 1378 | /* |
---|
.. | .. |
---|
1072 | 1400 | */ |
---|
1073 | 1401 | static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) |
---|
1074 | 1402 | { |
---|
1075 | | - struct irq_cfg *cfg = irqd_cfg(data); |
---|
1076 | 1403 | struct hv_pcibus_device *hbus; |
---|
| 1404 | + struct vmbus_channel *channel; |
---|
1077 | 1405 | struct hv_pci_dev *hpdev; |
---|
1078 | 1406 | struct pci_bus *pbus; |
---|
1079 | 1407 | struct pci_dev *pdev; |
---|
1080 | 1408 | struct cpumask *dest; |
---|
1081 | | - unsigned long flags; |
---|
1082 | 1409 | struct compose_comp_ctxt comp; |
---|
1083 | 1410 | struct tran_int_desc *int_desc; |
---|
| 1411 | + struct msi_desc *msi_desc; |
---|
| 1412 | + u8 vector, vector_count; |
---|
1084 | 1413 | struct { |
---|
1085 | 1414 | struct pci_packet pci_pkt; |
---|
1086 | 1415 | union { |
---|
.. | .. |
---|
1092 | 1421 | u32 size; |
---|
1093 | 1422 | int ret; |
---|
1094 | 1423 | |
---|
1095 | | - pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); |
---|
| 1424 | + /* Reuse the previous allocation */ |
---|
| 1425 | + if (data->chip_data) { |
---|
| 1426 | + int_desc = data->chip_data; |
---|
| 1427 | + msg->address_hi = int_desc->address >> 32; |
---|
| 1428 | + msg->address_lo = int_desc->address & 0xffffffff; |
---|
| 1429 | + msg->data = int_desc->data; |
---|
| 1430 | + return; |
---|
| 1431 | + } |
---|
| 1432 | + |
---|
| 1433 | + msi_desc = irq_data_get_msi_desc(data); |
---|
| 1434 | + pdev = msi_desc_to_pci_dev(msi_desc); |
---|
1096 | 1435 | dest = irq_data_get_effective_affinity_mask(data); |
---|
1097 | 1436 | pbus = pdev->bus; |
---|
1098 | 1437 | hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); |
---|
| 1438 | + channel = hbus->hdev->channel; |
---|
1099 | 1439 | hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); |
---|
1100 | 1440 | if (!hpdev) |
---|
1101 | 1441 | goto return_null_message; |
---|
1102 | 1442 | |
---|
1103 | | - /* Free any previous message that might have already been composed. */ |
---|
1104 | | - if (data->chip_data) { |
---|
1105 | | - int_desc = data->chip_data; |
---|
1106 | | - data->chip_data = NULL; |
---|
1107 | | - hv_int_desc_free(hpdev, int_desc); |
---|
1108 | | - } |
---|
1109 | | - |
---|
1110 | 1443 | int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC); |
---|
1111 | 1444 | if (!int_desc) |
---|
1112 | 1445 | goto drop_reference; |
---|
| 1446 | + |
---|
| 1447 | + if (!msi_desc->msi_attrib.is_msix && msi_desc->nvec_used > 1) { |
---|
| 1448 | + /* |
---|
| 1449 | + * If this is not the first MSI of Multi MSI, we already have |
---|
| 1450 | + * a mapping. Can exit early. |
---|
| 1451 | + */ |
---|
| 1452 | + if (msi_desc->irq != data->irq) { |
---|
| 1453 | + data->chip_data = int_desc; |
---|
| 1454 | + int_desc->address = msi_desc->msg.address_lo | |
---|
| 1455 | + (u64)msi_desc->msg.address_hi << 32; |
---|
| 1456 | + int_desc->data = msi_desc->msg.data + |
---|
| 1457 | + (data->irq - msi_desc->irq); |
---|
| 1458 | + msg->address_hi = msi_desc->msg.address_hi; |
---|
| 1459 | + msg->address_lo = msi_desc->msg.address_lo; |
---|
| 1460 | + msg->data = int_desc->data; |
---|
| 1461 | + put_pcichild(hpdev); |
---|
| 1462 | + return; |
---|
| 1463 | + } |
---|
| 1464 | + /* |
---|
| 1465 | + * The vector we select here is a dummy value. The correct |
---|
| 1466 | + * value gets sent to the hypervisor in unmask(). This needs |
---|
| 1467 | + * to be aligned with the count, and also not zero. Multi-msi |
---|
| 1468 | + * is powers of 2 up to 32, so 32 will always work here. |
---|
| 1469 | + */ |
---|
| 1470 | + vector = 32; |
---|
| 1471 | + vector_count = msi_desc->nvec_used; |
---|
| 1472 | + } else { |
---|
| 1473 | + vector = hv_msi_get_int_vector(data); |
---|
| 1474 | + vector_count = 1; |
---|
| 1475 | + } |
---|
1113 | 1476 | |
---|
1114 | 1477 | memset(&ctxt, 0, sizeof(ctxt)); |
---|
1115 | 1478 | init_completion(&comp.comp_pkt.host_event); |
---|
1116 | 1479 | ctxt.pci_pkt.completion_func = hv_pci_compose_compl; |
---|
1117 | 1480 | ctxt.pci_pkt.compl_ctxt = &comp; |
---|
1118 | 1481 | |
---|
1119 | | - switch (pci_protocol_version) { |
---|
| 1482 | + switch (hbus->protocol_version) { |
---|
1120 | 1483 | case PCI_PROTOCOL_VERSION_1_1: |
---|
1121 | 1484 | size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1, |
---|
1122 | 1485 | dest, |
---|
1123 | 1486 | hpdev->desc.win_slot.slot, |
---|
1124 | | - cfg->vector); |
---|
| 1487 | + vector, |
---|
| 1488 | + vector_count); |
---|
1125 | 1489 | break; |
---|
1126 | 1490 | |
---|
1127 | 1491 | case PCI_PROTOCOL_VERSION_1_2: |
---|
| 1492 | + case PCI_PROTOCOL_VERSION_1_3: |
---|
1128 | 1493 | size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2, |
---|
1129 | 1494 | dest, |
---|
1130 | 1495 | hpdev->desc.win_slot.slot, |
---|
1131 | | - cfg->vector); |
---|
| 1496 | + vector, |
---|
| 1497 | + vector_count); |
---|
1132 | 1498 | break; |
---|
1133 | 1499 | |
---|
1134 | 1500 | default: |
---|
.. | .. |
---|
1153 | 1519 | } |
---|
1154 | 1520 | |
---|
1155 | 1521 | /* |
---|
| 1522 | + * Prevents hv_pci_onchannelcallback() from running concurrently |
---|
| 1523 | + * in the tasklet. |
---|
| 1524 | + */ |
---|
| 1525 | + tasklet_disable(&channel->callback_event); |
---|
| 1526 | + |
---|
| 1527 | + /* |
---|
1156 | 1528 | * Since this function is called with IRQ locks held, can't |
---|
1157 | 1529 | * do normal wait for completion; instead poll. |
---|
1158 | 1530 | */ |
---|
1159 | 1531 | while (!try_wait_for_completion(&comp.comp_pkt.host_event)) { |
---|
| 1532 | + unsigned long flags; |
---|
| 1533 | + |
---|
1160 | 1534 | /* 0xFFFF means an invalid PCI VENDOR ID. */ |
---|
1161 | 1535 | if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) { |
---|
1162 | 1536 | dev_err_once(&hbus->hdev->device, |
---|
1163 | 1537 | "the device has gone\n"); |
---|
1164 | | - goto free_int_desc; |
---|
| 1538 | + goto enable_tasklet; |
---|
1165 | 1539 | } |
---|
1166 | 1540 | |
---|
1167 | 1541 | /* |
---|
1168 | | - * When the higher level interrupt code calls us with |
---|
1169 | | - * interrupt disabled, we must poll the channel by calling |
---|
1170 | | - * the channel callback directly when channel->target_cpu is |
---|
1171 | | - * the current CPU. When the higher level interrupt code |
---|
1172 | | - * calls us with interrupt enabled, let's add the |
---|
1173 | | - * local_irq_save()/restore() to avoid race: |
---|
1174 | | - * hv_pci_onchannelcallback() can also run in tasklet. |
---|
| 1542 | + * Make sure that the ring buffer data structure doesn't get |
---|
| 1543 | + * freed while we dereference the ring buffer pointer. Test |
---|
| 1544 | + * for the channel's onchannel_callback being NULL within a |
---|
| 1545 | + * sched_lock critical section. See also the inline comments |
---|
| 1546 | + * in vmbus_reset_channel_cb(). |
---|
1175 | 1547 | */ |
---|
1176 | | - local_irq_save(flags); |
---|
1177 | | - |
---|
1178 | | - if (hbus->hdev->channel->target_cpu == smp_processor_id()) |
---|
1179 | | - hv_pci_onchannelcallback(hbus); |
---|
1180 | | - |
---|
1181 | | - local_irq_restore(flags); |
---|
| 1548 | + spin_lock_irqsave(&channel->sched_lock, flags); |
---|
| 1549 | + if (unlikely(channel->onchannel_callback == NULL)) { |
---|
| 1550 | + spin_unlock_irqrestore(&channel->sched_lock, flags); |
---|
| 1551 | + goto enable_tasklet; |
---|
| 1552 | + } |
---|
| 1553 | + hv_pci_onchannelcallback(hbus); |
---|
| 1554 | + spin_unlock_irqrestore(&channel->sched_lock, flags); |
---|
1182 | 1555 | |
---|
1183 | 1556 | if (hpdev->state == hv_pcichild_ejecting) { |
---|
1184 | 1557 | dev_err_once(&hbus->hdev->device, |
---|
1185 | 1558 | "the device is being ejected\n"); |
---|
1186 | | - goto free_int_desc; |
---|
| 1559 | + goto enable_tasklet; |
---|
1187 | 1560 | } |
---|
1188 | 1561 | |
---|
1189 | 1562 | udelay(100); |
---|
1190 | 1563 | } |
---|
| 1564 | + |
---|
| 1565 | + tasklet_enable(&channel->callback_event); |
---|
1191 | 1566 | |
---|
1192 | 1567 | if (comp.comp_pkt.completion_status < 0) { |
---|
1193 | 1568 | dev_err(&hbus->hdev->device, |
---|
.. | .. |
---|
1212 | 1587 | put_pcichild(hpdev); |
---|
1213 | 1588 | return; |
---|
1214 | 1589 | |
---|
| 1590 | +enable_tasklet: |
---|
| 1591 | + tasklet_enable(&channel->callback_event); |
---|
1215 | 1592 | free_int_desc: |
---|
1216 | 1593 | kfree(int_desc); |
---|
1217 | 1594 | drop_reference: |
---|
.. | .. |
---|
1232 | 1609 | .irq_unmask = hv_irq_unmask, |
---|
1233 | 1610 | }; |
---|
1234 | 1611 | |
---|
1235 | | -static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info, |
---|
1236 | | - msi_alloc_info_t *arg) |
---|
1237 | | -{ |
---|
1238 | | - return arg->msi_hwirq; |
---|
1239 | | -} |
---|
1240 | | - |
---|
1241 | 1612 | static struct msi_domain_ops hv_msi_ops = { |
---|
1242 | | - .get_hwirq = hv_msi_domain_ops_get_hwirq, |
---|
1243 | | - .msi_prepare = pci_msi_prepare, |
---|
1244 | | - .set_desc = pci_msi_set_desc, |
---|
| 1613 | + .msi_prepare = hv_msi_prepare, |
---|
1245 | 1614 | .msi_free = hv_msi_free, |
---|
1246 | 1615 | }; |
---|
1247 | 1616 | |
---|
.. | .. |
---|
1332 | 1701 | * so it's sufficient to just add them up without tracking alignment. |
---|
1333 | 1702 | */ |
---|
1334 | 1703 | list_for_each_entry(hpdev, &hbus->children, list_entry) { |
---|
1335 | | - for (i = 0; i < 6; i++) { |
---|
| 1704 | + for (i = 0; i < PCI_STD_NUM_BARS; i++) { |
---|
1336 | 1705 | if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO) |
---|
1337 | 1706 | dev_err(&hbus->hdev->device, |
---|
1338 | 1707 | "There's an I/O BAR in this list!\n"); |
---|
.. | .. |
---|
1403 | 1772 | |
---|
1404 | 1773 | spin_lock_irqsave(&hbus->device_list_lock, flags); |
---|
1405 | 1774 | |
---|
| 1775 | + /* |
---|
| 1776 | + * Clear the memory enable bit, in case it's already set. This occurs |
---|
| 1777 | + * in the suspend path of hibernation, where the device is suspended, |
---|
| 1778 | + * resumed and suspended again: see hibernation_snapshot() and |
---|
| 1779 | + * hibernation_platform_enter(). |
---|
| 1780 | + * |
---|
| 1781 | + * If the memory enable bit is already set, Hyper-V silently ignores |
---|
| 1782 | + * the below BAR updates, and the related PCI device driver can not |
---|
| 1783 | + * work, because reading from the device register(s) always returns |
---|
| 1784 | + * 0xFFFFFFFF. |
---|
| 1785 | + */ |
---|
| 1786 | + list_for_each_entry(hpdev, &hbus->children, list_entry) { |
---|
| 1787 | + _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command); |
---|
| 1788 | + command &= ~PCI_COMMAND_MEMORY; |
---|
| 1789 | + _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command); |
---|
| 1790 | + } |
---|
| 1791 | + |
---|
1406 | 1792 | /* Pick addresses for the BARs. */ |
---|
1407 | 1793 | do { |
---|
1408 | 1794 | list_for_each_entry(hpdev, &hbus->children, list_entry) { |
---|
1409 | | - for (i = 0; i < 6; i++) { |
---|
| 1795 | + for (i = 0; i < PCI_STD_NUM_BARS; i++) { |
---|
1410 | 1796 | bar_val = hpdev->probed_bar[i]; |
---|
1411 | 1797 | if (bar_val == 0) |
---|
1412 | 1798 | continue; |
---|
.. | .. |
---|
1506 | 1892 | } |
---|
1507 | 1893 | } |
---|
1508 | 1894 | |
---|
| 1895 | +/* |
---|
| 1896 | + * Set NUMA node for the devices on the bus |
---|
| 1897 | + */ |
---|
| 1898 | +static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus) |
---|
| 1899 | +{ |
---|
| 1900 | + struct pci_dev *dev; |
---|
| 1901 | + struct pci_bus *bus = hbus->pci_bus; |
---|
| 1902 | + struct hv_pci_dev *hv_dev; |
---|
| 1903 | + |
---|
| 1904 | + list_for_each_entry(dev, &bus->devices, bus_list) { |
---|
| 1905 | + hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn)); |
---|
| 1906 | + if (!hv_dev) |
---|
| 1907 | + continue; |
---|
| 1908 | + |
---|
| 1909 | + if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY && |
---|
| 1910 | + hv_dev->desc.virtual_numa_node < num_possible_nodes()) |
---|
| 1911 | + /* |
---|
| 1912 | + * The kernel may boot with some NUMA nodes offline |
---|
| 1913 | + * (e.g. in a KDUMP kernel) or with NUMA disabled via |
---|
| 1914 | + * "numa=off". In those cases, adjust the host provided |
---|
| 1915 | + * NUMA node to a valid NUMA node used by the kernel. |
---|
| 1916 | + */ |
---|
| 1917 | + set_dev_node(&dev->dev, |
---|
| 1918 | + numa_map_to_online_node( |
---|
| 1919 | + hv_dev->desc.virtual_numa_node)); |
---|
| 1920 | + |
---|
| 1921 | + put_pcichild(hv_dev); |
---|
| 1922 | + } |
---|
| 1923 | +} |
---|
| 1924 | + |
---|
1509 | 1925 | /** |
---|
1510 | 1926 | * create_root_hv_pci_bus() - Expose a new root PCI bus |
---|
1511 | 1927 | * @hbus: Root PCI bus, as understood by this driver |
---|
.. | .. |
---|
1528 | 1944 | |
---|
1529 | 1945 | pci_lock_rescan_remove(); |
---|
1530 | 1946 | pci_scan_child_bus(hbus->pci_bus); |
---|
| 1947 | + hv_pci_assign_numa_node(hbus); |
---|
1531 | 1948 | pci_bus_assign_resources(hbus->pci_bus); |
---|
1532 | 1949 | hv_pci_assign_slots(hbus); |
---|
1533 | 1950 | pci_bus_add_devices(hbus->pci_bus); |
---|
.. | .. |
---|
1563 | 1980 | "query resource requirements failed: %x\n", |
---|
1564 | 1981 | resp->status); |
---|
1565 | 1982 | } else { |
---|
1566 | | - for (i = 0; i < 6; i++) { |
---|
| 1983 | + for (i = 0; i < PCI_STD_NUM_BARS; i++) { |
---|
1567 | 1984 | completion->hpdev->probed_bar[i] = |
---|
1568 | 1985 | q_res_req->probed_bar[i]; |
---|
1569 | 1986 | } |
---|
.. | .. |
---|
1584 | 2001 | * Return: Pointer to the new tracking struct |
---|
1585 | 2002 | */ |
---|
1586 | 2003 | static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus, |
---|
1587 | | - struct pci_function_description *desc) |
---|
| 2004 | + struct hv_pcidev_description *desc) |
---|
1588 | 2005 | { |
---|
1589 | 2006 | struct hv_pci_dev *hpdev; |
---|
1590 | 2007 | struct pci_child_message *res_req; |
---|
.. | .. |
---|
1695 | 2112 | { |
---|
1696 | 2113 | u32 child_no; |
---|
1697 | 2114 | bool found; |
---|
1698 | | - struct pci_function_description *new_desc; |
---|
| 2115 | + struct hv_pcidev_description *new_desc; |
---|
1699 | 2116 | struct hv_pci_dev *hpdev; |
---|
1700 | 2117 | struct hv_pcibus_device *hbus; |
---|
1701 | 2118 | struct list_head removed; |
---|
.. | .. |
---|
1796 | 2213 | */ |
---|
1797 | 2214 | pci_lock_rescan_remove(); |
---|
1798 | 2215 | pci_scan_child_bus(hbus->pci_bus); |
---|
| 2216 | + hv_pci_assign_numa_node(hbus); |
---|
1799 | 2217 | hv_pci_assign_slots(hbus); |
---|
1800 | 2218 | pci_unlock_rescan_remove(); |
---|
1801 | 2219 | break; |
---|
.. | .. |
---|
1814 | 2232 | } |
---|
1815 | 2233 | |
---|
1816 | 2234 | /** |
---|
1817 | | - * hv_pci_devices_present() - Handles list of new children |
---|
| 2235 | + * hv_pci_start_relations_work() - Queue work to start device discovery |
---|
1818 | 2236 | * @hbus: Root PCI bus, as understood by this driver |
---|
1819 | | - * @relations: Packet from host listing children |
---|
| 2237 | + * @dr: The list of children returned from host |
---|
1820 | 2238 | * |
---|
1821 | | - * This function is invoked whenever a new list of devices for |
---|
1822 | | - * this bus appears. |
---|
| 2239 | + * Return: 0 on success, -errno on failure |
---|
1823 | 2240 | */ |
---|
1824 | | -static void hv_pci_devices_present(struct hv_pcibus_device *hbus, |
---|
1825 | | - struct pci_bus_relations *relations) |
---|
| 2241 | +static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus, |
---|
| 2242 | + struct hv_dr_state *dr) |
---|
1826 | 2243 | { |
---|
1827 | | - struct hv_dr_state *dr; |
---|
1828 | 2244 | struct hv_dr_work *dr_wrk; |
---|
1829 | 2245 | unsigned long flags; |
---|
1830 | 2246 | bool pending_dr; |
---|
1831 | 2247 | |
---|
| 2248 | + if (hbus->state == hv_pcibus_removing) { |
---|
| 2249 | + dev_info(&hbus->hdev->device, |
---|
| 2250 | + "PCI VMBus BUS_RELATIONS: ignored\n"); |
---|
| 2251 | + return -ENOENT; |
---|
| 2252 | + } |
---|
| 2253 | + |
---|
1832 | 2254 | dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT); |
---|
1833 | 2255 | if (!dr_wrk) |
---|
1834 | | - return; |
---|
1835 | | - |
---|
1836 | | - dr = kzalloc(offsetof(struct hv_dr_state, func) + |
---|
1837 | | - (sizeof(struct pci_function_description) * |
---|
1838 | | - (relations->device_count)), GFP_NOWAIT); |
---|
1839 | | - if (!dr) { |
---|
1840 | | - kfree(dr_wrk); |
---|
1841 | | - return; |
---|
1842 | | - } |
---|
| 2256 | + return -ENOMEM; |
---|
1843 | 2257 | |
---|
1844 | 2258 | INIT_WORK(&dr_wrk->wrk, pci_devices_present_work); |
---|
1845 | 2259 | dr_wrk->bus = hbus; |
---|
1846 | | - dr->device_count = relations->device_count; |
---|
1847 | | - if (dr->device_count != 0) { |
---|
1848 | | - memcpy(dr->func, relations->func, |
---|
1849 | | - sizeof(struct pci_function_description) * |
---|
1850 | | - dr->device_count); |
---|
1851 | | - } |
---|
1852 | 2260 | |
---|
1853 | 2261 | spin_lock_irqsave(&hbus->device_list_lock, flags); |
---|
1854 | 2262 | /* |
---|
.. | .. |
---|
1866 | 2274 | get_hvpcibus(hbus); |
---|
1867 | 2275 | queue_work(hbus->wq, &dr_wrk->wrk); |
---|
1868 | 2276 | } |
---|
| 2277 | + |
---|
| 2278 | + return 0; |
---|
| 2279 | +} |
---|
| 2280 | + |
---|
| 2281 | +/** |
---|
| 2282 | + * hv_pci_devices_present() - Handle list of new children |
---|
| 2283 | + * @hbus: Root PCI bus, as understood by this driver |
---|
| 2284 | + * @relations: Packet from host listing children |
---|
| 2285 | + * |
---|
| 2286 | + * Process a new list of devices on the bus. The list of devices is |
---|
| 2287 | + * discovered by VSP and sent to us via VSP message PCI_BUS_RELATIONS, |
---|
| 2288 | + * whenever a new list of devices for this bus appears. |
---|
| 2289 | + */ |
---|
| 2290 | +static void hv_pci_devices_present(struct hv_pcibus_device *hbus, |
---|
| 2291 | + struct pci_bus_relations *relations) |
---|
| 2292 | +{ |
---|
| 2293 | + struct hv_dr_state *dr; |
---|
| 2294 | + int i; |
---|
| 2295 | + |
---|
| 2296 | + dr = kzalloc(struct_size(dr, func, relations->device_count), |
---|
| 2297 | + GFP_NOWAIT); |
---|
| 2298 | + if (!dr) |
---|
| 2299 | + return; |
---|
| 2300 | + |
---|
| 2301 | + dr->device_count = relations->device_count; |
---|
| 2302 | + for (i = 0; i < dr->device_count; i++) { |
---|
| 2303 | + dr->func[i].v_id = relations->func[i].v_id; |
---|
| 2304 | + dr->func[i].d_id = relations->func[i].d_id; |
---|
| 2305 | + dr->func[i].rev = relations->func[i].rev; |
---|
| 2306 | + dr->func[i].prog_intf = relations->func[i].prog_intf; |
---|
| 2307 | + dr->func[i].subclass = relations->func[i].subclass; |
---|
| 2308 | + dr->func[i].base_class = relations->func[i].base_class; |
---|
| 2309 | + dr->func[i].subsystem_id = relations->func[i].subsystem_id; |
---|
| 2310 | + dr->func[i].win_slot = relations->func[i].win_slot; |
---|
| 2311 | + dr->func[i].ser = relations->func[i].ser; |
---|
| 2312 | + } |
---|
| 2313 | + |
---|
| 2314 | + if (hv_pci_start_relations_work(hbus, dr)) |
---|
| 2315 | + kfree(dr); |
---|
| 2316 | +} |
---|
| 2317 | + |
---|
| 2318 | +/** |
---|
| 2319 | + * hv_pci_devices_present2() - Handle list of new children |
---|
| 2320 | + * @hbus: Root PCI bus, as understood by this driver |
---|
| 2321 | + * @relations: Packet from host listing children |
---|
| 2322 | + * |
---|
| 2323 | + * This function is the v2 version of hv_pci_devices_present() |
---|
| 2324 | + */ |
---|
| 2325 | +static void hv_pci_devices_present2(struct hv_pcibus_device *hbus, |
---|
| 2326 | + struct pci_bus_relations2 *relations) |
---|
| 2327 | +{ |
---|
| 2328 | + struct hv_dr_state *dr; |
---|
| 2329 | + int i; |
---|
| 2330 | + |
---|
| 2331 | + dr = kzalloc(struct_size(dr, func, relations->device_count), |
---|
| 2332 | + GFP_NOWAIT); |
---|
| 2333 | + if (!dr) |
---|
| 2334 | + return; |
---|
| 2335 | + |
---|
| 2336 | + dr->device_count = relations->device_count; |
---|
| 2337 | + for (i = 0; i < dr->device_count; i++) { |
---|
| 2338 | + dr->func[i].v_id = relations->func[i].v_id; |
---|
| 2339 | + dr->func[i].d_id = relations->func[i].d_id; |
---|
| 2340 | + dr->func[i].rev = relations->func[i].rev; |
---|
| 2341 | + dr->func[i].prog_intf = relations->func[i].prog_intf; |
---|
| 2342 | + dr->func[i].subclass = relations->func[i].subclass; |
---|
| 2343 | + dr->func[i].base_class = relations->func[i].base_class; |
---|
| 2344 | + dr->func[i].subsystem_id = relations->func[i].subsystem_id; |
---|
| 2345 | + dr->func[i].win_slot = relations->func[i].win_slot; |
---|
| 2346 | + dr->func[i].ser = relations->func[i].ser; |
---|
| 2347 | + dr->func[i].flags = relations->func[i].flags; |
---|
| 2348 | + dr->func[i].virtual_numa_node = |
---|
| 2349 | + relations->func[i].virtual_numa_node; |
---|
| 2350 | + } |
---|
| 2351 | + |
---|
| 2352 | + if (hv_pci_start_relations_work(hbus, dr)) |
---|
| 2353 | + kfree(dr); |
---|
1869 | 2354 | } |
---|
1870 | 2355 | |
---|
1871 | 2356 | /** |
---|
.. | .. |
---|
1945 | 2430 | */ |
---|
1946 | 2431 | static void hv_pci_eject_device(struct hv_pci_dev *hpdev) |
---|
1947 | 2432 | { |
---|
| 2433 | + struct hv_pcibus_device *hbus = hpdev->hbus; |
---|
| 2434 | + struct hv_device *hdev = hbus->hdev; |
---|
| 2435 | + |
---|
| 2436 | + if (hbus->state == hv_pcibus_removing) { |
---|
| 2437 | + dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n"); |
---|
| 2438 | + return; |
---|
| 2439 | + } |
---|
| 2440 | + |
---|
1948 | 2441 | hpdev->state = hv_pcichild_ejecting; |
---|
1949 | 2442 | get_pcichild(hpdev); |
---|
1950 | 2443 | INIT_WORK(&hpdev->wrk, hv_eject_device_work); |
---|
1951 | | - get_hvpcibus(hpdev->hbus); |
---|
1952 | | - queue_work(hpdev->hbus->wq, &hpdev->wrk); |
---|
| 2444 | + get_hvpcibus(hbus); |
---|
| 2445 | + queue_work(hbus->wq, &hpdev->wrk); |
---|
1953 | 2446 | } |
---|
1954 | 2447 | |
---|
1955 | 2448 | /** |
---|
.. | .. |
---|
1973 | 2466 | struct pci_response *response; |
---|
1974 | 2467 | struct pci_incoming_message *new_message; |
---|
1975 | 2468 | struct pci_bus_relations *bus_rel; |
---|
| 2469 | + struct pci_bus_relations2 *bus_rel2; |
---|
| 2470 | + struct pci_dev_inval_block *inval; |
---|
1976 | 2471 | struct pci_dev_incoming *dev_message; |
---|
1977 | 2472 | struct hv_pci_dev *hpdev; |
---|
1978 | 2473 | |
---|
.. | .. |
---|
2028 | 2523 | |
---|
2029 | 2524 | bus_rel = (struct pci_bus_relations *)buffer; |
---|
2030 | 2525 | if (bytes_recvd < |
---|
2031 | | - offsetof(struct pci_bus_relations, func) + |
---|
2032 | | - (sizeof(struct pci_function_description) * |
---|
2033 | | - (bus_rel->device_count))) { |
---|
| 2526 | + struct_size(bus_rel, func, |
---|
| 2527 | + bus_rel->device_count)) { |
---|
2034 | 2528 | dev_err(&hbus->hdev->device, |
---|
2035 | 2529 | "bus relations too small\n"); |
---|
2036 | 2530 | break; |
---|
2037 | 2531 | } |
---|
2038 | 2532 | |
---|
2039 | 2533 | hv_pci_devices_present(hbus, bus_rel); |
---|
| 2534 | + break; |
---|
| 2535 | + |
---|
| 2536 | + case PCI_BUS_RELATIONS2: |
---|
| 2537 | + |
---|
| 2538 | + bus_rel2 = (struct pci_bus_relations2 *)buffer; |
---|
| 2539 | + if (bytes_recvd < |
---|
| 2540 | + struct_size(bus_rel2, func, |
---|
| 2541 | + bus_rel2->device_count)) { |
---|
| 2542 | + dev_err(&hbus->hdev->device, |
---|
| 2543 | + "bus relations v2 too small\n"); |
---|
| 2544 | + break; |
---|
| 2545 | + } |
---|
| 2546 | + |
---|
| 2547 | + hv_pci_devices_present2(hbus, bus_rel2); |
---|
2040 | 2548 | break; |
---|
2041 | 2549 | |
---|
2042 | 2550 | case PCI_EJECT: |
---|
.. | .. |
---|
2046 | 2554 | dev_message->wslot.slot); |
---|
2047 | 2555 | if (hpdev) { |
---|
2048 | 2556 | hv_pci_eject_device(hpdev); |
---|
| 2557 | + put_pcichild(hpdev); |
---|
| 2558 | + } |
---|
| 2559 | + break; |
---|
| 2560 | + |
---|
| 2561 | + case PCI_INVALIDATE_BLOCK: |
---|
| 2562 | + |
---|
| 2563 | + inval = (struct pci_dev_inval_block *)buffer; |
---|
| 2564 | + hpdev = get_pcichild_wslot(hbus, |
---|
| 2565 | + inval->wslot.slot); |
---|
| 2566 | + if (hpdev) { |
---|
| 2567 | + if (hpdev->block_invalidate) { |
---|
| 2568 | + hpdev->block_invalidate( |
---|
| 2569 | + hpdev->invalidate_context, |
---|
| 2570 | + inval->block_mask); |
---|
| 2571 | + } |
---|
2049 | 2572 | put_pcichild(hpdev); |
---|
2050 | 2573 | } |
---|
2051 | 2574 | break; |
---|
.. | .. |
---|
2071 | 2594 | |
---|
2072 | 2595 | /** |
---|
2073 | 2596 | * hv_pci_protocol_negotiation() - Set up protocol |
---|
2074 | | - * @hdev: VMBus's tracking struct for this root PCI bus |
---|
| 2597 | + * @hdev: VMBus's tracking struct for this root PCI bus. |
---|
| 2598 | + * @version: Array of supported channel protocol versions in |
---|
| 2599 | + * the order of probing - highest go first. |
---|
| 2600 | + * @num_version: Number of elements in the version array. |
---|
2075 | 2601 | * |
---|
2076 | 2602 | * This driver is intended to support running on Windows 10 |
---|
2077 | 2603 | * (server) and later versions. It will not run on earlier |
---|
.. | .. |
---|
2085 | 2611 | * failing if the host doesn't support the necessary protocol |
---|
2086 | 2612 | * level. |
---|
2087 | 2613 | */ |
---|
2088 | | -static int hv_pci_protocol_negotiation(struct hv_device *hdev) |
---|
| 2614 | +static int hv_pci_protocol_negotiation(struct hv_device *hdev, |
---|
| 2615 | + enum pci_protocol_version_t version[], |
---|
| 2616 | + int num_version) |
---|
2089 | 2617 | { |
---|
| 2618 | + struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); |
---|
2090 | 2619 | struct pci_version_request *version_req; |
---|
2091 | 2620 | struct hv_pci_compl comp_pkt; |
---|
2092 | 2621 | struct pci_packet *pkt; |
---|
.. | .. |
---|
2109 | 2638 | version_req = (struct pci_version_request *)&pkt->message; |
---|
2110 | 2639 | version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; |
---|
2111 | 2640 | |
---|
2112 | | - for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) { |
---|
2113 | | - version_req->protocol_version = pci_protocol_versions[i]; |
---|
| 2641 | + for (i = 0; i < num_version; i++) { |
---|
| 2642 | + version_req->protocol_version = version[i]; |
---|
2114 | 2643 | ret = vmbus_sendpacket(hdev->channel, version_req, |
---|
2115 | 2644 | sizeof(struct pci_version_request), |
---|
2116 | 2645 | (unsigned long)pkt, VM_PKT_DATA_INBAND, |
---|
.. | .. |
---|
2126 | 2655 | } |
---|
2127 | 2656 | |
---|
2128 | 2657 | if (comp_pkt.completion_status >= 0) { |
---|
2129 | | - pci_protocol_version = pci_protocol_versions[i]; |
---|
| 2658 | + hbus->protocol_version = version[i]; |
---|
2130 | 2659 | dev_info(&hdev->device, |
---|
2131 | 2660 | "PCI VMBus probing: Using version %#x\n", |
---|
2132 | | - pci_protocol_version); |
---|
| 2661 | + hbus->protocol_version); |
---|
2133 | 2662 | goto exit; |
---|
2134 | 2663 | } |
---|
2135 | 2664 | |
---|
.. | .. |
---|
2299 | 2828 | vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH); |
---|
2300 | 2829 | } |
---|
2301 | 2830 | |
---|
| 2831 | +static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs); |
---|
| 2832 | + |
---|
2302 | 2833 | /** |
---|
2303 | 2834 | * hv_pci_enter_d0() - Bring the "bus" into the D0 power state |
---|
2304 | 2835 | * @hdev: VMBus's tracking struct for this root PCI bus |
---|
.. | .. |
---|
2410 | 2941 | struct hv_pci_dev *hpdev; |
---|
2411 | 2942 | struct pci_packet *pkt; |
---|
2412 | 2943 | size_t size_res; |
---|
2413 | | - u32 wslot; |
---|
| 2944 | + int wslot; |
---|
2414 | 2945 | int ret; |
---|
2415 | 2946 | |
---|
2416 | | - size_res = (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2) |
---|
| 2947 | + size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) |
---|
2417 | 2948 | ? sizeof(*res_assigned) : sizeof(*res_assigned2); |
---|
2418 | 2949 | |
---|
2419 | 2950 | pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL); |
---|
.. | .. |
---|
2432 | 2963 | pkt->completion_func = hv_pci_generic_compl; |
---|
2433 | 2964 | pkt->compl_ctxt = &comp_pkt; |
---|
2434 | 2965 | |
---|
2435 | | - if (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2) { |
---|
| 2966 | + if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) { |
---|
2436 | 2967 | res_assigned = |
---|
2437 | 2968 | (struct pci_resources_assigned *)&pkt->message; |
---|
2438 | 2969 | res_assigned->message_type.type = |
---|
.. | .. |
---|
2463 | 2994 | comp_pkt.completion_status); |
---|
2464 | 2995 | break; |
---|
2465 | 2996 | } |
---|
| 2997 | + |
---|
| 2998 | + hbus->wslot_res_allocated = wslot; |
---|
2466 | 2999 | } |
---|
2467 | 3000 | |
---|
2468 | 3001 | kfree(pkt); |
---|
.. | .. |
---|
2481 | 3014 | struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); |
---|
2482 | 3015 | struct pci_child_message pkt; |
---|
2483 | 3016 | struct hv_pci_dev *hpdev; |
---|
2484 | | - u32 wslot; |
---|
| 3017 | + int wslot; |
---|
2485 | 3018 | int ret; |
---|
2486 | 3019 | |
---|
2487 | | - for (wslot = 0; wslot < 256; wslot++) { |
---|
| 3020 | + for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) { |
---|
2488 | 3021 | hpdev = get_pcichild_wslot(hbus, wslot); |
---|
2489 | 3022 | if (!hpdev) |
---|
2490 | 3023 | continue; |
---|
.. | .. |
---|
2499 | 3032 | VM_PKT_DATA_INBAND, 0); |
---|
2500 | 3033 | if (ret) |
---|
2501 | 3034 | return ret; |
---|
| 3035 | + |
---|
| 3036 | + hbus->wslot_res_allocated = wslot - 1; |
---|
2502 | 3037 | } |
---|
| 3038 | + |
---|
| 3039 | + hbus->wslot_res_allocated = -1; |
---|
2503 | 3040 | |
---|
2504 | 3041 | return 0; |
---|
2505 | 3042 | } |
---|
.. | .. |
---|
2515 | 3052 | complete(&hbus->remove_event); |
---|
2516 | 3053 | } |
---|
2517 | 3054 | |
---|
| 3055 | +#define HVPCI_DOM_MAP_SIZE (64 * 1024) |
---|
| 3056 | +static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); |
---|
| 3057 | + |
---|
| 3058 | +/* |
---|
| 3059 | + * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 |
---|
| 3060 | + * as invalid for passthrough PCI devices of this driver. |
---|
| 3061 | + */ |
---|
| 3062 | +#define HVPCI_DOM_INVALID 0 |
---|
| 3063 | + |
---|
| 3064 | +/** |
---|
| 3065 | + * hv_get_dom_num() - Get a valid PCI domain number |
---|
| 3066 | + * Check if the PCI domain number is in use, and return another number if |
---|
| 3067 | + * it is in use. |
---|
| 3068 | + * |
---|
| 3069 | + * @dom: Requested domain number |
---|
| 3070 | + * |
---|
| 3071 | + * Return: domain number on success, HVPCI_DOM_INVALID on failure |
---|
| 3072 | + */ |
---|
| 3073 | +static u16 hv_get_dom_num(u16 dom) |
---|
| 3074 | +{ |
---|
| 3075 | + unsigned int i; |
---|
| 3076 | + |
---|
| 3077 | + if (test_and_set_bit(dom, hvpci_dom_map) == 0) |
---|
| 3078 | + return dom; |
---|
| 3079 | + |
---|
| 3080 | + for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { |
---|
| 3081 | + if (test_and_set_bit(i, hvpci_dom_map) == 0) |
---|
| 3082 | + return i; |
---|
| 3083 | + } |
---|
| 3084 | + |
---|
| 3085 | + return HVPCI_DOM_INVALID; |
---|
| 3086 | +} |
---|
| 3087 | + |
---|
| 3088 | +/** |
---|
| 3089 | + * hv_put_dom_num() - Mark the PCI domain number as free |
---|
| 3090 | + * @dom: Domain number to be freed |
---|
| 3091 | + */ |
---|
| 3092 | +static void hv_put_dom_num(u16 dom) |
---|
| 3093 | +{ |
---|
| 3094 | + clear_bit(dom, hvpci_dom_map); |
---|
| 3095 | +} |
---|
| 3096 | + |
---|
2518 | 3097 | /** |
---|
2519 | 3098 | * hv_pci_probe() - New VMBus channel probe, for a root PCI bus |
---|
2520 | 3099 | * @hdev: VMBus's tracking struct for this root PCI bus |
---|
.. | .. |
---|
2526 | 3105 | const struct hv_vmbus_device_id *dev_id) |
---|
2527 | 3106 | { |
---|
2528 | 3107 | struct hv_pcibus_device *hbus; |
---|
| 3108 | + u16 dom_req, dom; |
---|
| 3109 | + char *name; |
---|
| 3110 | + bool enter_d0_retry = true; |
---|
2529 | 3111 | int ret; |
---|
2530 | 3112 | |
---|
2531 | 3113 | /* |
---|
2532 | 3114 | * hv_pcibus_device contains the hypercall arguments for retargeting in |
---|
2533 | 3115 | * hv_irq_unmask(). Those must not cross a page boundary. |
---|
2534 | 3116 | */ |
---|
2535 | | - BUILD_BUG_ON(sizeof(*hbus) > PAGE_SIZE); |
---|
| 3117 | + BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE); |
---|
2536 | 3118 | |
---|
2537 | | - hbus = (struct hv_pcibus_device *)get_zeroed_page(GFP_KERNEL); |
---|
| 3119 | + /* |
---|
| 3120 | + * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural |
---|
| 3121 | + * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate |
---|
| 3122 | + * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and |
---|
| 3123 | + * alignment of hbus is important because hbus's field |
---|
| 3124 | + * retarget_msi_interrupt_params must not cross a 4KB page boundary. |
---|
| 3125 | + * |
---|
| 3126 | + * Here we prefer kzalloc to get_zeroed_page(), because a buffer |
---|
| 3127 | + * allocated by the latter is not tracked and scanned by kmemleak, and |
---|
| 3128 | + * hence kmemleak reports the pointer contained in the hbus buffer |
---|
| 3129 | + * (i.e. the hpdev struct, which is created in new_pcichild_device() and |
---|
| 3130 | + * is tracked by hbus->children) as memory leak (false positive). |
---|
| 3131 | + * |
---|
| 3132 | + * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be |
---|
| 3133 | + * used to allocate the hbus buffer and we can avoid the kmemleak false |
---|
| 3134 | + * positive by using kmemleak_alloc() and kmemleak_free() to ask |
---|
| 3135 | + * kmemleak to track and scan the hbus buffer. |
---|
| 3136 | + */ |
---|
| 3137 | + hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); |
---|
2538 | 3138 | if (!hbus) |
---|
2539 | 3139 | return -ENOMEM; |
---|
2540 | 3140 | hbus->state = hv_pcibus_init; |
---|
| 3141 | + hbus->wslot_res_allocated = -1; |
---|
2541 | 3142 | |
---|
2542 | 3143 | /* |
---|
2543 | | - * The PCI bus "domain" is what is called "segment" in ACPI and |
---|
2544 | | - * other specs. Pull it from the instance ID, to get something |
---|
2545 | | - * unique. Bytes 8 and 9 are what is used in Windows guests, so |
---|
2546 | | - * do the same thing for consistency. Note that, since this code |
---|
2547 | | - * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee |
---|
2548 | | - * that (1) the only domain in use for something that looks like |
---|
2549 | | - * a physical PCI bus (which is actually emulated by the |
---|
2550 | | - * hypervisor) is domain 0 and (2) there will be no overlap |
---|
2551 | | - * between domains derived from these instance IDs in the same |
---|
2552 | | - * VM. |
---|
| 3144 | + * The PCI bus "domain" is what is called "segment" in ACPI and other |
---|
| 3145 | + * specs. Pull it from the instance ID, to get something usually |
---|
| 3146 | + * unique. In rare cases of collision, we will find out another number |
---|
| 3147 | + * not in use. |
---|
| 3148 | + * |
---|
| 3149 | + * Note that, since this code only runs in a Hyper-V VM, Hyper-V |
---|
| 3150 | + * together with this guest driver can guarantee that (1) The only |
---|
| 3151 | + * domain used by Gen1 VMs for something that looks like a physical |
---|
| 3152 | + * PCI bus (which is actually emulated by the hypervisor) is domain 0. |
---|
| 3153 | + * (2) There will be no overlap between domains (after fixing possible |
---|
| 3154 | + * collisions) in the same VM. |
---|
2553 | 3155 | */ |
---|
2554 | | - hbus->sysdata.domain = hdev->dev_instance.b[9] | |
---|
2555 | | - hdev->dev_instance.b[8] << 8; |
---|
| 3156 | + dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; |
---|
| 3157 | + dom = hv_get_dom_num(dom_req); |
---|
| 3158 | + |
---|
| 3159 | + if (dom == HVPCI_DOM_INVALID) { |
---|
| 3160 | + dev_err(&hdev->device, |
---|
| 3161 | + "Unable to use dom# 0x%hx or other numbers", dom_req); |
---|
| 3162 | + ret = -EINVAL; |
---|
| 3163 | + goto free_bus; |
---|
| 3164 | + } |
---|
| 3165 | + |
---|
| 3166 | + if (dom != dom_req) |
---|
| 3167 | + dev_info(&hdev->device, |
---|
| 3168 | + "PCI dom# 0x%hx has collision, using 0x%hx", |
---|
| 3169 | + dom_req, dom); |
---|
| 3170 | + |
---|
| 3171 | + hbus->sysdata.domain = dom; |
---|
2556 | 3172 | |
---|
2557 | 3173 | hbus->hdev = hdev; |
---|
2558 | 3174 | refcount_set(&hbus->remove_lock, 1); |
---|
.. | .. |
---|
2567 | 3183 | hbus->sysdata.domain); |
---|
2568 | 3184 | if (!hbus->wq) { |
---|
2569 | 3185 | ret = -ENOMEM; |
---|
2570 | | - goto free_bus; |
---|
| 3186 | + goto free_dom; |
---|
2571 | 3187 | } |
---|
2572 | 3188 | |
---|
2573 | 3189 | ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, |
---|
.. | .. |
---|
2577 | 3193 | |
---|
2578 | 3194 | hv_set_drvdata(hdev, hbus); |
---|
2579 | 3195 | |
---|
2580 | | - ret = hv_pci_protocol_negotiation(hdev); |
---|
| 3196 | + ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions, |
---|
| 3197 | + ARRAY_SIZE(pci_protocol_versions)); |
---|
2581 | 3198 | if (ret) |
---|
2582 | 3199 | goto close; |
---|
2583 | 3200 | |
---|
.. | .. |
---|
2594 | 3211 | goto free_config; |
---|
2595 | 3212 | } |
---|
2596 | 3213 | |
---|
2597 | | - hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus); |
---|
| 3214 | + name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance); |
---|
| 3215 | + if (!name) { |
---|
| 3216 | + ret = -ENOMEM; |
---|
| 3217 | + goto unmap; |
---|
| 3218 | + } |
---|
| 3219 | + |
---|
| 3220 | + hbus->sysdata.fwnode = irq_domain_alloc_named_fwnode(name); |
---|
| 3221 | + kfree(name); |
---|
2598 | 3222 | if (!hbus->sysdata.fwnode) { |
---|
2599 | 3223 | ret = -ENOMEM; |
---|
2600 | 3224 | goto unmap; |
---|
.. | .. |
---|
2604 | 3228 | if (ret) |
---|
2605 | 3229 | goto free_fwnode; |
---|
2606 | 3230 | |
---|
| 3231 | +retry: |
---|
2607 | 3232 | ret = hv_pci_query_relations(hdev); |
---|
2608 | 3233 | if (ret) |
---|
2609 | 3234 | goto free_irq_domain; |
---|
2610 | 3235 | |
---|
2611 | 3236 | ret = hv_pci_enter_d0(hdev); |
---|
| 3237 | + /* |
---|
| 3238 | + * In certain case (Kdump) the pci device of interest was |
---|
| 3239 | + * not cleanly shut down and resource is still held on host |
---|
| 3240 | + * side, the host could return invalid device status. |
---|
| 3241 | + * We need to explicitly request host to release the resource |
---|
| 3242 | + * and try to enter D0 again. |
---|
| 3243 | + * Since the hv_pci_bus_exit() call releases structures |
---|
| 3244 | + * of all its child devices, we need to start the retry from |
---|
| 3245 | + * hv_pci_query_relations() call, requesting host to send |
---|
| 3246 | + * the synchronous child device relations message before this |
---|
| 3247 | + * information is needed in hv_send_resources_allocated() |
---|
| 3248 | + * call later. |
---|
| 3249 | + */ |
---|
| 3250 | + if (ret == -EPROTO && enter_d0_retry) { |
---|
| 3251 | + enter_d0_retry = false; |
---|
| 3252 | + |
---|
| 3253 | + dev_err(&hdev->device, "Retrying D0 Entry\n"); |
---|
| 3254 | + |
---|
| 3255 | + /* |
---|
| 3256 | + * hv_pci_bus_exit() calls hv_send_resources_released() |
---|
| 3257 | + * to free up resources of its child devices. |
---|
| 3258 | + * In the kdump kernel we need to set the |
---|
| 3259 | + * wslot_res_allocated to 255 so it scans all child |
---|
| 3260 | + * devices to release resources allocated in the |
---|
| 3261 | + * normal kernel before panic happened. |
---|
| 3262 | + */ |
---|
| 3263 | + hbus->wslot_res_allocated = 255; |
---|
| 3264 | + ret = hv_pci_bus_exit(hdev, true); |
---|
| 3265 | + |
---|
| 3266 | + if (ret == 0) |
---|
| 3267 | + goto retry; |
---|
| 3268 | + |
---|
| 3269 | + dev_err(&hdev->device, |
---|
| 3270 | + "Retrying D0 failed with ret %d\n", ret); |
---|
| 3271 | + } |
---|
2612 | 3272 | if (ret) |
---|
2613 | 3273 | goto free_irq_domain; |
---|
2614 | 3274 | |
---|
2615 | 3275 | ret = hv_pci_allocate_bridge_windows(hbus); |
---|
2616 | 3276 | if (ret) |
---|
2617 | | - goto free_irq_domain; |
---|
| 3277 | + goto exit_d0; |
---|
2618 | 3278 | |
---|
2619 | 3279 | ret = hv_send_resources_allocated(hdev); |
---|
2620 | 3280 | if (ret) |
---|
.. | .. |
---|
2632 | 3292 | |
---|
2633 | 3293 | free_windows: |
---|
2634 | 3294 | hv_pci_free_bridge_windows(hbus); |
---|
| 3295 | +exit_d0: |
---|
| 3296 | + (void) hv_pci_bus_exit(hdev, true); |
---|
2635 | 3297 | free_irq_domain: |
---|
2636 | 3298 | irq_domain_remove(hbus->irq_domain); |
---|
2637 | 3299 | free_fwnode: |
---|
.. | .. |
---|
2644 | 3306 | vmbus_close(hdev->channel); |
---|
2645 | 3307 | destroy_wq: |
---|
2646 | 3308 | destroy_workqueue(hbus->wq); |
---|
| 3309 | +free_dom: |
---|
| 3310 | + hv_put_dom_num(hbus->sysdata.domain); |
---|
2647 | 3311 | free_bus: |
---|
2648 | | - free_page((unsigned long)hbus); |
---|
| 3312 | + kfree(hbus); |
---|
2649 | 3313 | return ret; |
---|
2650 | 3314 | } |
---|
2651 | 3315 | |
---|
2652 | | -static void hv_pci_bus_exit(struct hv_device *hdev) |
---|
| 3316 | +static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) |
---|
2653 | 3317 | { |
---|
2654 | 3318 | struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); |
---|
2655 | 3319 | struct { |
---|
2656 | 3320 | struct pci_packet teardown_packet; |
---|
2657 | 3321 | u8 buffer[sizeof(struct pci_message)]; |
---|
2658 | 3322 | } pkt; |
---|
2659 | | - struct pci_bus_relations relations; |
---|
2660 | 3323 | struct hv_pci_compl comp_pkt; |
---|
| 3324 | + struct hv_pci_dev *hpdev, *tmp; |
---|
| 3325 | + unsigned long flags; |
---|
2661 | 3326 | int ret; |
---|
2662 | 3327 | |
---|
2663 | 3328 | /* |
---|
.. | .. |
---|
2665 | 3330 | * access the per-channel ringbuffer any longer. |
---|
2666 | 3331 | */ |
---|
2667 | 3332 | if (hdev->channel->rescind) |
---|
2668 | | - return; |
---|
| 3333 | + return 0; |
---|
2669 | 3334 | |
---|
2670 | | - /* Delete any children which might still exist. */ |
---|
2671 | | - memset(&relations, 0, sizeof(relations)); |
---|
2672 | | - hv_pci_devices_present(hbus, &relations); |
---|
| 3335 | + if (!keep_devs) { |
---|
| 3336 | + struct list_head removed; |
---|
| 3337 | + |
---|
| 3338 | + /* Move all present children to the list on stack */ |
---|
| 3339 | + INIT_LIST_HEAD(&removed); |
---|
| 3340 | + spin_lock_irqsave(&hbus->device_list_lock, flags); |
---|
| 3341 | + list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry) |
---|
| 3342 | + list_move_tail(&hpdev->list_entry, &removed); |
---|
| 3343 | + spin_unlock_irqrestore(&hbus->device_list_lock, flags); |
---|
| 3344 | + |
---|
| 3345 | + /* Remove all children in the list */ |
---|
| 3346 | + list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) { |
---|
| 3347 | + list_del(&hpdev->list_entry); |
---|
| 3348 | + if (hpdev->pci_slot) |
---|
| 3349 | + pci_destroy_slot(hpdev->pci_slot); |
---|
| 3350 | + /* For the two refs got in new_pcichild_device() */ |
---|
| 3351 | + put_pcichild(hpdev); |
---|
| 3352 | + put_pcichild(hpdev); |
---|
| 3353 | + } |
---|
| 3354 | + } |
---|
2673 | 3355 | |
---|
2674 | 3356 | ret = hv_send_resources_released(hdev); |
---|
2675 | | - if (ret) |
---|
| 3357 | + if (ret) { |
---|
2676 | 3358 | dev_err(&hdev->device, |
---|
2677 | 3359 | "Couldn't send resources released packet(s)\n"); |
---|
| 3360 | + return ret; |
---|
| 3361 | + } |
---|
2678 | 3362 | |
---|
2679 | 3363 | memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet)); |
---|
2680 | 3364 | init_completion(&comp_pkt.host_event); |
---|
.. | .. |
---|
2687 | 3371 | (unsigned long)&pkt.teardown_packet, |
---|
2688 | 3372 | VM_PKT_DATA_INBAND, |
---|
2689 | 3373 | VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); |
---|
2690 | | - if (!ret) |
---|
2691 | | - wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ); |
---|
| 3374 | + if (ret) |
---|
| 3375 | + return ret; |
---|
| 3376 | + |
---|
| 3377 | + if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) |
---|
| 3378 | + return -ETIMEDOUT; |
---|
| 3379 | + |
---|
| 3380 | + return 0; |
---|
2692 | 3381 | } |
---|
2693 | 3382 | |
---|
2694 | 3383 | /** |
---|
.. | .. |
---|
2700 | 3389 | static int hv_pci_remove(struct hv_device *hdev) |
---|
2701 | 3390 | { |
---|
2702 | 3391 | struct hv_pcibus_device *hbus; |
---|
| 3392 | + int ret; |
---|
2703 | 3393 | |
---|
2704 | 3394 | hbus = hv_get_drvdata(hdev); |
---|
2705 | 3395 | if (hbus->state == hv_pcibus_installed) { |
---|
| 3396 | + tasklet_disable(&hdev->channel->callback_event); |
---|
| 3397 | + hbus->state = hv_pcibus_removing; |
---|
| 3398 | + tasklet_enable(&hdev->channel->callback_event); |
---|
| 3399 | + destroy_workqueue(hbus->wq); |
---|
| 3400 | + hbus->wq = NULL; |
---|
| 3401 | + /* |
---|
| 3402 | + * At this point, no work is running or can be scheduled |
---|
| 3403 | + * on hbus->wq. We can't race with hv_pci_devices_present() |
---|
| 3404 | + * or hv_pci_eject_device(), it's safe to proceed. |
---|
| 3405 | + */ |
---|
| 3406 | + |
---|
2706 | 3407 | /* Remove the bus from PCI's point of view. */ |
---|
2707 | 3408 | pci_lock_rescan_remove(); |
---|
2708 | 3409 | pci_stop_root_bus(hbus->pci_bus); |
---|
2709 | 3410 | hv_pci_remove_slots(hbus); |
---|
2710 | 3411 | pci_remove_root_bus(hbus->pci_bus); |
---|
2711 | 3412 | pci_unlock_rescan_remove(); |
---|
2712 | | - hbus->state = hv_pcibus_removed; |
---|
2713 | 3413 | } |
---|
2714 | 3414 | |
---|
2715 | | - hv_pci_bus_exit(hdev); |
---|
| 3415 | + ret = hv_pci_bus_exit(hdev, false); |
---|
2716 | 3416 | |
---|
2717 | 3417 | vmbus_close(hdev->channel); |
---|
2718 | 3418 | |
---|
.. | .. |
---|
2724 | 3424 | irq_domain_free_fwnode(hbus->sysdata.fwnode); |
---|
2725 | 3425 | put_hvpcibus(hbus); |
---|
2726 | 3426 | wait_for_completion(&hbus->remove_event); |
---|
2727 | | - destroy_workqueue(hbus->wq); |
---|
2728 | | - free_page((unsigned long)hbus); |
---|
| 3427 | + |
---|
| 3428 | + hv_put_dom_num(hbus->sysdata.domain); |
---|
| 3429 | + |
---|
| 3430 | + kfree(hbus); |
---|
| 3431 | + return ret; |
---|
| 3432 | +} |
---|
| 3433 | + |
---|
| 3434 | +static int hv_pci_suspend(struct hv_device *hdev) |
---|
| 3435 | +{ |
---|
| 3436 | + struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); |
---|
| 3437 | + enum hv_pcibus_state old_state; |
---|
| 3438 | + int ret; |
---|
| 3439 | + |
---|
| 3440 | + /* |
---|
| 3441 | + * hv_pci_suspend() must make sure there are no pending work items |
---|
| 3442 | + * before calling vmbus_close(), since it runs in a process context |
---|
| 3443 | + * as a callback in dpm_suspend(). When it starts to run, the channel |
---|
| 3444 | + * callback hv_pci_onchannelcallback(), which runs in a tasklet |
---|
| 3445 | + * context, can be still running concurrently and scheduling new work |
---|
| 3446 | + * items onto hbus->wq in hv_pci_devices_present() and |
---|
| 3447 | + * hv_pci_eject_device(), and the work item handlers can access the |
---|
| 3448 | + * vmbus channel, which can be being closed by hv_pci_suspend(), e.g. |
---|
| 3449 | + * the work item handler pci_devices_present_work() -> |
---|
| 3450 | + * new_pcichild_device() writes to the vmbus channel. |
---|
| 3451 | + * |
---|
| 3452 | + * To eliminate the race, hv_pci_suspend() disables the channel |
---|
| 3453 | + * callback tasklet, sets hbus->state to hv_pcibus_removing, and |
---|
| 3454 | + * re-enables the tasklet. This way, when hv_pci_suspend() proceeds, |
---|
| 3455 | + * it knows that no new work item can be scheduled, and then it flushes |
---|
| 3456 | + * hbus->wq and safely closes the vmbus channel. |
---|
| 3457 | + */ |
---|
| 3458 | + tasklet_disable(&hdev->channel->callback_event); |
---|
| 3459 | + |
---|
| 3460 | + /* Change the hbus state to prevent new work items. */ |
---|
| 3461 | + old_state = hbus->state; |
---|
| 3462 | + if (hbus->state == hv_pcibus_installed) |
---|
| 3463 | + hbus->state = hv_pcibus_removing; |
---|
| 3464 | + |
---|
| 3465 | + tasklet_enable(&hdev->channel->callback_event); |
---|
| 3466 | + |
---|
| 3467 | + if (old_state != hv_pcibus_installed) |
---|
| 3468 | + return -EINVAL; |
---|
| 3469 | + |
---|
| 3470 | + flush_workqueue(hbus->wq); |
---|
| 3471 | + |
---|
| 3472 | + ret = hv_pci_bus_exit(hdev, true); |
---|
| 3473 | + if (ret) |
---|
| 3474 | + return ret; |
---|
| 3475 | + |
---|
| 3476 | + vmbus_close(hdev->channel); |
---|
| 3477 | + |
---|
2729 | 3478 | return 0; |
---|
| 3479 | +} |
---|
| 3480 | + |
---|
| 3481 | +static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg) |
---|
| 3482 | +{ |
---|
| 3483 | + struct msi_desc *entry; |
---|
| 3484 | + struct irq_data *irq_data; |
---|
| 3485 | + |
---|
| 3486 | + for_each_pci_msi_entry(entry, pdev) { |
---|
| 3487 | + irq_data = irq_get_irq_data(entry->irq); |
---|
| 3488 | + if (WARN_ON_ONCE(!irq_data)) |
---|
| 3489 | + return -EINVAL; |
---|
| 3490 | + |
---|
| 3491 | + hv_compose_msi_msg(irq_data, &entry->msg); |
---|
| 3492 | + } |
---|
| 3493 | + |
---|
| 3494 | + return 0; |
---|
| 3495 | +} |
---|
| 3496 | + |
---|
| 3497 | +/* |
---|
| 3498 | + * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg() |
---|
| 3499 | + * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V |
---|
| 3500 | + * doesn't trap and emulate the MMIO accesses, here hv_compose_msi_msg() |
---|
| 3501 | + * must be used to ask Hyper-V to re-create the IOMMU Interrupt Remapping |
---|
| 3502 | + * Table entries. |
---|
| 3503 | + */ |
---|
| 3504 | +static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus) |
---|
| 3505 | +{ |
---|
| 3506 | + pci_walk_bus(hbus->pci_bus, hv_pci_restore_msi_msg, NULL); |
---|
| 3507 | +} |
---|
| 3508 | + |
---|
| 3509 | +static int hv_pci_resume(struct hv_device *hdev) |
---|
| 3510 | +{ |
---|
| 3511 | + struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); |
---|
| 3512 | + enum pci_protocol_version_t version[1]; |
---|
| 3513 | + int ret; |
---|
| 3514 | + |
---|
| 3515 | + hbus->state = hv_pcibus_init; |
---|
| 3516 | + |
---|
| 3517 | + ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, |
---|
| 3518 | + hv_pci_onchannelcallback, hbus); |
---|
| 3519 | + if (ret) |
---|
| 3520 | + return ret; |
---|
| 3521 | + |
---|
| 3522 | + /* Only use the version that was in use before hibernation. */ |
---|
| 3523 | + version[0] = hbus->protocol_version; |
---|
| 3524 | + ret = hv_pci_protocol_negotiation(hdev, version, 1); |
---|
| 3525 | + if (ret) |
---|
| 3526 | + goto out; |
---|
| 3527 | + |
---|
| 3528 | + ret = hv_pci_query_relations(hdev); |
---|
| 3529 | + if (ret) |
---|
| 3530 | + goto out; |
---|
| 3531 | + |
---|
| 3532 | + ret = hv_pci_enter_d0(hdev); |
---|
| 3533 | + if (ret) |
---|
| 3534 | + goto out; |
---|
| 3535 | + |
---|
| 3536 | + ret = hv_send_resources_allocated(hdev); |
---|
| 3537 | + if (ret) |
---|
| 3538 | + goto out; |
---|
| 3539 | + |
---|
| 3540 | + prepopulate_bars(hbus); |
---|
| 3541 | + |
---|
| 3542 | + hv_pci_restore_msi_state(hbus); |
---|
| 3543 | + |
---|
| 3544 | + hbus->state = hv_pcibus_installed; |
---|
| 3545 | + return 0; |
---|
| 3546 | +out: |
---|
| 3547 | + vmbus_close(hdev->channel); |
---|
| 3548 | + return ret; |
---|
2730 | 3549 | } |
---|
2731 | 3550 | |
---|
2732 | 3551 | static const struct hv_vmbus_device_id hv_pci_id_table[] = { |
---|
.. | .. |
---|
2743 | 3562 | .id_table = hv_pci_id_table, |
---|
2744 | 3563 | .probe = hv_pci_probe, |
---|
2745 | 3564 | .remove = hv_pci_remove, |
---|
| 3565 | + .suspend = hv_pci_suspend, |
---|
| 3566 | + .resume = hv_pci_resume, |
---|
2746 | 3567 | }; |
---|
2747 | 3568 | |
---|
2748 | 3569 | static void __exit exit_hv_pci_drv(void) |
---|
2749 | 3570 | { |
---|
2750 | 3571 | vmbus_driver_unregister(&hv_pci_drv); |
---|
| 3572 | + |
---|
| 3573 | + hvpci_block_ops.read_block = NULL; |
---|
| 3574 | + hvpci_block_ops.write_block = NULL; |
---|
| 3575 | + hvpci_block_ops.reg_blk_invalidate = NULL; |
---|
2751 | 3576 | } |
---|
2752 | 3577 | |
---|
2753 | 3578 | static int __init init_hv_pci_drv(void) |
---|
2754 | 3579 | { |
---|
| 3580 | + if (!hv_is_hyperv_initialized()) |
---|
| 3581 | + return -ENODEV; |
---|
| 3582 | + |
---|
| 3583 | + /* Set the invalid domain number's bit, so it will not be used */ |
---|
| 3584 | + set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); |
---|
| 3585 | + |
---|
| 3586 | + /* Initialize PCI block r/w interface */ |
---|
| 3587 | + hvpci_block_ops.read_block = hv_read_config_block; |
---|
| 3588 | + hvpci_block_ops.write_block = hv_write_config_block; |
---|
| 3589 | + hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate; |
---|
| 3590 | + |
---|
2755 | 3591 | return vmbus_driver_register(&hv_pci_drv); |
---|
2756 | 3592 | } |
---|
2757 | 3593 | |
---|