From a5969cabbb4660eab42b6ef0412cbbd1200cf14d Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Sat, 12 Oct 2024 07:10:09 +0000 Subject: [PATCH] 修改led为gpio --- kernel/drivers/pci/controller/pci-hyperv.c | 1274 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 files changed, 1,056 insertions(+), 218 deletions(-) diff --git a/kernel/drivers/pci/controller/pci-hyperv.c b/kernel/drivers/pci/controller/pci-hyperv.c index 5c28498..2d6c77d 100644 --- a/kernel/drivers/pci/controller/pci-hyperv.c +++ b/kernel/drivers/pci/controller/pci-hyperv.c @@ -63,6 +63,7 @@ enum pci_protocol_version_t { PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */ PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */ + PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3), /* Vibranium */ }; #define CPU_AFFINITY_ALL -1ULL @@ -72,14 +73,10 @@ * first. */ static enum pci_protocol_version_t pci_protocol_versions[] = { + PCI_PROTOCOL_VERSION_1_3, PCI_PROTOCOL_VERSION_1_2, PCI_PROTOCOL_VERSION_1_1, }; - -/* - * Protocol version negotiated by hv_pci_protocol_negotiation(). - */ -static enum pci_protocol_version_t pci_protocol_version; #define PCI_CONFIG_MMIO_LENGTH 0x2000 #define CFG_PAGE_OFFSET 0x1000 @@ -124,6 +121,7 @@ PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16, PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17, PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */ + PCI_BUS_RELATIONS2 = PCI_MESSAGE_BASE + 0x19, PCI_MESSAGE_MAXIMUM }; @@ -167,6 +165,26 @@ u32 subsystem_id; union win_slot_encoding win_slot; u32 ser; /* serial number */ +} __packed; + +enum pci_device_description_flags { + HV_PCI_DEVICE_FLAG_NONE = 0x0, + HV_PCI_DEVICE_FLAG_NUMA_AFFINITY = 0x1, +}; + +struct pci_function_description2 { + u16 v_id; /* vendor ID */ + u16 d_id; /* device ID */ + u8 rev; + u8 prog_intf; + u8 subclass; + u8 base_class; + u32 subsystem_id; + union win_slot_encoding win_slot; + u32 ser; /* serial number */ + u32 flags; + u16 virtual_numa_node; + u16 reserved; } __packed; /** @@ -265,7 +283,7 @@ int resp_packet_size); void *compl_ctxt; - struct pci_message message[0]; + struct pci_message message[]; }; /* @@ -301,13 +319,19 @@ struct pci_bus_relations { struct pci_incoming_message incoming; u32 device_count; - struct pci_function_description func[0]; + struct pci_function_description func[]; +} __packed; + +struct pci_bus_relations2 { + struct pci_incoming_message incoming; + u32 device_count; + struct pci_function_description2 func[]; } __packed; struct pci_q_res_req_response { struct vmpacket_descriptor hdr; s32 status; /* negative values are failures */ - u32 probed_bar[6]; + u32 probed_bar[PCI_STD_NUM_BARS]; } __packed; struct pci_set_power { @@ -365,6 +389,39 @@ struct tran_int_desc int_desc; } __packed; +/* + * Note: the VM must pass a valid block id, wslot and bytes_requested. + */ +struct pci_read_block { + struct pci_message message_type; + u32 block_id; + union win_slot_encoding wslot; + u32 bytes_requested; +} __packed; + +struct pci_read_block_response { + struct vmpacket_descriptor hdr; + u32 status; + u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; +} __packed; + +/* + * Note: the VM must pass a valid block id, wslot and byte_count. + */ +struct pci_write_block { + struct pci_message message_type; + u32 block_id; + union win_slot_encoding wslot; + u32 byte_count; + u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; +} __packed; + +struct pci_dev_inval_block { + struct pci_incoming_message incoming; + union win_slot_encoding wslot; + u64 block_mask; +} __packed; + struct pci_dev_incoming { struct pci_incoming_message incoming; union win_slot_encoding wslot; @@ -379,50 +436,6 @@ static int pci_ring_size = (4 * PAGE_SIZE); /* - * Definitions or interrupt steering hypercall. - */ -#define HV_PARTITION_ID_SELF ((u64)-1) -#define HVCALL_RETARGET_INTERRUPT 0x7e - -struct hv_interrupt_entry { - u32 source; /* 1 for MSI(-X) */ - u32 reserved1; - u32 address; - u32 data; -}; - -#define HV_VP_SET_BANK_COUNT_MAX 5 /* current implementation limit */ - -struct hv_vp_set { - u64 format; /* 0 (HvGenericSetSparse4k) */ - u64 valid_banks; - u64 masks[HV_VP_SET_BANK_COUNT_MAX]; -}; - -/* - * flags for hv_device_interrupt_target.flags - */ -#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 -#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 - -struct hv_device_interrupt_target { - u32 vector; - u32 flags; - union { - u64 vp_mask; - struct hv_vp_set vp_set; - }; -}; - -struct retarget_msi_interrupt { - u64 partition_id; /* use "self" */ - u64 device_id; - struct hv_interrupt_entry int_entry; - u64 reserved2; - struct hv_device_interrupt_target int_target; -} __packed; - -/* * Driver specific state. */ @@ -430,12 +443,14 @@ hv_pcibus_init = 0, hv_pcibus_probed, hv_pcibus_installed, - hv_pcibus_removed, + hv_pcibus_removing, hv_pcibus_maximum }; struct hv_pcibus_device { struct pci_sysdata sysdata; + /* Protocol version negotiated with the host */ + enum pci_protocol_version_t protocol_version; enum hv_pcibus_state state; refcount_t remove_lock; struct hv_device *hdev; @@ -460,12 +475,19 @@ struct msi_controller msi_chip; struct irq_domain *irq_domain; - /* hypercall arg, must not cross page boundary */ - struct retarget_msi_interrupt retarget_msi_interrupt_params; - spinlock_t retarget_msi_interrupt_lock; struct workqueue_struct *wq; + + /* Highest slot of child device with resources allocated */ + int wslot_res_allocated; + + /* hypercall arg, must not cross page boundary */ + struct hv_retarget_device_interrupt retarget_msi_interrupt_params; + + /* + * Don't put anything here: retarget_msi_interrupt_params must be last + */ }; /* @@ -478,36 +500,44 @@ struct hv_pcibus_device *bus; }; +struct hv_pcidev_description { + u16 v_id; /* vendor ID */ + u16 d_id; /* device ID */ + u8 rev; + u8 prog_intf; + u8 subclass; + u8 base_class; + u32 subsystem_id; + union win_slot_encoding win_slot; + u32 ser; /* serial number */ + u32 flags; + u16 virtual_numa_node; +}; + struct hv_dr_state { struct list_head list_entry; u32 device_count; - struct pci_function_description func[0]; -}; - -enum hv_pcichild_state { - hv_pcichild_init = 0, - hv_pcichild_requirements, - hv_pcichild_resourced, - hv_pcichild_ejecting, - hv_pcichild_maximum + struct hv_pcidev_description func[]; }; struct hv_pci_dev { /* List protected by pci_rescan_remove_lock */ struct list_head list_entry; refcount_t refs; - enum hv_pcichild_state state; struct pci_slot *pci_slot; - struct pci_function_description desc; + struct hv_pcidev_description desc; bool reported_missing; struct hv_pcibus_device *hbus; struct work_struct wrk; + + void (*block_invalidate)(void *context, u64 block_mask); + void *invalidate_context; /* * What would be observed if one wrote 0xFFFFFFFF to a BAR and then * read it back, for each of the BAR offsets within config space. */ - u32 probed_bar[6]; + u32 probed_bar[PCI_STD_NUM_BARS]; }; struct hv_pci_compl { @@ -821,6 +851,254 @@ .write = hv_pcifront_write_config, }; +/* + * Paravirtual backchannel + * + * Hyper-V SR-IOV provides a backchannel mechanism in software for + * communication between a VF driver and a PF driver. These + * "configuration blocks" are similar in concept to PCI configuration space, + * but instead of doing reads and writes in 32-bit chunks through a very slow + * path, packets of up to 128 bytes can be sent or received asynchronously. + * + * Nearly every SR-IOV device contains just such a communications channel in + * hardware, so using this one in software is usually optional. Using the + * software channel, however, allows driver implementers to leverage software + * tools that fuzz the communications channel looking for vulnerabilities. + * + * The usage model for these packets puts the responsibility for reading or + * writing on the VF driver. The VF driver sends a read or a write packet, + * indicating which "block" is being referred to by number. + * + * If the PF driver wishes to initiate communication, it can "invalidate" one or + * more of the first 64 blocks. This invalidation is delivered via a callback + * supplied by the VF driver by this driver. + * + * No protocol is implied, except that supplied by the PF and VF drivers. + */ + +struct hv_read_config_compl { + struct hv_pci_compl comp_pkt; + void *buf; + unsigned int len; + unsigned int bytes_returned; +}; + +/** + * hv_pci_read_config_compl() - Invoked when a response packet + * for a read config block operation arrives. + * @context: Identifies the read config operation + * @resp: The response packet itself + * @resp_packet_size: Size in bytes of the response packet + */ +static void hv_pci_read_config_compl(void *context, struct pci_response *resp, + int resp_packet_size) +{ + struct hv_read_config_compl *comp = context; + struct pci_read_block_response *read_resp = + (struct pci_read_block_response *)resp; + unsigned int data_len, hdr_len; + + hdr_len = offsetof(struct pci_read_block_response, bytes); + if (resp_packet_size < hdr_len) { + comp->comp_pkt.completion_status = -1; + goto out; + } + + data_len = resp_packet_size - hdr_len; + if (data_len > 0 && read_resp->status == 0) { + comp->bytes_returned = min(comp->len, data_len); + memcpy(comp->buf, read_resp->bytes, comp->bytes_returned); + } else { + comp->bytes_returned = 0; + } + + comp->comp_pkt.completion_status = read_resp->status; +out: + complete(&comp->comp_pkt.host_event); +} + +/** + * hv_read_config_block() - Sends a read config block request to + * the back-end driver running in the Hyper-V parent partition. + * @pdev: The PCI driver's representation for this device. + * @buf: Buffer into which the config block will be copied. + * @len: Size in bytes of buf. + * @block_id: Identifies the config block which has been requested. + * @bytes_returned: Size which came back from the back-end driver. + * + * Return: 0 on success, -errno on failure + */ +static int hv_read_config_block(struct pci_dev *pdev, void *buf, + unsigned int len, unsigned int block_id, + unsigned int *bytes_returned) +{ + struct hv_pcibus_device *hbus = + container_of(pdev->bus->sysdata, struct hv_pcibus_device, + sysdata); + struct { + struct pci_packet pkt; + char buf[sizeof(struct pci_read_block)]; + } pkt; + struct hv_read_config_compl comp_pkt; + struct pci_read_block *read_blk; + int ret; + + if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) + return -EINVAL; + + init_completion(&comp_pkt.comp_pkt.host_event); + comp_pkt.buf = buf; + comp_pkt.len = len; + + memset(&pkt, 0, sizeof(pkt)); + pkt.pkt.completion_func = hv_pci_read_config_compl; + pkt.pkt.compl_ctxt = &comp_pkt; + read_blk = (struct pci_read_block *)&pkt.pkt.message; + read_blk->message_type.type = PCI_READ_BLOCK; + read_blk->wslot.slot = devfn_to_wslot(pdev->devfn); + read_blk->block_id = block_id; + read_blk->bytes_requested = len; + + ret = vmbus_sendpacket(hbus->hdev->channel, read_blk, + sizeof(*read_blk), (unsigned long)&pkt.pkt, + VM_PKT_DATA_INBAND, + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + if (ret) + return ret; + + ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event); + if (ret) + return ret; + + if (comp_pkt.comp_pkt.completion_status != 0 || + comp_pkt.bytes_returned == 0) { + dev_err(&hbus->hdev->device, + "Read Config Block failed: 0x%x, bytes_returned=%d\n", + comp_pkt.comp_pkt.completion_status, + comp_pkt.bytes_returned); + return -EIO; + } + + *bytes_returned = comp_pkt.bytes_returned; + return 0; +} + +/** + * hv_pci_write_config_compl() - Invoked when a response packet for a write + * config block operation arrives. + * @context: Identifies the write config operation + * @resp: The response packet itself + * @resp_packet_size: Size in bytes of the response packet + */ +static void hv_pci_write_config_compl(void *context, struct pci_response *resp, + int resp_packet_size) +{ + struct hv_pci_compl *comp_pkt = context; + + comp_pkt->completion_status = resp->status; + complete(&comp_pkt->host_event); +} + +/** + * hv_write_config_block() - Sends a write config block request to the + * back-end driver running in the Hyper-V parent partition. + * @pdev: The PCI driver's representation for this device. + * @buf: Buffer from which the config block will be copied. + * @len: Size in bytes of buf. + * @block_id: Identifies the config block which is being written. + * + * Return: 0 on success, -errno on failure + */ +static int hv_write_config_block(struct pci_dev *pdev, void *buf, + unsigned int len, unsigned int block_id) +{ + struct hv_pcibus_device *hbus = + container_of(pdev->bus->sysdata, struct hv_pcibus_device, + sysdata); + struct { + struct pci_packet pkt; + char buf[sizeof(struct pci_write_block)]; + u32 reserved; + } pkt; + struct hv_pci_compl comp_pkt; + struct pci_write_block *write_blk; + u32 pkt_size; + int ret; + + if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) + return -EINVAL; + + init_completion(&comp_pkt.host_event); + + memset(&pkt, 0, sizeof(pkt)); + pkt.pkt.completion_func = hv_pci_write_config_compl; + pkt.pkt.compl_ctxt = &comp_pkt; + write_blk = (struct pci_write_block *)&pkt.pkt.message; + write_blk->message_type.type = PCI_WRITE_BLOCK; + write_blk->wslot.slot = devfn_to_wslot(pdev->devfn); + write_blk->block_id = block_id; + write_blk->byte_count = len; + memcpy(write_blk->bytes, buf, len); + pkt_size = offsetof(struct pci_write_block, bytes) + len; + /* + * This quirk is required on some hosts shipped around 2018, because + * these hosts don't check the pkt_size correctly (new hosts have been + * fixed since early 2019). The quirk is also safe on very old hosts + * and new hosts, because, on them, what really matters is the length + * specified in write_blk->byte_count. + */ + pkt_size += sizeof(pkt.reserved); + + ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size, + (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND, + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + if (ret) + return ret; + + ret = wait_for_response(hbus->hdev, &comp_pkt.host_event); + if (ret) + return ret; + + if (comp_pkt.completion_status != 0) { + dev_err(&hbus->hdev->device, + "Write Config Block failed: 0x%x\n", + comp_pkt.completion_status); + return -EIO; + } + + return 0; +} + +/** + * hv_register_block_invalidate() - Invoked when a config block invalidation + * arrives from the back-end driver. + * @pdev: The PCI driver's representation for this device. + * @context: Identifies the device. + * @block_invalidate: Identifies all of the blocks being invalidated. + * + * Return: 0 on success, -errno on failure + */ +static int hv_register_block_invalidate(struct pci_dev *pdev, void *context, + void (*block_invalidate)(void *context, + u64 block_mask)) +{ + struct hv_pcibus_device *hbus = + container_of(pdev->bus->sysdata, struct hv_pcibus_device, + sysdata); + struct hv_pci_dev *hpdev; + + hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); + if (!hpdev) + return -ENODEV; + + hpdev->block_invalidate = block_invalidate; + hpdev->invalidate_context = context; + + put_pcichild(hpdev); + return 0; + +} + /* Interrupt management hooks */ static void hv_int_desc_free(struct hv_pci_dev *hpdev, struct tran_int_desc *int_desc) @@ -831,6 +1109,10 @@ u8 buffer[sizeof(struct pci_delete_interrupt)]; } ctxt; + if (!int_desc->vector_count) { + kfree(int_desc); + return; + } memset(&ctxt, 0, sizeof(ctxt)); int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; int_pkt->message_type.type = @@ -893,6 +1175,28 @@ pci_msi_mask_irq(data); } +static unsigned int hv_msi_get_int_vector(struct irq_data *data) +{ + struct irq_cfg *cfg = irqd_cfg(data); + + return cfg->vector; +} + +static int hv_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *info) +{ + int ret = pci_msi_prepare(domain, dev, nvec, info); + + /* + * By using the interrupt remapper in the hypervisor IOMMU, contiguous + * CPU vectors is not needed for multi-MSI + */ + if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) + info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + + return ret; +} + /** * hv_irq_unmask() - "Unmask" the IRQ by setting its current * affinity. @@ -907,21 +1211,28 @@ { struct msi_desc *msi_desc = irq_data_get_msi_desc(data); struct irq_cfg *cfg = irqd_cfg(data); - struct retarget_msi_interrupt *params; + struct hv_retarget_device_interrupt *params; + struct tran_int_desc *int_desc; struct hv_pcibus_device *hbus; struct cpumask *dest; + cpumask_var_t tmp; struct pci_bus *pbus; struct pci_dev *pdev; unsigned long flags; u32 var_size = 0; - int cpu_vmbus; - int cpu; + int cpu, nr_bank; u64 res; dest = irq_data_get_effective_affinity_mask(data); pdev = msi_desc_to_pci_dev(msi_desc); pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); + int_desc = data->chip_data; + if (!int_desc) { + dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n", + __func__, data->irq); + return; + } spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags); @@ -929,8 +1240,8 @@ memset(params, 0, sizeof(*params)); params->partition_id = HV_PARTITION_ID_SELF; params->int_entry.source = 1; /* MSI(-X) */ - params->int_entry.address = msi_desc->msg.address_lo; - params->int_entry.data = msi_desc->msg.data; + params->int_entry.msi_entry.address = int_desc->address & 0xffffffff; + params->int_entry.msi_entry.data = int_desc->data; params->device_id = (hbus->hdev->dev_instance.b[5] << 24) | (hbus->hdev->dev_instance.b[4] << 16) | (hbus->hdev->dev_instance.b[7] << 8) | @@ -945,7 +1256,7 @@ * negative effect (yet?). */ - if (pci_protocol_version >= PCI_PROTOCOL_VERSION_1_2) { + if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) { /* * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides @@ -955,28 +1266,27 @@ */ params->int_target.flags |= HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; - params->int_target.vp_set.valid_banks = - (1ull << HV_VP_SET_BANK_COUNT_MAX) - 1; + + if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) { + res = 1; + goto exit_unlock; + } + + cpumask_and(tmp, dest, cpu_online_mask); + nr_bank = cpumask_to_vpset(¶ms->int_target.vp_set, tmp); + free_cpumask_var(tmp); + + if (nr_bank <= 0) { + res = 1; + goto exit_unlock; + } /* * var-sized hypercall, var-size starts after vp_mask (thus - * vp_set.format does not count, but vp_set.valid_banks does). + * vp_set.format does not count, but vp_set.valid_bank_mask + * does). */ - var_size = 1 + HV_VP_SET_BANK_COUNT_MAX; - - for_each_cpu_and(cpu, dest, cpu_online_mask) { - cpu_vmbus = hv_cpu_number_to_vp_number(cpu); - - if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) { - dev_err(&hbus->hdev->device, - "too high CPU %d", cpu_vmbus); - res = 1; - goto exit_unlock; - } - - params->int_target.vp_set.masks[cpu_vmbus / 64] |= - (1ULL << (cpu_vmbus & 63)); - } + var_size = 1 + nr_bank; } else { for_each_cpu_and(cpu, dest, cpu_online_mask) { params->int_target.vp_mask |= @@ -990,11 +1300,25 @@ exit_unlock: spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags); - if (res) { + /* + * During hibernation, when a CPU is offlined, the kernel tries + * to move the interrupt to the remaining CPUs that haven't + * been offlined yet. In this case, the below hv_do_hypercall() + * always fails since the vmbus channel has been closed: + * refer to cpu_disable_common() -> fixup_irqs() -> + * irq_migrate_all_off_this_cpu() -> migrate_one_irq(). + * + * Suppress the error message for hibernation because the failure + * during hibernation does not matter (at this time all the devices + * have been frozen). Note: the correct affinity info is still updated + * into the irqdata data structure in migrate_one_irq() -> + * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM + * resumes, hv_pci_restore_msi_state() is able to correctly restore + * the interrupt with the correct affinity. + */ + if (res && hbus->state != hv_pcibus_removing) dev_err(&hbus->hdev->device, "%s() failed: %#llx", __func__, res); - return; - } pci_msi_unmask_irq(data); } @@ -1018,12 +1342,12 @@ static u32 hv_compose_msi_req_v1( struct pci_create_interrupt *int_pkt, struct cpumask *affinity, - u32 slot, u8 vector) + u32 slot, u8 vector, u8 vector_count) { int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; - int_pkt->int_desc.vector_count = 1; + int_pkt->int_desc.vector_count = vector_count; int_pkt->int_desc.delivery_mode = dest_Fixed; /* @@ -1037,14 +1361,14 @@ static u32 hv_compose_msi_req_v2( struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity, - u32 slot, u8 vector) + u32 slot, u8 vector, u8 vector_count) { int cpu; int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2; int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; - int_pkt->int_desc.vector_count = 1; + int_pkt->int_desc.vector_count = vector_count; int_pkt->int_desc.delivery_mode = dest_Fixed; /* @@ -1072,15 +1396,16 @@ */ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_cfg *cfg = irqd_cfg(data); struct hv_pcibus_device *hbus; + struct vmbus_channel *channel; struct hv_pci_dev *hpdev; struct pci_bus *pbus; struct pci_dev *pdev; struct cpumask *dest; - unsigned long flags; struct compose_comp_ctxt comp; struct tran_int_desc *int_desc; + struct msi_desc *msi_desc; + u8 vector, vector_count; struct { struct pci_packet pci_pkt; union { @@ -1092,43 +1417,80 @@ u32 size; int ret; - pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); + /* Reuse the previous allocation */ + if (data->chip_data) { + int_desc = data->chip_data; + msg->address_hi = int_desc->address >> 32; + msg->address_lo = int_desc->address & 0xffffffff; + msg->data = int_desc->data; + return; + } + + msi_desc = irq_data_get_msi_desc(data); + pdev = msi_desc_to_pci_dev(msi_desc); dest = irq_data_get_effective_affinity_mask(data); pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); + channel = hbus->hdev->channel; hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); if (!hpdev) goto return_null_message; - /* Free any previous message that might have already been composed. */ - if (data->chip_data) { - int_desc = data->chip_data; - data->chip_data = NULL; - hv_int_desc_free(hpdev, int_desc); - } - int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC); if (!int_desc) goto drop_reference; + + if (!msi_desc->msi_attrib.is_msix && msi_desc->nvec_used > 1) { + /* + * If this is not the first MSI of Multi MSI, we already have + * a mapping. Can exit early. + */ + if (msi_desc->irq != data->irq) { + data->chip_data = int_desc; + int_desc->address = msi_desc->msg.address_lo | + (u64)msi_desc->msg.address_hi << 32; + int_desc->data = msi_desc->msg.data + + (data->irq - msi_desc->irq); + msg->address_hi = msi_desc->msg.address_hi; + msg->address_lo = msi_desc->msg.address_lo; + msg->data = int_desc->data; + put_pcichild(hpdev); + return; + } + /* + * The vector we select here is a dummy value. The correct + * value gets sent to the hypervisor in unmask(). This needs + * to be aligned with the count, and also not zero. Multi-msi + * is powers of 2 up to 32, so 32 will always work here. + */ + vector = 32; + vector_count = msi_desc->nvec_used; + } else { + vector = hv_msi_get_int_vector(data); + vector_count = 1; + } memset(&ctxt, 0, sizeof(ctxt)); init_completion(&comp.comp_pkt.host_event); ctxt.pci_pkt.completion_func = hv_pci_compose_compl; ctxt.pci_pkt.compl_ctxt = ∁ - switch (pci_protocol_version) { + switch (hbus->protocol_version) { case PCI_PROTOCOL_VERSION_1_1: size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1, dest, hpdev->desc.win_slot.slot, - cfg->vector); + vector, + vector_count); break; case PCI_PROTOCOL_VERSION_1_2: + case PCI_PROTOCOL_VERSION_1_3: size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2, dest, hpdev->desc.win_slot.slot, - cfg->vector); + vector, + vector_count); break; default: @@ -1153,41 +1515,44 @@ } /* + * Prevents hv_pci_onchannelcallback() from running concurrently + * in the tasklet. + */ + tasklet_disable(&channel->callback_event); + + /* * Since this function is called with IRQ locks held, can't * do normal wait for completion; instead poll. */ while (!try_wait_for_completion(&comp.comp_pkt.host_event)) { + unsigned long flags; + /* 0xFFFF means an invalid PCI VENDOR ID. */ if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) { dev_err_once(&hbus->hdev->device, "the device has gone\n"); - goto free_int_desc; + goto enable_tasklet; } /* - * When the higher level interrupt code calls us with - * interrupt disabled, we must poll the channel by calling - * the channel callback directly when channel->target_cpu is - * the current CPU. When the higher level interrupt code - * calls us with interrupt enabled, let's add the - * local_irq_save()/restore() to avoid race: - * hv_pci_onchannelcallback() can also run in tasklet. + * Make sure that the ring buffer data structure doesn't get + * freed while we dereference the ring buffer pointer. Test + * for the channel's onchannel_callback being NULL within a + * sched_lock critical section. See also the inline comments + * in vmbus_reset_channel_cb(). */ - local_irq_save(flags); - - if (hbus->hdev->channel->target_cpu == smp_processor_id()) - hv_pci_onchannelcallback(hbus); - - local_irq_restore(flags); - - if (hpdev->state == hv_pcichild_ejecting) { - dev_err_once(&hbus->hdev->device, - "the device is being ejected\n"); - goto free_int_desc; + spin_lock_irqsave(&channel->sched_lock, flags); + if (unlikely(channel->onchannel_callback == NULL)) { + spin_unlock_irqrestore(&channel->sched_lock, flags); + goto enable_tasklet; } + hv_pci_onchannelcallback(hbus); + spin_unlock_irqrestore(&channel->sched_lock, flags); udelay(100); } + + tasklet_enable(&channel->callback_event); if (comp.comp_pkt.completion_status < 0) { dev_err(&hbus->hdev->device, @@ -1212,6 +1577,8 @@ put_pcichild(hpdev); return; +enable_tasklet: + tasklet_enable(&channel->callback_event); free_int_desc: kfree(int_desc); drop_reference: @@ -1232,16 +1599,8 @@ .irq_unmask = hv_irq_unmask, }; -static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->msi_hwirq; -} - static struct msi_domain_ops hv_msi_ops = { - .get_hwirq = hv_msi_domain_ops_get_hwirq, - .msi_prepare = pci_msi_prepare, - .set_desc = pci_msi_set_desc, + .msi_prepare = hv_msi_prepare, .msi_free = hv_msi_free, }; @@ -1332,7 +1691,7 @@ * so it's sufficient to just add them up without tracking alignment. */ list_for_each_entry(hpdev, &hbus->children, list_entry) { - for (i = 0; i < 6; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO) dev_err(&hbus->hdev->device, "There's an I/O BAR in this list!\n"); @@ -1403,10 +1762,27 @@ spin_lock_irqsave(&hbus->device_list_lock, flags); + /* + * Clear the memory enable bit, in case it's already set. This occurs + * in the suspend path of hibernation, where the device is suspended, + * resumed and suspended again: see hibernation_snapshot() and + * hibernation_platform_enter(). + * + * If the memory enable bit is already set, Hyper-V sliently ignores + * the below BAR updates, and the related PCI device driver can not + * work, because reading from the device register(s) always returns + * 0xFFFFFFFF. + */ + list_for_each_entry(hpdev, &hbus->children, list_entry) { + _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command); + command &= ~PCI_COMMAND_MEMORY; + _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command); + } + /* Pick addresses for the BARs. */ do { list_for_each_entry(hpdev, &hbus->children, list_entry) { - for (i = 0; i < 6; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { bar_val = hpdev->probed_bar[i]; if (bar_val == 0) continue; @@ -1506,6 +1882,36 @@ } } +/* + * Set NUMA node for the devices on the bus + */ +static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus) +{ + struct pci_dev *dev; + struct pci_bus *bus = hbus->pci_bus; + struct hv_pci_dev *hv_dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn)); + if (!hv_dev) + continue; + + if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY && + hv_dev->desc.virtual_numa_node < num_possible_nodes()) + /* + * The kernel may boot with some NUMA nodes offline + * (e.g. in a KDUMP kernel) or with NUMA disabled via + * "numa=off". In those cases, adjust the host provided + * NUMA node to a valid NUMA node used by the kernel. + */ + set_dev_node(&dev->dev, + numa_map_to_online_node( + hv_dev->desc.virtual_numa_node)); + + put_pcichild(hv_dev); + } +} + /** * create_root_hv_pci_bus() - Expose a new root PCI bus * @hbus: Root PCI bus, as understood by this driver @@ -1528,6 +1934,7 @@ pci_lock_rescan_remove(); pci_scan_child_bus(hbus->pci_bus); + hv_pci_assign_numa_node(hbus); pci_bus_assign_resources(hbus->pci_bus); hv_pci_assign_slots(hbus); pci_bus_add_devices(hbus->pci_bus); @@ -1563,7 +1970,7 @@ "query resource requirements failed: %x\n", resp->status); } else { - for (i = 0; i < 6; i++) { + for (i = 0; i < PCI_STD_NUM_BARS; i++) { completion->hpdev->probed_bar[i] = q_res_req->probed_bar[i]; } @@ -1584,7 +1991,7 @@ * Return: Pointer to the new tracking struct */ static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus, - struct pci_function_description *desc) + struct hv_pcidev_description *desc) { struct hv_pci_dev *hpdev; struct pci_child_message *res_req; @@ -1695,7 +2102,7 @@ { u32 child_no; bool found; - struct pci_function_description *new_desc; + struct hv_pcidev_description *new_desc; struct hv_pci_dev *hpdev; struct hv_pcibus_device *hbus; struct list_head removed; @@ -1796,6 +2203,7 @@ */ pci_lock_rescan_remove(); pci_scan_child_bus(hbus->pci_bus); + hv_pci_assign_numa_node(hbus); hv_pci_assign_slots(hbus); pci_unlock_rescan_remove(); break; @@ -1814,41 +2222,31 @@ } /** - * hv_pci_devices_present() - Handles list of new children + * hv_pci_start_relations_work() - Queue work to start device discovery * @hbus: Root PCI bus, as understood by this driver - * @relations: Packet from host listing children + * @dr: The list of children returned from host * - * This function is invoked whenever a new list of devices for - * this bus appears. + * Return: 0 on success, -errno on failure */ -static void hv_pci_devices_present(struct hv_pcibus_device *hbus, - struct pci_bus_relations *relations) +static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus, + struct hv_dr_state *dr) { - struct hv_dr_state *dr; struct hv_dr_work *dr_wrk; unsigned long flags; bool pending_dr; + if (hbus->state == hv_pcibus_removing) { + dev_info(&hbus->hdev->device, + "PCI VMBus BUS_RELATIONS: ignored\n"); + return -ENOENT; + } + dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT); if (!dr_wrk) - return; - - dr = kzalloc(offsetof(struct hv_dr_state, func) + - (sizeof(struct pci_function_description) * - (relations->device_count)), GFP_NOWAIT); - if (!dr) { - kfree(dr_wrk); - return; - } + return -ENOMEM; INIT_WORK(&dr_wrk->wrk, pci_devices_present_work); dr_wrk->bus = hbus; - dr->device_count = relations->device_count; - if (dr->device_count != 0) { - memcpy(dr->func, relations->func, - sizeof(struct pci_function_description) * - dr->device_count); - } spin_lock_irqsave(&hbus->device_list_lock, flags); /* @@ -1866,6 +2264,83 @@ get_hvpcibus(hbus); queue_work(hbus->wq, &dr_wrk->wrk); } + + return 0; +} + +/** + * hv_pci_devices_present() - Handle list of new children + * @hbus: Root PCI bus, as understood by this driver + * @relations: Packet from host listing children + * + * Process a new list of devices on the bus. The list of devices is + * discovered by VSP and sent to us via VSP message PCI_BUS_RELATIONS, + * whenever a new list of devices for this bus appears. + */ +static void hv_pci_devices_present(struct hv_pcibus_device *hbus, + struct pci_bus_relations *relations) +{ + struct hv_dr_state *dr; + int i; + + dr = kzalloc(struct_size(dr, func, relations->device_count), + GFP_NOWAIT); + if (!dr) + return; + + dr->device_count = relations->device_count; + for (i = 0; i < dr->device_count; i++) { + dr->func[i].v_id = relations->func[i].v_id; + dr->func[i].d_id = relations->func[i].d_id; + dr->func[i].rev = relations->func[i].rev; + dr->func[i].prog_intf = relations->func[i].prog_intf; + dr->func[i].subclass = relations->func[i].subclass; + dr->func[i].base_class = relations->func[i].base_class; + dr->func[i].subsystem_id = relations->func[i].subsystem_id; + dr->func[i].win_slot = relations->func[i].win_slot; + dr->func[i].ser = relations->func[i].ser; + } + + if (hv_pci_start_relations_work(hbus, dr)) + kfree(dr); +} + +/** + * hv_pci_devices_present2() - Handle list of new children + * @hbus: Root PCI bus, as understood by this driver + * @relations: Packet from host listing children + * + * This function is the v2 version of hv_pci_devices_present() + */ +static void hv_pci_devices_present2(struct hv_pcibus_device *hbus, + struct pci_bus_relations2 *relations) +{ + struct hv_dr_state *dr; + int i; + + dr = kzalloc(struct_size(dr, func, relations->device_count), + GFP_NOWAIT); + if (!dr) + return; + + dr->device_count = relations->device_count; + for (i = 0; i < dr->device_count; i++) { + dr->func[i].v_id = relations->func[i].v_id; + dr->func[i].d_id = relations->func[i].d_id; + dr->func[i].rev = relations->func[i].rev; + dr->func[i].prog_intf = relations->func[i].prog_intf; + dr->func[i].subclass = relations->func[i].subclass; + dr->func[i].base_class = relations->func[i].base_class; + dr->func[i].subsystem_id = relations->func[i].subsystem_id; + dr->func[i].win_slot = relations->func[i].win_slot; + dr->func[i].ser = relations->func[i].ser; + dr->func[i].flags = relations->func[i].flags; + dr->func[i].virtual_numa_node = + relations->func[i].virtual_numa_node; + } + + if (hv_pci_start_relations_work(hbus, dr)) + kfree(dr); } /** @@ -1892,8 +2367,6 @@ hpdev = container_of(work, struct hv_pci_dev, wrk); hbus = hpdev->hbus; - - WARN_ON(hpdev->state != hv_pcichild_ejecting); /* * Ejection can come before or after the PCI bus has been set up, so @@ -1945,11 +2418,18 @@ */ static void hv_pci_eject_device(struct hv_pci_dev *hpdev) { - hpdev->state = hv_pcichild_ejecting; + struct hv_pcibus_device *hbus = hpdev->hbus; + struct hv_device *hdev = hbus->hdev; + + if (hbus->state == hv_pcibus_removing) { + dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n"); + return; + } + get_pcichild(hpdev); INIT_WORK(&hpdev->wrk, hv_eject_device_work); - get_hvpcibus(hpdev->hbus); - queue_work(hpdev->hbus->wq, &hpdev->wrk); + get_hvpcibus(hbus); + queue_work(hbus->wq, &hpdev->wrk); } /** @@ -1973,6 +2453,8 @@ struct pci_response *response; struct pci_incoming_message *new_message; struct pci_bus_relations *bus_rel; + struct pci_bus_relations2 *bus_rel2; + struct pci_dev_inval_block *inval; struct pci_dev_incoming *dev_message; struct hv_pci_dev *hpdev; @@ -2028,15 +2510,28 @@ bus_rel = (struct pci_bus_relations *)buffer; if (bytes_recvd < - offsetof(struct pci_bus_relations, func) + - (sizeof(struct pci_function_description) * - (bus_rel->device_count))) { + struct_size(bus_rel, func, + bus_rel->device_count)) { dev_err(&hbus->hdev->device, "bus relations too small\n"); break; } hv_pci_devices_present(hbus, bus_rel); + break; + + case PCI_BUS_RELATIONS2: + + bus_rel2 = (struct pci_bus_relations2 *)buffer; + if (bytes_recvd < + struct_size(bus_rel2, func, + bus_rel2->device_count)) { + dev_err(&hbus->hdev->device, + "bus relations v2 too small\n"); + break; + } + + hv_pci_devices_present2(hbus, bus_rel2); break; case PCI_EJECT: @@ -2046,6 +2541,21 @@ dev_message->wslot.slot); if (hpdev) { hv_pci_eject_device(hpdev); + put_pcichild(hpdev); + } + break; + + case PCI_INVALIDATE_BLOCK: + + inval = (struct pci_dev_inval_block *)buffer; + hpdev = get_pcichild_wslot(hbus, + inval->wslot.slot); + if (hpdev) { + if (hpdev->block_invalidate) { + hpdev->block_invalidate( + hpdev->invalidate_context, + inval->block_mask); + } put_pcichild(hpdev); } break; @@ -2071,7 +2581,10 @@ /** * hv_pci_protocol_negotiation() - Set up protocol - * @hdev: VMBus's tracking struct for this root PCI bus + * @hdev: VMBus's tracking struct for this root PCI bus. + * @version: Array of supported channel protocol versions in + * the order of probing - highest go first. + * @num_version: Number of elements in the version array. * * This driver is intended to support running on Windows 10 * (server) and later versions. It will not run on earlier @@ -2085,8 +2598,11 @@ * failing if the host doesn't support the necessary protocol * level. */ -static int hv_pci_protocol_negotiation(struct hv_device *hdev) +static int hv_pci_protocol_negotiation(struct hv_device *hdev, + enum pci_protocol_version_t version[], + int num_version) { + struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); struct pci_version_request *version_req; struct hv_pci_compl comp_pkt; struct pci_packet *pkt; @@ -2109,8 +2625,8 @@ version_req = (struct pci_version_request *)&pkt->message; version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; - for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) { - version_req->protocol_version = pci_protocol_versions[i]; + for (i = 0; i < num_version; i++) { + version_req->protocol_version = version[i]; ret = vmbus_sendpacket(hdev->channel, version_req, sizeof(struct pci_version_request), (unsigned long)pkt, VM_PKT_DATA_INBAND, @@ -2126,10 +2642,10 @@ } if (comp_pkt.completion_status >= 0) { - pci_protocol_version = pci_protocol_versions[i]; + hbus->protocol_version = version[i]; dev_info(&hdev->device, "PCI VMBus probing: Using version %#x\n", - pci_protocol_version); + hbus->protocol_version); goto exit; } @@ -2299,6 +2815,8 @@ vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH); } +static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs); + /** * hv_pci_enter_d0() - Bring the "bus" into the D0 power state * @hdev: VMBus's tracking struct for this root PCI bus @@ -2311,8 +2829,10 @@ struct pci_bus_d0_entry *d0_entry; struct hv_pci_compl comp_pkt; struct pci_packet *pkt; + bool retry = true; int ret; +enter_d0_retry: /* * Tell the host that the bus is ready to use, and moved into the * powered-on state. This includes telling the host which region @@ -2338,6 +2858,38 @@ if (ret) goto exit; + + /* + * In certain case (Kdump) the pci device of interest was + * not cleanly shut down and resource is still held on host + * side, the host could return invalid device status. + * We need to explicitly request host to release the resource + * and try to enter D0 again. + */ + if (comp_pkt.completion_status < 0 && retry) { + retry = false; + + dev_err(&hdev->device, "Retrying D0 Entry\n"); + + /* + * Hv_pci_bus_exit() calls hv_send_resource_released() + * to free up resources of its child devices. + * In the kdump kernel we need to set the + * wslot_res_allocated to 255 so it scans all child + * devices to release resources allocated in the + * normal kernel before panic happened. + */ + hbus->wslot_res_allocated = 255; + + ret = hv_pci_bus_exit(hdev, true); + + if (ret == 0) { + kfree(pkt); + goto enter_d0_retry; + } + dev_err(&hdev->device, + "Retrying D0 failed with ret %d\n", ret); + } if (comp_pkt.completion_status < 0) { dev_err(&hdev->device, @@ -2381,6 +2933,24 @@ if (!ret) ret = wait_for_response(hdev, &comp); + /* + * In the case of fast device addition/removal, it's possible that + * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we + * already got a PCI_BUS_RELATIONS* message from the host and the + * channel callback already scheduled a work to hbus->wq, which can be + * running pci_devices_present_work() -> survey_child_resources() -> + * complete(&hbus->survey_event), even after hv_pci_query_relations() + * exits and the stack variable 'comp' is no longer valid; as a result, + * a hang or a page fault may happen when the complete() calls + * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from + * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is + * -ENODEV, there can't be any more work item scheduled to hbus->wq + * after the flush_workqueue(): see vmbus_onoffer_rescind() -> + * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() -> + * channel->rescind = true. + */ + flush_workqueue(hbus->wq); + return ret; } @@ -2410,10 +2980,10 @@ struct hv_pci_dev *hpdev; struct pci_packet *pkt; size_t size_res; - u32 wslot; + int wslot; int ret; - size_res = (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2) + size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) ? sizeof(*res_assigned) : sizeof(*res_assigned2); pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL); @@ -2432,7 +3002,7 @@ pkt->completion_func = hv_pci_generic_compl; pkt->compl_ctxt = &comp_pkt; - if (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2) { + if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) { res_assigned = (struct pci_resources_assigned *)&pkt->message; res_assigned->message_type.type = @@ -2463,6 +3033,8 @@ comp_pkt.completion_status); break; } + + hbus->wslot_res_allocated = wslot; } kfree(pkt); @@ -2481,10 +3053,10 @@ struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); struct pci_child_message pkt; struct hv_pci_dev *hpdev; - u32 wslot; + int wslot; int ret; - for (wslot = 0; wslot < 256; wslot++) { + for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) { hpdev = get_pcichild_wslot(hbus, wslot); if (!hpdev) continue; @@ -2499,7 +3071,11 @@ VM_PKT_DATA_INBAND, 0); if (ret) return ret; + + hbus->wslot_res_allocated = wslot - 1; } + + hbus->wslot_res_allocated = -1; return 0; } @@ -2515,6 +3091,48 @@ complete(&hbus->remove_event); } +#define HVPCI_DOM_MAP_SIZE (64 * 1024) +static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); + +/* + * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 + * as invalid for passthrough PCI devices of this driver. + */ +#define HVPCI_DOM_INVALID 0 + +/** + * hv_get_dom_num() - Get a valid PCI domain number + * Check if the PCI domain number is in use, and return another number if + * it is in use. + * + * @dom: Requested domain number + * + * return: domain number on success, HVPCI_DOM_INVALID on failure + */ +static u16 hv_get_dom_num(u16 dom) +{ + unsigned int i; + + if (test_and_set_bit(dom, hvpci_dom_map) == 0) + return dom; + + for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { + if (test_and_set_bit(i, hvpci_dom_map) == 0) + return i; + } + + return HVPCI_DOM_INVALID; +} + +/** + * hv_put_dom_num() - Mark the PCI domain number as free + * @dom: Domain number to be freed + */ +static void hv_put_dom_num(u16 dom) +{ + clear_bit(dom, hvpci_dom_map); +} + /** * hv_pci_probe() - New VMBus channel probe, for a root PCI bus * @hdev: VMBus's tracking struct for this root PCI bus @@ -2526,33 +3144,69 @@ const struct hv_vmbus_device_id *dev_id) { struct hv_pcibus_device *hbus; + u16 dom_req, dom; + char *name; int ret; /* * hv_pcibus_device contains the hypercall arguments for retargeting in * hv_irq_unmask(). Those must not cross a page boundary. */ - BUILD_BUG_ON(sizeof(*hbus) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE); - hbus = (struct hv_pcibus_device *)get_zeroed_page(GFP_KERNEL); + /* + * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural + * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate + * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and + * alignment of hbus is important because hbus's field + * retarget_msi_interrupt_params must not cross a 4KB page boundary. + * + * Here we prefer kzalloc to get_zeroed_page(), because a buffer + * allocated by the latter is not tracked and scanned by kmemleak, and + * hence kmemleak reports the pointer contained in the hbus buffer + * (i.e. the hpdev struct, which is created in new_pcichild_device() and + * is tracked by hbus->children) as memory leak (false positive). + * + * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be + * used to allocate the hbus buffer and we can avoid the kmemleak false + * positive by using kmemleak_alloc() and kmemleak_free() to ask + * kmemleak to track and scan the hbus buffer. + */ + hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); if (!hbus) return -ENOMEM; hbus->state = hv_pcibus_init; + hbus->wslot_res_allocated = -1; /* - * The PCI bus "domain" is what is called "segment" in ACPI and - * other specs. Pull it from the instance ID, to get something - * unique. Bytes 8 and 9 are what is used in Windows guests, so - * do the same thing for consistency. Note that, since this code - * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee - * that (1) the only domain in use for something that looks like - * a physical PCI bus (which is actually emulated by the - * hypervisor) is domain 0 and (2) there will be no overlap - * between domains derived from these instance IDs in the same - * VM. + * The PCI bus "domain" is what is called "segment" in ACPI and other + * specs. Pull it from the instance ID, to get something usually + * unique. In rare cases of collision, we will find out another number + * not in use. + * + * Note that, since this code only runs in a Hyper-V VM, Hyper-V + * together with this guest driver can guarantee that (1) The only + * domain used by Gen1 VMs for something that looks like a physical + * PCI bus (which is actually emulated by the hypervisor) is domain 0. + * (2) There will be no overlap between domains (after fixing possible + * collisions) in the same VM. */ - hbus->sysdata.domain = hdev->dev_instance.b[9] | - hdev->dev_instance.b[8] << 8; + dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; + dom = hv_get_dom_num(dom_req); + + if (dom == HVPCI_DOM_INVALID) { + dev_err(&hdev->device, + "Unable to use dom# 0x%hx or other numbers", dom_req); + ret = -EINVAL; + goto free_bus; + } + + if (dom != dom_req) + dev_info(&hdev->device, + "PCI dom# 0x%hx has collision, using 0x%hx", + dom_req, dom); + + hbus->sysdata.domain = dom; hbus->hdev = hdev; refcount_set(&hbus->remove_lock, 1); @@ -2567,7 +3221,7 @@ hbus->sysdata.domain); if (!hbus->wq) { ret = -ENOMEM; - goto free_bus; + goto free_dom; } ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, @@ -2577,7 +3231,8 @@ hv_set_drvdata(hdev, hbus); - ret = hv_pci_protocol_negotiation(hdev); + ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions, + ARRAY_SIZE(pci_protocol_versions)); if (ret) goto close; @@ -2594,7 +3249,14 @@ goto free_config; } - hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus); + name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance); + if (!name) { + ret = -ENOMEM; + goto unmap; + } + + hbus->sysdata.fwnode = irq_domain_alloc_named_fwnode(name); + kfree(name); if (!hbus->sysdata.fwnode) { ret = -ENOMEM; goto unmap; @@ -2614,7 +3276,7 @@ ret = hv_pci_allocate_bridge_windows(hbus); if (ret) - goto free_irq_domain; + goto exit_d0; ret = hv_send_resources_allocated(hdev); if (ret) @@ -2632,6 +3294,8 @@ free_windows: hv_pci_free_bridge_windows(hbus); +exit_d0: + (void) hv_pci_bus_exit(hdev, true); free_irq_domain: irq_domain_remove(hbus->irq_domain); free_fwnode: @@ -2644,20 +3308,23 @@ vmbus_close(hdev->channel); destroy_wq: destroy_workqueue(hbus->wq); +free_dom: + hv_put_dom_num(hbus->sysdata.domain); free_bus: - free_page((unsigned long)hbus); + kfree(hbus); return ret; } -static void hv_pci_bus_exit(struct hv_device *hdev) +static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) { struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); struct { struct pci_packet teardown_packet; u8 buffer[sizeof(struct pci_message)]; } pkt; - struct pci_bus_relations relations; struct hv_pci_compl comp_pkt; + struct hv_pci_dev *hpdev, *tmp; + unsigned long flags; int ret; /* @@ -2665,16 +3332,35 @@ * access the per-channel ringbuffer any longer. */ if (hdev->channel->rescind) - return; + return 0; - /* Delete any children which might still exist. */ - memset(&relations, 0, sizeof(relations)); - hv_pci_devices_present(hbus, &relations); + if (!keep_devs) { + struct list_head removed; + + /* Move all present children to the list on stack */ + INIT_LIST_HEAD(&removed); + spin_lock_irqsave(&hbus->device_list_lock, flags); + list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry) + list_move_tail(&hpdev->list_entry, &removed); + spin_unlock_irqrestore(&hbus->device_list_lock, flags); + + /* Remove all children in the list */ + list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) { + list_del(&hpdev->list_entry); + if (hpdev->pci_slot) + pci_destroy_slot(hpdev->pci_slot); + /* For the two refs got in new_pcichild_device() */ + put_pcichild(hpdev); + put_pcichild(hpdev); + } + } ret = hv_send_resources_released(hdev); - if (ret) + if (ret) { dev_err(&hdev->device, "Couldn't send resources released packet(s)\n"); + return ret; + } memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet)); init_completion(&comp_pkt.host_event); @@ -2687,8 +3373,13 @@ (unsigned long)&pkt.teardown_packet, VM_PKT_DATA_INBAND, VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if (!ret) - wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ); + if (ret) + return ret; + + if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) + return -ETIMEDOUT; + + return 0; } /** @@ -2700,19 +3391,30 @@ static int hv_pci_remove(struct hv_device *hdev) { struct hv_pcibus_device *hbus; + int ret; hbus = hv_get_drvdata(hdev); if (hbus->state == hv_pcibus_installed) { + tasklet_disable(&hdev->channel->callback_event); + hbus->state = hv_pcibus_removing; + tasklet_enable(&hdev->channel->callback_event); + destroy_workqueue(hbus->wq); + hbus->wq = NULL; + /* + * At this point, no work is running or can be scheduled + * on hbus-wq. We can't race with hv_pci_devices_present() + * or hv_pci_eject_device(), it's safe to proceed. + */ + /* Remove the bus from PCI's point of view. */ pci_lock_rescan_remove(); pci_stop_root_bus(hbus->pci_bus); hv_pci_remove_slots(hbus); pci_remove_root_bus(hbus->pci_bus); pci_unlock_rescan_remove(); - hbus->state = hv_pcibus_removed; } - hv_pci_bus_exit(hdev); + ret = hv_pci_bus_exit(hdev, false); vmbus_close(hdev->channel); @@ -2724,9 +3426,128 @@ irq_domain_free_fwnode(hbus->sysdata.fwnode); put_hvpcibus(hbus); wait_for_completion(&hbus->remove_event); - destroy_workqueue(hbus->wq); - free_page((unsigned long)hbus); + + hv_put_dom_num(hbus->sysdata.domain); + + kfree(hbus); + return ret; +} + +static int hv_pci_suspend(struct hv_device *hdev) +{ + struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); + enum hv_pcibus_state old_state; + int ret; + + /* + * hv_pci_suspend() must make sure there are no pending work items + * before calling vmbus_close(), since it runs in a process context + * as a callback in dpm_suspend(). When it starts to run, the channel + * callback hv_pci_onchannelcallback(), which runs in a tasklet + * context, can be still running concurrently and scheduling new work + * items onto hbus->wq in hv_pci_devices_present() and + * hv_pci_eject_device(), and the work item handlers can access the + * vmbus channel, which can be being closed by hv_pci_suspend(), e.g. + * the work item handler pci_devices_present_work() -> + * new_pcichild_device() writes to the vmbus channel. + * + * To eliminate the race, hv_pci_suspend() disables the channel + * callback tasklet, sets hbus->state to hv_pcibus_removing, and + * re-enables the tasklet. This way, when hv_pci_suspend() proceeds, + * it knows that no new work item can be scheduled, and then it flushes + * hbus->wq and safely closes the vmbus channel. + */ + tasklet_disable(&hdev->channel->callback_event); + + /* Change the hbus state to prevent new work items. */ + old_state = hbus->state; + if (hbus->state == hv_pcibus_installed) + hbus->state = hv_pcibus_removing; + + tasklet_enable(&hdev->channel->callback_event); + + if (old_state != hv_pcibus_installed) + return -EINVAL; + + flush_workqueue(hbus->wq); + + ret = hv_pci_bus_exit(hdev, true); + if (ret) + return ret; + + vmbus_close(hdev->channel); + return 0; +} + +static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg) +{ + struct msi_desc *entry; + struct irq_data *irq_data; + + for_each_pci_msi_entry(entry, pdev) { + irq_data = irq_get_irq_data(entry->irq); + if (WARN_ON_ONCE(!irq_data)) + return -EINVAL; + + hv_compose_msi_msg(irq_data, &entry->msg); + } + + return 0; +} + +/* + * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg() + * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V + * doesn't trap and emulate the MMIO accesses, here hv_compose_msi_msg() + * must be used to ask Hyper-V to re-create the IOMMU Interrupt Remapping + * Table entries. + */ +static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus) +{ + pci_walk_bus(hbus->pci_bus, hv_pci_restore_msi_msg, NULL); +} + +static int hv_pci_resume(struct hv_device *hdev) +{ + struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); + enum pci_protocol_version_t version[1]; + int ret; + + hbus->state = hv_pcibus_init; + + ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, + hv_pci_onchannelcallback, hbus); + if (ret) + return ret; + + /* Only use the version that was in use before hibernation. */ + version[0] = hbus->protocol_version; + ret = hv_pci_protocol_negotiation(hdev, version, 1); + if (ret) + goto out; + + ret = hv_pci_query_relations(hdev); + if (ret) + goto out; + + ret = hv_pci_enter_d0(hdev); + if (ret) + goto out; + + ret = hv_send_resources_allocated(hdev); + if (ret) + goto out; + + prepopulate_bars(hbus); + + hv_pci_restore_msi_state(hbus); + + hbus->state = hv_pcibus_installed; + return 0; +out: + vmbus_close(hdev->channel); + return ret; } static const struct hv_vmbus_device_id hv_pci_id_table[] = { @@ -2743,15 +3564,32 @@ .id_table = hv_pci_id_table, .probe = hv_pci_probe, .remove = hv_pci_remove, + .suspend = hv_pci_suspend, + .resume = hv_pci_resume, }; static void __exit exit_hv_pci_drv(void) { vmbus_driver_unregister(&hv_pci_drv); + + hvpci_block_ops.read_block = NULL; + hvpci_block_ops.write_block = NULL; + hvpci_block_ops.reg_blk_invalidate = NULL; } static int __init init_hv_pci_drv(void) { + if (!hv_is_hyperv_initialized()) + return -ENODEV; + + /* Set the invalid domain number's bit, so it will not be used */ + set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); + + /* Initialize PCI block r/w interface */ + hvpci_block_ops.read_block = hv_read_config_block; + hvpci_block_ops.write_block = hv_write_config_block; + hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate; + return vmbus_driver_register(&hv_pci_drv); } -- Gitblit v1.6.2