From 072de836f53be56a70cecf70b43ae43b7ce17376 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 11 Dec 2023 10:08:36 +0000
Subject: [PATCH] mk-rootfs.sh
---
kernel/drivers/hv/channel_mgmt.c | 700 ++++++++++++++++++++++++++++++++--------------------------
1 file changed, 384 insertions(+), 316 deletions(-)
diff --git a/kernel/drivers/hv/channel_mgmt.c b/kernel/drivers/hv/channel_mgmt.c
index a3f6933..5b902ad 100644
--- a/kernel/drivers/hv/channel_mgmt.c
+++ b/kernel/drivers/hv/channel_mgmt.c
@@ -1,18 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2009, Microsoft Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
*
* Authors:
* Haiyang Zhang <haiyangz@microsoft.com>
@@ -30,14 +18,15 @@
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/delay.h>
+#include <linux/cpu.h>
#include <linux/hyperv.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type);
+static void init_vp_index(struct vmbus_channel *channel);
-static const struct vmbus_device vmbus_devs[] = {
+const struct vmbus_device vmbus_devs[] = {
/* IDE */
{ .dev_type = HV_IDE,
HV_IDE_GUID,
@@ -141,7 +130,7 @@
};
static const struct {
- uuid_le guid;
+ guid_t guid;
} vmbus_unsupported_devs[] = {
{ HV_AVMA1_GUID },
{ HV_AVMA2_GUID },
@@ -171,26 +160,26 @@
spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}
-static bool is_unsupported_vmbus_devs(const uuid_le *guid)
+static bool is_unsupported_vmbus_devs(const guid_t *guid)
{
int i;
for (i = 0; i < ARRAY_SIZE(vmbus_unsupported_devs); i++)
- if (!uuid_le_cmp(*guid, vmbus_unsupported_devs[i].guid))
+ if (guid_equal(guid, &vmbus_unsupported_devs[i].guid))
return true;
return false;
}
static u16 hv_get_dev_type(const struct vmbus_channel *channel)
{
- const uuid_le *guid = &channel->offermsg.offer.if_type;
+ const guid_t *guid = &channel->offermsg.offer.if_type;
u16 i;
if (is_hvsock_channel(channel) || is_unsupported_vmbus_devs(guid))
return HV_UNKNOWN;
for (i = HV_IDE; i < HV_UNKNOWN; i++) {
- if (!uuid_le_cmp(*guid, vmbus_devs[i].guid))
+ if (guid_equal(guid, &vmbus_devs[i].guid))
return i;
}
pr_info("Unknown GUID: %pUl\n", guid);
@@ -198,24 +187,19 @@
}
/**
- * vmbus_prep_negotiate_resp() - Create default response for Hyper-V Negotiate message
+ * vmbus_prep_negotiate_resp() - Create default response for Negotiate message
* @icmsghdrp: Pointer to msg header structure
- * @icmsg_negotiate: Pointer to negotiate message structure
* @buf: Raw buffer channel data
+ * @fw_version: The framework versions we can support.
+ * @fw_vercnt: The size of @fw_version.
+ * @srv_version: The service versions we can support.
+ * @srv_vercnt: The size of @srv_version.
+ * @nego_fw_version: The selected framework version.
+ * @nego_srv_version: The selected service version.
*
- * @icmsghdrp is of type &struct icmsg_hdr.
+ * Note: Versions are given in decreasing order.
+ *
* Set up and fill in default negotiate response message.
- *
- * The fw_version and fw_vercnt specifies the framework version that
- * we can support.
- *
- * The srv_version and srv_vercnt specifies the service
- * versions we can support.
- *
- * Versions are given in decreasing order.
- *
- * nego_fw_version and nego_srv_version store the selected protocol versions.
- *
* Mainly used by Hyper-V drivers.
*/
bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
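For reference, callers pass the framework and service versions they support in decreasing order and receive the negotiated pair through the two out-parameters. A minimal sketch of a util-style negotiate handler under these assumptions (my_util_negotiate() and the SRV_VERSION_* macros are illustrative, not part of this patch; UTIL_FW_VERSION is the framework macro the in-tree util drivers use):

#include <linux/hyperv.h>

/* Hypothetical driver-local version tables, highest (preferred) first. */
static const int fw_versions[] = { UTIL_FW_VERSION };
static const int srv_versions[] = { SRV_VERSION_2, SRV_VERSION_1 }; /* placeholders */
static int nego_fw_version, nego_srv_version;

static void my_util_negotiate(u8 *recv_buffer)
{
	struct icmsg_hdr *icmsghdrp = (struct icmsg_hdr *)
		&recv_buffer[sizeof(struct vmbuspipe_hdr)];

	if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
		if (vmbus_prep_negotiate_resp(icmsghdrp, recv_buffer,
					      fw_versions, ARRAY_SIZE(fw_versions),
					      srv_versions, ARRAY_SIZE(srv_versions),
					      &nego_fw_version, &nego_srv_version))
			pr_info("negotiated fw %d, srv %d\n",
				nego_fw_version, nego_srv_version);
	}
}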
@@ -332,14 +316,15 @@
if (!channel)
return NULL;
- spin_lock_init(&channel->lock);
+ spin_lock_init(&channel->sched_lock);
init_completion(&channel->rescind_event);
INIT_LIST_HEAD(&channel->sc_list);
- INIT_LIST_HEAD(&channel->percpu_list);
tasklet_init(&channel->callback_event,
vmbus_on_event, (unsigned long)channel);
+
+ hv_ringbuffer_pre_init(channel);
return channel;
}
@@ -355,22 +340,48 @@
kobject_put(&channel->kobj);
}
-static void percpu_channel_enq(void *arg)
+void vmbus_channel_map_relid(struct vmbus_channel *channel)
{
- struct vmbus_channel *channel = arg;
- struct hv_per_cpu_context *hv_cpu
- = this_cpu_ptr(hv_context.cpu_context);
-
- list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list);
+ if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
+ return;
+ /*
+ * The mapping of the channel's relid is visible from the CPUs that
+ * execute vmbus_chan_sched() by the time vmbus_chan_sched()
+ * executes:
+ *
+ * (a) In the "normal (i.e., not resuming from hibernation)" path,
+ * the full barrier in virt_store_mb() guarantees that the store
+ * is propagated to all CPUs before the add_channel_work work
+ * is queued. In turn, add_channel_work is queued before the
+ * channel's ring buffer is allocated/initialized and the
+ * OPENCHANNEL message for the channel is sent in vmbus_open().
+ * Hyper-V won't start sending the interrupts for the channel
+ * before the OPENCHANNEL message is acked. The memory barrier
+ * in vmbus_chan_sched() -> sync_test_and_clear_bit() ensures
+ * that vmbus_chan_sched() must find the channel's relid in
+ * recv_int_page before retrieving the channel pointer from the
+ * array of channels.
+ *
+ * (b) In the "resuming from hibernation" path, the virt_store_mb()
+ * guarantees that the store is propagated to all CPUs before
+ * the VMBus connection is marked as ready for the resume event
+ * (cf. check_ready_for_resume_event()). The interrupt handler
+ * of the VMBus driver and vmbus_chan_sched() can not run before
+ * vmbus_bus_resume() has completed execution (cf. resume_noirq).
+ */
+ virt_store_mb(
+ vmbus_connection.channels[channel->offermsg.child_relid],
+ channel);
}
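The comment's ordering argument only holds together with the reader side; a sketch of the consumer this store pairs with, modeled on vmbus_chan_sched() (loop bound and recv_int_page handling simplified, names as in this series):

/* Per-CPU interrupt path, simplified from vmbus_chan_sched(). */
static void chan_sched_sketch(unsigned long *recv_int_page)
{
	struct vmbus_channel *channel;
	u32 relid;

	for_each_set_bit(relid, recv_int_page, MAX_CHANNEL_RELIDS) {
		/* Full barrier: orders the bit-clear before the channel load. */
		if (!sync_test_and_clear_bit(relid, recv_int_page))
			continue;
		channel = READ_ONCE(vmbus_connection.channels[relid]);
		if (channel == NULL)
			continue;	/* relid not (or no longer) mapped */
		/* ... dispatch to the channel's callback ... */
	}
}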
-static void percpu_channel_deq(void *arg)
+void vmbus_channel_unmap_relid(struct vmbus_channel *channel)
{
- struct vmbus_channel *channel = arg;
-
- list_del_rcu(&channel->percpu_list);
+ if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
+ return;
+ WRITE_ONCE(
+ vmbus_connection.channels[channel->offermsg.child_relid],
+ NULL);
}
-
static void vmbus_release_relid(u32 relid)
{
@@ -386,51 +397,49 @@
trace_vmbus_release_relid(&msg, ret);
}
-void hv_process_channel_removal(u32 relid)
+void hv_process_channel_removal(struct vmbus_channel *channel)
{
- unsigned long flags;
- struct vmbus_channel *primary_channel, *channel;
-
- BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex));
-
- /*
- * Make sure channel is valid as we may have raced.
- */
- channel = relid2channel(relid);
- if (!channel)
- return;
-
+ lockdep_assert_held(&vmbus_connection.channel_mutex);
BUG_ON(!channel->rescind);
- if (channel->target_cpu != get_cpu()) {
- put_cpu();
- smp_call_function_single(channel->target_cpu,
- percpu_channel_deq, channel, true);
- } else {
- percpu_channel_deq(channel);
- put_cpu();
- }
-
- if (channel->primary_channel == NULL) {
- list_del(&channel->listentry);
-
- primary_channel = channel;
- } else {
- primary_channel = channel->primary_channel;
- spin_lock_irqsave(&primary_channel->lock, flags);
- list_del(&channel->sc_list);
- primary_channel->num_sc--;
- spin_unlock_irqrestore(&primary_channel->lock, flags);
- }
/*
- * We need to free the bit for init_vp_index() to work in the case
- * of sub-channel, when we reload drivers like hv_netvsc.
+ * hv_process_channel_removal() could find INVALID_RELID only for
+ * hv_sock channels. See the inline comments in vmbus_onoffer().
*/
- if (channel->affinity_policy == HV_LOCALIZED)
- cpumask_clear_cpu(channel->target_cpu,
- &primary_channel->alloced_cpus_in_node);
+ WARN_ON(channel->offermsg.child_relid == INVALID_RELID &&
+ !is_hvsock_channel(channel));
- vmbus_release_relid(relid);
+ /*
+ * Upon suspend, an in-use hv_sock channel is removed from the array of
+ * channels and the relid is invalidated. After hibernation, when the
+ * user-space application destroys the channel, it's unnecessary and
+ * unsafe to remove the channel from the array of channels. See also
+ * the inline comments before the call of vmbus_release_relid() below.
+ */
+ if (channel->offermsg.child_relid != INVALID_RELID)
+ vmbus_channel_unmap_relid(channel);
+
+ if (channel->primary_channel == NULL)
+ list_del(&channel->listentry);
+ else
+ list_del(&channel->sc_list);
+
+ /*
+ * If this is a "perf" channel, update the hv_numa_map[] masks so that
+ * init_vp_index() can (re-)use the CPU.
+ */
+ if (hv_is_perf_channel(channel))
+ hv_clear_alloced_cpu(channel->target_cpu);
+
+ /*
+ * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and
+ * the relid is invalidated; after hibernation, when the user-space app
+ * destroys the channel, the relid is INVALID_RELID, and in this case
+ * it's unnecessary and unsafe to release the old relid, since the same
+ * relid can refer to a completely different channel now.
+ */
+ if (channel->offermsg.child_relid != INVALID_RELID)
+ vmbus_release_relid(channel->offermsg.child_relid);
free_channel(channel);
}
@@ -454,23 +463,7 @@
struct vmbus_channel *newchannel =
container_of(work, struct vmbus_channel, add_channel_work);
struct vmbus_channel *primary_channel = newchannel->primary_channel;
- unsigned long flags;
- u16 dev_type;
int ret;
-
- dev_type = hv_get_dev_type(newchannel);
-
- init_vp_index(newchannel, dev_type);
-
- if (newchannel->target_cpu != get_cpu()) {
- put_cpu();
- smp_call_function_single(newchannel->target_cpu,
- percpu_channel_enq,
- newchannel, true);
- } else {
- percpu_channel_enq(newchannel);
- put_cpu();
- }
/*
* This state is used to indicate a successful open
@@ -503,18 +496,22 @@
if (!newchannel->device_obj)
goto err_deq_chan;
- newchannel->device_obj->device_id = dev_type;
+ newchannel->device_obj->device_id = newchannel->device_id;
/*
* Add the new device to the bus. This will kick off device-driver
* binding which eventually invokes the device driver's AddDevice()
* method.
+ *
+ * If vmbus_device_register() fails, the 'device_obj' is freed in
+ * vmbus_device_release() as called by device_unregister() in the
+ * error path of vmbus_device_register(). In the outside error
+ * path, there's no need to free it.
*/
ret = vmbus_device_register(newchannel->device_obj);
if (ret != 0) {
pr_err("unable to add child device object (relid %d)\n",
newchannel->offermsg.child_relid);
- kfree(newchannel->device_obj);
goto err_deq_chan;
}
@@ -530,25 +527,15 @@
*/
newchannel->probe_done = true;
- if (primary_channel == NULL) {
+ if (primary_channel == NULL)
list_del(&newchannel->listentry);
- } else {
- spin_lock_irqsave(&primary_channel->lock, flags);
+ else
list_del(&newchannel->sc_list);
- spin_unlock_irqrestore(&primary_channel->lock, flags);
- }
+
+ /* vmbus_process_offer() has mapped the channel. */
+ vmbus_channel_unmap_relid(newchannel);
mutex_unlock(&vmbus_connection.channel_mutex);
-
- if (newchannel->target_cpu != get_cpu()) {
- put_cpu();
- smp_call_function_single(newchannel->target_cpu,
- percpu_channel_deq,
- newchannel, true);
- } else {
- percpu_channel_deq(newchannel);
- put_cpu();
- }
vmbus_release_relid(newchannel->offermsg.child_relid);
@@ -563,10 +550,40 @@
{
struct vmbus_channel *channel;
struct workqueue_struct *wq;
- unsigned long flags;
bool fnew = true;
+ /*
+ * Synchronize vmbus_process_offer() and CPU hotplugging:
+ *
+ * CPU1 CPU2
+ *
+ * [vmbus_process_offer()] [Hot removal of the CPU]
+ *
+ * CPU_READ_LOCK CPUS_WRITE_LOCK
+ * LOAD cpu_online_mask SEARCH chn_list
+ * STORE target_cpu LOAD target_cpu
+ * INSERT chn_list STORE cpu_online_mask
+ * CPUS_READ_UNLOCK CPUS_WRITE_UNLOCK
+ *
+ * Forbids: CPU1's LOAD from *not* seeing CPU2's STORE &&
+ * CPU2's SEARCH from *not* seeing CPU1's INSERT
+ *
+ * Forbids: CPU2's SEARCH from seeing CPU1's INSERT &&
+ * CPU2's LOAD from *not* seeing CPU1's STORE
+ */
+ cpus_read_lock();
+
+ /*
+ * Serializes the modifications of the chn_list list as well as
+ * the accesses to next_numa_node_id in init_vp_index().
+ */
mutex_lock(&vmbus_connection.channel_mutex);
+
+ init_vp_index(newchannel);
+
+ /* Remember the channels that should be cleaned up upon suspend. */
+ if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel))
+ atomic_inc(&vmbus_connection.nr_chan_close_on_suspend);
/*
* Now that we have acquired the channel_mutex,
@@ -575,24 +592,25 @@
atomic_dec(&vmbus_connection.offer_in_progress);
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
- if (!uuid_le_cmp(channel->offermsg.offer.if_type,
- newchannel->offermsg.offer.if_type) &&
- !uuid_le_cmp(channel->offermsg.offer.if_instance,
- newchannel->offermsg.offer.if_instance)) {
+ if (guid_equal(&channel->offermsg.offer.if_type,
+ &newchannel->offermsg.offer.if_type) &&
+ guid_equal(&channel->offermsg.offer.if_instance,
+ &newchannel->offermsg.offer.if_instance)) {
fnew = false;
break;
}
}
- if (fnew)
+ if (fnew) {
list_add_tail(&newchannel->listentry,
&vmbus_connection.chn_list);
- else {
+ } else {
/*
* Check to see if this is a valid sub-channel.
*/
if (newchannel->offermsg.offer.sub_channel_index == 0) {
mutex_unlock(&vmbus_connection.channel_mutex);
+ cpus_read_unlock();
/*
* Don't call free_channel(), because newchannel->kobj
* is not initialized yet.
@@ -605,12 +623,13 @@
* Process the sub-channel.
*/
newchannel->primary_channel = channel;
- spin_lock_irqsave(&channel->lock, flags);
list_add_tail(&newchannel->sc_list, &channel->sc_list);
- spin_unlock_irqrestore(&channel->lock, flags);
}
+ vmbus_channel_map_relid(newchannel);
+
mutex_unlock(&vmbus_connection.channel_mutex);
+ cpus_read_unlock();
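The hot-removal column of the diagram above lives in the CPU-offline path. A hedged sketch of that side, modeled on hv_synic_cleanup() (simplified; the real function also walks sub-channel lists and checks for pending events):

/* Runs under cpus_write_lock() via the CPU hotplug state machine. */
static int hv_synic_cleanup_sketch(unsigned int cpu)
{
	struct vmbus_channel *channel;
	bool channel_found = false;

	mutex_lock(&vmbus_connection.channel_mutex);
	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		if (channel->target_cpu == cpu) {	/* LOAD target_cpu */
			channel_found = true;
			break;
		}
	}
	mutex_unlock(&vmbus_connection.channel_mutex);

	/* Refuse to offline a CPU that still has channels bound to it. */
	return channel_found ? -EBUSY : 0;
}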
/*
* vmbus_process_offer() mustn't call channel->sc_creation_callback()
@@ -643,73 +662,57 @@
* We use this state to statically distribute the channel interrupt load.
*/
static int next_numa_node_id;
-/*
- * init_vp_index() accesses global variables like next_numa_node_id, and
- * it can run concurrently for primary channels and sub-channels: see
- * vmbus_process_offer(), so we need the lock to protect the global
- * variables.
- */
-static DEFINE_SPINLOCK(bind_channel_to_cpu_lock);
/*
* Starting with Win8, we can statically distribute the incoming
* channel interrupt load by binding a channel to VCPU.
- * We distribute the interrupt loads to one or more NUMA nodes based on
- * the channel's affinity_policy.
*
* For pre-win8 hosts or non-performance critical channels we assign the
- * first CPU in the first NUMA node.
+ * VMBUS_CONNECT_CPU.
+ *
+ * Starting with win8, performance critical channels will be distributed
+ * evenly among all the available NUMA nodes. Once the node is assigned,
+ * we will assign the CPU based on a simple round robin scheme.
*/
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
+static void init_vp_index(struct vmbus_channel *channel)
{
- u32 cur_cpu;
- bool perf_chn = vmbus_devs[dev_type].perf_device;
- struct vmbus_channel *primary = channel->primary_channel;
- int next_node;
+ bool perf_chn = hv_is_perf_channel(channel);
cpumask_var_t available_mask;
struct cpumask *alloced_mask;
+ u32 target_cpu;
+ int numa_node;
if ((vmbus_proto_version == VERSION_WS2008) ||
(vmbus_proto_version == VERSION_WIN7) || (!perf_chn) ||
!alloc_cpumask_var(&available_mask, GFP_KERNEL)) {
/*
* Prior to win8, all channel interrupts are
- * delivered on cpu 0.
+ * delivered on VMBUS_CONNECT_CPU.
* Also if the channel is not a performance critical
- * channel, bind it to cpu 0.
- * In case alloc_cpumask_var() fails, bind it to cpu 0.
+ * channel, bind it to VMBUS_CONNECT_CPU.
+ * In case alloc_cpumask_var() fails, bind it to
+ * VMBUS_CONNECT_CPU.
*/
- channel->numa_node = 0;
- channel->target_cpu = 0;
- channel->target_vp = hv_cpu_number_to_vp_number(0);
+ channel->target_cpu = VMBUS_CONNECT_CPU;
+ if (perf_chn)
+ hv_set_alloced_cpu(VMBUS_CONNECT_CPU);
return;
}
- spin_lock(&bind_channel_to_cpu_lock);
-
- /*
- * Based on the channel affinity policy, we will assign the NUMA
- * nodes.
- */
-
- if ((channel->affinity_policy == HV_BALANCED) || (!primary)) {
- while (true) {
- next_node = next_numa_node_id++;
- if (next_node == nr_node_ids) {
- next_node = next_numa_node_id = 0;
- continue;
- }
- if (cpumask_empty(cpumask_of_node(next_node)))
- continue;
- break;
+ while (true) {
+ numa_node = next_numa_node_id++;
+ if (numa_node == nr_node_ids) {
+ next_numa_node_id = 0;
+ continue;
}
- channel->numa_node = next_node;
- primary = channel;
+ if (cpumask_empty(cpumask_of_node(numa_node)))
+ continue;
+ break;
}
- alloced_mask = &hv_context.hv_numa_map[primary->numa_node];
+ alloced_mask = &hv_context.hv_numa_map[numa_node];
if (cpumask_weight(alloced_mask) ==
- cpumask_weight(cpumask_of_node(primary->numa_node))) {
+ cpumask_weight(cpumask_of_node(numa_node))) {
/*
* We have cycled through all the CPUs in the node;
* reset the alloced map.
@@ -717,59 +720,12 @@
cpumask_clear(alloced_mask);
}
- cpumask_xor(available_mask, alloced_mask,
- cpumask_of_node(primary->numa_node));
+ cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
- cur_cpu = -1;
+ target_cpu = cpumask_first(available_mask);
+ cpumask_set_cpu(target_cpu, alloced_mask);
- if (primary->affinity_policy == HV_LOCALIZED) {
- /*
- * Normally Hyper-V host doesn't create more subchannels
- * than there are VCPUs on the node but it is possible when not
- * all present VCPUs on the node are initialized by guest.
- * Clear the alloced_cpus_in_node to start over.
- */
- if (cpumask_equal(&primary->alloced_cpus_in_node,
- cpumask_of_node(primary->numa_node)))
- cpumask_clear(&primary->alloced_cpus_in_node);
- }
-
- while (true) {
- cur_cpu = cpumask_next(cur_cpu, available_mask);
- if (cur_cpu >= nr_cpu_ids) {
- cur_cpu = -1;
- cpumask_copy(available_mask,
- cpumask_of_node(primary->numa_node));
- continue;
- }
-
- if (primary->affinity_policy == HV_LOCALIZED) {
- /*
- * NOTE: in the case of sub-channel, we clear the
- * sub-channel related bit(s) in
- * primary->alloced_cpus_in_node in
- * hv_process_channel_removal(), so when we
- * reload drivers like hv_netvsc in SMP guest, here
- * we're able to re-allocate
- * bit from primary->alloced_cpus_in_node.
- */
- if (!cpumask_test_cpu(cur_cpu,
- &primary->alloced_cpus_in_node)) {
- cpumask_set_cpu(cur_cpu,
- &primary->alloced_cpus_in_node);
- cpumask_set_cpu(cur_cpu, alloced_mask);
- break;
- }
- } else {
- cpumask_set_cpu(cur_cpu, alloced_mask);
- break;
- }
- }
-
- channel->target_cpu = cur_cpu;
- channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu);
-
- spin_unlock(&bind_channel_to_cpu_lock);
+ channel->target_cpu = target_cpu;
free_cpumask_var(available_mask);
}
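The net effect of the mask arithmetic: cpumask_xor() of the allocated mask against the node mask leaves the not-yet-used CPUs, cpumask_first() takes the lowest, and the mask resets once every CPU in the node has been handed out. A standalone simulation of the same bookkeeping (plain C, hypothetical 4-CPU node; bitmasks stand in for cpumasks):

#include <stdio.h>

int main(void)
{
	unsigned int node_mask = 0xF;	/* CPUs 0-3 in the node */
	unsigned int alloced = 0;	/* hv_context.hv_numa_map[] equivalent */
	int ch;

	for (ch = 0; ch < 6; ch++) {
		if (alloced == node_mask)	/* cycled through all CPUs */
			alloced = 0;		/* cpumask_clear() */

		unsigned int available = alloced ^ node_mask;	/* cpumask_xor() */
		int cpu = __builtin_ctz(available);		/* cpumask_first() */

		alloced |= 1u << cpu;				/* cpumask_set_cpu() */
		printf("channel %d -> cpu %d\n", ch, cpu);
	}
	return 0;
}

Six channels on a 4-CPU node come out as CPUs 0, 1, 2, 3, 0, 1: round robin with wrap-around, matching the kernel logic above.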
@@ -896,6 +852,68 @@
vmbus_wait_for_unload();
}
+static void check_ready_for_resume_event(void)
+{
+ /*
+ * If all the old primary channels have been fixed up, then it's safe
+ * to resume.
+ */
+ if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume))
+ complete(&vmbus_connection.ready_for_resume_event);
+}
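check_ready_for_resume_event() is the countdown half of a counter-plus-completion rendezvous. A sketch of the arming/waiting half, as a resume path might do it (nr_primary_channels and the surrounding sequence are illustrative; the actual caller is outside this hunk):

/* Before requesting re-offers on resume: */
atomic_set(&vmbus_connection.nr_chan_fixup_on_resume, nr_primary_channels);
reinit_completion(&vmbus_connection.ready_for_resume_event);

/* ... send CHANNELMSG_REQUESTOFFERS to the host ... */

/* Block until every old primary channel has been fixed up. */
wait_for_completion(&vmbus_connection.ready_for_resume_event);

check_ready_for_suspend_event(), added later in this patch, mirrors the same pattern for nr_chan_close_on_suspend on the suspend side.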
+
+static void vmbus_setup_channel_state(struct vmbus_channel *channel,
+ struct vmbus_channel_offer_channel *offer)
+{
+ /*
+ * Set up state for signalling the host.
+ */
+ channel->sig_event = VMBUS_EVENT_CONNECTION_ID;
+
+ if (vmbus_proto_version != VERSION_WS2008) {
+ channel->is_dedicated_interrupt =
+ (offer->is_dedicated_interrupt != 0);
+ channel->sig_event = offer->connection_id;
+ }
+
+ memcpy(&channel->offermsg, offer,
+ sizeof(struct vmbus_channel_offer_channel));
+ channel->monitor_grp = (u8)offer->monitorid / 32;
+ channel->monitor_bit = (u8)offer->monitorid % 32;
+ channel->device_id = hv_get_dev_type(channel);
+}
+
+/*
+ * find_primary_channel_by_offer - Get the channel object given the new offer.
+ * This is only used in the resume path of hibernation.
+ */
+static struct vmbus_channel *
+find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer)
+{
+ struct vmbus_channel *channel = NULL, *iter;
+ const guid_t *inst1, *inst2;
+
+ /* Ignore sub-channel offers. */
+ if (offer->offer.sub_channel_index != 0)
+ return NULL;
+
+ mutex_lock(&vmbus_connection.channel_mutex);
+
+ list_for_each_entry(iter, &vmbus_connection.chn_list, listentry) {
+ inst1 = &iter->offermsg.offer.if_instance;
+ inst2 = &offer->offer.if_instance;
+
+ if (guid_equal(inst1, inst2)) {
+ channel = iter;
+ break;
+ }
+ }
+
+ mutex_unlock(&vmbus_connection.channel_mutex);
+
+ return channel;
+}
+
/*
* vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
*
@@ -903,11 +921,84 @@
static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
{
struct vmbus_channel_offer_channel *offer;
- struct vmbus_channel *newchannel;
+ struct vmbus_channel *oldchannel, *newchannel;
+ size_t offer_sz;
offer = (struct vmbus_channel_offer_channel *)hdr;
trace_vmbus_onoffer(offer);
+
+ oldchannel = find_primary_channel_by_offer(offer);
+
+ if (oldchannel != NULL) {
+ /*
+ * We're resuming from hibernation: all the sub-channel and
+ * hv_sock channels we had before the hibernation should have
+ * been cleaned up, and now we must be seeing a re-offered
+ * primary channel that we had before the hibernation.
+ */
+
+ /*
+ * { Initially: channel relid = INVALID_RELID,
+ * channels[valid_relid] = NULL }
+ *
+ * CPU1 CPU2
+ *
+ * [vmbus_onoffer()] [vmbus_device_release()]
+ *
+ * LOCK channel_mutex LOCK channel_mutex
+ * STORE channel relid = valid_relid LOAD r1 = channel relid
+ * MAP_RELID channel if (r1 != INVALID_RELID)
+ * UNLOCK channel_mutex UNMAP_RELID channel
+ * UNLOCK channel_mutex
+ *
+ * Forbids: r1 == valid_relid &&
+ * channels[valid_relid] == channel
+ *
+ * Note. r1 can be INVALID_RELID only for an hv_sock channel.
+ * None of the hv_sock channels which were present before the
+ * suspend are re-offered upon the resume. See the WARN_ON()
+ * in hv_process_channel_removal().
+ */
+ mutex_lock(&vmbus_connection.channel_mutex);
+
+ atomic_dec(&vmbus_connection.offer_in_progress);
+
+ WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID);
+ /* Fix up the relid. */
+ oldchannel->offermsg.child_relid = offer->child_relid;
+
+ offer_sz = sizeof(*offer);
+ if (memcmp(offer, &oldchannel->offermsg, offer_sz) != 0) {
+ /*
+ * This is not an error, since the host can also change
+ * the other field(s) of the offer, e.g. on WS RS5
+ * (Build 17763), the offer->connection_id of the
+ * Mellanox VF vmbus device can change when the host
+ * reoffers the device upon resume.
+ */
+ pr_debug("vmbus offer changed: relid=%d\n",
+ offer->child_relid);
+
+ print_hex_dump_debug("Old vmbus offer: ",
+ DUMP_PREFIX_OFFSET, 16, 4,
+ &oldchannel->offermsg, offer_sz,
+ false);
+ print_hex_dump_debug("New vmbus offer: ",
+ DUMP_PREFIX_OFFSET, 16, 4,
+ offer, offer_sz, false);
+
+ /* Fix up the old channel. */
+ vmbus_setup_channel_state(oldchannel, offer);
+ }
+
+ /* Add the channel back to the array of channels. */
+ vmbus_channel_map_relid(oldchannel);
+ check_ready_for_resume_event();
+
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ return;
+ }
/* Allocate the channel object and save this offer. */
newchannel = alloc_channel();
@@ -918,23 +1009,19 @@
return;
}
- /*
- * Setup state for signalling the host.
- */
- newchannel->sig_event = VMBUS_EVENT_CONNECTION_ID;
-
- if (vmbus_proto_version != VERSION_WS2008) {
- newchannel->is_dedicated_interrupt =
- (offer->is_dedicated_interrupt != 0);
- newchannel->sig_event = offer->connection_id;
- }
-
- memcpy(&newchannel->offermsg, offer,
- sizeof(struct vmbus_channel_offer_channel));
- newchannel->monitor_grp = (u8)offer->monitorid / 32;
- newchannel->monitor_bit = (u8)offer->monitorid % 32;
+ vmbus_setup_channel_state(newchannel, offer);
vmbus_process_offer(newchannel);
+}
+
+static void check_ready_for_suspend_event(void)
+{
+ /*
+ * If all the sub-channels or hv_sock channels have been cleaned up,
+ * then it's safe to suspend.
+ */
+ if (atomic_dec_and_test(&vmbus_connection.nr_chan_close_on_suspend))
+ complete(&vmbus_connection.ready_for_suspend_event);
}
/*
@@ -947,6 +1034,7 @@
struct vmbus_channel_rescind_offer *rescind;
struct vmbus_channel *channel;
struct device *dev;
+ bool clean_up_chan_for_suspend;
rescind = (struct vmbus_channel_rescind_offer *)hdr;
@@ -958,11 +1046,22 @@
* offer comes in first and then the rescind.
* Since we process these events in work elements,
* and with preemption, we may end up processing
- * the events out of order. Given that we handle these
- * work elements on the same CPU, this is possible only
- * in the case of preemption. In any case wait here
- * until the offer processing has moved beyond the
- * point where the channel is discoverable.
+ * the events out of order. We rely on the synchronization
+ * provided by offer_in_progress and by channel_mutex for
+ * ordering these events:
+ *
+ * { Initially: offer_in_progress = 1 }
+ *
+ * CPU1 CPU2
+ *
+ * [vmbus_onoffer()] [vmbus_onoffer_rescind()]
+ *
+ * LOCK channel_mutex WAIT_ON offer_in_progress == 0
+ * DECREMENT offer_in_progress LOCK channel_mutex
+ * STORE channels[] LOAD channels[]
+ * UNLOCK channel_mutex UNLOCK channel_mutex
+ *
+ * Forbids: CPU2's LOAD from *not* seeing CPU1's STORE
*/
while (atomic_read(&vmbus_connection.offer_in_progress) != 0) {
@@ -986,6 +1085,8 @@
return;
}
+ clean_up_chan_for_suspend = is_hvsock_channel(channel) ||
+ is_sub_channel(channel);
/*
* Before setting channel->rescind in vmbus_rescind_cleanup(), we
* should make sure the channel callback is not running any more.
@@ -1011,6 +1112,10 @@
if (channel->device_obj) {
if (channel->chn_rescind_callback) {
channel->chn_rescind_callback(channel);
+
+ if (clean_up_chan_for_suspend)
+ check_ready_for_suspend_event();
+
return;
}
/*
@@ -1036,12 +1141,17 @@
* The channel is currently not open;
* it is safe for us to cleanup the channel.
*/
- hv_process_channel_removal(rescind->child_relid);
+ hv_process_channel_removal(channel);
} else {
complete(&channel->rescind_event);
}
mutex_unlock(&vmbus_connection.channel_mutex);
}
+
+ /* The "channel" may have been freed. Do not access it any longer. */
+
+ if (clean_up_chan_for_suspend)
+ check_ready_for_suspend_event();
}
void vmbus_hvsock_device_unregister(struct vmbus_channel *channel)
@@ -1250,30 +1360,36 @@
/* Channel message dispatch table */
const struct vmbus_channel_message_table_entry
channel_message_table[CHANNELMSG_COUNT] = {
- { CHANNELMSG_INVALID, 0, NULL },
- { CHANNELMSG_OFFERCHANNEL, 0, vmbus_onoffer },
- { CHANNELMSG_RESCIND_CHANNELOFFER, 0, vmbus_onoffer_rescind },
- { CHANNELMSG_REQUESTOFFERS, 0, NULL },
- { CHANNELMSG_ALLOFFERS_DELIVERED, 1, vmbus_onoffers_delivered },
- { CHANNELMSG_OPENCHANNEL, 0, NULL },
- { CHANNELMSG_OPENCHANNEL_RESULT, 1, vmbus_onopen_result },
- { CHANNELMSG_CLOSECHANNEL, 0, NULL },
- { CHANNELMSG_GPADL_HEADER, 0, NULL },
- { CHANNELMSG_GPADL_BODY, 0, NULL },
- { CHANNELMSG_GPADL_CREATED, 1, vmbus_ongpadl_created },
- { CHANNELMSG_GPADL_TEARDOWN, 0, NULL },
- { CHANNELMSG_GPADL_TORNDOWN, 1, vmbus_ongpadl_torndown },
- { CHANNELMSG_RELID_RELEASED, 0, NULL },
- { CHANNELMSG_INITIATE_CONTACT, 0, NULL },
- { CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response },
- { CHANNELMSG_UNLOAD, 0, NULL },
- { CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response },
- { CHANNELMSG_18, 0, NULL },
- { CHANNELMSG_19, 0, NULL },
- { CHANNELMSG_20, 0, NULL },
- { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL },
- { CHANNELMSG_22, 0, NULL },
- { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL },
+ { CHANNELMSG_INVALID, 0, NULL, 0},
+ { CHANNELMSG_OFFERCHANNEL, 0, vmbus_onoffer,
+ sizeof(struct vmbus_channel_offer_channel)},
+ { CHANNELMSG_RESCIND_CHANNELOFFER, 0, vmbus_onoffer_rescind,
+ sizeof(struct vmbus_channel_rescind_offer) },
+ { CHANNELMSG_REQUESTOFFERS, 0, NULL, 0},
+ { CHANNELMSG_ALLOFFERS_DELIVERED, 1, vmbus_onoffers_delivered, 0},
+ { CHANNELMSG_OPENCHANNEL, 0, NULL, 0},
+ { CHANNELMSG_OPENCHANNEL_RESULT, 1, vmbus_onopen_result,
+ sizeof(struct vmbus_channel_open_result)},
+ { CHANNELMSG_CLOSECHANNEL, 0, NULL, 0},
+ { CHANNELMSG_GPADL_HEADER, 0, NULL, 0},
+ { CHANNELMSG_GPADL_BODY, 0, NULL, 0},
+ { CHANNELMSG_GPADL_CREATED, 1, vmbus_ongpadl_created,
+ sizeof(struct vmbus_channel_gpadl_created)},
+ { CHANNELMSG_GPADL_TEARDOWN, 0, NULL, 0},
+ { CHANNELMSG_GPADL_TORNDOWN, 1, vmbus_ongpadl_torndown,
+ sizeof(struct vmbus_channel_gpadl_torndown) },
+ { CHANNELMSG_RELID_RELEASED, 0, NULL, 0},
+ { CHANNELMSG_INITIATE_CONTACT, 0, NULL, 0},
+ { CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response,
+ sizeof(struct vmbus_channel_version_response)},
+ { CHANNELMSG_UNLOAD, 0, NULL, 0},
+ { CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response, 0},
+ { CHANNELMSG_18, 0, NULL, 0},
+ { CHANNELMSG_19, 0, NULL, 0},
+ { CHANNELMSG_20, 0, NULL, 0},
+ { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0},
+ { CHANNELMSG_MODIFYCHANNEL, 0, NULL, 0},
+ { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0},
};
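The new fourth field records the minimum payload length for each message type, so the dispatch path can drop short (potentially malicious) messages before a handler touches the payload. A sketch of the corresponding check, modeled on the vmbus_on_msg_dpc() change in this series (assuming the field is named min_payload_len and hdr/msg are the usual dispatch-path locals):

const struct vmbus_channel_message_table_entry *entry;

entry = &channel_message_table[hdr->msgtype];
if (msg->header.payload_size < entry->min_payload_len) {
	WARN_ONCE(1, "payload size %d too small for msgtype %d\n",
		  msg->header.payload_size, hdr->msgtype);
	return;		/* drop the malformed message */
}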
/*
@@ -1281,13 +1397,8 @@
*
* This is invoked in the vmbus worker thread context.
*/
-void vmbus_onmessage(void *context)
+void vmbus_onmessage(struct vmbus_channel_message_header *hdr)
{
- struct hv_message *msg = context;
- struct vmbus_channel_message_header *hdr;
-
- hdr = (struct vmbus_channel_message_header *)msg->u.payload;
-
trace_vmbus_on_message(hdr);
/*
@@ -1332,49 +1443,6 @@
return ret;
}
-
-/*
- * Retrieve the (sub) channel on which to send an outgoing request.
- * When a primary channel has multiple sub-channels, we try to
- * distribute the load equally amongst all available channels.
- */
-struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
-{
- struct list_head *cur, *tmp;
- int cur_cpu;
- struct vmbus_channel *cur_channel;
- struct vmbus_channel *outgoing_channel = primary;
- int next_channel;
- int i = 1;
-
- if (list_empty(&primary->sc_list))
- return outgoing_channel;
-
- next_channel = primary->next_oc++;
-
- if (next_channel > (primary->num_sc)) {
- primary->next_oc = 0;
- return outgoing_channel;
- }
-
- cur_cpu = hv_cpu_number_to_vp_number(smp_processor_id());
- list_for_each_safe(cur, tmp, &primary->sc_list) {
- cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
- if (cur_channel->state != CHANNEL_OPENED_STATE)
- continue;
-
- if (cur_channel->target_vp == cur_cpu)
- return cur_channel;
-
- if (i == next_channel)
- return cur_channel;
-
- i++;
- }
-
- return outgoing_channel;
-}
-EXPORT_SYMBOL_GPL(vmbus_get_outgoing_channel);
static void invoke_sc_cb(struct vmbus_channel *primary_channel)
{
--
Gitblit v1.6.2