2024-05-10 37f49e37ab4cb5d0bc4c60eb5c6d4dd57db767bb
kernel/drivers/hv/channel_mgmt.c
@@ -1,18 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2009, Microsoft Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
  *
  * Authors:
  *   Haiyang Zhang <haiyangz@microsoft.com>
@@ -30,14 +18,15 @@
 #include <linux/module.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
+#include <linux/cpu.h>
 #include <linux/hyperv.h>
 #include <asm/mshyperv.h>
 
 #include "hyperv_vmbus.h"
 
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type);
+static void init_vp_index(struct vmbus_channel *channel);
 
-static const struct vmbus_device vmbus_devs[] = {
+const struct vmbus_device vmbus_devs[] = {
 	/* IDE */
 	{ .dev_type = HV_IDE,
 	  HV_IDE_GUID,
@@ -141,7 +130,7 @@
 };
 
 static const struct {
-	uuid_le guid;
+	guid_t guid;
 } vmbus_unsupported_devs[] = {
 	{ HV_AVMA1_GUID },
 	{ HV_AVMA2_GUID },
@@ -171,26 +160,26 @@
 	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
 }
 
-static bool is_unsupported_vmbus_devs(const uuid_le *guid)
+static bool is_unsupported_vmbus_devs(const guid_t *guid)
 {
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(vmbus_unsupported_devs); i++)
-		if (!uuid_le_cmp(*guid, vmbus_unsupported_devs[i].guid))
+		if (guid_equal(guid, &vmbus_unsupported_devs[i].guid))
 			return true;
 	return false;
 }
 
 static u16 hv_get_dev_type(const struct vmbus_channel *channel)
 {
-	const uuid_le *guid = &channel->offermsg.offer.if_type;
+	const guid_t *guid = &channel->offermsg.offer.if_type;
 	u16 i;
 
 	if (is_hvsock_channel(channel) || is_unsupported_vmbus_devs(guid))
 		return HV_UNKNOWN;
 
 	for (i = HV_IDE; i < HV_UNKNOWN; i++) {
-		if (!uuid_le_cmp(*guid, vmbus_devs[i].guid))
+		if (guid_equal(guid, &vmbus_devs[i].guid))
 			return i;
 	}
 	pr_info("Unknown GUID: %pUl\n", guid);
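
The hunk above mechanically converts `!uuid_le_cmp(a, b)` (a memcmp-style, by-value comparison that returns 0 on equality) into `guid_equal(&a, &b)` (a by-pointer boolean predicate), which removes the easy-to-misread `!cmp()` polarity. A minimal userspace sketch of the difference, with a stand-in type rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct { unsigned char b[16]; } guid_t;	/* stand-in, not the kernel type */

static bool guid_equal(const guid_t *a, const guid_t *b)
{
	return memcmp(a, b, sizeof(*a)) == 0;
}

int main(void)
{
	guid_t x = { { 0xde, 0xad } }, y = x;

	/* Old style: negated memcmp-like result, 0 means "equal". */
	printf("old style !cmp():  %d\n", !memcmp(&x, &y, sizeof(x)));
	/* New style: the intent is explicit in the predicate's name. */
	printf("new style equal(): %d\n", guid_equal(&x, &y));
	return 0;
}
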
@@ -198,24 +187,19 @@
 }
 
 /**
- * vmbus_prep_negotiate_resp() - Create default response for Hyper-V Negotiate message
+ * vmbus_prep_negotiate_resp() - Create default response for Negotiate message
  * @icmsghdrp: Pointer to msg header structure
- * @icmsg_negotiate: Pointer to negotiate message structure
  * @buf: Raw buffer channel data
+ * @fw_version: The framework versions we can support.
+ * @fw_vercnt: The size of @fw_version.
+ * @srv_version: The service versions we can support.
+ * @srv_vercnt: The size of @srv_version.
+ * @nego_fw_version: The selected framework version.
+ * @nego_srv_version: The selected service version.
  *
- * @icmsghdrp is of type &struct icmsg_hdr.
+ * Note: Versions are given in decreasing order.
+ *
  * Set up and fill in default negotiate response message.
- *
- * The fw_version and fw_vercnt specifies the framework version that
- * we can support.
- *
- * The srv_version and srv_vercnt specifies the service
- * versions we can support.
- *
- * Versions are given in decreasing order.
- *
- * nego_fw_version and nego_srv_version store the selected protocol versions.
- *
  * Mainly used by Hyper-V drivers.
  */
 bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
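
For context, this is roughly how the hv_util-style IC drivers of this era call the function; a hedged sketch only, since the exact call sites and version macros (UTIL_FW_VERSION, SD_VERSION, SD_VERSION_1 are borrowed from hv_util.c) may differ, and negotiate_example() is a hypothetical wrapper:

static const int fw_versions[] = { UTIL_FW_VERSION };
/* Per the kernel-doc above: service versions listed in decreasing order. */
static const int sd_versions[] = { SD_VERSION, SD_VERSION_1 };

static void negotiate_example(struct icmsg_hdr *icmsghdrp, u8 *buf)
{
	int nego_fw, nego_srv;

	if (!vmbus_prep_negotiate_resp(icmsghdrp, buf,
				       fw_versions, ARRAY_SIZE(fw_versions),
				       sd_versions, ARRAY_SIZE(sd_versions),
				       &nego_fw, &nego_srv))
		pr_err("version negotiation failed\n");
}
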
@@ -332,14 +316,15 @@
 	if (!channel)
 		return NULL;
 
-	spin_lock_init(&channel->lock);
+	spin_lock_init(&channel->sched_lock);
 	init_completion(&channel->rescind_event);
 
 	INIT_LIST_HEAD(&channel->sc_list);
-	INIT_LIST_HEAD(&channel->percpu_list);
 
 	tasklet_init(&channel->callback_event,
 		     vmbus_on_event, (unsigned long)channel);
+
+	hv_ringbuffer_pre_init(channel);
 
 	return channel;
 }
@@ -355,22 +340,48 @@
 	kobject_put(&channel->kobj);
 }
 
-static void percpu_channel_enq(void *arg)
+void vmbus_channel_map_relid(struct vmbus_channel *channel)
 {
-	struct vmbus_channel *channel = arg;
-	struct hv_per_cpu_context *hv_cpu
-		= this_cpu_ptr(hv_context.cpu_context);
-
-	list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list);
+	if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
+		return;
+	/*
+	 * The mapping of the channel's relid is visible from the CPUs that
+	 * execute vmbus_chan_sched() by the time that vmbus_chan_sched() will
+	 * execute:
+	 *
+	 *  (a) In the "normal (i.e., not resuming from hibernation)" path,
+	 *      the full barrier in virt_store_mb() guarantees that the store
+	 *      is propagated to all CPUs before the add_channel_work work
+	 *      is queued.  In turn, add_channel_work is queued before the
+	 *      channel's ring buffer is allocated/initialized and the
+	 *      OPENCHANNEL message for the channel is sent in vmbus_open().
+	 *      Hyper-V won't start sending the interrupts for the channel
+	 *      before the OPENCHANNEL message is acked.  The memory barrier
+	 *      in vmbus_chan_sched() -> sync_test_and_clear_bit() ensures
+	 *      that vmbus_chan_sched() must find the channel's relid in
+	 *      recv_int_page before retrieving the channel pointer from the
+	 *      array of channels.
+	 *
+	 *  (b) In the "resuming from hibernation" path, the virt_store_mb()
+	 *      guarantees that the store is propagated to all CPUs before
+	 *      the VMBus connection is marked as ready for the resume event
+	 *      (cf. check_ready_for_resume_event()).  The interrupt handler
+	 *      of the VMBus driver and vmbus_chan_sched() can not run before
+	 *      vmbus_bus_resume() has completed execution (cf. resume_noirq).
+	 */
+	virt_store_mb(
+		vmbus_connection.channels[channel->offermsg.child_relid],
+		channel);
 }
 
-static void percpu_channel_deq(void *arg)
+void vmbus_channel_unmap_relid(struct vmbus_channel *channel)
 {
-	struct vmbus_channel *channel = arg;
-
-	list_del_rcu(&channel->percpu_list);
+	if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
+		return;
+	WRITE_ONCE(
+		vmbus_connection.channels[channel->offermsg.child_relid],
+		NULL);
 }
-
 
 static void vmbus_release_relid(u32 relid)
 {
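
The virt_store_mb() publication above pairs with a plain load on the interrupt path. A self-contained userspace sketch of the same publish/lookup pattern using C11 atomics; the array size, function names, and the seq_cst store are stand-ins for the kernel's MAX_CHANNEL_RELIDS, vmbus_channel_map_relid()/relid2channel(), virt_store_mb(), and READ_ONCE():

#include <stdatomic.h>
#include <stddef.h>

#define MAX_CHANNEL_RELIDS 256		/* stand-in for the kernel constant */

struct channel { int relid; };

static _Atomic(struct channel *) channels[MAX_CHANNEL_RELIDS];

/* Writer: publish the pointer with a full barrier (virt_store_mb()). */
static void map_relid(struct channel *ch)
{
	if (ch->relid < 0 || ch->relid >= MAX_CHANNEL_RELIDS)
		return;
	atomic_store(&channels[ch->relid], ch);
}

/* Reader (the interrupt path in the kernel): a single load. */
static struct channel *lookup_relid(int relid)
{
	if (relid < 0 || relid >= MAX_CHANNEL_RELIDS)
		return NULL;
	return atomic_load(&channels[relid]);
}

int main(void)
{
	static struct channel ch = { .relid = 5 };

	map_relid(&ch);
	return lookup_relid(5) == &ch ? 0 : 1;
}
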
@@ -386,51 +397,49 @@
 	trace_vmbus_release_relid(&msg, ret);
 }
 
-void hv_process_channel_removal(u32 relid)
+void hv_process_channel_removal(struct vmbus_channel *channel)
 {
-	unsigned long flags;
-	struct vmbus_channel *primary_channel, *channel;
-
-	BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex));
-
-	/*
-	 * Make sure channel is valid as we may have raced.
-	 */
-	channel = relid2channel(relid);
-	if (!channel)
-		return;
-
+	lockdep_assert_held(&vmbus_connection.channel_mutex);
 	BUG_ON(!channel->rescind);
-	if (channel->target_cpu != get_cpu()) {
-		put_cpu();
-		smp_call_function_single(channel->target_cpu,
-					 percpu_channel_deq, channel, true);
-	} else {
-		percpu_channel_deq(channel);
-		put_cpu();
-	}
-
-	if (channel->primary_channel == NULL) {
-		list_del(&channel->listentry);
-
-		primary_channel = channel;
-	} else {
-		primary_channel = channel->primary_channel;
-		spin_lock_irqsave(&primary_channel->lock, flags);
-		list_del(&channel->sc_list);
-		primary_channel->num_sc--;
-		spin_unlock_irqrestore(&primary_channel->lock, flags);
-	}
 
 	/*
-	 * We need to free the bit for init_vp_index() to work in the case
-	 * of sub-channel, when we reload drivers like hv_netvsc.
+	 * hv_process_channel_removal() could find INVALID_RELID only for
+	 * hv_sock channels.  See the inline comments in vmbus_onoffer().
 	 */
-	if (channel->affinity_policy == HV_LOCALIZED)
-		cpumask_clear_cpu(channel->target_cpu,
-				  &primary_channel->alloced_cpus_in_node);
+	WARN_ON(channel->offermsg.child_relid == INVALID_RELID &&
+		!is_hvsock_channel(channel));
 
-	vmbus_release_relid(relid);
+	/*
+	 * Upon suspend, an in-use hv_sock channel is removed from the array of
+	 * channels and the relid is invalidated.  After hibernation, when the
+	 * user-space application destroys the channel, it's unnecessary and
+	 * unsafe to remove the channel from the array of channels.  See also
+	 * the inline comments before the call of vmbus_release_relid() below.
+	 */
+	if (channel->offermsg.child_relid != INVALID_RELID)
+		vmbus_channel_unmap_relid(channel);
+
+	if (channel->primary_channel == NULL)
+		list_del(&channel->listentry);
+	else
+		list_del(&channel->sc_list);
+
+	/*
+	 * If this is a "perf" channel, update the hv_numa_map[] masks so that
+	 * init_vp_index() can (re-)use the CPU.
+	 */
+	if (hv_is_perf_channel(channel))
+		hv_clear_alloced_cpu(channel->target_cpu);
+
+	/*
+	 * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and
+	 * the relid is invalidated; after hibernation, when the user-space app
+	 * destroys the channel, the relid is INVALID_RELID, and in this case
+	 * it's unnecessary and unsafe to release the old relid, since the same
+	 * relid can refer to a completely different channel now.
+	 */
+	if (channel->offermsg.child_relid != INVALID_RELID)
+		vmbus_release_relid(channel->offermsg.child_relid);
 
 	free_channel(channel);
 }
@@ -454,23 +463,7 @@
 	struct vmbus_channel *newchannel =
 		container_of(work, struct vmbus_channel, add_channel_work);
 	struct vmbus_channel *primary_channel = newchannel->primary_channel;
-	unsigned long flags;
-	u16 dev_type;
 	int ret;
-
-	dev_type = hv_get_dev_type(newchannel);
-
-	init_vp_index(newchannel, dev_type);
-
-	if (newchannel->target_cpu != get_cpu()) {
-		put_cpu();
-		smp_call_function_single(newchannel->target_cpu,
-					 percpu_channel_enq,
-					 newchannel, true);
-	} else {
-		percpu_channel_enq(newchannel);
-		put_cpu();
-	}
 
 	/*
 	 * This state is used to indicate a successful open
@@ -503,18 +496,22 @@
 	if (!newchannel->device_obj)
 		goto err_deq_chan;
 
-	newchannel->device_obj->device_id = dev_type;
+	newchannel->device_obj->device_id = newchannel->device_id;
 	/*
 	 * Add the new device to the bus. This will kick off device-driver
 	 * binding which eventually invokes the device driver's AddDevice()
 	 * method.
+	 *
+	 * If vmbus_device_register() fails, the 'device_obj' is freed in
+	 * vmbus_device_release() as called by device_unregister() in the
+	 * error path of vmbus_device_register().  In the outside error
+	 * path, there's no need to free it.
 	 */
 	ret = vmbus_device_register(newchannel->device_obj);
 
 	if (ret != 0) {
 		pr_err("unable to add child device object (relid %d)\n",
 			newchannel->offermsg.child_relid);
-		kfree(newchannel->device_obj);
 		goto err_deq_chan;
 	}
 
@@ -530,25 +527,15 @@
 	 */
 	newchannel->probe_done = true;
 
-	if (primary_channel == NULL) {
+	if (primary_channel == NULL)
 		list_del(&newchannel->listentry);
-	} else {
-		spin_lock_irqsave(&primary_channel->lock, flags);
+	else
 		list_del(&newchannel->sc_list);
-		spin_unlock_irqrestore(&primary_channel->lock, flags);
-	}
+
+	/* vmbus_process_offer() has mapped the channel. */
+	vmbus_channel_unmap_relid(newchannel);
 
 	mutex_unlock(&vmbus_connection.channel_mutex);
-
-	if (newchannel->target_cpu != get_cpu()) {
-		put_cpu();
-		smp_call_function_single(newchannel->target_cpu,
-					 percpu_channel_deq,
-					 newchannel, true);
-	} else {
-		percpu_channel_deq(newchannel);
-		put_cpu();
-	}
 
 	vmbus_release_relid(newchannel->offermsg.child_relid);
@@ -563,10 +550,40 @@
 {
 	struct vmbus_channel *channel;
 	struct workqueue_struct *wq;
-	unsigned long flags;
 	bool fnew = true;
 
+	/*
+	 * Synchronize vmbus_process_offer() and CPU hotplugging:
+	 *
+	 * CPU1				CPU2
+	 *
+	 * [vmbus_process_offer()]	[Hot removal of the CPU]
+	 *
+	 * CPUS_READ_LOCK		CPUS_WRITE_LOCK
+	 * LOAD cpu_online_mask		SEARCH chn_list
+	 * STORE target_cpu		LOAD target_cpu
+	 * INSERT chn_list		STORE cpu_online_mask
+	 * CPUS_READ_UNLOCK		CPUS_WRITE_UNLOCK
+	 *
+	 * Forbids: CPU1's LOAD from *not* seeing CPU2's STORE &&
+	 *		CPU2's SEARCH from *not* seeing CPU1's INSERT
+	 *
+	 * Forbids: CPU2's SEARCH from seeing CPU1's INSERT &&
+	 *		CPU2's LOAD from *not* seeing CPU1's STORE
	 */
+	cpus_read_lock();
+
+	/*
+	 * Serializes the modifications of the chn_list list as well as
+	 * the accesses to next_numa_node_id in init_vp_index().
+	 */
 	mutex_lock(&vmbus_connection.channel_mutex);
+
+	init_vp_index(newchannel);
+
+	/* Remember the channels that should be cleaned up upon suspend. */
+	if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel))
+		atomic_inc(&vmbus_connection.nr_chan_close_on_suspend);
 
 	/*
 	 * Now that we have acquired the channel_mutex,
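
A compilable userspace sketch of the invariant in the comment above, with a pthread rwlock standing in for the CPU-hotplug lock; for brevity it collapses channel_mutex and the hotplug lock into the one rwlock, which the kernel code does not do, and all names are stand-ins:

#include <pthread.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long cpu_online_mask = 0xfUL;	/* CPUs 0-3 online */

struct channel { int target_cpu; struct channel *next; };
static struct channel *chn_list;

static void process_offer(struct channel *ch, int cpu)
{
	pthread_rwlock_rdlock(&hotplug_lock);	/* cpus_read_lock() */
	if (cpu_online_mask & (1UL << cpu))	/* LOAD cpu_online_mask */
		ch->target_cpu = cpu;		/* STORE target_cpu */
	ch->next = chn_list;			/* INSERT chn_list */
	chn_list = ch;
	pthread_rwlock_unlock(&hotplug_lock);	/* cpus_read_unlock() */
}

static void hot_remove_cpu(int cpu)
{
	pthread_rwlock_wrlock(&hotplug_lock);	/* cpus_write_lock() */
	for (struct channel *ch = chn_list; ch; ch = ch->next)
		if (ch->target_cpu == cpu)	/* SEARCH chn_list, LOAD target_cpu */
			ch->target_cpu = 0;	/* rebind before offlining */
	cpu_online_mask &= ~(1UL << cpu);	/* STORE cpu_online_mask */
	pthread_rwlock_unlock(&hotplug_lock);	/* cpus_write_unlock() */
}

int main(void)
{
	struct channel ch = { 0 };

	process_offer(&ch, 2);
	hot_remove_cpu(2);
	return ch.target_cpu;	/* 0: the channel no longer targets CPU 2 */
}

Either the offer sees the CPU already gone, or the hot-removal sees the freshly inserted channel; the lock rules out the window where both sides miss each other's store.
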
@@ -575,24 +592,25 @@
 	atomic_dec(&vmbus_connection.offer_in_progress);
 
 	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
-		if (!uuid_le_cmp(channel->offermsg.offer.if_type,
-				 newchannel->offermsg.offer.if_type) &&
-		    !uuid_le_cmp(channel->offermsg.offer.if_instance,
-				 newchannel->offermsg.offer.if_instance)) {
+		if (guid_equal(&channel->offermsg.offer.if_type,
+			       &newchannel->offermsg.offer.if_type) &&
+		    guid_equal(&channel->offermsg.offer.if_instance,
+			       &newchannel->offermsg.offer.if_instance)) {
 			fnew = false;
 			break;
 		}
 	}
 
-	if (fnew)
+	if (fnew) {
 		list_add_tail(&newchannel->listentry,
 			      &vmbus_connection.chn_list);
-	else {
+	} else {
 		/*
 		 * Check to see if this is a valid sub-channel.
 		 */
 		if (newchannel->offermsg.offer.sub_channel_index == 0) {
 			mutex_unlock(&vmbus_connection.channel_mutex);
+			cpus_read_unlock();
 			/*
 			 * Don't call free_channel(), because newchannel->kobj
 			 * is not initialized yet.
@@ -605,12 +623,13 @@
 		 * Process the sub-channel.
 		 */
 		newchannel->primary_channel = channel;
-		spin_lock_irqsave(&channel->lock, flags);
 		list_add_tail(&newchannel->sc_list, &channel->sc_list);
-		spin_unlock_irqrestore(&channel->lock, flags);
 	}
 
+	vmbus_channel_map_relid(newchannel);
+
 	mutex_unlock(&vmbus_connection.channel_mutex);
+	cpus_read_unlock();
 
 	/*
 	 * vmbus_process_offer() mustn't call channel->sc_creation_callback()
@@ -643,73 +662,57 @@
  * We use this state to statically distribute the channel interrupt load.
  */
 static int next_numa_node_id;
-/*
- * init_vp_index() accesses global variables like next_numa_node_id, and
- * it can run concurrently for primary channels and sub-channels: see
- * vmbus_process_offer(), so we need the lock to protect the global
- * variables.
- */
-static DEFINE_SPINLOCK(bind_channel_to_cpu_lock);
 
 /*
  * Starting with Win8, we can statically distribute the incoming
  * channel interrupt load by binding a channel to VCPU.
- * We distribute the interrupt loads to one or more NUMA nodes based on
- * the channel's affinity_policy.
  *
  * For pre-win8 hosts or non-performance critical channels we assign the
- * first CPU in the first NUMA node.
+ * VMBUS_CONNECT_CPU.
+ *
+ * Starting with win8, performance critical channels will be distributed
+ * evenly among all the available NUMA nodes.  Once the node is assigned,
+ * we will assign the CPU based on a simple round robin scheme.
  */
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
+static void init_vp_index(struct vmbus_channel *channel)
 {
-	u32 cur_cpu;
-	bool perf_chn = vmbus_devs[dev_type].perf_device;
-	struct vmbus_channel *primary = channel->primary_channel;
-	int next_node;
+	bool perf_chn = hv_is_perf_channel(channel);
 	cpumask_var_t available_mask;
 	struct cpumask *alloced_mask;
+	u32 target_cpu;
+	int numa_node;
 
 	if ((vmbus_proto_version == VERSION_WS2008) ||
 	    (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) ||
 	    !alloc_cpumask_var(&available_mask, GFP_KERNEL)) {
 		/*
 		 * Prior to win8, all channel interrupts are
-		 * delivered on cpu 0.
+		 * delivered on VMBUS_CONNECT_CPU.
 		 * Also if the channel is not a performance critical
-		 * channel, bind it to cpu 0.
-		 * In case alloc_cpumask_var() fails, bind it to cpu 0.
+		 * channel, bind it to VMBUS_CONNECT_CPU.
+		 * In case alloc_cpumask_var() fails, bind it to
+		 * VMBUS_CONNECT_CPU.
 		 */
-		channel->numa_node = 0;
-		channel->target_cpu = 0;
-		channel->target_vp = hv_cpu_number_to_vp_number(0);
+		channel->target_cpu = VMBUS_CONNECT_CPU;
+		if (perf_chn)
+			hv_set_alloced_cpu(VMBUS_CONNECT_CPU);
 		return;
 	}
 
-	spin_lock(&bind_channel_to_cpu_lock);
-
-	/*
-	 * Based on the channel affinity policy, we will assign the NUMA
-	 * nodes.
-	 */
-
-	if ((channel->affinity_policy == HV_BALANCED) || (!primary)) {
-		while (true) {
-			next_node = next_numa_node_id++;
-			if (next_node == nr_node_ids) {
-				next_node = next_numa_node_id = 0;
-				continue;
-			}
-			if (cpumask_empty(cpumask_of_node(next_node)))
-				continue;
-			break;
+	while (true) {
+		numa_node = next_numa_node_id++;
+		if (numa_node == nr_node_ids) {
+			next_numa_node_id = 0;
+			continue;
 		}
-		channel->numa_node = next_node;
-		primary = channel;
+		if (cpumask_empty(cpumask_of_node(numa_node)))
+			continue;
+		break;
 	}
-	alloced_mask = &hv_context.hv_numa_map[primary->numa_node];
+	alloced_mask = &hv_context.hv_numa_map[numa_node];
 
 	if (cpumask_weight(alloced_mask) ==
-	    cpumask_weight(cpumask_of_node(primary->numa_node))) {
+	    cpumask_weight(cpumask_of_node(numa_node))) {
 		/*
 		 * We have cycled through all the CPUs in the node;
 		 * reset the alloced map.
@@ -717,59 +720,12 @@
 		cpumask_clear(alloced_mask);
 	}
 
-	cpumask_xor(available_mask, alloced_mask,
-		    cpumask_of_node(primary->numa_node));
+	cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
 
-	cur_cpu = -1;
+	target_cpu = cpumask_first(available_mask);
+	cpumask_set_cpu(target_cpu, alloced_mask);
 
-	if (primary->affinity_policy == HV_LOCALIZED) {
-		/*
-		 * Normally Hyper-V host doesn't create more subchannels
-		 * than there are VCPUs on the node but it is possible when not
-		 * all present VCPUs on the node are initialized by guest.
-		 * Clear the alloced_cpus_in_node to start over.
-		 */
-		if (cpumask_equal(&primary->alloced_cpus_in_node,
-				  cpumask_of_node(primary->numa_node)))
-			cpumask_clear(&primary->alloced_cpus_in_node);
-	}
-
-	while (true) {
-		cur_cpu = cpumask_next(cur_cpu, available_mask);
-		if (cur_cpu >= nr_cpu_ids) {
-			cur_cpu = -1;
-			cpumask_copy(available_mask,
-				     cpumask_of_node(primary->numa_node));
-			continue;
-		}
-
-		if (primary->affinity_policy == HV_LOCALIZED) {
-			/*
-			 * NOTE: in the case of sub-channel, we clear the
-			 * sub-channel related bit(s) in
-			 * primary->alloced_cpus_in_node in
-			 * hv_process_channel_removal(), so when we
-			 * reload drivers like hv_netvsc in SMP guest, here
-			 * we're able to re-allocate
-			 * bit from primary->alloced_cpus_in_node.
-			 */
-			if (!cpumask_test_cpu(cur_cpu,
-					      &primary->alloced_cpus_in_node)) {
-				cpumask_set_cpu(cur_cpu,
-						&primary->alloced_cpus_in_node);
-				cpumask_set_cpu(cur_cpu, alloced_mask);
-				break;
-			}
-		} else {
-			cpumask_set_cpu(cur_cpu, alloced_mask);
-			break;
-		}
-	}
-
-	channel->target_cpu = cur_cpu;
-	channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu);
-
-	spin_unlock(&bind_channel_to_cpu_lock);
+	channel->target_cpu = target_cpu;
 
 	free_cpumask_var(available_mask);
 }
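
A small userspace model of the round-robin scheme the rewritten init_vp_index() implements: rotate across NUMA nodes, take the first not-yet-alloced CPU of the chosen node, and reset the node's mask once every CPU has been used. Plain bitmasks stand in for cpumasks, and the kernel's skip-empty-node loop is simplified to a modulo:

#include <stdio.h>

#define NR_NODES 2

static unsigned int node_cpus[NR_NODES] = { 0x0f, 0xf0 };	/* CPUs per node */
static unsigned int node_alloced[NR_NODES];			/* hv_numa_map[] stand-in */
static int next_numa_node_id;

static int assign_cpu(void)
{
	int node = next_numa_node_id++ % NR_NODES;
	unsigned int avail;
	int cpu;

	if (node_alloced[node] == node_cpus[node])	/* node exhausted... */
		node_alloced[node] = 0;			/* ...reset the map */
	avail = node_alloced[node] ^ node_cpus[node];	/* cpumask_xor() */
	for (cpu = 0; cpu < 8; cpu++)
		if (avail & (1U << cpu)) {		/* cpumask_first() */
			node_alloced[node] |= 1U << cpu;
			return cpu;			/* the target_cpu */
		}
	return 0;
}

int main(void)
{
	/* Eight channels spread as 0, 4, 1, 5, 2, 6, 3, 7. */
	for (int i = 0; i < 8; i++)
		printf("channel %d -> cpu %d\n", i, assign_cpu());
	return 0;
}
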
@@ -809,11 +765,22 @@
 	if (completion_done(&vmbus_connection.unload_event))
 		goto completed;
 
-	for_each_online_cpu(cpu) {
+	for_each_present_cpu(cpu) {
 		struct hv_per_cpu_context *hv_cpu
 			= per_cpu_ptr(hv_context.cpu_context, cpu);
 
+		/*
+		 * In a CoCo VM the synic_message_page is not allocated
+		 * in hv_synic_alloc().  Instead it is set/cleared in
+		 * hv_synic_enable_regs() and hv_synic_disable_regs()
+		 * such that it is set only when the CPU is online.  If
+		 * not all present CPUs are online, the message page
+		 * might be NULL, so skip such CPUs.
+		 */
 		page_addr = hv_cpu->synic_message_page;
+		if (!page_addr)
+			continue;
+
 		msg = (struct hv_message *)page_addr
 			+ VMBUS_MESSAGE_SINT;
 
@@ -847,11 +814,14 @@
 	 * maybe-pending messages on all CPUs to be able to receive new
 	 * messages after we reconnect.
 	 */
-	for_each_online_cpu(cpu) {
+	for_each_present_cpu(cpu) {
 		struct hv_per_cpu_context *hv_cpu
 			= per_cpu_ptr(hv_context.cpu_context, cpu);
 
 		page_addr = hv_cpu->synic_message_page;
+		if (!page_addr)
+			continue;
+
 		msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
 		msg->header.message_type = HVMSG_NONE;
 	}
@@ -896,6 +866,68 @@
 	vmbus_wait_for_unload();
 }
 
+static void check_ready_for_resume_event(void)
+{
+	/*
+	 * If all the old primary channels have been fixed up, then it's safe
+	 * to resume.
+	 */
+	if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume))
+		complete(&vmbus_connection.ready_for_resume_event);
+}
+
+static void vmbus_setup_channel_state(struct vmbus_channel *channel,
+				      struct vmbus_channel_offer_channel *offer)
+{
+	/*
+	 * Setup state for signalling the host.
+	 */
+	channel->sig_event = VMBUS_EVENT_CONNECTION_ID;
+
+	if (vmbus_proto_version != VERSION_WS2008) {
+		channel->is_dedicated_interrupt =
+				(offer->is_dedicated_interrupt != 0);
+		channel->sig_event = offer->connection_id;
+	}
+
+	memcpy(&channel->offermsg, offer,
+	       sizeof(struct vmbus_channel_offer_channel));
+	channel->monitor_grp = (u8)offer->monitorid / 32;
+	channel->monitor_bit = (u8)offer->monitorid % 32;
+	channel->device_id = hv_get_dev_type(channel);
+}
+
+/*
+ * find_primary_channel_by_offer - Get the channel object given the new offer.
+ * This is only used in the resume path of hibernation.
+ */
+static struct vmbus_channel *
+find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer)
+{
+	struct vmbus_channel *channel = NULL, *iter;
+	const guid_t *inst1, *inst2;
+
+	/* Ignore sub-channel offers. */
+	if (offer->offer.sub_channel_index != 0)
+		return NULL;
+
+	mutex_lock(&vmbus_connection.channel_mutex);
+
+	list_for_each_entry(iter, &vmbus_connection.chn_list, listentry) {
+		inst1 = &iter->offermsg.offer.if_instance;
+		inst2 = &offer->offer.if_instance;
+
+		if (guid_equal(inst1, inst2)) {
+			channel = iter;
+			break;
+		}
+	}
+
+	mutex_unlock(&vmbus_connection.channel_mutex);
+
+	return channel;
+}
+
 /*
  * vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
  *
@@ -903,11 +935,84 @@
 static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
 {
 	struct vmbus_channel_offer_channel *offer;
-	struct vmbus_channel *newchannel;
+	struct vmbus_channel *oldchannel, *newchannel;
+	size_t offer_sz;
 
 	offer = (struct vmbus_channel_offer_channel *)hdr;
 
 	trace_vmbus_onoffer(offer);
+
+	oldchannel = find_primary_channel_by_offer(offer);
+
+	if (oldchannel != NULL) {
+		/*
+		 * We're resuming from hibernation: all the sub-channel and
+		 * hv_sock channels we had before the hibernation should have
+		 * been cleaned up, and now we must be seeing a re-offered
+		 * primary channel that we had before the hibernation.
+		 */
+
+		/*
+		 * { Initially: channel relid = INVALID_RELID,
+		 *		channels[valid_relid] = NULL }
+		 *
+		 * CPU1					CPU2
+		 *
+		 * [vmbus_onoffer()]			[vmbus_device_release()]
+		 *
+		 * LOCK channel_mutex			LOCK channel_mutex
+		 * STORE channel relid = valid_relid	LOAD r1 = channel relid
+		 * MAP_RELID channel			if (r1 != INVALID_RELID)
+		 * UNLOCK channel_mutex			  UNMAP_RELID channel
+		 *					UNLOCK channel_mutex
+		 *
+		 * Forbids: r1 == valid_relid &&
+		 *		channels[valid_relid] == channel
+		 *
+		 * Note.  r1 can be INVALID_RELID only for an hv_sock channel.
+		 * None of the hv_sock channels which were present before the
+		 * suspend are re-offered upon the resume.  See the WARN_ON()
+		 * in hv_process_channel_removal().
+		 */
+		mutex_lock(&vmbus_connection.channel_mutex);
+
+		atomic_dec(&vmbus_connection.offer_in_progress);
+
+		WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID);
+		/* Fix up the relid. */
+		oldchannel->offermsg.child_relid = offer->child_relid;
+
+		offer_sz = sizeof(*offer);
+		if (memcmp(offer, &oldchannel->offermsg, offer_sz) != 0) {
+			/*
+			 * This is not an error, since the host can also change
+			 * the other field(s) of the offer, e.g. on WS RS5
+			 * (Build 17763), the offer->connection_id of the
+			 * Mellanox VF vmbus device can change when the host
+			 * reoffers the device upon resume.
+			 */
+			pr_debug("vmbus offer changed: relid=%d\n",
+				 offer->child_relid);
+
+			print_hex_dump_debug("Old vmbus offer: ",
+					     DUMP_PREFIX_OFFSET, 16, 4,
+					     &oldchannel->offermsg, offer_sz,
+					     false);
+			print_hex_dump_debug("New vmbus offer: ",
+					     DUMP_PREFIX_OFFSET, 16, 4,
+					     offer, offer_sz, false);
+
+			/* Fix up the old channel. */
+			vmbus_setup_channel_state(oldchannel, offer);
+		}
+
+		/* Add the channel back to the array of channels. */
+		vmbus_channel_map_relid(oldchannel);
+		check_ready_for_resume_event();
+
+		mutex_unlock(&vmbus_connection.channel_mutex);
+		return;
+	}
 
 	/* Allocate the channel object and save this offer. */
 	newchannel = alloc_channel();
@@ -918,23 +1023,19 @@
 		return;
 	}
 
-	/*
-	 * Setup state for signalling the host.
-	 */
-	newchannel->sig_event = VMBUS_EVENT_CONNECTION_ID;
-
-	if (vmbus_proto_version != VERSION_WS2008) {
-		newchannel->is_dedicated_interrupt =
-				(offer->is_dedicated_interrupt != 0);
-		newchannel->sig_event = offer->connection_id;
-	}
-
-	memcpy(&newchannel->offermsg, offer,
-	       sizeof(struct vmbus_channel_offer_channel));
-	newchannel->monitor_grp = (u8)offer->monitorid / 32;
-	newchannel->monitor_bit = (u8)offer->monitorid % 32;
+	vmbus_setup_channel_state(newchannel, offer);
 
 	vmbus_process_offer(newchannel);
+}
+
+static void check_ready_for_suspend_event(void)
+{
+	/*
+	 * If all the sub-channels or hv_sock channels have been cleaned up,
+	 * then it's safe to suspend.
+	 */
+	if (atomic_dec_and_test(&vmbus_connection.nr_chan_close_on_suspend))
+		complete(&vmbus_connection.ready_for_suspend_event);
 }
 
 /*
@@ -947,6 +1048,7 @@
 	struct vmbus_channel_rescind_offer *rescind;
 	struct vmbus_channel *channel;
 	struct device *dev;
+	bool clean_up_chan_for_suspend;
 
 	rescind = (struct vmbus_channel_rescind_offer *)hdr;
@@ -958,11 +1060,22 @@
 	 * offer comes in first and then the rescind.
 	 * Since we process these events in work elements,
 	 * and with preemption, we may end up processing
-	 * the events out of order. Given that we handle these
-	 * work elements on the same CPU, this is possible only
-	 * in the case of preemption. In any case wait here
-	 * until the offer processing has moved beyond the
-	 * point where the channel is discoverable.
+	 * the events out of order.  We rely on the synchronization
+	 * provided by offer_in_progress and by channel_mutex for
+	 * ordering these events:
+	 *
+	 * { Initially: offer_in_progress = 1 }
+	 *
+	 * CPU1				CPU2
+	 *
+	 * [vmbus_onoffer()]		[vmbus_onoffer_rescind()]
+	 *
+	 * LOCK channel_mutex		WAIT_ON offer_in_progress == 0
+	 * DECREMENT offer_in_progress	LOCK channel_mutex
+	 * STORE channels[]		LOAD channels[]
+	 * UNLOCK channel_mutex		UNLOCK channel_mutex
+	 *
+	 * Forbids: CPU2's LOAD from *not* seeing CPU1's STORE
 	 */
 
 	while (atomic_read(&vmbus_connection.offer_in_progress) != 0) {
@@ -986,6 +1099,8 @@
 		return;
 	}
 
+	clean_up_chan_for_suspend = is_hvsock_channel(channel) ||
+				    is_sub_channel(channel);
 	/*
 	 * Before setting channel->rescind in vmbus_rescind_cleanup(), we
 	 * should make sure the channel callback is not running any more.
@@ -1011,6 +1126,10 @@
 	if (channel->device_obj) {
 		if (channel->chn_rescind_callback) {
 			channel->chn_rescind_callback(channel);
+
+			if (clean_up_chan_for_suspend)
+				check_ready_for_suspend_event();
+
 			return;
 		}
 		/*
@@ -1036,12 +1155,17 @@
 			 * The channel is currently not open;
 			 * it is safe for us to cleanup the channel.
 			 */
-			hv_process_channel_removal(rescind->child_relid);
+			hv_process_channel_removal(channel);
 		} else {
 			complete(&channel->rescind_event);
 		}
 		mutex_unlock(&vmbus_connection.channel_mutex);
 	}
+
+	/* The "channel" may have been freed.  Do not access it any longer. */
+
+	if (clean_up_chan_for_suspend)
+		check_ready_for_suspend_event();
 }
 
 void vmbus_hvsock_device_unregister(struct vmbus_channel *channel)
@@ -1250,30 +1374,36 @@
 /* Channel message dispatch table */
 const struct vmbus_channel_message_table_entry
 channel_message_table[CHANNELMSG_COUNT] = {
-	{ CHANNELMSG_INVALID,			0, NULL },
-	{ CHANNELMSG_OFFERCHANNEL,		0, vmbus_onoffer },
-	{ CHANNELMSG_RESCIND_CHANNELOFFER,	0, vmbus_onoffer_rescind },
-	{ CHANNELMSG_REQUESTOFFERS,		0, NULL },
-	{ CHANNELMSG_ALLOFFERS_DELIVERED,	1, vmbus_onoffers_delivered },
-	{ CHANNELMSG_OPENCHANNEL,		0, NULL },
-	{ CHANNELMSG_OPENCHANNEL_RESULT,	1, vmbus_onopen_result },
-	{ CHANNELMSG_CLOSECHANNEL,		0, NULL },
-	{ CHANNELMSG_GPADL_HEADER,		0, NULL },
-	{ CHANNELMSG_GPADL_BODY,		0, NULL },
-	{ CHANNELMSG_GPADL_CREATED,		1, vmbus_ongpadl_created },
-	{ CHANNELMSG_GPADL_TEARDOWN,		0, NULL },
-	{ CHANNELMSG_GPADL_TORNDOWN,		1, vmbus_ongpadl_torndown },
-	{ CHANNELMSG_RELID_RELEASED,		0, NULL },
-	{ CHANNELMSG_INITIATE_CONTACT,		0, NULL },
-	{ CHANNELMSG_VERSION_RESPONSE,		1, vmbus_onversion_response },
-	{ CHANNELMSG_UNLOAD,			0, NULL },
-	{ CHANNELMSG_UNLOAD_RESPONSE,		1, vmbus_unload_response },
-	{ CHANNELMSG_18,			0, NULL },
-	{ CHANNELMSG_19,			0, NULL },
-	{ CHANNELMSG_20,			0, NULL },
-	{ CHANNELMSG_TL_CONNECT_REQUEST,	0, NULL },
-	{ CHANNELMSG_22,			0, NULL },
-	{ CHANNELMSG_TL_CONNECT_RESULT,		0, NULL },
+	{ CHANNELMSG_INVALID,			0, NULL, 0},
+	{ CHANNELMSG_OFFERCHANNEL,		0, vmbus_onoffer,
+		sizeof(struct vmbus_channel_offer_channel)},
+	{ CHANNELMSG_RESCIND_CHANNELOFFER,	0, vmbus_onoffer_rescind,
+		sizeof(struct vmbus_channel_rescind_offer) },
+	{ CHANNELMSG_REQUESTOFFERS,		0, NULL, 0},
+	{ CHANNELMSG_ALLOFFERS_DELIVERED,	1, vmbus_onoffers_delivered, 0},
+	{ CHANNELMSG_OPENCHANNEL,		0, NULL, 0},
+	{ CHANNELMSG_OPENCHANNEL_RESULT,	1, vmbus_onopen_result,
+		sizeof(struct vmbus_channel_open_result)},
+	{ CHANNELMSG_CLOSECHANNEL,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_HEADER,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_BODY,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_CREATED,		1, vmbus_ongpadl_created,
+		sizeof(struct vmbus_channel_gpadl_created)},
+	{ CHANNELMSG_GPADL_TEARDOWN,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_TORNDOWN,		1, vmbus_ongpadl_torndown,
+		sizeof(struct vmbus_channel_gpadl_torndown) },
+	{ CHANNELMSG_RELID_RELEASED,		0, NULL, 0},
+	{ CHANNELMSG_INITIATE_CONTACT,		0, NULL, 0},
+	{ CHANNELMSG_VERSION_RESPONSE,		1, vmbus_onversion_response,
+		sizeof(struct vmbus_channel_version_response)},
+	{ CHANNELMSG_UNLOAD,			0, NULL, 0},
+	{ CHANNELMSG_UNLOAD_RESPONSE,		1, vmbus_unload_response, 0},
+	{ CHANNELMSG_18,			0, NULL, 0},
+	{ CHANNELMSG_19,			0, NULL, 0},
+	{ CHANNELMSG_20,			0, NULL, 0},
+	{ CHANNELMSG_TL_CONNECT_REQUEST,	0, NULL, 0},
+	{ CHANNELMSG_MODIFYCHANNEL,		0, NULL, 0},
+	{ CHANNELMSG_TL_CONNECT_RESULT,		0, NULL, 0},
 };
 
 /*
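
Each table entry now carries a fourth column, a minimum payload length, which presumably lets the message-handling path reject a short host message before invoking its handler. An illustrative userspace sketch of that pattern (the kernel's table also carries a handler-type flag, omitted here; the sizes, names, and dispatch() function below are stand-ins, not the kernel's):

#include <stddef.h>
#include <stdio.h>

struct msg_entry {
	int message_type;
	void (*handler)(void *payload);
	size_t min_payload_len;
};

static void on_offer(void *payload)
{
	(void)payload;
	puts("offer handled");
}

static const struct msg_entry table[] = {
	{ 0, NULL,     0  },	/* INVALID */
	{ 1, on_offer, 24 },	/* OFFERCHANNEL, illustrative size */
};

static int dispatch(int type, void *payload, size_t len)
{
	const struct msg_entry *e = &table[type];

	if (len < e->min_payload_len)
		return -1;	/* drop a malformed/short message */
	if (e->handler)
		e->handler(payload);
	return 0;
}

int main(void)
{
	char payload[32] = { 0 };

	dispatch(1, payload, sizeof(payload));		/* long enough: handled */
	return dispatch(1, payload, 8) == -1 ? 0 : 1;	/* too short: rejected */
}
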
@@ -1281,13 +1411,8 @@
  *
  * This is invoked in the vmbus worker thread context.
  */
-void vmbus_onmessage(void *context)
+void vmbus_onmessage(struct vmbus_channel_message_header *hdr)
 {
-	struct hv_message *msg = context;
-	struct vmbus_channel_message_header *hdr;
-
-	hdr = (struct vmbus_channel_message_header *)msg->u.payload;
-
 	trace_vmbus_on_message(hdr);
 
 	/*
@@ -1332,49 +1457,6 @@
 
 	return ret;
 }
-
-/*
- * Retrieve the (sub) channel on which to send an outgoing request.
- * When a primary channel has multiple sub-channels, we try to
- * distribute the load equally amongst all available channels.
- */
-struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
-{
-	struct list_head *cur, *tmp;
-	int cur_cpu;
-	struct vmbus_channel *cur_channel;
-	struct vmbus_channel *outgoing_channel = primary;
-	int next_channel;
-	int i = 1;
-
-	if (list_empty(&primary->sc_list))
-		return outgoing_channel;
-
-	next_channel = primary->next_oc++;
-
-	if (next_channel > (primary->num_sc)) {
-		primary->next_oc = 0;
-		return outgoing_channel;
-	}
-
-	cur_cpu = hv_cpu_number_to_vp_number(smp_processor_id());
-	list_for_each_safe(cur, tmp, &primary->sc_list) {
-		cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
-		if (cur_channel->state != CHANNEL_OPENED_STATE)
-			continue;
-
-		if (cur_channel->target_vp == cur_cpu)
-			return cur_channel;
-
-		if (i == next_channel)
-			return cur_channel;
-
-		i++;
-	}
-
-	return outgoing_channel;
-}
-EXPORT_SYMBOL_GPL(vmbus_get_outgoing_channel);
 
 static void invoke_sc_cb(struct vmbus_channel *primary_channel)
 {