2024-05-10 37f49e37ab4cb5d0bc4c60eb5c6d4dd57db767bb
kernel/drivers/hv/channel_mgmt.c
@@ -1,18 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2009, Microsoft Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
  *
  * Authors:
  *   Haiyang Zhang <haiyangz@microsoft.com>
@@ -30,14 +18,15 @@
 #include <linux/module.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
+#include <linux/cpu.h>
 #include <linux/hyperv.h>
 #include <asm/mshyperv.h>
 
 #include "hyperv_vmbus.h"
 
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type);
+static void init_vp_index(struct vmbus_channel *channel);
 
-static const struct vmbus_device vmbus_devs[] = {
+const struct vmbus_device vmbus_devs[] = {
 	/* IDE */
 	{ .dev_type = HV_IDE,
 	  HV_IDE_GUID,
@@ -141,7 +130,7 @@
 };
 
 static const struct {
-	uuid_le guid;
+	guid_t guid;
 } vmbus_unsupported_devs[] = {
 	{ HV_AVMA1_GUID },
 	{ HV_AVMA2_GUID },
@@ -171,26 +160,26 @@
 	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
 }
 
-static bool is_unsupported_vmbus_devs(const uuid_le *guid)
+static bool is_unsupported_vmbus_devs(const guid_t *guid)
 {
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(vmbus_unsupported_devs); i++)
-		if (!uuid_le_cmp(*guid, vmbus_unsupported_devs[i].guid))
+		if (guid_equal(guid, &vmbus_unsupported_devs[i].guid))
 			return true;
 	return false;
 }
 
 static u16 hv_get_dev_type(const struct vmbus_channel *channel)
 {
-	const uuid_le *guid = &channel->offermsg.offer.if_type;
+	const guid_t *guid = &channel->offermsg.offer.if_type;
 	u16 i;
 
 	if (is_hvsock_channel(channel) || is_unsupported_vmbus_devs(guid))
 		return HV_UNKNOWN;
 
 	for (i = HV_IDE; i < HV_UNKNOWN; i++) {
-		if (!uuid_le_cmp(*guid, vmbus_devs[i].guid))
+		if (guid_equal(guid, &vmbus_devs[i].guid))
 			return i;
 	}
 	pr_info("Unknown GUID: %pUl\n", guid);
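
The hunk above mechanically converts `!uuid_le_cmp(a, b)` (a memcmp-style, by-value comparison that returns 0 on equality) into `guid_equal(&a, &b)` (a by-pointer boolean predicate), which removes the easy-to-misread `!cmp()` polarity. A minimal userspace sketch of the difference, with a stand-in type rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct { unsigned char b[16]; } guid_t;	/* stand-in, not the kernel type */

static bool guid_equal(const guid_t *a, const guid_t *b)
{
	return memcmp(a, b, sizeof(*a)) == 0;
}

int main(void)
{
	guid_t x = { { 0xde, 0xad } }, y = x;

	/* Old style: negated memcmp-like result, 0 means "equal". */
	printf("old style !cmp():  %d\n", !memcmp(&x, &y, sizeof(x)));
	/* New style: the intent is explicit in the predicate's name. */
	printf("new style equal(): %d\n", guid_equal(&x, &y));
	return 0;
}
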
@@ -198,24 +187,19 @@
 }
 
 /**
- * vmbus_prep_negotiate_resp() - Create default response for Hyper-V Negotiate message
+ * vmbus_prep_negotiate_resp() - Create default response for Negotiate message
  * @icmsghdrp: Pointer to msg header structure
- * @icmsg_negotiate: Pointer to negotiate message structure
  * @buf: Raw buffer channel data
+ * @fw_version: The framework versions we can support.
+ * @fw_vercnt: The size of @fw_version.
+ * @srv_version: The service versions we can support.
+ * @srv_vercnt: The size of @srv_version.
+ * @nego_fw_version: The selected framework version.
+ * @nego_srv_version: The selected service version.
  *
- * @icmsghdrp is of type &struct icmsg_hdr.
+ * Note: Versions are given in decreasing order.
+ *
  * Set up and fill in default negotiate response message.
- *
- * The fw_version and fw_vercnt specifies the framework version that
- * we can support.
- *
- * The srv_version and srv_vercnt specifies the service
- * versions we can support.
- *
- * Versions are given in decreasing order.
- *
- * nego_fw_version and nego_srv_version store the selected protocol versions.
- *
  * Mainly used by Hyper-V drivers.
  */
 bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
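
For context, this is roughly how the hv_util-style IC drivers of this era call the function; a hedged sketch only, since the exact call sites and version macros (UTIL_FW_VERSION, SD_VERSION, SD_VERSION_1 are borrowed from hv_util.c) may differ, and negotiate_example() is a hypothetical wrapper:

static const int fw_versions[] = { UTIL_FW_VERSION };
/* Per the kernel-doc above: service versions listed in decreasing order. */
static const int sd_versions[] = { SD_VERSION, SD_VERSION_1 };

static void negotiate_example(struct icmsg_hdr *icmsghdrp, u8 *buf)
{
	int nego_fw, nego_srv;

	if (!vmbus_prep_negotiate_resp(icmsghdrp, buf,
				       fw_versions, ARRAY_SIZE(fw_versions),
				       sd_versions, ARRAY_SIZE(sd_versions),
				       &nego_fw, &nego_srv))
		pr_err("version negotiation failed\n");
}
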
@@ -332,14 +316,15 @@
 	if (!channel)
 		return NULL;
 
-	spin_lock_init(&channel->lock);
+	spin_lock_init(&channel->sched_lock);
 	init_completion(&channel->rescind_event);
 
 	INIT_LIST_HEAD(&channel->sc_list);
-	INIT_LIST_HEAD(&channel->percpu_list);
 
 	tasklet_init(&channel->callback_event,
 		     vmbus_on_event, (unsigned long)channel);
+
+	hv_ringbuffer_pre_init(channel);
 
 	return channel;
 }
@@ -355,22 +340,48 @@
 	kobject_put(&channel->kobj);
 }
 
-static void percpu_channel_enq(void *arg)
+void vmbus_channel_map_relid(struct vmbus_channel *channel)
 {
-	struct vmbus_channel *channel = arg;
-	struct hv_per_cpu_context *hv_cpu
-		= this_cpu_ptr(hv_context.cpu_context);
-
-	list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list);
+	if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
+		return;
+	/*
+	 * The mapping of the channel's relid is visible from the CPUs that
+	 * execute vmbus_chan_sched() by the time that vmbus_chan_sched() will
+	 * execute:
+	 *
+	 *  (a) In the "normal (i.e., not resuming from hibernation)" path,
+	 *      the full barrier in virt_store_mb() guarantees that the store
+	 *      is propagated to all CPUs before the add_channel_work work
+	 *      is queued.  In turn, add_channel_work is queued before the
+	 *      channel's ring buffer is allocated/initialized and the
+	 *      OPENCHANNEL message for the channel is sent in vmbus_open().
+	 *      Hyper-V won't start sending the interrupts for the channel
+	 *      before the OPENCHANNEL message is acked.  The memory barrier
+	 *      in vmbus_chan_sched() -> sync_test_and_clear_bit() ensures
+	 *      that vmbus_chan_sched() must find the channel's relid in
+	 *      recv_int_page before retrieving the channel pointer from the
+	 *      array of channels.
+	 *
+	 *  (b) In the "resuming from hibernation" path, the virt_store_mb()
+	 *      guarantees that the store is propagated to all CPUs before
+	 *      the VMBus connection is marked as ready for the resume event
+	 *      (cf. check_ready_for_resume_event()).  The interrupt handler
+	 *      of the VMBus driver and vmbus_chan_sched() can not run before
+	 *      vmbus_bus_resume() has completed execution (cf. resume_noirq).
+	 */
+	virt_store_mb(
+		vmbus_connection.channels[channel->offermsg.child_relid],
+		channel);
 }
 
-static void percpu_channel_deq(void *arg)
+void vmbus_channel_unmap_relid(struct vmbus_channel *channel)
 {
-	struct vmbus_channel *channel = arg;
-
-	list_del_rcu(&channel->percpu_list);
+	if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
+		return;
+	WRITE_ONCE(
+		vmbus_connection.channels[channel->offermsg.child_relid],
+		NULL);
 }
-
 
 static void vmbus_release_relid(u32 relid)
 {
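
The virt_store_mb() publication above pairs with a plain load on the interrupt path. A self-contained userspace sketch of the same publish/lookup pattern using C11 atomics; the array size, function names, and the seq_cst store are stand-ins for the kernel's MAX_CHANNEL_RELIDS, vmbus_channel_map_relid()/relid2channel(), virt_store_mb(), and READ_ONCE():

#include <stdatomic.h>
#include <stddef.h>

#define MAX_CHANNEL_RELIDS 256		/* stand-in for the kernel constant */

struct channel { int relid; };

static _Atomic(struct channel *) channels[MAX_CHANNEL_RELIDS];

/* Writer: publish the pointer with a full barrier (virt_store_mb()). */
static void map_relid(struct channel *ch)
{
	if (ch->relid < 0 || ch->relid >= MAX_CHANNEL_RELIDS)
		return;
	atomic_store(&channels[ch->relid], ch);
}

/* Reader (the interrupt path in the kernel): a single load. */
static struct channel *lookup_relid(int relid)
{
	if (relid < 0 || relid >= MAX_CHANNEL_RELIDS)
		return NULL;
	return atomic_load(&channels[relid]);
}

int main(void)
{
	static struct channel ch = { .relid = 5 };

	map_relid(&ch);
	return lookup_relid(5) == &ch ? 0 : 1;
}
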
@@ -386,51 +397,49 @@
 	trace_vmbus_release_relid(&msg, ret);
 }
 
-void hv_process_channel_removal(u32 relid)
+void hv_process_channel_removal(struct vmbus_channel *channel)
 {
-	unsigned long flags;
-	struct vmbus_channel *primary_channel, *channel;
-
-	BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex));
-
-	/*
-	 * Make sure channel is valid as we may have raced.
-	 */
-	channel = relid2channel(relid);
-	if (!channel)
-		return;
-
+	lockdep_assert_held(&vmbus_connection.channel_mutex);
 	BUG_ON(!channel->rescind);
-	if (channel->target_cpu != get_cpu()) {
-		put_cpu();
-		smp_call_function_single(channel->target_cpu,
-					 percpu_channel_deq, channel, true);
-	} else {
-		percpu_channel_deq(channel);
-		put_cpu();
-	}
-
-	if (channel->primary_channel == NULL) {
-		list_del(&channel->listentry);
-
-		primary_channel = channel;
-	} else {
-		primary_channel = channel->primary_channel;
-		spin_lock_irqsave(&primary_channel->lock, flags);
-		list_del(&channel->sc_list);
-		primary_channel->num_sc--;
-		spin_unlock_irqrestore(&primary_channel->lock, flags);
-	}
 
 	/*
-	 * We need to free the bit for init_vp_index() to work in the case
-	 * of sub-channel, when we reload drivers like hv_netvsc.
+	 * hv_process_channel_removal() could find INVALID_RELID only for
+	 * hv_sock channels.  See the inline comments in vmbus_onoffer().
 	 */
-	if (channel->affinity_policy == HV_LOCALIZED)
-		cpumask_clear_cpu(channel->target_cpu,
-				  &primary_channel->alloced_cpus_in_node);
+	WARN_ON(channel->offermsg.child_relid == INVALID_RELID &&
+		!is_hvsock_channel(channel));
 
-	vmbus_release_relid(relid);
+	/*
+	 * Upon suspend, an in-use hv_sock channel is removed from the array of
+	 * channels and the relid is invalidated.  After hibernation, when the
+	 * user-space application destroys the channel, it's unnecessary and
+	 * unsafe to remove the channel from the array of channels.  See also
+	 * the inline comments before the call of vmbus_release_relid() below.
+	 */
+	if (channel->offermsg.child_relid != INVALID_RELID)
+		vmbus_channel_unmap_relid(channel);
+
+	if (channel->primary_channel == NULL)
+		list_del(&channel->listentry);
+	else
+		list_del(&channel->sc_list);
+
+	/*
+	 * If this is a "perf" channel, update the hv_numa_map[] masks so that
+	 * init_vp_index() can (re-)use the CPU.
+	 */
+	if (hv_is_perf_channel(channel))
+		hv_clear_alloced_cpu(channel->target_cpu);
+
+	/*
+	 * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and
+	 * the relid is invalidated; after hibernation, when the user-space app
+	 * destroys the channel, the relid is INVALID_RELID, and in this case
+	 * it's unnecessary and unsafe to release the old relid, since the same
+	 * relid can refer to a completely different channel now.
+	 */
+	if (channel->offermsg.child_relid != INVALID_RELID)
+		vmbus_release_relid(channel->offermsg.child_relid);
 
 	free_channel(channel);
 }
@@ -454,23 +463,7 @@
 	struct vmbus_channel *newchannel =
 		container_of(work, struct vmbus_channel, add_channel_work);
 	struct vmbus_channel *primary_channel = newchannel->primary_channel;
-	unsigned long flags;
-	u16 dev_type;
 	int ret;
-
-	dev_type = hv_get_dev_type(newchannel);
-
-	init_vp_index(newchannel, dev_type);
-
-	if (newchannel->target_cpu != get_cpu()) {
-		put_cpu();
-		smp_call_function_single(newchannel->target_cpu,
-					 percpu_channel_enq,
-					 newchannel, true);
-	} else {
-		percpu_channel_enq(newchannel);
-		put_cpu();
-	}
 
 	/*
 	 * This state is used to indicate a successful open
@@ -503,18 +496,22 @@
 	if (!newchannel->device_obj)
 		goto err_deq_chan;
 
-	newchannel->device_obj->device_id = dev_type;
+	newchannel->device_obj->device_id = newchannel->device_id;
 	/*
 	 * Add the new device to the bus. This will kick off device-driver
 	 * binding which eventually invokes the device driver's AddDevice()
 	 * method.
+	 *
+	 * If vmbus_device_register() fails, the 'device_obj' is freed in
+	 * vmbus_device_release() as called by device_unregister() in the
+	 * error path of vmbus_device_register().  In the outside error
+	 * path, there's no need to free it.
 	 */
 	ret = vmbus_device_register(newchannel->device_obj);
 
 	if (ret != 0) {
 		pr_err("unable to add child device object (relid %d)\n",
 			newchannel->offermsg.child_relid);
-		kfree(newchannel->device_obj);
 		goto err_deq_chan;
 	}
 
@@ -530,25 +527,15 @@
 	 */
 	newchannel->probe_done = true;
 
-	if (primary_channel == NULL) {
+	if (primary_channel == NULL)
 		list_del(&newchannel->listentry);
-	} else {
-		spin_lock_irqsave(&primary_channel->lock, flags);
+	else
 		list_del(&newchannel->sc_list);
-		spin_unlock_irqrestore(&primary_channel->lock, flags);
-	}
+
+	/* vmbus_process_offer() has mapped the channel. */
+	vmbus_channel_unmap_relid(newchannel);
 
 	mutex_unlock(&vmbus_connection.channel_mutex);
-
-	if (newchannel->target_cpu != get_cpu()) {
-		put_cpu();
-		smp_call_function_single(newchannel->target_cpu,
-					 percpu_channel_deq,
-					 newchannel, true);
-	} else {
-		percpu_channel_deq(newchannel);
-		put_cpu();
-	}
 
 	vmbus_release_relid(newchannel->offermsg.child_relid);
@@ -563,10 +550,40 @@
 {
 	struct vmbus_channel *channel;
 	struct workqueue_struct *wq;
-	unsigned long flags;
 	bool fnew = true;
 
+	/*
+	 * Synchronize vmbus_process_offer() and CPU hotplugging:
+	 *
+	 * CPU1				CPU2
+	 *
+	 * [vmbus_process_offer()]	[Hot removal of the CPU]
+	 *
+	 * CPUS_READ_LOCK		CPUS_WRITE_LOCK
+	 * LOAD cpu_online_mask		SEARCH chn_list
+	 * STORE target_cpu		LOAD target_cpu
+	 * INSERT chn_list		STORE cpu_online_mask
+	 * CPUS_READ_UNLOCK		CPUS_WRITE_UNLOCK
+	 *
+	 * Forbids: CPU1's LOAD from *not* seeing CPU2's STORE &&
+	 *		CPU2's SEARCH from *not* seeing CPU1's INSERT
+	 *
+	 * Forbids: CPU2's SEARCH from seeing CPU1's INSERT &&
+	 *		CPU2's LOAD from *not* seeing CPU1's STORE
	 */
+	cpus_read_lock();
+
+	/*
+	 * Serializes the modifications of the chn_list list as well as
+	 * the accesses to next_numa_node_id in init_vp_index().
+	 */
 	mutex_lock(&vmbus_connection.channel_mutex);
+
+	init_vp_index(newchannel);
+
+	/* Remember the channels that should be cleaned up upon suspend. */
+	if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel))
+		atomic_inc(&vmbus_connection.nr_chan_close_on_suspend);
 
 	/*
 	 * Now that we have acquired the channel_mutex,
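
A compilable userspace sketch of the invariant in the comment above, with a pthread rwlock standing in for the CPU-hotplug lock; for brevity it collapses channel_mutex and the hotplug lock into the one rwlock, which the kernel code does not do, and all names are stand-ins:

#include <pthread.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long cpu_online_mask = 0xfUL;	/* CPUs 0-3 online */

struct channel { int target_cpu; struct channel *next; };
static struct channel *chn_list;

static void process_offer(struct channel *ch, int cpu)
{
	pthread_rwlock_rdlock(&hotplug_lock);	/* cpus_read_lock() */
	if (cpu_online_mask & (1UL << cpu))	/* LOAD cpu_online_mask */
		ch->target_cpu = cpu;		/* STORE target_cpu */
	ch->next = chn_list;			/* INSERT chn_list */
	chn_list = ch;
	pthread_rwlock_unlock(&hotplug_lock);	/* cpus_read_unlock() */
}

static void hot_remove_cpu(int cpu)
{
	pthread_rwlock_wrlock(&hotplug_lock);	/* cpus_write_lock() */
	for (struct channel *ch = chn_list; ch; ch = ch->next)
		if (ch->target_cpu == cpu)	/* SEARCH chn_list, LOAD target_cpu */
			ch->target_cpu = 0;	/* rebind before offlining */
	cpu_online_mask &= ~(1UL << cpu);	/* STORE cpu_online_mask */
	pthread_rwlock_unlock(&hotplug_lock);	/* cpus_write_unlock() */
}

int main(void)
{
	struct channel ch = { 0 };

	process_offer(&ch, 2);
	hot_remove_cpu(2);
	return ch.target_cpu;	/* 0: the channel no longer targets CPU 2 */
}

Either the offer sees the CPU already gone, or the hot-removal sees the freshly inserted channel; the lock rules out the window where both sides miss each other's store.
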
@@ -575,24 +592,25 @@
 	atomic_dec(&vmbus_connection.offer_in_progress);
 
 	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
-		if (!uuid_le_cmp(channel->offermsg.offer.if_type,
-				 newchannel->offermsg.offer.if_type) &&
-		    !uuid_le_cmp(channel->offermsg.offer.if_instance,
-				 newchannel->offermsg.offer.if_instance)) {
+		if (guid_equal(&channel->offermsg.offer.if_type,
+			       &newchannel->offermsg.offer.if_type) &&
+		    guid_equal(&channel->offermsg.offer.if_instance,
+			       &newchannel->offermsg.offer.if_instance)) {
 			fnew = false;
 			break;
 		}
 	}
 
-	if (fnew)
+	if (fnew) {
 		list_add_tail(&newchannel->listentry,
 			      &vmbus_connection.chn_list);
-	else {
+	} else {
 		/*
 		 * Check to see if this is a valid sub-channel.
 		 */
 		if (newchannel->offermsg.offer.sub_channel_index == 0) {
 			mutex_unlock(&vmbus_connection.channel_mutex);
+			cpus_read_unlock();
 			/*
 			 * Don't call free_channel(), because newchannel->kobj
 			 * is not initialized yet.
@@ -605,12 +623,13 @@
 		 * Process the sub-channel.
 		 */
 		newchannel->primary_channel = channel;
-		spin_lock_irqsave(&channel->lock, flags);
 		list_add_tail(&newchannel->sc_list, &channel->sc_list);
-		spin_unlock_irqrestore(&channel->lock, flags);
 	}
 
+	vmbus_channel_map_relid(newchannel);
+
 	mutex_unlock(&vmbus_connection.channel_mutex);
+	cpus_read_unlock();
 
 	/*
 	 * vmbus_process_offer() mustn't call channel->sc_creation_callback()
@@ -643,73 +662,57 @@
  * We use this state to statically distribute the channel interrupt load.
  */
 static int next_numa_node_id;
-/*
- * init_vp_index() accesses global variables like next_numa_node_id, and
- * it can run concurrently for primary channels and sub-channels: see
- * vmbus_process_offer(), so we need the lock to protect the global
- * variables.
- */
-static DEFINE_SPINLOCK(bind_channel_to_cpu_lock);
 
 /*
  * Starting with Win8, we can statically distribute the incoming
  * channel interrupt load by binding a channel to VCPU.
- * We distribute the interrupt loads to one or more NUMA nodes based on
- * the channel's affinity_policy.
  *
  * For pre-win8 hosts or non-performance critical channels we assign the
- * first CPU in the first NUMA node.
+ * VMBUS_CONNECT_CPU.
+ *
+ * Starting with win8, performance critical channels will be distributed
+ * evenly among all the available NUMA nodes.  Once the node is assigned,
+ * we will assign the CPU based on a simple round robin scheme.
  */
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
+static void init_vp_index(struct vmbus_channel *channel)
 {
-	u32 cur_cpu;
-	bool perf_chn = vmbus_devs[dev_type].perf_device;
-	struct vmbus_channel *primary = channel->primary_channel;
-	int next_node;
+	bool perf_chn = hv_is_perf_channel(channel);
 	cpumask_var_t available_mask;
 	struct cpumask *alloced_mask;
+	u32 target_cpu;
+	int numa_node;
 
 	if ((vmbus_proto_version == VERSION_WS2008) ||
 	    (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) ||
 	    !alloc_cpumask_var(&available_mask, GFP_KERNEL)) {
 		/*
 		 * Prior to win8, all channel interrupts are
-		 * delivered on cpu 0.
+		 * delivered on VMBUS_CONNECT_CPU.
 		 * Also if the channel is not a performance critical
-		 * channel, bind it to cpu 0.
-		 * In case alloc_cpumask_var() fails, bind it to cpu 0.
+		 * channel, bind it to VMBUS_CONNECT_CPU.
+		 * In case alloc_cpumask_var() fails, bind it to
+		 * VMBUS_CONNECT_CPU.
 		 */
-		channel->numa_node = 0;
-		channel->target_cpu = 0;
-		channel->target_vp = hv_cpu_number_to_vp_number(0);
+		channel->target_cpu = VMBUS_CONNECT_CPU;
+		if (perf_chn)
+			hv_set_alloced_cpu(VMBUS_CONNECT_CPU);
 		return;
 	}
 
-	spin_lock(&bind_channel_to_cpu_lock);
-
-	/*
-	 * Based on the channel affinity policy, we will assign the NUMA
-	 * nodes.
-	 */
-
-	if ((channel->affinity_policy == HV_BALANCED) || (!primary)) {
-		while (true) {
-			next_node = next_numa_node_id++;
-			if (next_node == nr_node_ids) {
-				next_node = next_numa_node_id = 0;
-				continue;
-			}
-			if (cpumask_empty(cpumask_of_node(next_node)))
-				continue;
-			break;
+	while (true) {
+		numa_node = next_numa_node_id++;
+		if (numa_node == nr_node_ids) {
+			next_numa_node_id = 0;
+			continue;
 		}
-		channel->numa_node = next_node;
-		primary = channel;
+		if (cpumask_empty(cpumask_of_node(numa_node)))
+			continue;
+		break;
 	}
-	alloced_mask = &hv_context.hv_numa_map[primary->numa_node];
+	alloced_mask = &hv_context.hv_numa_map[numa_node];
 
 	if (cpumask_weight(alloced_mask) ==
-	    cpumask_weight(cpumask_of_node(primary->numa_node))) {
+	    cpumask_weight(cpumask_of_node(numa_node))) {
 		/*
 		 * We have cycled through all the CPUs in the node;
 		 * reset the alloced map.
@@ -717,59 +720,12 @@
 		cpumask_clear(alloced_mask);
 	}
 
-	cpumask_xor(available_mask, alloced_mask,
-		    cpumask_of_node(primary->numa_node));
+	cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
 
-	cur_cpu = -1;
+	target_cpu = cpumask_first(available_mask);
+	cpumask_set_cpu(target_cpu, alloced_mask);
 
-	if (primary->affinity_policy == HV_LOCALIZED) {
-		/*
-		 * Normally Hyper-V host doesn't create more subchannels
-		 * than there are VCPUs on the node but it is possible when not
-		 * all present VCPUs on the node are initialized by guest.
-		 * Clear the alloced_cpus_in_node to start over.
-		 */
-		if (cpumask_equal(&primary->alloced_cpus_in_node,
-				  cpumask_of_node(primary->numa_node)))
-			cpumask_clear(&primary->alloced_cpus_in_node);
-	}
-
-	while (true) {
-		cur_cpu = cpumask_next(cur_cpu, available_mask);
-		if (cur_cpu >= nr_cpu_ids) {
-			cur_cpu = -1;
-			cpumask_copy(available_mask,
-				     cpumask_of_node(primary->numa_node));
-			continue;
-		}
-
-		if (primary->affinity_policy == HV_LOCALIZED) {
-			/*
-			 * NOTE: in the case of sub-channel, we clear the
-			 * sub-channel related bit(s) in
-			 * primary->alloced_cpus_in_node in
-			 * hv_process_channel_removal(), so when we
-			 * reload drivers like hv_netvsc in SMP guest, here
-			 * we're able to re-allocate
-			 * bit from primary->alloced_cpus_in_node.
-			 */
-			if (!cpumask_test_cpu(cur_cpu,
-					      &primary->alloced_cpus_in_node)) {
-				cpumask_set_cpu(cur_cpu,
-						&primary->alloced_cpus_in_node);
-				cpumask_set_cpu(cur_cpu, alloced_mask);
-				break;
-			}
-		} else {
-			cpumask_set_cpu(cur_cpu, alloced_mask);
-			break;
-		}
-	}
-
-	channel->target_cpu = cur_cpu;
-	channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu);
-
-	spin_unlock(&bind_channel_to_cpu_lock);
+	channel->target_cpu = target_cpu;
 
 	free_cpumask_var(available_mask);
 }
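
A small userspace model of the round-robin scheme the rewritten init_vp_index() implements: rotate across NUMA nodes, take the first not-yet-alloced CPU of the chosen node, and reset the node's mask once every CPU has been used. Plain bitmasks stand in for cpumasks, and the kernel's skip-empty-node loop is simplified to a modulo:

#include <stdio.h>

#define NR_NODES 2

static unsigned int node_cpus[NR_NODES] = { 0x0f, 0xf0 };	/* CPUs per node */
static unsigned int node_alloced[NR_NODES];			/* hv_numa_map[] stand-in */
static int next_numa_node_id;

static int assign_cpu(void)
{
	int node = next_numa_node_id++ % NR_NODES;
	unsigned int avail;
	int cpu;

	if (node_alloced[node] == node_cpus[node])	/* node exhausted... */
		node_alloced[node] = 0;			/* ...reset the map */
	avail = node_alloced[node] ^ node_cpus[node];	/* cpumask_xor() */
	for (cpu = 0; cpu < 8; cpu++)
		if (avail & (1U << cpu)) {		/* cpumask_first() */
			node_alloced[node] |= 1U << cpu;
			return cpu;			/* the target_cpu */
		}
	return 0;
}

int main(void)
{
	/* Eight channels spread as 0, 4, 1, 5, 2, 6, 3, 7. */
	for (int i = 0; i < 8; i++)
		printf("channel %d -> cpu %d\n", i, assign_cpu());
	return 0;
}
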
@@ -809,11 +765,22 @@
 	if (completion_done(&vmbus_connection.unload_event))
 		goto completed;
 
-	for_each_online_cpu(cpu) {
+	for_each_present_cpu(cpu) {
 		struct hv_per_cpu_context *hv_cpu
 			= per_cpu_ptr(hv_context.cpu_context, cpu);
 
+		/*
+		 * In a CoCo VM the synic_message_page is not allocated
+		 * in hv_synic_alloc().  Instead it is set/cleared in
+		 * hv_synic_enable_regs() and hv_synic_disable_regs()
+		 * such that it is set only when the CPU is online.  If
+		 * not all present CPUs are online, the message page
+		 * might be NULL, so skip such CPUs.
+		 */
 		page_addr = hv_cpu->synic_message_page;
+		if (!page_addr)
+			continue;
+
 		msg = (struct hv_message *)page_addr
 			+ VMBUS_MESSAGE_SINT;
 
@@ -847,11 +814,14 @@
 	 * maybe-pending messages on all CPUs to be able to receive new
 	 * messages after we reconnect.
 	 */
-	for_each_online_cpu(cpu) {
+	for_each_present_cpu(cpu) {
 		struct hv_per_cpu_context *hv_cpu
 			= per_cpu_ptr(hv_context.cpu_context, cpu);
 
 		page_addr = hv_cpu->synic_message_page;
+		if (!page_addr)
+			continue;
+
 		msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
 		msg->header.message_type = HVMSG_NONE;
 	}
@@ -896,6 +866,68 @@
 	vmbus_wait_for_unload();
 }
 
+static void check_ready_for_resume_event(void)
+{
+	/*
+	 * If all the old primary channels have been fixed up, then it's safe
+	 * to resume.
+	 */
+	if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume))
+		complete(&vmbus_connection.ready_for_resume_event);
+}
+
+static void vmbus_setup_channel_state(struct vmbus_channel *channel,
+				      struct vmbus_channel_offer_channel *offer)
+{
+	/*
+	 * Setup state for signalling the host.
+	 */
+	channel->sig_event = VMBUS_EVENT_CONNECTION_ID;
+
+	if (vmbus_proto_version != VERSION_WS2008) {
+		channel->is_dedicated_interrupt =
+				(offer->is_dedicated_interrupt != 0);
+		channel->sig_event = offer->connection_id;
+	}
+
+	memcpy(&channel->offermsg, offer,
+	       sizeof(struct vmbus_channel_offer_channel));
+	channel->monitor_grp = (u8)offer->monitorid / 32;
+	channel->monitor_bit = (u8)offer->monitorid % 32;
+	channel->device_id = hv_get_dev_type(channel);
+}
+
+/*
+ * find_primary_channel_by_offer - Get the channel object given the new offer.
+ * This is only used in the resume path of hibernation.
+ */
+static struct vmbus_channel *
+find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer)
+{
+	struct vmbus_channel *channel = NULL, *iter;
+	const guid_t *inst1, *inst2;
+
+	/* Ignore sub-channel offers. */
+	if (offer->offer.sub_channel_index != 0)
+		return NULL;
+
+	mutex_lock(&vmbus_connection.channel_mutex);
+
+	list_for_each_entry(iter, &vmbus_connection.chn_list, listentry) {
+		inst1 = &iter->offermsg.offer.if_instance;
+		inst2 = &offer->offer.if_instance;
+
+		if (guid_equal(inst1, inst2)) {
+			channel = iter;
+			break;
+		}
+	}
+
+	mutex_unlock(&vmbus_connection.channel_mutex);
+
+	return channel;
+}
+
 /*
  * vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
  *
@@ -903,11 +935,84 @@
 static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
 {
 	struct vmbus_channel_offer_channel *offer;
-	struct vmbus_channel *newchannel;
+	struct vmbus_channel *oldchannel, *newchannel;
+	size_t offer_sz;
 
 	offer = (struct vmbus_channel_offer_channel *)hdr;
 
 	trace_vmbus_onoffer(offer);
+
+	oldchannel = find_primary_channel_by_offer(offer);
+
+	if (oldchannel != NULL) {
+		/*
+		 * We're resuming from hibernation: all the sub-channel and
+		 * hv_sock channels we had before the hibernation should have
+		 * been cleaned up, and now we must be seeing a re-offered
+		 * primary channel that we had before the hibernation.
+		 */
+
+		/*
+		 * { Initially: channel relid = INVALID_RELID,
+		 *		channels[valid_relid] = NULL }
+		 *
+		 * CPU1					CPU2
+		 *
+		 * [vmbus_onoffer()]			[vmbus_device_release()]
+		 *
+		 * LOCK channel_mutex			LOCK channel_mutex
+		 * STORE channel relid = valid_relid	LOAD r1 = channel relid
+		 * MAP_RELID channel			if (r1 != INVALID_RELID)
+		 * UNLOCK channel_mutex			  UNMAP_RELID channel
+		 *					UNLOCK channel_mutex
+		 *
+		 * Forbids: r1 == valid_relid &&
+		 *		channels[valid_relid] == channel
+		 *
+		 * Note.  r1 can be INVALID_RELID only for an hv_sock channel.
+		 * None of the hv_sock channels which were present before the
+		 * suspend are re-offered upon the resume.  See the WARN_ON()
+		 * in hv_process_channel_removal().
+		 */
+		mutex_lock(&vmbus_connection.channel_mutex);
+
+		atomic_dec(&vmbus_connection.offer_in_progress);
+
+		WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID);
+		/* Fix up the relid. */
+		oldchannel->offermsg.child_relid = offer->child_relid;
+
+		offer_sz = sizeof(*offer);
+		if (memcmp(offer, &oldchannel->offermsg, offer_sz) != 0) {
+			/*
+			 * This is not an error, since the host can also change
+			 * the other field(s) of the offer, e.g. on WS RS5
+			 * (Build 17763), the offer->connection_id of the
+			 * Mellanox VF vmbus device can change when the host
+			 * reoffers the device upon resume.
+			 */
+			pr_debug("vmbus offer changed: relid=%d\n",
+				 offer->child_relid);
+
+			print_hex_dump_debug("Old vmbus offer: ",
+					     DUMP_PREFIX_OFFSET, 16, 4,
+					     &oldchannel->offermsg, offer_sz,
+					     false);
+			print_hex_dump_debug("New vmbus offer: ",
+					     DUMP_PREFIX_OFFSET, 16, 4,
+					     offer, offer_sz, false);
+
+			/* Fix up the old channel. */
+			vmbus_setup_channel_state(oldchannel, offer);
+		}
+
+		/* Add the channel back to the array of channels. */
+		vmbus_channel_map_relid(oldchannel);
+		check_ready_for_resume_event();
+
+		mutex_unlock(&vmbus_connection.channel_mutex);
+		return;
+	}
 
 	/* Allocate the channel object and save this offer. */
 	newchannel = alloc_channel();
@@ -918,23 +1023,19 @@
 		return;
 	}
 
-	/*
-	 * Setup state for signalling the host.
-	 */
-	newchannel->sig_event = VMBUS_EVENT_CONNECTION_ID;
-
-	if (vmbus_proto_version != VERSION_WS2008) {
-		newchannel->is_dedicated_interrupt =
-				(offer->is_dedicated_interrupt != 0);
-		newchannel->sig_event = offer->connection_id;
-	}
-
-	memcpy(&newchannel->offermsg, offer,
-	       sizeof(struct vmbus_channel_offer_channel));
-	newchannel->monitor_grp = (u8)offer->monitorid / 32;
-	newchannel->monitor_bit = (u8)offer->monitorid % 32;
+	vmbus_setup_channel_state(newchannel, offer);
 
 	vmbus_process_offer(newchannel);
+}
+
+static void check_ready_for_suspend_event(void)
+{
+	/*
+	 * If all the sub-channels or hv_sock channels have been cleaned up,
+	 * then it's safe to suspend.
+	 */
+	if (atomic_dec_and_test(&vmbus_connection.nr_chan_close_on_suspend))
+		complete(&vmbus_connection.ready_for_suspend_event);
 }
 
 /*
@@ -947,6 +1048,7 @@
 	struct vmbus_channel_rescind_offer *rescind;
 	struct vmbus_channel *channel;
 	struct device *dev;
+	bool clean_up_chan_for_suspend;
 
 	rescind = (struct vmbus_channel_rescind_offer *)hdr;
@@ -958,11 +1060,22 @@
 	 * offer comes in first and then the rescind.
 	 * Since we process these events in work elements,
 	 * and with preemption, we may end up processing
-	 * the events out of order. Given that we handle these
-	 * work elements on the same CPU, this is possible only
-	 * in the case of preemption. In any case wait here
-	 * until the offer processing has moved beyond the
-	 * point where the channel is discoverable.
+	 * the events out of order.  We rely on the synchronization
+	 * provided by offer_in_progress and by channel_mutex for
+	 * ordering these events:
+	 *
+	 * { Initially: offer_in_progress = 1 }
+	 *
+	 * CPU1				CPU2
+	 *
+	 * [vmbus_onoffer()]		[vmbus_onoffer_rescind()]
+	 *
+	 * LOCK channel_mutex		WAIT_ON offer_in_progress == 0
+	 * DECREMENT offer_in_progress	LOCK channel_mutex
+	 * STORE channels[]		LOAD channels[]
+	 * UNLOCK channel_mutex		UNLOCK channel_mutex
+	 *
+	 * Forbids: CPU2's LOAD from *not* seeing CPU1's STORE
 	 */
 
 	while (atomic_read(&vmbus_connection.offer_in_progress) != 0) {
@@ -986,6 +1099,8 @@
 		return;
 	}
 
+	clean_up_chan_for_suspend = is_hvsock_channel(channel) ||
+				    is_sub_channel(channel);
 	/*
 	 * Before setting channel->rescind in vmbus_rescind_cleanup(), we
 	 * should make sure the channel callback is not running any more.
@@ -1011,6 +1126,10 @@
 	if (channel->device_obj) {
 		if (channel->chn_rescind_callback) {
 			channel->chn_rescind_callback(channel);
+
+			if (clean_up_chan_for_suspend)
+				check_ready_for_suspend_event();
+
 			return;
 		}
 		/*
@@ -1036,12 +1155,17 @@
 			 * The channel is currently not open;
 			 * it is safe for us to cleanup the channel.
 			 */
-			hv_process_channel_removal(rescind->child_relid);
+			hv_process_channel_removal(channel);
 		} else {
 			complete(&channel->rescind_event);
 		}
 		mutex_unlock(&vmbus_connection.channel_mutex);
 	}
+
+	/* The "channel" may have been freed.  Do not access it any longer. */
+
+	if (clean_up_chan_for_suspend)
+		check_ready_for_suspend_event();
 }
 
 void vmbus_hvsock_device_unregister(struct vmbus_channel *channel)
@@ -1250,30 +1374,36 @@
 /* Channel message dispatch table */
 const struct vmbus_channel_message_table_entry
 channel_message_table[CHANNELMSG_COUNT] = {
-	{ CHANNELMSG_INVALID,			0, NULL },
-	{ CHANNELMSG_OFFERCHANNEL,		0, vmbus_onoffer },
-	{ CHANNELMSG_RESCIND_CHANNELOFFER,	0, vmbus_onoffer_rescind },
-	{ CHANNELMSG_REQUESTOFFERS,		0, NULL },
-	{ CHANNELMSG_ALLOFFERS_DELIVERED,	1, vmbus_onoffers_delivered },
-	{ CHANNELMSG_OPENCHANNEL,		0, NULL },
-	{ CHANNELMSG_OPENCHANNEL_RESULT,	1, vmbus_onopen_result },
-	{ CHANNELMSG_CLOSECHANNEL,		0, NULL },
-	{ CHANNELMSG_GPADL_HEADER,		0, NULL },
-	{ CHANNELMSG_GPADL_BODY,		0, NULL },
-	{ CHANNELMSG_GPADL_CREATED,		1, vmbus_ongpadl_created },
-	{ CHANNELMSG_GPADL_TEARDOWN,		0, NULL },
-	{ CHANNELMSG_GPADL_TORNDOWN,		1, vmbus_ongpadl_torndown },
-	{ CHANNELMSG_RELID_RELEASED,		0, NULL },
-	{ CHANNELMSG_INITIATE_CONTACT,		0, NULL },
-	{ CHANNELMSG_VERSION_RESPONSE,		1, vmbus_onversion_response },
-	{ CHANNELMSG_UNLOAD,			0, NULL },
-	{ CHANNELMSG_UNLOAD_RESPONSE,		1, vmbus_unload_response },
-	{ CHANNELMSG_18,			0, NULL },
-	{ CHANNELMSG_19,			0, NULL },
-	{ CHANNELMSG_20,			0, NULL },
-	{ CHANNELMSG_TL_CONNECT_REQUEST,	0, NULL },
-	{ CHANNELMSG_22,			0, NULL },
-	{ CHANNELMSG_TL_CONNECT_RESULT,		0, NULL },
+	{ CHANNELMSG_INVALID,			0, NULL, 0},
+	{ CHANNELMSG_OFFERCHANNEL,		0, vmbus_onoffer,
+		sizeof(struct vmbus_channel_offer_channel)},
+	{ CHANNELMSG_RESCIND_CHANNELOFFER,	0, vmbus_onoffer_rescind,
+		sizeof(struct vmbus_channel_rescind_offer) },
+	{ CHANNELMSG_REQUESTOFFERS,		0, NULL, 0},
+	{ CHANNELMSG_ALLOFFERS_DELIVERED,	1, vmbus_onoffers_delivered, 0},
+	{ CHANNELMSG_OPENCHANNEL,		0, NULL, 0},
+	{ CHANNELMSG_OPENCHANNEL_RESULT,	1, vmbus_onopen_result,
+		sizeof(struct vmbus_channel_open_result)},
+	{ CHANNELMSG_CLOSECHANNEL,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_HEADER,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_BODY,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_CREATED,		1, vmbus_ongpadl_created,
+		sizeof(struct vmbus_channel_gpadl_created)},
+	{ CHANNELMSG_GPADL_TEARDOWN,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_TORNDOWN,		1, vmbus_ongpadl_torndown,
+		sizeof(struct vmbus_channel_gpadl_torndown) },
+	{ CHANNELMSG_RELID_RELEASED,		0, NULL, 0},
+	{ CHANNELMSG_INITIATE_CONTACT,		0, NULL, 0},
+	{ CHANNELMSG_VERSION_RESPONSE,		1, vmbus_onversion_response,
+		sizeof(struct vmbus_channel_version_response)},
+	{ CHANNELMSG_UNLOAD,			0, NULL, 0},
+	{ CHANNELMSG_UNLOAD_RESPONSE,		1, vmbus_unload_response, 0},
+	{ CHANNELMSG_18,			0, NULL, 0},
+	{ CHANNELMSG_19,			0, NULL, 0},
+	{ CHANNELMSG_20,			0, NULL, 0},
+	{ CHANNELMSG_TL_CONNECT_REQUEST,	0, NULL, 0},
+	{ CHANNELMSG_MODIFYCHANNEL,		0, NULL, 0},
+	{ CHANNELMSG_TL_CONNECT_RESULT,		0, NULL, 0},
 };
 
 /*
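
Each table entry now carries a fourth column, a minimum payload length, which presumably lets the message-handling path reject a short host message before invoking its handler. An illustrative userspace sketch of that pattern (the kernel's table also carries a handler-type flag, omitted here; the sizes, names, and dispatch() function below are stand-ins, not the kernel's):

#include <stddef.h>
#include <stdio.h>

struct msg_entry {
	int message_type;
	void (*handler)(void *payload);
	size_t min_payload_len;
};

static void on_offer(void *payload)
{
	(void)payload;
	puts("offer handled");
}

static const struct msg_entry table[] = {
	{ 0, NULL,     0  },	/* INVALID */
	{ 1, on_offer, 24 },	/* OFFERCHANNEL, illustrative size */
};

static int dispatch(int type, void *payload, size_t len)
{
	const struct msg_entry *e = &table[type];

	if (len < e->min_payload_len)
		return -1;	/* drop a malformed/short message */
	if (e->handler)
		e->handler(payload);
	return 0;
}

int main(void)
{
	char payload[32] = { 0 };

	dispatch(1, payload, sizeof(payload));		/* long enough: handled */
	return dispatch(1, payload, 8) == -1 ? 0 : 1;	/* too short: rejected */
}
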
@@ -1281,13 +1411,8 @@
  *
  * This is invoked in the vmbus worker thread context.
  */
-void vmbus_onmessage(void *context)
+void vmbus_onmessage(struct vmbus_channel_message_header *hdr)
 {
-	struct hv_message *msg = context;
-	struct vmbus_channel_message_header *hdr;
-
-	hdr = (struct vmbus_channel_message_header *)msg->u.payload;
-
 	trace_vmbus_on_message(hdr);
 
 	/*
@@ -1332,49 +1457,6 @@
 
 	return ret;
 }
-
-/*
- * Retrieve the (sub) channel on which to send an outgoing request.
- * When a primary channel has multiple sub-channels, we try to
- * distribute the load equally amongst all available channels.
- */
-struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
-{
-	struct list_head *cur, *tmp;
-	int cur_cpu;
-	struct vmbus_channel *cur_channel;
-	struct vmbus_channel *outgoing_channel = primary;
-	int next_channel;
-	int i = 1;
-
-	if (list_empty(&primary->sc_list))
-		return outgoing_channel;
-
-	next_channel = primary->next_oc++;
-
-	if (next_channel > (primary->num_sc)) {
-		primary->next_oc = 0;
-		return outgoing_channel;
-	}
-
-	cur_cpu = hv_cpu_number_to_vp_number(smp_processor_id());
-	list_for_each_safe(cur, tmp, &primary->sc_list) {
-		cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
-		if (cur_channel->state != CHANNEL_OPENED_STATE)
-			continue;
-
-		if (cur_channel->target_vp == cur_cpu)
-			return cur_channel;
-
-		if (i == next_channel)
-			return cur_channel;
-
-		i++;
-	}
-
-	return outgoing_channel;
-}
-EXPORT_SYMBOL_GPL(vmbus_get_outgoing_channel);
 
 static void invoke_sc_cb(struct vmbus_channel *primary_channel)
 {