forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/drivers/hv/channel_mgmt.c
@@ -1,18 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Copyright (c) 2009, Microsoft Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
  *
  * Authors:
  *   Haiyang Zhang <haiyangz@microsoft.com>
@@ -30,14 +18,15 @@
 #include <linux/module.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
+#include <linux/cpu.h>
 #include <linux/hyperv.h>
 #include <asm/mshyperv.h>
 
 #include "hyperv_vmbus.h"
 
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type);
+static void init_vp_index(struct vmbus_channel *channel);
 
-static const struct vmbus_device vmbus_devs[] = {
+const struct vmbus_device vmbus_devs[] = {
 	/* IDE */
 	{ .dev_type = HV_IDE,
 	  HV_IDE_GUID,
@@ -141,7 +130,7 @@
 };
 
 static const struct {
-	uuid_le guid;
+	guid_t guid;
 } vmbus_unsupported_devs[] = {
 	{ HV_AVMA1_GUID },
 	{ HV_AVMA2_GUID },
@@ -171,26 +160,26 @@
 	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
 }
 
-static bool is_unsupported_vmbus_devs(const uuid_le *guid)
+static bool is_unsupported_vmbus_devs(const guid_t *guid)
 {
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(vmbus_unsupported_devs); i++)
-		if (!uuid_le_cmp(*guid, vmbus_unsupported_devs[i].guid))
+		if (guid_equal(guid, &vmbus_unsupported_devs[i].guid))
 			return true;
 	return false;
 }
 
 static u16 hv_get_dev_type(const struct vmbus_channel *channel)
 {
-	const uuid_le *guid = &channel->offermsg.offer.if_type;
+	const guid_t *guid = &channel->offermsg.offer.if_type;
 	u16 i;
 
 	if (is_hvsock_channel(channel) || is_unsupported_vmbus_devs(guid))
 		return HV_UNKNOWN;
 
 	for (i = HV_IDE; i < HV_UNKNOWN; i++) {
-		if (!uuid_le_cmp(*guid, vmbus_devs[i].guid))
+		if (guid_equal(guid, &vmbus_devs[i].guid))
 			return i;
 	}
 	pr_info("Unknown GUID: %pUl\n", guid);
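
Note: this hunk is part of the uuid_le -> guid_t migration. The old uuid_le_cmp() returned 0 on a match and so was always used negated; guid_equal() returns true on a match. A minimal sketch of the new idiom, assuming only <linux/uuid.h> (the GUID value below is the IDE class GUID used elsewhere in this file; the helper name is illustrative, not part of this patch):

	#include <linux/uuid.h>

	/* 32412632-86cb-44a2-9b5c-50d1417354f5 */
	static const guid_t ide_guid =
		GUID_INIT(0x32412632, 0x86cb, 0x44a2, 0x9b, 0x5c,
			  0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5);

	static bool is_ide_offer(const guid_t *if_type)
	{
		/* Compare by value; both arguments are passed by pointer. */
		return guid_equal(if_type, &ide_guid);
	}
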
@@ -198,24 +187,19 @@
 }
 
 /**
- * vmbus_prep_negotiate_resp() - Create default response for Hyper-V Negotiate message
+ * vmbus_prep_negotiate_resp() - Create default response for Negotiate message
  * @icmsghdrp: Pointer to msg header structure
- * @icmsg_negotiate: Pointer to negotiate message structure
  * @buf: Raw buffer channel data
+ * @fw_version: The framework versions we can support.
+ * @fw_vercnt: The size of @fw_version.
+ * @srv_version: The service versions we can support.
+ * @srv_vercnt: The size of @srv_version.
+ * @nego_fw_version: The selected framework version.
+ * @nego_srv_version: The selected service version.
  *
- * @icmsghdrp is of type &struct icmsg_hdr.
+ * Note: Versions are given in decreasing order.
+ *
  * Set up and fill in default negotiate response message.
- *
- * The fw_version and fw_vercnt specifies the framework version that
- * we can support.
- *
- * The srv_version and srv_vercnt specifies the service
- * versions we can support.
- *
- * Versions are given in decreasing order.
- *
- * nego_fw_version and nego_srv_version store the selected protocol versions.
- *
  * Mainly used by Hyper-V drivers.
 */
 bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
@@ -332,14 +316,15 @@
 	if (!channel)
 		return NULL;
 
-	spin_lock_init(&channel->lock);
+	spin_lock_init(&channel->sched_lock);
 	init_completion(&channel->rescind_event);
 
 	INIT_LIST_HEAD(&channel->sc_list);
-	INIT_LIST_HEAD(&channel->percpu_list);
 
 	tasklet_init(&channel->callback_event,
 		     vmbus_on_event, (unsigned long)channel);
+
+	hv_ringbuffer_pre_init(channel);
 
 	return channel;
 }
@@ -355,22 +340,48 @@
 	kobject_put(&channel->kobj);
 }
 
-static void percpu_channel_enq(void *arg)
+void vmbus_channel_map_relid(struct vmbus_channel *channel)
 {
-	struct vmbus_channel *channel = arg;
-	struct hv_per_cpu_context *hv_cpu
-		= this_cpu_ptr(hv_context.cpu_context);
-
-	list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list);
+	if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
+		return;
+	/*
+	 * The mapping of the channel's relid is visible from the CPUs that
+	 * execute vmbus_chan_sched() by the time that vmbus_chan_sched() will
+	 * execute:
+	 *
+	 * (a) In the "normal (i.e., not resuming from hibernation)" path,
+	 *     the full barrier in virt_store_mb() guarantees that the store
+	 *     is propagated to all CPUs before the add_channel_work work
+	 *     is queued.  In turn, add_channel_work is queued before the
+	 *     channel's ring buffer is allocated/initialized and the
+	 *     OPENCHANNEL message for the channel is sent in vmbus_open().
+	 *     Hyper-V won't start sending the interrupts for the channel
+	 *     before the OPENCHANNEL message is acked.  The memory barrier
+	 *     in vmbus_chan_sched() -> sync_test_and_clear_bit() ensures
+	 *     that vmbus_chan_sched() must find the channel's relid in
+	 *     recv_int_page before retrieving the channel pointer from the
+	 *     array of channels.
+	 *
+	 * (b) In the "resuming from hibernation" path, the virt_store_mb()
+	 *     guarantees that the store is propagated to all CPUs before
+	 *     the VMBus connection is marked as ready for the resume event
+	 *     (cf. check_ready_for_resume_event()).  The interrupt handler
+	 *     of the VMBus driver and vmbus_chan_sched() can not run before
+	 *     vmbus_bus_resume() has completed execution (cf. resume_noirq).
+	 */
+	virt_store_mb(
+		vmbus_connection.channels[channel->offermsg.child_relid],
+		channel);
 }
 
-static void percpu_channel_deq(void *arg)
+void vmbus_channel_unmap_relid(struct vmbus_channel *channel)
 {
-	struct vmbus_channel *channel = arg;
-
-	list_del_rcu(&channel->percpu_list);
+	if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
+		return;
+	WRITE_ONCE(
+		vmbus_connection.channels[channel->offermsg.child_relid],
+		NULL);
 }
 
-
 static void vmbus_release_relid(u32 relid)
 {
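
Note: vmbus_channel_map_relid()/vmbus_channel_unmap_relid() implement a publish/retract pattern over a global array indexed by relid. A standalone userspace sketch of the same pattern, assuming C11 atomics in place of virt_store_mb()/WRITE_ONCE() (all names illustrative, not from this patch):

	#include <stdatomic.h>
	#include <stddef.h>

	#define MAX_RELIDS 2048

	/* Plays the role of vmbus_connection.channels[]. */
	static _Atomic(void *) channels[MAX_RELIDS];

	static void map_relid(unsigned int relid, void *chan)
	{
		if (relid >= MAX_RELIDS)
			return;
		/* seq_cst store ~ virt_store_mb(): the store is ordered
		 * before any later signal that readers may poll. */
		atomic_store_explicit(&channels[relid], chan,
				      memory_order_seq_cst);
	}

	static void unmap_relid(unsigned int relid)
	{
		if (relid >= MAX_RELIDS)
			return;
		/* A plain atomic store suffices, like WRITE_ONCE(). */
		atomic_store_explicit(&channels[relid], NULL,
				      memory_order_relaxed);
	}

	/* Reader side (cf. vmbus_chan_sched()): the acquire load pairs
	 * with the writer's store. */
	static void *lookup_relid(unsigned int relid)
	{
		if (relid >= MAX_RELIDS)
			return NULL;
		return atomic_load_explicit(&channels[relid],
					    memory_order_acquire);
	}
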
@@ -386,51 +397,49 @@
 	trace_vmbus_release_relid(&msg, ret);
 }
 
-void hv_process_channel_removal(u32 relid)
+void hv_process_channel_removal(struct vmbus_channel *channel)
 {
-	unsigned long flags;
-	struct vmbus_channel *primary_channel, *channel;
-
-	BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex));
-
-	/*
-	 * Make sure channel is valid as we may have raced.
-	 */
-	channel = relid2channel(relid);
-	if (!channel)
-		return;
-
+	lockdep_assert_held(&vmbus_connection.channel_mutex);
 	BUG_ON(!channel->rescind);
-	if (channel->target_cpu != get_cpu()) {
-		put_cpu();
-		smp_call_function_single(channel->target_cpu,
-					 percpu_channel_deq, channel, true);
-	} else {
-		percpu_channel_deq(channel);
-		put_cpu();
-	}
-
-	if (channel->primary_channel == NULL) {
-		list_del(&channel->listentry);
-
-		primary_channel = channel;
-	} else {
-		primary_channel = channel->primary_channel;
-		spin_lock_irqsave(&primary_channel->lock, flags);
-		list_del(&channel->sc_list);
-		primary_channel->num_sc--;
-		spin_unlock_irqrestore(&primary_channel->lock, flags);
-	}
 
 	/*
-	 * We need to free the bit for init_vp_index() to work in the case
-	 * of sub-channel, when we reload drivers like hv_netvsc.
+	 * hv_process_channel_removal() could find INVALID_RELID only for
+	 * hv_sock channels.  See the inline comments in vmbus_onoffer().
 	 */
-	if (channel->affinity_policy == HV_LOCALIZED)
-		cpumask_clear_cpu(channel->target_cpu,
-				  &primary_channel->alloced_cpus_in_node);
+	WARN_ON(channel->offermsg.child_relid == INVALID_RELID &&
+		!is_hvsock_channel(channel));
 
-	vmbus_release_relid(relid);
+	/*
+	 * Upon suspend, an in-use hv_sock channel is removed from the array of
+	 * channels and the relid is invalidated.  After hibernation, when the
+	 * user-space application destroys the channel, it's unnecessary and
+	 * unsafe to remove the channel from the array of channels.  See also
+	 * the inline comments before the call of vmbus_release_relid() below.
+	 */
+	if (channel->offermsg.child_relid != INVALID_RELID)
+		vmbus_channel_unmap_relid(channel);
+
+	if (channel->primary_channel == NULL)
+		list_del(&channel->listentry);
+	else
+		list_del(&channel->sc_list);
+
+	/*
+	 * If this is a "perf" channel, update the hv_numa_map[] masks so that
+	 * init_vp_index() can (re-)use the CPU.
+	 */
+	if (hv_is_perf_channel(channel))
+		hv_clear_alloced_cpu(channel->target_cpu);
+
+	/*
+	 * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and
+	 * the relid is invalidated; after hibernation, when the user-space app
+	 * destroys the channel, the relid is INVALID_RELID, and in this case
+	 * it's unnecessary and unsafe to release the old relid, since the same
+	 * relid can refer to a completely different channel now.
+	 */
+	if (channel->offermsg.child_relid != INVALID_RELID)
+		vmbus_release_relid(channel->offermsg.child_relid);
 
 	free_channel(channel);
 }
@@ -454,23 +463,7 @@
 	struct vmbus_channel *newchannel =
 		container_of(work, struct vmbus_channel, add_channel_work);
 	struct vmbus_channel *primary_channel = newchannel->primary_channel;
-	unsigned long flags;
-	u16 dev_type;
 	int ret;
-
-	dev_type = hv_get_dev_type(newchannel);
-
-	init_vp_index(newchannel, dev_type);
-
-	if (newchannel->target_cpu != get_cpu()) {
-		put_cpu();
-		smp_call_function_single(newchannel->target_cpu,
-					 percpu_channel_enq,
-					 newchannel, true);
-	} else {
-		percpu_channel_enq(newchannel);
-		put_cpu();
-	}
 
 	/*
 	 * This state is used to indicate a successful open
@@ -503,18 +496,22 @@
 	if (!newchannel->device_obj)
 		goto err_deq_chan;
 
-	newchannel->device_obj->device_id = dev_type;
+	newchannel->device_obj->device_id = newchannel->device_id;
 	/*
 	 * Add the new device to the bus. This will kick off device-driver
 	 * binding which eventually invokes the device driver's AddDevice()
 	 * method.
+	 *
+	 * If vmbus_device_register() fails, the 'device_obj' is freed in
+	 * vmbus_device_release() as called by device_unregister() in the
+	 * error path of vmbus_device_register().  In the outside error
+	 * path, there's no need to free it.
 	 */
 	ret = vmbus_device_register(newchannel->device_obj);
 
 	if (ret != 0) {
 		pr_err("unable to add child device object (relid %d)\n",
			newchannel->offermsg.child_relid);
-		kfree(newchannel->device_obj);
 		goto err_deq_chan;
 	}
 
@@ -530,25 +527,15 @@
 	 */
 	newchannel->probe_done = true;
 
-	if (primary_channel == NULL) {
+	if (primary_channel == NULL)
 		list_del(&newchannel->listentry);
-	} else {
-		spin_lock_irqsave(&primary_channel->lock, flags);
+	else
 		list_del(&newchannel->sc_list);
-		spin_unlock_irqrestore(&primary_channel->lock, flags);
-	}
+
+	/* vmbus_process_offer() has mapped the channel. */
+	vmbus_channel_unmap_relid(newchannel);
 
 	mutex_unlock(&vmbus_connection.channel_mutex);
-
-	if (newchannel->target_cpu != get_cpu()) {
-		put_cpu();
-		smp_call_function_single(newchannel->target_cpu,
-					 percpu_channel_deq,
-					 newchannel, true);
-	} else {
-		percpu_channel_deq(newchannel);
-		put_cpu();
-	}
 
 	vmbus_release_relid(newchannel->offermsg.child_relid);
@@ -563,10 +550,40 @@
 {
 	struct vmbus_channel *channel;
 	struct workqueue_struct *wq;
-	unsigned long flags;
 	bool fnew = true;
 
+	/*
+	 * Synchronize vmbus_process_offer() and CPU hotplugging:
+	 *
+	 * CPU1				CPU2
+	 *
+	 * [vmbus_process_offer()]	[Hot removal of the CPU]
+	 *
+	 * CPUS_READ_LOCK		CPUS_WRITE_LOCK
+	 * LOAD cpu_online_mask		SEARCH chn_list
+	 * STORE target_cpu		LOAD target_cpu
+	 * INSERT chn_list		STORE cpu_online_mask
+	 * CPUS_READ_UNLOCK		CPUS_WRITE_UNLOCK
+	 *
+	 * Forbids: CPU1's LOAD from *not* seeing CPU2's STORE &&
+	 *		CPU2's SEARCH from *not* seeing CPU1's INSERT
+	 *
+	 * Forbids: CPU2's SEARCH from seeing CPU1's INSERT &&
+	 *		CPU2's LOAD from *not* seeing CPU1's STORE
+	 */
+	cpus_read_lock();
+
+	/*
+	 * Serializes the modifications of the chn_list list as well as
+	 * the accesses to next_numa_node_id in init_vp_index().
+	 */
 	mutex_lock(&vmbus_connection.channel_mutex);
+
+	init_vp_index(newchannel);
+
+	/* Remember the channels that should be cleaned up upon suspend. */
+	if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel))
+		atomic_inc(&vmbus_connection.nr_chan_close_on_suspend);
 
 	/*
 	 * Now that we have acquired the channel_mutex,
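
Note: the litmus-style comment added above encodes a simple invariant: because cpus_read_lock() and cpus_write_lock() exclude each other, either the offer path observes the CPU's removal from cpu_online_mask, or the hot-removal path finds the new channel on chn_list and can account for its target_cpu. A condensed sketch of the two sides (the hot-removal body is hypothetical and only illustrates the pairing; it is not code from this patch):

	/* Offer path (this function). */
	cpus_read_lock();
	mutex_lock(&vmbus_connection.channel_mutex);
	init_vp_index(newchannel);	/* LOAD cpu_online_mask, STORE target_cpu */
	list_add_tail(&newchannel->listentry,
		      &vmbus_connection.chn_list);	/* INSERT chn_list */
	mutex_unlock(&vmbus_connection.channel_mutex);
	cpus_read_unlock();

	/* Hot-removal path, running under cpus_write_lock(): every channel
	 * inserted so far is visible, so its target_cpu can be inspected
	 * before the CPU disappears from cpu_online_mask. */
	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry)
		if (channel->target_cpu == dying_cpu)
			/* re-home the channel or refuse the offlining */;
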
@@ -575,24 +592,25 @@
 	atomic_dec(&vmbus_connection.offer_in_progress);
 
 	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
-		if (!uuid_le_cmp(channel->offermsg.offer.if_type,
-			newchannel->offermsg.offer.if_type) &&
-			!uuid_le_cmp(channel->offermsg.offer.if_instance,
-				newchannel->offermsg.offer.if_instance)) {
+		if (guid_equal(&channel->offermsg.offer.if_type,
+			       &newchannel->offermsg.offer.if_type) &&
+		    guid_equal(&channel->offermsg.offer.if_instance,
+			       &newchannel->offermsg.offer.if_instance)) {
 			fnew = false;
 			break;
 		}
 	}
 
-	if (fnew)
+	if (fnew) {
 		list_add_tail(&newchannel->listentry,
 			      &vmbus_connection.chn_list);
-	else {
+	} else {
 		/*
 		 * Check to see if this is a valid sub-channel.
 		 */
 		if (newchannel->offermsg.offer.sub_channel_index == 0) {
 			mutex_unlock(&vmbus_connection.channel_mutex);
+			cpus_read_unlock();
 			/*
 			 * Don't call free_channel(), because newchannel->kobj
 			 * is not initialized yet.
@@ -605,12 +623,13 @@
 		 * Process the sub-channel.
 		 */
 		newchannel->primary_channel = channel;
-		spin_lock_irqsave(&channel->lock, flags);
 		list_add_tail(&newchannel->sc_list, &channel->sc_list);
-		spin_unlock_irqrestore(&channel->lock, flags);
 	}
 
+	vmbus_channel_map_relid(newchannel);
+
 	mutex_unlock(&vmbus_connection.channel_mutex);
+	cpus_read_unlock();
 
 	/*
 	 * vmbus_process_offer() mustn't call channel->sc_creation_callback()
@@ -643,73 +662,57 @@
  * We use this state to statically distribute the channel interrupt load.
  */
 static int next_numa_node_id;
-/*
- * init_vp_index() accesses global variables like next_numa_node_id, and
- * it can run concurrently for primary channels and sub-channels: see
- * vmbus_process_offer(), so we need the lock to protect the global
- * variables.
- */
-static DEFINE_SPINLOCK(bind_channel_to_cpu_lock);
 
 /*
  * Starting with Win8, we can statically distribute the incoming
  * channel interrupt load by binding a channel to VCPU.
- * We distribute the interrupt loads to one or more NUMA nodes based on
- * the channel's affinity_policy.
  *
  * For pre-win8 hosts or non-performance critical channels we assign the
- * first CPU in the first NUMA node.
+ * VMBUS_CONNECT_CPU.
+ *
+ * Starting with win8, performance critical channels will be distributed
+ * evenly among all the available NUMA nodes.  Once the node is assigned,
+ * we will assign the CPU based on a simple round robin scheme.
  */
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
+static void init_vp_index(struct vmbus_channel *channel)
 {
-	u32 cur_cpu;
-	bool perf_chn = vmbus_devs[dev_type].perf_device;
-	struct vmbus_channel *primary = channel->primary_channel;
-	int next_node;
+	bool perf_chn = hv_is_perf_channel(channel);
 	cpumask_var_t available_mask;
 	struct cpumask *alloced_mask;
+	u32 target_cpu;
+	int numa_node;
 
 	if ((vmbus_proto_version == VERSION_WS2008) ||
 	    (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) ||
 	    !alloc_cpumask_var(&available_mask, GFP_KERNEL)) {
 		/*
 		 * Prior to win8, all channel interrupts are
-		 * delivered on cpu 0.
+		 * delivered on VMBUS_CONNECT_CPU.
 		 * Also if the channel is not a performance critical
-		 * channel, bind it to cpu 0.
-		 * In case alloc_cpumask_var() fails, bind it to cpu 0.
+		 * channel, bind it to VMBUS_CONNECT_CPU.
+		 * In case alloc_cpumask_var() fails, bind it to
+		 * VMBUS_CONNECT_CPU.
		 */
-		channel->numa_node = 0;
-		channel->target_cpu = 0;
-		channel->target_vp = hv_cpu_number_to_vp_number(0);
+		channel->target_cpu = VMBUS_CONNECT_CPU;
+		if (perf_chn)
+			hv_set_alloced_cpu(VMBUS_CONNECT_CPU);
 		return;
 	}
 
-	spin_lock(&bind_channel_to_cpu_lock);
-
-	/*
-	 * Based on the channel affinity policy, we will assign the NUMA
-	 * nodes.
-	 */
-
-	if ((channel->affinity_policy == HV_BALANCED) || (!primary)) {
-		while (true) {
-			next_node = next_numa_node_id++;
-			if (next_node == nr_node_ids) {
-				next_node = next_numa_node_id = 0;
-				continue;
-			}
-			if (cpumask_empty(cpumask_of_node(next_node)))
-				continue;
-			break;
+	while (true) {
+		numa_node = next_numa_node_id++;
+		if (numa_node == nr_node_ids) {
+			next_numa_node_id = 0;
+			continue;
 		}
-		channel->numa_node = next_node;
-		primary = channel;
+		if (cpumask_empty(cpumask_of_node(numa_node)))
+			continue;
+		break;
 	}
-	alloced_mask = &hv_context.hv_numa_map[primary->numa_node];
+	alloced_mask = &hv_context.hv_numa_map[numa_node];
 
 	if (cpumask_weight(alloced_mask) ==
-	    cpumask_weight(cpumask_of_node(primary->numa_node))) {
+	    cpumask_weight(cpumask_of_node(numa_node))) {
 		/*
 		 * We have cycled through all the CPUs in the node;
 		 * reset the alloced map.
@@ -717,59 +720,12 @@
 		cpumask_clear(alloced_mask);
 	}
 
-	cpumask_xor(available_mask, alloced_mask,
-		    cpumask_of_node(primary->numa_node));
+	cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
 
-	cur_cpu = -1;
+	target_cpu = cpumask_first(available_mask);
+	cpumask_set_cpu(target_cpu, alloced_mask);
 
-	if (primary->affinity_policy == HV_LOCALIZED) {
-		/*
-		 * Normally Hyper-V host doesn't create more subchannels
-		 * than there are VCPUs on the node but it is possible when not
-		 * all present VCPUs on the node are initialized by guest.
-		 * Clear the alloced_cpus_in_node to start over.
-		 */
-		if (cpumask_equal(&primary->alloced_cpus_in_node,
-				  cpumask_of_node(primary->numa_node)))
-			cpumask_clear(&primary->alloced_cpus_in_node);
-	}
-
-	while (true) {
-		cur_cpu = cpumask_next(cur_cpu, available_mask);
-		if (cur_cpu >= nr_cpu_ids) {
-			cur_cpu = -1;
-			cpumask_copy(available_mask,
-				     cpumask_of_node(primary->numa_node));
-			continue;
-		}
-
-		if (primary->affinity_policy == HV_LOCALIZED) {
-			/*
-			 * NOTE: in the case of sub-channel, we clear the
-			 * sub-channel related bit(s) in
-			 * primary->alloced_cpus_in_node in
-			 * hv_process_channel_removal(), so when we
-			 * reload drivers like hv_netvsc in SMP guest, here
-			 * we're able to re-allocate
-			 * bit from primary->alloced_cpus_in_node.
-			 */
-			if (!cpumask_test_cpu(cur_cpu,
-					      &primary->alloced_cpus_in_node)) {
-				cpumask_set_cpu(cur_cpu,
-						&primary->alloced_cpus_in_node);
-				cpumask_set_cpu(cur_cpu, alloced_mask);
-				break;
-			}
-		} else {
-			cpumask_set_cpu(cur_cpu, alloced_mask);
-			break;
-		}
-	}
-
-	channel->target_cpu = cur_cpu;
-	channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu);
-
-	spin_unlock(&bind_channel_to_cpu_lock);
+	channel->target_cpu = target_cpu;
 
 	free_cpumask_var(available_mask);
 }
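
Note: the rewritten init_vp_index() reduces placement to two steps: round-robin over non-empty NUMA nodes, then take the first CPU of that node not yet assigned to a channel, cycling when the node is exhausted. A standalone sketch of the same scheme, assuming plain bitmasks in place of the cpumask API and the GCC/Clang __builtin_ctz() (all names illustrative):

	#include <stdint.h>

	#define NR_NODES 4

	static uint32_t node_cpus[NR_NODES];	/* CPUs owned by each node */
	static uint32_t alloced[NR_NODES];	/* cf. hv_context.hv_numa_map[] */
	static int next_node;			/* cf. next_numa_node_id */

	static int pick_cpu(void)
	{
		uint32_t avail;
		int node, cpu;

		/* Round robin over nodes, skipping empty ones. */
		for (;;) {
			node = next_node++;
			if (node == NR_NODES) {
				next_node = 0;
				continue;
			}
			if (node_cpus[node] == 0)
				continue;
			break;
		}

		/* Every CPU of the node used once: start the cycle over. */
		if (alloced[node] == node_cpus[node])
			alloced[node] = 0;

		/* cpumask_xor() + cpumask_first(): lowest unused CPU. */
		avail = node_cpus[node] & ~alloced[node];
		cpu = __builtin_ctz(avail);
		alloced[node] |= 1u << cpu;
		return cpu;
	}
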
@@ -896,6 +852,68 @@
 	vmbus_wait_for_unload();
 }
 
+static void check_ready_for_resume_event(void)
+{
+	/*
+	 * If all the old primary channels have been fixed up, then it's safe
+	 * to resume.
+	 */
+	if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume))
+		complete(&vmbus_connection.ready_for_resume_event);
+}
+
+static void vmbus_setup_channel_state(struct vmbus_channel *channel,
+				      struct vmbus_channel_offer_channel *offer)
+{
+	/*
+	 * Setup state for signalling the host.
+	 */
+	channel->sig_event = VMBUS_EVENT_CONNECTION_ID;
+
+	if (vmbus_proto_version != VERSION_WS2008) {
+		channel->is_dedicated_interrupt =
+				(offer->is_dedicated_interrupt != 0);
+		channel->sig_event = offer->connection_id;
+	}
+
+	memcpy(&channel->offermsg, offer,
+	       sizeof(struct vmbus_channel_offer_channel));
+	channel->monitor_grp = (u8)offer->monitorid / 32;
+	channel->monitor_bit = (u8)offer->monitorid % 32;
+	channel->device_id = hv_get_dev_type(channel);
+}
+
+/*
+ * find_primary_channel_by_offer - Get the channel object given the new offer.
+ * This is only used in the resume path of hibernation.
+ */
+static struct vmbus_channel *
+find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer)
+{
+	struct vmbus_channel *channel = NULL, *iter;
+	const guid_t *inst1, *inst2;
+
+	/* Ignore sub-channel offers. */
+	if (offer->offer.sub_channel_index != 0)
+		return NULL;
+
+	mutex_lock(&vmbus_connection.channel_mutex);
+
+	list_for_each_entry(iter, &vmbus_connection.chn_list, listentry) {
+		inst1 = &iter->offermsg.offer.if_instance;
+		inst2 = &offer->offer.if_instance;
+
+		if (guid_equal(inst1, inst2)) {
+			channel = iter;
+			break;
+		}
+	}
+
+	mutex_unlock(&vmbus_connection.channel_mutex);
+
+	return channel;
+}
+
 /*
  * vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
  *
@@ -903,11 +921,84 @@
 static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
 {
 	struct vmbus_channel_offer_channel *offer;
-	struct vmbus_channel *newchannel;
+	struct vmbus_channel *oldchannel, *newchannel;
+	size_t offer_sz;
 
 	offer = (struct vmbus_channel_offer_channel *)hdr;
 
 	trace_vmbus_onoffer(offer);
+
+	oldchannel = find_primary_channel_by_offer(offer);
+
+	if (oldchannel != NULL) {
+		/*
+		 * We're resuming from hibernation: all the sub-channel and
+		 * hv_sock channels we had before the hibernation should have
+		 * been cleaned up, and now we must be seeing a re-offered
+		 * primary channel that we had before the hibernation.
+		 */
+
+		/*
+		 * { Initially: channel relid = INVALID_RELID,
+		 *		channels[valid_relid] = NULL }
+		 *
+		 * CPU1					CPU2
+		 *
+		 * [vmbus_onoffer()]			[vmbus_device_release()]
+		 *
+		 * LOCK channel_mutex			LOCK channel_mutex
+		 * STORE channel relid = valid_relid	LOAD r1 = channel relid
+		 * MAP_RELID channel			if (r1 != INVALID_RELID)
+		 * UNLOCK channel_mutex			  UNMAP_RELID channel
+		 *					UNLOCK channel_mutex
+		 *
+		 * Forbids: r1 == valid_relid &&
+		 *		channels[valid_relid] == channel
+		 *
+		 * Note.  r1 can be INVALID_RELID only for an hv_sock channel.
+		 * None of the hv_sock channels which were present before the
+		 * suspend are re-offered upon the resume.  See the WARN_ON()
+		 * in hv_process_channel_removal().
+		 */
+		mutex_lock(&vmbus_connection.channel_mutex);
+
+		atomic_dec(&vmbus_connection.offer_in_progress);
+
+		WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID);
+		/* Fix up the relid. */
+		oldchannel->offermsg.child_relid = offer->child_relid;
+
+		offer_sz = sizeof(*offer);
+		if (memcmp(offer, &oldchannel->offermsg, offer_sz) != 0) {
+			/*
+			 * This is not an error, since the host can also change
+			 * the other field(s) of the offer, e.g. on WS RS5
+			 * (Build 17763), the offer->connection_id of the
+			 * Mellanox VF vmbus device can change when the host
+			 * reoffers the device upon resume.
+			 */
+			pr_debug("vmbus offer changed: relid=%d\n",
+				 offer->child_relid);
+
+			print_hex_dump_debug("Old vmbus offer: ",
+					     DUMP_PREFIX_OFFSET, 16, 4,
+					     &oldchannel->offermsg, offer_sz,
+					     false);
+			print_hex_dump_debug("New vmbus offer: ",
+					     DUMP_PREFIX_OFFSET, 16, 4,
+					     offer, offer_sz, false);
+
+			/* Fix up the old channel. */
+			vmbus_setup_channel_state(oldchannel, offer);
+		}
+
+		/* Add the channel back to the array of channels. */
+		vmbus_channel_map_relid(oldchannel);
+		check_ready_for_resume_event();
+
+		mutex_unlock(&vmbus_connection.channel_mutex);
+		return;
+	}
 
 	/* Allocate the channel object and save this offer. */
 	newchannel = alloc_channel();
@@ -918,23 +1009,19 @@
 		return;
 	}
 
-	/*
-	 * Setup state for signalling the host.
-	 */
-	newchannel->sig_event = VMBUS_EVENT_CONNECTION_ID;
-
-	if (vmbus_proto_version != VERSION_WS2008) {
-		newchannel->is_dedicated_interrupt =
-				(offer->is_dedicated_interrupt != 0);
-		newchannel->sig_event = offer->connection_id;
-	}
-
-	memcpy(&newchannel->offermsg, offer,
-	       sizeof(struct vmbus_channel_offer_channel));
-	newchannel->monitor_grp = (u8)offer->monitorid / 32;
-	newchannel->monitor_bit = (u8)offer->monitorid % 32;
+	vmbus_setup_channel_state(newchannel, offer);
 
 	vmbus_process_offer(newchannel);
+}
+
+static void check_ready_for_suspend_event(void)
+{
+	/*
+	 * If all the sub-channels or hv_sock channels have been cleaned up,
+	 * then it's safe to suspend.
+	 */
+	if (atomic_dec_and_test(&vmbus_connection.nr_chan_close_on_suspend))
+		complete(&vmbus_connection.ready_for_suspend_event);
 }
 
 /*
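
Note: check_ready_for_suspend_event() and check_ready_for_resume_event() are both instances of a countdown pattern: a counter is pre-loaded with the number of outstanding channels, each cleanup decrements it, and the last decrement signals a completion. A userspace sketch of the same idea, assuming C11 atomics and a condition variable in place of atomic_dec_and_test()/complete() (names illustrative):

	#include <pthread.h>
	#include <stdatomic.h>

	static atomic_int nr_outstanding;	/* pre-loaded by the suspend path;
						 * cf. nr_chan_close_on_suspend */
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t ready = PTHREAD_COND_INITIALIZER;
	static int done;

	/* Called once per cleaned-up channel; the last caller signals. */
	static void check_ready(void)
	{
		/* fetch_sub returns the old value: old == 1 means we hit 0. */
		if (atomic_fetch_sub(&nr_outstanding, 1) == 1) {
			pthread_mutex_lock(&lock);
			done = 1;
			pthread_cond_signal(&ready);	/* cf. complete() */
			pthread_mutex_unlock(&lock);
		}
	}

	/* The suspend path waits until every channel has checked in. */
	static void wait_ready(void)
	{
		pthread_mutex_lock(&lock);
		while (!done)
			pthread_cond_wait(&ready, &lock); /* cf. wait_for_completion() */
		pthread_mutex_unlock(&lock);
	}
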
@@ -947,6 +1034,7 @@
 	struct vmbus_channel_rescind_offer *rescind;
 	struct vmbus_channel *channel;
 	struct device *dev;
+	bool clean_up_chan_for_suspend;
 
 	rescind = (struct vmbus_channel_rescind_offer *)hdr;
 
@@ -958,11 +1046,22 @@
 	 * offer comes in first and then the rescind.
 	 * Since we process these events in work elements,
 	 * and with preemption, we may end up processing
-	 * the events out of order. Given that we handle these
-	 * work elements on the same CPU, this is possible only
-	 * in the case of preemption. In any case wait here
-	 * until the offer processing has moved beyond the
-	 * point where the channel is discoverable.
+	 * the events out of order.  We rely on the synchronization
+	 * provided by offer_in_progress and by channel_mutex for
+	 * ordering these events:
+	 *
+	 * { Initially: offer_in_progress = 1 }
+	 *
+	 * CPU1				CPU2
+	 *
+	 * [vmbus_onoffer()]		[vmbus_onoffer_rescind()]
+	 *
+	 * LOCK channel_mutex		WAIT_ON offer_in_progress == 0
+	 * DECREMENT offer_in_progress	LOCK channel_mutex
+	 * STORE channels[]		LOAD channels[]
+	 * UNLOCK channel_mutex		UNLOCK channel_mutex
+	 *
+	 * Forbids: CPU2's LOAD from *not* seeing CPU1's STORE
	 */
 
 	while (atomic_read(&vmbus_connection.offer_in_progress) != 0) {
@@ -986,6 +1085,8 @@
 		return;
 	}
 
+	clean_up_chan_for_suspend = is_hvsock_channel(channel) ||
+				    is_sub_channel(channel);
 	/*
 	 * Before setting channel->rescind in vmbus_rescind_cleanup(), we
 	 * should make sure the channel callback is not running any more.
@@ -1011,6 +1112,10 @@
 	if (channel->device_obj) {
 		if (channel->chn_rescind_callback) {
 			channel->chn_rescind_callback(channel);
+
+			if (clean_up_chan_for_suspend)
+				check_ready_for_suspend_event();
+
 			return;
 		}
 		/*
@@ -1036,12 +1141,17 @@
 			 * The channel is currently not open;
 			 * it is safe for us to cleanup the channel.
 			 */
-			hv_process_channel_removal(rescind->child_relid);
+			hv_process_channel_removal(channel);
 		} else {
 			complete(&channel->rescind_event);
 		}
 		mutex_unlock(&vmbus_connection.channel_mutex);
 	}
+
+	/* The "channel" may have been freed. Do not access it any longer. */
+
+	if (clean_up_chan_for_suspend)
+		check_ready_for_suspend_event();
 }
 
 void vmbus_hvsock_device_unregister(struct vmbus_channel *channel)
@@ -1250,30 +1360,36 @@
 /* Channel message dispatch table */
 const struct vmbus_channel_message_table_entry
 channel_message_table[CHANNELMSG_COUNT] = {
-	{ CHANNELMSG_INVALID,			0, NULL },
-	{ CHANNELMSG_OFFERCHANNEL,		0, vmbus_onoffer },
-	{ CHANNELMSG_RESCIND_CHANNELOFFER,	0, vmbus_onoffer_rescind },
-	{ CHANNELMSG_REQUESTOFFERS,		0, NULL },
-	{ CHANNELMSG_ALLOFFERS_DELIVERED,	1, vmbus_onoffers_delivered },
-	{ CHANNELMSG_OPENCHANNEL,		0, NULL },
-	{ CHANNELMSG_OPENCHANNEL_RESULT,	1, vmbus_onopen_result },
-	{ CHANNELMSG_CLOSECHANNEL,		0, NULL },
-	{ CHANNELMSG_GPADL_HEADER,		0, NULL },
-	{ CHANNELMSG_GPADL_BODY,		0, NULL },
-	{ CHANNELMSG_GPADL_CREATED,		1, vmbus_ongpadl_created },
-	{ CHANNELMSG_GPADL_TEARDOWN,		0, NULL },
-	{ CHANNELMSG_GPADL_TORNDOWN,		1, vmbus_ongpadl_torndown },
-	{ CHANNELMSG_RELID_RELEASED,		0, NULL },
-	{ CHANNELMSG_INITIATE_CONTACT,		0, NULL },
-	{ CHANNELMSG_VERSION_RESPONSE,		1, vmbus_onversion_response },
-	{ CHANNELMSG_UNLOAD,			0, NULL },
-	{ CHANNELMSG_UNLOAD_RESPONSE,		1, vmbus_unload_response },
-	{ CHANNELMSG_18,			0, NULL },
-	{ CHANNELMSG_19,			0, NULL },
-	{ CHANNELMSG_20,			0, NULL },
-	{ CHANNELMSG_TL_CONNECT_REQUEST,	0, NULL },
-	{ CHANNELMSG_22,			0, NULL },
-	{ CHANNELMSG_TL_CONNECT_RESULT,		0, NULL },
+	{ CHANNELMSG_INVALID,			0, NULL, 0},
+	{ CHANNELMSG_OFFERCHANNEL,		0, vmbus_onoffer,
+		sizeof(struct vmbus_channel_offer_channel)},
+	{ CHANNELMSG_RESCIND_CHANNELOFFER,	0, vmbus_onoffer_rescind,
+		sizeof(struct vmbus_channel_rescind_offer) },
+	{ CHANNELMSG_REQUESTOFFERS,		0, NULL, 0},
+	{ CHANNELMSG_ALLOFFERS_DELIVERED,	1, vmbus_onoffers_delivered, 0},
+	{ CHANNELMSG_OPENCHANNEL,		0, NULL, 0},
+	{ CHANNELMSG_OPENCHANNEL_RESULT,	1, vmbus_onopen_result,
+		sizeof(struct vmbus_channel_open_result)},
+	{ CHANNELMSG_CLOSECHANNEL,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_HEADER,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_BODY,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_CREATED,		1, vmbus_ongpadl_created,
+		sizeof(struct vmbus_channel_gpadl_created)},
+	{ CHANNELMSG_GPADL_TEARDOWN,		0, NULL, 0},
+	{ CHANNELMSG_GPADL_TORNDOWN,		1, vmbus_ongpadl_torndown,
+		sizeof(struct vmbus_channel_gpadl_torndown) },
+	{ CHANNELMSG_RELID_RELEASED,		0, NULL, 0},
+	{ CHANNELMSG_INITIATE_CONTACT,		0, NULL, 0},
+	{ CHANNELMSG_VERSION_RESPONSE,		1, vmbus_onversion_response,
+		sizeof(struct vmbus_channel_version_response)},
+	{ CHANNELMSG_UNLOAD,			0, NULL, 0},
+	{ CHANNELMSG_UNLOAD_RESPONSE,		1, vmbus_unload_response, 0},
+	{ CHANNELMSG_18,			0, NULL, 0},
+	{ CHANNELMSG_19,			0, NULL, 0},
+	{ CHANNELMSG_20,			0, NULL, 0},
+	{ CHANNELMSG_TL_CONNECT_REQUEST,	0, NULL, 0},
+	{ CHANNELMSG_MODIFYCHANNEL,		0, NULL, 0},
+	{ CHANNELMSG_TL_CONNECT_RESULT,		0, NULL, 0},
 };
 
 /*
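
Note: the new fourth field records a minimum payload length for each message type, so the dispatcher can reject truncated messages before calling the handler. A sketch of the intended check on the receive path, assuming the entry layout implied by this table (cf. the message dispatch in vmbus_on_msg_dpc(); the field name min_payload_len is inferred, not shown in this hunk; hdr and payload_size come from the decoded host message):

	const struct vmbus_channel_message_table_entry *entry;

	if (hdr->msgtype >= CHANNELMSG_COUNT)
		return;			/* unknown message type: drop */

	entry = &channel_message_table[hdr->msgtype];
	if (payload_size < entry->min_payload_len)
		return;			/* truncated message: drop */

	if (entry->message_handler)
		entry->message_handler(hdr);
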
@@ -1281,13 +1397,8 @@
  *
  * This is invoked in the vmbus worker thread context.
  */
-void vmbus_onmessage(void *context)
+void vmbus_onmessage(struct vmbus_channel_message_header *hdr)
 {
-	struct hv_message *msg = context;
-	struct vmbus_channel_message_header *hdr;
-
-	hdr = (struct vmbus_channel_message_header *)msg->u.payload;
-
 	trace_vmbus_on_message(hdr);
 
 	/*
@@ -1332,49 +1443,6 @@
 
 	return ret;
 }
-
-/*
- * Retrieve the (sub) channel on which to send an outgoing request.
- * When a primary channel has multiple sub-channels, we try to
- * distribute the load equally amongst all available channels.
- */
-struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary)
-{
-	struct list_head *cur, *tmp;
-	int cur_cpu;
-	struct vmbus_channel *cur_channel;
-	struct vmbus_channel *outgoing_channel = primary;
-	int next_channel;
-	int i = 1;
-
-	if (list_empty(&primary->sc_list))
-		return outgoing_channel;
-
-	next_channel = primary->next_oc++;
-
-	if (next_channel > (primary->num_sc)) {
-		primary->next_oc = 0;
-		return outgoing_channel;
-	}
-
-	cur_cpu = hv_cpu_number_to_vp_number(smp_processor_id());
-	list_for_each_safe(cur, tmp, &primary->sc_list) {
-		cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
-		if (cur_channel->state != CHANNEL_OPENED_STATE)
-			continue;
-
-		if (cur_channel->target_vp == cur_cpu)
-			return cur_channel;
-
-		if (i == next_channel)
-			return cur_channel;
-
-		i++;
-	}
-
-	return outgoing_channel;
-}
-EXPORT_SYMBOL_GPL(vmbus_get_outgoing_channel);
 
 static void invoke_sc_cb(struct vmbus_channel *primary_channel)
 {