hc
2024-10-22 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5
kernel/net/vmw_vsock/hyperv_transport.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * Hyper-V transport for vsock
34 *
....@@ -6,31 +7,22 @@
67 * support in the VM by introducing the new vsock transport.
78 *
89 * Copyright (c) 2017, Microsoft Corporation.
9
- *
10
- * This program is free software; you can redistribute it and/or modify it
11
- * under the terms and conditions of the GNU General Public License,
12
- * version 2, as published by the Free Software Foundation.
13
- *
14
- * This program is distributed in the hope it will be useful, but WITHOUT
15
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
17
- * more details.
18
- *
1910 */
2011 #include <linux/module.h>
2112 #include <linux/vmalloc.h>
2213 #include <linux/hyperv.h>
2314 #include <net/sock.h>
2415 #include <net/af_vsock.h>
16
+#include <asm/hyperv-tlfs.h>
2517
26
-/* The host side's design of the feature requires 6 exact 4KB pages for
27
- * recv/send rings respectively -- this is suboptimal considering memory
28
- * consumption, however unluckily we have to live with it, before the
29
- * host comes up with a better design in the future.
18
+/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some
19
+ * stricter requirements on the hv_sock ring buffer size of six 4K pages.
20
+ * hyperv-tlfs defines HV_HYP_PAGE_SIZE as 4K. Newer hosts don't have this
21
+ * limitation; but, keep the defaults the same for compat.
3022 */
31
-#define PAGE_SIZE_4K 4096
32
-#define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6)
33
-#define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6)
23
+#define RINGBUFFER_HVS_RCV_SIZE (HV_HYP_PAGE_SIZE * 6)
24
+#define RINGBUFFER_HVS_SND_SIZE (HV_HYP_PAGE_SIZE * 6)
25
+#define RINGBUFFER_HVS_MAX_SIZE (HV_HYP_PAGE_SIZE * 64)
3426
3527 /* The MTU is 16KB per the host side's design */
3628 #define HVS_MTU_SIZE (1024 * 16)
....@@ -55,14 +47,16 @@
5547 };
5648
5749 /* We can send up to HVS_MTU_SIZE bytes of payload to the host, but let's use
58
- * a small size, i.e. HVS_SEND_BUF_SIZE, to minimize the dynamically-allocated
59
- * buffer, because tests show there is no significant performance difference.
50
+ * a smaller size, i.e. HVS_SEND_BUF_SIZE, to maximize concurrency between the
51
+ * guest and the host processing as one VMBUS packet is the smallest processing
52
+ * unit.
6053 *
6154 * Note: the buffer can be eliminated in the future when we add new VMBus
6255 * ringbuffer APIs that allow us to directly copy data from userspace buffer
6356 * to VMBus ringbuffer.
6457 */
65
-#define HVS_SEND_BUF_SIZE (PAGE_SIZE_4K - sizeof(struct vmpipe_proto_header))
58
+#define HVS_SEND_BUF_SIZE \
59
+ (HV_HYP_PAGE_SIZE - sizeof(struct vmpipe_proto_header))
6660
6761 struct hvs_send_buf {
6862 /* The header before the payload data */
....@@ -85,11 +79,11 @@
8579 VMBUS_PKT_TRAILER_SIZE)
8680
8781 union hvs_service_id {
88
- uuid_le srv_id;
82
+ guid_t srv_id;
8983
9084 struct {
9185 unsigned int svm_port;
92
- unsigned char b[sizeof(uuid_le) - sizeof(unsigned int)];
86
+ unsigned char b[sizeof(guid_t) - sizeof(unsigned int)];
9387 };
9488 };
9589
....@@ -97,8 +91,8 @@
9791 struct hvsock {
9892 struct vsock_sock *vsk;
9993
100
- uuid_le vm_srv_id;
101
- uuid_le host_srv_id;
94
+ guid_t vm_srv_id;
95
+ guid_t host_srv_id;
10296
10397 struct vmbus_channel *chan;
10498 struct vmpacket_descriptor *recv_desc;
....@@ -154,21 +148,23 @@
154148 */
155149
156150 /* 00000000-facb-11e6-bd58-64006a7986d3 */
157
-static const uuid_le srv_id_template =
158
- UUID_LE(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58,
159
- 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3);
151
+static const guid_t srv_id_template =
152
+ GUID_INIT(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58,
153
+ 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3);
160154
161
-static bool is_valid_srv_id(const uuid_le *id)
155
+static bool hvs_check_transport(struct vsock_sock *vsk);
156
+
157
+static bool is_valid_srv_id(const guid_t *id)
162158 {
163
- return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(uuid_le) - 4);
159
+ return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(guid_t) - 4);
164160 }
165161
166
-static unsigned int get_port_by_srv_id(const uuid_le *svr_id)
162
+static unsigned int get_port_by_srv_id(const guid_t *svr_id)
167163 {
168164 return *((unsigned int *)svr_id);
169165 }
170166
171
-static void hvs_addr_init(struct sockaddr_vm *addr, const uuid_le *svr_id)
167
+static void hvs_addr_init(struct sockaddr_vm *addr, const guid_t *svr_id)
172168 {
173169 unsigned int port = get_port_by_srv_id(svr_id);
174170
....@@ -289,7 +285,7 @@
289285
290286 static void hvs_open_connection(struct vmbus_channel *chan)
291287 {
292
- uuid_le *if_instance, *if_type;
288
+ guid_t *if_instance, *if_type;
293289 unsigned char conn_from_host;
294290
295291 struct sockaddr_vm addr;
....@@ -297,7 +293,9 @@
297293 struct vsock_sock *vnew = NULL;
298294 struct hvsock *hvs = NULL;
299295 struct hvsock *hvs_new = NULL;
296
+ int rcvbuf;
300297 int ret;
298
+ int sndbuf;
301299
302300 if_type = &chan->offermsg.offer.if_type;
303301 if_instance = &chan->offermsg.offer.if_instance;
....@@ -319,8 +317,7 @@
319317 if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog)
320318 goto out;
321319
322
- new = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
323
- sk->sk_type, 0);
320
+ new = vsock_create_connected(sk);
324321 if (!new)
325322 goto out;
326323
....@@ -333,6 +330,14 @@
333330 vsock_addr_init(&vnew->remote_addr,
334331 VMADDR_CID_HOST, VMADDR_PORT_ANY);
335332 vnew->remote_addr.svm_port = get_port_by_srv_id(if_instance);
333
+ ret = vsock_assign_transport(vnew, vsock_sk(sk));
334
+ /* Transport assigned (looking at remote_addr) must be the
335
+ * same one on which we received the request.
336
+ */
337
+ if (ret || !hvs_check_transport(vnew)) {
338
+ sock_put(new);
339
+ goto out;
340
+ }
336341 hvs_new = vnew->trans;
337342 hvs_new->chan = chan;
338343 } else {
....@@ -341,9 +346,34 @@
341346 }
342347
343348 set_channel_read_mode(chan, HV_CALL_DIRECT);
344
- ret = vmbus_open(chan, RINGBUFFER_HVS_SND_SIZE,
345
- RINGBUFFER_HVS_RCV_SIZE, NULL, 0,
346
- hvs_channel_cb, conn_from_host ? new : sk);
349
+
350
+ /* Use the socket buffer sizes as hints for the VMBUS ring size. For
351
+ * server side sockets, 'sk' is the parent socket and thus, this will
352
+ * allow the child sockets to inherit the size from the parent. Keep
353
+ * the minimums at the default values and align to the page size as per VMBUS
354
+ * requirements.
355
+ * For the max, the socket core library will limit the socket buffer
356
+ * size that can be set by the user, but, since currently, the hv_sock
357
+ * VMBUS ring buffer is a physically contiguous allocation, restrict it
358
+ * further.
359
+ * Older versions of hv_sock host side code cannot handle bigger VMBUS
360
+ * ring buffer size. Use the version number to limit the change to newer
361
+ * versions.
362
+ */
363
+ if (vmbus_proto_version < VERSION_WIN10_V5) {
364
+ sndbuf = RINGBUFFER_HVS_SND_SIZE;
365
+ rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
366
+ } else {
367
+ sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE);
368
+ sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE);
369
+ sndbuf = ALIGN(sndbuf, HV_HYP_PAGE_SIZE);
370
+ rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE);
371
+ rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE);
372
+ rcvbuf = ALIGN(rcvbuf, HV_HYP_PAGE_SIZE);
373
+ }
374
+
375
+ ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb,
376
+ conn_from_host ? new : sk);
347377 if (ret != 0) {
348378 if (conn_from_host) {
349379 hvs_new->chan = NULL;
....@@ -369,9 +399,8 @@
369399
370400 if (conn_from_host) {
371401 new->sk_state = TCP_ESTABLISHED;
372
- sk->sk_ack_backlog++;
402
+ sk_acceptq_added(sk);
373403
374
- hvs_addr_init(&vnew->local_addr, if_type);
375404 hvs_new->vm_srv_id = *if_type;
376405 hvs_new->host_srv_id = *if_instance;
377406
....@@ -402,6 +431,7 @@
402431 static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk)
403432 {
404433 struct hvsock *hvs;
434
+ struct sock *sk = sk_vsock(vsk);
405435
406436 hvs = kzalloc(sizeof(*hvs), GFP_KERNEL);
407437 if (!hvs)
....@@ -409,7 +439,8 @@
409439
410440 vsk->trans = hvs;
411441 hvs->vsk = vsk;
412
-
442
+ sk->sk_sndbuf = RINGBUFFER_HVS_SND_SIZE;
443
+ sk->sk_rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
413444 return 0;
414445 }
415446
....@@ -491,12 +522,9 @@
491522
492523 static void hvs_release(struct vsock_sock *vsk)
493524 {
494
- struct sock *sk = sk_vsock(vsk);
495525 bool remove_sock;
496526
497
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
498527 remove_sock = hvs_close_lock_held(vsk);
499
- release_sock(sk);
500528 if (remove_sock)
501529 vsock_remove_sock(vsk);
502530 }
....@@ -601,28 +629,44 @@
601629 struct hvsock *hvs = vsk->trans;
602630 struct vmbus_channel *chan = hvs->chan;
603631 struct hvs_send_buf *send_buf;
604
- ssize_t to_write, max_writable, ret;
632
+ ssize_t to_write, max_writable;
633
+ ssize_t ret = 0;
634
+ ssize_t bytes_written = 0;
605635
606
- BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K);
636
+ BUILD_BUG_ON(sizeof(*send_buf) != HV_HYP_PAGE_SIZE);
607637
608638 send_buf = kmalloc(sizeof(*send_buf), GFP_KERNEL);
609639 if (!send_buf)
610640 return -ENOMEM;
611641
612
- max_writable = hvs_channel_writable_bytes(chan);
613
- to_write = min_t(ssize_t, len, max_writable);
614
- to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
642
+ /* Reader(s) could be draining data from the channel as we write.
643
+ * Maximize bandwidth by iterating until the channel is found to be
644
+ * full.
645
+ */
646
+ while (len) {
647
+ max_writable = hvs_channel_writable_bytes(chan);
648
+ if (!max_writable)
649
+ break;
650
+ to_write = min_t(ssize_t, len, max_writable);
651
+ to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
652
+ /* memcpy_from_msg is safe to call in a loop as it advances the offsets
653
+ * within the message iterator.
654
+ */
655
+ ret = memcpy_from_msg(send_buf->data, msg, to_write);
656
+ if (ret < 0)
657
+ goto out;
615658
616
- ret = memcpy_from_msg(send_buf->data, msg, to_write);
617
- if (ret < 0)
618
- goto out;
659
+ ret = hvs_send_data(hvs->chan, send_buf, to_write);
660
+ if (ret < 0)
661
+ goto out;
619662
620
- ret = hvs_send_data(hvs->chan, send_buf, to_write);
621
- if (ret < 0)
622
- goto out;
623
-
624
- ret = to_write;
663
+ bytes_written += to_write;
664
+ len -= to_write;
665
+ }
625666 out:
667
+ /* If any data has been sent, return that */
668
+ if (bytes_written)
669
+ ret = bytes_written;
626670 kfree(send_buf);
627671 return ret;
628672 }
....@@ -752,37 +796,9 @@
752796 return 0;
753797 }
754798
755
-static void hvs_set_buffer_size(struct vsock_sock *vsk, u64 val)
756
-{
757
- /* Ignored. */
758
-}
759
-
760
-static void hvs_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
761
-{
762
- /* Ignored. */
763
-}
764
-
765
-static void hvs_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
766
-{
767
- /* Ignored. */
768
-}
769
-
770
-static u64 hvs_get_buffer_size(struct vsock_sock *vsk)
771
-{
772
- return -ENOPROTOOPT;
773
-}
774
-
775
-static u64 hvs_get_min_buffer_size(struct vsock_sock *vsk)
776
-{
777
- return -ENOPROTOOPT;
778
-}
779
-
780
-static u64 hvs_get_max_buffer_size(struct vsock_sock *vsk)
781
-{
782
- return -ENOPROTOOPT;
783
-}
784
-
785799 static struct vsock_transport hvs_transport = {
800
+ .module = THIS_MODULE,
801
+
786802 .get_local_cid = hvs_get_local_cid,
787803
788804 .init = hvs_sock_init,
....@@ -815,13 +831,12 @@
815831 .notify_send_pre_enqueue = hvs_notify_send_pre_enqueue,
816832 .notify_send_post_enqueue = hvs_notify_send_post_enqueue,
817833
818
- .set_buffer_size = hvs_set_buffer_size,
819
- .set_min_buffer_size = hvs_set_min_buffer_size,
820
- .set_max_buffer_size = hvs_set_max_buffer_size,
821
- .get_buffer_size = hvs_get_buffer_size,
822
- .get_min_buffer_size = hvs_get_min_buffer_size,
823
- .get_max_buffer_size = hvs_get_max_buffer_size,
824834 };
835
+
836
+static bool hvs_check_transport(struct vsock_sock *vsk)
837
+{
838
+ return vsk->transport == &hvs_transport;
839
+}
825840
826841 static int hvs_probe(struct hv_device *hdev,
827842 const struct hv_vmbus_device_id *dev_id)
....@@ -847,6 +862,24 @@
847862 return 0;
848863 }
849864
865
+/* hv_sock connections cannot persist across hibernation, and all the hv_sock
866
+ * channels are forced to be rescinded before hibernation: see
867
+ * vmbus_bus_suspend(). Here the dummy hvs_suspend() and hvs_resume()
868
+ * are only needed because hibernation requires that every vmbus device's
869
+ * driver should have a .suspend and .resume callback: see vmbus_suspend().
870
+ */
871
+static int hvs_suspend(struct hv_device *hv_dev)
872
+{
873
+ /* Dummy */
874
+ return 0;
875
+}
876
+
877
+static int hvs_resume(struct hv_device *dev)
878
+{
879
+ /* Dummy */
880
+ return 0;
881
+}
882
+
850883 /* This isn't really used. See vmbus_match() and vmbus_probe() */
851884 static const struct hv_vmbus_device_id id_table[] = {
852885 {},
....@@ -858,6 +891,8 @@
858891 .id_table = id_table,
859892 .probe = hvs_probe,
860893 .remove = hvs_remove,
894
+ .suspend = hvs_suspend,
895
+ .resume = hvs_resume,
861896 };
862897
863898 static int __init hvs_init(void)
....@@ -871,7 +906,7 @@
871906 if (ret != 0)
872907 return ret;
873908
874
- ret = vsock_core_init(&hvs_transport);
909
+ ret = vsock_core_register(&hvs_transport, VSOCK_TRANSPORT_F_G2H);
875910 if (ret) {
876911 vmbus_driver_unregister(&hvs_drv);
877912 return ret;
....@@ -882,7 +917,7 @@
882917
883918 static void __exit hvs_exit(void)
884919 {
885
- vsock_core_exit();
920
+ vsock_core_unregister(&hvs_transport);
886921 vmbus_driver_unregister(&hvs_drv);
887922 }
888923