.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-only |
---|
1 | 2 | /* |
---|
2 | 3 | * Hyper-V transport for vsock |
---|
3 | 4 | * |
---|
.. | .. |
---|
6 | 7 | * support in the VM by introducing the new vsock transport. |
---|
7 | 8 | * |
---|
8 | 9 | * Copyright (c) 2017, Microsoft Corporation. |
---|
9 | | - * |
---|
10 | | - * This program is free software; you can redistribute it and/or modify it |
---|
11 | | - * under the terms and conditions of the GNU General Public License, |
---|
12 | | - * version 2, as published by the Free Software Foundation. |
---|
13 | | - * |
---|
14 | | - * This program is distributed in the hope it will be useful, but WITHOUT |
---|
15 | | - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
---|
16 | | - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
---|
17 | | - * more details. |
---|
18 | | - * |
---|
19 | 10 | */ |
---|
20 | 11 | #include <linux/module.h> |
---|
21 | 12 | #include <linux/vmalloc.h> |
---|
22 | 13 | #include <linux/hyperv.h> |
---|
23 | 14 | #include <net/sock.h> |
---|
24 | 15 | #include <net/af_vsock.h> |
---|
| 16 | +#include <asm/hyperv-tlfs.h> |
---|
25 | 17 | |
---|
26 | | -/* The host side's design of the feature requires 6 exact 4KB pages for |
---|
27 | | - * recv/send rings respectively -- this is suboptimal considering memory |
---|
28 | | - * consumption, however unluckily we have to live with it, before the |
---|
29 | | - * host comes up with a better design in the future. |
---|
| 18 | +/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some |
---|
| 19 | + * stricter requirements on the hv_sock ring buffer size of six 4K pages. |
---|
| 20 | + * hyperv-tlfs defines HV_HYP_PAGE_SIZE as 4K. Newer hosts don't have this |
---|
| 21 | + * limitation; but, keep the defaults the same for compat. |
---|
30 | 22 | */ |
---|
31 | | -#define PAGE_SIZE_4K 4096 |
---|
32 | | -#define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6) |
---|
33 | | -#define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6) |
---|
| 23 | +#define RINGBUFFER_HVS_RCV_SIZE (HV_HYP_PAGE_SIZE * 6) |
---|
| 24 | +#define RINGBUFFER_HVS_SND_SIZE (HV_HYP_PAGE_SIZE * 6) |
---|
| 25 | +#define RINGBUFFER_HVS_MAX_SIZE (HV_HYP_PAGE_SIZE * 64) |
---|
34 | 26 | |
---|
35 | 27 | /* The MTU is 16KB per the host side's design */ |
---|
36 | 28 | #define HVS_MTU_SIZE (1024 * 16) |
---|
.. | .. |
---|
55 | 47 | }; |
---|
56 | 48 | |
---|
57 | 49 | /* We can send up to HVS_MTU_SIZE bytes of payload to the host, but let's use |
---|
58 | | - * a small size, i.e. HVS_SEND_BUF_SIZE, to minimize the dynamically-allocated |
---|
59 | | - * buffer, because tests show there is no significant performance difference. |
---|
| 50 | + * a smaller size, i.e. HVS_SEND_BUF_SIZE, to maximize concurrency between the |
---|
| 51 | + * guest and the host processing as one VMBUS packet is the smallest processing |
---|
| 52 | + * unit. |
---|
60 | 53 | * |
---|
61 | 54 | * Note: the buffer can be eliminated in the future when we add new VMBus |
---|
62 | 55 | * ringbuffer APIs that allow us to directly copy data from userspace buffer |
---|
63 | 56 | * to VMBus ringbuffer. |
---|
64 | 57 | */ |
---|
65 | | -#define HVS_SEND_BUF_SIZE (PAGE_SIZE_4K - sizeof(struct vmpipe_proto_header)) |
---|
| 58 | +#define HVS_SEND_BUF_SIZE \ |
---|
| 59 | + (HV_HYP_PAGE_SIZE - sizeof(struct vmpipe_proto_header)) |
---|
66 | 60 | |
---|
67 | 61 | struct hvs_send_buf { |
---|
68 | 62 | /* The header before the payload data */ |
---|
.. | .. |
---|
85 | 79 | VMBUS_PKT_TRAILER_SIZE) |
---|
86 | 80 | |
---|
87 | 81 | union hvs_service_id { |
---|
88 | | - uuid_le srv_id; |
---|
| 82 | + guid_t srv_id; |
---|
89 | 83 | |
---|
90 | 84 | struct { |
---|
91 | 85 | unsigned int svm_port; |
---|
92 | | - unsigned char b[sizeof(uuid_le) - sizeof(unsigned int)]; |
---|
| 86 | + unsigned char b[sizeof(guid_t) - sizeof(unsigned int)]; |
---|
93 | 87 | }; |
---|
94 | 88 | }; |
---|
95 | 89 | |
---|
.. | .. |
---|
97 | 91 | struct hvsock { |
---|
98 | 92 | struct vsock_sock *vsk; |
---|
99 | 93 | |
---|
100 | | - uuid_le vm_srv_id; |
---|
101 | | - uuid_le host_srv_id; |
---|
| 94 | + guid_t vm_srv_id; |
---|
| 95 | + guid_t host_srv_id; |
---|
102 | 96 | |
---|
103 | 97 | struct vmbus_channel *chan; |
---|
104 | 98 | struct vmpacket_descriptor *recv_desc; |
---|
.. | .. |
---|
154 | 148 | */ |
---|
155 | 149 | |
---|
156 | 150 | /* 00000000-facb-11e6-bd58-64006a7986d3 */ |
---|
157 | | -static const uuid_le srv_id_template = |
---|
158 | | - UUID_LE(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58, |
---|
159 | | - 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3); |
---|
| 151 | +static const guid_t srv_id_template = |
---|
| 152 | + GUID_INIT(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58, |
---|
| 153 | + 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3); |
---|
160 | 154 | |
---|
161 | | -static bool is_valid_srv_id(const uuid_le *id) |
---|
| 155 | +static bool hvs_check_transport(struct vsock_sock *vsk); |
---|
| 156 | + |
---|
| 157 | +static bool is_valid_srv_id(const guid_t *id) |
---|
162 | 158 | { |
---|
163 | | - return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(uuid_le) - 4); |
---|
| 159 | + return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(guid_t) - 4); |
---|
164 | 160 | } |
---|
165 | 161 | |
---|
166 | | -static unsigned int get_port_by_srv_id(const uuid_le *svr_id) |
---|
| 162 | +static unsigned int get_port_by_srv_id(const guid_t *svr_id) |
---|
167 | 163 | { |
---|
168 | 164 | return *((unsigned int *)svr_id); |
---|
169 | 165 | } |
---|
170 | 166 | |
---|
171 | | -static void hvs_addr_init(struct sockaddr_vm *addr, const uuid_le *svr_id) |
---|
| 167 | +static void hvs_addr_init(struct sockaddr_vm *addr, const guid_t *svr_id) |
---|
172 | 168 | { |
---|
173 | 169 | unsigned int port = get_port_by_srv_id(svr_id); |
---|
174 | 170 | |
---|
.. | .. |
---|
289 | 285 | |
---|
290 | 286 | static void hvs_open_connection(struct vmbus_channel *chan) |
---|
291 | 287 | { |
---|
292 | | - uuid_le *if_instance, *if_type; |
---|
| 288 | + guid_t *if_instance, *if_type; |
---|
293 | 289 | unsigned char conn_from_host; |
---|
294 | 290 | |
---|
295 | 291 | struct sockaddr_vm addr; |
---|
.. | .. |
---|
297 | 293 | struct vsock_sock *vnew = NULL; |
---|
298 | 294 | struct hvsock *hvs = NULL; |
---|
299 | 295 | struct hvsock *hvs_new = NULL; |
---|
| 296 | + int rcvbuf; |
---|
300 | 297 | int ret; |
---|
| 298 | + int sndbuf; |
---|
301 | 299 | |
---|
302 | 300 | if_type = &chan->offermsg.offer.if_type; |
---|
303 | 301 | if_instance = &chan->offermsg.offer.if_instance; |
---|
.. | .. |
---|
319 | 317 | if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog) |
---|
320 | 318 | goto out; |
---|
321 | 319 | |
---|
322 | | - new = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, |
---|
323 | | - sk->sk_type, 0); |
---|
| 320 | + new = vsock_create_connected(sk); |
---|
324 | 321 | if (!new) |
---|
325 | 322 | goto out; |
---|
326 | 323 | |
---|
.. | .. |
---|
333 | 330 | vsock_addr_init(&vnew->remote_addr, |
---|
334 | 331 | VMADDR_CID_HOST, VMADDR_PORT_ANY); |
---|
335 | 332 | vnew->remote_addr.svm_port = get_port_by_srv_id(if_instance); |
---|
| 333 | + ret = vsock_assign_transport(vnew, vsock_sk(sk)); |
---|
| 334 | + /* The transport assigned (based on remote_addr) must be the |
---|
| 335 | + * same one on which we received the request. |
---|
| 336 | + */ |
---|
| 337 | + if (ret || !hvs_check_transport(vnew)) { |
---|
| 338 | + sock_put(new); |
---|
| 339 | + goto out; |
---|
| 340 | + } |
---|
336 | 341 | hvs_new = vnew->trans; |
---|
337 | 342 | hvs_new->chan = chan; |
---|
338 | 343 | } else { |
---|
.. | .. |
---|
341 | 346 | } |
---|
342 | 347 | |
---|
343 | 348 | set_channel_read_mode(chan, HV_CALL_DIRECT); |
---|
344 | | - ret = vmbus_open(chan, RINGBUFFER_HVS_SND_SIZE, |
---|
345 | | - RINGBUFFER_HVS_RCV_SIZE, NULL, 0, |
---|
346 | | - hvs_channel_cb, conn_from_host ? new : sk); |
---|
| 349 | + |
---|
| 350 | + /* Use the socket buffer sizes as hints for the VMBUS ring size. For |
---|
| 351 | + * server side sockets, 'sk' is the parent socket and thus, this will |
---|
| 352 | + * allow the child sockets to inherit the size from the parent. Keep |
---|
| 353 | + * the minimums at the default values and align to page size as per VMBUS |
---|
| 354 | + * requirements. |
---|
| 355 | + * For the max, the socket core library will limit the socket buffer |
---|
| 356 | + * size that can be set by the user, but, since currently, the hv_sock |
---|
| 357 | + * VMBUS ring buffer is a physically contiguous allocation, restrict it |
---|
| 358 | + * further. |
---|
| 359 | + * Older versions of hv_sock host side code cannot handle bigger VMBUS |
---|
| 360 | + * ring buffer size. Use the version number to limit the change to newer |
---|
| 361 | + * versions. |
---|
| 362 | + */ |
---|
| 363 | + if (vmbus_proto_version < VERSION_WIN10_V5) { |
---|
| 364 | + sndbuf = RINGBUFFER_HVS_SND_SIZE; |
---|
| 365 | + rcvbuf = RINGBUFFER_HVS_RCV_SIZE; |
---|
| 366 | + } else { |
---|
| 367 | + sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE); |
---|
| 368 | + sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE); |
---|
| 369 | + sndbuf = ALIGN(sndbuf, HV_HYP_PAGE_SIZE); |
---|
| 370 | + rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE); |
---|
| 371 | + rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE); |
---|
| 372 | + rcvbuf = ALIGN(rcvbuf, HV_HYP_PAGE_SIZE); |
---|
| 373 | + } |
---|
| 374 | + |
---|
| 375 | + ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb, |
---|
| 376 | + conn_from_host ? new : sk); |
---|
347 | 377 | if (ret != 0) { |
---|
348 | 378 | if (conn_from_host) { |
---|
349 | 379 | hvs_new->chan = NULL; |
---|
.. | .. |
---|
369 | 399 | |
---|
370 | 400 | if (conn_from_host) { |
---|
371 | 401 | new->sk_state = TCP_ESTABLISHED; |
---|
372 | | - sk->sk_ack_backlog++; |
---|
| 402 | + sk_acceptq_added(sk); |
---|
373 | 403 | |
---|
374 | | - hvs_addr_init(&vnew->local_addr, if_type); |
---|
375 | 404 | hvs_new->vm_srv_id = *if_type; |
---|
376 | 405 | hvs_new->host_srv_id = *if_instance; |
---|
377 | 406 | |
---|
.. | .. |
---|
402 | 431 | static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk) |
---|
403 | 432 | { |
---|
404 | 433 | struct hvsock *hvs; |
---|
| 434 | + struct sock *sk = sk_vsock(vsk); |
---|
405 | 435 | |
---|
406 | 436 | hvs = kzalloc(sizeof(*hvs), GFP_KERNEL); |
---|
407 | 437 | if (!hvs) |
---|
.. | .. |
---|
409 | 439 | |
---|
410 | 440 | vsk->trans = hvs; |
---|
411 | 441 | hvs->vsk = vsk; |
---|
412 | | - |
---|
| 442 | + sk->sk_sndbuf = RINGBUFFER_HVS_SND_SIZE; |
---|
| 443 | + sk->sk_rcvbuf = RINGBUFFER_HVS_RCV_SIZE; |
---|
413 | 444 | return 0; |
---|
414 | 445 | } |
---|
415 | 446 | |
---|
.. | .. |
---|
491 | 522 | |
---|
492 | 523 | static void hvs_release(struct vsock_sock *vsk) |
---|
493 | 524 | { |
---|
494 | | - struct sock *sk = sk_vsock(vsk); |
---|
495 | 525 | bool remove_sock; |
---|
496 | 526 | |
---|
497 | | - lock_sock_nested(sk, SINGLE_DEPTH_NESTING); |
---|
498 | 527 | remove_sock = hvs_close_lock_held(vsk); |
---|
499 | | - release_sock(sk); |
---|
500 | 528 | if (remove_sock) |
---|
501 | 529 | vsock_remove_sock(vsk); |
---|
502 | 530 | } |
---|
.. | .. |
---|
601 | 629 | struct hvsock *hvs = vsk->trans; |
---|
602 | 630 | struct vmbus_channel *chan = hvs->chan; |
---|
603 | 631 | struct hvs_send_buf *send_buf; |
---|
604 | | - ssize_t to_write, max_writable, ret; |
---|
| 632 | + ssize_t to_write, max_writable; |
---|
| 633 | + ssize_t ret = 0; |
---|
| 634 | + ssize_t bytes_written = 0; |
---|
605 | 635 | |
---|
606 | | - BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K); |
---|
| 636 | + BUILD_BUG_ON(sizeof(*send_buf) != HV_HYP_PAGE_SIZE); |
---|
607 | 637 | |
---|
608 | 638 | send_buf = kmalloc(sizeof(*send_buf), GFP_KERNEL); |
---|
609 | 639 | if (!send_buf) |
---|
610 | 640 | return -ENOMEM; |
---|
611 | 641 | |
---|
612 | | - max_writable = hvs_channel_writable_bytes(chan); |
---|
613 | | - to_write = min_t(ssize_t, len, max_writable); |
---|
614 | | - to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE); |
---|
| 642 | + /* Reader(s) could be draining data from the channel as we write. |
---|
| 643 | + * Maximize bandwidth by iterating until the channel is found to be |
---|
| 644 | + * full. |
---|
| 645 | + */ |
---|
| 646 | + while (len) { |
---|
| 647 | + max_writable = hvs_channel_writable_bytes(chan); |
---|
| 648 | + if (!max_writable) |
---|
| 649 | + break; |
---|
| 650 | + to_write = min_t(ssize_t, len, max_writable); |
---|
| 651 | + to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE); |
---|
| 652 | + /* memcpy_from_msg is safe to call in a loop as it advances the |
---|
| 653 | + * offsets within the message iterator. |
---|
| 654 | + */ |
---|
| 655 | + ret = memcpy_from_msg(send_buf->data, msg, to_write); |
---|
| 656 | + if (ret < 0) |
---|
| 657 | + goto out; |
---|
615 | 658 | |
---|
616 | | - ret = memcpy_from_msg(send_buf->data, msg, to_write); |
---|
617 | | - if (ret < 0) |
---|
618 | | - goto out; |
---|
| 659 | + ret = hvs_send_data(hvs->chan, send_buf, to_write); |
---|
| 660 | + if (ret < 0) |
---|
| 661 | + goto out; |
---|
619 | 662 | |
---|
620 | | - ret = hvs_send_data(hvs->chan, send_buf, to_write); |
---|
621 | | - if (ret < 0) |
---|
622 | | - goto out; |
---|
623 | | - |
---|
624 | | - ret = to_write; |
---|
| 663 | + bytes_written += to_write; |
---|
| 664 | + len -= to_write; |
---|
| 665 | + } |
---|
625 | 666 | out: |
---|
| 667 | + /* If any data has been sent, return that */ |
---|
| 668 | + if (bytes_written) |
---|
| 669 | + ret = bytes_written; |
---|
626 | 670 | kfree(send_buf); |
---|
627 | 671 | return ret; |
---|
628 | 672 | } |
---|
.. | .. |
---|
752 | 796 | return 0; |
---|
753 | 797 | } |
---|
754 | 798 | |
---|
755 | | -static void hvs_set_buffer_size(struct vsock_sock *vsk, u64 val) |
---|
756 | | -{ |
---|
757 | | - /* Ignored. */ |
---|
758 | | -} |
---|
759 | | - |
---|
760 | | -static void hvs_set_min_buffer_size(struct vsock_sock *vsk, u64 val) |
---|
761 | | -{ |
---|
762 | | - /* Ignored. */ |
---|
763 | | -} |
---|
764 | | - |
---|
765 | | -static void hvs_set_max_buffer_size(struct vsock_sock *vsk, u64 val) |
---|
766 | | -{ |
---|
767 | | - /* Ignored. */ |
---|
768 | | -} |
---|
769 | | - |
---|
770 | | -static u64 hvs_get_buffer_size(struct vsock_sock *vsk) |
---|
771 | | -{ |
---|
772 | | - return -ENOPROTOOPT; |
---|
773 | | -} |
---|
774 | | - |
---|
775 | | -static u64 hvs_get_min_buffer_size(struct vsock_sock *vsk) |
---|
776 | | -{ |
---|
777 | | - return -ENOPROTOOPT; |
---|
778 | | -} |
---|
779 | | - |
---|
780 | | -static u64 hvs_get_max_buffer_size(struct vsock_sock *vsk) |
---|
781 | | -{ |
---|
782 | | - return -ENOPROTOOPT; |
---|
783 | | -} |
---|
784 | | - |
---|
785 | 799 | static struct vsock_transport hvs_transport = { |
---|
| 800 | + .module = THIS_MODULE, |
---|
| 801 | + |
---|
786 | 802 | .get_local_cid = hvs_get_local_cid, |
---|
787 | 803 | |
---|
788 | 804 | .init = hvs_sock_init, |
---|
.. | .. |
---|
815 | 831 | .notify_send_pre_enqueue = hvs_notify_send_pre_enqueue, |
---|
816 | 832 | .notify_send_post_enqueue = hvs_notify_send_post_enqueue, |
---|
817 | 833 | |
---|
818 | | - .set_buffer_size = hvs_set_buffer_size, |
---|
819 | | - .set_min_buffer_size = hvs_set_min_buffer_size, |
---|
820 | | - .set_max_buffer_size = hvs_set_max_buffer_size, |
---|
821 | | - .get_buffer_size = hvs_get_buffer_size, |
---|
822 | | - .get_min_buffer_size = hvs_get_min_buffer_size, |
---|
823 | | - .get_max_buffer_size = hvs_get_max_buffer_size, |
---|
824 | 834 | }; |
---|
| 835 | + |
---|
| 836 | +static bool hvs_check_transport(struct vsock_sock *vsk) |
---|
| 837 | +{ |
---|
| 838 | + return vsk->transport == &hvs_transport; |
---|
| 839 | +} |
---|
825 | 840 | |
---|
826 | 841 | static int hvs_probe(struct hv_device *hdev, |
---|
827 | 842 | const struct hv_vmbus_device_id *dev_id) |
---|
.. | .. |
---|
847 | 862 | return 0; |
---|
848 | 863 | } |
---|
849 | 864 | |
---|
| 865 | +/* hv_sock connections cannot persist across hibernation, and all the hv_sock |
---|
| 866 | + * channels are forced to be rescinded before hibernation: see |
---|
| 867 | + * vmbus_bus_suspend(). Here the dummy hvs_suspend() and hvs_resume() |
---|
| 868 | + * are only needed because hibernation requires that every vmbus device's |
---|
| 869 | + * driver should have a .suspend and .resume callback: see vmbus_suspend(). |
---|
| 870 | + */ |
---|
| 871 | +static int hvs_suspend(struct hv_device *hv_dev) |
---|
| 872 | +{ |
---|
| 873 | + /* Dummy */ |
---|
| 874 | + return 0; |
---|
| 875 | +} |
---|
| 876 | + |
---|
| 877 | +static int hvs_resume(struct hv_device *dev) |
---|
| 878 | +{ |
---|
| 879 | + /* Dummy */ |
---|
| 880 | + return 0; |
---|
| 881 | +} |
---|
| 882 | + |
---|
850 | 883 | /* This isn't really used. See vmbus_match() and vmbus_probe() */ |
---|
851 | 884 | static const struct hv_vmbus_device_id id_table[] = { |
---|
852 | 885 | {}, |
---|
.. | .. |
---|
858 | 891 | .id_table = id_table, |
---|
859 | 892 | .probe = hvs_probe, |
---|
860 | 893 | .remove = hvs_remove, |
---|
| 894 | + .suspend = hvs_suspend, |
---|
| 895 | + .resume = hvs_resume, |
---|
861 | 896 | }; |
---|
862 | 897 | |
---|
863 | 898 | static int __init hvs_init(void) |
---|
.. | .. |
---|
871 | 906 | if (ret != 0) |
---|
872 | 907 | return ret; |
---|
873 | 908 | |
---|
874 | | - ret = vsock_core_init(&hvs_transport); |
---|
| 909 | + ret = vsock_core_register(&hvs_transport, VSOCK_TRANSPORT_F_G2H); |
---|
875 | 910 | if (ret) { |
---|
876 | 911 | vmbus_driver_unregister(&hvs_drv); |
---|
877 | 912 | return ret; |
---|
.. | .. |
---|
882 | 917 | |
---|
883 | 918 | static void __exit hvs_exit(void) |
---|
884 | 919 | { |
---|
885 | | - vsock_core_exit(); |
---|
| 920 | + vsock_core_unregister(&hvs_transport); |
---|
886 | 921 | vmbus_driver_unregister(&hvs_drv); |
---|
887 | 922 | } |
---|
888 | 923 | |
---|