  .. |   .. |
      |    1 | +// SPDX-License-Identifier: GPL-2.0-only
    1 |    2 |  /* Copyright (C) 2009 Red Hat, Inc.
    2 |    3 |   * Author: Michael S. Tsirkin <mst@redhat.com>
    3 |      | - *
    4 |      | - * This work is licensed under the terms of the GNU GPL, version 2.
    5 |    4 |   *
    6 |    5 |   * virtio-net server in host kernel.
    7 |    6 |   */
  .. |   .. |
   74 |   73 |          VHOST_NET_FEATURES = VHOST_FEATURES |
   75 |   74 |                           (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
   76 |   75 |                           (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
   77 |      | -                         (1ULL << VIRTIO_F_IOMMU_PLATFORM)
      |   76 | +                         (1ULL << VIRTIO_F_ACCESS_PLATFORM)
   78 |   77 |  };
   79 |   78 | 
   80 |   79 |  enum {
  .. |   .. |
  116 |  115 |           * For RX, number of batched heads
  117 |  116 |           */
  118 |  117 |          int done_idx;
      |  118 | +        /* Number of XDP frames batched */
      |  119 | +        int batched_xdp;
  119 |  120 |          /* an array of userspace buffers info */
  120 |  121 |          struct ubuf_info *ubuf_info;
  121 |  122 |          /* Reference counting for outstanding ubufs.
  .. |   .. |
  123 |  124 |          struct vhost_net_ubuf_ref *ubufs;
  124 |  125 |          struct ptr_ring *rx_ring;
  125 |  126 |          struct vhost_net_buf rxq;
      |  127 | +        /* Batched XDP buffs */
      |  128 | +        struct xdp_buff *xdp;
  126 |  129 |  };
  127 |  130 | 
  128 |  131 |  struct vhost_net {
  .. |   .. |
  137 |  140 |          unsigned tx_zcopy_err;
  138 |  141 |          /* Flush in progress. Protected by tx vq lock. */
  139 |  142 |          bool tx_flush;
      |  143 | +        /* Private page frag */
      |  144 | +        struct page_frag page_frag;
      |  145 | +        /* Refcount bias of page frag */
      |  146 | +        int refcnt_bias;
  140 |  147 |  };
  141 |  148 | 
  142 |  149 |  static unsigned vhost_net_zcopy_mask __read_mostly;
  .. |   .. |
  338 |  345 |                 sock_flag(sock->sk, SOCK_ZEROCOPY);
  339 |  346 |  }
  340 |  347 | 
      |  348 | +static bool vhost_sock_xdp(struct socket *sock)
      |  349 | +{
      |  350 | +        return sock_flag(sock->sk, SOCK_XDP);
      |  351 | +}
      |  352 | +
  341 |  353 |  /* In case of DMA done not in order in lower device driver for some reason.
  342 |  354 |   * upend_idx is used to track end of used idx, done_idx is used to track head
  343 |  355 |   * of used idx. Once lower device DMA done contiguously, we will signal KVM
  .. |   .. |
  412 |  424 |          struct vhost_net_virtqueue *nvq =
  413 |  425 |                  container_of(vq, struct vhost_net_virtqueue, vq);
  414 |  426 |          struct vhost_poll *poll = n->poll + (nvq - n->vqs);
  415 |      | -        if (!vq->private_data)
      |  427 | +        if (!vhost_vq_get_backend(vq))
  416 |  428 |                  return;
  417 |  429 |          vhost_poll_stop(poll);
  418 |  430 |  }
  .. |   .. |
  425 |  437 |          struct vhost_poll *poll = n->poll + (nvq - n->vqs);
  426 |  438 |          struct socket *sock;
  427 |  439 | 
  428 |      | -        sock = vq->private_data;
      |  440 | +        sock = vhost_vq_get_backend(vq);
  429 |  441 |          if (!sock)
  430 |  442 |                  return 0;
  431 |  443 | 
  .. |   .. |
  444 |  456 |          nvq->done_idx = 0;
  445 |  457 |  }
  446 |  458 | 
  447 |      | -static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
  448 |      | -                                    struct vhost_net_virtqueue *nvq,
  449 |      | -                                    unsigned int *out_num, unsigned int *in_num,
  450 |      | -                                    bool *busyloop_intr)
      |  459 | +static void vhost_tx_batch(struct vhost_net *net,
      |  460 | +                           struct vhost_net_virtqueue *nvq,
      |  461 | +                           struct socket *sock,
      |  462 | +                           struct msghdr *msghdr)
  451 |  463 |  {
  452 |      | -        struct vhost_virtqueue *vq = &nvq->vq;
  453 |      | -        unsigned long uninitialized_var(endtime);
  454 |      | -        int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
      |  464 | +        struct tun_msg_ctl ctl = {
      |  465 | +                .type = TUN_MSG_PTR,
      |  466 | +                .num = nvq->batched_xdp,
      |  467 | +                .ptr = nvq->xdp,
      |  468 | +        };
      |  469 | +        int i, err;
      |  470 | +
      |  471 | +        if (nvq->batched_xdp == 0)
      |  472 | +                goto signal_used;
      |  473 | +
      |  474 | +        msghdr->msg_control = &ctl;
      |  475 | +        msghdr->msg_controllen = sizeof(ctl);
      |  476 | +        err = sock->ops->sendmsg(sock, msghdr, 0);
      |  477 | +        if (unlikely(err < 0)) {
      |  478 | +                vq_err(&nvq->vq, "Fail to batch sending packets\n");
      |  479 | +
      |  480 | +                /* free pages owned by XDP; since this is an unlikely error path,
      |  481 | +                 * keep it simple and avoid more complex bulk update for the
      |  482 | +                 * used pages
      |  483 | +                 */
      |  484 | +                for (i = 0; i < nvq->batched_xdp; ++i)
      |  485 | +                        put_page(virt_to_head_page(nvq->xdp[i].data));
      |  486 | +                nvq->batched_xdp = 0;
      |  487 | +                nvq->done_idx = 0;
      |  488 | +                return;
      |  489 | +        }
      |  490 | +
      |  491 | +signal_used:
      |  492 | +        vhost_net_signal_used(nvq);
      |  493 | +        nvq->batched_xdp = 0;
      |  494 | +}
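
vhost_tx_batch() hands the whole XDP array to the backend in a single sendmsg() call, carrying a struct tun_msg_ctl (type/num/ptr, from <linux/if_tun.h>) in msg_control. As a rough sketch of the consuming side — tun_handle_one_xdp() and single_packet_sendmsg() are hypothetical stand-ins for the backend's real handlers (tun_xdp_one() and the ordinary path in drivers/net/tun.c) — a tun-like sendmsg() would process the batch like this:

static int example_tun_sendmsg(struct socket *sock, struct msghdr *m,
                               size_t total_len)
{
        struct tun_msg_ctl *ctl = m->msg_control;
        struct xdp_buff *xdp;
        int i;

        if (ctl && ctl->type == TUN_MSG_PTR) {
                /* ctl->ptr points at an array of ctl->num xdp_buffs; each
                 * already carries a tun_xdp_hdr in its headroom.
                 */
                for (i = 0; i < ctl->num; i++) {
                        xdp = &((struct xdp_buff *)ctl->ptr)[i];
                        tun_handle_one_xdp(sock, xdp);  /* hypothetical */
                }
                return total_len;
        }

        /* No control message: fall back to the single-packet path. */
        return single_packet_sendmsg(sock, m, total_len);  /* hypothetical */
}
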
      |  495 | +
      |  496 | +static int sock_has_rx_data(struct socket *sock)
      |  497 | +{
      |  498 | +        if (unlikely(!sock))
      |  499 | +                return 0;
      |  500 | +
      |  501 | +        if (sock->ops->peek_len)
      |  502 | +                return sock->ops->peek_len(sock);
      |  503 | +
      |  504 | +        return skb_queue_empty(&sock->sk->sk_receive_queue);
      |  505 | +}
      |  506 | +
      |  507 | +static void vhost_net_busy_poll_try_queue(struct vhost_net *net,
      |  508 | +                                          struct vhost_virtqueue *vq)
      |  509 | +{
      |  510 | +        if (!vhost_vq_avail_empty(&net->dev, vq)) {
      |  511 | +                vhost_poll_queue(&vq->poll);
      |  512 | +        } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
      |  513 | +                vhost_disable_notify(&net->dev, vq);
      |  514 | +                vhost_poll_queue(&vq->poll);
      |  515 | +        }
      |  516 | +}
      |  517 | +
      |  518 | +static void vhost_net_busy_poll(struct vhost_net *net,
      |  519 | +                                struct vhost_virtqueue *rvq,
      |  520 | +                                struct vhost_virtqueue *tvq,
      |  521 | +                                bool *busyloop_intr,
      |  522 | +                                bool poll_rx)
      |  523 | +{
      |  524 | +        unsigned long busyloop_timeout;
      |  525 | +        unsigned long endtime;
      |  526 | +        struct socket *sock;
      |  527 | +        struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
      |  528 | +
      |  529 | +        /* Try to hold the vq mutex of the paired virtqueue. We can't
      |  530 | +         * use mutex_lock() here since we could not guarantee a
      |  531 | +         * consistent lock ordering.
      |  532 | +         */
      |  533 | +        if (!mutex_trylock(&vq->mutex))
      |  534 | +                return;
      |  535 | +
      |  536 | +        vhost_disable_notify(&net->dev, vq);
      |  537 | +        sock = vhost_vq_get_backend(rvq);
      |  538 | +
      |  539 | +        busyloop_timeout = poll_rx ? rvq->busyloop_timeout :
      |  540 | +                                     tvq->busyloop_timeout;
      |  541 | +
      |  542 | +        preempt_disable();
      |  543 | +        endtime = busy_clock() + busyloop_timeout;
      |  544 | +
      |  545 | +        while (vhost_can_busy_poll(endtime)) {
      |  546 | +                if (vhost_has_work(&net->dev)) {
      |  547 | +                        *busyloop_intr = true;
      |  548 | +                        break;
      |  549 | +                }
      |  550 | +
      |  551 | +                if ((sock_has_rx_data(sock) &&
      |  552 | +                     !vhost_vq_avail_empty(&net->dev, rvq)) ||
      |  553 | +                    !vhost_vq_avail_empty(&net->dev, tvq))
      |  554 | +                        break;
      |  555 | +
      |  556 | +                cpu_relax();
      |  557 | +        }
      |  558 | +
      |  559 | +        preempt_enable();
      |  560 | +
      |  561 | +        if (poll_rx || sock_has_rx_data(sock))
      |  562 | +                vhost_net_busy_poll_try_queue(net, vq);
      |  563 | +        else if (!poll_rx) /* On tx here, sock has no rx data. */
      |  564 | +                vhost_enable_notify(&net->dev, rvq);
      |  565 | +
      |  566 | +        mutex_unlock(&vq->mutex);
      |  567 | +}
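
For reference, the spin above is bounded by two small helpers defined earlier in net.c (unchanged by this patch, so they do not appear in the hunks); in trees of this vintage they read roughly as follows — worth double-checking against the tree in question:

static unsigned long busy_clock(void)
{
        return local_clock() >> 10;
}

static bool vhost_can_busy_poll(unsigned long endtime)
{
        return likely(!need_resched() && !time_after(busy_clock(), endtime));
}
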
      |  568 | +
      |  569 | +static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
      |  570 | +                                    struct vhost_net_virtqueue *tnvq,
      |  571 | +                                    unsigned int *out_num, unsigned int *in_num,
      |  572 | +                                    struct msghdr *msghdr, bool *busyloop_intr)
      |  573 | +{
      |  574 | +        struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
      |  575 | +        struct vhost_virtqueue *rvq = &rnvq->vq;
      |  576 | +        struct vhost_virtqueue *tvq = &tnvq->vq;
      |  577 | +
      |  578 | +        int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
  455 |  579 |                                    out_num, in_num, NULL, NULL);
  456 |  580 | 
  457 |      | -        if (r == vq->num && vq->busyloop_timeout) {
  458 |      | -                if (!vhost_sock_zcopy(vq->private_data))
  459 |      | -                        vhost_net_signal_used(nvq);
  460 |      | -                preempt_disable();
  461 |      | -                endtime = busy_clock() + vq->busyloop_timeout;
  462 |      | -                while (vhost_can_busy_poll(endtime)) {
  463 |      | -                        if (vhost_has_work(vq->dev)) {
  464 |      | -                                *busyloop_intr = true;
  465 |      | -                                break;
  466 |      | -                        }
  467 |      | -                        if (!vhost_vq_avail_empty(vq->dev, vq))
  468 |      | -                                break;
  469 |      | -                        cpu_relax();
  470 |      | -                }
  471 |      | -                preempt_enable();
  472 |      | -                r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
      |  581 | +        if (r == tvq->num && tvq->busyloop_timeout) {
      |  582 | +                /* Flush batched packets first */
      |  583 | +                if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq)))
      |  584 | +                        vhost_tx_batch(net, tnvq,
      |  585 | +                                       vhost_vq_get_backend(tvq),
      |  586 | +                                       msghdr);
      |  587 | +
      |  588 | +                vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
      |  589 | +
      |  590 | +                r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
  473 |  591 |                                        out_num, in_num, NULL, NULL);
  474 |  592 |          }
  475 |  593 | 
  .. |   .. |
  506 |  624 |          struct vhost_virtqueue *vq = &nvq->vq;
  507 |  625 |          int ret;
  508 |  626 | 
  509 |      | -        ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr);
      |  627 | +        ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
  510 |  628 | 
  511 |  629 |          if (ret < 0 || ret == vq->num)
  512 |  630 |                  return ret;
  .. |   .. |
  534 |  652 |                 !vhost_vq_avail_empty(vq->dev, vq);
  535 |  653 |  }
  536 |  654 | 
      |  655 | +#define SKB_FRAG_PAGE_ORDER     get_order(32768)
      |  656 | +
      |  657 | +static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
      |  658 | +                                       struct page_frag *pfrag, gfp_t gfp)
      |  659 | +{
      |  660 | +        if (pfrag->page) {
      |  661 | +                if (pfrag->offset + sz <= pfrag->size)
      |  662 | +                        return true;
      |  663 | +                __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
      |  664 | +        }
      |  665 | +
      |  666 | +        pfrag->offset = 0;
      |  667 | +        net->refcnt_bias = 0;
      |  668 | +        if (SKB_FRAG_PAGE_ORDER) {
      |  669 | +                /* Avoid direct reclaim but allow kswapd to wake */
      |  670 | +                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
      |  671 | +                                          __GFP_COMP | __GFP_NOWARN |
      |  672 | +                                          __GFP_NORETRY,
      |  673 | +                                          SKB_FRAG_PAGE_ORDER);
      |  674 | +                if (likely(pfrag->page)) {
      |  675 | +                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
      |  676 | +                        goto done;
      |  677 | +                }
      |  678 | +        }
      |  679 | +        pfrag->page = alloc_page(gfp);
      |  680 | +        if (likely(pfrag->page)) {
      |  681 | +                pfrag->size = PAGE_SIZE;
      |  682 | +                goto done;
      |  683 | +        }
      |  684 | +        return false;
      |  685 | +
      |  686 | +done:
      |  687 | +        net->refcnt_bias = USHRT_MAX;
      |  688 | +        page_ref_add(pfrag->page, USHRT_MAX - 1);
      |  689 | +        return true;
      |  690 | +}
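
The USHRT_MAX refcount bias is the point of this helper: one page_ref_add() up front replaces what would otherwise be an atomic increment per packet. A lifecycle sketch (the per-packet consumer side is assumed here, not shown in this hunk):

/* refill:   page_ref_add(page, USHRT_MAX - 1); net->refcnt_bias = USHRT_MAX;
 * per pkt:  --net->refcnt_bias;     // one reference handed to the XDP frame,
 *                                   // dropped later by put_page() downstream
 * drain:    __page_frag_cache_drain(page, net->refcnt_bias);
 *                                   // releases the unclaimed remainder
 */
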
      |  691 | +
      |  692 | +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
      |  693 | +
      |  694 | +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
      |  695 | +                               struct iov_iter *from)
      |  696 | +{
      |  697 | +        struct vhost_virtqueue *vq = &nvq->vq;
      |  698 | +        struct vhost_net *net = container_of(vq->dev, struct vhost_net,
      |  699 | +                                             dev);
      |  700 | +        struct socket *sock = vhost_vq_get_backend(vq);
      |  701 | +        struct page_frag *alloc_frag = &net->page_frag;
      |  702 | +        struct virtio_net_hdr *gso;
      |  703 | +        struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
      |  704 | +        struct tun_xdp_hdr *hdr;
      |  705 | +        size_t len = iov_iter_count(from);
      |  706 | +        int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
      |  707 | +        int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
      |  708 | +        int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
      |  709 | +        int sock_hlen = nvq->sock_hlen;
      |  710 | +        void *buf;
      |  711 | +        int copied;
      |  712 | +
      |  713 | +        if (unlikely(len < nvq->sock_hlen))
      |  714 | +                return -EFAULT;
      |  715 | +
      |  716 | +        if (SKB_DATA_ALIGN(len + pad) +
      |  717 | +            SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
      |  718 | +                return -ENOSPC;
      |  719 | +
      |  720 | +        buflen += SKB_DATA_ALIGN(len + pad);
      |  721 | +        alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
      |  722 | +        if (unlikely(!vhost_net_page_frag_refill(net, buflen,
      |  723 | +                                                 alloc_frag, GFP_KERNEL)))
      |  724 | +                return -ENOMEM;
      |  725 | +
      |  726 | +        buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
      |  727 | +        copied = copy_page_from_iter(alloc_frag->page,
      |  728 | +                                     alloc_frag->offset +
      |  729 | +                                     offsetof(struct tun_xdp_hdr, gso),
      |  730 | +                                     sock_hlen, from);
      |  731 | +        if (copied != sock_hlen)
      |  732 | +                return -EFAULT;
      |  733 | +
      |  734 | +        hdr = buf;
      |  735 | +        gso = &hdr->gso;
      |  736 | +
      |  737 | +        if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
      |  738 | +            vhost16_to_cpu(vq, gso->csum_start) +
      |  739 | +            vhost16_to_cpu(vq, gso->csum_offset) + 2 >
      |  740 | +            vhost16_to_cpu(vq, gso->hdr_len)) {
      |  741 | +                gso->hdr_len = cpu_to_vhost16(vq,
      |  742 | +                               vhost16_to_cpu(vq, gso->csum_start) +
      |  743 | +                               vhost16_to_cpu(vq, gso->csum_offset) + 2);
      |  744 | +
      |  745 | +                if (vhost16_to_cpu(vq, gso->hdr_len) > len)
      |  746 | +                        return -EINVAL;
      |  747 | +        }
      |  748 | +
      |  749 | +        len -= sock_hlen;
      |  750 | +        copied = copy_page_from_iter(alloc_frag->page,
      |  751 | +                                     alloc_frag->offset + pad,
      |  752 | +                                     len, from);
      |  753 | +        if (copied != len)
      |  754 | +                return -EFAULT;
      |  755 | +
      |  756 | +        xdp->data_hard_start = buf;
      |  757 | +        xdp->data = buf + pad;
      |  758 | +        xdp->data_end = xdp->data + len;
      |  759 | +        hdr->buflen = buflen;
      |  760 | +        xdp->frame_sz = buflen;
      |  761 | +
      |  762 | +        --net->refcnt_bias;
      |  763 | +        alloc_frag->offset += buflen;
      |  764 | +
      |  765 | +        ++nvq->batched_xdp;
      |  766 | +
      |  767 | +        return 0;
      |  768 | +}
      |  769 | +
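
Derived from the pad/buflen arithmetic above, each TX fragment that vhost_net_build_xdp() carves out of the page frag is laid out like this:

/*  buf (xdp->data_hard_start)
 *  +------------------------------+
 *  | struct tun_xdp_hdr           |  buflen + virtio_net_hdr (gso); the
 *  |  (sock_hlen bytes copied at  |  header lands at offsetof(..., gso)
 *  |   offsetof(tun_xdp_hdr, gso))|
 *  | rest of pad: VHOST_NET_RX_PAD|
 *  |  + optional XDP headroom     |
 *  +------------------------------+  <- xdp->data = buf + pad
 *  | packet payload (len bytes)   |
 *  +------------------------------+  <- xdp->data_end
 *  | room for skb_shared_info     |  SKB_DATA_ALIGN'd tail, so tun can
 *  +------------------------------+  build an skb around it without copying
 */
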
  537 |  770 |  static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
  538 |  771 |  {
  539 |  772 |          struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
  .. |   .. |
  550 |  783 |          size_t len, total_len = 0;
  551 |  784 |          int err;
  552 |  785 |          int sent_pkts = 0;
      |  786 | +        bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
  553 |  787 | 
  554 |  788 |          do {
  555 |  789 |                  bool busyloop_intr = false;
      |  790 | +
      |  791 | +                if (nvq->done_idx == VHOST_NET_BATCH)
      |  792 | +                        vhost_tx_batch(net, nvq, sock, &msg);
  556 |  793 | 
  557 |  794 |                  head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
  558 |  795 |                                     &busyloop_intr);
  .. |   .. |
  571 |  808 |                          break;
  572 |  809 |                  }
  573 |  810 | 
  574 |      | -                vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
  575 |      | -                vq->heads[nvq->done_idx].len = 0;
  576 |      | -
  577 |  811 |                  total_len += len;
  578 |      | -                if (tx_can_batch(vq, total_len))
  579 |      | -                        msg.msg_flags |= MSG_MORE;
  580 |      | -                else
  581 |      | -                        msg.msg_flags &= ~MSG_MORE;
      |  812 | +
      |  813 | +                /* For simplicity, TX batching is only enabled if
      |  814 | +                 * sndbuf is unlimited.
      |  815 | +                 */
      |  816 | +                if (sock_can_batch) {
      |  817 | +                        err = vhost_net_build_xdp(nvq, &msg.msg_iter);
      |  818 | +                        if (!err) {
      |  819 | +                                goto done;
      |  820 | +                        } else if (unlikely(err != -ENOSPC)) {
      |  821 | +                                vhost_tx_batch(net, nvq, sock, &msg);
      |  822 | +                                vhost_discard_vq_desc(vq, 1);
      |  823 | +                                vhost_net_enable_vq(net, vq);
      |  824 | +                                break;
      |  825 | +                        }
      |  826 | +
      |  827 | +                        /* We can't build XDP buff, go for single
      |  828 | +                         * packet path but let's flush batched
      |  829 | +                         * packets.
      |  830 | +                         */
      |  831 | +                        vhost_tx_batch(net, nvq, sock, &msg);
      |  832 | +                        msg.msg_control = NULL;
      |  833 | +                } else {
      |  834 | +                        if (tx_can_batch(vq, total_len))
      |  835 | +                                msg.msg_flags |= MSG_MORE;
      |  836 | +                        else
      |  837 | +                                msg.msg_flags &= ~MSG_MORE;
      |  838 | +                }
  582 |  839 | 
  583 |  840 |                  /* TODO: Check specific error and bomb out unless ENOBUFS? */
  584 |  841 |                  err = sock->ops->sendmsg(sock, &msg, len);
  .. |   .. |
  590 |  847 |                  if (err != len)
  591 |  848 |                          pr_debug("Truncated TX packet: len %d != %zd\n",
  592 |  849 |                                   err, len);
  593 |      | -                if (++nvq->done_idx >= VHOST_NET_BATCH)
  594 |      | -                        vhost_net_signal_used(nvq);
      |  850 | +done:
      |  851 | +                vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
      |  852 | +                vq->heads[nvq->done_idx].len = 0;
      |  853 | +                ++nvq->done_idx;
  595 |  854 |          } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
  596 |  855 | 
  597 |      | -        vhost_net_signal_used(nvq);
      |  856 | +        vhost_tx_batch(net, nvq, sock, &msg);
  598 |  857 |  }
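
As a reading aid (not part of the patch), the counters in handle_tx_copy() obey a simple invariant across iterations:

/* 0 <= nvq->batched_xdp <= nvq->done_idx <= VHOST_NET_BATCH
 *
 * done_idx counts descriptors awaiting a used-ring update; batched_xdp is
 * the subset built as XDP buffs but not yet pushed via sendmsg().  Both the
 * single-packet path and the XDP path funnel through the "done:" label, and
 * vhost_tx_batch() flushes everything: it sends the XDP array (if any) and
 * then signals the used ring through vhost_net_signal_used().
 */
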
  599 |  858 | 
  600 |  859 |  static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
  .. |   .. |
  610 |  869 |                  .msg_controllen = 0,
  611 |  870 |                  .msg_flags = MSG_DONTWAIT,
  612 |  871 |          };
      |  872 | +        struct tun_msg_ctl ctl;
  613 |  873 |          size_t len, total_len = 0;
  614 |  874 |          int err;
  615 |      | -        struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
      |  875 | +        struct vhost_net_ubuf_ref *ubufs;
  616 |  876 |          struct ubuf_info *ubuf;
  617 |  877 |          bool zcopy_used;
  618 |  878 |          int sent_pkts = 0;
  .. |   .. |
  653 |  913 |                          ubuf->ctx = nvq->ubufs;
  654 |  914 |                          ubuf->desc = nvq->upend_idx;
  655 |  915 |                          refcount_set(&ubuf->refcnt, 1);
  656 |      | -                        msg.msg_control = ubuf;
  657 |      | -                        msg.msg_controllen = sizeof(ubuf);
      |  916 | +                        msg.msg_control = &ctl;
      |  917 | +                        ctl.type = TUN_MSG_UBUF;
      |  918 | +                        ctl.ptr = ubuf;
      |  919 | +                        msg.msg_controllen = sizeof(ctl);
  658 |  920 |                          ubufs = nvq->ubufs;
  659 |  921 |                          atomic_inc(&ubufs->refcount);
  660 |  922 |                          nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
  .. |   .. |
  702 |  964 |          struct vhost_virtqueue *vq = &nvq->vq;
  703 |  965 |          struct socket *sock;
  704 |  966 | 
  705 |      | -        mutex_lock(&vq->mutex);
  706 |      | -        sock = vq->private_data;
      |  967 | +        mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
      |  968 | +        sock = vhost_vq_get_backend(vq);
  707 |  969 |          if (!sock)
  708 |  970 |                  goto out;
  709 |  971 | 
  710 |      | -        if (!vq_iotlb_prefetch(vq))
      |  972 | +        if (!vq_meta_prefetch(vq))
  711 |  973 |                  goto out;
  712 |  974 | 
  713 |  975 |          vhost_disable_notify(&net->dev, vq);
  .. |   .. |
  743 | 1005 |          return len;
  744 | 1006 |  }
  745 | 1007 | 
  746 |      | -static int sk_has_rx_data(struct sock *sk)
  747 |      | -{
  748 |      | -        struct socket *sock = sk->sk_socket;
  749 |      | -
  750 |      | -        if (sock->ops->peek_len)
  751 |      | -                return sock->ops->peek_len(sock);
  752 |      | -
  753 |      | -        return skb_queue_empty(&sk->sk_receive_queue);
  754 |      | -}
  755 |      | -
  756 | 1008 |  static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
  757 | 1009 |                                        bool *busyloop_intr)
  758 | 1010 |  {
  .. |   .. |
  760 | 1012 |          struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
  761 | 1013 |          struct vhost_virtqueue *rvq = &rnvq->vq;
  762 | 1014 |          struct vhost_virtqueue *tvq = &tnvq->vq;
  763 |      | -        unsigned long uninitialized_var(endtime);
  764 | 1015 |          int len = peek_head_len(rnvq, sk);
  765 | 1016 | 
  766 |      | -        if (!len && tvq->busyloop_timeout) {
      | 1017 | +        if (!len && rvq->busyloop_timeout) {
  767 | 1018 |                  /* Flush batched heads first */
  768 | 1019 |                  vhost_net_signal_used(rnvq);
  769 | 1020 |                  /* Both tx vq and rx socket were polled here */
  770 |      | -                mutex_lock_nested(&tvq->mutex, 1);
  771 |      | -                vhost_disable_notify(&net->dev, tvq);
  772 |      | -
  773 |      | -                preempt_disable();
  774 |      | -                endtime = busy_clock() + tvq->busyloop_timeout;
  775 |      | -
  776 |      | -                while (vhost_can_busy_poll(endtime)) {
  777 |      | -                        if (vhost_has_work(&net->dev)) {
  778 |      | -                                *busyloop_intr = true;
  779 |      | -                                break;
  780 |      | -                        }
  781 |      | -                        if ((sk_has_rx_data(sk) &&
  782 |      | -                             !vhost_vq_avail_empty(&net->dev, rvq)) ||
  783 |      | -                            !vhost_vq_avail_empty(&net->dev, tvq))
  784 |      | -                                break;
  785 |      | -                        cpu_relax();
  786 |      | -                }
  787 |      | -
  788 |      | -                preempt_enable();
  789 |      | -
  790 |      | -                if (!vhost_vq_avail_empty(&net->dev, tvq)) {
  791 |      | -                        vhost_poll_queue(&tvq->poll);
  792 |      | -                } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
  793 |      | -                        vhost_disable_notify(&net->dev, tvq);
  794 |      | -                        vhost_poll_queue(&tvq->poll);
  795 |      | -                }
  796 |      | -
  797 |      | -                mutex_unlock(&tvq->mutex);
      | 1021 | +                vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
  798 | 1022 | 
  799 | 1023 |                  len = peek_head_len(rnvq, sk);
  800 | 1024 |          }
  .. |   .. |
  828 | 1052 |          /* len is always initialized before use since we are always called with
  829 | 1053 |           * datalen > 0.
  830 | 1054 |           */
  831 |      | -        u32 uninitialized_var(len);
      | 1055 | +        u32 len;
  832 | 1056 | 
  833 | 1057 |          while (datalen > 0 && headcount < quota) {
  834 | 1058 |                  if (unlikely(seg >= UIO_MAXIOV)) {
  .. |   .. |
  885 | 1109 |  {
  886 | 1110 |          struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
  887 | 1111 |          struct vhost_virtqueue *vq = &nvq->vq;
  888 |      | -        unsigned uninitialized_var(in), log;
      | 1112 | +        unsigned in, log;
  889 | 1113 |          struct vhost_log *vq_log;
  890 | 1114 |          struct msghdr msg = {
  891 | 1115 |                  .msg_name = NULL,
  .. |   .. |
  909 | 1133 |          __virtio16 num_buffers;
  910 | 1134 |          int recv_pkts = 0;
  911 | 1135 | 
  912 |      | -        mutex_lock_nested(&vq->mutex, 0);
  913 |      | -        sock = vq->private_data;
      | 1136 | +        mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
      | 1137 | +        sock = vhost_vq_get_backend(vq);
  914 | 1138 |          if (!sock)
  915 | 1139 |                  goto out;
  916 | 1140 | 
  917 |      | -        if (!vq_iotlb_prefetch(vq))
      | 1141 | +        if (!vq_meta_prefetch(vq))
  918 | 1142 |                  goto out;
  919 | 1143 | 
  920 | 1144 |          vhost_disable_notify(&net->dev, vq);
  .. |   .. |
 1065 | 1289 |          struct vhost_dev *dev;
 1066 | 1290 |          struct vhost_virtqueue **vqs;
 1067 | 1291 |          void **queue;
      | 1292 | +        struct xdp_buff *xdp;
 1068 | 1293 |          int i;
 1069 | 1294 | 
 1070 | 1295 |          n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
  .. |   .. |
 1085 | 1310 |          }
 1086 | 1311 |          n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;
 1087 | 1312 | 
      | 1313 | +        xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL);
      | 1314 | +        if (!xdp) {
      | 1315 | +                kfree(vqs);
      | 1316 | +                kvfree(n);
      | 1317 | +                kfree(queue);
      | 1318 | +                return -ENOMEM;
      | 1319 | +        }
      | 1320 | +        n->vqs[VHOST_NET_VQ_TX].xdp = xdp;
      | 1321 | +
 1088 | 1322 |          dev = &n->dev;
 1089 | 1323 |          vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
 1090 | 1324 |          vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
  .. |   .. |
 1095 | 1329 |                  n->vqs[i].ubuf_info = NULL;
 1096 | 1330 |                  n->vqs[i].upend_idx = 0;
 1097 | 1331 |                  n->vqs[i].done_idx = 0;
      | 1332 | +                n->vqs[i].batched_xdp = 0;
 1098 | 1333 |                  n->vqs[i].vhost_hlen = 0;
 1099 | 1334 |                  n->vqs[i].sock_hlen = 0;
 1100 | 1335 |                  n->vqs[i].rx_ring = NULL;
  .. |   .. |
 1102 | 1337 |          }
 1103 | 1338 |          vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
 1104 | 1339 |                         UIO_MAXIOV + VHOST_NET_BATCH,
 1105 |      | -                       VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT);
      | 1340 | +                       VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
      | 1341 | +                       NULL);
 1106 | 1342 | 
 1107 | 1343 |          vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
 1108 | 1344 |          vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
 1109 | 1345 | 
 1110 | 1346 |          f->private_data = n;
      | 1347 | +        n->page_frag.page = NULL;
      | 1348 | +        n->refcnt_bias = 0;
 1111 | 1349 | 
 1112 | 1350 |          return 0;
 1113 | 1351 |  }
  .. |   .. |
 1120 | 1358 |                  container_of(vq, struct vhost_net_virtqueue, vq);
 1121 | 1359 | 
 1122 | 1360 |          mutex_lock(&vq->mutex);
 1123 |      | -        sock = vq->private_data;
      | 1361 | +        sock = vhost_vq_get_backend(vq);
 1124 | 1362 |          vhost_net_disable_vq(n, vq);
 1125 |      | -        vq->private_data = NULL;
      | 1363 | +        vhost_vq_set_backend(vq, NULL);
 1126 | 1364 |          vhost_net_buf_unproduce(nvq);
 1127 | 1365 |          nvq->rx_ring = NULL;
 1128 | 1366 |          mutex_unlock(&vq->mutex);
  .. |   .. |
 1175 | 1413 |          if (rx_sock)
 1176 | 1414 |                  sockfd_put(rx_sock);
 1177 | 1415 |          /* Make sure no callbacks are outstanding */
 1178 |      | -        synchronize_rcu_bh();
      | 1416 | +        synchronize_rcu();
 1179 | 1417 |          /* We do an extra flush before freeing memory,
 1180 | 1418 |           * since jobs can re-queue themselves. */
 1181 | 1419 |          vhost_net_flush(n);
 1182 | 1420 |          kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
      | 1421 | +        kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
 1183 | 1422 |          kfree(n->dev.vqs);
      | 1423 | +        if (n->page_frag.page)
      | 1424 | +                __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
 1184 | 1425 |          kvfree(n);
 1185 | 1426 |          return 0;
 1186 | 1427 |  }
  .. |   .. |
 1209 | 1450 |                  return ERR_PTR(r);
 1210 | 1451 |  }
 1211 | 1452 | 
 1212 |      | -static struct ptr_ring *get_tap_ptr_ring(int fd)
      | 1453 | +static struct ptr_ring *get_tap_ptr_ring(struct file *file)
 1213 | 1454 |  {
 1214 | 1455 |          struct ptr_ring *ring;
 1215 |      | -        struct file *file = fget(fd);
 1216 |      | -
 1217 |      | -        if (!file)
 1218 |      | -                return NULL;
 1219 | 1456 |          ring = tun_get_tx_ring(file);
 1220 | 1457 |          if (!IS_ERR(ring))
 1221 | 1458 |                  goto out;
  .. |   .. |
 1224 | 1461 |                  goto out;
 1225 | 1462 |          ring = NULL;
 1226 | 1463 |  out:
 1227 |      | -        fput(file);
 1228 | 1464 |          return ring;
 1229 | 1465 |  }
  .. |   .. |
 1281 | 1517 |          nvq = &n->vqs[index];
 1282 | 1518 |          mutex_lock(&vq->mutex);
 1283 | 1519 | 
      | 1520 | +        if (fd == -1)
      | 1521 | +                vhost_clear_msg(&n->dev);
      | 1522 | +
 1284 | 1523 |          /* Verify that ring has been setup correctly. */
 1285 | 1524 |          if (!vhost_vq_access_ok(vq)) {
 1286 | 1525 |                  r = -EFAULT;
  .. |   .. |
 1293 | 1532 |          }
 1294 | 1533 | 
 1295 | 1534 |          /* start polling new socket */
 1296 |      | -        oldsock = vq->private_data;
      | 1535 | +        oldsock = vhost_vq_get_backend(vq);
 1297 | 1536 |          if (sock != oldsock) {
 1298 | 1537 |                  ubufs = vhost_net_ubuf_alloc(vq,
 1299 | 1538 |                                               sock && vhost_sock_zcopy(sock));
  .. |   .. |
 1303 | 1542 |                  }
 1304 | 1543 | 
 1305 | 1544 |                  vhost_net_disable_vq(n, vq);
 1306 |      | -                vq->private_data = sock;
      | 1545 | +                vhost_vq_set_backend(vq, sock);
 1307 | 1546 |                  vhost_net_buf_unproduce(nvq);
 1308 | 1547 |                  r = vhost_vq_init_access(vq);
 1309 | 1548 |                  if (r)
  .. |   .. |
 1311 | 1550 |                  r = vhost_net_enable_vq(n, vq);
 1312 | 1551 |                  if (r)
 1313 | 1552 |                          goto err_used;
 1314 |      | -                if (index == VHOST_NET_VQ_RX)
 1315 |      | -                        nvq->rx_ring = get_tap_ptr_ring(fd);
      | 1553 | +                if (index == VHOST_NET_VQ_RX) {
      | 1554 | +                        if (sock)
      | 1555 | +                                nvq->rx_ring = get_tap_ptr_ring(sock->file);
      | 1556 | +                        else
      | 1557 | +                                nvq->rx_ring = NULL;
      | 1558 | +                }
 1316 | 1559 | 
 1317 | 1560 |                  oldubufs = nvq->ubufs;
 1318 | 1561 |                  nvq->ubufs = ubufs;
  .. |   .. |
 1340 | 1583 |          return 0;
 1341 | 1584 | 
 1342 | 1585 |  err_used:
 1343 |      | -        vq->private_data = oldsock;
      | 1586 | +        vhost_vq_set_backend(vq, oldsock);
 1344 | 1587 |          vhost_net_enable_vq(n, vq);
 1345 | 1588 |          if (ubufs)
 1346 | 1589 |                  vhost_net_ubuf_put_wait_and_free(ubufs);
  .. |   .. |
 1359 | 1602 |          struct socket *tx_sock = NULL;
 1360 | 1603 |          struct socket *rx_sock = NULL;
 1361 | 1604 |          long err;
 1362 |      | -        struct vhost_umem *umem;
      | 1605 | +        struct vhost_iotlb *umem;
 1363 | 1606 | 
 1364 | 1607 |          mutex_lock(&n->dev.mutex);
 1365 | 1608 |          err = vhost_dev_check_owner(&n->dev);
  .. |   .. |
 1382 | 1625 |          if (rx_sock)
 1383 | 1626 |                  sockfd_put(rx_sock);
 1384 | 1627 |          return err;
 1385 |      | -}
 1386 |      | -
 1387 |      | -static int vhost_net_set_backend_features(struct vhost_net *n, u64 features)
 1388 |      | -{
 1389 |      | -        int i;
 1390 |      | -
 1391 |      | -        mutex_lock(&n->dev.mutex);
 1392 |      | -        for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
 1393 |      | -                mutex_lock(&n->vqs[i].vq.mutex);
 1394 |      | -                n->vqs[i].vq.acked_backend_features = features;
 1395 |      | -                mutex_unlock(&n->vqs[i].vq.mutex);
 1396 |      | -        }
 1397 |      | -        mutex_unlock(&n->dev.mutex);
 1398 |      | -
 1399 |      | -        return 0;
 1400 | 1628 |  }
 1401 | 1629 | 
 1402 | 1630 |  static int vhost_net_set_features(struct vhost_net *n, u64 features)
  .. |   .. |
 1422 | 1650 |              !vhost_log_access_ok(&n->dev))
 1423 | 1651 |                  goto out_unlock;
 1424 | 1652 | 
 1425 |      | -        if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
      | 1653 | +        if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
 1426 | 1654 |                  if (vhost_init_device_iotlb(&n->dev, true))
 1427 | 1655 |                          goto out_unlock;
 1428 | 1656 |          }
  .. |   .. |
 1499 | 1727 |                          return -EFAULT;
 1500 | 1728 |                  if (features & ~VHOST_NET_BACKEND_FEATURES)
 1501 | 1729 |                          return -EOPNOTSUPP;
 1502 |      | -                return vhost_net_set_backend_features(n, features);
      | 1730 | +                vhost_set_backend_features(&n->dev, features);
      | 1731 | +                return 0;
 1503 | 1732 |          case VHOST_RESET_OWNER:
 1504 | 1733 |                  return vhost_net_reset_owner(n);
 1505 | 1734 |          case VHOST_SET_OWNER:
  .. |   .. |
 1515 | 1744 |                  return r;
 1516 | 1745 |          }
 1517 | 1746 |  }
 1518 |      | -
 1519 |      | -#ifdef CONFIG_COMPAT
 1520 |      | -static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
 1521 |      | -                                   unsigned long arg)
 1522 |      | -{
 1523 |      | -        return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
 1524 |      | -}
 1525 |      | -#endif
 1526 | 1747 | 
 1527 | 1748 |  static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
 1528 | 1749 |  {
  .. |   .. |
 1559 | 1780 |          .write_iter = vhost_net_chr_write_iter,
 1560 | 1781 |          .poll = vhost_net_chr_poll,
 1561 | 1782 |          .unlocked_ioctl = vhost_net_ioctl,
 1562 |      | -#ifdef CONFIG_COMPAT
 1563 |      | -        .compat_ioctl = vhost_net_compat_ioctl,
 1564 |      | -#endif
      | 1783 | +        .compat_ioctl = compat_ptr_ioctl,
 1565 | 1784 |          .open = vhost_net_open,
 1566 | 1785 |          .llseek = noop_llseek,
 1567 | 1786 |  };