From 072de836f53be56a70cecf70b43ae43b7ce17376 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 11 Dec 2023 10:08:36 +0000
Subject: [PATCH] mk-rootfs.sh
---
kernel/drivers/vhost/net.c | 484 ++++++++++++++++++++++++++++++++++++++--------------
1 files changed, 350 insertions(+), 134 deletions(-)
diff --git a/kernel/drivers/vhost/net.c b/kernel/drivers/vhost/net.c
index 0c7bbc9..5beb207 100644
--- a/kernel/drivers/vhost/net.c
+++ b/kernel/drivers/vhost/net.c
@@ -1,7 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2009 Red Hat, Inc.
* Author: Michael S. Tsirkin <mst@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.
*
* virtio-net server in host kernel.
*/
@@ -74,7 +73,7 @@
VHOST_NET_FEATURES = VHOST_FEATURES |
(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
(1ULL << VIRTIO_NET_F_MRG_RXBUF) |
- (1ULL << VIRTIO_F_IOMMU_PLATFORM)
+ (1ULL << VIRTIO_F_ACCESS_PLATFORM)
};
enum {
@@ -116,6 +115,8 @@
* For RX, number of batched heads
*/
int done_idx;
+ /* Number of XDP frames batched */
+ int batched_xdp;
/* an array of userspace buffers info */
struct ubuf_info *ubuf_info;
/* Reference counting for outstanding ubufs.
@@ -123,6 +124,8 @@
struct vhost_net_ubuf_ref *ubufs;
struct ptr_ring *rx_ring;
struct vhost_net_buf rxq;
+ /* Batched XDP buffs */
+ struct xdp_buff *xdp;
};
struct vhost_net {
@@ -137,6 +140,10 @@
unsigned tx_zcopy_err;
/* Flush in progress. Protected by tx vq lock. */
bool tx_flush;
+ /* Private page frag */
+ struct page_frag page_frag;
+ /* Refcount bias of page frag */
+ int refcnt_bias;
};
static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -338,6 +345,11 @@
sock_flag(sock->sk, SOCK_ZEROCOPY);
}
+static bool vhost_sock_xdp(struct socket *sock)
+{
+ return sock_flag(sock->sk, SOCK_XDP);
+}
+
/* In case of DMA done not in order in lower device driver for some reason.
* upend_idx is used to track end of used idx, done_idx is used to track head
* of used idx. Once lower device DMA done contiguously, we will signal KVM
@@ -412,7 +424,7 @@
struct vhost_net_virtqueue *nvq =
container_of(vq, struct vhost_net_virtqueue, vq);
struct vhost_poll *poll = n->poll + (nvq - n->vqs);
- if (!vq->private_data)
+ if (!vhost_vq_get_backend(vq))
return;
vhost_poll_stop(poll);
}
@@ -425,7 +437,7 @@
struct vhost_poll *poll = n->poll + (nvq - n->vqs);
struct socket *sock;
- sock = vq->private_data;
+ sock = vhost_vq_get_backend(vq);
if (!sock)
return 0;
@@ -444,32 +456,138 @@
nvq->done_idx = 0;
}
-static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
- struct vhost_net_virtqueue *nvq,
- unsigned int *out_num, unsigned int *in_num,
- bool *busyloop_intr)
+static void vhost_tx_batch(struct vhost_net *net,
+ struct vhost_net_virtqueue *nvq,
+ struct socket *sock,
+ struct msghdr *msghdr)
{
- struct vhost_virtqueue *vq = &nvq->vq;
- unsigned long uninitialized_var(endtime);
- int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ struct tun_msg_ctl ctl = {
+ .type = TUN_MSG_PTR,
+ .num = nvq->batched_xdp,
+ .ptr = nvq->xdp,
+ };
+ int i, err;
+
+ if (nvq->batched_xdp == 0)
+ goto signal_used;
+
+ msghdr->msg_control = &ctl;
+ msghdr->msg_controllen = sizeof(ctl);
+ err = sock->ops->sendmsg(sock, msghdr, 0);
+ if (unlikely(err < 0)) {
+ vq_err(&nvq->vq, "Fail to batch sending packets\n");
+
+ /* free pages owned by XDP; since this is an unlikely error path,
+ * keep it simple and avoid more complex bulk update for the
+ * used pages
+ */
+ for (i = 0; i < nvq->batched_xdp; ++i)
+ put_page(virt_to_head_page(nvq->xdp[i].data));
+ nvq->batched_xdp = 0;
+ nvq->done_idx = 0;
+ return;
+ }
+
+signal_used:
+ vhost_net_signal_used(nvq);
+ nvq->batched_xdp = 0;
+}
+
+static int sock_has_rx_data(struct socket *sock)
+{
+ if (unlikely(!sock))
+ return 0;
+
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
+ return skb_queue_empty(&sock->sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll_try_queue(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ if (!vhost_vq_avail_empty(&net->dev, vq)) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ vhost_disable_notify(&net->dev, vq);
+ vhost_poll_queue(&vq->poll);
+ }
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+ struct vhost_virtqueue *rvq,
+ struct vhost_virtqueue *tvq,
+ bool *busyloop_intr,
+ bool poll_rx)
+{
+ unsigned long busyloop_timeout;
+ unsigned long endtime;
+ struct socket *sock;
+ struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
+
+ /* Try to hold the vq mutex of the paired virtqueue. We can't
+ * use mutex_lock() here since we could not guarantee a
+ * consistenet lock ordering.
+ */
+ if (!mutex_trylock(&vq->mutex))
+ return;
+
+ vhost_disable_notify(&net->dev, vq);
+ sock = vhost_vq_get_backend(rvq);
+
+ busyloop_timeout = poll_rx ? rvq->busyloop_timeout:
+ tvq->busyloop_timeout;
+
+ preempt_disable();
+ endtime = busy_clock() + busyloop_timeout;
+
+ while (vhost_can_busy_poll(endtime)) {
+ if (vhost_has_work(&net->dev)) {
+ *busyloop_intr = true;
+ break;
+ }
+
+ if ((sock_has_rx_data(sock) &&
+ !vhost_vq_avail_empty(&net->dev, rvq)) ||
+ !vhost_vq_avail_empty(&net->dev, tvq))
+ break;
+
+ cpu_relax();
+ }
+
+ preempt_enable();
+
+ if (poll_rx || sock_has_rx_data(sock))
+ vhost_net_busy_poll_try_queue(net, vq);
+ else if (!poll_rx) /* On tx here, sock has no rx data. */
+ vhost_enable_notify(&net->dev, rvq);
+
+ mutex_unlock(&vq->mutex);
+}
+
+static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
+ struct vhost_net_virtqueue *tnvq,
+ unsigned int *out_num, unsigned int *in_num,
+ struct msghdr *msghdr, bool *busyloop_intr)
+{
+ struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
+ struct vhost_virtqueue *rvq = &rnvq->vq;
+ struct vhost_virtqueue *tvq = &tnvq->vq;
+
+ int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
out_num, in_num, NULL, NULL);
- if (r == vq->num && vq->busyloop_timeout) {
- if (!vhost_sock_zcopy(vq->private_data))
- vhost_net_signal_used(nvq);
- preempt_disable();
- endtime = busy_clock() + vq->busyloop_timeout;
- while (vhost_can_busy_poll(endtime)) {
- if (vhost_has_work(vq->dev)) {
- *busyloop_intr = true;
- break;
- }
- if (!vhost_vq_avail_empty(vq->dev, vq))
- break;
- cpu_relax();
- }
- preempt_enable();
- r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ if (r == tvq->num && tvq->busyloop_timeout) {
+ /* Flush batched packets first */
+ if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq)))
+ vhost_tx_batch(net, tnvq,
+ vhost_vq_get_backend(tvq),
+ msghdr);
+
+ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
+
+ r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
out_num, in_num, NULL, NULL);
}
@@ -506,7 +624,7 @@
struct vhost_virtqueue *vq = &nvq->vq;
int ret;
- ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr);
+ ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
if (ret < 0 || ret == vq->num)
return ret;
@@ -534,6 +652,121 @@
!vhost_vq_avail_empty(vq->dev, vq);
}
+#define SKB_FRAG_PAGE_ORDER get_order(32768)
+
+static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
+ struct page_frag *pfrag, gfp_t gfp)
+{
+ if (pfrag->page) {
+ if (pfrag->offset + sz <= pfrag->size)
+ return true;
+ __page_frag_cache_drain(pfrag->page, net->refcnt_bias);
+ }
+
+ pfrag->offset = 0;
+ net->refcnt_bias = 0;
+ if (SKB_FRAG_PAGE_ORDER) {
+ /* Avoid direct reclaim but allow kswapd to wake */
+ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP | __GFP_NOWARN |
+ __GFP_NORETRY,
+ SKB_FRAG_PAGE_ORDER);
+ if (likely(pfrag->page)) {
+ pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
+ goto done;
+ }
+ }
+ pfrag->page = alloc_page(gfp);
+ if (likely(pfrag->page)) {
+ pfrag->size = PAGE_SIZE;
+ goto done;
+ }
+ return false;
+
+done:
+ net->refcnt_bias = USHRT_MAX;
+ page_ref_add(pfrag->page, USHRT_MAX - 1);
+ return true;
+}
+
+#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
+static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
+ struct iov_iter *from)
+{
+ struct vhost_virtqueue *vq = &nvq->vq;
+ struct vhost_net *net = container_of(vq->dev, struct vhost_net,
+ dev);
+ struct socket *sock = vhost_vq_get_backend(vq);
+ struct page_frag *alloc_frag = &net->page_frag;
+ struct virtio_net_hdr *gso;
+ struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
+ struct tun_xdp_hdr *hdr;
+ size_t len = iov_iter_count(from);
+ int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
+ int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
+ int sock_hlen = nvq->sock_hlen;
+ void *buf;
+ int copied;
+
+ if (unlikely(len < nvq->sock_hlen))
+ return -EFAULT;
+
+ if (SKB_DATA_ALIGN(len + pad) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+ return -ENOSPC;
+
+ buflen += SKB_DATA_ALIGN(len + pad);
+ alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
+ if (unlikely(!vhost_net_page_frag_refill(net, buflen,
+ alloc_frag, GFP_KERNEL)))
+ return -ENOMEM;
+
+ buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+ copied = copy_page_from_iter(alloc_frag->page,
+ alloc_frag->offset +
+ offsetof(struct tun_xdp_hdr, gso),
+ sock_hlen, from);
+ if (copied != sock_hlen)
+ return -EFAULT;
+
+ hdr = buf;
+ gso = &hdr->gso;
+
+ if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+ vhost16_to_cpu(vq, gso->csum_start) +
+ vhost16_to_cpu(vq, gso->csum_offset) + 2 >
+ vhost16_to_cpu(vq, gso->hdr_len)) {
+ gso->hdr_len = cpu_to_vhost16(vq,
+ vhost16_to_cpu(vq, gso->csum_start) +
+ vhost16_to_cpu(vq, gso->csum_offset) + 2);
+
+ if (vhost16_to_cpu(vq, gso->hdr_len) > len)
+ return -EINVAL;
+ }
+
+ len -= sock_hlen;
+ copied = copy_page_from_iter(alloc_frag->page,
+ alloc_frag->offset + pad,
+ len, from);
+ if (copied != len)
+ return -EFAULT;
+
+ xdp->data_hard_start = buf;
+ xdp->data = buf + pad;
+ xdp->data_end = xdp->data + len;
+ hdr->buflen = buflen;
+ xdp->frame_sz = buflen;
+
+ --net->refcnt_bias;
+ alloc_frag->offset += buflen;
+
+ ++nvq->batched_xdp;
+
+ return 0;
+}
+
static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -550,9 +783,13 @@
size_t len, total_len = 0;
int err;
int sent_pkts = 0;
+ bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
do {
bool busyloop_intr = false;
+
+ if (nvq->done_idx == VHOST_NET_BATCH)
+ vhost_tx_batch(net, nvq, sock, &msg);
head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
&busyloop_intr);
@@ -571,14 +808,34 @@
break;
}
- vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
- vq->heads[nvq->done_idx].len = 0;
-
total_len += len;
- if (tx_can_batch(vq, total_len))
- msg.msg_flags |= MSG_MORE;
- else
- msg.msg_flags &= ~MSG_MORE;
+
+ /* For simplicity, TX batching is only enabled if
+ * sndbuf is unlimited.
+ */
+ if (sock_can_batch) {
+ err = vhost_net_build_xdp(nvq, &msg.msg_iter);
+ if (!err) {
+ goto done;
+ } else if (unlikely(err != -ENOSPC)) {
+ vhost_tx_batch(net, nvq, sock, &msg);
+ vhost_discard_vq_desc(vq, 1);
+ vhost_net_enable_vq(net, vq);
+ break;
+ }
+
+ /* We can't build XDP buff, go for single
+ * packet path but let's flush batched
+ * packets.
+ */
+ vhost_tx_batch(net, nvq, sock, &msg);
+ msg.msg_control = NULL;
+ } else {
+ if (tx_can_batch(vq, total_len))
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags &= ~MSG_MORE;
+ }
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(sock, &msg, len);
@@ -590,11 +847,13 @@
if (err != len)
pr_debug("Truncated TX packet: len %d != %zd\n",
err, len);
- if (++nvq->done_idx >= VHOST_NET_BATCH)
- vhost_net_signal_used(nvq);
+done:
+ vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
+ vq->heads[nvq->done_idx].len = 0;
+ ++nvq->done_idx;
} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
- vhost_net_signal_used(nvq);
+ vhost_tx_batch(net, nvq, sock, &msg);
}
static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
@@ -610,9 +869,10 @@
.msg_controllen = 0,
.msg_flags = MSG_DONTWAIT,
};
+ struct tun_msg_ctl ctl;
size_t len, total_len = 0;
int err;
- struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+ struct vhost_net_ubuf_ref *ubufs;
struct ubuf_info *ubuf;
bool zcopy_used;
int sent_pkts = 0;
@@ -653,8 +913,10 @@
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
refcount_set(&ubuf->refcnt, 1);
- msg.msg_control = ubuf;
- msg.msg_controllen = sizeof(ubuf);
+ msg.msg_control = &ctl;
+ ctl.type = TUN_MSG_UBUF;
+ ctl.ptr = ubuf;
+ msg.msg_controllen = sizeof(ctl);
ubufs = nvq->ubufs;
atomic_inc(&ubufs->refcount);
nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
@@ -702,12 +964,12 @@
struct vhost_virtqueue *vq = &nvq->vq;
struct socket *sock;
- mutex_lock(&vq->mutex);
- sock = vq->private_data;
+ mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
+ sock = vhost_vq_get_backend(vq);
if (!sock)
goto out;
- if (!vq_iotlb_prefetch(vq))
+ if (!vq_meta_prefetch(vq))
goto out;
vhost_disable_notify(&net->dev, vq);
@@ -743,16 +1005,6 @@
return len;
}
-static int sk_has_rx_data(struct sock *sk)
-{
- struct socket *sock = sk->sk_socket;
-
- if (sock->ops->peek_len)
- return sock->ops->peek_len(sock);
-
- return skb_queue_empty(&sk->sk_receive_queue);
-}
-
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
bool *busyloop_intr)
{
@@ -760,41 +1012,13 @@
struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *rvq = &rnvq->vq;
struct vhost_virtqueue *tvq = &tnvq->vq;
- unsigned long uninitialized_var(endtime);
int len = peek_head_len(rnvq, sk);
- if (!len && tvq->busyloop_timeout) {
+ if (!len && rvq->busyloop_timeout) {
/* Flush batched heads first */
vhost_net_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&tvq->mutex, 1);
- vhost_disable_notify(&net->dev, tvq);
-
- preempt_disable();
- endtime = busy_clock() + tvq->busyloop_timeout;
-
- while (vhost_can_busy_poll(endtime)) {
- if (vhost_has_work(&net->dev)) {
- *busyloop_intr = true;
- break;
- }
- if ((sk_has_rx_data(sk) &&
- !vhost_vq_avail_empty(&net->dev, rvq)) ||
- !vhost_vq_avail_empty(&net->dev, tvq))
- break;
- cpu_relax();
- }
-
- preempt_enable();
-
- if (!vhost_vq_avail_empty(&net->dev, tvq)) {
- vhost_poll_queue(&tvq->poll);
- } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
- vhost_disable_notify(&net->dev, tvq);
- vhost_poll_queue(&tvq->poll);
- }
-
- mutex_unlock(&tvq->mutex);
+ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
len = peek_head_len(rnvq, sk);
}
@@ -828,7 +1052,7 @@
/* len is always initialized before use since we are always called with
* datalen > 0.
*/
- u32 uninitialized_var(len);
+ u32 len;
while (datalen > 0 && headcount < quota) {
if (unlikely(seg >= UIO_MAXIOV)) {
@@ -885,7 +1109,7 @@
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_virtqueue *vq = &nvq->vq;
- unsigned uninitialized_var(in), log;
+ unsigned in, log;
struct vhost_log *vq_log;
struct msghdr msg = {
.msg_name = NULL,
@@ -909,12 +1133,12 @@
__virtio16 num_buffers;
int recv_pkts = 0;
- mutex_lock_nested(&vq->mutex, 0);
- sock = vq->private_data;
+ mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
+ sock = vhost_vq_get_backend(vq);
if (!sock)
goto out;
- if (!vq_iotlb_prefetch(vq))
+ if (!vq_meta_prefetch(vq))
goto out;
vhost_disable_notify(&net->dev, vq);
@@ -1065,6 +1289,7 @@
struct vhost_dev *dev;
struct vhost_virtqueue **vqs;
void **queue;
+ struct xdp_buff *xdp;
int i;
n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
@@ -1085,6 +1310,15 @@
}
n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;
+ xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL);
+ if (!xdp) {
+ kfree(vqs);
+ kvfree(n);
+ kfree(queue);
+ return -ENOMEM;
+ }
+ n->vqs[VHOST_NET_VQ_TX].xdp = xdp;
+
dev = &n->dev;
vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
@@ -1095,6 +1329,7 @@
n->vqs[i].ubuf_info = NULL;
n->vqs[i].upend_idx = 0;
n->vqs[i].done_idx = 0;
+ n->vqs[i].batched_xdp = 0;
n->vqs[i].vhost_hlen = 0;
n->vqs[i].sock_hlen = 0;
n->vqs[i].rx_ring = NULL;
@@ -1102,12 +1337,15 @@
}
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
UIO_MAXIOV + VHOST_NET_BATCH,
- VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT);
+ VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
+ NULL);
vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
f->private_data = n;
+ n->page_frag.page = NULL;
+ n->refcnt_bias = 0;
return 0;
}
@@ -1120,9 +1358,9 @@
container_of(vq, struct vhost_net_virtqueue, vq);
mutex_lock(&vq->mutex);
- sock = vq->private_data;
+ sock = vhost_vq_get_backend(vq);
vhost_net_disable_vq(n, vq);
- vq->private_data = NULL;
+ vhost_vq_set_backend(vq, NULL);
vhost_net_buf_unproduce(nvq);
nvq->rx_ring = NULL;
mutex_unlock(&vq->mutex);
@@ -1175,12 +1413,15 @@
if (rx_sock)
sockfd_put(rx_sock);
/* Make sure no callbacks are outstanding */
- synchronize_rcu_bh();
+ synchronize_rcu();
/* We do an extra flush before freeing memory,
* since jobs can re-queue themselves. */
vhost_net_flush(n);
kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
+ kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
kfree(n->dev.vqs);
+ if (n->page_frag.page)
+ __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
kvfree(n);
return 0;
}
@@ -1209,13 +1450,9 @@
return ERR_PTR(r);
}
-static struct ptr_ring *get_tap_ptr_ring(int fd)
+static struct ptr_ring *get_tap_ptr_ring(struct file *file)
{
struct ptr_ring *ring;
- struct file *file = fget(fd);
-
- if (!file)
- return NULL;
ring = tun_get_tx_ring(file);
if (!IS_ERR(ring))
goto out;
@@ -1224,7 +1461,6 @@
goto out;
ring = NULL;
out:
- fput(file);
return ring;
}
@@ -1293,7 +1529,7 @@
}
/* start polling new socket */
- oldsock = vq->private_data;
+ oldsock = vhost_vq_get_backend(vq);
if (sock != oldsock) {
ubufs = vhost_net_ubuf_alloc(vq,
sock && vhost_sock_zcopy(sock));
@@ -1303,7 +1539,7 @@
}
vhost_net_disable_vq(n, vq);
- vq->private_data = sock;
+ vhost_vq_set_backend(vq, sock);
vhost_net_buf_unproduce(nvq);
r = vhost_vq_init_access(vq);
if (r)
@@ -1311,8 +1547,12 @@
r = vhost_net_enable_vq(n, vq);
if (r)
goto err_used;
- if (index == VHOST_NET_VQ_RX)
- nvq->rx_ring = get_tap_ptr_ring(fd);
+ if (index == VHOST_NET_VQ_RX) {
+ if (sock)
+ nvq->rx_ring = get_tap_ptr_ring(sock->file);
+ else
+ nvq->rx_ring = NULL;
+ }
oldubufs = nvq->ubufs;
nvq->ubufs = ubufs;
@@ -1340,7 +1580,7 @@
return 0;
err_used:
- vq->private_data = oldsock;
+ vhost_vq_set_backend(vq, oldsock);
vhost_net_enable_vq(n, vq);
if (ubufs)
vhost_net_ubuf_put_wait_and_free(ubufs);
@@ -1359,7 +1599,7 @@
struct socket *tx_sock = NULL;
struct socket *rx_sock = NULL;
long err;
- struct vhost_umem *umem;
+ struct vhost_iotlb *umem;
mutex_lock(&n->dev.mutex);
err = vhost_dev_check_owner(&n->dev);
@@ -1382,21 +1622,6 @@
if (rx_sock)
sockfd_put(rx_sock);
return err;
-}
-
-static int vhost_net_set_backend_features(struct vhost_net *n, u64 features)
-{
- int i;
-
- mutex_lock(&n->dev.mutex);
- for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
- mutex_lock(&n->vqs[i].vq.mutex);
- n->vqs[i].vq.acked_backend_features = features;
- mutex_unlock(&n->vqs[i].vq.mutex);
- }
- mutex_unlock(&n->dev.mutex);
-
- return 0;
}
static int vhost_net_set_features(struct vhost_net *n, u64 features)
@@ -1422,7 +1647,7 @@
!vhost_log_access_ok(&n->dev))
goto out_unlock;
- if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
+ if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
if (vhost_init_device_iotlb(&n->dev, true))
goto out_unlock;
}
@@ -1499,7 +1724,8 @@
return -EFAULT;
if (features & ~VHOST_NET_BACKEND_FEATURES)
return -EOPNOTSUPP;
- return vhost_net_set_backend_features(n, features);
+ vhost_set_backend_features(&n->dev, features);
+ return 0;
case VHOST_RESET_OWNER:
return vhost_net_reset_owner(n);
case VHOST_SET_OWNER:
@@ -1515,14 +1741,6 @@
return r;
}
}
-
-#ifdef CONFIG_COMPAT
-static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
- unsigned long arg)
-{
- return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
-}
-#endif
static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
@@ -1559,9 +1777,7 @@
.write_iter = vhost_net_chr_write_iter,
.poll = vhost_net_chr_poll,
.unlocked_ioctl = vhost_net_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = vhost_net_compat_ioctl,
-#endif
+ .compat_ioctl = compat_ptr_ioctl,
.open = vhost_net_open,
.llseek = noop_llseek,
};
--
Gitblit v1.6.2