From bedbef8ad3e75a304af6361af235302bcc61d06b Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 14 May 2024 06:39:01 +0000 Subject: [PATCH] 修改内核路径 --- kernel/drivers/vhost/vhost.c | 782 +++++++++++++++++++++++++++++++------------------------ 1 files changed, 439 insertions(+), 343 deletions(-) diff --git a/kernel/drivers/vhost/vhost.c b/kernel/drivers/vhost/vhost.c index ee478b5..de11036 100644 --- a/kernel/drivers/vhost/vhost.c +++ b/kernel/drivers/vhost/vhost.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2009 Red Hat, Inc. * Copyright (C) 2006 Rusty Russell IBM Corporation * @@ -6,8 +7,6 @@ * Inspiration, some code, and most witty comments come from * Documentation/virtual/lguest/lguest.c, by Rusty Russell * - * This work is licensed under the terms of the GNU GPL, version 2. - * * Generic code for virtio server in host kernel. */ @@ -15,7 +14,6 @@ #include <linux/vhost.h> #include <linux/uio.h> #include <linux/mm.h> -#include <linux/mmu_context.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/poll.h> @@ -50,10 +48,6 @@ #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num]) #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num]) - -INTERVAL_TREE_DEFINE(struct vhost_umem_node, - rb, __u64, __subtree_last, - START, LAST, static inline, vhost_umem_interval_tree); #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY static void vhost_disable_cross_endian(struct vhost_virtqueue *vq) @@ -171,11 +165,16 @@ void *key) { struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait); + struct vhost_work *work = &poll->work; if (!(key_to_poll(key) & poll->mask)) return 0; - vhost_poll_queue(poll); + if (!poll->dev->use_worker) + work->fn(work); + else + vhost_poll_queue(poll); + return 0; } @@ -205,7 +204,6 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file) { __poll_t mask; - int ret = 0; if (poll->wqh) return 0; @@ -215,10 +213,10 @@ vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask)); if (mask & EPOLLERR) { vhost_poll_stop(poll); - ret = -EINVAL; + return -EINVAL; } - return ret; + return 0; } EXPORT_SYMBOL_GPL(vhost_poll_start); @@ -300,6 +298,18 @@ __vhost_vq_meta_reset(d->vqs[i]); } +static void vhost_vring_call_reset(struct vhost_vring_call *call_ctx) +{ + call_ctx->ctx = NULL; + memset(&call_ctx->producer, 0x0, sizeof(struct irq_bypass_producer)); +} + +bool vhost_vq_is_setup(struct vhost_virtqueue *vq) +{ + return vq->avail && vq->desc && vq->used && vhost_vq_access_ok(vq); +} +EXPORT_SYMBOL_GPL(vhost_vq_is_setup); + static void vhost_vq_reset(struct vhost_dev *dev, struct vhost_virtqueue *vq) { @@ -321,13 +331,13 @@ vq->log_base = NULL; vq->error_ctx = NULL; vq->kick = NULL; - vq->call_ctx = NULL; vq->log_ctx = NULL; vhost_disable_cross_endian(vq); vhost_reset_is_le(vq); vq->busyloop_timeout = 0; vq->umem = NULL; vq->iotlb = NULL; + vhost_vring_call_reset(&vq->call_ctx); __vhost_vq_meta_reset(vq); } @@ -336,10 +346,8 @@ struct vhost_dev *dev = data; struct vhost_work *work, *work_next; struct llist_node *node; - mm_segment_t oldfs = get_fs(); - set_fs(USER_DS); - use_mm(dev->mm); + kthread_use_mm(dev->mm); for (;;) { /* mb paired w/ kthread_stop */ @@ -367,8 +375,7 @@ schedule(); } } - unuse_mm(dev->mm); - set_fs(oldfs); + kthread_unuse_mm(dev->mm); return 0; } @@ -431,9 +438,38 @@ } EXPORT_SYMBOL_GPL(vhost_exceeds_weight); +static size_t vhost_get_avail_size(struct vhost_virtqueue *vq, + unsigned int num) +{ + size_t event __maybe_unused = + vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; + + return sizeof(*vq->avail) + + sizeof(*vq->avail->ring) * num + event; +} + +static size_t vhost_get_used_size(struct vhost_virtqueue *vq, + unsigned int num) +{ + size_t event __maybe_unused = + vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; + + return sizeof(*vq->used) + + sizeof(*vq->used->ring) * num + event; +} + +static size_t vhost_get_desc_size(struct vhost_virtqueue *vq, + unsigned int num) +{ + return sizeof(*vq->desc) * num; +} + void vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int nvqs, - int iov_limit, int weight, int byte_weight) + int iov_limit, int weight, int byte_weight, + bool use_worker, + int (*msg_handler)(struct vhost_dev *dev, + struct vhost_iotlb_msg *msg)) { struct vhost_virtqueue *vq; int i; @@ -449,6 +485,8 @@ dev->iov_limit = iov_limit; dev->weight = weight; dev->byte_weight = byte_weight; + dev->use_worker = use_worker; + dev->msg_handler = msg_handler; init_llist_head(&dev->work_list); init_waitqueue_head(&dev->wait); INIT_LIST_HEAD(&dev->read_list); @@ -511,6 +549,36 @@ } EXPORT_SYMBOL_GPL(vhost_dev_has_owner); +static void vhost_attach_mm(struct vhost_dev *dev) +{ + /* No owner, become one */ + if (dev->use_worker) { + dev->mm = get_task_mm(current); + } else { + /* vDPA device does not use worker thead, so there's + * no need to hold the address space for mm. This help + * to avoid deadlock in the case of mmap() which may + * held the refcnt of the file and depends on release + * method to remove vma. + */ + dev->mm = current->mm; + mmgrab(dev->mm); + } +} + +static void vhost_detach_mm(struct vhost_dev *dev) +{ + if (!dev->mm) + return; + + if (dev->use_worker) + mmput(dev->mm); + else + mmdrop(dev->mm); + + dev->mm = NULL; +} + /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { @@ -523,21 +591,24 @@ goto err_mm; } - /* No owner, become one */ - dev->mm = get_task_mm(current); + vhost_attach_mm(dev); + dev->kcov_handle = kcov_common_handle(); - worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); - if (IS_ERR(worker)) { - err = PTR_ERR(worker); - goto err_worker; + if (dev->use_worker) { + worker = kthread_create(vhost_worker, dev, + "vhost-%d", current->pid); + if (IS_ERR(worker)) { + err = PTR_ERR(worker); + goto err_worker; + } + + dev->worker = worker; + wake_up_process(worker); /* avoid contributing to loadavg */ + + err = vhost_attach_cgroups(dev); + if (err) + goto err_cgroup; } - - dev->worker = worker; - wake_up_process(worker); /* avoid contributing to loadavg */ - - err = vhost_attach_cgroups(dev); - if (err) - goto err_cgroup; err = vhost_dev_alloc_iovecs(dev); if (err) @@ -545,33 +616,37 @@ return 0; err_cgroup: - kthread_stop(worker); - dev->worker = NULL; + if (dev->worker) { + kthread_stop(dev->worker); + dev->worker = NULL; + } err_worker: - if (dev->mm) - mmput(dev->mm); - dev->mm = NULL; + vhost_detach_mm(dev); dev->kcov_handle = 0; err_mm: return err; } EXPORT_SYMBOL_GPL(vhost_dev_set_owner); -struct vhost_umem *vhost_dev_reset_owner_prepare(void) +static struct vhost_iotlb *iotlb_alloc(void) { - return kvzalloc(sizeof(struct vhost_umem), GFP_KERNEL); + return vhost_iotlb_alloc(max_iotlb_entries, + VHOST_IOTLB_FLAG_RETIRE); +} + +struct vhost_iotlb *vhost_dev_reset_owner_prepare(void) +{ + return iotlb_alloc(); } EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare); /* Caller should have device mutex */ -void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem) +void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem) { int i; vhost_dev_cleanup(dev); - /* Restore memory to default empty mapping. */ - INIT_LIST_HEAD(&umem->umem_list); dev->umem = umem; /* We don't need VQ locks below since vhost_dev_cleanup makes sure * VQs aren't running. @@ -594,29 +669,7 @@ } EXPORT_SYMBOL_GPL(vhost_dev_stop); -static void vhost_umem_free(struct vhost_umem *umem, - struct vhost_umem_node *node) -{ - vhost_umem_interval_tree_remove(node, &umem->umem_tree); - list_del(&node->link); - kfree(node); - umem->numem--; -} - -static void vhost_umem_clean(struct vhost_umem *umem) -{ - struct vhost_umem_node *node, *tmp; - - if (!umem) - return; - - list_for_each_entry_safe(node, tmp, &umem->umem_list, link) - vhost_umem_free(umem, node); - - kvfree(umem); -} - -static void vhost_clear_msg(struct vhost_dev *dev) +void vhost_clear_msg(struct vhost_dev *dev) { struct vhost_msg_node *node, *n; @@ -634,6 +687,7 @@ spin_unlock(&dev->iotlb_lock); } +EXPORT_SYMBOL_GPL(vhost_clear_msg); void vhost_dev_cleanup(struct vhost_dev *dev) { @@ -644,8 +698,8 @@ eventfd_ctx_put(dev->vqs[i]->error_ctx); if (dev->vqs[i]->kick) fput(dev->vqs[i]->kick); - if (dev->vqs[i]->call_ctx) - eventfd_ctx_put(dev->vqs[i]->call_ctx); + if (dev->vqs[i]->call_ctx.ctx) + eventfd_ctx_put(dev->vqs[i]->call_ctx.ctx); vhost_vq_reset(dev, dev->vqs[i]); } vhost_dev_free_iovecs(dev); @@ -653,9 +707,9 @@ eventfd_ctx_put(dev->log_ctx); dev->log_ctx = NULL; /* No one will access memory at this point */ - vhost_umem_clean(dev->umem); + vhost_iotlb_free(dev->umem); dev->umem = NULL; - vhost_umem_clean(dev->iotlb); + vhost_iotlb_free(dev->iotlb); dev->iotlb = NULL; vhost_clear_msg(dev); wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM); @@ -665,9 +719,7 @@ dev->worker = NULL; dev->kcov_handle = 0; } - if (dev->mm) - mmput(dev->mm); - dev->mm = NULL; + vhost_detach_mm(dev); } EXPORT_SYMBOL_GPL(vhost_dev_cleanup); @@ -680,7 +732,7 @@ a + (unsigned long)log_base > ULONG_MAX) return false; - return access_ok(VERIFY_WRITE, log_base + a, + return access_ok(log_base + a, (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); } @@ -697,27 +749,26 @@ } /* Caller should have vq mutex and device mutex. */ -static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem, +static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem, int log_all) { - struct vhost_umem_node *node; + struct vhost_iotlb_map *map; if (!umem) return false; - list_for_each_entry(node, &umem->umem_list, link) { - unsigned long a = node->userspace_addr; + list_for_each_entry(map, &umem->list, link) { + unsigned long a = map->addr; - if (vhost_overflow(node->userspace_addr, node->size)) + if (vhost_overflow(map->addr, map->size)) return false; - if (!access_ok(VERIFY_WRITE, (void __user *)a, - node->size)) + if (!access_ok((void __user *)a, map->size)) return false; else if (log_all && !log_access_ok(log_base, - node->start, - node->size)) + map->start, + map->size)) return false; } return true; @@ -727,17 +778,17 @@ u64 addr, unsigned int size, int type) { - const struct vhost_umem_node *node = vq->meta_iotlb[type]; + const struct vhost_iotlb_map *map = vq->meta_iotlb[type]; - if (!node) + if (!map) return NULL; - return (void *)(uintptr_t)(node->userspace_addr + addr - node->start); + return (void __user *)(uintptr_t)(map->addr + addr - map->start); } /* Can we switch to this memory table? */ /* Caller should have device mutex but not vq mutex */ -static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, +static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem, int log_all) { int i; @@ -871,7 +922,7 @@ * not happen in this case. */ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, - void *addr, unsigned int size, + void __user *addr, unsigned int size, int type) { void __user *uaddr = vhost_vq_meta_fetch(vq, @@ -884,7 +935,7 @@ #define vhost_put_user(vq, x, ptr) \ ({ \ - int ret = -EFAULT; \ + int ret; \ if (!vq->iotlb) { \ ret = __put_user(x, ptr); \ } else { \ @@ -898,6 +949,34 @@ } \ ret; \ }) + +static inline int vhost_put_avail_event(struct vhost_virtqueue *vq) +{ + return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx), + vhost_avail_event(vq)); +} + +static inline int vhost_put_used(struct vhost_virtqueue *vq, + struct vring_used_elem *head, int idx, + int count) +{ + return vhost_copy_to_user(vq, vq->used->ring + idx, head, + count * sizeof(*head)); +} + +static inline int vhost_put_used_flags(struct vhost_virtqueue *vq) + +{ + return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags), + &vq->used->flags); +} + +static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) + +{ + return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx), + &vq->used->idx); +} #define vhost_get_user(vq, x, ptr, type) \ ({ \ @@ -937,45 +1016,41 @@ mutex_unlock(&d->vqs[i]->mutex); } -static int vhost_new_umem_range(struct vhost_umem *umem, - u64 start, u64 size, u64 end, - u64 userspace_addr, int perm) +static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq, + __virtio16 *idx) { - struct vhost_umem_node *tmp, *node; - - if (!size) - return -EFAULT; - - node = kmalloc(sizeof(*node), GFP_ATOMIC); - if (!node) - return -ENOMEM; - - if (umem->numem == max_iotlb_entries) { - tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link); - vhost_umem_free(umem, tmp); - } - - node->start = start; - node->size = size; - node->last = end; - node->userspace_addr = userspace_addr; - node->perm = perm; - INIT_LIST_HEAD(&node->link); - list_add_tail(&node->link, &umem->umem_list); - vhost_umem_interval_tree_insert(node, &umem->umem_tree); - umem->numem++; - - return 0; + return vhost_get_avail(vq, *idx, &vq->avail->idx); } -static void vhost_del_umem_range(struct vhost_umem *umem, - u64 start, u64 end) +static inline int vhost_get_avail_head(struct vhost_virtqueue *vq, + __virtio16 *head, int idx) { - struct vhost_umem_node *node; + return vhost_get_avail(vq, *head, + &vq->avail->ring[idx & (vq->num - 1)]); +} - while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, - start, end))) - vhost_umem_free(umem, node); +static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq, + __virtio16 *flags) +{ + return vhost_get_avail(vq, *flags, &vq->avail->flags); +} + +static inline int vhost_get_used_event(struct vhost_virtqueue *vq, + __virtio16 *event) +{ + return vhost_get_avail(vq, *event, vhost_used_event(vq)); +} + +static inline int vhost_get_used_idx(struct vhost_virtqueue *vq, + __virtio16 *idx) +{ + return vhost_get_used(vq, *idx, &vq->used->idx); +} + +static inline int vhost_get_desc(struct vhost_virtqueue *vq, + struct vring_desc *desc, int idx) +{ + return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc)); } static void vhost_iotlb_notify_vq(struct vhost_dev *d, @@ -1008,10 +1083,10 @@ return false; if ((access & VHOST_ACCESS_RO) && - !access_ok(VERIFY_READ, (void __user *)a, size)) + !access_ok((void __user *)a, size)) return false; if ((access & VHOST_ACCESS_WO) && - !access_ok(VERIFY_WRITE, (void __user *)a, size)) + !access_ok((void __user *)a, size)) return false; return true; } @@ -1034,9 +1109,9 @@ break; } vhost_vq_meta_reset(dev); - if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size, - msg->iova + msg->size - 1, - msg->uaddr, msg->perm)) { + if (vhost_iotlb_add_range(dev->iotlb, msg->iova, + msg->iova + msg->size - 1, + msg->uaddr, msg->perm)) { ret = -ENOMEM; break; } @@ -1048,8 +1123,8 @@ break; } vhost_vq_meta_reset(dev); - vhost_del_umem_range(dev->iotlb, msg->iova, - msg->iova + msg->size - 1); + vhost_iotlb_del_range(dev->iotlb, msg->iova, + msg->iova + msg->size - 1); break; default: ret = -EINVAL; @@ -1095,7 +1170,12 @@ ret = -EINVAL; goto done; } - if (vhost_process_iotlb_msg(dev, &msg)) { + + if (dev->msg_handler) + ret = dev->msg_handler(dev, &msg); + else + ret = vhost_process_iotlb_msg(dev, &msg); + if (ret) { ret = -EFAULT; goto done; } @@ -1217,59 +1297,58 @@ } static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, - struct vring_desc __user *desc, - struct vring_avail __user *avail, - struct vring_used __user *used) + vring_desc_t __user *desc, + vring_avail_t __user *avail, + vring_used_t __user *used) { - size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; + /* If an IOTLB device is present, the vring addresses are + * GIOVAs. Access validation occurs at prefetch time. */ + if (vq->iotlb) + return true; - return access_ok(VERIFY_READ, desc, num * sizeof *desc) && - access_ok(VERIFY_READ, avail, - sizeof *avail + num * sizeof *avail->ring + s) && - access_ok(VERIFY_WRITE, used, - sizeof *used + num * sizeof *used->ring + s); + return access_ok(desc, vhost_get_desc_size(vq, num)) && + access_ok(avail, vhost_get_avail_size(vq, num)) && + access_ok(used, vhost_get_used_size(vq, num)); } static void vhost_vq_meta_update(struct vhost_virtqueue *vq, - const struct vhost_umem_node *node, + const struct vhost_iotlb_map *map, int type) { int access = (type == VHOST_ADDR_USED) ? VHOST_ACCESS_WO : VHOST_ACCESS_RO; - if (likely(node->perm & access)) - vq->meta_iotlb[type] = node; + if (likely(map->perm & access)) + vq->meta_iotlb[type] = map; } static bool iotlb_access_ok(struct vhost_virtqueue *vq, int access, u64 addr, u64 len, int type) { - const struct vhost_umem_node *node; - struct vhost_umem *umem = vq->iotlb; + const struct vhost_iotlb_map *map; + struct vhost_iotlb *umem = vq->iotlb; u64 s = 0, size, orig_addr = addr, last = addr + len - 1; if (vhost_vq_meta_fetch(vq, addr, len, type)) return true; while (len > s) { - node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, - addr, - last); - if (node == NULL || node->start > addr) { + map = vhost_iotlb_itree_first(umem, addr, last); + if (map == NULL || map->start > addr) { vhost_iotlb_miss(vq, addr, access); return false; - } else if (!(node->perm & access)) { + } else if (!(map->perm & access)) { /* Report the possible access violation by * request another translation from userspace. */ return false; } - size = node->size - addr + node->start; + size = map->size - addr + map->start; if (orig_addr == addr && size >= len) - vhost_vq_meta_update(vq, node, type); + vhost_vq_meta_update(vq, map, type); s += size; addr += size; @@ -1278,26 +1357,22 @@ return true; } -int vq_iotlb_prefetch(struct vhost_virtqueue *vq) +int vq_meta_prefetch(struct vhost_virtqueue *vq) { - size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; unsigned int num = vq->num; if (!vq->iotlb) return 1; - return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc, - num * sizeof(*vq->desc), VHOST_ADDR_DESC) && - iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail, - sizeof *vq->avail + - num * sizeof(*vq->avail->ring) + s, + return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc, + vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) && + iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail, + vhost_get_avail_size(vq, num), VHOST_ADDR_AVAIL) && - iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used, - sizeof *vq->used + - num * sizeof(*vq->used->ring) + s, - VHOST_ADDR_USED); + iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used, + vhost_get_used_size(vq, num), VHOST_ADDR_USED); } -EXPORT_SYMBOL_GPL(vq_iotlb_prefetch); +EXPORT_SYMBOL_GPL(vq_meta_prefetch); /* Can we log writes? */ /* Caller should have device mutex but not vq mutex */ @@ -1307,18 +1382,28 @@ } EXPORT_SYMBOL_GPL(vhost_log_access_ok); +static bool vq_log_used_access_ok(struct vhost_virtqueue *vq, + void __user *log_base, + bool log_used, + u64 log_addr) +{ + /* If an IOTLB device is present, log_addr is a GIOVA that + * will never be logged by log_used(). */ + if (vq->iotlb) + return true; + + return !log_used || log_access_ok(log_base, log_addr, + vhost_get_used_size(vq, vq->num)); +} + /* Verify access for write logging. */ /* Caller should have vq mutex and device mutex */ static bool vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base) { - size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; - return vq_memory_access_ok(log_base, vq->umem, vhost_has_feature(vq, VHOST_F_LOG_ALL)) && - (!vq->log_used || log_access_ok(log_base, vq->log_addr, - sizeof *vq->used + - vq->num * sizeof *vq->used->ring + s)); + vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr); } /* Can we start vq? */ @@ -1328,33 +1413,15 @@ if (!vq_log_access_ok(vq, vq->log_base)) return false; - /* Access validation occurs at prefetch time with IOTLB */ - if (vq->iotlb) - return true; - return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); } EXPORT_SYMBOL_GPL(vhost_vq_access_ok); - -static struct vhost_umem *vhost_umem_alloc(void) -{ - struct vhost_umem *umem = kvzalloc(sizeof(*umem), GFP_KERNEL); - - if (!umem) - return NULL; - - umem->umem_tree = RB_ROOT_CACHED; - umem->numem = 0; - INIT_LIST_HEAD(&umem->umem_list); - - return umem; -} static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) { struct vhost_memory mem, *newmem; struct vhost_memory_region *region; - struct vhost_umem *newumem, *oldumem; + struct vhost_iotlb *newumem, *oldumem; unsigned long size = offsetof(struct vhost_memory, regions); int i; @@ -1371,12 +1438,12 @@ memcpy(newmem, &mem, size); if (copy_from_user(newmem->regions, m->regions, - mem.nregions * sizeof *m->regions)) { + flex_array_size(newmem, regions, mem.nregions))) { kvfree(newmem); return -EFAULT; } - newumem = vhost_umem_alloc(); + newumem = iotlb_alloc(); if (!newumem) { kvfree(newmem); return -ENOMEM; @@ -1385,13 +1452,12 @@ for (region = newmem->regions; region < newmem->regions + mem.nregions; region++) { - if (vhost_new_umem_range(newumem, - region->guest_phys_addr, - region->memory_size, - region->guest_phys_addr + - region->memory_size - 1, - region->userspace_addr, - VHOST_ACCESS_RW)) + if (vhost_iotlb_add_range(newumem, + region->guest_phys_addr, + region->guest_phys_addr + + region->memory_size - 1, + region->userspace_addr, + VHOST_MAP_RW)) goto err; } @@ -1409,15 +1475,112 @@ } kvfree(newmem); - vhost_umem_clean(oldumem); + vhost_iotlb_free(oldumem); return 0; err: - vhost_umem_clean(newumem); + vhost_iotlb_free(newumem); kvfree(newmem); return -EFAULT; } +static long vhost_vring_set_num(struct vhost_dev *d, + struct vhost_virtqueue *vq, + void __user *argp) +{ + struct vhost_vring_state s; + + /* Resizing ring with an active backend? + * You don't want to do that. */ + if (vq->private_data) + return -EBUSY; + + if (copy_from_user(&s, argp, sizeof s)) + return -EFAULT; + + if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) + return -EINVAL; + vq->num = s.num; + + return 0; +} + +static long vhost_vring_set_addr(struct vhost_dev *d, + struct vhost_virtqueue *vq, + void __user *argp) +{ + struct vhost_vring_addr a; + + if (copy_from_user(&a, argp, sizeof a)) + return -EFAULT; + if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) + return -EOPNOTSUPP; + + /* For 32bit, verify that the top 32bits of the user + data are set to zero. */ + if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr || + (u64)(unsigned long)a.used_user_addr != a.used_user_addr || + (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) + return -EFAULT; + + /* Make sure it's safe to cast pointers to vring types. */ + BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE); + BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE); + if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) || + (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) || + (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) + return -EINVAL; + + /* We only verify access here if backend is configured. + * If it is not, we don't as size might not have been setup. + * We will verify when backend is configured. */ + if (vq->private_data) { + if (!vq_access_ok(vq, vq->num, + (void __user *)(unsigned long)a.desc_user_addr, + (void __user *)(unsigned long)a.avail_user_addr, + (void __user *)(unsigned long)a.used_user_addr)) + return -EINVAL; + + /* Also validate log access for used ring if enabled. */ + if (!vq_log_used_access_ok(vq, vq->log_base, + a.flags & (0x1 << VHOST_VRING_F_LOG), + a.log_guest_addr)) + return -EINVAL; + } + + vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); + vq->desc = (void __user *)(unsigned long)a.desc_user_addr; + vq->avail = (void __user *)(unsigned long)a.avail_user_addr; + vq->log_addr = a.log_guest_addr; + vq->used = (void __user *)(unsigned long)a.used_user_addr; + + return 0; +} + +static long vhost_vring_set_num_addr(struct vhost_dev *d, + struct vhost_virtqueue *vq, + unsigned int ioctl, + void __user *argp) +{ + long r; + + mutex_lock(&vq->mutex); + + switch (ioctl) { + case VHOST_SET_VRING_NUM: + r = vhost_vring_set_num(d, vq, argp); + break; + case VHOST_SET_VRING_ADDR: + r = vhost_vring_set_addr(d, vq, argp); + break; + default: + BUG(); + } + + mutex_unlock(&vq->mutex); + + return r; +} long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) { struct file *eventfp, *filep = NULL; @@ -1427,7 +1590,6 @@ struct vhost_virtqueue *vq; struct vhost_vring_state s; struct vhost_vring_file f; - struct vhost_vring_addr a; u32 idx; long r; @@ -1440,26 +1602,14 @@ idx = array_index_nospec(idx, d->nvqs); vq = d->vqs[idx]; + if (ioctl == VHOST_SET_VRING_NUM || + ioctl == VHOST_SET_VRING_ADDR) { + return vhost_vring_set_num_addr(d, vq, ioctl, argp); + } + mutex_lock(&vq->mutex); switch (ioctl) { - case VHOST_SET_VRING_NUM: - /* Resizing ring with an active backend? - * You don't want to do that. */ - if (vq->private_data) { - r = -EBUSY; - break; - } - if (copy_from_user(&s, argp, sizeof s)) { - r = -EFAULT; - break; - } - if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) { - r = -EINVAL; - break; - } - vq->num = s.num; - break; case VHOST_SET_VRING_BASE: /* Moving base with an active backend? * You don't want to do that. */ @@ -1471,82 +1621,34 @@ r = -EFAULT; break; } - if (s.num > 0xffff) { - r = -EINVAL; - break; + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) { + vq->last_avail_idx = s.num & 0xffff; + vq->last_used_idx = (s.num >> 16) & 0xffff; + } else { + if (s.num > 0xffff) { + r = -EINVAL; + break; + } + vq->last_avail_idx = s.num; } - vq->last_avail_idx = s.num; /* Forget the cached index value. */ vq->avail_idx = vq->last_avail_idx; break; case VHOST_GET_VRING_BASE: s.index = idx; - s.num = vq->last_avail_idx; + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) + s.num = (u32)vq->last_avail_idx | ((u32)vq->last_used_idx << 16); + else + s.num = vq->last_avail_idx; if (copy_to_user(argp, &s, sizeof s)) r = -EFAULT; - break; - case VHOST_SET_VRING_ADDR: - if (copy_from_user(&a, argp, sizeof a)) { - r = -EFAULT; - break; - } - if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) { - r = -EOPNOTSUPP; - break; - } - /* For 32bit, verify that the top 32bits of the user - data are set to zero. */ - if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr || - (u64)(unsigned long)a.used_user_addr != a.used_user_addr || - (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) { - r = -EFAULT; - break; - } - - /* Make sure it's safe to cast pointers to vring types. */ - BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE); - BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE); - if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) || - (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) || - (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))) { - r = -EINVAL; - break; - } - - /* We only verify access here if backend is configured. - * If it is not, we don't as size might not have been setup. - * We will verify when backend is configured. */ - if (vq->private_data) { - if (!vq_access_ok(vq, vq->num, - (void __user *)(unsigned long)a.desc_user_addr, - (void __user *)(unsigned long)a.avail_user_addr, - (void __user *)(unsigned long)a.used_user_addr)) { - r = -EINVAL; - break; - } - - /* Also validate log access for used ring if enabled. */ - if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) && - !log_access_ok(vq->log_base, a.log_guest_addr, - sizeof *vq->used + - vq->num * sizeof *vq->used->ring)) { - r = -EINVAL; - break; - } - } - - vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG)); - vq->desc = (void __user *)(unsigned long)a.desc_user_addr; - vq->avail = (void __user *)(unsigned long)a.avail_user_addr; - vq->log_addr = a.log_guest_addr; - vq->used = (void __user *)(unsigned long)a.used_user_addr; break; case VHOST_SET_VRING_KICK: if (copy_from_user(&f, argp, sizeof f)) { r = -EFAULT; break; } - eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd); if (IS_ERR(eventfp)) { r = PTR_ERR(eventfp); break; @@ -1562,19 +1664,20 @@ r = -EFAULT; break; } - ctx = f.fd == -1 ? NULL : eventfd_ctx_fdget(f.fd); + ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd); if (IS_ERR(ctx)) { r = PTR_ERR(ctx); break; } - swap(ctx, vq->call_ctx); + + swap(ctx, vq->call_ctx.ctx); break; case VHOST_SET_VRING_ERR: if (copy_from_user(&f, argp, sizeof f)) { r = -EFAULT; break; } - ctx = f.fd == -1 ? NULL : eventfd_ctx_fdget(f.fd); + ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd); if (IS_ERR(ctx)) { r = PTR_ERR(ctx); break; @@ -1625,10 +1728,10 @@ int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled) { - struct vhost_umem *niotlb, *oiotlb; + struct vhost_iotlb *niotlb, *oiotlb; int i; - niotlb = vhost_umem_alloc(); + niotlb = iotlb_alloc(); if (!niotlb) return -ENOMEM; @@ -1644,7 +1747,7 @@ mutex_unlock(&vq->mutex); } - vhost_umem_clean(oiotlb); + vhost_iotlb_free(oiotlb); return 0; } @@ -1699,7 +1802,7 @@ r = get_user(fd, (int __user *)argp); if (r < 0) break; - ctx = fd == -1 ? NULL : eventfd_ctx_fdget(fd); + ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd); if (IS_ERR(ctx)) { r = PTR_ERR(ctx); break; @@ -1724,7 +1827,7 @@ /* TODO: This is really inefficient. We need something like get_user() * (instruction directly accesses the data, with an exception table entry - * returning -EFAULT). See Documentation/x86/exception-tables.txt. + * returning -EFAULT). See Documentation/x86/exception-tables.rst. */ static int set_bit_to_user(int nr, void __user *addr) { @@ -1734,15 +1837,14 @@ int bit = nr + (log % PAGE_SIZE) * 8; int r; - r = get_user_pages_fast(log, 1, 1, &page); + r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page); if (r < 0) return r; BUG_ON(r != 1); base = kmap_atomic(page); set_bit(bit, base); kunmap_atomic(base); - set_page_dirty_lock(page); - put_page(page); + unpin_user_pages_dirty_lock(&page, 1, true); return 0; } @@ -1774,8 +1876,8 @@ static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len) { - struct vhost_umem *umem = vq->umem; - struct vhost_umem_node *u; + struct vhost_iotlb *umem = vq->umem; + struct vhost_iotlb_map *u; u64 start, end, l, min; int r; bool hit = false; @@ -1785,16 +1887,15 @@ /* More than one GPAs can be mapped into a single HVA. So * iterate all possible umems here to be safe. */ - list_for_each_entry(u, &umem->umem_list, link) { - if (u->userspace_addr > hva - 1 + len || - u->userspace_addr - 1 + u->size < hva) + list_for_each_entry(u, &umem->list, link) { + if (u->addr > hva - 1 + len || + u->addr - 1 + u->size < hva) continue; - start = max(u->userspace_addr, hva); - end = min(u->userspace_addr - 1 + u->size, - hva - 1 + len); + start = max(u->addr, hva); + end = min(u->addr - 1 + u->size, hva - 1 + len); l = end - start + 1; r = log_write(vq->log_base, - u->start + start - u->userspace_addr, + u->start + start - u->addr, l); if (r < 0) return r; @@ -1814,7 +1915,7 @@ static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len) { - struct iovec iov[64]; + struct iovec *iov = vq->log_iov; int i, ret; if (!vq->iotlb) @@ -1874,8 +1975,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq) { void __user *used; - if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags), - &vq->used->flags) < 0) + if (vhost_put_used_flags(vq)) return -EFAULT; if (unlikely(vq->log_used)) { /* Make sure the flag is seen before log. */ @@ -1892,8 +1992,7 @@ static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event) { - if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx), - vhost_avail_event(vq))) + if (vhost_put_avail_event(vq)) return -EFAULT; if (unlikely(vq->log_used)) { void __user *used; @@ -1925,11 +2024,11 @@ goto err; vq->signalled_used_valid = false; if (!vq->iotlb && - !access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) { + !access_ok(&vq->used->idx, sizeof vq->used->idx)) { r = -EFAULT; goto err; } - r = vhost_get_used(vq, last_used_idx, &vq->used->idx); + r = vhost_get_used_idx(vq, &last_used_idx); if (r) { vq_err(vq, "Can't access used idx at %p\n", &vq->used->idx); @@ -1947,11 +2046,11 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, struct iovec iov[], int iov_size, int access) { - const struct vhost_umem_node *node; + const struct vhost_iotlb_map *map; struct vhost_dev *dev = vq->dev; - struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem; + struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem; struct iovec *_iov; - u64 s = 0; + u64 s = 0, last = addr + len - 1; int ret = 0; while ((u64)len > s) { @@ -1961,25 +2060,24 @@ break; } - node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, - addr, addr + len - 1); - if (node == NULL || node->start > addr) { + map = vhost_iotlb_itree_first(umem, addr, last); + if (map == NULL || map->start > addr) { if (umem != dev->iotlb) { ret = -EFAULT; break; } ret = -EAGAIN; break; - } else if (!(node->perm & access)) { + } else if (!(map->perm & access)) { ret = -EPERM; break; } _iov = iov + ret; - size = node->size - addr + node->start; + size = map->size - addr + map->start; _iov->iov_len = min((u64)len - s, size); _iov->iov_base = (void __user *)(unsigned long) - (node->userspace_addr + addr - node->start); + (map->addr + addr - map->start); s += size; addr += size; ++ret; @@ -2035,11 +2133,6 @@ return ret; } iov_iter_init(&from, READ, vq->indirect, ret, len); - - /* We will use the result as an address to read from, so most - * architectures only need a compiler barrier here. */ - read_barrier_depends(); - count = len / sizeof desc; /* Buffers are chained via a 16 bit next field, so * we can have at most 2^16 of these. */ @@ -2128,7 +2221,7 @@ last_avail_idx = vq->last_avail_idx; if (vq->avail_idx == vq->last_avail_idx) { - if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) { + if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) { vq_err(vq, "Failed to access avail idx at %p\n", &vq->avail->idx); return -EFAULT; @@ -2155,8 +2248,7 @@ /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ - if (unlikely(vhost_get_avail(vq, ring_head, - &vq->avail->ring[last_avail_idx & (vq->num - 1)]))) { + if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) { vq_err(vq, "Failed to read head: idx %d address %p\n", last_avail_idx, &vq->avail->ring[last_avail_idx % vq->num]); @@ -2191,8 +2283,7 @@ i, vq->num, head); return -EINVAL; } - ret = vhost_copy_from_user(vq, &desc, vq->desc + i, - sizeof desc); + ret = vhost_get_desc(vq, &desc, i); if (unlikely(ret)) { vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", i, vq->desc + i); @@ -2279,22 +2370,13 @@ struct vring_used_elem *heads, unsigned count) { - struct vring_used_elem __user *used; + vring_used_elem_t __user *used; u16 old, new; int start; start = vq->last_used_idx & (vq->num - 1); used = vq->used->ring + start; - if (count == 1) { - if (vhost_put_user(vq, heads[0].id, &used->id)) { - vq_err(vq, "Failed to write used id"); - return -EFAULT; - } - if (vhost_put_user(vq, heads[0].len, &used->len)) { - vq_err(vq, "Failed to write used len"); - return -EFAULT; - } - } else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) { + if (vhost_put_used(vq, heads, start, count)) { vq_err(vq, "Failed to write used"); return -EFAULT; } @@ -2336,8 +2418,7 @@ /* Make sure buffer is written before we update index. */ smp_wmb(); - if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx), - &vq->used->idx)) { + if (vhost_put_used_idx(vq)) { vq_err(vq, "Failed to increment used idx"); return -EFAULT; } @@ -2370,7 +2451,7 @@ if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { __virtio16 flags; - if (vhost_get_avail(vq, flags, &vq->avail->flags)) { + if (vhost_get_avail_flags(vq, &flags)) { vq_err(vq, "Failed to get flags"); return true; } @@ -2384,7 +2465,7 @@ if (unlikely(!v)) return true; - if (vhost_get_avail(vq, event, vhost_used_event(vq))) { + if (vhost_get_used_event(vq, &event)) { vq_err(vq, "Failed to get used event idx"); return true; } @@ -2395,8 +2476,8 @@ void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) { /* Signal the Guest tell them we used something up. */ - if (vq->call_ctx && vhost_notify(dev, vq)) - eventfd_signal(vq->call_ctx, 1); + if (vq->call_ctx.ctx && vhost_notify(dev, vq)) + eventfd_signal(vq->call_ctx.ctx, 1); } EXPORT_SYMBOL_GPL(vhost_signal); @@ -2429,7 +2510,7 @@ if (vq->avail_idx != vq->last_avail_idx) return false; - r = vhost_get_avail(vq, avail_idx, &vq->avail->idx); + r = vhost_get_avail_idx(vq, &avail_idx); if (unlikely(r)) return false; vq->avail_idx = vhost16_to_cpu(vq, avail_idx); @@ -2465,7 +2546,7 @@ /* They could have slipped one in as we were doing that: make * sure it's written, then check again. */ smp_mb(); - r = vhost_get_avail(vq, avail_idx, &vq->avail->idx); + r = vhost_get_avail_idx(vq, &avail_idx); if (r) { vq_err(vq, "Failed to check avail idx at %p: %d\n", &vq->avail->idx, r); @@ -2487,7 +2568,7 @@ if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) { r = vhost_update_used_flags(vq); if (r) - vq_err(vq, "Failed to enable notification at %p: %d\n", + vq_err(vq, "Failed to disable notification at %p: %d\n", &vq->used->flags, r); } } @@ -2536,6 +2617,21 @@ } EXPORT_SYMBOL_GPL(vhost_dequeue_msg); +void vhost_set_backend_features(struct vhost_dev *dev, u64 features) +{ + struct vhost_virtqueue *vq; + int i; + + mutex_lock(&dev->mutex); + for (i = 0; i < dev->nvqs; ++i) { + vq = dev->vqs[i]; + mutex_lock(&vq->mutex); + vq->acked_backend_features = features; + mutex_unlock(&vq->mutex); + } + mutex_unlock(&dev->mutex); +} +EXPORT_SYMBOL_GPL(vhost_set_backend_features); static int __init vhost_init(void) { -- Gitblit v1.6.2