From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] io_uring: add partial send/recv retry, fixed-file table locking, poll rearm cap, and exit-path hardening
---
kernel/io_uring/io_uring.c | 439 ++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 343 insertions(+), 96 deletions(-)
diff --git a/kernel/io_uring/io_uring.c b/kernel/io_uring/io_uring.c
index 35a9bcd..a966de4 100644
--- a/kernel/io_uring/io_uring.c
+++ b/kernel/io_uring/io_uring.c
@@ -496,6 +496,7 @@
struct file *file;
struct wait_queue_head *head;
__poll_t events;
+ int retries;
struct wait_queue_entry wait;
};
@@ -588,7 +589,9 @@
int msg_flags;
int bgid;
size_t len;
+ size_t done_io;
struct io_buffer *kbuf;
+ void __user *msg_control;
};
struct io_open {
@@ -749,6 +752,7 @@
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
+ REQ_F_PARTIAL_IO_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_NOWAIT_READ_BIT,
REQ_F_NOWAIT_WRITE_BIT,
@@ -804,6 +808,8 @@
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+ /* request has already done partial IO */
+ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
};
struct async_poll {
@@ -1098,7 +1104,8 @@
unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd, bool fixed);
+ struct io_kiocb *req, int fd, bool fixed,
+ unsigned int issue_flags);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
@@ -1524,6 +1531,8 @@
static void io_queue_deferred(struct io_ring_ctx *ctx)
{
+ lockdep_assert_held(&ctx->completion_lock);
+
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
@@ -1575,12 +1584,22 @@
io_queue_deferred(ctx);
}
-static inline void io_commit_cqring(struct io_ring_ctx *ctx)
+static inline bool io_commit_needs_flush(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active))
- __io_commit_cqring_flush(ctx);
+ return ctx->off_timeout_used || ctx->drain_active;
+}
+
+static inline void __io_commit_cqring(struct io_ring_ctx *ctx)
+{
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
+}
+
+static inline void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+ if (unlikely(io_commit_needs_flush(ctx)))
+ __io_commit_cqring_flush(ctx);
+ __io_commit_cqring(ctx);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
@@ -2205,9 +2224,12 @@
}
req->io_task_work.func(req, &locked);
node = next;
+ if (unlikely(need_resched())) {
+ ctx_flush_and_put(ctx, &locked);
+ ctx = NULL;
+ cond_resched();
+ }
} while (node);
-
- cond_resched();
}
ctx_flush_and_put(ctx, &locked);
@@ -2465,6 +2487,15 @@
static inline bool io_run_task_work(void)
{
+ /*
+ * PF_IO_WORKER never returns to userspace, so check here if we have
+ * notify work that needs processing.
+ */
+ if (current->flags & PF_IO_WORKER &&
+ test_thread_flag(TIF_NOTIFY_RESUME)) {
+ __set_current_state(TASK_RUNNING);
+ tracehook_notify_resume(NULL);
+ }
if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
__set_current_state(TASK_RUNNING);
tracehook_notify_signal();
@@ -2488,17 +2519,36 @@
io_init_req_batch(&rb);
while (!list_empty(done)) {
+ struct io_uring_cqe *cqe;
+ unsigned cflags;
+
req = list_first_entry(done, struct io_kiocb, inflight_entry);
list_del(&req->inflight_entry);
-
- io_fill_cqe_req(req, req->result, io_put_rw_kbuf(req));
+ cflags = io_put_rw_kbuf(req);
(*nr_events)++;
+
+ cqe = io_get_cqe(ctx);
+ if (cqe) {
+ WRITE_ONCE(cqe->user_data, req->user_data);
+ WRITE_ONCE(cqe->res, req->result);
+ WRITE_ONCE(cqe->flags, cflags);
+ } else {
+ spin_lock(&ctx->completion_lock);
+ io_cqring_event_overflow(ctx, req->user_data,
+ req->result, cflags);
+ spin_unlock(&ctx->completion_lock);
+ }
if (req_ref_put_and_test(req))
io_req_free_batch(&rb, req, &ctx->submit_state);
}
- io_commit_cqring(ctx);
+ if (io_commit_needs_flush(ctx)) {
+ spin_lock(&ctx->completion_lock);
+ __io_commit_cqring_flush(ctx);
+ spin_unlock(&ctx->completion_lock);
+ }
+ __io_commit_cqring(ctx);
io_cqring_ev_posted_iopoll(ctx);
io_req_free_batch_finish(ctx, &rb);
}
@@ -2625,6 +2675,11 @@
break;
}
ret = io_do_iopoll(ctx, &nr_events, min);
+
+ if (task_sigpending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
} while (!ret && nr_events < min && !need_resched());
out:
mutex_unlock(&ctx->uring_lock);
@@ -2692,17 +2747,32 @@
}
#endif
-static bool __io_complete_rw_common(struct io_kiocb *req, long res)
+/*
+ * Trigger the notifications after having done some IO, and finish the write
+ * accounting, if any.
+ */
+static void io_req_io_end(struct io_kiocb *req)
{
- if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
+ struct io_rw *rw = &req->rw;
+
+ if (rw->kiocb.ki_flags & IOCB_WRITE) {
kiocb_end_write(req);
fsnotify_modify(req->file);
} else {
fsnotify_access(req->file);
}
+}
+
+static bool __io_complete_rw_common(struct io_kiocb *req, long res)
+{
if (res != req->result) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
+ /*
+ * Reissue will start accounting again, finish the
+ * current cycle.
+ */
+ io_req_io_end(req);
req->flags |= REQ_F_REISSUE;
return true;
}
@@ -2712,7 +2782,7 @@
return false;
}
-static inline int io_fixup_rw_res(struct io_kiocb *req, unsigned res)
+static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
{
struct io_async_rw *io = req->async_data;
@@ -2744,12 +2814,10 @@
}
}
-static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
- unsigned int issue_flags)
+static void io_req_rw_complete(struct io_kiocb *req, bool *locked)
{
- if (__io_complete_rw_common(req, res))
- return;
- __io_req_complete(req, issue_flags, io_fixup_rw_res(req, res), io_put_rw_kbuf(req));
+ io_req_io_end(req);
+ io_req_task_complete(req, locked);
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@ -2759,7 +2827,7 @@
if (__io_complete_rw_common(req, res))
return;
req->result = io_fixup_rw_res(req, res);
- req->io_task_work.func = io_req_task_complete;
+ req->io_task_work.func = io_req_rw_complete;
io_req_task_work_add(req);
}
@@ -2911,14 +2979,6 @@
req->flags |= REQ_F_ISREG;
kiocb->ki_pos = READ_ONCE(sqe->off);
- if (kiocb->ki_pos == -1) {
- if (!(file->f_mode & FMODE_STREAM)) {
- req->flags |= REQ_F_CUR_POS;
- kiocb->ki_pos = file->f_pos;
- } else {
- kiocb->ki_pos = 0;
- }
- }
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
@@ -3000,6 +3060,23 @@
}
}
+static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
+{
+ struct kiocb *kiocb = &req->rw.kiocb;
+
+ if (kiocb->ki_pos != -1)
+ return &kiocb->ki_pos;
+
+ if (!(req->file->f_mode & FMODE_STREAM)) {
+ req->flags |= REQ_F_CUR_POS;
+ kiocb->ki_pos = req->file->f_pos;
+ return &kiocb->ki_pos;
+ }
+
+ kiocb->ki_pos = 0;
+ return NULL;
+}
+
static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
unsigned int issue_flags)
{
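
Note on the hunk above: the "offset == -1 means use and advance the file position" handling moves out of prep and into io_kiocb_update_pos(), which runs at issue time, so a stream file gets a NULL ppos and a regular file samples f_pos when the request actually executes. A minimal userspace sketch of that semantic, assuming liburing is installed and that a readable regular file exists at the hard-coded path (both are assumptions, not part of the patch):

/*
 * Sketch only: two reads queued at offset -1 consume and advance the
 * shared file position, mirroring what io_kiocb_update_pos() implements.
 */
#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char buf[8];
	int fd, i;

	if (io_uring_queue_init(4, &ring, 0))
		return 1;
	fd = open("/etc/os-release", O_RDONLY);	/* any readable regular file */
	if (fd < 0)
		return 1;

	for (i = 0; i < 2; i++) {
		sqe = io_uring_get_sqe(&ring);
		/* offset -1: use (and advance) the file position */
		io_uring_prep_read(sqe, fd, buf, sizeof(buf), (__u64)-1);
		io_uring_submit(&ring);
		if (io_uring_wait_cqe(&ring, &cqe))
			break;
		printf("read %d: res=%d, f_pos is now %lld\n", i, cqe->res,
		       (long long)lseek(fd, 0, SEEK_CUR));
		io_uring_cqe_seen(&ring, cqe);
	}

	close(fd);
	io_uring_queue_exit(&ring);
	return 0;
}
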
@@ -3007,10 +3084,20 @@
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
- if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
- __io_complete_rw(req, ret, 0, issue_flags);
- else
+ if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) {
+ if (!__io_complete_rw_common(req, ret)) {
+ /*
+ * Safe to call io_end from here as we're inline
+ * from the submission path.
+ */
+ io_req_io_end(req);
+ __io_req_complete(req, issue_flags,
+ io_fixup_rw_res(req, ret),
+ io_put_rw_kbuf(req));
+ }
+ } else {
io_rw_done(kiocb, ret);
+ }
if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
@@ -3292,6 +3379,7 @@
struct kiocb *kiocb = &req->rw.kiocb;
struct file *file = req->file;
ssize_t ret = 0;
+ loff_t *ppos;
/*
* Don't support polled IO through this interface, and we can't
@@ -3302,6 +3390,8 @@
return -EOPNOTSUPP;
if (kiocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
+
+ ppos = io_kiocb_ppos(kiocb);
while (iov_iter_count(iter)) {
struct iovec iovec;
@@ -3316,10 +3406,10 @@
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
} else {
nr = file->f_op->write(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
}
if (nr < 0) {
@@ -3520,6 +3610,7 @@
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct iov_iter_state __state, *state;
ssize_t ret, ret2;
+ loff_t *ppos;
if (rw) {
iter = &rw->iter;
@@ -3552,7 +3643,9 @@
return ret ?: -EAGAIN;
}
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
+ ppos = io_kiocb_update_pos(req);
+
+ ret = rw_verify_area(READ, req->file, ppos, req->result);
if (unlikely(ret)) {
kfree(iovec);
return ret;
@@ -3656,6 +3749,7 @@
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct iov_iter_state __state, *state;
ssize_t ret, ret2;
+ loff_t *ppos;
if (rw) {
iter = &rw->iter;
@@ -3686,7 +3780,9 @@
(req->flags & REQ_F_ISREG))
goto copy_iov;
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
+ ppos = io_kiocb_update_pos(req);
+
+ ret = rw_verify_area(WRITE, req->file, ppos, req->result);
if (unlikely(ret))
goto out_free;
@@ -3926,7 +4022,7 @@
return -EAGAIN;
in = io_file_get(req->ctx, req, sp->splice_fd_in,
- (sp->flags & SPLICE_F_FD_IN_FIXED));
+ (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
if (!in) {
ret = -EBADF;
goto done;
@@ -3966,7 +4062,7 @@
return -EAGAIN;
in = io_file_get(req->ctx, req, sp->splice_fd_in,
- (sp->flags & SPLICE_F_FD_IN_FIXED));
+ (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
if (!in) {
ret = -EBADF;
goto done;
@@ -4148,9 +4244,11 @@
if (issue_flags & IO_URING_F_NONBLOCK) {
/*
* Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
- * it'll always -EAGAIN
+ * it'll always -EAGAIN. Note that we test for __O_TMPFILE
+ * because O_TMPFILE includes O_DIRECTORY, which isn't a flag
+ * we need to force async for.
*/
- if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
+ if (req->open.how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
return -EAGAIN;
op.lookup_flags |= LOOKUP_CACHED;
op.open_flag |= O_NONBLOCK;
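
Why __O_TMPFILE and not O_TMPFILE: in the generic UAPI headers O_TMPFILE is defined as (__O_TMPFILE | O_DIRECTORY), so masking request flags with O_TMPFILE also matches a plain O_DIRECTORY open and would force it async for no reason. A standalone illustration of just that flag arithmetic; the octal values are the asm-generic ones (arch-dependent) and the K_-prefixed names are made up for this sketch:

#include <stdio.h>

/* asm-generic/fcntl.h values, copied here for illustration only */
#define K_O_DIRECTORY	00200000
#define K___O_TMPFILE	020000000
#define K_O_TMPFILE	(K___O_TMPFILE | K_O_DIRECTORY)

int main(void)
{
	unsigned int dir_open = K_O_DIRECTORY;	/* opening a plain directory */

	/* Old check: the composite O_TMPFILE mask also hits directory opens. */
	printf("O_TMPFILE mask matches: %d\n", !!(dir_open & K_O_TMPFILE));
	/* New check: only the dedicated tmpfile bit forces async. */
	printf("__O_TMPFILE mask matches: %d\n", !!(dir_open & K___O_TMPFILE));
	return 0;
}
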
@@ -4623,6 +4721,13 @@
}
#if defined(CONFIG_NET)
+static bool io_net_retry(struct socket *sock, int flags)
+{
+ if (!(flags & MSG_WAITALL))
+ return false;
+ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
+}
+
static int io_setup_async_msg(struct io_kiocb *req,
struct io_async_msghdr *kmsg)
{
@@ -4640,8 +4745,10 @@
if (async_msg->msg.msg_name)
async_msg->msg.msg_name = &async_msg->addr;
/* if we're using fast_iov, set it to the new one */
- if (!async_msg->free_iov)
- async_msg->msg.msg_iter.iov = async_msg->fast_iov;
+ if (!kmsg->free_iov) {
+ size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
+ async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
+ }
return -EAGAIN;
}
@@ -4649,10 +4756,16 @@
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
struct io_async_msghdr *iomsg)
{
+ struct io_sr_msg *sr = &req->sr_msg;
+ int ret;
+
iomsg->msg.msg_name = &iomsg->addr;
iomsg->free_iov = iomsg->fast_iov;
- return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
+ ret = sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
req->sr_msg.msg_flags, &iomsg->free_iov);
+ /* save msg_control as sys_sendmsg() overwrites it */
+ sr->msg_control = iomsg->msg.msg_control;
+ return ret;
}
static int io_sendmsg_prep_async(struct io_kiocb *req)
@@ -4686,12 +4799,14 @@
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
+ sr->done_io = 0;
return 0;
}
static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
+ struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
unsigned flags;
int min_ret = 0;
@@ -4707,6 +4822,8 @@
if (ret)
return ret;
kmsg = &iomsg;
+ } else {
+ kmsg->msg.msg_control = sr->msg_control;
}
flags = req->sr_msg.msg_flags;
@@ -4716,17 +4833,27 @@
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
- if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+ return io_setup_async_msg(req, kmsg);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return io_setup_async_msg(req, kmsg);
+ }
+ req_set_fail(req);
+ }
/* fast path, check for non-NULL to avoid function call */
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < min_ret)
- req_set_fail(req);
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
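
The sendmsg hunk above, together with io_net_retry() and sr->done_io, retries MSG_WAITALL sends on stream and seqpacket sockets after partial progress instead of completing short, and the final cqe->res is the accumulated byte count. A minimal liburing sketch of the affected submission pattern (assumption: liburing is available; the small payload below normally completes in a single pass, the retry path only matters when the socket buffer fills mid-transfer):

/* Sketch only: MSG_WAITALL send over a socketpair, result read from the CQE. */
#include <liburing.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	size_t len = 64 * 1024;
	int sv[2];
	char *buf;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv))
		return 1;
	buf = malloc(len);
	if (!buf || io_uring_queue_init(4, &ring, 0))
		return 1;
	memset(buf, 'x', len);

	sqe = io_uring_get_sqe(&ring);
	/* MSG_WAITALL asks for the full length; done_io absorbs kernel retries. */
	io_uring_prep_send(sqe, sv[0], buf, len, MSG_WAITALL);
	io_uring_submit(&ring);

	if (!io_uring_wait_cqe(&ring, &cqe)) {
		printf("send res=%d (requested %zu)\n", cqe->res, len);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	free(buf);
	return 0;
}
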
@@ -4762,13 +4889,24 @@
msg.msg_flags = flags;
ret = sock_sendmsg(sock, &msg);
- if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
-
- if (ret < min_ret)
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return -EAGAIN;
+ }
req_set_fail(req);
+ }
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4904,7 +5042,7 @@
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->bgid = READ_ONCE(sqe->buf_group);
- sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+ sr->msg_flags = READ_ONCE(sqe->msg_flags);
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -4912,12 +5050,14 @@
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
+ sr->done_io = 0;
return 0;
}
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
+ struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
struct io_buffer *kbuf;
unsigned flags;
@@ -4950,15 +5090,27 @@
flags = req->sr_msg.msg_flags;
if (force_nonblock)
flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
+ if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
kmsg->uaddr, flags);
- if (force_nonblock && ret == -EAGAIN)
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && force_nonblock)
+ return io_setup_async_msg(req, kmsg);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ kmsg->msg.msg_controllen = 0;
+ kmsg->msg.msg_control = NULL;
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return io_setup_async_msg(req, kmsg);
+ }
+ req_set_fail(req);
+ } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+ req_set_fail(req);
+ }
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_recv_kbuf(req);
@@ -4966,8 +5118,10 @@
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
- req_set_fail(req);
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
@@ -5014,15 +5168,29 @@
min_ret = iov_iter_count(&msg.msg_iter);
ret = sock_recvmsg(sock, &msg, flags);
- if (force_nonblock && ret == -EAGAIN)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && force_nonblock)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return -EAGAIN;
+ }
+ req_set_fail(req);
+ } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
+ req_set_fail(req);
+ }
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_recv_kbuf(req);
- if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
- req_set_fail(req);
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
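
The recv side gets the same done_io accounting: a MSG_WAITALL receive that only picks up part of the requested length is re-queued instead of completing short, and the eventual CQE reports the combined count. A hedged liburing sketch that feeds the payload in two halves around the submission (assumption: the retried request waits in io-wq until the second half arrives):

/* Sketch only: 256-byte MSG_WAITALL recv satisfied by two 128-byte writes. */
#include <liburing.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char out[256], in[256];
	int sv[2];

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv))
		return 1;
	if (io_uring_queue_init(4, &ring, 0))
		return 1;
	memset(out, 'a', sizeof(out));

	/* First half is already queued when the recv is issued... */
	if (write(sv[1], out, 128) != 128)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	/* ...but MSG_WAITALL wants all 256 bytes. */
	io_uring_prep_recv(sqe, sv[0], in, sizeof(in), MSG_WAITALL);
	io_uring_submit(&ring);

	/* Second half lands after the request has gone async. */
	if (write(sv[1], out + 128, 128) != 128)
		return 1;

	if (!io_uring_wait_cqe(&ring, &cqe)) {
		/* With the retry logic, res is 256 rather than a short 128. */
		printf("recv res=%d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}
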
@@ -5060,9 +5228,6 @@
struct file *file;
int ret, fd;
- if (req->file->f_flags & O_NONBLOCK)
- req->flags |= REQ_F_NOWAIT;
-
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
if (unlikely(fd < 0))
@@ -5075,6 +5240,8 @@
if (!fixed)
put_unused_fd(fd);
ret = PTR_ERR(file);
+ /* safe to retry */
+ req->flags |= REQ_F_PARTIAL_IO;
if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
if (ret == -ERESTARTSYS)
@@ -5419,6 +5586,7 @@
if (ret > 0)
return;
+ io_tw_lock(req->ctx, locked);
io_poll_remove_entries(req);
spin_lock(&ctx->completion_lock);
hash_del(&req->hash_node);
@@ -5631,6 +5799,14 @@
IO_APOLL_READY
};
+/*
+ * We can't reliably detect loops where a poll trigger is repeatedly
+ * followed by a failing issue attempt. Rather than failing such requests
+ * immediately, allow a certain number of retries before giving up. This
+ * should _rarely_ trigger even once, so a larger value is fine.
+ */
+#define APOLL_MAX_RETRY 128
+
static int io_arm_poll_handler(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -5641,8 +5817,6 @@
int ret;
if (!req->file || !file_can_poll(req->file))
- return IO_APOLL_ABORTED;
- if (req->flags & REQ_F_POLLED)
return IO_APOLL_ABORTED;
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
@@ -5658,9 +5832,19 @@
mask |= POLLOUT | POLLWRNORM;
}
- apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
- if (unlikely(!apoll))
- return IO_APOLL_ABORTED;
+ if (req->flags & REQ_F_POLLED) {
+ apoll = req->apoll;
+ kfree(apoll->double_poll);
+ if (unlikely(!--apoll->poll.retries)) {
+ apoll->double_poll = NULL;
+ return IO_APOLL_ABORTED;
+ }
+ } else {
+ apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+ if (unlikely(!apoll))
+ return IO_APOLL_ABORTED;
+ apoll->poll.retries = APOLL_MAX_RETRY;
+ }
apoll->double_poll = NULL;
req->apoll = apoll;
req->flags |= REQ_F_POLLED;
@@ -5831,6 +6015,8 @@
struct io_kiocb *preq;
int ret2, ret = 0;
+ io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+
spin_lock(&ctx->completion_lock);
preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
if (!preq || !io_poll_disarm(preq)) {
@@ -5862,6 +6048,7 @@
req_set_fail(req);
/* complete update request, we're done with it */
io_req_complete(req, ret);
+ io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
return 0;
}
@@ -6726,6 +6913,15 @@
*/
if (ret != -EAGAIN || !(req->ctx->flags & IORING_SETUP_IOPOLL))
break;
+ if (io_wq_worker_stopped())
+ break;
+ /*
+ * If REQ_F_NOWAIT is set, then don't wait or retry with
+ * poll. -EAGAIN is final for that case.
+ */
+ if (req->flags & REQ_F_NOWAIT)
+ break;
+
cond_resched();
} while (1);
}
@@ -6763,13 +6959,16 @@
}
static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd)
+ struct io_kiocb *req, int fd,
+ unsigned int issue_flags)
{
- struct file *file;
+ struct file *file = NULL;
unsigned long file_ptr;
+ io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
- return NULL;
+ goto out;
fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
file = (struct file *) (file_ptr & FFS_MASK);
@@ -6777,6 +6976,8 @@
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
io_req_set_rsrc_node(req);
+out:
+ io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
return file;
}
@@ -6794,10 +6995,11 @@
}
static inline struct file *io_file_get(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd, bool fixed)
+ struct io_kiocb *req, int fd, bool fixed,
+ unsigned int issue_flags)
{
if (fixed)
- return io_file_get_fixed(ctx, req, fd);
+ return io_file_get_fixed(ctx, req, fd, issue_flags);
else
return io_file_get_normal(ctx, req, fd);
}
@@ -7019,7 +7221,8 @@
if (io_op_defs[req->opcode].needs_file) {
req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
- (sqe_flags & IOSQE_FIXED_FILE));
+ (sqe_flags & IOSQE_FIXED_FILE),
+ IO_URING_F_NONBLOCK);
if (unlikely(!req->file))
ret = -EBADF;
}
@@ -7447,12 +7650,21 @@
return -EINTR;
}
+static bool current_pending_io(void)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (!tctx)
+ return false;
+ return percpu_counter_read_positive(&tctx->inflight);
+}
+
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
- ktime_t timeout)
+ ktime_t *timeout)
{
- int ret;
+ int io_wait, ret;
/* make sure we run task_work before checking for signals */
ret = io_run_task_work_sig();
@@ -7462,9 +7674,19 @@
if (test_bit(0, &ctx->check_cq_overflow))
return 1;
- if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
- return -ETIME;
- return 1;
+ /*
+ * Mark us as being in io_wait if we have pending requests, so cpufreq
+ * can take into account that the task is waiting for IO - turns out
+ * to be important for low QD IO.
+ */
+ io_wait = current->in_iowait;
+ if (current_pending_io())
+ current->in_iowait = 1;
+ ret = 1;
+ if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
+ ret = -ETIME;
+ current->in_iowait = io_wait;
+ return ret;
}
/*
@@ -7525,7 +7747,7 @@
}
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
- ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
+ ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
@@ -8927,14 +9149,17 @@
pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
pages, vmas);
if (pret == nr_pages) {
+ struct file *file = vmas[0]->vm_file;
+
/* don't support file backed memory */
for (i = 0; i < nr_pages; i++) {
- struct vm_area_struct *vma = vmas[i];
-
- if (vma_is_shmem(vma))
+ if (vmas[i]->vm_file != file) {
+ ret = -EINVAL;
+ break;
+ }
+ if (!file)
continue;
- if (vma->vm_file &&
- !is_file_hugepages(vma->vm_file)) {
+ if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
ret = -EOPNOTSUPP;
break;
}
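
The buffer-registration hunk above keeps rejecting file-backed memory but now also requires every pinned page of a registered buffer to come from the same vm_file, with shmem and hugetlbfs still allowed. A short liburing sketch of what remains acceptable, an anonymous mapping (assumption: liburing is available):

/* Sketch only: register one fixed buffer backed by anonymous memory. */
#include <liburing.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/uio.h>

int main(void)
{
	struct io_uring ring;
	struct iovec iov;
	int ret;

	if (io_uring_queue_init(4, &ring, 0))
		return 1;

	iov.iov_len = 64 * 1024;
	iov.iov_base = mmap(NULL, iov.iov_len, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (iov.iov_base == MAP_FAILED)
		return 1;

	/* 0 on success; a regular file-backed mapping would get -EOPNOTSUPP. */
	ret = io_uring_register_buffers(&ring, &iov, 1);
	printf("register anonymous buffer: %d\n", ret);

	munmap(iov.iov_base, iov.iov_len);
	io_uring_queue_exit(&ring);
	return 0;
}
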
@@ -9367,7 +9592,18 @@
/* there is little hope left, don't run it too often */
interval = HZ * 60;
}
- } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
+ /*
+ * This is really an uninterruptible wait, as it has to be
+ * complete. But it's also run from a kworker, which doesn't
+ * take signals, so it's fine to make it interruptible. This
+ * avoids scenarios where we knowingly can wait much longer
+ * on completions, for example if someone does a SIGSTOP on
+ * a task that needs to finish task_work to make this loop
+ * complete. That's a synthetic situation that should not
+ * cause a stuck task backtrace, and hence a potential panic
+ * on stuck tasks if that is enabled.
+ */
+ } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
init_completion(&exit.completion);
init_task_work(&exit.task_work, io_tctx_exit_cb);
@@ -9392,7 +9628,12 @@
wake_up_process(node->task);
mutex_unlock(&ctx->uring_lock);
- wait_for_completion(&exit.completion);
+ /*
+ * See comment above for
+ * wait_for_completion_interruptible_timeout() on why this
+ * wait is marked as interruptible.
+ */
+ wait_for_completion_interruptible(&exit.completion);
mutex_lock(&ctx->uring_lock);
}
mutex_unlock(&ctx->uring_lock);
@@ -9444,6 +9685,10 @@
/* if we failed setting up the ctx, we might not have any rings */
io_iopoll_try_reap_events(ctx);
+
+ /* drop cached put refs after potentially doing completions */
+ if (current->io_uring)
+ io_uring_drop_tctx_refs(current);
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
/*
@@ -9556,6 +9801,7 @@
while (!list_empty_careful(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
+ cond_resched();
}
}
@@ -10223,7 +10469,7 @@
if (!ctx)
return -ENOMEM;
ctx->compat = in_compat_syscall();
- if (!capable(CAP_IPC_LOCK))
+ if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
ctx->user = get_uid(current_user());
/*
@@ -10751,8 +10997,6 @@
return -ENXIO;
if (ctx->restricted) {
- if (opcode >= IORING_REGISTER_LAST)
- return -EINVAL;
opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
if (!test_bit(opcode, ctx->restrictions.register_op))
return -EACCES;
@@ -10884,6 +11128,9 @@
long ret = -EBADF;
struct fd f;
+ if (opcode >= IORING_REGISTER_LAST)
+ return -EINVAL;
+
f = fdget(fd);
if (!f.file)
return -EBADF;
--
Gitblit v1.6.2