From 8ac6c7a54ed1b98d142dce24b11c6de6a1e239a5 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 22 Oct 2024 10:36:11 +0000
Subject: [PATCH] Change 4G dial-up to QMI; quectel-CM needs to run in the background on the system

---
 kernel/io_uring/io_uring.c | 439 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 343 insertions(+), 96 deletions(-)

diff --git a/kernel/io_uring/io_uring.c b/kernel/io_uring/io_uring.c
index 35a9bcd..a966de4 100644
--- a/kernel/io_uring/io_uring.c
+++ b/kernel/io_uring/io_uring.c
@@ -496,6 +496,7 @@
 	struct file			*file;
 	struct wait_queue_head		*head;
 	__poll_t			events;
+	int				retries;
 	struct wait_queue_entry		wait;
 };
@@ -588,7 +589,9 @@
 	int				msg_flags;
 	int				bgid;
 	size_t				len;
+	size_t				done_io;
 	struct io_buffer		*kbuf;
+	void __user			*msg_control;
 };
@@ -749,6 +752,7 @@
 	REQ_F_CREDS_BIT,
 	REQ_F_REFCOUNT_BIT,
 	REQ_F_ARM_LTIMEOUT_BIT,
+	REQ_F_PARTIAL_IO_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_NOWAIT_READ_BIT,
 	REQ_F_NOWAIT_WRITE_BIT,
@@ -804,6 +808,8 @@
 	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
 	/* there is a linked timeout that has to be armed */
 	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
+	/* request has already done partial IO */
+	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
 };
 
 struct async_poll {
@@ -1098,7 +1104,8 @@
 				     unsigned nr_args);
 static void io_clean_op(struct io_kiocb *req);
 static struct file *io_file_get(struct io_ring_ctx *ctx,
-				struct io_kiocb *req, int fd, bool fixed);
+				struct io_kiocb *req, int fd, bool fixed,
+				unsigned int issue_flags);
 static void __io_queue_sqe(struct io_kiocb *req);
 static void io_rsrc_put_work(struct work_struct *work);
@@ -1524,6 +1531,8 @@
 static void io_queue_deferred(struct io_ring_ctx *ctx)
 {
+	lockdep_assert_held(&ctx->completion_lock);
+
 	while (!list_empty(&ctx->defer_list)) {
 		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);
@@ -1575,12 +1584,22 @@
 	io_queue_deferred(ctx);
 }
 
-static inline void io_commit_cqring(struct io_ring_ctx *ctx)
+static inline bool io_commit_needs_flush(struct io_ring_ctx *ctx)
 {
-	if (unlikely(ctx->off_timeout_used || ctx->drain_active))
-		__io_commit_cqring_flush(ctx);
+	return ctx->off_timeout_used || ctx->drain_active;
+}
+
+static inline void __io_commit_cqring(struct io_ring_ctx *ctx)
+{
 	/* order cqe stores with ring update */
 	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
+}
+
+static inline void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+	if (unlikely(io_commit_needs_flush(ctx)))
+		__io_commit_cqring_flush(ctx);
+	__io_commit_cqring(ctx);
 }
 
 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
@@ -2205,9 +2224,12 @@
 			}
 			req->io_task_work.func(req, &locked);
 			node = next;
+			if (unlikely(need_resched())) {
+				ctx_flush_and_put(ctx, &locked);
+				ctx = NULL;
+				cond_resched();
+			}
 		} while (node);
-
-		cond_resched();
 	}
 
 	ctx_flush_and_put(ctx, &locked);
@@ -2465,6 +2487,15 @@
 static inline bool io_run_task_work(void)
 {
+	/*
+	 * PF_IO_WORKER never returns to userspace, so check here if we have
+	 * notify work that needs processing.
+	 */
+	if (current->flags & PF_IO_WORKER &&
+	    test_thread_flag(TIF_NOTIFY_RESUME)) {
+		__set_current_state(TASK_RUNNING);
+		tracehook_notify_resume(NULL);
+	}
 	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
 		__set_current_state(TASK_RUNNING);
 		tracehook_notify_signal();
@@ -2488,17 +2519,36 @@
 	io_init_req_batch(&rb);
 	while (!list_empty(done)) {
+		struct io_uring_cqe *cqe;
+		unsigned cflags;
+
 		req = list_first_entry(done, struct io_kiocb, inflight_entry);
 		list_del(&req->inflight_entry);
-
-		io_fill_cqe_req(req, req->result, io_put_rw_kbuf(req));
+		cflags = io_put_rw_kbuf(req);
 		(*nr_events)++;
+
+		cqe = io_get_cqe(ctx);
+		if (cqe) {
+			WRITE_ONCE(cqe->user_data, req->user_data);
+			WRITE_ONCE(cqe->res, req->result);
+			WRITE_ONCE(cqe->flags, cflags);
+		} else {
+			spin_lock(&ctx->completion_lock);
+			io_cqring_event_overflow(ctx, req->user_data,
						 req->result, cflags);
+			spin_unlock(&ctx->completion_lock);
+		}
 		if (req_ref_put_and_test(req))
 			io_req_free_batch(&rb, req, &ctx->submit_state);
 	}
-	io_commit_cqring(ctx);
+	if (io_commit_needs_flush(ctx)) {
+		spin_lock(&ctx->completion_lock);
+		__io_commit_cqring_flush(ctx);
+		spin_unlock(&ctx->completion_lock);
+	}
+	__io_commit_cqring(ctx);
 	io_cqring_ev_posted_iopoll(ctx);
 	io_req_free_batch_finish(ctx, &rb);
 }
@@ -2625,6 +2675,11 @@
 			break;
 		}
 		ret = io_do_iopoll(ctx, &nr_events, min);
+
+		if (task_sigpending(current)) {
+			ret = -EINTR;
+			goto out;
+		}
 	} while (!ret && nr_events < min && !need_resched());
 out:
 	mutex_unlock(&ctx->uring_lock);
@@ -2692,17 +2747,32 @@
 }
 #endif
 
-static bool __io_complete_rw_common(struct io_kiocb *req, long res)
+/*
+ * Trigger the notifications after having done some IO, and finish the write
+ * accounting, if any.
+ */
+static void io_req_io_end(struct io_kiocb *req)
 {
-	if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
+	struct io_rw *rw = &req->rw;
+
+	if (rw->kiocb.ki_flags & IOCB_WRITE) {
 		kiocb_end_write(req);
 		fsnotify_modify(req->file);
 	} else {
 		fsnotify_access(req->file);
 	}
+}
+
+static bool __io_complete_rw_common(struct io_kiocb *req, long res)
+{
 	if (res != req->result) {
 		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
 		    io_rw_should_reissue(req)) {
+			/*
+			 * Reissue will start accounting again, finish the
+			 * current cycle.
+			 */
+			io_req_io_end(req);
 			req->flags |= REQ_F_REISSUE;
 			return true;
 		}
@@ -2712,7 +2782,7 @@
 	return false;
 }
 
-static inline int io_fixup_rw_res(struct io_kiocb *req, unsigned res)
+static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
 {
 	struct io_async_rw *io = req->async_data;
@@ -2744,12 +2814,10 @@
 	}
 }
 
-static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
-			     unsigned int issue_flags)
+static void io_req_rw_complete(struct io_kiocb *req, bool *locked)
 {
-	if (__io_complete_rw_common(req, res))
-		return;
-	__io_req_complete(req, issue_flags, io_fixup_rw_res(req, res), io_put_rw_kbuf(req));
+	io_req_io_end(req);
+	io_req_task_complete(req, locked);
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@ -2759,7 +2827,7 @@
 	if (__io_complete_rw_common(req, res))
 		return;
 	req->result = io_fixup_rw_res(req, res);
-	req->io_task_work.func = io_req_task_complete;
+	req->io_task_work.func = io_req_rw_complete;
 	io_req_task_work_add(req);
 }
@@ -2911,14 +2979,6 @@
 		req->flags |= REQ_F_ISREG;
 
 	kiocb->ki_pos = READ_ONCE(sqe->off);
-	if (kiocb->ki_pos == -1) {
-		if (!(file->f_mode & FMODE_STREAM)) {
-			req->flags |= REQ_F_CUR_POS;
-			kiocb->ki_pos = file->f_pos;
-		} else {
-			kiocb->ki_pos = 0;
-		}
-	}
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
@@ -3000,6 +3060,23 @@
 	}
 }
 
+static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
+{
+	struct kiocb *kiocb = &req->rw.kiocb;
+
+	if (kiocb->ki_pos != -1)
+		return &kiocb->ki_pos;
+
+	if (!(req->file->f_mode & FMODE_STREAM)) {
+		req->flags |= REQ_F_CUR_POS;
+		kiocb->ki_pos = req->file->f_pos;
+		return &kiocb->ki_pos;
+	}
+
+	kiocb->ki_pos = 0;
+	return NULL;
+}
+
 static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
		       unsigned int issue_flags)
 {
@@ -3007,10 +3084,20 @@
 	if (req->flags & REQ_F_CUR_POS)
 		req->file->f_pos = kiocb->ki_pos;
-	if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
-		__io_complete_rw(req, ret, 0, issue_flags);
-	else
+	if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) {
+		if (!__io_complete_rw_common(req, ret)) {
+			/*
+			 * Safe to call io_end from here as we're inline
+			 * from the submission path.
+			 */
+			io_req_io_end(req);
+			__io_req_complete(req, issue_flags,
+					  io_fixup_rw_res(req, ret),
+					  io_put_rw_kbuf(req));
+		}
+	} else {
 		io_rw_done(kiocb, ret);
+	}
 
 	if (req->flags & REQ_F_REISSUE) {
 		req->flags &= ~REQ_F_REISSUE;
@@ -3292,6 +3379,7 @@
 	struct kiocb *kiocb = &req->rw.kiocb;
 	struct file *file = req->file;
 	ssize_t ret = 0;
+	loff_t *ppos;
 
 	/*
 	 * Don't support polled IO through this interface, and we can't
@@ -3302,6 +3390,8 @@
 		return -EOPNOTSUPP;
 	if (kiocb->ki_flags & IOCB_NOWAIT)
 		return -EAGAIN;
+
+	ppos = io_kiocb_ppos(kiocb);
 
 	while (iov_iter_count(iter)) {
 		struct iovec iovec;
@@ -3316,10 +3406,10 @@
 		if (rw == READ) {
 			nr = file->f_op->read(file, iovec.iov_base,
-					      iovec.iov_len, io_kiocb_ppos(kiocb));
+					      iovec.iov_len, ppos);
 		} else {
 			nr = file->f_op->write(file, iovec.iov_base,
-					       iovec.iov_len, io_kiocb_ppos(kiocb));
+					       iovec.iov_len, ppos);
 		}
 
 		if (nr < 0) {
@@ -3520,6 +3610,7 @@
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 	struct iov_iter_state __state, *state;
 	ssize_t ret, ret2;
+	loff_t *ppos;
 
 	if (rw) {
 		iter = &rw->iter;
@@ -3552,7 +3643,9 @@
 		return ret ?: -EAGAIN;
 	}
 
-	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
+	ppos = io_kiocb_update_pos(req);
+
+	ret = rw_verify_area(READ, req->file, ppos, req->result);
 	if (unlikely(ret)) {
 		kfree(iovec);
 		return ret;
 	}
@@ -3656,6 +3749,7 @@
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 	struct iov_iter_state __state, *state;
 	ssize_t ret, ret2;
+	loff_t *ppos;
 
 	if (rw) {
 		iter = &rw->iter;
@@ -3686,7 +3780,9 @@
 	    (req->flags & REQ_F_ISREG))
 		goto copy_iov;
 
-	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
+	ppos = io_kiocb_update_pos(req);
+
+	ret = rw_verify_area(WRITE, req->file, ppos, req->result);
 	if (unlikely(ret))
 		goto out_free;
@@ -3926,7 +4022,7 @@
 		return -EAGAIN;
 
 	in = io_file_get(req->ctx, req, sp->splice_fd_in,
-			 (sp->flags & SPLICE_F_FD_IN_FIXED));
+			 (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
 	if (!in) {
 		ret = -EBADF;
 		goto done;
 	}
@@ -3966,7 +4062,7 @@
 		return -EAGAIN;
 
 	in = io_file_get(req->ctx, req, sp->splice_fd_in,
-			 (sp->flags & SPLICE_F_FD_IN_FIXED));
+			 (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
 	if (!in) {
 		ret = -EBADF;
 		goto done;
 	}
@@ -4148,9 +4244,11 @@
 	if (issue_flags & IO_URING_F_NONBLOCK) {
 		/*
 		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
-		 * it'll always -EAGAIN
+		 * it'll always -EAGAIN. Note that we test for __O_TMPFILE
+		 * because O_TMPFILE includes O_DIRECTORY, which isn't a flag
+		 * we need to force async for.
 		 */
-		if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
+		if (req->open.how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
 			return -EAGAIN;
 		op.lookup_flags |= LOOKUP_CACHED;
 		op.open_flag |= O_NONBLOCK;
 	}
@@ -4623,6 +4721,13 @@
 }
 
 #if defined(CONFIG_NET)
+static bool io_net_retry(struct socket *sock, int flags)
+{
+	if (!(flags & MSG_WAITALL))
+		return false;
+	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
+}
+
 static int io_setup_async_msg(struct io_kiocb *req,
			      struct io_async_msghdr *kmsg)
 {
@@ -4640,8 +4745,10 @@
 	if (async_msg->msg.msg_name)
 		async_msg->msg.msg_name = &async_msg->addr;
 	/* if were using fast_iov, set it to the new one */
-	if (!async_msg->free_iov)
-		async_msg->msg.msg_iter.iov = async_msg->fast_iov;
+	if (!kmsg->free_iov) {
+		size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
+		async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
+	}
 	return -EAGAIN;
 }
@@ -4649,10 +4756,16 @@
 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
 {
+	struct io_sr_msg *sr = &req->sr_msg;
+	int ret;
+
 	iomsg->msg.msg_name = &iomsg->addr;
 	iomsg->free_iov = iomsg->fast_iov;
-	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
+	ret = sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
				   req->sr_msg.msg_flags, &iomsg->free_iov);
+	/* save msg_control as sys_sendmsg() overwrites it */
+	sr->msg_control = iomsg->msg.msg_control;
+	return ret;
 }
 
 static int io_sendmsg_prep_async(struct io_kiocb *req)
@@ -4686,12 +4799,14 @@
 	if (req->ctx->compat)
 		sr->msg_flags |= MSG_CMSG_COMPAT;
 #endif
+	sr->done_io = 0;
 	return 0;
 }
 
 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_async_msghdr iomsg, *kmsg;
+	struct io_sr_msg *sr = &req->sr_msg;
 	struct socket *sock;
 	unsigned flags;
 	int min_ret = 0;
@@ -4707,6 +4822,8 @@
 		if (ret)
 			return ret;
 		kmsg = &iomsg;
+	} else {
+		kmsg->msg.msg_control = sr->msg_control;
 	}
 
 	flags = req->sr_msg.msg_flags;
@@ -4716,17 +4833,27 @@
 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
 
 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
-	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
-		return io_setup_async_msg(req, kmsg);
-	if (ret == -ERESTARTSYS)
-		ret = -EINTR;
+	if (ret < min_ret) {
+		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+			return io_setup_async_msg(req, kmsg);
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+		if (ret > 0 && io_net_retry(sock, flags)) {
+			sr->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return io_setup_async_msg(req, kmsg);
+		}
+		req_set_fail(req);
+	}
 	/* fast path, check for non-NULL to avoid function call */
 	if (kmsg->free_iov)
 		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
-	if (ret < min_ret)
-		req_set_fail(req);
+	if (ret >= 0)
+		ret += sr->done_io;
+	else if (sr->done_io)
+		ret = sr->done_io;
 	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
 }
@@ -4762,13 +4889,24 @@
 		msg.msg_flags = flags;
 
 	ret = sock_sendmsg(sock, &msg);
-	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
-		return -EAGAIN;
-	if (ret == -ERESTARTSYS)
-		ret = -EINTR;
-
-	if (ret < min_ret)
+	if (ret < min_ret) {
+		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+			return -EAGAIN;
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+		if (ret > 0 && io_net_retry(sock, flags)) {
+			sr->len -= ret;
+			sr->buf += ret;
+			sr->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return -EAGAIN;
+		}
 		req_set_fail(req);
+	}
+	if (ret >= 0)
+		ret += sr->done_io;
+	else if (sr->done_io)
+		ret = sr->done_io;
 	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
 }
@@ -4904,7 +5042,7 @@
 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 	sr->bgid = READ_ONCE(sqe->buf_group);
-	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+	sr->msg_flags = READ_ONCE(sqe->msg_flags);
 	if (sr->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
@@ -4912,12 +5050,14 @@
 	if (req->ctx->compat)
 		sr->msg_flags |= MSG_CMSG_COMPAT;
 #endif
+	sr->done_io = 0;
 	return 0;
 }
 
 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_async_msghdr iomsg, *kmsg;
+	struct io_sr_msg *sr = &req->sr_msg;
 	struct socket *sock;
 	struct io_buffer *kbuf;
 	unsigned flags;
@@ -4950,15 +5090,27 @@
 	flags = req->sr_msg.msg_flags;
 	if (force_nonblock)
 		flags |= MSG_DONTWAIT;
-	if (flags & MSG_WAITALL)
+	if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
 
 	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
				 kmsg->uaddr, flags);
-	if (force_nonblock && ret == -EAGAIN)
-		return io_setup_async_msg(req, kmsg);
-	if (ret == -ERESTARTSYS)
-		ret = -EINTR;
+	if (ret < min_ret) {
+		if (ret == -EAGAIN && force_nonblock)
+			return io_setup_async_msg(req, kmsg);
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+		if (ret > 0 && io_net_retry(sock, flags)) {
+			kmsg->msg.msg_controllen = 0;
+			kmsg->msg.msg_control = NULL;
+			sr->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return io_setup_async_msg(req, kmsg);
+		}
+		req_set_fail(req);
+	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+		req_set_fail(req);
+	}
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_recv_kbuf(req);
@@ -4966,8 +5118,10 @@
 	if (kmsg->free_iov)
 		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
-	if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
-		req_set_fail(req);
+	if (ret >= 0)
+		ret += sr->done_io;
+	else if (sr->done_io)
+		ret = sr->done_io;
 	__io_req_complete(req, issue_flags, ret, cflags);
 	return 0;
 }
@@ -5014,15 +5168,29 @@
 		min_ret = iov_iter_count(&msg.msg_iter);
 
 	ret = sock_recvmsg(sock, &msg, flags);
-	if (force_nonblock && ret == -EAGAIN)
-		return -EAGAIN;
-	if (ret == -ERESTARTSYS)
-		ret = -EINTR;
+	if (ret < min_ret) {
+		if (ret == -EAGAIN && force_nonblock)
+			return -EAGAIN;
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+		if (ret > 0 && io_net_retry(sock, flags)) {
+			sr->len -= ret;
+			sr->buf += ret;
+			sr->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return -EAGAIN;
+		}
+		req_set_fail(req);
+	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
 out_free:
+		req_set_fail(req);
+	}
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_recv_kbuf(req);
-	if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
-		req_set_fail(req);
+	if (ret >= 0)
+		ret += sr->done_io;
+	else if (sr->done_io)
+		ret = sr->done_io;
 	__io_req_complete(req, issue_flags, ret, cflags);
 	return 0;
 }
@@ -5060,9 +5228,6 @@
 	struct file *file;
 	int ret, fd;
 
-	if (req->file->f_flags & O_NONBLOCK)
-		req->flags |= REQ_F_NOWAIT;
-
 	if (!fixed) {
 		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
 		if (unlikely(fd < 0))
@@ -5075,6 +5240,8 @@
 		if (!fixed)
 			put_unused_fd(fd);
 		ret = PTR_ERR(file);
+		/* safe to retry */
+		req->flags |= REQ_F_PARTIAL_IO;
 		if (ret == -EAGAIN && force_nonblock)
 			return -EAGAIN;
 		if (ret == -ERESTARTSYS)
@@ -5419,6 +5586,7 @@
 	if (ret > 0)
 		return;
 
+	io_tw_lock(req->ctx, locked);
 	io_poll_remove_entries(req);
 	spin_lock(&ctx->completion_lock);
 	hash_del(&req->hash_node);
@@ -5631,6 +5799,14 @@
 	IO_APOLL_READY
 };
 
+/*
+ * We can't reliably detect loops in repeated poll triggers and issue
+ * subsequently failing. But rather than fail these immediately, allow a
+ * certain amount of retries before we give up. Given that this condition
+ * should _rarely_ trigger even once, we should be fine with a larger value.
+ */
+#define APOLL_MAX_RETRY	128
+
 static int io_arm_poll_handler(struct io_kiocb *req)
 {
 	const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -5641,8 +5817,6 @@
 	int ret;
 
 	if (!req->file || !file_can_poll(req->file))
-		return IO_APOLL_ABORTED;
-	if (req->flags & REQ_F_POLLED)
 		return IO_APOLL_ABORTED;
 	if (!def->pollin && !def->pollout)
 		return IO_APOLL_ABORTED;
@@ -5658,9 +5832,19 @@
 		mask |= POLLOUT | POLLWRNORM;
 	}
 
-	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
-	if (unlikely(!apoll))
-		return IO_APOLL_ABORTED;
+	if (req->flags & REQ_F_POLLED) {
+		apoll = req->apoll;
+		kfree(apoll->double_poll);
+		if (unlikely(!--apoll->poll.retries)) {
+			apoll->double_poll = NULL;
+			return IO_APOLL_ABORTED;
+		}
+	} else {
+		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+		if (unlikely(!apoll))
+			return IO_APOLL_ABORTED;
+		apoll->poll.retries = APOLL_MAX_RETRY;
+	}
 	apoll->double_poll = NULL;
 	req->apoll = apoll;
 	req->flags |= REQ_F_POLLED;
@@ -5831,6 +6015,8 @@
 	struct io_kiocb *preq;
 	int ret2, ret = 0;
 
+	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+
 	spin_lock(&ctx->completion_lock);
 	preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
 	if (!preq || !io_poll_disarm(preq)) {
@@ -5862,6 +6048,7 @@
 		req_set_fail(req);
 	/* complete update request, we're done with it */
 	io_req_complete(req, ret);
+	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
 	return 0;
 }
@@ -6726,6 +6913,15 @@
 		 */
 		if (ret != -EAGAIN || !(req->ctx->flags & IORING_SETUP_IOPOLL))
 			break;
+		if (io_wq_worker_stopped())
+			break;
+		/*
+		 * If REQ_F_NOWAIT is set, then don't wait or retry with
+		 * poll. -EAGAIN is final for that case.
+		 */
+		if (req->flags & REQ_F_NOWAIT)
+			break;
+
 		cond_resched();
 	} while (1);
 }
@@ -6763,13 +6959,16 @@
 }
 
 static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
-					     struct io_kiocb *req, int fd)
+					     struct io_kiocb *req, int fd,
+					     unsigned int issue_flags)
 {
-	struct file *file;
+	struct file *file = NULL;
 	unsigned long file_ptr;
 
+	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
+
 	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
-		return NULL;
+		goto out;
 	fd = array_index_nospec(fd, ctx->nr_user_files);
 	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
 	file = (struct file *) (file_ptr & FFS_MASK);
@@ -6777,6 +6976,8 @@
 	/* mask in overlapping REQ_F and FFS bits */
 	req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
 	io_req_set_rsrc_node(req);
+out:
+	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
 	return file;
 }
@@ -6794,10 +6995,11 @@
 }
 
 static inline struct file *io_file_get(struct io_ring_ctx *ctx,
-				       struct io_kiocb *req, int fd, bool fixed)
+				       struct io_kiocb *req, int fd, bool fixed,
+				       unsigned int issue_flags)
 {
 	if (fixed)
-		return io_file_get_fixed(ctx, req, fd);
+		return io_file_get_fixed(ctx, req, fd, issue_flags);
 	else
 		return io_file_get_normal(ctx, req, fd);
 }
@@ -7019,7 +7221,8 @@
 	if (io_op_defs[req->opcode].needs_file) {
 		req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
-					(sqe_flags & IOSQE_FIXED_FILE));
+					(sqe_flags & IOSQE_FIXED_FILE),
+					IO_URING_F_NONBLOCK);
 		if (unlikely(!req->file))
 			ret = -EBADF;
 	}
@@ -7447,12 +7650,21 @@
 	return -EINTR;
 }
 
+static bool current_pending_io(void)
+{
+	struct io_uring_task *tctx = current->io_uring;
+
+	if (!tctx)
+		return false;
+	return percpu_counter_read_positive(&tctx->inflight);
+}
+
 /* when returns >0, the caller should retry */
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
					  struct io_wait_queue *iowq,
-					  ktime_t timeout)
+					  ktime_t *timeout)
 {
-	int ret;
+	int io_wait, ret;
 
 	/* make sure we run task_work before checking for signals */
 	ret = io_run_task_work_sig();
@@ -7462,9 +7674,19 @@
 	if (test_bit(0, &ctx->check_cq_overflow))
 		return 1;
 
-	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
-		return -ETIME;
-	return 1;
+	/*
+	 * Mark us as being in io_wait if we have pending requests, so cpufreq
+	 * can take into account that the task is waiting for IO - turns out
+	 * to be important for low QD IO.
+	 */
+	io_wait = current->in_iowait;
+	if (current_pending_io())
+		current->in_iowait = 1;
+	ret = 1;
+	if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
+		ret = -ETIME;
+	current->in_iowait = io_wait;
+	return ret;
 }
 
 /*
@@ -7525,7 +7747,7 @@
 		}
 		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
					  TASK_INTERRUPTIBLE);
-		ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
+		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
 		finish_wait(&ctx->cq_wait, &iowq.wq);
 		cond_resched();
 	} while (ret > 0);
@@ -8927,14 +9149,17 @@
 	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
 	if (pret == nr_pages) {
+		struct file *file = vmas[0]->vm_file;
+
 		/* don't support file backed memory */
 		for (i = 0; i < nr_pages; i++) {
-			struct vm_area_struct *vma = vmas[i];
-
-			if (vma_is_shmem(vma))
+			if (vmas[i]->vm_file != file) {
+				ret = -EINVAL;
+				break;
+			}
+			if (!file)
 				continue;
-			if (vma->vm_file &&
-			    !is_file_hugepages(vma->vm_file)) {
+			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
 				ret = -EOPNOTSUPP;
 				break;
 			}
@@ -9367,7 +9592,18 @@
 			/* there is little hope left, don't run it too often */
 			interval = HZ * 60;
 		}
-	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
+		/*
+		 * This is really an uninterruptible wait, as it has to be
+		 * complete. But it's also run from a kworker, which doesn't
+		 * take signals, so it's fine to make it interruptible. This
+		 * avoids scenarios where we knowingly can wait much longer
+		 * on completions, for example if someone does a SIGSTOP on
+		 * a task that needs to finish task_work to make this loop
+		 * complete. That's a synthetic situation that should not
+		 * cause a stuck task backtrace, and hence a potential panic
+		 * on stuck tasks if that is enabled.
+		 */
+	} while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
 
 	init_completion(&exit.completion);
 	init_task_work(&exit.task_work, io_tctx_exit_cb);
@@ -9392,7 +9628,12 @@
 			wake_up_process(node->task);
 
 		mutex_unlock(&ctx->uring_lock);
-		wait_for_completion(&exit.completion);
+		/*
+		 * See comment above for
+		 * wait_for_completion_interruptible_timeout() on why this
+		 * wait is marked as interruptible.
+		 */
+		wait_for_completion_interruptible(&exit.completion);
 		mutex_lock(&ctx->uring_lock);
 	}
 	mutex_unlock(&ctx->uring_lock);
@@ -9444,6 +9685,10 @@
 	/* if we failed setting up the ctx, we might not have any rings */
 	io_iopoll_try_reap_events(ctx);
+
+	/* drop cached put refs after potentially doing completions */
+	if (current->io_uring)
+		io_uring_drop_tctx_refs(current);
 
 	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
 	/*
@@ -9556,6 +9801,7 @@
 		while (!list_empty_careful(&ctx->iopoll_list)) {
 			io_iopoll_try_reap_events(ctx);
 			ret = true;
+			cond_resched();
 		}
 	}
@@ -10223,7 +10469,7 @@
 	if (!ctx)
 		return -ENOMEM;
 	ctx->compat = in_compat_syscall();
-	if (!capable(CAP_IPC_LOCK))
+	if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
 		ctx->user = get_uid(current_user());
 
 	/*
@@ -10751,8 +10997,6 @@
 		return -ENXIO;
 
 	if (ctx->restricted) {
-		if (opcode >= IORING_REGISTER_LAST)
-			return -EINVAL;
 		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
 		if (!test_bit(opcode, ctx->restrictions.register_op))
 			return -EACCES;
@@ -10884,6 +11128,9 @@
 	long ret = -EBADF;
 	struct fd f;
 
+	if (opcode >= IORING_REGISTER_LAST)
+		return -EINVAL;
+
 	f = fdget(fd);
 	if (!f.file)
 		return -EBADF;
-- 
Gitblit v1.6.2
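Most of the network hunks above backport the done_io/io_net_retry() partial-I/O accounting, which lets MSG_WAITALL send/recv requests on stream sockets complete with the full transfer length instead of a short result. The user-space sketch below is illustrative only and is not part of the patch: it assumes liburing is installed and a kernel that carries these hunks, and it exercises the recv path by delivering an 8-byte payload in two 4-byte chunks.

```c
/* Illustrative sketch (not part of the patch): exercises the MSG_WAITALL
 * recv retry that the done_io/io_net_retry() hunks above backport.
 * Assumes liburing is available. */
#include <liburing.h>
#include <sys/socket.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int sv[2];
	char buf[8];

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv))
		return 1;
	if (io_uring_queue_init(4, &ring, 0))
		return 1;

	/* first half of the payload is available immediately */
	if (write(sv[1], "ABCD", 4) != 4)
		return 1;

	/* ask for 8 bytes; MSG_WAITALL means "don't complete short" */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_recv(sqe, sv[0], buf, sizeof(buf), MSG_WAITALL);
	io_uring_submit(&ring);

	/* second half arrives later; a kernel with these hunks should keep
	 * the request alive (REQ_F_PARTIAL_IO + done_io) and complete it
	 * with res == 8 */
	if (write(sv[1], "EFGH", 4) != 4)
		return 1;

	io_uring_wait_cqe(&ring, &cqe);
	printf("recv completed with res=%d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}
```

On a kernel without the retry logic, the same request may complete early with res=4 once the first chunk is consumed.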