.. | ..
496 | 496 | struct file *file;
497 | 497 | struct wait_queue_head *head;
498 | 498 | __poll_t events;
| 499 | + int retries;
499 | 500 | struct wait_queue_entry wait;
500 | 501 | };
501 | 502 |
.. | ..
588 | 589 | int msg_flags;
589 | 590 | int bgid;
590 | 591 | size_t len;
| 592 | + size_t done_io;
591 | 593 | struct io_buffer *kbuf;
| 594 | + void __user *msg_control;
592 | 595 | };
593 | 596 |
594 | 597 | struct io_open {
.. | ..
749 | 752 | REQ_F_CREDS_BIT,
750 | 753 | REQ_F_REFCOUNT_BIT,
751 | 754 | REQ_F_ARM_LTIMEOUT_BIT,
| 755 | + REQ_F_PARTIAL_IO_BIT,
752 | 756 | /* keep async read/write and isreg together and in order */
753 | 757 | REQ_F_NOWAIT_READ_BIT,
754 | 758 | REQ_F_NOWAIT_WRITE_BIT,
.. | ..
804 | 808 | REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
805 | 809 | /* there is a linked timeout that has to be armed */
806 | 810 | REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
| 811 | + /* request has already done partial IO */
| 812 | + REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
807 | 813 | };
808 | 814 |
809 | 815 | struct async_poll {
.. | ..
1098 | 1104 | unsigned nr_args);
1099 | 1105 | static void io_clean_op(struct io_kiocb *req);
1100 | 1106 | static struct file *io_file_get(struct io_ring_ctx *ctx,
1101 | | - struct io_kiocb *req, int fd, bool fixed);
| 1107 | + struct io_kiocb *req, int fd, bool fixed,
| 1108 | + unsigned int issue_flags);
1102 | 1109 | static void __io_queue_sqe(struct io_kiocb *req);
1103 | 1110 | static void io_rsrc_put_work(struct work_struct *work);
1104 | 1111 |
.. | ..
1524 | 1531 |
1525 | 1532 | static void io_queue_deferred(struct io_ring_ctx *ctx)
1526 | 1533 | {
| 1534 | + lockdep_assert_held(&ctx->completion_lock);
| 1535 | +
1527 | 1536 | while (!list_empty(&ctx->defer_list)) {
1528 | 1537 | struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1529 | 1538 | struct io_defer_entry, list);
.. | ..
1575 | 1584 | io_queue_deferred(ctx);
1576 | 1585 | }
1577 | 1586 |
1578 | | -static inline void io_commit_cqring(struct io_ring_ctx *ctx)
| 1587 | +static inline bool io_commit_needs_flush(struct io_ring_ctx *ctx)
1579 | 1588 | {
1580 | | - if (unlikely(ctx->off_timeout_used || ctx->drain_active))
1581 | | - __io_commit_cqring_flush(ctx);
| 1589 | + return ctx->off_timeout_used || ctx->drain_active;
| 1590 | +}
| 1591 | +
| 1592 | +static inline void __io_commit_cqring(struct io_ring_ctx *ctx)
| 1593 | +{
1582 | 1594 | /* order cqe stores with ring update */
1583 | 1595 | smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
| 1596 | +}
| 1597 | +
| 1598 | +static inline void io_commit_cqring(struct io_ring_ctx *ctx)
| 1599 | +{
| 1600 | + if (unlikely(io_commit_needs_flush(ctx)))
| 1601 | + __io_commit_cqring_flush(ctx);
| 1602 | + __io_commit_cqring(ctx);
1584 | 1603 | }
1585 | 1604 |
1586 | 1605 | static inline bool io_sqring_full(struct io_ring_ctx *ctx)
.. | ..
2205 | 2224 | }
2206 | 2225 | req->io_task_work.func(req, &locked);
2207 | 2226 | node = next;
| 2227 | + if (unlikely(need_resched())) {
| 2228 | + ctx_flush_and_put(ctx, &locked);
| 2229 | + ctx = NULL;
| 2230 | + cond_resched();
| 2231 | + }
2208 | 2232 | } while (node);
2209 | | -
2210 | | - cond_resched();
2211 | 2233 | }
2212 | 2234 |
2213 | 2235 | ctx_flush_and_put(ctx, &locked);
.. | ..
2465 | 2487 |
2466 | 2488 | static inline bool io_run_task_work(void)
2467 | 2489 | {
| 2490 | + /*
| 2491 | + * PF_IO_WORKER never returns to userspace, so check here if we have
| 2492 | + * notify work that needs processing.
| 2493 | + */
| 2494 | + if (current->flags & PF_IO_WORKER &&
| 2495 | + test_thread_flag(TIF_NOTIFY_RESUME)) {
| 2496 | + __set_current_state(TASK_RUNNING);
| 2497 | + tracehook_notify_resume(NULL);
| 2498 | + }
2468 | 2499 | if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
2469 | 2500 | __set_current_state(TASK_RUNNING);
2470 | 2501 | tracehook_notify_signal();
.. | ..
2488 | 2519 |
2489 | 2520 | io_init_req_batch(&rb);
2490 | 2521 | while (!list_empty(done)) {
| 2522 | + struct io_uring_cqe *cqe;
| 2523 | + unsigned cflags;
| 2524 | +
2491 | 2525 | req = list_first_entry(done, struct io_kiocb, inflight_entry);
2492 | 2526 | list_del(&req->inflight_entry);
2493 | | -
2494 | | - io_fill_cqe_req(req, req->result, io_put_rw_kbuf(req));
| 2527 | + cflags = io_put_rw_kbuf(req);
2495 | 2528 | (*nr_events)++;
| 2529 | +
| 2530 | + cqe = io_get_cqe(ctx);
| 2531 | + if (cqe) {
| 2532 | + WRITE_ONCE(cqe->user_data, req->user_data);
| 2533 | + WRITE_ONCE(cqe->res, req->result);
| 2534 | + WRITE_ONCE(cqe->flags, cflags);
| 2535 | + } else {
| 2536 | + spin_lock(&ctx->completion_lock);
| 2537 | + io_cqring_event_overflow(ctx, req->user_data,
| 2538 | + req->result, cflags);
| 2539 | + spin_unlock(&ctx->completion_lock);
| 2540 | + }
2496 | 2541 |
2497 | 2542 | if (req_ref_put_and_test(req))
2498 | 2543 | io_req_free_batch(&rb, req, &ctx->submit_state);
2499 | 2544 | }
2500 | 2545 |
2501 | | - io_commit_cqring(ctx);
| 2546 | + if (io_commit_needs_flush(ctx)) {
| 2547 | + spin_lock(&ctx->completion_lock);
| 2548 | + __io_commit_cqring_flush(ctx);
| 2549 | + spin_unlock(&ctx->completion_lock);
| 2550 | + }
| 2551 | + __io_commit_cqring(ctx);
2502 | 2552 | io_cqring_ev_posted_iopoll(ctx);
2503 | 2553 | io_req_free_batch_finish(ctx, &rb);
2504 | 2554 | }
.. | ..
2625 | 2675 | break;
2626 | 2676 | }
2627 | 2677 | ret = io_do_iopoll(ctx, &nr_events, min);
| 2678 | +
| 2679 | + if (task_sigpending(current)) {
| 2680 | + ret = -EINTR;
| 2681 | + goto out;
| 2682 | + }
2628 | 2683 | } while (!ret && nr_events < min && !need_resched());
2629 | 2684 | out:
2630 | 2685 | mutex_unlock(&ctx->uring_lock);
.. | ..
2692 | 2747 | }
2693 | 2748 | #endif
2694 | 2749 |
2695 | | -static bool __io_complete_rw_common(struct io_kiocb *req, long res)
| 2750 | +/*
| 2751 | + * Trigger the notifications after having done some IO, and finish the write
| 2752 | + * accounting, if any.
| 2753 | + */
| 2754 | +static void io_req_io_end(struct io_kiocb *req)
2696 | 2755 | {
2697 | | - if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
| 2756 | + struct io_rw *rw = &req->rw;
| 2757 | +
| 2758 | + if (rw->kiocb.ki_flags & IOCB_WRITE) {
2698 | 2759 | kiocb_end_write(req);
2699 | 2760 | fsnotify_modify(req->file);
2700 | 2761 | } else {
2701 | 2762 | fsnotify_access(req->file);
2702 | 2763 | }
| 2764 | +}
| 2765 | +
| 2766 | +static bool __io_complete_rw_common(struct io_kiocb *req, long res)
| 2767 | +{
2703 | 2768 | if (res != req->result) {
2704 | 2769 | if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
2705 | 2770 | io_rw_should_reissue(req)) {
| 2771 | + /*
| 2772 | + * Reissue will start accounting again, finish the
| 2773 | + * current cycle.
| 2774 | + */
| 2775 | + io_req_io_end(req);
2706 | 2776 | req->flags |= REQ_F_REISSUE;
2707 | 2777 | return true;
2708 | 2778 | }
.. | ..
2712 | 2782 | return false;
2713 | 2783 | }
2714 | 2784 |
2715 | | -static inline int io_fixup_rw_res(struct io_kiocb *req, unsigned res)
| 2785 | +static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
2716 | 2786 | {
2717 | 2787 | struct io_async_rw *io = req->async_data;
2718 | 2788 |
.. | ..
2744 | 2814 | }
2745 | 2815 | }
2746 | 2816 |
2747 | | -static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2748 | | - unsigned int issue_flags)
| 2817 | +static void io_req_rw_complete(struct io_kiocb *req, bool *locked)
2749 | 2818 | {
2750 | | - if (__io_complete_rw_common(req, res))
2751 | | - return;
2752 | | - __io_req_complete(req, issue_flags, io_fixup_rw_res(req, res), io_put_rw_kbuf(req));
| 2819 | + io_req_io_end(req);
| 2820 | + io_req_task_complete(req, locked);
2753 | 2821 | }
2754 | 2822 |
2755 | 2823 | static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
.. | ..
2759 | 2827 | if (__io_complete_rw_common(req, res))
2760 | 2828 | return;
2761 | 2829 | req->result = io_fixup_rw_res(req, res);
2762 | | - req->io_task_work.func = io_req_task_complete;
| 2830 | + req->io_task_work.func = io_req_rw_complete;
2763 | 2831 | io_req_task_work_add(req);
2764 | 2832 | }
2765 | 2833 |
.. | ..
2911 | 2979 | req->flags |= REQ_F_ISREG;
2912 | 2980 |
2913 | 2981 | kiocb->ki_pos = READ_ONCE(sqe->off);
2914 | | - if (kiocb->ki_pos == -1) {
2915 | | - if (!(file->f_mode & FMODE_STREAM)) {
2916 | | - req->flags |= REQ_F_CUR_POS;
2917 | | - kiocb->ki_pos = file->f_pos;
2918 | | - } else {
2919 | | - kiocb->ki_pos = 0;
2920 | | - }
2921 | | - }
2922 | 2982 | kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2923 | 2983 | kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2924 | 2984 | ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
.. | ..
3000 | 3060 | }
3001 | 3061 | }
3002 | 3062 |
| 3063 | +static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
| 3064 | +{
| 3065 | + struct kiocb *kiocb = &req->rw.kiocb;
| 3066 | +
| 3067 | + if (kiocb->ki_pos != -1)
| 3068 | + return &kiocb->ki_pos;
| 3069 | +
| 3070 | + if (!(req->file->f_mode & FMODE_STREAM)) {
| 3071 | + req->flags |= REQ_F_CUR_POS;
| 3072 | + kiocb->ki_pos = req->file->f_pos;
| 3073 | + return &kiocb->ki_pos;
| 3074 | + }
| 3075 | +
| 3076 | + kiocb->ki_pos = 0;
| 3077 | + return NULL;
| 3078 | +}
| 3079 | +
3003 | 3080 | static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
3004 | 3081 | unsigned int issue_flags)
3005 | 3082 | {
.. | ..
3007 | 3084 |
3008 | 3085 | if (req->flags & REQ_F_CUR_POS)
3009 | 3086 | req->file->f_pos = kiocb->ki_pos;
3010 | | - if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
3011 | | - __io_complete_rw(req, ret, 0, issue_flags);
3012 | | - else
| 3087 | + if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) {
| 3088 | + if (!__io_complete_rw_common(req, ret)) {
| 3089 | + /*
| 3090 | + * Safe to call io_end from here as we're inline
| 3091 | + * from the submission path.
| 3092 | + */
| 3093 | + io_req_io_end(req);
| 3094 | + __io_req_complete(req, issue_flags,
| 3095 | + io_fixup_rw_res(req, ret),
| 3096 | + io_put_rw_kbuf(req));
| 3097 | + }
| 3098 | + } else {
3013 | 3099 | io_rw_done(kiocb, ret);
| 3100 | + }
3014 | 3101 |
3015 | 3102 | if (req->flags & REQ_F_REISSUE) {
3016 | 3103 | req->flags &= ~REQ_F_REISSUE;
.. | ..
3292 | 3379 | struct kiocb *kiocb = &req->rw.kiocb;
3293 | 3380 | struct file *file = req->file;
3294 | 3381 | ssize_t ret = 0;
| 3382 | + loff_t *ppos;
3295 | 3383 |
3296 | 3384 | /*
3297 | 3385 | * Don't support polled IO through this interface, and we can't
.. | ..
3302 | 3390 | return -EOPNOTSUPP;
3303 | 3391 | if (kiocb->ki_flags & IOCB_NOWAIT)
3304 | 3392 | return -EAGAIN;
| 3393 | +
| 3394 | + ppos = io_kiocb_ppos(kiocb);
3305 | 3395 |
3306 | 3396 | while (iov_iter_count(iter)) {
3307 | 3397 | struct iovec iovec;
.. | ..
3316 | 3406 |
3317 | 3407 | if (rw == READ) {
3318 | 3408 | nr = file->f_op->read(file, iovec.iov_base,
3319 | | - iovec.iov_len, io_kiocb_ppos(kiocb));
| 3409 | + iovec.iov_len, ppos);
3320 | 3410 | } else {
3321 | 3411 | nr = file->f_op->write(file, iovec.iov_base,
3322 | | - iovec.iov_len, io_kiocb_ppos(kiocb));
| 3412 | + iovec.iov_len, ppos);
3323 | 3413 | }
3324 | 3414 |
3325 | 3415 | if (nr < 0) {
.. | ..
3520 | 3610 | bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3521 | 3611 | struct iov_iter_state __state, *state;
3522 | 3612 | ssize_t ret, ret2;
| 3613 | + loff_t *ppos;
3523 | 3614 |
3524 | 3615 | if (rw) {
3525 | 3616 | iter = &rw->iter;
.. | ..
3552 | 3643 | return ret ?: -EAGAIN;
3553 | 3644 | }
3554 | 3645 |
3555 | | - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
| 3646 | + ppos = io_kiocb_update_pos(req);
| 3647 | +
| 3648 | + ret = rw_verify_area(READ, req->file, ppos, req->result);
3556 | 3649 | if (unlikely(ret)) {
3557 | 3650 | kfree(iovec);
3558 | 3651 | return ret;
.. | ..
3656 | 3749 | bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3657 | 3750 | struct iov_iter_state __state, *state;
3658 | 3751 | ssize_t ret, ret2;
| 3752 | + loff_t *ppos;
3659 | 3753 |
3660 | 3754 | if (rw) {
3661 | 3755 | iter = &rw->iter;
.. | ..
3686 | 3780 | (req->flags & REQ_F_ISREG))
3687 | 3781 | goto copy_iov;
3688 | 3782 |
3689 | | - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
| 3783 | + ppos = io_kiocb_update_pos(req);
| 3784 | +
| 3785 | + ret = rw_verify_area(WRITE, req->file, ppos, req->result);
3690 | 3786 | if (unlikely(ret))
3691 | 3787 | goto out_free;
3692 | 3788 |
.. | ..
3926 | 4022 | return -EAGAIN;
3927 | 4023 |
3928 | 4024 | in = io_file_get(req->ctx, req, sp->splice_fd_in,
3929 | | - (sp->flags & SPLICE_F_FD_IN_FIXED));
| 4025 | + (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
3930 | 4026 | if (!in) {
3931 | 4027 | ret = -EBADF;
3932 | 4028 | goto done;
.. | ..
3966 | 4062 | return -EAGAIN;
3967 | 4063 |
3968 | 4064 | in = io_file_get(req->ctx, req, sp->splice_fd_in,
3969 | | - (sp->flags & SPLICE_F_FD_IN_FIXED));
| 4065 | + (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
3970 | 4066 | if (!in) {
3971 | 4067 | ret = -EBADF;
3972 | 4068 | goto done;
.. | ..
4148 | 4244 | if (issue_flags & IO_URING_F_NONBLOCK) {
4149 | 4245 | /*
4150 | 4246 | * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4151 | | - * it'll always -EAGAIN
| 4247 | + * it'll always -EAGAIN. Note that we test for __O_TMPFILE
| 4248 | + * because O_TMPFILE includes O_DIRECTORY, which isn't a flag
| 4249 | + * we need to force async for.
4152 | 4250 | */
4153 | | - if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
| 4251 | + if (req->open.how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
4154 | 4252 | return -EAGAIN;
4155 | 4253 | op.lookup_flags |= LOOKUP_CACHED;
4156 | 4254 | op.open_flag |= O_NONBLOCK;
.. | ..
4623 | 4721 | }
4624 | 4722 |
4625 | 4723 | #if defined(CONFIG_NET)
| 4724 | +static bool io_net_retry(struct socket *sock, int flags)
| 4725 | +{
| 4726 | + if (!(flags & MSG_WAITALL))
| 4727 | + return false;
| 4728 | + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
| 4729 | +}
| 4730 | +
4626 | 4731 | static int io_setup_async_msg(struct io_kiocb *req,
4627 | 4732 | struct io_async_msghdr *kmsg)
4628 | 4733 | {
.. | ..
4640 | 4745 | if (async_msg->msg.msg_name)
4641 | 4746 | async_msg->msg.msg_name = &async_msg->addr;
4642 | 4747 | /* if were using fast_iov, set it to the new one */
4643 | | - if (!async_msg->free_iov)
4644 | | - async_msg->msg.msg_iter.iov = async_msg->fast_iov;
| 4748 | + if (!kmsg->free_iov) {
| 4749 | + size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
| 4750 | + async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
| 4751 | + }
4645 | 4752 |
4646 | 4753 | return -EAGAIN;
4647 | 4754 | }
.. | ..
4649 | 4756 | static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4650 | 4757 | struct io_async_msghdr *iomsg)
4651 | 4758 | {
| 4759 | + struct io_sr_msg *sr = &req->sr_msg;
| 4760 | + int ret;
| 4761 | +
4652 | 4762 | iomsg->msg.msg_name = &iomsg->addr;
4653 | 4763 | iomsg->free_iov = iomsg->fast_iov;
4654 | | - return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
| 4764 | + ret = sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4655 | 4765 | req->sr_msg.msg_flags, &iomsg->free_iov);
| 4766 | + /* save msg_control as sys_sendmsg() overwrites it */
| 4767 | + sr->msg_control = iomsg->msg.msg_control;
| 4768 | + return ret;
4656 | 4769 | }
4657 | 4770 |
4658 | 4771 | static int io_sendmsg_prep_async(struct io_kiocb *req)
.. | ..
4686 | 4799 | if (req->ctx->compat)
4687 | 4800 | sr->msg_flags |= MSG_CMSG_COMPAT;
4688 | 4801 | #endif
| 4802 | + sr->done_io = 0;
4689 | 4803 | return 0;
4690 | 4804 | }
4691 | 4805 |
4692 | 4806 | static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4693 | 4807 | {
4694 | 4808 | struct io_async_msghdr iomsg, *kmsg;
| 4809 | + struct io_sr_msg *sr = &req->sr_msg;
4695 | 4810 | struct socket *sock;
4696 | 4811 | unsigned flags;
4697 | 4812 | int min_ret = 0;
.. | ..
4707 | 4822 | if (ret)
4708 | 4823 | return ret;
4709 | 4824 | kmsg = &iomsg;
| 4825 | + } else {
| 4826 | + kmsg->msg.msg_control = sr->msg_control;
4710 | 4827 | }
4711 | 4828 |
4712 | 4829 | flags = req->sr_msg.msg_flags;
.. | ..
4716 | 4833 | min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4717 | 4834 |
4718 | 4835 | ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4719 | | - if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4720 | | - return io_setup_async_msg(req, kmsg);
4721 | | - if (ret == -ERESTARTSYS)
4722 | | - ret = -EINTR;
4723 | 4836 |
| 4837 | + if (ret < min_ret) {
| 4838 | + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
| 4839 | + return io_setup_async_msg(req, kmsg);
| 4840 | + if (ret == -ERESTARTSYS)
| 4841 | + ret = -EINTR;
| 4842 | + if (ret > 0 && io_net_retry(sock, flags)) {
| 4843 | + sr->done_io += ret;
| 4844 | + req->flags |= REQ_F_PARTIAL_IO;
| 4845 | + return io_setup_async_msg(req, kmsg);
| 4846 | + }
| 4847 | + req_set_fail(req);
| 4848 | + }
4724 | 4849 | /* fast path, check for non-NULL to avoid function call */
4725 | 4850 | if (kmsg->free_iov)
4726 | 4851 | kfree(kmsg->free_iov);
4727 | 4852 | req->flags &= ~REQ_F_NEED_CLEANUP;
4728 | | - if (ret < min_ret)
4729 | | - req_set_fail(req);
| 4853 | + if (ret >= 0)
| 4854 | + ret += sr->done_io;
| 4855 | + else if (sr->done_io)
| 4856 | + ret = sr->done_io;
4730 | 4857 | __io_req_complete(req, issue_flags, ret, 0);
4731 | 4858 | return 0;
4732 | 4859 | }
.. | ..
4762 | 4889 |
4763 | 4890 | msg.msg_flags = flags;
4764 | 4891 | ret = sock_sendmsg(sock, &msg);
4765 | | - if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4766 | | - return -EAGAIN;
4767 | | - if (ret == -ERESTARTSYS)
4768 | | - ret = -EINTR;
4769 | | -
4770 | | - if (ret < min_ret)
| 4892 | + if (ret < min_ret) {
| 4893 | + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
| 4894 | + return -EAGAIN;
| 4895 | + if (ret == -ERESTARTSYS)
| 4896 | + ret = -EINTR;
| 4897 | + if (ret > 0 && io_net_retry(sock, flags)) {
| 4898 | + sr->len -= ret;
| 4899 | + sr->buf += ret;
| 4900 | + sr->done_io += ret;
| 4901 | + req->flags |= REQ_F_PARTIAL_IO;
| 4902 | + return -EAGAIN;
| 4903 | + }
4771 | 4904 | req_set_fail(req);
| 4905 | + }
| 4906 | + if (ret >= 0)
| 4907 | + ret += sr->done_io;
| 4908 | + else if (sr->done_io)
| 4909 | + ret = sr->done_io;
4772 | 4910 | __io_req_complete(req, issue_flags, ret, 0);
4773 | 4911 | return 0;
4774 | 4912 | }
.. | ..
4904 | 5042 | sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4905 | 5043 | sr->len = READ_ONCE(sqe->len);
4906 | 5044 | sr->bgid = READ_ONCE(sqe->buf_group);
4907 | | - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
| 5045 | + sr->msg_flags = READ_ONCE(sqe->msg_flags);
4908 | 5046 | if (sr->msg_flags & MSG_DONTWAIT)
4909 | 5047 | req->flags |= REQ_F_NOWAIT;
4910 | 5048 |
.. | ..
4912 | 5050 | if (req->ctx->compat)
4913 | 5051 | sr->msg_flags |= MSG_CMSG_COMPAT;
4914 | 5052 | #endif
| 5053 | + sr->done_io = 0;
4915 | 5054 | return 0;
4916 | 5055 | }
4917 | 5056 |
4918 | 5057 | static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
4919 | 5058 | {
4920 | 5059 | struct io_async_msghdr iomsg, *kmsg;
| 5060 | + struct io_sr_msg *sr = &req->sr_msg;
4921 | 5061 | struct socket *sock;
4922 | 5062 | struct io_buffer *kbuf;
4923 | 5063 | unsigned flags;
.. | ..
4950 | 5090 | flags = req->sr_msg.msg_flags;
4951 | 5091 | if (force_nonblock)
4952 | 5092 | flags |= MSG_DONTWAIT;
4953 | | - if (flags & MSG_WAITALL)
| 5093 | + if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
4954 | 5094 | min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4955 | 5095 |
4956 | 5096 | ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4957 | 5097 | kmsg->uaddr, flags);
4958 | | - if (force_nonblock && ret == -EAGAIN)
4959 | | - return io_setup_async_msg(req, kmsg);
4960 | | - if (ret == -ERESTARTSYS)
4961 | | - ret = -EINTR;
| 5098 | + if (ret < min_ret) {
| 5099 | + if (ret == -EAGAIN && force_nonblock)
| 5100 | + return io_setup_async_msg(req, kmsg);
| 5101 | + if (ret == -ERESTARTSYS)
| 5102 | + ret = -EINTR;
| 5103 | + if (ret > 0 && io_net_retry(sock, flags)) {
| 5104 | + kmsg->msg.msg_controllen = 0;
| 5105 | + kmsg->msg.msg_control = NULL;
| 5106 | + sr->done_io += ret;
| 5107 | + req->flags |= REQ_F_PARTIAL_IO;
| 5108 | + return io_setup_async_msg(req, kmsg);
| 5109 | + }
| 5110 | + req_set_fail(req);
| 5111 | + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
| 5112 | + req_set_fail(req);
| 5113 | + }
4962 | 5114 |
4963 | 5115 | if (req->flags & REQ_F_BUFFER_SELECTED)
4964 | 5116 | cflags = io_put_recv_kbuf(req);
.. | ..
4966 | 5118 | if (kmsg->free_iov)
4967 | 5119 | kfree(kmsg->free_iov);
4968 | 5120 | req->flags &= ~REQ_F_NEED_CLEANUP;
4969 | | - if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
4970 | | - req_set_fail(req);
| 5121 | + if (ret >= 0)
| 5122 | + ret += sr->done_io;
| 5123 | + else if (sr->done_io)
| 5124 | + ret = sr->done_io;
4971 | 5125 | __io_req_complete(req, issue_flags, ret, cflags);
4972 | 5126 | return 0;
4973 | 5127 | }
.. | ..
5014 | 5168 | min_ret = iov_iter_count(&msg.msg_iter);
5015 | 5169 |
5016 | 5170 | ret = sock_recvmsg(sock, &msg, flags);
5017 | | - if (force_nonblock && ret == -EAGAIN)
5018 | | - return -EAGAIN;
5019 | | - if (ret == -ERESTARTSYS)
5020 | | - ret = -EINTR;
| 5171 | + if (ret < min_ret) {
| 5172 | + if (ret == -EAGAIN && force_nonblock)
| 5173 | + return -EAGAIN;
| 5174 | + if (ret == -ERESTARTSYS)
| 5175 | + ret = -EINTR;
| 5176 | + if (ret > 0 && io_net_retry(sock, flags)) {
| 5177 | + sr->len -= ret;
| 5178 | + sr->buf += ret;
| 5179 | + sr->done_io += ret;
| 5180 | + req->flags |= REQ_F_PARTIAL_IO;
| 5181 | + return -EAGAIN;
| 5182 | + }
| 5183 | + req_set_fail(req);
| 5184 | + } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
5021 | 5185 | out_free:
| 5186 | + req_set_fail(req);
| 5187 | + }
5022 | 5188 | if (req->flags & REQ_F_BUFFER_SELECTED)
5023 | 5189 | cflags = io_put_recv_kbuf(req);
5024 | | - if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
5025 | | - req_set_fail(req);
| 5190 | + if (ret >= 0)
| 5191 | + ret += sr->done_io;
| 5192 | + else if (sr->done_io)
| 5193 | + ret = sr->done_io;
5026 | 5194 | __io_req_complete(req, issue_flags, ret, cflags);
5027 | 5195 | return 0;
5028 | 5196 | }
.. | ..
5060 | 5228 | struct file *file;
5061 | 5229 | int ret, fd;
5062 | 5230 |
5063 | | - if (req->file->f_flags & O_NONBLOCK)
5064 | | - req->flags |= REQ_F_NOWAIT;
5065 | | -
5066 | 5231 | if (!fixed) {
5067 | 5232 | fd = __get_unused_fd_flags(accept->flags, accept->nofile);
5068 | 5233 | if (unlikely(fd < 0))
.. | ..
5075 | 5240 | if (!fixed)
5076 | 5241 | put_unused_fd(fd);
5077 | 5242 | ret = PTR_ERR(file);
| 5243 | + /* safe to retry */
| 5244 | + req->flags |= REQ_F_PARTIAL_IO;
5078 | 5245 | if (ret == -EAGAIN && force_nonblock)
5079 | 5246 | return -EAGAIN;
5080 | 5247 | if (ret == -ERESTARTSYS)
.. | ..
5419 | 5586 | if (ret > 0)
5420 | 5587 | return;
5421 | 5588 |
| 5589 | + io_tw_lock(req->ctx, locked);
5422 | 5590 | io_poll_remove_entries(req);
5423 | 5591 | spin_lock(&ctx->completion_lock);
5424 | 5592 | hash_del(&req->hash_node);
.. | ..
5631 | 5799 | IO_APOLL_READY
5632 | 5800 | };
5633 | 5801 |
| 5802 | +/*
| 5803 | + * We can't reliably detect loops in repeated poll triggers and issue
| 5804 | + * subsequently failing. But rather than fail these immediately, allow a
| 5805 | + * certain amount of retries before we give up. Given that this condition
| 5806 | + * should _rarely_ trigger even once, we should be fine with a larger value.
| 5807 | + */
| 5808 | +#define APOLL_MAX_RETRY 128
| 5809 | +
5634 | 5810 | static int io_arm_poll_handler(struct io_kiocb *req)
5635 | 5811 | {
5636 | 5812 | const struct io_op_def *def = &io_op_defs[req->opcode];
.. | ..
5641 | 5817 | int ret;
5642 | 5818 |
5643 | 5819 | if (!req->file || !file_can_poll(req->file))
5644 | | - return IO_APOLL_ABORTED;
5645 | | - if (req->flags & REQ_F_POLLED)
5646 | 5820 | return IO_APOLL_ABORTED;
5647 | 5821 | if (!def->pollin && !def->pollout)
5648 | 5822 | return IO_APOLL_ABORTED;
.. | ..
5658 | 5832 | mask |= POLLOUT | POLLWRNORM;
5659 | 5833 | }
5660 | 5834 |
5661 | | - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5662 | | - if (unlikely(!apoll))
5663 | | - return IO_APOLL_ABORTED;
| 5835 | + if (req->flags & REQ_F_POLLED) {
| 5836 | + apoll = req->apoll;
| 5837 | + kfree(apoll->double_poll);
| 5838 | + if (unlikely(!--apoll->poll.retries)) {
| 5839 | + apoll->double_poll = NULL;
| 5840 | + return IO_APOLL_ABORTED;
| 5841 | + }
| 5842 | + } else {
| 5843 | + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
| 5844 | + if (unlikely(!apoll))
| 5845 | + return IO_APOLL_ABORTED;
| 5846 | + apoll->poll.retries = APOLL_MAX_RETRY;
| 5847 | + }
5664 | 5848 | apoll->double_poll = NULL;
5665 | 5849 | req->apoll = apoll;
5666 | 5850 | req->flags |= REQ_F_POLLED;
.. | ..
5831 | 6015 | struct io_kiocb *preq;
5832 | 6016 | int ret2, ret = 0;
5833 | 6017 |
| 6018 | + io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
| 6019 | +
5834 | 6020 | spin_lock(&ctx->completion_lock);
5835 | 6021 | preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
5836 | 6022 | if (!preq || !io_poll_disarm(preq)) {
.. | ..
5862 | 6048 | req_set_fail(req);
5863 | 6049 | /* complete update request, we're done with it */
5864 | 6050 | io_req_complete(req, ret);
| 6051 | + io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
5865 | 6052 | return 0;
5866 | 6053 | }
5867 | 6054 |
.. | ..
6726 | 6913 | */
6727 | 6914 | if (ret != -EAGAIN || !(req->ctx->flags & IORING_SETUP_IOPOLL))
6728 | 6915 | break;
| 6916 | + if (io_wq_worker_stopped())
| 6917 | + break;
| 6918 | + /*
| 6919 | + * If REQ_F_NOWAIT is set, then don't wait or retry with
| 6920 | + * poll. -EAGAIN is final for that case.
| 6921 | + */
| 6922 | + if (req->flags & REQ_F_NOWAIT)
| 6923 | + break;
| 6924 | +
6729 | 6925 | cond_resched();
6730 | 6926 | } while (1);
6731 | 6927 | }
.. | ..
6763 | 6959 | }
6764 | 6960 |
6765 | 6961 | static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
6766 | | - struct io_kiocb *req, int fd)
| 6962 | + struct io_kiocb *req, int fd,
| 6963 | + unsigned int issue_flags)
6767 | 6964 | {
6768 | | - struct file *file;
| 6965 | + struct file *file = NULL;
6769 | 6966 | unsigned long file_ptr;
6770 | 6967 |
| 6968 | + io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
| 6969 | +
6771 | 6970 | if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6772 | | - return NULL;
| 6971 | + goto out;
6773 | 6972 | fd = array_index_nospec(fd, ctx->nr_user_files);
6774 | 6973 | file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
6775 | 6974 | file = (struct file *) (file_ptr & FFS_MASK);
.. | ..
6777 | 6976 | /* mask in overlapping REQ_F and FFS bits */
6778 | 6977 | req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
6779 | 6978 | io_req_set_rsrc_node(req);
| 6979 | +out:
| 6980 | + io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6780 | 6981 | return file;
6781 | 6982 | }
6782 | 6983 |
.. | ..
6794 | 6995 | }
6795 | 6996 |
6796 | 6997 | static inline struct file *io_file_get(struct io_ring_ctx *ctx,
6797 | | - struct io_kiocb *req, int fd, bool fixed)
| 6998 | + struct io_kiocb *req, int fd, bool fixed,
| 6999 | + unsigned int issue_flags)
6798 | 7000 | {
6799 | 7001 | if (fixed)
6800 | | - return io_file_get_fixed(ctx, req, fd);
| 7002 | + return io_file_get_fixed(ctx, req, fd, issue_flags);
6801 | 7003 | else
6802 | 7004 | return io_file_get_normal(ctx, req, fd);
6803 | 7005 | }
.. | ..
7019 | 7221 |
7020 | 7222 | if (io_op_defs[req->opcode].needs_file) {
7021 | 7223 | req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
7022 | | - (sqe_flags & IOSQE_FIXED_FILE));
| 7224 | + (sqe_flags & IOSQE_FIXED_FILE),
| 7225 | + IO_URING_F_NONBLOCK);
7023 | 7226 | if (unlikely(!req->file))
7024 | 7227 | ret = -EBADF;
7025 | 7228 | }
.. | ..
7447 | 7650 | return -EINTR;
7448 | 7651 | }
7449 | 7652 |
| 7653 | +static bool current_pending_io(void)
| 7654 | +{
| 7655 | + struct io_uring_task *tctx = current->io_uring;
| 7656 | +
| 7657 | + if (!tctx)
| 7658 | + return false;
| 7659 | + return percpu_counter_read_positive(&tctx->inflight);
| 7660 | +}
| 7661 | +
7450 | 7662 | /* when returns >0, the caller should retry */
7451 | 7663 | static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
7452 | 7664 | struct io_wait_queue *iowq,
7453 | | - ktime_t timeout)
| 7665 | + ktime_t *timeout)
7454 | 7666 | {
7455 | | - int ret;
| 7667 | + int io_wait, ret;
7456 | 7668 |
7457 | 7669 | /* make sure we run task_work before checking for signals */
7458 | 7670 | ret = io_run_task_work_sig();
.. | ..
7462 | 7674 | if (test_bit(0, &ctx->check_cq_overflow))
7463 | 7675 | return 1;
7464 | 7676 |
7465 | | - if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
7466 | | - return -ETIME;
7467 | | - return 1;
| 7677 | + /*
| 7678 | + * Mark us as being in io_wait if we have pending requests, so cpufreq
| 7679 | + * can take into account that the task is waiting for IO - turns out
| 7680 | + * to be important for low QD IO.
| 7681 | + */
| 7682 | + io_wait = current->in_iowait;
| 7683 | + if (current_pending_io())
| 7684 | + current->in_iowait = 1;
| 7685 | + ret = 1;
| 7686 | + if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
| 7687 | + ret = -ETIME;
| 7688 | + current->in_iowait = io_wait;
| 7689 | + return ret;
7468 | 7690 | }
7469 | 7691 |
7470 | 7692 | /*
.. | ..
7525 | 7747 | }
7526 | 7748 | prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
7527 | 7749 | TASK_INTERRUPTIBLE);
7528 | | - ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
| 7750 | + ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
7529 | 7751 | finish_wait(&ctx->cq_wait, &iowq.wq);
7530 | 7752 | cond_resched();
7531 | 7753 | } while (ret > 0);
.. | ..
8927 | 9149 | pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8928 | 9150 | pages, vmas);
8929 | 9151 | if (pret == nr_pages) {
| 9152 | + struct file *file = vmas[0]->vm_file;
| 9153 | +
8930 | 9154 | /* don't support file backed memory */
8931 | 9155 | for (i = 0; i < nr_pages; i++) {
8932 | | - struct vm_area_struct *vma = vmas[i];
8933 | | -
8934 | | - if (vma_is_shmem(vma))
| 9156 | + if (vmas[i]->vm_file != file) {
| 9157 | + ret = -EINVAL;
| 9158 | + break;
| 9159 | + }
| 9160 | + if (!file)
8935 | 9161 | continue;
8936 | | - if (vma->vm_file &&
8937 | | - !is_file_hugepages(vma->vm_file)) {
| 9162 | + if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
8938 | 9163 | ret = -EOPNOTSUPP;
8939 | 9164 | break;
8940 | 9165 | }
.. | ..
9367 | 9592 | /* there is little hope left, don't run it too often */
9368 | 9593 | interval = HZ * 60;
9369 | 9594 | }
9370 | | - } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
| 9595 | + /*
| 9596 | + * This is really an uninterruptible wait, as it has to be
| 9597 | + * complete. But it's also run from a kworker, which doesn't
| 9598 | + * take signals, so it's fine to make it interruptible. This
| 9599 | + * avoids scenarios where we knowingly can wait much longer
| 9600 | + * on completions, for example if someone does a SIGSTOP on
| 9601 | + * a task that needs to finish task_work to make this loop
| 9602 | + * complete. That's a synthetic situation that should not
| 9603 | + * cause a stuck task backtrace, and hence a potential panic
| 9604 | + * on stuck tasks if that is enabled.
| 9605 | + */
| 9606 | + } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
9371 | 9607 |
9372 | 9608 | init_completion(&exit.completion);
9373 | 9609 | init_task_work(&exit.task_work, io_tctx_exit_cb);
.. | ..
9392 | 9628 | wake_up_process(node->task);
9393 | 9629 |
9394 | 9630 | mutex_unlock(&ctx->uring_lock);
9395 | | - wait_for_completion(&exit.completion);
| 9631 | + /*
| 9632 | + * See comment above for
| 9633 | + * wait_for_completion_interruptible_timeout() on why this
| 9634 | + * wait is marked as interruptible.
| 9635 | + */
| 9636 | + wait_for_completion_interruptible(&exit.completion);
9396 | 9637 | mutex_lock(&ctx->uring_lock);
9397 | 9638 | }
9398 | 9639 | mutex_unlock(&ctx->uring_lock);
.. | ..
9444 | 9685 |
9445 | 9686 | /* if we failed setting up the ctx, we might not have any rings */
9446 | 9687 | io_iopoll_try_reap_events(ctx);
| 9688 | +
| 9689 | + /* drop cached put refs after potentially doing completions */
| 9690 | + if (current->io_uring)
| 9691 | + io_uring_drop_tctx_refs(current);
9447 | 9692 |
9448 | 9693 | INIT_WORK(&ctx->exit_work, io_ring_exit_work);
9449 | 9694 | /*
.. | ..
9556 | 9801 | while (!list_empty_careful(&ctx->iopoll_list)) {
9557 | 9802 | io_iopoll_try_reap_events(ctx);
9558 | 9803 | ret = true;
| 9804 | + cond_resched();
9559 | 9805 | }
9560 | 9806 | }
9561 | 9807 |
.. | ..
10223 | 10469 | if (!ctx)
10224 | 10470 | return -ENOMEM;
10225 | 10471 | ctx->compat = in_compat_syscall();
10226 | | - if (!capable(CAP_IPC_LOCK))
| 10472 | + if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
10227 | 10473 | ctx->user = get_uid(current_user());
10228 | 10474 |
10229 | 10475 | /*
.. | ..
10751 | 10997 | return -ENXIO;
10752 | 10998 |
10753 | 10999 | if (ctx->restricted) {
10754 | | - if (opcode >= IORING_REGISTER_LAST)
10755 | | - return -EINVAL;
10756 | 11000 | opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
10757 | 11001 | if (!test_bit(opcode, ctx->restrictions.register_op))
10758 | 11002 | return -EACCES;
.. | ..
10884 | 11128 | long ret = -EBADF;
10885 | 11129 | struct fd f;
10886 | 11130 |
| 11131 | + if (opcode >= IORING_REGISTER_LAST)
| 11132 | + return -EINVAL;
| 11133 | +
10887 | 11134 | f = fdget(fd);
10888 | 11135 | if (!f.file)
10889 | 11136 | return -EBADF;
---|