2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/io_uring/io_uring.c
....@@ -496,6 +496,7 @@
496496 struct file *file;
497497 struct wait_queue_head *head;
498498 __poll_t events;
499 + int retries;
499500 struct wait_queue_entry wait;
500501 };
501502
....@@ -588,7 +589,9 @@
588589 int msg_flags;
589590 int bgid;
590591 size_t len;
592 + size_t done_io;
591593 struct io_buffer *kbuf;
594 + void __user *msg_control;
592595 };
593596
594597 struct io_open {
....@@ -749,6 +752,7 @@
749752 REQ_F_CREDS_BIT,
750753 REQ_F_REFCOUNT_BIT,
751754 REQ_F_ARM_LTIMEOUT_BIT,
755 + REQ_F_PARTIAL_IO_BIT,
752756 /* keep async read/write and isreg together and in order */
753757 REQ_F_NOWAIT_READ_BIT,
754758 REQ_F_NOWAIT_WRITE_BIT,
....@@ -804,6 +808,8 @@
804808 REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
805809 /* there is a linked timeout that has to be armed */
806810 REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
811 + /* request has already done partial IO */
812 + REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
807813 };
808814
809815 struct async_poll {
....@@ -1098,7 +1104,8 @@
10981104 unsigned nr_args);
10991105 static void io_clean_op(struct io_kiocb *req);
11001106 static struct file *io_file_get(struct io_ring_ctx *ctx,
1101 - struct io_kiocb *req, int fd, bool fixed);
1107 + struct io_kiocb *req, int fd, bool fixed,
1108 + unsigned int issue_flags);
11021109 static void __io_queue_sqe(struct io_kiocb *req);
11031110 static void io_rsrc_put_work(struct work_struct *work);
11041111
....@@ -1524,6 +1531,8 @@
15241531
15251532 static void io_queue_deferred(struct io_ring_ctx *ctx)
15261533 {
1534 + lockdep_assert_held(&ctx->completion_lock);
1535 +
15271536 while (!list_empty(&ctx->defer_list)) {
15281537 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
15291538 struct io_defer_entry, list);
....@@ -1575,12 +1584,22 @@
15751584 io_queue_deferred(ctx);
15761585 }
15771586
1578 -static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1587 +static inline bool io_commit_needs_flush(struct io_ring_ctx *ctx)
15791588 {
1580 - if (unlikely(ctx->off_timeout_used || ctx->drain_active))
1581 - __io_commit_cqring_flush(ctx);
1589 + return ctx->off_timeout_used || ctx->drain_active;
1590 +}
1591 +
1592 +static inline void __io_commit_cqring(struct io_ring_ctx *ctx)
1593 +{
15821594 /* order cqe stores with ring update */
15831595 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1596 +}
1597 +
1598 +static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1599 +{
1600 + if (unlikely(io_commit_needs_flush(ctx)))
1601 + __io_commit_cqring_flush(ctx);
1602 + __io_commit_cqring(ctx);
15841603 }
15851604
15861605 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
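Aside: the refactor above splits the "does the CQ need a flush" check from the release-store that publishes the new cq.tail. A minimal userspace sketch of the same publish pattern, using C11 atomics in place of smp_store_release(); the demo_ names are made up and not kernel APIs:

#include <stdatomic.h>
#include <stdint.h>

struct demo_ring {
	uint32_t entries[256];
	_Atomic uint32_t tail;		/* consumers pair this with an acquire load */
};

static void demo_publish(struct demo_ring *r, uint32_t value, uint32_t cached_tail)
{
	r->entries[cached_tail & 255] = value;
	/* order the entry store before the tail update, like smp_store_release() */
	atomic_store_explicit(&r->tail, cached_tail + 1, memory_order_release);
}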
....@@ -2205,9 +2224,12 @@
22052224 }
22062225 req->io_task_work.func(req, &locked);
22072226 node = next;
2227 + if (unlikely(need_resched())) {
2228 + ctx_flush_and_put(ctx, &locked);
2229 + ctx = NULL;
2230 + cond_resched();
2231 + }
22082232 } while (node);
2209 -
2210 - cond_resched();
22112233 }
22122234
22132235 ctx_flush_and_put(ctx, &locked);
....@@ -2465,6 +2487,15 @@
24652487
24662488 static inline bool io_run_task_work(void)
24672489 {
2490 + /*
2491 + * PF_IO_WORKER never returns to userspace, so check here if we have
2492 + * notify work that needs processing.
2493 + */
2494 + if (current->flags & PF_IO_WORKER &&
2495 + test_thread_flag(TIF_NOTIFY_RESUME)) {
2496 + __set_current_state(TASK_RUNNING);
2497 + tracehook_notify_resume(NULL);
2498 + }
24682499 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
24692500 __set_current_state(TASK_RUNNING);
24702501 tracehook_notify_signal();
....@@ -2488,17 +2519,36 @@
24882519
24892520 io_init_req_batch(&rb);
24902521 while (!list_empty(done)) {
2522 + struct io_uring_cqe *cqe;
2523 + unsigned cflags;
2524 +
24912525 req = list_first_entry(done, struct io_kiocb, inflight_entry);
24922526 list_del(&req->inflight_entry);
2493 -
2494 - io_fill_cqe_req(req, req->result, io_put_rw_kbuf(req));
2527 + cflags = io_put_rw_kbuf(req);
24952528 (*nr_events)++;
2529 +
2530 + cqe = io_get_cqe(ctx);
2531 + if (cqe) {
2532 + WRITE_ONCE(cqe->user_data, req->user_data);
2533 + WRITE_ONCE(cqe->res, req->result);
2534 + WRITE_ONCE(cqe->flags, cflags);
2535 + } else {
2536 + spin_lock(&ctx->completion_lock);
2537 + io_cqring_event_overflow(ctx, req->user_data,
2538 + req->result, cflags);
2539 + spin_unlock(&ctx->completion_lock);
2540 + }
24962541
24972542 if (req_ref_put_and_test(req))
24982543 io_req_free_batch(&rb, req, &ctx->submit_state);
24992544 }
25002545
2501 - io_commit_cqring(ctx);
2546 + if (io_commit_needs_flush(ctx)) {
2547 + spin_lock(&ctx->completion_lock);
2548 + __io_commit_cqring_flush(ctx);
2549 + spin_unlock(&ctx->completion_lock);
2550 + }
2551 + __io_commit_cqring(ctx);
25022552 io_cqring_ev_posted_iopoll(ctx);
25032553 io_req_free_batch_finish(ctx, &rb);
25042554 }
....@@ -2625,6 +2675,11 @@
26252675 break;
26262676 }
26272677 ret = io_do_iopoll(ctx, &nr_events, min);
2678 +
2679 + if (task_sigpending(current)) {
2680 + ret = -EINTR;
2681 + goto out;
2682 + }
26282683 } while (!ret && nr_events < min && !need_resched());
26292684 out:
26302685 mutex_unlock(&ctx->uring_lock);
....@@ -2692,17 +2747,32 @@
26922747 }
26932748 #endif
26942749
2695 -static bool __io_complete_rw_common(struct io_kiocb *req, long res)
2750 +/*
2751 + * Trigger the notifications after having done some IO, and finish the write
2752 + * accounting, if any.
2753 + */
2754 +static void io_req_io_end(struct io_kiocb *req)
26962755 {
2697 - if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
2756 + struct io_rw *rw = &req->rw;
2757 +
2758 + if (rw->kiocb.ki_flags & IOCB_WRITE) {
26982759 kiocb_end_write(req);
26992760 fsnotify_modify(req->file);
27002761 } else {
27012762 fsnotify_access(req->file);
27022763 }
2764 +}
2765 +
2766 +static bool __io_complete_rw_common(struct io_kiocb *req, long res)
2767 +{
27032768 if (res != req->result) {
27042769 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
27052770 io_rw_should_reissue(req)) {
2771 + /*
2772 + * Reissue will start accounting again, finish the
2773 + * current cycle.
2774 + */
2775 + io_req_io_end(req);
27062776 req->flags |= REQ_F_REISSUE;
27072777 return true;
27082778 }
....@@ -2712,7 +2782,7 @@
27122782 return false;
27132783 }
27142784
2715 -static inline int io_fixup_rw_res(struct io_kiocb *req, unsigned res)
2785 +static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
27162786 {
27172787 struct io_async_rw *io = req->async_data;
27182788
....@@ -2744,12 +2814,10 @@
27442814 }
27452815 }
27462816
2747 -static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2748 - unsigned int issue_flags)
2817 +static void io_req_rw_complete(struct io_kiocb *req, bool *locked)
27492818 {
2750 - if (__io_complete_rw_common(req, res))
2751 - return;
2752 - __io_req_complete(req, issue_flags, io_fixup_rw_res(req, res), io_put_rw_kbuf(req));
2819 + io_req_io_end(req);
2820 + io_req_task_complete(req, locked);
27532821 }
27542822
27552823 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
....@@ -2759,7 +2827,7 @@
27592827 if (__io_complete_rw_common(req, res))
27602828 return;
27612829 req->result = io_fixup_rw_res(req, res);
2762 - req->io_task_work.func = io_req_task_complete;
2830 + req->io_task_work.func = io_req_rw_complete;
27632831 io_req_task_work_add(req);
27642832 }
27652833
....@@ -2911,14 +2979,6 @@
29112979 req->flags |= REQ_F_ISREG;
29122980
29132981 kiocb->ki_pos = READ_ONCE(sqe->off);
2914 - if (kiocb->ki_pos == -1) {
2915 - if (!(file->f_mode & FMODE_STREAM)) {
2916 - req->flags |= REQ_F_CUR_POS;
2917 - kiocb->ki_pos = file->f_pos;
2918 - } else {
2919 - kiocb->ki_pos = 0;
2920 - }
2921 - }
29222982 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
29232983 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
29242984 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
....@@ -3000,6 +3060,23 @@
30003060 }
30013061 }
30023062
3063 +static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
3064 +{
3065 + struct kiocb *kiocb = &req->rw.kiocb;
3066 +
3067 + if (kiocb->ki_pos != -1)
3068 + return &kiocb->ki_pos;
3069 +
3070 + if (!(req->file->f_mode & FMODE_STREAM)) {
3071 + req->flags |= REQ_F_CUR_POS;
3072 + kiocb->ki_pos = req->file->f_pos;
3073 + return &kiocb->ki_pos;
3074 + }
3075 +
3076 + kiocb->ki_pos = 0;
3077 + return NULL;
3078 +}
3079 +
30033080 static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
30043081 unsigned int issue_flags)
30053082 {
....@@ -3007,10 +3084,20 @@
30073084
30083085 if (req->flags & REQ_F_CUR_POS)
30093086 req->file->f_pos = kiocb->ki_pos;
3010 - if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
3011 - __io_complete_rw(req, ret, 0, issue_flags);
3012 - else
3087 + if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) {
3088 + if (!__io_complete_rw_common(req, ret)) {
3089 + /*
3090 + * Safe to call io_end from here as we're inline
3091 + * from the submission path.
3092 + */
3093 + io_req_io_end(req);
3094 + __io_req_complete(req, issue_flags,
3095 + io_fixup_rw_res(req, ret),
3096 + io_put_rw_kbuf(req));
3097 + }
3098 + } else {
30133099 io_rw_done(kiocb, ret);
3100 + }
30143101
30153102 if (req->flags & REQ_F_REISSUE) {
30163103 req->flags &= ~REQ_F_REISSUE;
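Aside: with the hunks above, the "offset -1 means use the file position" resolution moves into io_kiocb_update_pos(). A hedged liburing usage example of the two offset modes (error handling trimmed; assumes liburing is installed):

#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char buf[64];
	int fd = open("/etc/hostname", O_RDONLY);

	io_uring_queue_init(4, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	/* offset -1: read at the file's current position (the REQ_F_CUR_POS path);
	 * pass a real offset instead to leave f_pos untouched. */
	io_uring_prep_read(sqe, fd, buf, sizeof(buf), -1);
	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	printf("read %d bytes\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}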
....@@ -3292,6 +3379,7 @@
32923379 struct kiocb *kiocb = &req->rw.kiocb;
32933380 struct file *file = req->file;
32943381 ssize_t ret = 0;
3382 + loff_t *ppos;
32953383
32963384 /*
32973385 * Don't support polled IO through this interface, and we can't
....@@ -3302,6 +3390,8 @@
33023390 return -EOPNOTSUPP;
33033391 if (kiocb->ki_flags & IOCB_NOWAIT)
33043392 return -EAGAIN;
3393 +
3394 + ppos = io_kiocb_ppos(kiocb);
33053395
33063396 while (iov_iter_count(iter)) {
33073397 struct iovec iovec;
....@@ -3316,10 +3406,10 @@
33163406
33173407 if (rw == READ) {
33183408 nr = file->f_op->read(file, iovec.iov_base,
3319 - iovec.iov_len, io_kiocb_ppos(kiocb));
3409 + iovec.iov_len, ppos);
33203410 } else {
33213411 nr = file->f_op->write(file, iovec.iov_base,
3322 - iovec.iov_len, io_kiocb_ppos(kiocb));
3412 + iovec.iov_len, ppos);
33233413 }
33243414
33253415 if (nr < 0) {
....@@ -3520,6 +3610,7 @@
35203610 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
35213611 struct iov_iter_state __state, *state;
35223612 ssize_t ret, ret2;
3613 + loff_t *ppos;
35233614
35243615 if (rw) {
35253616 iter = &rw->iter;
....@@ -3552,7 +3643,9 @@
35523643 return ret ?: -EAGAIN;
35533644 }
35543645
3555 - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
3646 + ppos = io_kiocb_update_pos(req);
3647 +
3648 + ret = rw_verify_area(READ, req->file, ppos, req->result);
35563649 if (unlikely(ret)) {
35573650 kfree(iovec);
35583651 return ret;
....@@ -3656,6 +3749,7 @@
36563749 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
36573750 struct iov_iter_state __state, *state;
36583751 ssize_t ret, ret2;
3752 + loff_t *ppos;
36593753
36603754 if (rw) {
36613755 iter = &rw->iter;
....@@ -3686,7 +3780,9 @@
36863780 (req->flags & REQ_F_ISREG))
36873781 goto copy_iov;
36883782
3689 - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
3783 + ppos = io_kiocb_update_pos(req);
3784 +
3785 + ret = rw_verify_area(WRITE, req->file, ppos, req->result);
36903786 if (unlikely(ret))
36913787 goto out_free;
36923788
....@@ -3926,7 +4022,7 @@
39264022 return -EAGAIN;
39274023
39284024 in = io_file_get(req->ctx, req, sp->splice_fd_in,
3929 - (sp->flags & SPLICE_F_FD_IN_FIXED));
4025 + (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
39304026 if (!in) {
39314027 ret = -EBADF;
39324028 goto done;
....@@ -3966,7 +4062,7 @@
39664062 return -EAGAIN;
39674063
39684064 in = io_file_get(req->ctx, req, sp->splice_fd_in,
3969 - (sp->flags & SPLICE_F_FD_IN_FIXED));
4065 + (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
39704066 if (!in) {
39714067 ret = -EBADF;
39724068 goto done;
....@@ -4148,9 +4244,11 @@
41484244 if (issue_flags & IO_URING_F_NONBLOCK) {
41494245 /*
41504246 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4151 - * it'll always -EAGAIN
4247 + * it'll always -EAGAIN. Note that we test for __O_TMPFILE
4248 + * because O_TMPFILE includes O_DIRECTORY, which isn't a flag
4249 + * we need to force async for.
41524250 */
4153 - if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
4251 + if (req->open.how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
41544252 return -EAGAIN;
41554253 op.lookup_flags |= LOOKUP_CACHED;
41564254 op.open_flag |= O_NONBLOCK;
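Aside: the __O_TMPFILE test works because, in the asm-generic UAPI headers, O_TMPFILE is defined as __O_TMPFILE | O_DIRECTORY, so masking with O_TMPFILE would also match plain O_DIRECTORY opens. A small sketch of that relationship (the DEMO_ values mirror the asm-generic/x86 definitions and are illustrative only):

/* illustrative copies of the asm-generic flag values */
#define DEMO___O_TMPFILE	020000000
#define DEMO_O_DIRECTORY	00200000
#define DEMO_O_TMPFILE		(DEMO___O_TMPFILE | DEMO_O_DIRECTORY)

static int demo_needs_force_async(int flags)
{
	/* matches only real O_TMPFILE opens, not plain O_DIRECTORY */
	return (flags & DEMO___O_TMPFILE) != 0;
}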
....@@ -4623,6 +4721,13 @@
46234721 }
46244722
46254723 #if defined(CONFIG_NET)
4724 +static bool io_net_retry(struct socket *sock, int flags)
4725 +{
4726 + if (!(flags & MSG_WAITALL))
4727 + return false;
4728 + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
4729 +}
4730 +
46264731 static int io_setup_async_msg(struct io_kiocb *req,
46274732 struct io_async_msghdr *kmsg)
46284733 {
....@@ -4640,8 +4745,10 @@
46404745 if (async_msg->msg.msg_name)
46414746 async_msg->msg.msg_name = &async_msg->addr;
46424747 /* if were using fast_iov, set it to the new one */
4643 - if (!async_msg->free_iov)
4644 - async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4748 + if (!kmsg->free_iov) {
4749 + size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
4750 + async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
4751 + }
46454752
46464753 return -EAGAIN;
46474754 }
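Aside: the fix above preserves the iterator's position within the on-stack fast_iov array instead of resetting it to element 0. A minimal sketch of the same pointer-offset carry-over (the demo_ names are made up):

#include <stddef.h>
#include <sys/uio.h>

struct demo_msg {
	struct iovec fast_iov[8];
	const struct iovec *cur;	/* may point into fast_iov */
};

static void demo_copy_state(struct demo_msg *dst, const struct demo_msg *src)
{
	size_t fast_idx = src->cur - src->fast_iov;	/* keep the same offset */

	*dst = *src;
	dst->cur = &dst->fast_iov[fast_idx];
}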
....@@ -4649,10 +4756,16 @@
46494756 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
46504757 struct io_async_msghdr *iomsg)
46514758 {
4759 + struct io_sr_msg *sr = &req->sr_msg;
4760 + int ret;
4761 +
46524762 iomsg->msg.msg_name = &iomsg->addr;
46534763 iomsg->free_iov = iomsg->fast_iov;
4654 - return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4764 + ret = sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
46554765 req->sr_msg.msg_flags, &iomsg->free_iov);
4766 + /* save msg_control as sys_sendmsg() overwrites it */
4767 + sr->msg_control = iomsg->msg.msg_control;
4768 + return ret;
46564769 }
46574770
46584771 static int io_sendmsg_prep_async(struct io_kiocb *req)
....@@ -4686,12 +4799,14 @@
46864799 if (req->ctx->compat)
46874800 sr->msg_flags |= MSG_CMSG_COMPAT;
46884801 #endif
4802 + sr->done_io = 0;
46894803 return 0;
46904804 }
46914805
46924806 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
46934807 {
46944808 struct io_async_msghdr iomsg, *kmsg;
4809 + struct io_sr_msg *sr = &req->sr_msg;
46954810 struct socket *sock;
46964811 unsigned flags;
46974812 int min_ret = 0;
....@@ -4707,6 +4822,8 @@
47074822 if (ret)
47084823 return ret;
47094824 kmsg = &iomsg;
4825 + } else {
4826 + kmsg->msg.msg_control = sr->msg_control;
47104827 }
47114828
47124829 flags = req->sr_msg.msg_flags;
....@@ -4716,17 +4833,27 @@
47164833 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
47174834
47184835 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4719 - if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4720 - return io_setup_async_msg(req, kmsg);
4721 - if (ret == -ERESTARTSYS)
4722 - ret = -EINTR;
47234836
4837 + if (ret < min_ret) {
4838 + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
4839 + return io_setup_async_msg(req, kmsg);
4840 + if (ret == -ERESTARTSYS)
4841 + ret = -EINTR;
4842 + if (ret > 0 && io_net_retry(sock, flags)) {
4843 + sr->done_io += ret;
4844 + req->flags |= REQ_F_PARTIAL_IO;
4845 + return io_setup_async_msg(req, kmsg);
4846 + }
4847 + req_set_fail(req);
4848 + }
47244849 /* fast path, check for non-NULL to avoid function call */
47254850 if (kmsg->free_iov)
47264851 kfree(kmsg->free_iov);
47274852 req->flags &= ~REQ_F_NEED_CLEANUP;
4728 - if (ret < min_ret)
4729 - req_set_fail(req);
4853 + if (ret >= 0)
4854 + ret += sr->done_io;
4855 + else if (sr->done_io)
4856 + ret = sr->done_io;
47304857 __io_req_complete(req, issue_flags, ret, 0);
47314858 return 0;
47324859 }
....@@ -4762,13 +4889,24 @@
47624889
47634890 msg.msg_flags = flags;
47644891 ret = sock_sendmsg(sock, &msg);
4765 - if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4766 - return -EAGAIN;
4767 - if (ret == -ERESTARTSYS)
4768 - ret = -EINTR;
4769 -
4770 - if (ret < min_ret)
4892 + if (ret < min_ret) {
4893 + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
4894 + return -EAGAIN;
4895 + if (ret == -ERESTARTSYS)
4896 + ret = -EINTR;
4897 + if (ret > 0 && io_net_retry(sock, flags)) {
4898 + sr->len -= ret;
4899 + sr->buf += ret;
4900 + sr->done_io += ret;
4901 + req->flags |= REQ_F_PARTIAL_IO;
4902 + return -EAGAIN;
4903 + }
47714904 req_set_fail(req);
4905 + }
4906 + if (ret >= 0)
4907 + ret += sr->done_io;
4908 + else if (sr->done_io)
4909 + ret = sr->done_io;
47724910 __io_req_complete(req, issue_flags, ret, 0);
47734911 return 0;
47744912 }
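Aside: the done_io/REQ_F_PARTIAL_IO handling above mirrors the classic userspace pattern of resuming a short send on a stream socket and reporting the accumulated total. A minimal POSIX sketch of that accounting (not kernel code):

#include <sys/socket.h>
#include <errno.h>
#include <stddef.h>

static ssize_t send_all(int sockfd, const char *buf, size_t len, int flags)
{
	size_t done_io = 0;	/* bytes already transferred, like sr->done_io */

	while (done_io < len) {
		ssize_t ret = send(sockfd, buf + done_io, len - done_io, flags);

		if (ret > 0) {
			done_io += ret;		/* partial progress, keep going */
			continue;
		}
		if (ret < 0 && errno == EINTR)
			continue;		/* retry without losing progress */
		return done_io ? (ssize_t)done_io : ret;
	}
	return done_io;
}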
....@@ -4904,7 +5042,7 @@
49045042 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
49055043 sr->len = READ_ONCE(sqe->len);
49065044 sr->bgid = READ_ONCE(sqe->buf_group);
4907 - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
5045 + sr->msg_flags = READ_ONCE(sqe->msg_flags);
49085046 if (sr->msg_flags & MSG_DONTWAIT)
49095047 req->flags |= REQ_F_NOWAIT;
49105048
....@@ -4912,12 +5050,14 @@
49125050 if (req->ctx->compat)
49135051 sr->msg_flags |= MSG_CMSG_COMPAT;
49145052 #endif
5053 + sr->done_io = 0;
49155054 return 0;
49165055 }
49175056
49185057 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
49195058 {
49205059 struct io_async_msghdr iomsg, *kmsg;
5060 + struct io_sr_msg *sr = &req->sr_msg;
49215061 struct socket *sock;
49225062 struct io_buffer *kbuf;
49235063 unsigned flags;
....@@ -4950,15 +5090,27 @@
49505090 flags = req->sr_msg.msg_flags;
49515091 if (force_nonblock)
49525092 flags |= MSG_DONTWAIT;
4953 - if (flags & MSG_WAITALL)
5093 + if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
49545094 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
49555095
49565096 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
49575097 kmsg->uaddr, flags);
4958 - if (force_nonblock && ret == -EAGAIN)
4959 - return io_setup_async_msg(req, kmsg);
4960 - if (ret == -ERESTARTSYS)
4961 - ret = -EINTR;
5098 + if (ret < min_ret) {
5099 + if (ret == -EAGAIN && force_nonblock)
5100 + return io_setup_async_msg(req, kmsg);
5101 + if (ret == -ERESTARTSYS)
5102 + ret = -EINTR;
5103 + if (ret > 0 && io_net_retry(sock, flags)) {
5104 + kmsg->msg.msg_controllen = 0;
5105 + kmsg->msg.msg_control = NULL;
5106 + sr->done_io += ret;
5107 + req->flags |= REQ_F_PARTIAL_IO;
5108 + return io_setup_async_msg(req, kmsg);
5109 + }
5110 + req_set_fail(req);
5111 + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
5112 + req_set_fail(req);
5113 + }
49625114
49635115 if (req->flags & REQ_F_BUFFER_SELECTED)
49645116 cflags = io_put_recv_kbuf(req);
....@@ -4966,8 +5118,10 @@
49665118 if (kmsg->free_iov)
49675119 kfree(kmsg->free_iov);
49685120 req->flags &= ~REQ_F_NEED_CLEANUP;
4969 - if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
4970 - req_set_fail(req);
5121 + if (ret >= 0)
5122 + ret += sr->done_io;
5123 + else if (sr->done_io)
5124 + ret = sr->done_io;
49715125 __io_req_complete(req, issue_flags, ret, cflags);
49725126 return 0;
49735127 }
....@@ -5014,15 +5168,29 @@
50145168 min_ret = iov_iter_count(&msg.msg_iter);
50155169
50165170 ret = sock_recvmsg(sock, &msg, flags);
5017 - if (force_nonblock && ret == -EAGAIN)
5018 - return -EAGAIN;
5019 - if (ret == -ERESTARTSYS)
5020 - ret = -EINTR;
5171 + if (ret < min_ret) {
5172 + if (ret == -EAGAIN && force_nonblock)
5173 + return -EAGAIN;
5174 + if (ret == -ERESTARTSYS)
5175 + ret = -EINTR;
5176 + if (ret > 0 && io_net_retry(sock, flags)) {
5177 + sr->len -= ret;
5178 + sr->buf += ret;
5179 + sr->done_io += ret;
5180 + req->flags |= REQ_F_PARTIAL_IO;
5181 + return -EAGAIN;
5182 + }
5183 + req_set_fail(req);
5184 + } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
50215185 out_free:
5186 + req_set_fail(req);
5187 + }
50225188 if (req->flags & REQ_F_BUFFER_SELECTED)
50235189 cflags = io_put_recv_kbuf(req);
5024 - if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
5025 - req_set_fail(req);
5190 + if (ret >= 0)
5191 + ret += sr->done_io;
5192 + else if (sr->done_io)
5193 + ret = sr->done_io;
50265194 __io_req_complete(req, issue_flags, ret, cflags);
50275195 return 0;
50285196 }
....@@ -5060,9 +5228,6 @@
50605228 struct file *file;
50615229 int ret, fd;
50625230
5063 - if (req->file->f_flags & O_NONBLOCK)
5064 - req->flags |= REQ_F_NOWAIT;
5065 -
50665231 if (!fixed) {
50675232 fd = __get_unused_fd_flags(accept->flags, accept->nofile);
50685233 if (unlikely(fd < 0))
....@@ -5075,6 +5240,8 @@
50755240 if (!fixed)
50765241 put_unused_fd(fd);
50775242 ret = PTR_ERR(file);
5243 + /* safe to retry */
5244 + req->flags |= REQ_F_PARTIAL_IO;
50785245 if (ret == -EAGAIN && force_nonblock)
50795246 return -EAGAIN;
50805247 if (ret == -ERESTARTSYS)
....@@ -5419,6 +5586,7 @@
54195586 if (ret > 0)
54205587 return;
54215588
5589 + io_tw_lock(req->ctx, locked);
54225590 io_poll_remove_entries(req);
54235591 spin_lock(&ctx->completion_lock);
54245592 hash_del(&req->hash_node);
....@@ -5631,6 +5799,14 @@
56315799 IO_APOLL_READY
56325800 };
56335801
5802 +/*
5803 + * We can't reliably detect loops in repeated poll triggers and issue
5804 + * subsequently failing. But rather than fail these immediately, allow a
5805 + * certain amount of retries before we give up. Given that this condition
5806 + * should _rarely_ trigger even once, we should be fine with a larger value.
5807 + */
5808 +#define APOLL_MAX_RETRY 128
5809 +
56345810 static int io_arm_poll_handler(struct io_kiocb *req)
56355811 {
56365812 const struct io_op_def *def = &io_op_defs[req->opcode];
....@@ -5641,8 +5817,6 @@
56415817 int ret;
56425818
56435819 if (!req->file || !file_can_poll(req->file))
5644 - return IO_APOLL_ABORTED;
5645 - if (req->flags & REQ_F_POLLED)
56465820 return IO_APOLL_ABORTED;
56475821 if (!def->pollin && !def->pollout)
56485822 return IO_APOLL_ABORTED;
....@@ -5658,9 +5832,19 @@
56585832 mask |= POLLOUT | POLLWRNORM;
56595833 }
56605834
5661 - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5662 - if (unlikely(!apoll))
5663 - return IO_APOLL_ABORTED;
5835 + if (req->flags & REQ_F_POLLED) {
5836 + apoll = req->apoll;
5837 + kfree(apoll->double_poll);
5838 + if (unlikely(!--apoll->poll.retries)) {
5839 + apoll->double_poll = NULL;
5840 + return IO_APOLL_ABORTED;
5841 + }
5842 + } else {
5843 + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5844 + if (unlikely(!apoll))
5845 + return IO_APOLL_ABORTED;
5846 + apoll->poll.retries = APOLL_MAX_RETRY;
5847 + }
56645848 apoll->double_poll = NULL;
56655849 req->apoll = apoll;
56665850 req->flags |= REQ_F_POLLED;
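Aside: APOLL_MAX_RETRY turns an unbounded re-arm loop into a bounded one: the poll state is reused on repeat triggers and the request is failed once the budget runs out. A minimal sketch of that retry-budget pattern (the demo_ names are made up):

enum demo_result { DEMO_OK, DEMO_ABORTED };

struct demo_poll {
	int retries;
};

#define DEMO_MAX_RETRY	128

static enum demo_result demo_arm(struct demo_poll *p, int already_polled)
{
	if (already_polled) {
		/* repeated trigger: spend one retry, give up when exhausted */
		if (--p->retries <= 0)
			return DEMO_ABORTED;
	} else {
		p->retries = DEMO_MAX_RETRY;
	}
	return DEMO_OK;
}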
....@@ -5831,6 +6015,8 @@
58316015 struct io_kiocb *preq;
58326016 int ret2, ret = 0;
58336017
6018 + io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6019 +
58346020 spin_lock(&ctx->completion_lock);
58356021 preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
58366022 if (!preq || !io_poll_disarm(preq)) {
....@@ -5862,6 +6048,7 @@
58626048 req_set_fail(req);
58636049 /* complete update request, we're done with it */
58646050 io_req_complete(req, ret);
6051 + io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
58656052 return 0;
58666053 }
58676054
....@@ -6726,6 +6913,15 @@
67266913 */
67276914 if (ret != -EAGAIN || !(req->ctx->flags & IORING_SETUP_IOPOLL))
67286915 break;
6916 + if (io_wq_worker_stopped())
6917 + break;
6918 + /*
6919 + * If REQ_F_NOWAIT is set, then don't wait or retry with
6920 + * poll. -EAGAIN is final for that case.
6921 + */
6922 + if (req->flags & REQ_F_NOWAIT)
6923 + break;
6924 +
67296925 cond_resched();
67306926 } while (1);
67316927 }
....@@ -6763,13 +6959,16 @@
67636959 }
67646960
67656961 static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
6766 - struct io_kiocb *req, int fd)
6962 + struct io_kiocb *req, int fd,
6963 + unsigned int issue_flags)
67676964 {
6768 - struct file *file;
6965 + struct file *file = NULL;
67696966 unsigned long file_ptr;
67706967
6968 + io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6969 +
67716970 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6772 - return NULL;
6971 + goto out;
67736972 fd = array_index_nospec(fd, ctx->nr_user_files);
67746973 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
67756974 file = (struct file *) (file_ptr & FFS_MASK);
....@@ -6777,6 +6976,8 @@
67776976 /* mask in overlapping REQ_F and FFS bits */
67786977 req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
67796978 io_req_set_rsrc_node(req);
6979 +out:
6980 + io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
67806981 return file;
67816982 }
67826983
....@@ -6794,10 +6995,11 @@
67946995 }
67956996
67966997 static inline struct file *io_file_get(struct io_ring_ctx *ctx,
6797 - struct io_kiocb *req, int fd, bool fixed)
6998 + struct io_kiocb *req, int fd, bool fixed,
6999 + unsigned int issue_flags)
67987000 {
67997001 if (fixed)
6800 - return io_file_get_fixed(ctx, req, fd);
7002 + return io_file_get_fixed(ctx, req, fd, issue_flags);
68017003 else
68027004 return io_file_get_normal(ctx, req, fd);
68037005 }
....@@ -7019,7 +7221,8 @@
70197221
70207222 if (io_op_defs[req->opcode].needs_file) {
70217223 req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
7022 - (sqe_flags & IOSQE_FIXED_FILE));
7224 + (sqe_flags & IOSQE_FIXED_FILE),
7225 + IO_URING_F_NONBLOCK);
70237226 if (unlikely(!req->file))
70247227 ret = -EBADF;
70257228 }
....@@ -7447,12 +7650,21 @@
74477650 return -EINTR;
74487651 }
74497652
7653 +static bool current_pending_io(void)
7654 +{
7655 + struct io_uring_task *tctx = current->io_uring;
7656 +
7657 + if (!tctx)
7658 + return false;
7659 + return percpu_counter_read_positive(&tctx->inflight);
7660 +}
7661 +
74507662 /* when returns >0, the caller should retry */
74517663 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
74527664 struct io_wait_queue *iowq,
7453 - ktime_t timeout)
7665 + ktime_t *timeout)
74547666 {
7455 - int ret;
7667 + int io_wait, ret;
74567668
74577669 /* make sure we run task_work before checking for signals */
74587670 ret = io_run_task_work_sig();
....@@ -7462,9 +7674,19 @@
74627674 if (test_bit(0, &ctx->check_cq_overflow))
74637675 return 1;
74647676
7465 - if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
7466 - return -ETIME;
7467 - return 1;
7677 + /*
7678 + * Mark us as being in io_wait if we have pending requests, so cpufreq
7679 + * can take into account that the task is waiting for IO - turns out
7680 + * to be important for low QD IO.
7681 + */
7682 + io_wait = current->in_iowait;
7683 + if (current_pending_io())
7684 + current->in_iowait = 1;
7685 + ret = 1;
7686 + if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
7687 + ret = -ETIME;
7688 + current->in_iowait = io_wait;
7689 + return ret;
74687690 }
74697691
74707692 /*
....@@ -7525,7 +7747,7 @@
75257747 }
75267748 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
75277749 TASK_INTERRUPTIBLE);
7528 - ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
7750 + ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
75297751 finish_wait(&ctx->cq_wait, &iowq.wq);
75307752 cond_resched();
75317753 } while (ret > 0);
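Aside: this wait path is what a liburing caller reaches through the cqe-wait helpers; the change only alters how the sleeping task is accounted (in_iowait), not the user-visible API. A hedged usage example with a timeout (assumes liburing):

#include <liburing.h>

int wait_one_cqe(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	int ret = io_uring_wait_cqe_timeout(ring, &cqe, &ts);

	if (ret < 0)
		return ret;		/* e.g. -ETIME when the timeout fires */
	io_uring_cqe_seen(ring, cqe);
	return 0;
}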
....@@ -8927,14 +9149,17 @@
89279149 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
89289150 pages, vmas);
89299151 if (pret == nr_pages) {
9152 + struct file *file = vmas[0]->vm_file;
9153 +
89309154 /* don't support file backed memory */
89319155 for (i = 0; i < nr_pages; i++) {
8932 - struct vm_area_struct *vma = vmas[i];
8933 -
8934 - if (vma_is_shmem(vma))
9156 + if (vmas[i]->vm_file != file) {
9157 + ret = -EINVAL;
9158 + break;
9159 + }
9160 + if (!file)
89359161 continue;
8936 - if (vma->vm_file &&
8937 - !is_file_hugepages(vma->vm_file)) {
9162 + if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
89389163 ret = -EOPNOTSUPP;
89399164 break;
89409165 }
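Aside: the tightened validation above applies to memory registered as fixed buffers; a plain anonymous allocation (no vm_file) still registers fine. A hedged liburing sketch of registering one such buffer:

#include <liburing.h>
#include <stdlib.h>
#include <sys/uio.h>

int register_one_buffer(struct io_uring *ring, size_t len)
{
	struct iovec iov = {
		.iov_base = malloc(len),	/* anonymous memory, no backing file */
		.iov_len = len,
	};

	if (!iov.iov_base)
		return -1;
	return io_uring_register_buffers(ring, &iov, 1);
}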
....@@ -9367,7 +9592,18 @@
93679592 /* there is little hope left, don't run it too often */
93689593 interval = HZ * 60;
93699594 }
9370 - } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
9595 + /*
9596 + * This is really an uninterruptible wait, as it has to be
9597 + * complete. But it's also run from a kworker, which doesn't
9598 + * take signals, so it's fine to make it interruptible. This
9599 + * avoids scenarios where we knowingly can wait much longer
9600 + * on completions, for example if someone does a SIGSTOP on
9601 + * a task that needs to finish task_work to make this loop
9602 + * complete. That's a synthetic situation that should not
9603 + * cause a stuck task backtrace, and hence a potential panic
9604 + * on stuck tasks if that is enabled.
9605 + */
9606 + } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
93719607
93729608 init_completion(&exit.completion);
93739609 init_task_work(&exit.task_work, io_tctx_exit_cb);
....@@ -9392,7 +9628,12 @@
93929628 wake_up_process(node->task);
93939629
93949630 mutex_unlock(&ctx->uring_lock);
9395 - wait_for_completion(&exit.completion);
9631 + /*
9632 + * See comment above for
9633 + * wait_for_completion_interruptible_timeout() on why this
9634 + * wait is marked as interruptible.
9635 + */
9636 + wait_for_completion_interruptible(&exit.completion);
93969637 mutex_lock(&ctx->uring_lock);
93979638 }
93989639 mutex_unlock(&ctx->uring_lock);
....@@ -9444,6 +9685,10 @@
94449685
94459686 /* if we failed setting up the ctx, we might not have any rings */
94469687 io_iopoll_try_reap_events(ctx);
9688 +
9689 + /* drop cached put refs after potentially doing completions */
9690 + if (current->io_uring)
9691 + io_uring_drop_tctx_refs(current);
94479692
94489693 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
94499694 /*
....@@ -9556,6 +9801,7 @@
95569801 while (!list_empty_careful(&ctx->iopoll_list)) {
95579802 io_iopoll_try_reap_events(ctx);
95589803 ret = true;
9804 + cond_resched();
95599805 }
95609806 }
95619807
....@@ -10223,7 +10469,7 @@
1022310469 if (!ctx)
1022410470 return -ENOMEM;
1022510471 ctx->compat = in_compat_syscall();
10226 - if (!capable(CAP_IPC_LOCK))
10472 + if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
1022710473 ctx->user = get_uid(current_user());
1022810474
1022910475 /*
....@@ -10751,8 +10997,6 @@
1075110997 return -ENXIO;
1075210998
1075310999 if (ctx->restricted) {
10754 - if (opcode >= IORING_REGISTER_LAST)
10755 - return -EINVAL;
1075611000 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
1075711001 if (!test_bit(opcode, ctx->restrictions.register_op))
1075811002 return -EACCES;
....@@ -10884,6 +11128,9 @@
1088411128 long ret = -EBADF;
1088511129 struct fd f;
1088611130
11131 + if (opcode >= IORING_REGISTER_LAST)
11132 + return -EINVAL;
11133 +
1088711134 f = fdget(fd);
1088811135 if (!f.file)
1088911136 return -EBADF;