.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
---|
1 | 2 | /* |
---|
2 | 3 | * fs/eventpoll.c (Efficient event retrieval implementation) |
---|
3 | 4 | * Copyright (C) 2001,...,2009 Davide Libenzi |
---|
4 | 5 | * |
---|
5 | | - * This program is free software; you can redistribute it and/or modify |
---|
6 | | - * it under the terms of the GNU General Public License as published by |
---|
7 | | - * the Free Software Foundation; either version 2 of the License, or |
---|
8 | | - * (at your option) any later version. |
---|
9 | | - * |
---|
10 | 6 | * Davide Libenzi <davidel@xmailserver.org> |
---|
11 | | - * |
---|
12 | 7 | */ |
---|
13 | 8 | |
---|
14 | 9 | #include <linux/init.h> |
---|
.. | .. |
---|
45 | 40 | #include <linux/rculist.h> |
---|
46 | 41 | #include <net/busy_poll.h> |
---|
47 | 42 | |
---|
| 43 | +#include <trace/hooks/fs.h> |
---|
| 44 | + |
---|
48 | 45 | /* |
---|
49 | 46 | * LOCKING: |
---|
50 | 47 | * There are three level of locking required by epoll : |
---|
51 | 48 | * |
---|
52 | 49 | * 1) epmutex (mutex) |
---|
53 | 50 | * 2) ep->mtx (mutex) |
---|
54 | | - * 3) ep->wq.lock (spinlock) |
---|
| 51 | + * 3) ep->lock (rwlock) |
---|
55 | 52 | * |
---|
56 | 53 | * The acquire order is the one listed above, from 1 to 3. |
---|
57 | | - * We need a spinlock (ep->wq.lock) because we manipulate objects |
---|
| 54 | + * We need a rwlock (ep->lock) because we manipulate objects |
---|
58 | 55 | * from inside the poll callback, that might be triggered from |
---|
59 | 56 | * a wake_up() that in turn might be called from IRQ context. |
---|
60 | 57 | * So we can't sleep inside the poll callback and hence we need |
---|
.. | .. |
---|
86 | 83 | * of epoll file descriptors, we use the current recursion depth as |
---|
87 | 84 | * the lockdep subkey. |
---|
88 | 85 | * It is possible to drop the "ep->mtx" and to use the global |
---|
89 | | - * mutex "epmutex" (together with "ep->wq.lock") to have it working, |
---|
| 86 | + * mutex "epmutex" (together with "ep->lock") to have it working, |
---|
90 | 87 | * but having "ep->mtx" will make the interface more scalable. |
---|
91 | 88 | * Events that require holding "epmutex" are very rare, while for |
---|
92 | 89 | * normal operations the epoll private "ep->mtx" will guarantee |
---|
.. | .. |
---|
183 | 180 | * This structure is stored inside the "private_data" member of the file |
---|
184 | 181 | * structure and represents the main data structure for the eventpoll |
---|
185 | 182 | * interface. |
---|
186 | | - * |
---|
187 | | - * Access to it is protected by the lock inside wq. |
---|
188 | 183 | */ |
---|
189 | 184 | struct eventpoll { |
---|
190 | 185 | /* |
---|
.. | .. |
---|
204 | 199 | /* List of ready file descriptors */ |
---|
205 | 200 | struct list_head rdllist; |
---|
206 | 201 | |
---|
| 202 | + /* Lock which protects rdllist and ovflist */ |
---|
| 203 | + rwlock_t lock; |
---|
| 204 | + |
---|
207 | 205 | /* RB tree root used to store monitored fd structs */ |
---|
208 | 206 | struct rb_root_cached rbr; |
---|
209 | 207 | |
---|
210 | 208 | /* |
---|
211 | 209 | * This is a single linked list that chains all the "struct epitem" that |
---|
212 | 210 | * happened while transferring ready events to userspace w/out |
---|
213 | | - * holding ->wq.lock. |
---|
| 211 | + * holding ->lock. |
---|
214 | 212 | */ |
---|
215 | 213 | struct epitem *ovflist; |
---|
216 | 214 | |
---|
.. | .. |
---|
228 | 226 | #ifdef CONFIG_NET_RX_BUSY_POLL |
---|
229 | 227 | /* used to track busy poll napi_id */ |
---|
230 | 228 | unsigned int napi_id; |
---|
| 229 | +#endif |
---|
| 230 | + |
---|
| 231 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC |
---|
| 232 | + /* tracks wakeup nests for lockdep validation */ |
---|
| 233 | + u8 nests; |
---|
231 | 234 | #endif |
---|
232 | 235 | }; |
---|
233 | 236 | |
---|
.. | .. |
---|
294 | 297 | |
---|
295 | 298 | #include <linux/sysctl.h> |
---|
296 | 299 | |
---|
297 | | -static long zero; |
---|
| 300 | +static long long_zero; |
---|
298 | 301 | static long long_max = LONG_MAX; |
---|
299 | 302 | |
---|
300 | 303 | struct ctl_table epoll_table[] = { |
---|
.. | .. |
---|
304 | 307 | .maxlen = sizeof(max_user_watches), |
---|
305 | 308 | .mode = 0644, |
---|
306 | 309 | .proc_handler = proc_doulongvec_minmax, |
---|
307 | | - .extra1 = &zero, |
---|
| 310 | + .extra1 = &long_zero, |
---|
308 | 311 | .extra2 = &long_max, |
---|
309 | 312 | }, |
---|
310 | 313 | { } |
---|
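
The max_user_watches limit configured by this table caps how many epoll watches a single user may register; it is exported through the fs.epoll sysctl directory. A minimal sketch of reading it from userspace, assuming the usual procfs layout (demo code, not part of the patch):

```c
#include <stdio.h>

int main(void)
{
	/* Path assumes the standard procfs mount of the fs.epoll table. */
	FILE *f = fopen("/proc/sys/fs/epoll/max_user_watches", "r");
	unsigned long watches = 0;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%lu", &watches) == 1)
		printf("per-user epoll watch limit: %lu\n", watches);
	fclose(f);
	return 0;
}
```
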
.. | .. |
---|
357 | 360 | return container_of(p, struct ep_pqueue, pt)->epi; |
---|
358 | 361 | } |
---|
359 | 362 | |
---|
360 | | -/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ |
---|
361 | | -static inline int ep_op_has_event(int op) |
---|
362 | | -{ |
---|
363 | | - return op != EPOLL_CTL_DEL; |
---|
364 | | -} |
---|
365 | | - |
---|
366 | 363 | /* Initialize the poll safe wake up structure */ |
---|
367 | 364 | static void ep_nested_calls_init(struct nested_calls *ncalls) |
---|
368 | 365 | { |
---|
.. | .. |
---|
380 | 377 | */ |
---|
381 | 378 | static inline int ep_events_available(struct eventpoll *ep) |
---|
382 | 379 | { |
---|
383 | | - return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; |
---|
| 380 | + return !list_empty_careful(&ep->rdllist) || |
---|
| 381 | + READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; |
---|
384 | 382 | } |
---|
385 | 383 | |
---|
386 | 384 | #ifdef CONFIG_NET_RX_BUSY_POLL |
---|
.. | .. |
---|
470 | 468 | * no re-entered. |
---|
471 | 469 | * |
---|
472 | 470 | * @ncalls: Pointer to the nested_calls structure to be used for this call. |
---|
473 | | - * @max_nests: Maximum number of allowed nesting calls. |
---|
474 | 471 | * @nproc: Nested call core function pointer. |
---|
475 | 472 | * @priv: Opaque data to be passed to the @nproc callback. |
---|
476 | 473 | * @cookie: Cookie to be used to identify this nested call. |
---|
.. | .. |
---|
479 | 476 | * Returns: Returns the code returned by the @nproc callback, or -1 if |
---|
480 | 477 | * the maximum recursion limit has been exceeded. |
---|
481 | 478 | */ |
---|
482 | | -static int ep_call_nested(struct nested_calls *ncalls, int max_nests, |
---|
| 479 | +static int ep_call_nested(struct nested_calls *ncalls, |
---|
483 | 480 | int (*nproc)(void *, void *, int), void *priv, |
---|
484 | 481 | void *cookie, void *ctx) |
---|
485 | 482 | { |
---|
.. | .. |
---|
498 | 495 | */ |
---|
499 | 496 | list_for_each_entry(tncur, lsthead, llink) { |
---|
500 | 497 | if (tncur->ctx == ctx && |
---|
501 | | - (tncur->cookie == cookie || ++call_nests > max_nests)) { |
---|
| 498 | + (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) { |
---|
502 | 499 | /* |
---|
503 | 500 | * Ops ... loop detected or maximum nest level reached. |
---|
504 | 501 | * We abort this wake by breaking the cycle itself. |
---|
.. | .. |
---|
554 | 551 | */ |
---|
555 | 552 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
---|
556 | 553 | |
---|
557 | | -static struct nested_calls poll_safewake_ncalls; |
---|
558 | | - |
---|
559 | | -static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) |
---|
| 554 | +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, |
---|
| 555 | + unsigned pollflags) |
---|
560 | 556 | { |
---|
| 557 | + struct eventpoll *ep_src; |
---|
561 | 558 | unsigned long flags; |
---|
562 | | - wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; |
---|
| 559 | + u8 nests = 0; |
---|
563 | 560 | |
---|
564 | | - spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); |
---|
565 | | - wake_up_locked_poll(wqueue, EPOLLIN); |
---|
566 | | - spin_unlock_irqrestore(&wqueue->lock, flags); |
---|
567 | | - |
---|
568 | | - return 0; |
---|
569 | | -} |
---|
570 | | - |
---|
571 | | -static void ep_poll_safewake(wait_queue_head_t *wq) |
---|
572 | | -{ |
---|
573 | | - int this_cpu = get_cpu(); |
---|
574 | | - |
---|
575 | | - ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, |
---|
576 | | - ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); |
---|
577 | | - |
---|
578 | | - put_cpu(); |
---|
| 561 | + /* |
---|
| 562 | + * To set the subclass or nesting level for spin_lock_irqsave_nested() |
---|
| 563 | + * it might be natural to create a per-cpu nest count. However, since |
---|
| 564 | + * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can |
---|
| 565 | + * schedule() in the -rt kernel, the per-cpu variables are no longer |
---|
| 566 | + * protected. Thus, we are introducing a per-eventpoll nest field. |
---|
| 567 | + * If we are not being called from ep_poll_callback(), epi is NULL and |
---|
| 568 | + * we are at the first level of nesting, 0. Otherwise, we are being |
---|
| 569 | + * called from ep_poll_callback() and if a previous wakeup source is |
---|
| 570 | + * not an epoll file itself, we are at depth 1 since the wakeup source |
---|
| 571 | + * is depth 0. If the wakeup source is a previous epoll file in the |
---|
| 572 | + * wakeup chain then we use its nests value and record ours as |
---|
| 573 | + * nests + 1. The previous epoll file's nests value is stable since it's |
---|
| 574 | + * already holding its own poll_wait.lock. |
---|
| 575 | + */ |
---|
| 576 | + if (epi) { |
---|
| 577 | + if ((is_file_epoll(epi->ffd.file))) { |
---|
| 578 | + ep_src = epi->ffd.file->private_data; |
---|
| 579 | + nests = ep_src->nests; |
---|
| 580 | + } else { |
---|
| 581 | + nests = 1; |
---|
| 582 | + } |
---|
| 583 | + } |
---|
| 584 | + spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); |
---|
| 585 | + ep->nests = nests + 1; |
---|
| 586 | + wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); |
---|
| 587 | + ep->nests = 0; |
---|
| 588 | + spin_unlock_irqrestore(&ep->poll_wait.lock, flags); |
---|
579 | 589 | } |
---|
580 | 590 | |
---|
581 | 591 | #else |
---|
582 | 592 | |
---|
583 | | -static void ep_poll_safewake(wait_queue_head_t *wq) |
---|
| 593 | +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, |
---|
| 594 | + unsigned pollflags) |
---|
584 | 595 | { |
---|
585 | | - wake_up_poll(wq, EPOLLIN); |
---|
| 596 | + wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); |
---|
586 | 597 | } |
---|
587 | 598 | |
---|
588 | 599 | #endif |
---|
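
The lockdep-aware ep_poll_safewake() above exists for wakeup chains that pass through nested epoll instances, where ep->poll_wait.lock can be taken recursively at increasing depths. A minimal userspace setup that creates exactly such a chain, useful for picturing the epi/nests bookkeeping (demo code, not part of the patch):

```c
#include <stdint.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int inner = epoll_create1(0);
	int outer = epoll_create1(0);
	int efd = eventfd(0, 0);
	struct epoll_event ev = { .events = EPOLLIN };
	struct epoll_event out = { 0 };
	uint64_t one = 1;

	/* efd -> inner epoll -> outer epoll: a two-level wakeup chain */
	ev.data.fd = efd;
	epoll_ctl(inner, EPOLL_CTL_ADD, efd, &ev);
	ev.data.fd = inner;
	epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);

	/* Signalling efd makes the inner instance readable; forwarding
	 * that readiness to the outer instance is the poll_wait wakeup
	 * that ep_poll_safewake() performs. */
	if (write(efd, &one, sizeof(one)) != sizeof(one))
		return 1;

	int n = epoll_wait(outer, &out, 1, 1000);
	printf("outer epoll_wait returned %d (ready fd %d)\n",
	       n, n > 0 ? out.data.fd : -1);

	close(efd);
	close(inner);
	close(outer);
	return 0;
}
```
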
.. | .. |
---|
674 | 685 | void *priv, int depth, bool ep_locked) |
---|
675 | 686 | { |
---|
676 | 687 | __poll_t res; |
---|
677 | | - int pwake = 0; |
---|
678 | 688 | struct epitem *epi, *nepi; |
---|
679 | 689 | LIST_HEAD(txlist); |
---|
680 | 690 | |
---|
.. | .. |
---|
696 | 706 | * because we want the "sproc" callback to be able to do it |
---|
697 | 707 | * in a lockless way. |
---|
698 | 708 | */ |
---|
699 | | - spin_lock_irq(&ep->wq.lock); |
---|
| 709 | + write_lock_irq(&ep->lock); |
---|
700 | 710 | list_splice_init(&ep->rdllist, &txlist); |
---|
701 | | - ep->ovflist = NULL; |
---|
702 | | - spin_unlock_irq(&ep->wq.lock); |
---|
| 711 | + WRITE_ONCE(ep->ovflist, NULL); |
---|
| 712 | + write_unlock_irq(&ep->lock); |
---|
703 | 713 | |
---|
704 | 714 | /* |
---|
705 | 715 | * Now call the callback function. |
---|
706 | 716 | */ |
---|
707 | 717 | res = (*sproc)(ep, &txlist, priv); |
---|
708 | 718 | |
---|
709 | | - spin_lock_irq(&ep->wq.lock); |
---|
| 719 | + write_lock_irq(&ep->lock); |
---|
710 | 720 | /* |
---|
711 | 721 | * During the time we spent inside the "sproc" callback, some |
---|
712 | 722 | * other events might have been queued by the poll callback. |
---|
713 | 723 | * We re-insert them inside the main ready-list here. |
---|
714 | 724 | */ |
---|
715 | | - for (nepi = ep->ovflist; (epi = nepi) != NULL; |
---|
| 725 | + for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; |
---|
716 | 726 | nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { |
---|
717 | 727 | /* |
---|
718 | 728 | * We need to check if the item is already in the list. |
---|
.. | .. |
---|
721 | 731 | * contain them, and the list_splice() below takes care of them. |
---|
722 | 732 | */ |
---|
723 | 733 | if (!ep_is_linked(epi)) { |
---|
724 | | - list_add_tail(&epi->rdllink, &ep->rdllist); |
---|
| 734 | + /* |
---|
| 735 | + * ->ovflist is LIFO, so we have to reverse it in order |
---|
| 736 | + * to keep the ready list in FIFO order. |
---|
| 737 | + */ |
---|
| 738 | + list_add(&epi->rdllink, &ep->rdllist); |
---|
725 | 739 | ep_pm_stay_awake(epi); |
---|
726 | 740 | } |
---|
727 | 741 | } |
---|
.. | .. |
---|
730 | 744 | * releasing the lock, events will be queued in the normal way inside |
---|
731 | 745 | * ep->rdllist. |
---|
732 | 746 | */ |
---|
733 | | - ep->ovflist = EP_UNACTIVE_PTR; |
---|
| 747 | + WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); |
---|
734 | 748 | |
---|
735 | 749 | /* |
---|
736 | 750 | * Quickly re-inject items left on "txlist". |
---|
.. | .. |
---|
739 | 753 | __pm_relax(ep->ws); |
---|
740 | 754 | |
---|
741 | 755 | if (!list_empty(&ep->rdllist)) { |
---|
742 | | - /* |
---|
743 | | - * Wake up (if active) both the eventpoll wait list and |
---|
744 | | - * the ->poll() wait list (delayed after we release the lock). |
---|
745 | | - */ |
---|
746 | 756 | if (waitqueue_active(&ep->wq)) |
---|
747 | | - wake_up_locked(&ep->wq); |
---|
748 | | - if (waitqueue_active(&ep->poll_wait)) |
---|
749 | | - pwake++; |
---|
| 757 | + wake_up(&ep->wq); |
---|
750 | 758 | } |
---|
751 | | - spin_unlock_irq(&ep->wq.lock); |
---|
| 759 | + |
---|
| 760 | + write_unlock_irq(&ep->lock); |
---|
752 | 761 | |
---|
753 | 762 | if (!ep_locked) |
---|
754 | 763 | mutex_unlock(&ep->mtx); |
---|
755 | | - |
---|
756 | | - /* We have to call this outside the lock */ |
---|
757 | | - if (pwake) |
---|
758 | | - ep_poll_safewake(&ep->poll_wait); |
---|
759 | 764 | |
---|
760 | 765 | return res; |
---|
761 | 766 | } |
---|
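
One detail of the hunk above is easy to miss: events that arrive while ep_scan_ready_list() is running are pushed onto ->ovflist head-first (LIFO) by the poll callback, and re-injecting them with list_add() at the head of ->rdllist is what restores arrival (FIFO) order. A standalone sketch of that double reversal, using plain singly-linked nodes instead of the kernel list helpers (illustration only):

```c
#include <stdio.h>

struct item { int id; struct item *next; };

int main(void)
{
	struct item pool[3];
	struct item *ovflist = NULL, *rdllist = NULL;

	/* Events 1, 2, 3 arrive during the scan and are pushed head-first,
	 * as ep_poll_callback() does, so the chain reads 3 -> 2 -> 1. */
	for (int id = 1; id <= 3; id++) {
		pool[id - 1].id = id;
		pool[id - 1].next = ovflist;
		ovflist = &pool[id - 1];
	}

	/* Walking the LIFO chain and inserting each node at the head of
	 * rdllist (the list_add() in the hunk above) reverses it again. */
	for (struct item *it = ovflist, *next; it; it = next) {
		next = it->next;
		it->next = rdllist;
		rdllist = it;
	}

	for (struct item *it = rdllist; it; it = it->next)
		printf("deliver event %d\n", it->id);	/* prints 1, 2, 3 */
	return 0;
}
```
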
.. | .. |
---|
788 | 793 | |
---|
789 | 794 | rb_erase_cached(&epi->rbn, &ep->rbr); |
---|
790 | 795 | |
---|
791 | | - spin_lock_irq(&ep->wq.lock); |
---|
| 796 | + write_lock_irq(&ep->lock); |
---|
792 | 797 | if (ep_is_linked(epi)) |
---|
793 | 798 | list_del_init(&epi->rdllink); |
---|
794 | | - spin_unlock_irq(&ep->wq.lock); |
---|
| 799 | + write_unlock_irq(&ep->lock); |
---|
795 | 800 | |
---|
796 | 801 | wakeup_source_unregister(ep_wakeup_source(epi)); |
---|
797 | 802 | /* |
---|
.. | .. |
---|
815 | 820 | |
---|
816 | 821 | /* We need to release all tasks waiting for these file */ |
---|
817 | 822 | if (waitqueue_active(&ep->poll_wait)) |
---|
818 | | - ep_poll_safewake(&ep->poll_wait); |
---|
| 823 | + ep_poll_safewake(ep, NULL, 0); |
---|
819 | 824 | |
---|
820 | 825 | /* |
---|
821 | 826 | * We need to lock this because we could be hit by |
---|
.. | .. |
---|
841 | 846 | * Walks through the whole tree by freeing each "struct epitem". At this |
---|
842 | 847 | * point we are sure no poll callbacks will be lingering around, and also by |
---|
843 | 848 | * holding "epmutex" we can be sure that no file cleanup code will hit |
---|
844 | | - * us during this operation. So we can avoid the lock on "ep->wq.lock". |
---|
| 849 | + * us during this operation. So we can avoid the lock on "ep->lock". |
---|
845 | 850 | * We do not need to lock ep->mtx, either, we only do it to prevent |
---|
846 | 851 | * a lockdep warning. |
---|
847 | 852 | */ |
---|
.. | .. |
---|
1022 | 1027 | goto free_uid; |
---|
1023 | 1028 | |
---|
1024 | 1029 | mutex_init(&ep->mtx); |
---|
| 1030 | + rwlock_init(&ep->lock); |
---|
1025 | 1031 | init_waitqueue_head(&ep->wq); |
---|
1026 | 1032 | init_waitqueue_head(&ep->poll_wait); |
---|
1027 | 1033 | INIT_LIST_HEAD(&ep->rdllist); |
---|
.. | .. |
---|
1067 | 1073 | return epir; |
---|
1068 | 1074 | } |
---|
1069 | 1075 | |
---|
1070 | | -#ifdef CONFIG_CHECKPOINT_RESTORE |
---|
| 1076 | +#ifdef CONFIG_KCMP |
---|
1071 | 1077 | static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) |
---|
1072 | 1078 | { |
---|
1073 | 1079 | struct rb_node *rbp; |
---|
.. | .. |
---|
1109 | 1115 | |
---|
1110 | 1116 | return file_raw; |
---|
1111 | 1117 | } |
---|
1112 | | -#endif /* CONFIG_CHECKPOINT_RESTORE */ |
---|
| 1118 | +#endif /* CONFIG_KCMP */ |
---|
| 1119 | + |
---|
| 1120 | +/** |
---|
| 1121 | + * Adds a new entry to the tail of the list in a lockless way, i.e. |
---|
| 1122 | + * multiple CPUs are allowed to call this function concurrently. |
---|
| 1123 | + * |
---|
| 1124 | + * Beware: it is necessary to prevent any other modifications of the |
---|
| 1125 | + * existing list until all changes are completed, in other words |
---|
| 1126 | + * concurrent list_add_tail_lockless() calls should be protected |
---|
| 1127 | + * with a read lock, where write lock acts as a barrier which |
---|
| 1128 | + * makes sure all list_add_tail_lockless() calls are fully |
---|
| 1129 | + * completed. |
---|
| 1130 | + * |
---|
| 1131 | + * Also an element can be locklessly added to the list only in one |
---|
| 1132 | + * direction, i.e. either to the tail or to the head, otherwise |
---|
| 1133 | + * concurrent access will corrupt the list. |
---|
| 1134 | + * |
---|
| 1135 | + * Returns %false if the element has already been added to the list, %true |
---|
| 1136 | + * otherwise. |
---|
| 1137 | + */ |
---|
| 1138 | +static inline bool list_add_tail_lockless(struct list_head *new, |
---|
| 1139 | + struct list_head *head) |
---|
| 1140 | +{ |
---|
| 1141 | + struct list_head *prev; |
---|
| 1142 | + |
---|
| 1143 | + /* |
---|
| 1144 | + * This is simple 'new->next = head' operation, but cmpxchg() |
---|
| 1145 | + * is used in order to detect that same element has been just |
---|
| 1146 | + * added to the list from another CPU: the winner observes |
---|
| 1147 | + * new->next == new. |
---|
| 1148 | + */ |
---|
| 1149 | + if (cmpxchg(&new->next, new, head) != new) |
---|
| 1150 | + return false; |
---|
| 1151 | + |
---|
| 1152 | + /* |
---|
| 1153 | + * Initially ->next of a new element must be updated with the head |
---|
| 1154 | + * (we are inserting to the tail) and only then pointers are atomically |
---|
| 1155 | + * exchanged. XCHG guarantees memory ordering, thus ->next should be |
---|
| 1156 | + * updated before pointers are actually swapped and pointers are |
---|
| 1157 | + * swapped before prev->next is updated. |
---|
| 1158 | + */ |
---|
| 1159 | + |
---|
| 1160 | + prev = xchg(&head->prev, new); |
---|
| 1161 | + |
---|
| 1162 | + /* |
---|
| 1163 | + * It is safe to modify prev->next and new->prev, because a new element |
---|
| 1164 | + * is added only to the tail and new->next is updated before XCHG. |
---|
| 1165 | + */ |
---|
| 1166 | + |
---|
| 1167 | + prev->next = new; |
---|
| 1168 | + new->prev = prev; |
---|
| 1169 | + |
---|
| 1170 | + return true; |
---|
| 1171 | +} |
---|
| 1172 | + |
---|
| 1173 | +/** |
---|
| 1174 | + * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, |
---|
| 1175 | + * i.e. multiple CPUs are allowed to call this function concurrently. |
---|
| 1176 | + * |
---|
| 1177 | + * Returns %false if epi element has been already chained, %true otherwise. |
---|
| 1178 | + */ |
---|
| 1179 | +static inline bool chain_epi_lockless(struct epitem *epi) |
---|
| 1180 | +{ |
---|
| 1181 | + struct eventpoll *ep = epi->ep; |
---|
| 1182 | + |
---|
| 1183 | + /* Fast preliminary check */ |
---|
| 1184 | + if (epi->next != EP_UNACTIVE_PTR) |
---|
| 1185 | + return false; |
---|
| 1186 | + |
---|
| 1187 | + /* Check that the same epi has not been just chained from another CPU */ |
---|
| 1188 | + if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) |
---|
| 1189 | + return false; |
---|
| 1190 | + |
---|
| 1191 | + /* Atomically exchange tail */ |
---|
| 1192 | + epi->next = xchg(&ep->ovflist, epi); |
---|
| 1193 | + |
---|
| 1194 | + return true; |
---|
| 1195 | +} |
---|
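
The two helpers above follow a claim-then-publish pattern: the cmpxchg() on the element's own pointer elects a single winner per element, the xchg() on the list tail serializes the publish step, and ep_scan_ready_list()'s write lock is the barrier that readers of the finished list rely on. A userspace approximation with C11 atomics, kept deliberately simple (sketch only; the kernel primitives and their memory-ordering guarantees differ):

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NITEMS 64

struct node {
	int id;
	_Atomic(struct node *) next;
	_Atomic(struct node *) prev;
};

static struct node head;
static struct node items[NITEMS];

/* Claim @new by flipping its self-pointing ->next to &head, then become
 * the new tail with an atomic exchange on head.prev, mirroring the
 * cmpxchg()/xchg() pair in list_add_tail_lockless() above. */
static bool add_tail_lockless(struct node *new)
{
	struct node *expected = new;

	if (!atomic_compare_exchange_strong(&new->next, &expected, &head))
		return false;			/* already queued by someone */

	struct node *prev = atomic_exchange(&head.prev, new);
	atomic_store(&prev->next, new);		/* only we may link after prev */
	atomic_store(&new->prev, prev);
	return true;
}

static void *worker(void *arg)
{
	(void)arg;
	/* Every thread races to queue every item; each item wins once. */
	for (int i = 0; i < NITEMS; i++)
		add_tail_lockless(&items[i]);
	return NULL;
}

int main(void)
{
	pthread_t t[4];
	int count = 0;

	atomic_init(&head.next, &head);
	atomic_init(&head.prev, &head);
	for (int i = 0; i < NITEMS; i++) {
		items[i].id = i;
		atomic_init(&items[i].next, &items[i]);	/* "not queued" */
		atomic_init(&items[i].prev, &items[i]);
	}

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);	/* stands in for the write lock */

	for (struct node *n = atomic_load(&head.next); n != &head;
	     n = atomic_load(&n->next))
		count++;
	printf("queued %d items exactly once\n", count);
	return 0;
}
```

After the joins, which play the role of the write lock here, the list is fully linked and every item appears exactly once.
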
1113 | 1196 | |
---|
1114 | 1197 | /* |
---|
1115 | 1198 | * This is the callback that is passed to the wait queue wakeup |
---|
1116 | 1199 | * mechanism. It is called by the stored file descriptors when they |
---|
1117 | 1200 | * have events to report. |
---|
| 1201 | + * |
---|
| 1202 | + * This callback takes a read lock in order not to contend with concurrent |
---|
| 1203 | + * events from other file descriptors, thus all modifications to ->rdllist |
---|
| 1204 | + * or ->ovflist are lockless. Read lock is paired with the write lock from |
---|
| 1205 | + * ep_scan_ready_list(), which stops all list modifications and guarantees |
---|
| 1206 | + * that the lists' state is seen correctly. |
---|
| 1207 | + * |
---|
| 1208 | + * Another thing worth mentioning is that ep_poll_callback() can be called |
---|
| 1209 | + * concurrently for the same @epi from different CPUs if the poll table was |
---|
| 1210 | + * initialized with several wait queue entries. Multiple wakeups from |
---|
| 1211 | + * different CPUs on a single wait queue are serialized by wq.lock, but the |
---|
| 1212 | + * case when multiple wait queues are used must be detected separately. This |
---|
| 1213 | + * is detected using a cmpxchg() operation. |
---|
1118 | 1214 | */ |
---|
1119 | 1215 | static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) |
---|
1120 | 1216 | { |
---|
1121 | 1217 | int pwake = 0; |
---|
1122 | | - unsigned long flags; |
---|
1123 | 1218 | struct epitem *epi = ep_item_from_wait(wait); |
---|
1124 | 1219 | struct eventpoll *ep = epi->ep; |
---|
1125 | 1220 | __poll_t pollflags = key_to_poll(key); |
---|
| 1221 | + unsigned long flags; |
---|
1126 | 1222 | int ewake = 0; |
---|
1127 | 1223 | |
---|
1128 | | - spin_lock_irqsave(&ep->wq.lock, flags); |
---|
| 1224 | + read_lock_irqsave(&ep->lock, flags); |
---|
1129 | 1225 | |
---|
1130 | 1226 | ep_set_busy_poll_napi_id(epi); |
---|
1131 | 1227 | |
---|
.. | .. |
---|
1153 | 1249 | * semantics). All the events that happen during that period of time are |
---|
1154 | 1250 | * chained in ep->ovflist and requeued later on. |
---|
1155 | 1251 | */ |
---|
1156 | | - if (ep->ovflist != EP_UNACTIVE_PTR) { |
---|
1157 | | - if (epi->next == EP_UNACTIVE_PTR) { |
---|
1158 | | - epi->next = ep->ovflist; |
---|
1159 | | - ep->ovflist = epi; |
---|
1160 | | - if (epi->ws) { |
---|
1161 | | - /* |
---|
1162 | | - * Activate ep->ws since epi->ws may get |
---|
1163 | | - * deactivated at any time. |
---|
1164 | | - */ |
---|
1165 | | - __pm_stay_awake(ep->ws); |
---|
1166 | | - } |
---|
1167 | | - |
---|
1168 | | - } |
---|
1169 | | - goto out_unlock; |
---|
1170 | | - } |
---|
1171 | | - |
---|
1172 | | - /* If this file is already in the ready list we exit soon */ |
---|
1173 | | - if (!ep_is_linked(epi)) { |
---|
1174 | | - list_add_tail(&epi->rdllink, &ep->rdllist); |
---|
1175 | | - ep_pm_stay_awake_rcu(epi); |
---|
| 1252 | + if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { |
---|
| 1253 | + if (chain_epi_lockless(epi)) |
---|
| 1254 | + ep_pm_stay_awake_rcu(epi); |
---|
| 1255 | + } else if (!ep_is_linked(epi)) { |
---|
| 1256 | + /* In the usual case, add event to ready list. */ |
---|
| 1257 | + if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) |
---|
| 1258 | + ep_pm_stay_awake_rcu(epi); |
---|
1176 | 1259 | } |
---|
1177 | 1260 | |
---|
1178 | 1261 | /* |
---|
.. | .. |
---|
1196 | 1279 | break; |
---|
1197 | 1280 | } |
---|
1198 | 1281 | } |
---|
1199 | | - wake_up_locked(&ep->wq); |
---|
| 1282 | + wake_up(&ep->wq); |
---|
1200 | 1283 | } |
---|
1201 | 1284 | if (waitqueue_active(&ep->poll_wait)) |
---|
1202 | 1285 | pwake++; |
---|
1203 | 1286 | |
---|
1204 | 1287 | out_unlock: |
---|
1205 | | - spin_unlock_irqrestore(&ep->wq.lock, flags); |
---|
| 1288 | + read_unlock_irqrestore(&ep->lock, flags); |
---|
1206 | 1289 | |
---|
1207 | 1290 | /* We have to call this outside the lock */ |
---|
1208 | 1291 | if (pwake) |
---|
1209 | | - ep_poll_safewake(&ep->poll_wait); |
---|
| 1292 | + ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); |
---|
1210 | 1293 | |
---|
1211 | 1294 | if (!(epi->event.events & EPOLLEXCLUSIVE)) |
---|
1212 | 1295 | ewake = 1; |
---|
.. | .. |
---|
1332 | 1415 | } |
---|
1333 | 1416 | } else { |
---|
1334 | 1417 | error = ep_call_nested(&poll_loop_ncalls, |
---|
1335 | | - EP_MAX_NESTS, |
---|
1336 | 1418 | reverse_path_check_proc, |
---|
1337 | 1419 | child_file, child_file, |
---|
1338 | 1420 | current); |
---|
.. | .. |
---|
1366 | 1448 | /* let's call this for all tfiles */ |
---|
1367 | 1449 | list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { |
---|
1368 | 1450 | path_count_init(); |
---|
1369 | | - error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, |
---|
| 1451 | + error = ep_call_nested(&poll_loop_ncalls, |
---|
1370 | 1452 | reverse_path_check_proc, current_file, |
---|
1371 | 1453 | current_file, current); |
---|
1372 | 1454 | if (error) |
---|
.. | .. |
---|
1379 | 1461 | { |
---|
1380 | 1462 | struct name_snapshot n; |
---|
1381 | 1463 | struct wakeup_source *ws; |
---|
| 1464 | + char ws_name[64]; |
---|
1382 | 1465 | |
---|
| 1466 | + strlcpy(ws_name, "eventpoll", sizeof(ws_name)); |
---|
| 1467 | + trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name)); |
---|
1383 | 1468 | if (!epi->ep->ws) { |
---|
1384 | | - epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); |
---|
| 1469 | + epi->ep->ws = wakeup_source_register(NULL, ws_name); |
---|
1385 | 1470 | if (!epi->ep->ws) |
---|
1386 | 1471 | return -ENOMEM; |
---|
1387 | 1472 | } |
---|
1388 | 1473 | |
---|
1389 | 1474 | take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); |
---|
1390 | | - ws = wakeup_source_register(NULL, n.name); |
---|
| 1475 | + strlcpy(ws_name, n.name.name, sizeof(ws_name)); |
---|
| 1476 | + trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name)); |
---|
| 1477 | + ws = wakeup_source_register(NULL, ws_name); |
---|
1391 | 1478 | release_dentry_name_snapshot(&n); |
---|
1392 | 1479 | |
---|
1393 | 1480 | if (!ws) |
---|
.. | .. |
---|
1489 | 1576 | goto error_unregister; |
---|
1490 | 1577 | |
---|
1491 | 1578 | /* We have to drop the new item inside our item list to keep track of it */ |
---|
1492 | | - spin_lock_irq(&ep->wq.lock); |
---|
| 1579 | + write_lock_irq(&ep->lock); |
---|
1493 | 1580 | |
---|
1494 | 1581 | /* record NAPI ID of new item if present */ |
---|
1495 | 1582 | ep_set_busy_poll_napi_id(epi); |
---|
.. | .. |
---|
1501 | 1588 | |
---|
1502 | 1589 | /* Notify waiting tasks that events are available */ |
---|
1503 | 1590 | if (waitqueue_active(&ep->wq)) |
---|
1504 | | - wake_up_locked(&ep->wq); |
---|
| 1591 | + wake_up(&ep->wq); |
---|
1505 | 1592 | if (waitqueue_active(&ep->poll_wait)) |
---|
1506 | 1593 | pwake++; |
---|
1507 | 1594 | } |
---|
1508 | 1595 | |
---|
1509 | | - spin_unlock_irq(&ep->wq.lock); |
---|
| 1596 | + write_unlock_irq(&ep->lock); |
---|
1510 | 1597 | |
---|
1511 | 1598 | atomic_long_inc(&ep->user->epoll_watches); |
---|
1512 | 1599 | |
---|
1513 | 1600 | /* We have to call this outside the lock */ |
---|
1514 | 1601 | if (pwake) |
---|
1515 | | - ep_poll_safewake(&ep->poll_wait); |
---|
| 1602 | + ep_poll_safewake(ep, NULL, 0); |
---|
1516 | 1603 | |
---|
1517 | 1604 | return 0; |
---|
1518 | 1605 | |
---|
.. | .. |
---|
1531 | 1618 | * list, since that is used/cleaned only inside a section bound by "mtx". |
---|
1532 | 1619 | * And ep_insert() is called with "mtx" held. |
---|
1533 | 1620 | */ |
---|
1534 | | - spin_lock_irq(&ep->wq.lock); |
---|
| 1621 | + write_lock_irq(&ep->lock); |
---|
1535 | 1622 | if (ep_is_linked(epi)) |
---|
1536 | 1623 | list_del_init(&epi->rdllink); |
---|
1537 | | - spin_unlock_irq(&ep->wq.lock); |
---|
| 1624 | + write_unlock_irq(&ep->lock); |
---|
1538 | 1625 | |
---|
1539 | 1626 | wakeup_source_unregister(ep_wakeup_source(epi)); |
---|
1540 | 1627 | |
---|
.. | .. |
---|
1578 | 1665 | * 1) Flush epi changes above to other CPUs. This ensures |
---|
1579 | 1666 | * we do not miss events from ep_poll_callback if an |
---|
1580 | 1667 | * event occurs immediately after we call f_op->poll(). |
---|
1581 | | - * We need this because we did not take ep->wq.lock while |
---|
| 1668 | + * We need this because we did not take ep->lock while |
---|
1582 | 1669 | * changing epi above (but ep_poll_callback does take |
---|
1583 | | - * ep->wq.lock). |
---|
| 1670 | + * ep->lock). |
---|
1584 | 1671 | * |
---|
1585 | 1672 | * 2) We also need to ensure we do not miss _past_ events |
---|
1586 | 1673 | * when calling f_op->poll(). This barrier also |
---|
.. | .. |
---|
1599 | 1686 | * list, push it inside. |
---|
1600 | 1687 | */ |
---|
1601 | 1688 | if (ep_item_poll(epi, &pt, 1)) { |
---|
1602 | | - spin_lock_irq(&ep->wq.lock); |
---|
| 1689 | + write_lock_irq(&ep->lock); |
---|
1603 | 1690 | if (!ep_is_linked(epi)) { |
---|
1604 | 1691 | list_add_tail(&epi->rdllink, &ep->rdllist); |
---|
1605 | 1692 | ep_pm_stay_awake(epi); |
---|
1606 | 1693 | |
---|
1607 | 1694 | /* Notify waiting tasks that events are available */ |
---|
1608 | 1695 | if (waitqueue_active(&ep->wq)) |
---|
1609 | | - wake_up_locked(&ep->wq); |
---|
| 1696 | + wake_up(&ep->wq); |
---|
1610 | 1697 | if (waitqueue_active(&ep->poll_wait)) |
---|
1611 | 1698 | pwake++; |
---|
1612 | 1699 | } |
---|
1613 | | - spin_unlock_irq(&ep->wq.lock); |
---|
| 1700 | + write_unlock_irq(&ep->lock); |
---|
1614 | 1701 | } |
---|
1615 | 1702 | |
---|
1616 | 1703 | /* We have to call this outside the lock */ |
---|
1617 | 1704 | if (pwake) |
---|
1618 | | - ep_poll_safewake(&ep->poll_wait); |
---|
| 1705 | + ep_poll_safewake(ep, NULL, 0); |
---|
1619 | 1706 | |
---|
1620 | 1707 | return 0; |
---|
1621 | 1708 | } |
---|
.. | .. |
---|
1625 | 1712 | { |
---|
1626 | 1713 | struct ep_send_events_data *esed = priv; |
---|
1627 | 1714 | __poll_t revents; |
---|
1628 | | - struct epitem *epi; |
---|
1629 | | - struct epoll_event __user *uevent; |
---|
| 1715 | + struct epitem *epi, *tmp; |
---|
| 1716 | + struct epoll_event __user *uevent = esed->events; |
---|
1630 | 1717 | struct wakeup_source *ws; |
---|
1631 | 1718 | poll_table pt; |
---|
1632 | 1719 | |
---|
1633 | 1720 | init_poll_funcptr(&pt, NULL); |
---|
| 1721 | + esed->res = 0; |
---|
1634 | 1722 | |
---|
1635 | 1723 | /* |
---|
1636 | 1724 | * We can loop without lock because we are passed a task private list. |
---|
1637 | 1725 | * Items cannot vanish during the loop because ep_scan_ready_list() is |
---|
1638 | 1726 | * holding "mtx" during this call. |
---|
1639 | 1727 | */ |
---|
1640 | | - for (esed->res = 0, uevent = esed->events; |
---|
1641 | | - !list_empty(head) && esed->res < esed->maxevents;) { |
---|
1642 | | - epi = list_first_entry(head, struct epitem, rdllink); |
---|
| 1728 | + lockdep_assert_held(&ep->mtx); |
---|
| 1729 | + |
---|
| 1730 | + list_for_each_entry_safe(epi, tmp, head, rdllink) { |
---|
| 1731 | + if (esed->res >= esed->maxevents) |
---|
| 1732 | + break; |
---|
1643 | 1733 | |
---|
1644 | 1734 | /* |
---|
1645 | 1735 | * Activate ep->ws before deactivating epi->ws to prevent |
---|
.. | .. |
---|
1659 | 1749 | |
---|
1660 | 1750 | list_del_init(&epi->rdllink); |
---|
1661 | 1751 | |
---|
1662 | | - revents = ep_item_poll(epi, &pt, 1); |
---|
1663 | | - |
---|
1664 | 1752 | /* |
---|
1665 | 1753 | * If the event mask intersect the caller-requested one, |
---|
1666 | 1754 | * deliver the event to userspace. Again, ep_scan_ready_list() |
---|
1667 | | - * is holding "mtx", so no operations coming from userspace |
---|
| 1755 | + * is holding ep->mtx, so no operations coming from userspace |
---|
1668 | 1756 | * can change the item. |
---|
1669 | 1757 | */ |
---|
1670 | | - if (revents) { |
---|
1671 | | - if (__put_user(revents, &uevent->events) || |
---|
1672 | | - __put_user(epi->event.data, &uevent->data)) { |
---|
1673 | | - list_add(&epi->rdllink, head); |
---|
1674 | | - ep_pm_stay_awake(epi); |
---|
1675 | | - if (!esed->res) |
---|
1676 | | - esed->res = -EFAULT; |
---|
1677 | | - return 0; |
---|
1678 | | - } |
---|
1679 | | - esed->res++; |
---|
1680 | | - uevent++; |
---|
1681 | | - if (epi->event.events & EPOLLONESHOT) |
---|
1682 | | - epi->event.events &= EP_PRIVATE_BITS; |
---|
1683 | | - else if (!(epi->event.events & EPOLLET)) { |
---|
1684 | | - /* |
---|
1685 | | - * If this file has been added with Level |
---|
1686 | | - * Trigger mode, we need to insert back inside |
---|
1687 | | - * the ready list, so that the next call to |
---|
1688 | | - * epoll_wait() will check again the events |
---|
1689 | | - * availability. At this point, no one can insert |
---|
1690 | | - * into ep->rdllist besides us. The epoll_ctl() |
---|
1691 | | - * callers are locked out by |
---|
1692 | | - * ep_scan_ready_list() holding "mtx" and the |
---|
1693 | | - * poll callback will queue them in ep->ovflist. |
---|
1694 | | - */ |
---|
1695 | | - list_add_tail(&epi->rdllink, &ep->rdllist); |
---|
1696 | | - ep_pm_stay_awake(epi); |
---|
1697 | | - } |
---|
| 1758 | + revents = ep_item_poll(epi, &pt, 1); |
---|
| 1759 | + if (!revents) |
---|
| 1760 | + continue; |
---|
| 1761 | + |
---|
| 1762 | + if (__put_user(revents, &uevent->events) || |
---|
| 1763 | + __put_user(epi->event.data, &uevent->data)) { |
---|
| 1764 | + list_add(&epi->rdllink, head); |
---|
| 1765 | + ep_pm_stay_awake(epi); |
---|
| 1766 | + if (!esed->res) |
---|
| 1767 | + esed->res = -EFAULT; |
---|
| 1768 | + return 0; |
---|
| 1769 | + } |
---|
| 1770 | + esed->res++; |
---|
| 1771 | + uevent++; |
---|
| 1772 | + if (epi->event.events & EPOLLONESHOT) |
---|
| 1773 | + epi->event.events &= EP_PRIVATE_BITS; |
---|
| 1774 | + else if (!(epi->event.events & EPOLLET)) { |
---|
| 1775 | + /* |
---|
| 1776 | + * If this file has been added with Level |
---|
| 1777 | + * Trigger mode, we need to insert back inside |
---|
| 1778 | + * the ready list, so that the next call to |
---|
| 1779 | + * epoll_wait() will check again the events |
---|
| 1780 | + * availability. At this point, no one can insert |
---|
| 1781 | + * into ep->rdllist besides us. The epoll_ctl() |
---|
| 1782 | + * callers are locked out by |
---|
| 1783 | + * ep_scan_ready_list() holding "mtx" and the |
---|
| 1784 | + * poll callback will queue them in ep->ovflist. |
---|
| 1785 | + */ |
---|
| 1786 | + list_add_tail(&epi->rdllink, &ep->rdllist); |
---|
| 1787 | + ep_pm_stay_awake(epi); |
---|
1698 | 1788 | } |
---|
1699 | 1789 | } |
---|
1700 | 1790 | |
---|
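
The tail of ep_send_events_proc() is where the delivery modes diverge: EPOLLONESHOT strips the item down to EP_PRIVATE_BITS after one report, while a level-triggered item goes straight back onto the ready list. The difference is easy to observe from userspace (demo code, not part of the patch):

```c
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

static int wait_once(int epfd)
{
	struct epoll_event ev;

	return epoll_wait(epfd, &ev, 1, 0);	/* non-blocking poll */
}

int main(void)
{
	int epfd = epoll_create1(0);
	int efd = eventfd(1, 0);	/* counter > 0: permanently readable */
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = efd };
	int first, second;

	epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);
	first = wait_once(epfd);
	second = wait_once(epfd);
	/* Level-triggered: the item is re-queued after every delivery. */
	printf("level-triggered: %d then %d\n", first, second);	/* 1 then 1 */

	ev.events = EPOLLIN | EPOLLONESHOT;
	epoll_ctl(epfd, EPOLL_CTL_MOD, efd, &ev);
	first = wait_once(epfd);
	second = wait_once(epfd);
	/* Oneshot: disarmed after one report until the next EPOLL_CTL_MOD. */
	printf("oneshot: %d then %d\n", first, second);		/* 1 then 0 */

	close(efd);
	close(epfd);
	return 0;
}
```
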
.. | .. |
---|
1722 | 1812 | |
---|
1723 | 1813 | ktime_get_ts64(&now); |
---|
1724 | 1814 | return timespec64_add_safe(now, ts); |
---|
| 1815 | +} |
---|
| 1816 | + |
---|
| 1817 | +/* |
---|
| 1818 | + * autoremove_wake_function, but remove even on failure to wake up, because we |
---|
| 1819 | + * know that default_wake_function/ttwu will only fail if the thread is already |
---|
| 1820 | + * woken, and in that case the ep_poll loop will remove the entry anyway, not |
---|
| 1821 | + * try to reuse it. |
---|
| 1822 | + */ |
---|
| 1823 | +static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, |
---|
| 1824 | + unsigned int mode, int sync, void *key) |
---|
| 1825 | +{ |
---|
| 1826 | + int ret = default_wake_function(wq_entry, mode, sync, key); |
---|
| 1827 | + |
---|
| 1828 | + /* |
---|
| 1829 | + * Pairs with list_empty_careful in ep_poll, and ensures future loop |
---|
| 1830 | + * iterations see the cause of this wakeup. |
---|
| 1831 | + */ |
---|
| 1832 | + list_del_init_careful(&wq_entry->entry); |
---|
| 1833 | + return ret; |
---|
1725 | 1834 | } |
---|
1726 | 1835 | |
---|
1727 | 1836 | /** |
---|
.. | .. |
---|
1760 | 1869 | } else if (timeout == 0) { |
---|
1761 | 1870 | /* |
---|
1762 | 1871 | * Avoid the unnecessary trip to the wait queue loop, if the |
---|
1763 | | - * caller specified a non blocking operation. |
---|
| 1872 | + * caller specified a non-blocking operation. We still need the |
---|
| 1873 | + * lock because we could race with an epi being added to the |
---|
| 1874 | + * ready list from the irq callback and, without it, incorrectly |
---|
| 1875 | + * return 0 back to userspace. |
---|
1764 | 1876 | */ |
---|
1765 | 1877 | timed_out = 1; |
---|
1766 | | - spin_lock_irq(&ep->wq.lock); |
---|
1767 | | - goto check_events; |
---|
| 1878 | + |
---|
| 1879 | + write_lock_irq(&ep->lock); |
---|
| 1880 | + eavail = ep_events_available(ep); |
---|
| 1881 | + write_unlock_irq(&ep->lock); |
---|
| 1882 | + |
---|
| 1883 | + goto send_events; |
---|
1768 | 1884 | } |
---|
1769 | 1885 | |
---|
1770 | 1886 | fetch_events: |
---|
.. | .. |
---|
1772 | 1888 | if (!ep_events_available(ep)) |
---|
1773 | 1889 | ep_busy_loop(ep, timed_out); |
---|
1774 | 1890 | |
---|
1775 | | - spin_lock_irq(&ep->wq.lock); |
---|
1776 | | - |
---|
1777 | | - if (!ep_events_available(ep)) { |
---|
1778 | | - /* |
---|
1779 | | - * Busy poll timed out. Drop NAPI ID for now, we can add |
---|
1780 | | - * it back in when we have moved a socket with a valid NAPI |
---|
1781 | | - * ID onto the ready list. |
---|
1782 | | - */ |
---|
1783 | | - ep_reset_busy_poll_napi_id(ep); |
---|
1784 | | - |
---|
1785 | | - /* |
---|
1786 | | - * We don't have any available event to return to the caller. |
---|
1787 | | - * We need to sleep here, and we will be wake up by |
---|
1788 | | - * ep_poll_callback() when events will become available. |
---|
1789 | | - */ |
---|
1790 | | - init_waitqueue_entry(&wait, current); |
---|
1791 | | - __add_wait_queue_exclusive(&ep->wq, &wait); |
---|
1792 | | - |
---|
1793 | | - for (;;) { |
---|
1794 | | - /* |
---|
1795 | | - * We don't want to sleep if the ep_poll_callback() sends us |
---|
1796 | | - * a wakeup in between. That's why we set the task state |
---|
1797 | | - * to TASK_INTERRUPTIBLE before doing the checks. |
---|
1798 | | - */ |
---|
1799 | | - set_current_state(TASK_INTERRUPTIBLE); |
---|
1800 | | - /* |
---|
1801 | | - * Always short-circuit for fatal signals to allow |
---|
1802 | | - * threads to make a timely exit without the chance of |
---|
1803 | | - * finding more events available and fetching |
---|
1804 | | - * repeatedly. |
---|
1805 | | - */ |
---|
1806 | | - if (fatal_signal_pending(current)) { |
---|
1807 | | - res = -EINTR; |
---|
1808 | | - break; |
---|
1809 | | - } |
---|
1810 | | - if (ep_events_available(ep) || timed_out) |
---|
1811 | | - break; |
---|
1812 | | - if (signal_pending(current)) { |
---|
1813 | | - res = -EINTR; |
---|
1814 | | - break; |
---|
1815 | | - } |
---|
1816 | | - |
---|
1817 | | - spin_unlock_irq(&ep->wq.lock); |
---|
1818 | | - if (!freezable_schedule_hrtimeout_range(to, slack, |
---|
1819 | | - HRTIMER_MODE_ABS)) |
---|
1820 | | - timed_out = 1; |
---|
1821 | | - |
---|
1822 | | - spin_lock_irq(&ep->wq.lock); |
---|
1823 | | - } |
---|
1824 | | - |
---|
1825 | | - __remove_wait_queue(&ep->wq, &wait); |
---|
1826 | | - __set_current_state(TASK_RUNNING); |
---|
1827 | | - } |
---|
1828 | | -check_events: |
---|
1829 | | - /* Is it worth to try to dig for events ? */ |
---|
1830 | 1891 | eavail = ep_events_available(ep); |
---|
| 1892 | + if (eavail) |
---|
| 1893 | + goto send_events; |
---|
1831 | 1894 | |
---|
1832 | | - spin_unlock_irq(&ep->wq.lock); |
---|
| 1895 | + /* |
---|
| 1896 | + * Busy poll timed out. Drop NAPI ID for now, we can add |
---|
| 1897 | + * it back in when we have moved a socket with a valid NAPI |
---|
| 1898 | + * ID onto the ready list. |
---|
| 1899 | + */ |
---|
| 1900 | + ep_reset_busy_poll_napi_id(ep); |
---|
1833 | 1901 | |
---|
| 1902 | + do { |
---|
| 1903 | + /* |
---|
| 1904 | + * Internally init_wait() uses autoremove_wake_function(), |
---|
| 1905 | + * thus the wait entry is removed from the wait queue on each |
---|
| 1906 | + * wakeup. Why is this important? In case of several waiters, |
---|
| 1907 | + * each new wakeup will hit the next waiter, giving it the |
---|
| 1908 | + * chance to harvest new events. Otherwise a wakeup can be |
---|
| 1909 | + * lost. This is also good performance-wise, because on the |
---|
| 1910 | + * normal wakeup path there is no need to call __remove_wait_queue() |
---|
| 1911 | + * explicitly, thus ep->lock is not taken, which would halt |
---|
| 1912 | + * event delivery. |
---|
| 1913 | + * |
---|
| 1914 | + * In fact, we now use an even more aggressive function that |
---|
| 1915 | + * unconditionally removes, because we don't reuse the wait |
---|
| 1916 | + * entry between loop iterations. This lets us also avoid the |
---|
| 1917 | + * performance issue if a process is killed, causing all of its |
---|
| 1918 | + * threads to wake up without being removed normally. |
---|
| 1919 | + */ |
---|
| 1920 | + init_wait(&wait); |
---|
| 1921 | + wait.func = ep_autoremove_wake_function; |
---|
| 1922 | + |
---|
| 1923 | + write_lock_irq(&ep->lock); |
---|
| 1924 | + /* |
---|
| 1925 | + * Barrierless variant, waitqueue_active() is called under |
---|
| 1926 | + * the same lock on wakeup ep_poll_callback() side, so it |
---|
| 1927 | + * is safe to avoid an explicit barrier. |
---|
| 1928 | + */ |
---|
| 1929 | + __set_current_state(TASK_INTERRUPTIBLE); |
---|
| 1930 | + |
---|
| 1931 | + /* |
---|
| 1932 | + * Do the final check under the lock. ep_scan_ready_list() |
---|
| 1933 | + * plays with two lists (->rdllist and ->ovflist) and there |
---|
| 1934 | + * is always a race when both lists are empty for a short |
---|
| 1935 | + * period of time although events are pending, so the lock |
---|
| 1936 | + * is important. |
---|
| 1937 | + */ |
---|
| 1938 | + eavail = ep_events_available(ep); |
---|
| 1939 | + if (!eavail) { |
---|
| 1940 | + if (signal_pending(current)) |
---|
| 1941 | + res = -EINTR; |
---|
| 1942 | + else |
---|
| 1943 | + __add_wait_queue_exclusive(&ep->wq, &wait); |
---|
| 1944 | + } |
---|
| 1945 | + write_unlock_irq(&ep->lock); |
---|
| 1946 | + |
---|
| 1947 | + if (!eavail && !res) |
---|
| 1948 | + timed_out = !freezable_schedule_hrtimeout_range(to, slack, |
---|
| 1949 | + HRTIMER_MODE_ABS); |
---|
| 1950 | + |
---|
| 1951 | + /* |
---|
| 1952 | + * We were woken up, thus go and try to harvest some events. |
---|
| 1953 | + * If timed out and still on the wait queue, recheck eavail |
---|
| 1954 | + * carefully under lock, below. |
---|
| 1955 | + */ |
---|
| 1956 | + eavail = 1; |
---|
| 1957 | + } while (0); |
---|
| 1958 | + |
---|
| 1959 | + __set_current_state(TASK_RUNNING); |
---|
| 1960 | + |
---|
| 1961 | + if (!list_empty_careful(&wait.entry)) { |
---|
| 1962 | + write_lock_irq(&ep->lock); |
---|
| 1963 | + /* |
---|
| 1964 | + * If the thread timed out and is not on the wait queue, it |
---|
| 1965 | + * means that the thread was woken up after its timeout expired |
---|
| 1966 | + * but before it could reacquire the lock. Thus, when wait.entry is |
---|
| 1967 | + * empty, it needs to harvest events. |
---|
| 1968 | + */ |
---|
| 1969 | + if (timed_out) |
---|
| 1970 | + eavail = list_empty(&wait.entry); |
---|
| 1971 | + __remove_wait_queue(&ep->wq, &wait); |
---|
| 1972 | + write_unlock_irq(&ep->lock); |
---|
| 1973 | + } |
---|
| 1974 | + |
---|
| 1975 | +send_events: |
---|
| 1976 | + if (fatal_signal_pending(current)) { |
---|
| 1977 | + /* |
---|
| 1978 | + * Always short-circuit for fatal signals to allow |
---|
| 1979 | + * threads to make a timely exit without the chance of |
---|
| 1980 | + * finding more events available and fetching |
---|
| 1981 | + * repeatedly. |
---|
| 1982 | + */ |
---|
| 1983 | + res = -EINTR; |
---|
| 1984 | + } |
---|
1834 | 1985 | /* |
---|
1835 | 1986 | * Try to transfer events to user space. In case we get 0 events and |
---|
1836 | 1987 | * there's still timeout left over, we go trying again in search of |
---|
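
The rewritten ep_poll() above distinguishes three situations: timeout == 0 does a single locked check of the ready list and returns, a positive timeout sleeps on ep->wq until the hrtimer fires, and pending events are delivered immediately in either case. A small userspace exercise of those paths (demo code, not part of the patch):

```c
#include <stdint.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int epfd = epoll_create1(0);
	int efd = eventfd(0, 0);
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = efd };
	struct epoll_event out = { 0 };
	uint64_t one = 1;

	epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);

	/* timeout == 0: the timed_out fast path, one locked check of the
	 * ready list and an immediate return. */
	printf("empty, timeout 0:   %d\n", epoll_wait(epfd, &out, 1, 0));

	/* Positive timeout with nothing ready: sleep on ep->wq until the
	 * hrtimer fires, then return 0. */
	printf("empty, timeout 50:  %d\n", epoll_wait(epfd, &out, 1, 50));

	/* Pending event: delivered without sleeping at all. */
	if (write(efd, &one, sizeof(one)) != sizeof(one))
		return 1;
	printf("ready, timeout 0:   %d\n", epoll_wait(epfd, &out, 1, 0));

	close(efd);
	close(epfd);
	return 0;
}
```
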
.. | .. |
---|
1875 | 2026 | ep_tovisit = epi->ffd.file->private_data; |
---|
1876 | 2027 | if (ep_tovisit->gen == loop_check_gen) |
---|
1877 | 2028 | continue; |
---|
1878 | | - error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, |
---|
| 2029 | + error = ep_call_nested(&poll_loop_ncalls, |
---|
1879 | 2030 | ep_loop_check_proc, epi->ffd.file, |
---|
1880 | 2031 | ep_tovisit, current); |
---|
1881 | 2032 | if (error != 0) |
---|
.. | .. |
---|
1914 | 2065 | */ |
---|
1915 | 2066 | static int ep_loop_check(struct eventpoll *ep, struct file *file) |
---|
1916 | 2067 | { |
---|
1917 | | - return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, |
---|
| 2068 | + return ep_call_nested(&poll_loop_ncalls, |
---|
1918 | 2069 | ep_loop_check_proc, file, ep, current); |
---|
1919 | 2070 | } |
---|
1920 | 2071 | |
---|
.. | .. |
---|
1991 | 2142 | return do_epoll_create(0); |
---|
1992 | 2143 | } |
---|
1993 | 2144 | |
---|
1994 | | -/* |
---|
1995 | | - * The following function implements the controller interface for |
---|
1996 | | - * the eventpoll file that enables the insertion/removal/change of |
---|
1997 | | - * file descriptors inside the interest set. |
---|
1998 | | - */ |
---|
1999 | | -SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, |
---|
2000 | | - struct epoll_event __user *, event) |
---|
| 2145 | +static inline int epoll_mutex_lock(struct mutex *mutex, int depth, |
---|
| 2146 | + bool nonblock) |
---|
| 2147 | +{ |
---|
| 2148 | + if (!nonblock) { |
---|
| 2149 | + mutex_lock_nested(mutex, depth); |
---|
| 2150 | + return 0; |
---|
| 2151 | + } |
---|
| 2152 | + if (mutex_trylock(mutex)) |
---|
| 2153 | + return 0; |
---|
| 2154 | + return -EAGAIN; |
---|
| 2155 | +} |
---|
| 2156 | + |
---|
| 2157 | +int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, |
---|
| 2158 | + bool nonblock) |
---|
2001 | 2159 | { |
---|
2002 | 2160 | int error; |
---|
2003 | 2161 | int full_check = 0; |
---|
2004 | 2162 | struct fd f, tf; |
---|
2005 | 2163 | struct eventpoll *ep; |
---|
2006 | 2164 | struct epitem *epi; |
---|
2007 | | - struct epoll_event epds; |
---|
2008 | 2165 | struct eventpoll *tep = NULL; |
---|
2009 | | - |
---|
2010 | | - error = -EFAULT; |
---|
2011 | | - if (ep_op_has_event(op) && |
---|
2012 | | - copy_from_user(&epds, event, sizeof(struct epoll_event))) |
---|
2013 | | - goto error_return; |
---|
2014 | 2166 | |
---|
2015 | 2167 | error = -EBADF; |
---|
2016 | 2168 | f = fdget(epfd); |
---|
.. | .. |
---|
2029 | 2181 | |
---|
2030 | 2182 | /* Check if EPOLLWAKEUP is allowed */ |
---|
2031 | 2183 | if (ep_op_has_event(op)) |
---|
2032 | | - ep_take_care_of_epollwakeup(&epds); |
---|
| 2184 | + ep_take_care_of_epollwakeup(epds); |
---|
2033 | 2185 | |
---|
2034 | 2186 | /* |
---|
2035 | 2187 | * We have to check that the file structure underneath the file descriptor |
---|
.. | .. |
---|
2045 | 2197 | * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. |
---|
2046 | 2198 | * Also, we do not currently supported nested exclusive wakeups. |
---|
2047 | 2199 | */ |
---|
2048 | | - if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) { |
---|
| 2200 | + if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { |
---|
2049 | 2201 | if (op == EPOLL_CTL_MOD) |
---|
2050 | 2202 | goto error_tgt_fput; |
---|
2051 | 2203 | if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || |
---|
2052 | | - (epds.events & ~EPOLLEXCLUSIVE_OK_BITS))) |
---|
| 2204 | + (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) |
---|
2053 | 2205 | goto error_tgt_fput; |
---|
2054 | 2206 | } |
---|
2055 | 2207 | |
---|
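
This check is why EPOLLEXCLUSIVE is accepted only at EPOLL_CTL_ADD time and only with a restricted event mask; epoll_ctl(2) documents the same rule. A quick demonstration (demo code, not part of the patch):

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int epfd = epoll_create1(0);
	int efd = eventfd(0, 0);
	struct epoll_event ev = { .events = EPOLLIN | EPOLLEXCLUSIVE,
				  .data.fd = efd };

	/* ADD with EPOLLEXCLUSIVE and an allowed event mask succeeds. */
	int ret = epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);
	printf("ADD with EPOLLEXCLUSIVE: %d\n", ret);

	/* MOD with EPOLLEXCLUSIVE is rejected, matching the
	 * "op == EPOLL_CTL_MOD" branch above. */
	ret = epoll_ctl(epfd, EPOLL_CTL_MOD, efd, &ev);
	printf("MOD with EPOLLEXCLUSIVE: %d (%s)\n", ret,
	       ret < 0 ? strerror(errno) : "ok");

	close(efd);
	close(epfd);
	return 0;
}
```
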
.. | .. |
---|
2074 | 2226 | * deep wakeup paths from forming in parallel through multiple |
---|
2075 | 2227 | * EPOLL_CTL_ADD operations. |
---|
2076 | 2228 | */ |
---|
2077 | | - mutex_lock_nested(&ep->mtx, 0); |
---|
| 2229 | + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); |
---|
| 2230 | + if (error) |
---|
| 2231 | + goto error_tgt_fput; |
---|
2078 | 2232 | if (op == EPOLL_CTL_ADD) { |
---|
2079 | 2233 | if (!list_empty(&f.file->f_ep_links) || |
---|
2080 | 2234 | ep->gen == loop_check_gen || |
---|
2081 | 2235 | is_file_epoll(tf.file)) { |
---|
2082 | | - full_check = 1; |
---|
2083 | 2236 | mutex_unlock(&ep->mtx); |
---|
2084 | | - mutex_lock(&epmutex); |
---|
| 2237 | + error = epoll_mutex_lock(&epmutex, 0, nonblock); |
---|
| 2238 | + if (error) |
---|
| 2239 | + goto error_tgt_fput; |
---|
| 2240 | + loop_check_gen++; |
---|
| 2241 | + full_check = 1; |
---|
2085 | 2242 | if (is_file_epoll(tf.file)) { |
---|
2086 | 2243 | error = -ELOOP; |
---|
2087 | 2244 | if (ep_loop_check(ep, tf.file) != 0) |
---|
.. | .. |
---|
2091 | 2248 | list_add(&tf.file->f_tfile_llink, |
---|
2092 | 2249 | &tfile_check_list); |
---|
2093 | 2250 | } |
---|
2094 | | - mutex_lock_nested(&ep->mtx, 0); |
---|
| 2251 | + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); |
---|
| 2252 | + if (error) |
---|
| 2253 | + goto error_tgt_fput; |
---|
2095 | 2254 | if (is_file_epoll(tf.file)) { |
---|
2096 | 2255 | tep = tf.file->private_data; |
---|
2097 | | - mutex_lock_nested(&tep->mtx, 1); |
---|
| 2256 | + error = epoll_mutex_lock(&tep->mtx, 1, nonblock); |
---|
| 2257 | + if (error) { |
---|
| 2258 | + mutex_unlock(&ep->mtx); |
---|
| 2259 | + goto error_tgt_fput; |
---|
| 2260 | + } |
---|
2098 | 2261 | } |
---|
2099 | 2262 | } |
---|
2100 | 2263 | } |
---|
.. | .. |
---|
2110 | 2273 | switch (op) { |
---|
2111 | 2274 | case EPOLL_CTL_ADD: |
---|
2112 | 2275 | if (!epi) { |
---|
2113 | | - epds.events |= EPOLLERR | EPOLLHUP; |
---|
2114 | | - error = ep_insert(ep, &epds, tf.file, fd, full_check); |
---|
| 2276 | + epds->events |= EPOLLERR | EPOLLHUP; |
---|
| 2277 | + error = ep_insert(ep, epds, tf.file, fd, full_check); |
---|
2115 | 2278 | } else |
---|
2116 | 2279 | error = -EEXIST; |
---|
2117 | 2280 | break; |
---|
.. | .. |
---|
2124 | 2287 | case EPOLL_CTL_MOD: |
---|
2125 | 2288 | if (epi) { |
---|
2126 | 2289 | if (!(epi->event.events & EPOLLEXCLUSIVE)) { |
---|
2127 | | - epds.events |= EPOLLERR | EPOLLHUP; |
---|
2128 | | - error = ep_modify(ep, epi, &epds); |
---|
| 2290 | + epds->events |= EPOLLERR | EPOLLHUP; |
---|
| 2291 | + error = ep_modify(ep, epi, epds); |
---|
2129 | 2292 | } |
---|
2130 | 2293 | } else |
---|
2131 | 2294 | error = -ENOENT; |
---|
.. | .. |
---|
2151 | 2314 | } |
---|
2152 | 2315 | |
---|
2153 | 2316 | /* |
---|
| 2317 | + * The following function implements the controller interface for |
---|
| 2318 | + * the eventpoll file that enables the insertion/removal/change of |
---|
| 2319 | + * file descriptors inside the interest set. |
---|
| 2320 | + */ |
---|
| 2321 | +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, |
---|
| 2322 | + struct epoll_event __user *, event) |
---|
| 2323 | +{ |
---|
| 2324 | + struct epoll_event epds; |
---|
| 2325 | + |
---|
| 2326 | + if (ep_op_has_event(op) && |
---|
| 2327 | + copy_from_user(&epds, event, sizeof(struct epoll_event))) |
---|
| 2328 | + return -EFAULT; |
---|
| 2329 | + |
---|
| 2330 | + return do_epoll_ctl(epfd, op, fd, &epds, false); |
---|
| 2331 | +} |
---|
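
The wrapper copies the event from userspace only when ep_op_has_event() says the operation needs one, which is why EPOLL_CTL_DEL may be passed a NULL event pointer on current kernels (demo code, not part of the patch):

```c
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int epfd = epoll_create1(0);
	int efd = eventfd(0, 0);
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = efd };

	epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);

	/* DEL ignores the event argument, so NULL is fine here; only ADD
	 * and MOD reach the copy_from_user() in the wrapper above. */
	int ret = epoll_ctl(epfd, EPOLL_CTL_DEL, efd, NULL);
	printf("EPOLL_CTL_DEL with NULL event: %d\n", ret);

	close(efd);
	close(epfd);
	return 0;
}
```
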
| 2332 | + |
---|
| 2333 | +/* |
---|
2154 | 2334 | * Implement the event wait interface for the eventpoll file. It is the kernel |
---|
2155 | 2335 | * part of the user space epoll_wait(2). |
---|
2156 | 2336 | */ |
---|
.. | .. |
---|
2166 | 2346 | return -EINVAL; |
---|
2167 | 2347 | |
---|
2168 | 2348 | /* Verify that the area passed by the user is writeable */ |
---|
2169 | | - if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) |
---|
| 2349 | + if (!access_ok(events, maxevents * sizeof(struct epoll_event))) |
---|
2170 | 2350 | return -EFAULT; |
---|
2171 | 2351 | |
---|
2172 | 2352 | /* Get the "struct file *" for the eventpoll file */ |
---|
.. | .. |
---|
2211 | 2391 | size_t, sigsetsize) |
---|
2212 | 2392 | { |
---|
2213 | 2393 | int error; |
---|
2214 | | - sigset_t ksigmask, sigsaved; |
---|
2215 | 2394 | |
---|
2216 | 2395 | /* |
---|
2217 | 2396 | * If the caller wants a certain signal mask to be set during the wait, |
---|
2218 | 2397 | * we apply it here. |
---|
2219 | 2398 | */ |
---|
2220 | | - if (sigmask) { |
---|
2221 | | - if (sigsetsize != sizeof(sigset_t)) |
---|
2222 | | - return -EINVAL; |
---|
2223 | | - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) |
---|
2224 | | - return -EFAULT; |
---|
2225 | | - sigsaved = current->blocked; |
---|
2226 | | - set_current_blocked(&ksigmask); |
---|
2227 | | - } |
---|
| 2399 | + error = set_user_sigmask(sigmask, sigsetsize); |
---|
| 2400 | + if (error) |
---|
| 2401 | + return error; |
---|
2228 | 2402 | |
---|
2229 | 2403 | error = do_epoll_wait(epfd, events, maxevents, timeout); |
---|
2230 | | - |
---|
2231 | | - /* |
---|
2232 | | - * If we changed the signal mask, we need to restore the original one. |
---|
2233 | | - * In case we've got a signal while waiting, we do not restore the |
---|
2234 | | - * signal mask yet, and we allow do_signal() to deliver the signal on |
---|
2235 | | - * the way back to userspace, before the signal mask is restored. |
---|
2236 | | - */ |
---|
2237 | | - if (sigmask) { |
---|
2238 | | - if (error == -EINTR) { |
---|
2239 | | - memcpy(¤t->saved_sigmask, &sigsaved, |
---|
2240 | | - sizeof(sigsaved)); |
---|
2241 | | - set_restore_sigmask(); |
---|
2242 | | - } else |
---|
2243 | | - set_current_blocked(&sigsaved); |
---|
2244 | | - } |
---|
| 2404 | + restore_saved_sigmask_unless(error == -EINTR); |
---|
2245 | 2405 | |
---|
2246 | 2406 | return error; |
---|
2247 | 2407 | } |
---|
.. | .. |
---|
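
epoll_pwait() now routes the temporary mask through set_user_sigmask() and restore_saved_sigmask_unless(), but the userspace contract is unchanged: the supplied mask is installed atomically for the duration of the wait, closing the classic signal-versus-poll race. Typical usage (demo code, not part of the patch):

```c
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

static void on_sigint(int sig) { (void)sig; }

int main(void)
{
	int epfd = epoll_create1(0);
	int efd = eventfd(0, 0);
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = efd };
	struct epoll_event out;
	struct sigaction sa = { 0 };
	sigset_t blocked, waitmask;

	epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);
	sa.sa_handler = on_sigint;
	sigaction(SIGINT, &sa, NULL);

	/* Keep SIGINT blocked during normal execution... */
	sigemptyset(&blocked);
	sigaddset(&blocked, SIGINT);
	sigprocmask(SIG_BLOCK, &blocked, NULL);

	/* ...but let it through atomically for the duration of the wait:
	 * a Ctrl-C within these five seconds interrupts the call with
	 * EINTR, with no window in which the signal can be lost. */
	sigemptyset(&waitmask);
	int n = epoll_pwait(epfd, &out, 1, 5000, &waitmask);
	printf("epoll_pwait returned %d (%s)\n", n,
	       n < 0 ? strerror(errno) : "timed out or got an event");

	close(efd);
	close(epfd);
	return 0;
}
```
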
2254 | 2414 | compat_size_t, sigsetsize) |
---|
2255 | 2415 | { |
---|
2256 | 2416 | long err; |
---|
2257 | | - sigset_t ksigmask, sigsaved; |
---|
2258 | 2417 | |
---|
2259 | 2418 | /* |
---|
2260 | 2419 | * If the caller wants a certain signal mask to be set during the wait, |
---|
2261 | 2420 | * we apply it here. |
---|
2262 | 2421 | */ |
---|
2263 | | - if (sigmask) { |
---|
2264 | | - if (sigsetsize != sizeof(compat_sigset_t)) |
---|
2265 | | - return -EINVAL; |
---|
2266 | | - if (get_compat_sigset(&ksigmask, sigmask)) |
---|
2267 | | - return -EFAULT; |
---|
2268 | | - sigsaved = current->blocked; |
---|
2269 | | - set_current_blocked(&ksigmask); |
---|
2270 | | - } |
---|
| 2422 | + err = set_compat_user_sigmask(sigmask, sigsetsize); |
---|
| 2423 | + if (err) |
---|
| 2424 | + return err; |
---|
2271 | 2425 | |
---|
2272 | 2426 | err = do_epoll_wait(epfd, events, maxevents, timeout); |
---|
2273 | | - |
---|
2274 | | - /* |
---|
2275 | | - * If we changed the signal mask, we need to restore the original one. |
---|
2276 | | - * In case we've got a signal while waiting, we do not restore the |
---|
2277 | | - * signal mask yet, and we allow do_signal() to deliver the signal on |
---|
2278 | | - * the way back to userspace, before the signal mask is restored. |
---|
2279 | | - */ |
---|
2280 | | - if (sigmask) { |
---|
2281 | | - if (err == -EINTR) { |
---|
2282 | | - memcpy(¤t->saved_sigmask, &sigsaved, |
---|
2283 | | - sizeof(sigsaved)); |
---|
2284 | | - set_restore_sigmask(); |
---|
2285 | | - } else |
---|
2286 | | - set_current_blocked(&sigsaved); |
---|
2287 | | - } |
---|
| 2427 | + restore_saved_sigmask_unless(err == -EINTR); |
---|
2288 | 2428 | |
---|
2289 | 2429 | return err; |
---|
2290 | 2430 | } |
---|
.. | .. |
---|
2307 | 2447 | * inclusion loops checks. |
---|
2308 | 2448 | */ |
---|
2309 | 2449 | ep_nested_calls_init(&poll_loop_ncalls); |
---|
2310 | | - |
---|
2311 | | -#ifdef CONFIG_DEBUG_LOCK_ALLOC |
---|
2312 | | - /* Initialize the structure used to perform safe poll wait head wake ups */ |
---|
2313 | | - ep_nested_calls_init(&poll_safewake_ncalls); |
---|
2314 | | -#endif |
---|
2315 | 2450 | |
---|
2316 | 2451 | /* |
---|
2317 | 2452 | * We can have many thousands of epitems, so prevent this from |
---|