| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * fs/eventpoll.c (Efficient event retrieval implementation) |
|---|
| 3 | 4 | * Copyright (C) 2001,...,2009 Davide Libenzi |
|---|
| 4 | 5 | * |
|---|
| 5 | | - * This program is free software; you can redistribute it and/or modify |
|---|
| 6 | | - * it under the terms of the GNU General Public License as published by |
|---|
| 7 | | - * the Free Software Foundation; either version 2 of the License, or |
|---|
| 8 | | - * (at your option) any later version. |
|---|
| 9 | | - * |
|---|
| 10 | 6 | * Davide Libenzi <davidel@xmailserver.org> |
|---|
| 11 | | - * |
|---|
| 12 | 7 | */ |
|---|
| 13 | 8 | |
|---|
| 14 | 9 | #include <linux/init.h> |
|---|
| .. | .. |
|---|
| 45 | 40 | #include <linux/rculist.h> |
|---|
| 46 | 41 | #include <net/busy_poll.h> |
|---|
| 47 | 42 | |
|---|
| 43 | +#include <trace/hooks/fs.h> |
|---|
| 44 | + |
|---|
| 48 | 45 | /* |
|---|
| 49 | 46 | * LOCKING: |
|---|
| 50 | 47 | * There are three levels of locking required by epoll: |
|---|
| 51 | 48 | * |
|---|
| 52 | 49 | * 1) epmutex (mutex) |
|---|
| 53 | 50 | * 2) ep->mtx (mutex) |
|---|
| 54 | | - * 3) ep->wq.lock (spinlock) |
|---|
| 51 | + * 3) ep->lock (rwlock) |
|---|
| 55 | 52 | * |
|---|
| 56 | 53 | * The acquire order is the one listed above, from 1 to 3. |
|---|
| 57 | | - * We need a spinlock (ep->wq.lock) because we manipulate objects |
|---|
| 54 | + * We need a rwlock (ep->lock) because we manipulate objects |
|---|
| 58 | 55 | * from inside the poll callback, that might be triggered from |
|---|
| 59 | 56 | * a wake_up() that in turn might be called from IRQ context. |
|---|
| 60 | 57 | * So we can't sleep inside the poll callback and hence we need |
|---|
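Since the patch replaces the `ep->wq.lock` spinlock with the `ep->lock` rwlock as the third level, a minimal sketch of the acquire order described above may help orient the rest of the diff. This is illustrative only, not code from the patch; no single real path takes all three locks unconditionally:

```c
/* Illustrative nesting only -- mirrors the comment's acquire order. */
mutex_lock(&epmutex);        /* 1) global, rare: loop checks, final cleanup   */
mutex_lock(&ep->mtx);        /* 2) per-instance: epoll_ctl(), event transfer  */
write_lock_irq(&ep->lock);   /* 3) protects ->rdllist/->ovflist; the poll     */
                             /*    callback takes it for reading from IRQ     */
                             /*    context via read_lock_irqsave()            */
write_unlock_irq(&ep->lock);
mutex_unlock(&ep->mtx);
mutex_unlock(&epmutex);
```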
| .. | .. |
|---|
| 86 | 83 | * of epoll file descriptors, we use the current recursion depth as |
|---|
| 87 | 84 | * the lockdep subkey. |
|---|
| 88 | 85 | * It is possible to drop the "ep->mtx" and to use the global |
|---|
| 89 | | - * mutex "epmutex" (together with "ep->wq.lock") to have it working, |
|---|
| 86 | + * mutex "epmutex" (together with "ep->lock") to have it working, |
|---|
| 90 | 87 | * but having "ep->mtx" will make the interface more scalable. |
|---|
| 91 | 88 | * Events that require holding "epmutex" are very rare, while for |
|---|
| 92 | 89 | * normal operations the epoll private "ep->mtx" will guarantee |
|---|
| .. | .. |
|---|
| 183 | 180 | * This structure is stored inside the "private_data" member of the file |
|---|
| 184 | 181 | * structure and represents the main data structure for the eventpoll |
|---|
| 185 | 182 | * interface. |
|---|
| 186 | | - * |
|---|
| 187 | | - * Access to it is protected by the lock inside wq. |
|---|
| 188 | 183 | */ |
|---|
| 189 | 184 | struct eventpoll { |
|---|
| 190 | 185 | /* |
|---|
| .. | .. |
|---|
| 204 | 199 | /* List of ready file descriptors */ |
|---|
| 205 | 200 | struct list_head rdllist; |
|---|
| 206 | 201 | |
|---|
| 202 | + /* Lock which protects rdllist and ovflist */ |
|---|
| 203 | + rwlock_t lock; |
|---|
| 204 | + |
|---|
| 207 | 205 | /* RB tree root used to store monitored fd structs */ |
|---|
| 208 | 206 | struct rb_root_cached rbr; |
|---|
| 209 | 207 | |
|---|
| 210 | 208 | /* |
|---|
| 211 | 209 | * This is a singly linked list that chains all the "struct epitem" on which |
|---|
| 212 | 210 | * events happened while transferring ready events to userspace without |
|---|
| 213 | | - * holding ->wq.lock. |
|---|
| 211 | + * holding ->lock. |
|---|
| 214 | 212 | */ |
|---|
| 215 | 213 | struct epitem *ovflist; |
|---|
| 216 | 214 | |
|---|
| .. | .. |
|---|
| 228 | 226 | #ifdef CONFIG_NET_RX_BUSY_POLL |
|---|
| 229 | 227 | /* used to track busy poll napi_id */ |
|---|
| 230 | 228 | unsigned int napi_id; |
|---|
| 229 | +#endif |
|---|
| 230 | + |
|---|
| 231 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC |
|---|
| 232 | + /* tracks wakeup nests for lockdep validation */ |
|---|
| 233 | + u8 nests; |
|---|
| 231 | 234 | #endif |
|---|
| 232 | 235 | }; |
|---|
| 233 | 236 | |
|---|
| .. | .. |
|---|
| 294 | 297 | |
|---|
| 295 | 298 | #include <linux/sysctl.h> |
|---|
| 296 | 299 | |
|---|
| 297 | | -static long zero; |
|---|
| 300 | +static long long_zero; |
|---|
| 298 | 301 | static long long_max = LONG_MAX; |
|---|
| 299 | 302 | |
|---|
| 300 | 303 | struct ctl_table epoll_table[] = { |
|---|
| .. | .. |
|---|
| 304 | 307 | .maxlen = sizeof(max_user_watches), |
|---|
| 305 | 308 | .mode = 0644, |
|---|
| 306 | 309 | .proc_handler = proc_doulongvec_minmax, |
|---|
| 307 | | - .extra1 = &zero, |
|---|
| 310 | + .extra1 = &long_zero, |
|---|
| 308 | 311 | .extra2 = &long_max, |
|---|
| 309 | 312 | }, |
|---|
| 310 | 313 | { } |
|---|
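The only change to this sysctl entry is the type of the lower bound (a long named long_zero instead of the old zero), matching what proc_doulongvec_minmax() expects; the knob itself behaves as before. For orientation, a small userspace reader of the exposed limit, assuming the standard /proc/sys/fs/epoll/max_user_watches path:

```c
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/fs/epoll/max_user_watches", "r");
	long max_watches;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%ld", &max_watches) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("per-user epoll watch limit: %ld\n", max_watches);
	return 0;
}
```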
| .. | .. |
|---|
| 357 | 360 | return container_of(p, struct ep_pqueue, pt)->epi; |
|---|
| 358 | 361 | } |
|---|
| 359 | 362 | |
|---|
| 360 | | -/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ |
|---|
| 361 | | -static inline int ep_op_has_event(int op) |
|---|
| 362 | | -{ |
|---|
| 363 | | - return op != EPOLL_CTL_DEL; |
|---|
| 364 | | -} |
|---|
| 365 | | - |
|---|
| 366 | 363 | /* Initialize the poll safe wake up structure */ |
|---|
| 367 | 364 | static void ep_nested_calls_init(struct nested_calls *ncalls) |
|---|
| 368 | 365 | { |
|---|
| .. | .. |
|---|
| 380 | 377 | */ |
|---|
| 381 | 378 | static inline int ep_events_available(struct eventpoll *ep) |
|---|
| 382 | 379 | { |
|---|
| 383 | | - return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; |
|---|
| 380 | + return !list_empty_careful(&ep->rdllist) || |
|---|
| 381 | + READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; |
|---|
| 384 | 382 | } |
|---|
| 385 | 383 | |
|---|
| 386 | 384 | #ifdef CONFIG_NET_RX_BUSY_POLL |
|---|
| .. | .. |
|---|
| 470 | 468 | * not re-entered. |
|---|
| 471 | 469 | * |
|---|
| 472 | 470 | * @ncalls: Pointer to the nested_calls structure to be used for this call. |
|---|
| 473 | | - * @max_nests: Maximum number of allowed nesting calls. |
|---|
| 474 | 471 | * @nproc: Nested call core function pointer. |
|---|
| 475 | 472 | * @priv: Opaque data to be passed to the @nproc callback. |
|---|
| 476 | 473 | * @cookie: Cookie to be used to identify this nested call. |
|---|
| .. | .. |
|---|
| 479 | 476 | * Returns: Returns the code returned by the @nproc callback, or -1 if |
|---|
| 480 | 477 | * the maximum recursion limit has been exceeded. |
|---|
| 481 | 478 | */ |
|---|
| 482 | | -static int ep_call_nested(struct nested_calls *ncalls, int max_nests, |
|---|
| 479 | +static int ep_call_nested(struct nested_calls *ncalls, |
|---|
| 483 | 480 | int (*nproc)(void *, void *, int), void *priv, |
|---|
| 484 | 481 | void *cookie, void *ctx) |
|---|
| 485 | 482 | { |
|---|
| .. | .. |
|---|
| 498 | 495 | */ |
|---|
| 499 | 496 | list_for_each_entry(tncur, lsthead, llink) { |
|---|
| 500 | 497 | if (tncur->ctx == ctx && |
|---|
| 501 | | - (tncur->cookie == cookie || ++call_nests > max_nests)) { |
|---|
| 498 | + (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) { |
|---|
| 502 | 499 | /* |
|---|
| 503 | 500 | * Oops ... loop detected or maximum nest level reached. |
|---|
| 504 | 501 | * We abort this wake by breaking the cycle itself. |
|---|
| .. | .. |
|---|
| 554 | 551 | */ |
|---|
| 555 | 552 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
|---|
| 556 | 553 | |
|---|
| 557 | | -static struct nested_calls poll_safewake_ncalls; |
|---|
| 558 | | - |
|---|
| 559 | | -static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) |
|---|
| 554 | +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, |
|---|
| 555 | + unsigned pollflags) |
|---|
| 560 | 556 | { |
|---|
| 557 | + struct eventpoll *ep_src; |
|---|
| 561 | 558 | unsigned long flags; |
|---|
| 562 | | - wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; |
|---|
| 559 | + u8 nests = 0; |
|---|
| 563 | 560 | |
|---|
| 564 | | - spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); |
|---|
| 565 | | - wake_up_locked_poll(wqueue, EPOLLIN); |
|---|
| 566 | | - spin_unlock_irqrestore(&wqueue->lock, flags); |
|---|
| 567 | | - |
|---|
| 568 | | - return 0; |
|---|
| 569 | | -} |
|---|
| 570 | | - |
|---|
| 571 | | -static void ep_poll_safewake(wait_queue_head_t *wq) |
|---|
| 572 | | -{ |
|---|
| 573 | | - int this_cpu = get_cpu_light(); |
|---|
| 574 | | - |
|---|
| 575 | | - ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, |
|---|
| 576 | | - ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); |
|---|
| 577 | | - |
|---|
| 578 | | - put_cpu_light(); |
|---|
| 561 | + /* |
|---|
| 562 | + * To set the subclass or nesting level for spin_lock_irqsave_nested() |
|---|
| 563 | + * it might be natural to create a per-cpu nest count. However, since |
|---|
| 564 | + * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can |
|---|
| 565 | + * schedule() in the -rt kernel, the per-cpu variable is no longer |
|---|
| 566 | + * protected. Thus, we are introducing a per eventpoll nest field. |
|---|
| 567 | + * If we are not being called from ep_poll_callback(), epi is NULL and |
|---|
| 568 | + * we are at the first level of nesting, 0. Otherwise, we are being |
|---|
| 569 | + * called from ep_poll_callback() and if a previous wakeup source is |
|---|
| 570 | + * not an epoll file itself, we are at depth 1 since the wakeup source |
|---|
| 571 | + * is depth 0. If the wakeup source is a previous epoll file in the |
|---|
| 572 | + * wakeup chain then we use its nests value and record ours as |
|---|
| 573 | + * nests + 1. The previous epoll file's nests value is stable since it is |
|---|
| 574 | + * already holding its own poll_wait.lock. |
|---|
| 575 | + */ |
|---|
| 576 | + if (epi) { |
|---|
| 577 | + if ((is_file_epoll(epi->ffd.file))) { |
|---|
| 578 | + ep_src = epi->ffd.file->private_data; |
|---|
| 579 | + nests = ep_src->nests; |
|---|
| 580 | + } else { |
|---|
| 581 | + nests = 1; |
|---|
| 582 | + } |
|---|
| 583 | + } |
|---|
| 584 | + spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); |
|---|
| 585 | + ep->nests = nests + 1; |
|---|
| 586 | + wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); |
|---|
| 587 | + ep->nests = 0; |
|---|
| 588 | + spin_unlock_irqrestore(&ep->poll_wait.lock, flags); |
|---|
| 579 | 589 | } |
|---|
| 580 | 590 | |
|---|
| 581 | 591 | #else |
|---|
| 582 | 592 | |
|---|
| 583 | | -static void ep_poll_safewake(wait_queue_head_t *wq) |
|---|
| 593 | +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, |
|---|
| 594 | + unsigned pollflags) |
|---|
| 584 | 595 | { |
|---|
| 585 | | - wake_up_poll(wq, EPOLLIN); |
|---|
| 596 | + wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); |
|---|
| 586 | 597 | } |
|---|
| 587 | 598 | |
|---|
| 588 | 599 | #endif |
|---|
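The nests bookkeeping above only matters when one epoll file is itself watched by another epoll file, which is the situation ep_poll_safewake() exists for. A minimal userspace illustration of such a nested setup (error handling trimmed, variable names are mine, not from the patch):

```c
#include <sys/epoll.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	int inner = epoll_create1(0);
	int outer = epoll_create1(0);
	int pipefd[2];
	struct epoll_event ev = { .events = EPOLLIN };

	pipe(pipefd);

	ev.data.fd = pipefd[0];
	epoll_ctl(inner, EPOLL_CTL_ADD, pipefd[0], &ev);  /* inner watches the pipe        */

	ev.data.fd = inner;
	epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);      /* outer watches the inner epoll */

	write(pipefd[1], "x", 1);        /* wakes inner, whose poll_wait wakeup must   */
	epoll_wait(outer, &ev, 1, -1);   /* propagate to outer via ep_poll_safewake()  */
	printf("outer epoll reported fd %d ready\n", ev.data.fd);
	return 0;
}
```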
| .. | .. |
|---|
| 674 | 685 | void *priv, int depth, bool ep_locked) |
|---|
| 675 | 686 | { |
|---|
| 676 | 687 | __poll_t res; |
|---|
| 677 | | - int pwake = 0; |
|---|
| 678 | 688 | struct epitem *epi, *nepi; |
|---|
| 679 | 689 | LIST_HEAD(txlist); |
|---|
| 680 | 690 | |
|---|
| .. | .. |
|---|
| 696 | 706 | * because we want the "sproc" callback to be able to do it |
|---|
| 697 | 707 | * in a lockless way. |
|---|
| 698 | 708 | */ |
|---|
| 699 | | - spin_lock_irq(&ep->wq.lock); |
|---|
| 709 | + write_lock_irq(&ep->lock); |
|---|
| 700 | 710 | list_splice_init(&ep->rdllist, &txlist); |
|---|
| 701 | | - ep->ovflist = NULL; |
|---|
| 702 | | - spin_unlock_irq(&ep->wq.lock); |
|---|
| 711 | + WRITE_ONCE(ep->ovflist, NULL); |
|---|
| 712 | + write_unlock_irq(&ep->lock); |
|---|
| 703 | 713 | |
|---|
| 704 | 714 | /* |
|---|
| 705 | 715 | * Now call the callback function. |
|---|
| 706 | 716 | */ |
|---|
| 707 | 717 | res = (*sproc)(ep, &txlist, priv); |
|---|
| 708 | 718 | |
|---|
| 709 | | - spin_lock_irq(&ep->wq.lock); |
|---|
| 719 | + write_lock_irq(&ep->lock); |
|---|
| 710 | 720 | /* |
|---|
| 711 | 721 | * During the time we spent inside the "sproc" callback, some |
|---|
| 712 | 722 | * other events might have been queued by the poll callback. |
|---|
| 713 | 723 | * We re-insert them inside the main ready-list here. |
|---|
| 714 | 724 | */ |
|---|
| 715 | | - for (nepi = ep->ovflist; (epi = nepi) != NULL; |
|---|
| 725 | + for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; |
|---|
| 716 | 726 | nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { |
|---|
| 717 | 727 | /* |
|---|
| 718 | 728 | * We need to check if the item is already in the list. |
|---|
| .. | .. |
|---|
| 721 | 731 | * contain them, and the list_splice() below takes care of them. |
|---|
| 722 | 732 | */ |
|---|
| 723 | 733 | if (!ep_is_linked(epi)) { |
|---|
| 724 | | - list_add_tail(&epi->rdllink, &ep->rdllist); |
|---|
| 734 | + /* |
|---|
| 735 | + * ->ovflist is LIFO, so we have to reverse it in order |
|---|
| 736 | + * to keep it in FIFO order. |
|---|
| 737 | + */ |
|---|
| 738 | + list_add(&epi->rdllink, &ep->rdllist); |
|---|
| 725 | 739 | ep_pm_stay_awake(epi); |
|---|
| 726 | 740 | } |
|---|
| 727 | 741 | } |
|---|
| .. | .. |
|---|
| 730 | 744 | * releasing the lock, events will be queued in the normal way inside |
|---|
| 731 | 745 | * ep->rdllist. |
|---|
| 732 | 746 | */ |
|---|
| 733 | | - ep->ovflist = EP_UNACTIVE_PTR; |
|---|
| 747 | + WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); |
|---|
| 734 | 748 | |
|---|
| 735 | 749 | /* |
|---|
| 736 | 750 | * Quickly re-inject items left on "txlist". |
|---|
| .. | .. |
|---|
| 739 | 753 | __pm_relax(ep->ws); |
|---|
| 740 | 754 | |
|---|
| 741 | 755 | if (!list_empty(&ep->rdllist)) { |
|---|
| 742 | | - /* |
|---|
| 743 | | - * Wake up (if active) both the eventpoll wait list and |
|---|
| 744 | | - * the ->poll() wait list (delayed after we release the lock). |
|---|
| 745 | | - */ |
|---|
| 746 | 756 | if (waitqueue_active(&ep->wq)) |
|---|
| 747 | | - wake_up_locked(&ep->wq); |
|---|
| 748 | | - if (waitqueue_active(&ep->poll_wait)) |
|---|
| 749 | | - pwake++; |
|---|
| 757 | + wake_up(&ep->wq); |
|---|
| 750 | 758 | } |
|---|
| 751 | | - spin_unlock_irq(&ep->wq.lock); |
|---|
| 759 | + |
|---|
| 760 | + write_unlock_irq(&ep->lock); |
|---|
| 752 | 761 | |
|---|
| 753 | 762 | if (!ep_locked) |
|---|
| 754 | 763 | mutex_unlock(&ep->mtx); |
|---|
| 755 | | - |
|---|
| 756 | | - /* We have to call this outside the lock */ |
|---|
| 757 | | - if (pwake) |
|---|
| 758 | | - ep_poll_safewake(&ep->poll_wait); |
|---|
| 759 | 764 | |
|---|
| 760 | 765 | return res; |
|---|
| 761 | 766 | } |
|---|
| .. | .. |
|---|
| 788 | 793 | |
|---|
| 789 | 794 | rb_erase_cached(&epi->rbn, &ep->rbr); |
|---|
| 790 | 795 | |
|---|
| 791 | | - spin_lock_irq(&ep->wq.lock); |
|---|
| 796 | + write_lock_irq(&ep->lock); |
|---|
| 792 | 797 | if (ep_is_linked(epi)) |
|---|
| 793 | 798 | list_del_init(&epi->rdllink); |
|---|
| 794 | | - spin_unlock_irq(&ep->wq.lock); |
|---|
| 799 | + write_unlock_irq(&ep->lock); |
|---|
| 795 | 800 | |
|---|
| 796 | 801 | wakeup_source_unregister(ep_wakeup_source(epi)); |
|---|
| 797 | 802 | /* |
|---|
| .. | .. |
|---|
| 815 | 820 | |
|---|
| 816 | 821 | /* We need to release all tasks waiting for these file */ |
|---|
| 817 | 822 | if (waitqueue_active(&ep->poll_wait)) |
|---|
| 818 | | - ep_poll_safewake(&ep->poll_wait); |
|---|
| 823 | + ep_poll_safewake(ep, NULL, 0); |
|---|
| 819 | 824 | |
|---|
| 820 | 825 | /* |
|---|
| 821 | 826 | * We need to lock this because we could be hit by |
|---|
| .. | .. |
|---|
| 841 | 846 | * Walks through the whole tree by freeing each "struct epitem". At this |
|---|
| 842 | 847 | * point we are sure no poll callbacks will be lingering around, and also by |
|---|
| 843 | 848 | * holding "epmutex" we can be sure that no file cleanup code will hit |
|---|
| 844 | | - * us during this operation. So we can avoid the lock on "ep->wq.lock". |
|---|
| 849 | + * us during this operation. So we can avoid the lock on "ep->lock". |
|---|
| 845 | 850 | * We do not need to lock ep->mtx, either, we only do it to prevent |
|---|
| 846 | 851 | * a lockdep warning. |
|---|
| 847 | 852 | */ |
|---|
| .. | .. |
|---|
| 1022 | 1027 | goto free_uid; |
|---|
| 1023 | 1028 | |
|---|
| 1024 | 1029 | mutex_init(&ep->mtx); |
|---|
| 1030 | + rwlock_init(&ep->lock); |
|---|
| 1025 | 1031 | init_waitqueue_head(&ep->wq); |
|---|
| 1026 | 1032 | init_waitqueue_head(&ep->poll_wait); |
|---|
| 1027 | 1033 | INIT_LIST_HEAD(&ep->rdllist); |
|---|
| .. | .. |
|---|
| 1067 | 1073 | return epir; |
|---|
| 1068 | 1074 | } |
|---|
| 1069 | 1075 | |
|---|
| 1070 | | -#ifdef CONFIG_CHECKPOINT_RESTORE |
|---|
| 1076 | +#ifdef CONFIG_KCMP |
|---|
| 1071 | 1077 | static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) |
|---|
| 1072 | 1078 | { |
|---|
| 1073 | 1079 | struct rb_node *rbp; |
|---|
| .. | .. |
|---|
| 1109 | 1115 | |
|---|
| 1110 | 1116 | return file_raw; |
|---|
| 1111 | 1117 | } |
|---|
| 1112 | | -#endif /* CONFIG_CHECKPOINT_RESTORE */ |
|---|
| 1118 | +#endif /* CONFIG_KCMP */ |
|---|
| 1119 | + |
|---|
| 1120 | +/** |
|---|
| 1121 | + * Adds a new entry to the tail of the list in a lockless way, i.e. |
|---|
| 1122 | + * multiple CPUs are allowed to call this function concurrently. |
|---|
| 1123 | + * |
|---|
| 1124 | + * Beware: it is necessary to prevent any other modifications of the |
|---|
| 1125 | + * existing list until all changes are completed, in other words |
|---|
| 1126 | + * concurrent list_add_tail_lockless() calls should be protected |
|---|
| 1127 | + * with a read lock, where write lock acts as a barrier which |
|---|
| 1128 | + * makes sure all list_add_tail_lockless() calls are fully |
|---|
| 1129 | + * completed. |
|---|
| 1130 | + * |
|---|
| 1131 | + * Also, an element can be locklessly added to the list only in one |
|---|
| 1132 | + * direction, i.e. either to the tail or to the head, otherwise |
|---|
| 1133 | + * concurrent access will corrupt the list. |
|---|
| 1134 | + * |
|---|
| 1135 | + * Returns %false if the element has already been added to the list, %true |
|---|
| 1136 | + * otherwise. |
|---|
| 1137 | + */ |
|---|
| 1138 | +static inline bool list_add_tail_lockless(struct list_head *new, |
|---|
| 1139 | + struct list_head *head) |
|---|
| 1140 | +{ |
|---|
| 1141 | + struct list_head *prev; |
|---|
| 1142 | + |
|---|
| 1143 | + /* |
|---|
| 1144 | + * This is simple 'new->next = head' operation, but cmpxchg() |
|---|
| 1145 | + * is used in order to detect that same element has been just |
|---|
| 1146 | + * added to the list from another CPU: the winner observes |
|---|
| 1147 | + * new->next == new. |
|---|
| 1148 | + */ |
|---|
| 1149 | + if (cmpxchg(&new->next, new, head) != new) |
|---|
| 1150 | + return false; |
|---|
| 1151 | + |
|---|
| 1152 | + /* |
|---|
| 1153 | + * Initially ->next of a new element must be updated with the head |
|---|
| 1154 | + * (we are inserting to the tail) and only then pointers are atomically |
|---|
| 1155 | + * exchanged. XCHG guarantees memory ordering, thus ->next should be |
|---|
| 1156 | + * updated before pointers are actually swapped and pointers are |
|---|
| 1157 | + * swapped before prev->next is updated. |
|---|
| 1158 | + */ |
|---|
| 1159 | + |
|---|
| 1160 | + prev = xchg(&head->prev, new); |
|---|
| 1161 | + |
|---|
| 1162 | + /* |
|---|
| 1163 | + * It is safe to modify prev->next and new->prev, because a new element |
|---|
| 1164 | + * is added only to the tail and new->next is updated before XCHG. |
|---|
| 1165 | + */ |
|---|
| 1166 | + |
|---|
| 1167 | + prev->next = new; |
|---|
| 1168 | + new->prev = prev; |
|---|
| 1169 | + |
|---|
| 1170 | + return true; |
|---|
| 1171 | +} |
|---|
| 1172 | + |
|---|
| 1173 | +/** |
|---|
| 1174 | + * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, |
|---|
| 1175 | + * i.e. multiple CPUs are allowed to call this function concurrently. |
|---|
| 1176 | + * |
|---|
| 1177 | + * Returns %false if the epi element has already been chained, %true otherwise. |
|---|
| 1178 | + */ |
|---|
| 1179 | +static inline bool chain_epi_lockless(struct epitem *epi) |
|---|
| 1180 | +{ |
|---|
| 1181 | + struct eventpoll *ep = epi->ep; |
|---|
| 1182 | + |
|---|
| 1183 | + /* Fast preliminary check */ |
|---|
| 1184 | + if (epi->next != EP_UNACTIVE_PTR) |
|---|
| 1185 | + return false; |
|---|
| 1186 | + |
|---|
| 1187 | + /* Check that the same epi has not been just chained from another CPU */ |
|---|
| 1188 | + if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) |
|---|
| 1189 | + return false; |
|---|
| 1190 | + |
|---|
| 1191 | + /* Atomically exchange tail */ |
|---|
| 1192 | + epi->next = xchg(&ep->ovflist, epi); |
|---|
| 1193 | + |
|---|
| 1194 | + return true; |
|---|
| 1195 | +} |
|---|
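A rough userspace analogue of the ->ovflist handling may make the LIFO-push / detach-and-reverse idea easier to follow. One deliberate difference: the kernel can publish with a bare xchg() because the ep->lock write lock excludes the consumer while producers run under the read lock, whereas a standalone sketch needs the usual compare-and-swap loop. Everything below, names included, is illustrative and not part of the patch:

```c
#include <stdatomic.h>
#include <stddef.h>

struct item {
	struct item *next;
	int data;
};

static _Atomic(struct item *) ovflist;	/* head of the overflow chain */

/* Producer side (any number of threads): LIFO push, like chain_epi_lockless(). */
static void ovf_push(struct item *it)
{
	struct item *head = atomic_load(&ovflist);

	do {
		it->next = head;
	} while (!atomic_compare_exchange_weak(&ovflist, &head, it));
}

/* Consumer side (one thread): detach the whole chain at once and reverse it,
 * mirroring the requeue step in ep_scan_ready_list() earlier in the diff,
 * where the LIFO ->ovflist is put back into FIFO order. */
static struct item *ovf_detach_fifo(void)
{
	struct item *lifo = atomic_exchange(&ovflist, NULL);
	struct item *fifo = NULL;

	while (lifo) {
		struct item *next = lifo->next;

		lifo->next = fifo;
		fifo = lifo;
		lifo = next;
	}
	return fifo;
}
```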
| 1113 | 1196 | |
|---|
| 1114 | 1197 | /* |
|---|
| 1115 | 1198 | * This is the callback that is passed to the wait queue wakeup |
|---|
| 1116 | 1199 | * mechanism. It is called by the stored file descriptors when they |
|---|
| 1117 | 1200 | * have events to report. |
|---|
| 1201 | + * |
|---|
| 1202 | + * This callback takes a read lock in order not to contend with concurrent |
|---|
| 1203 | + * events from other file descriptors, thus all modifications to ->rdllist |
|---|
| 1204 | + * or ->ovflist are lockless. Read lock is paired with the write lock from |
|---|
| 1205 | + * ep_scan_ready_list(), which stops all list modifications and guarantees |
|---|
| 1206 | + * that lists state is seen correctly. |
|---|
| 1207 | + * |
|---|
| 1208 | + * Another thing worth mentioning is that ep_poll_callback() can be called |
|---|
| 1209 | + * concurrently for the same @epi from different CPUs if poll table was inited |
|---|
| 1210 | + * with several wait queue entries. Plural wakeups from different CPUs of a |
|---|
| 1211 | + * single wait queue is serialized by wq.lock, but the case when multiple wait |
|---|
| 1212 | + * queues are used should be detected accordingly. This is detected using |
|---|
| 1213 | + * cmpxchg() operation. |
|---|
| 1118 | 1214 | */ |
|---|
| 1119 | 1215 | static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) |
|---|
| 1120 | 1216 | { |
|---|
| 1121 | 1217 | int pwake = 0; |
|---|
| 1122 | | - unsigned long flags; |
|---|
| 1123 | 1218 | struct epitem *epi = ep_item_from_wait(wait); |
|---|
| 1124 | 1219 | struct eventpoll *ep = epi->ep; |
|---|
| 1125 | 1220 | __poll_t pollflags = key_to_poll(key); |
|---|
| 1221 | + unsigned long flags; |
|---|
| 1126 | 1222 | int ewake = 0; |
|---|
| 1127 | 1223 | |
|---|
| 1128 | | - spin_lock_irqsave(&ep->wq.lock, flags); |
|---|
| 1224 | + read_lock_irqsave(&ep->lock, flags); |
|---|
| 1129 | 1225 | |
|---|
| 1130 | 1226 | ep_set_busy_poll_napi_id(epi); |
|---|
| 1131 | 1227 | |
|---|
| .. | .. |
|---|
| 1153 | 1249 | * semantics). All the events that happen during that period of time are |
|---|
| 1154 | 1250 | * chained in ep->ovflist and requeued later on. |
|---|
| 1155 | 1251 | */ |
|---|
| 1156 | | - if (ep->ovflist != EP_UNACTIVE_PTR) { |
|---|
| 1157 | | - if (epi->next == EP_UNACTIVE_PTR) { |
|---|
| 1158 | | - epi->next = ep->ovflist; |
|---|
| 1159 | | - ep->ovflist = epi; |
|---|
| 1160 | | - if (epi->ws) { |
|---|
| 1161 | | - /* |
|---|
| 1162 | | - * Activate ep->ws since epi->ws may get |
|---|
| 1163 | | - * deactivated at any time. |
|---|
| 1164 | | - */ |
|---|
| 1165 | | - __pm_stay_awake(ep->ws); |
|---|
| 1166 | | - } |
|---|
| 1167 | | - |
|---|
| 1168 | | - } |
|---|
| 1169 | | - goto out_unlock; |
|---|
| 1170 | | - } |
|---|
| 1171 | | - |
|---|
| 1172 | | - /* If this file is already in the ready list we exit soon */ |
|---|
| 1173 | | - if (!ep_is_linked(epi)) { |
|---|
| 1174 | | - list_add_tail(&epi->rdllink, &ep->rdllist); |
|---|
| 1175 | | - ep_pm_stay_awake_rcu(epi); |
|---|
| 1252 | + if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { |
|---|
| 1253 | + if (chain_epi_lockless(epi)) |
|---|
| 1254 | + ep_pm_stay_awake_rcu(epi); |
|---|
| 1255 | + } else if (!ep_is_linked(epi)) { |
|---|
| 1256 | + /* In the usual case, add event to ready list. */ |
|---|
| 1257 | + if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) |
|---|
| 1258 | + ep_pm_stay_awake_rcu(epi); |
|---|
| 1176 | 1259 | } |
|---|
| 1177 | 1260 | |
|---|
| 1178 | 1261 | /* |
|---|
| .. | .. |
|---|
| 1196 | 1279 | break; |
|---|
| 1197 | 1280 | } |
|---|
| 1198 | 1281 | } |
|---|
| 1199 | | - wake_up_locked(&ep->wq); |
|---|
| 1282 | + wake_up(&ep->wq); |
|---|
| 1200 | 1283 | } |
|---|
| 1201 | 1284 | if (waitqueue_active(&ep->poll_wait)) |
|---|
| 1202 | 1285 | pwake++; |
|---|
| 1203 | 1286 | |
|---|
| 1204 | 1287 | out_unlock: |
|---|
| 1205 | | - spin_unlock_irqrestore(&ep->wq.lock, flags); |
|---|
| 1288 | + read_unlock_irqrestore(&ep->lock, flags); |
|---|
| 1206 | 1289 | |
|---|
| 1207 | 1290 | /* We have to call this outside the lock */ |
|---|
| 1208 | 1291 | if (pwake) |
|---|
| 1209 | | - ep_poll_safewake(&ep->poll_wait); |
|---|
| 1292 | + ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); |
|---|
| 1210 | 1293 | |
|---|
| 1211 | 1294 | if (!(epi->event.events & EPOLLEXCLUSIVE)) |
|---|
| 1212 | 1295 | ewake = 1; |
|---|
| .. | .. |
|---|
| 1332 | 1415 | } |
|---|
| 1333 | 1416 | } else { |
|---|
| 1334 | 1417 | error = ep_call_nested(&poll_loop_ncalls, |
|---|
| 1335 | | - EP_MAX_NESTS, |
|---|
| 1336 | 1418 | reverse_path_check_proc, |
|---|
| 1337 | 1419 | child_file, child_file, |
|---|
| 1338 | 1420 | current); |
|---|
| .. | .. |
|---|
| 1366 | 1448 | /* let's call this for all tfiles */ |
|---|
| 1367 | 1449 | list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { |
|---|
| 1368 | 1450 | path_count_init(); |
|---|
| 1369 | | - error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, |
|---|
| 1451 | + error = ep_call_nested(&poll_loop_ncalls, |
|---|
| 1370 | 1452 | reverse_path_check_proc, current_file, |
|---|
| 1371 | 1453 | current_file, current); |
|---|
| 1372 | 1454 | if (error) |
|---|
| .. | .. |
|---|
| 1379 | 1461 | { |
|---|
| 1380 | 1462 | struct name_snapshot n; |
|---|
| 1381 | 1463 | struct wakeup_source *ws; |
|---|
| 1464 | + char ws_name[64]; |
|---|
| 1382 | 1465 | |
|---|
| 1466 | + strlcpy(ws_name, "eventpoll", sizeof(ws_name)); |
|---|
| 1467 | + trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name)); |
|---|
| 1383 | 1468 | if (!epi->ep->ws) { |
|---|
| 1384 | | - epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); |
|---|
| 1469 | + epi->ep->ws = wakeup_source_register(NULL, ws_name); |
|---|
| 1385 | 1470 | if (!epi->ep->ws) |
|---|
| 1386 | 1471 | return -ENOMEM; |
|---|
| 1387 | 1472 | } |
|---|
| 1388 | 1473 | |
|---|
| 1389 | 1474 | take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); |
|---|
| 1390 | | - ws = wakeup_source_register(NULL, n.name); |
|---|
| 1475 | + strlcpy(ws_name, n.name.name, sizeof(ws_name)); |
|---|
| 1476 | + trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name)); |
|---|
| 1477 | + ws = wakeup_source_register(NULL, ws_name); |
|---|
| 1391 | 1478 | release_dentry_name_snapshot(&n); |
|---|
| 1392 | 1479 | |
|---|
| 1393 | 1480 | if (!ws) |
|---|
| .. | .. |
|---|
| 1489 | 1576 | goto error_unregister; |
|---|
| 1490 | 1577 | |
|---|
| 1491 | 1578 | /* We have to drop the new item inside our item list to keep track of it */ |
|---|
| 1492 | | - spin_lock_irq(&ep->wq.lock); |
|---|
| 1579 | + write_lock_irq(&ep->lock); |
|---|
| 1493 | 1580 | |
|---|
| 1494 | 1581 | /* record NAPI ID of new item if present */ |
|---|
| 1495 | 1582 | ep_set_busy_poll_napi_id(epi); |
|---|
| .. | .. |
|---|
| 1501 | 1588 | |
|---|
| 1502 | 1589 | /* Notify waiting tasks that events are available */ |
|---|
| 1503 | 1590 | if (waitqueue_active(&ep->wq)) |
|---|
| 1504 | | - wake_up_locked(&ep->wq); |
|---|
| 1591 | + wake_up(&ep->wq); |
|---|
| 1505 | 1592 | if (waitqueue_active(&ep->poll_wait)) |
|---|
| 1506 | 1593 | pwake++; |
|---|
| 1507 | 1594 | } |
|---|
| 1508 | 1595 | |
|---|
| 1509 | | - spin_unlock_irq(&ep->wq.lock); |
|---|
| 1596 | + write_unlock_irq(&ep->lock); |
|---|
| 1510 | 1597 | |
|---|
| 1511 | 1598 | atomic_long_inc(&ep->user->epoll_watches); |
|---|
| 1512 | 1599 | |
|---|
| 1513 | 1600 | /* We have to call this outside the lock */ |
|---|
| 1514 | 1601 | if (pwake) |
|---|
| 1515 | | - ep_poll_safewake(&ep->poll_wait); |
|---|
| 1602 | + ep_poll_safewake(ep, NULL, 0); |
|---|
| 1516 | 1603 | |
|---|
| 1517 | 1604 | return 0; |
|---|
| 1518 | 1605 | |
|---|
| .. | .. |
|---|
| 1531 | 1618 | * list, since that is used/cleaned only inside a section bound by "mtx". |
|---|
| 1532 | 1619 | * And ep_insert() is called with "mtx" held. |
|---|
| 1533 | 1620 | */ |
|---|
| 1534 | | - spin_lock_irq(&ep->wq.lock); |
|---|
| 1621 | + write_lock_irq(&ep->lock); |
|---|
| 1535 | 1622 | if (ep_is_linked(epi)) |
|---|
| 1536 | 1623 | list_del_init(&epi->rdllink); |
|---|
| 1537 | | - spin_unlock_irq(&ep->wq.lock); |
|---|
| 1624 | + write_unlock_irq(&ep->lock); |
|---|
| 1538 | 1625 | |
|---|
| 1539 | 1626 | wakeup_source_unregister(ep_wakeup_source(epi)); |
|---|
| 1540 | 1627 | |
|---|
| .. | .. |
|---|
| 1578 | 1665 | * 1) Flush epi changes above to other CPUs. This ensures |
|---|
| 1579 | 1666 | * we do not miss events from ep_poll_callback if an |
|---|
| 1580 | 1667 | * event occurs immediately after we call f_op->poll(). |
|---|
| 1581 | | - * We need this because we did not take ep->wq.lock while |
|---|
| 1668 | + * We need this because we did not take ep->lock while |
|---|
| 1582 | 1669 | * changing epi above (but ep_poll_callback does take |
|---|
| 1583 | | - * ep->wq.lock). |
|---|
| 1670 | + * ep->lock). |
|---|
| 1584 | 1671 | * |
|---|
| 1585 | 1672 | * 2) We also need to ensure we do not miss _past_ events |
|---|
| 1586 | 1673 | * when calling f_op->poll(). This barrier also |
|---|
| .. | .. |
|---|
| 1599 | 1686 | * list, push it inside. |
|---|
| 1600 | 1687 | */ |
|---|
| 1601 | 1688 | if (ep_item_poll(epi, &pt, 1)) { |
|---|
| 1602 | | - spin_lock_irq(&ep->wq.lock); |
|---|
| 1689 | + write_lock_irq(&ep->lock); |
|---|
| 1603 | 1690 | if (!ep_is_linked(epi)) { |
|---|
| 1604 | 1691 | list_add_tail(&epi->rdllink, &ep->rdllist); |
|---|
| 1605 | 1692 | ep_pm_stay_awake(epi); |
|---|
| 1606 | 1693 | |
|---|
| 1607 | 1694 | /* Notify waiting tasks that events are available */ |
|---|
| 1608 | 1695 | if (waitqueue_active(&ep->wq)) |
|---|
| 1609 | | - wake_up_locked(&ep->wq); |
|---|
| 1696 | + wake_up(&ep->wq); |
|---|
| 1610 | 1697 | if (waitqueue_active(&ep->poll_wait)) |
|---|
| 1611 | 1698 | pwake++; |
|---|
| 1612 | 1699 | } |
|---|
| 1613 | | - spin_unlock_irq(&ep->wq.lock); |
|---|
| 1700 | + write_unlock_irq(&ep->lock); |
|---|
| 1614 | 1701 | } |
|---|
| 1615 | 1702 | |
|---|
| 1616 | 1703 | /* We have to call this outside the lock */ |
|---|
| 1617 | 1704 | if (pwake) |
|---|
| 1618 | | - ep_poll_safewake(&ep->poll_wait); |
|---|
| 1705 | + ep_poll_safewake(ep, NULL, 0); |
|---|
| 1619 | 1706 | |
|---|
| 1620 | 1707 | return 0; |
|---|
| 1621 | 1708 | } |
|---|
| .. | .. |
|---|
| 1625 | 1712 | { |
|---|
| 1626 | 1713 | struct ep_send_events_data *esed = priv; |
|---|
| 1627 | 1714 | __poll_t revents; |
|---|
| 1628 | | - struct epitem *epi; |
|---|
| 1629 | | - struct epoll_event __user *uevent; |
|---|
| 1715 | + struct epitem *epi, *tmp; |
|---|
| 1716 | + struct epoll_event __user *uevent = esed->events; |
|---|
| 1630 | 1717 | struct wakeup_source *ws; |
|---|
| 1631 | 1718 | poll_table pt; |
|---|
| 1632 | 1719 | |
|---|
| 1633 | 1720 | init_poll_funcptr(&pt, NULL); |
|---|
| 1721 | + esed->res = 0; |
|---|
| 1634 | 1722 | |
|---|
| 1635 | 1723 | /* |
|---|
| 1636 | 1724 | * We can loop without lock because we are passed a task private list. |
|---|
| 1637 | 1725 | * Items cannot vanish during the loop because ep_scan_ready_list() is |
|---|
| 1638 | 1726 | * holding "mtx" during this call. |
|---|
| 1639 | 1727 | */ |
|---|
| 1640 | | - for (esed->res = 0, uevent = esed->events; |
|---|
| 1641 | | - !list_empty(head) && esed->res < esed->maxevents;) { |
|---|
| 1642 | | - epi = list_first_entry(head, struct epitem, rdllink); |
|---|
| 1728 | + lockdep_assert_held(&ep->mtx); |
|---|
| 1729 | + |
|---|
| 1730 | + list_for_each_entry_safe(epi, tmp, head, rdllink) { |
|---|
| 1731 | + if (esed->res >= esed->maxevents) |
|---|
| 1732 | + break; |
|---|
| 1643 | 1733 | |
|---|
| 1644 | 1734 | /* |
|---|
| 1645 | 1735 | * Activate ep->ws before deactivating epi->ws to prevent |
|---|
| .. | .. |
|---|
| 1659 | 1749 | |
|---|
| 1660 | 1750 | list_del_init(&epi->rdllink); |
|---|
| 1661 | 1751 | |
|---|
| 1662 | | - revents = ep_item_poll(epi, &pt, 1); |
|---|
| 1663 | | - |
|---|
| 1664 | 1752 | /* |
|---|
| 1665 | 1753 | * If the event mask intersect the caller-requested one, |
|---|
| 1666 | 1754 | * deliver the event to userspace. Again, ep_scan_ready_list() |
|---|
| 1667 | | - * is holding "mtx", so no operations coming from userspace |
|---|
| 1755 | + * is holding ep->mtx, so no operations coming from userspace |
|---|
| 1668 | 1756 | * can change the item. |
|---|
| 1669 | 1757 | */ |
|---|
| 1670 | | - if (revents) { |
|---|
| 1671 | | - if (__put_user(revents, &uevent->events) || |
|---|
| 1672 | | - __put_user(epi->event.data, &uevent->data)) { |
|---|
| 1673 | | - list_add(&epi->rdllink, head); |
|---|
| 1674 | | - ep_pm_stay_awake(epi); |
|---|
| 1675 | | - if (!esed->res) |
|---|
| 1676 | | - esed->res = -EFAULT; |
|---|
| 1677 | | - return 0; |
|---|
| 1678 | | - } |
|---|
| 1679 | | - esed->res++; |
|---|
| 1680 | | - uevent++; |
|---|
| 1681 | | - if (epi->event.events & EPOLLONESHOT) |
|---|
| 1682 | | - epi->event.events &= EP_PRIVATE_BITS; |
|---|
| 1683 | | - else if (!(epi->event.events & EPOLLET)) { |
|---|
| 1684 | | - /* |
|---|
| 1685 | | - * If this file has been added with Level |
|---|
| 1686 | | - * Trigger mode, we need to insert back inside |
|---|
| 1687 | | - * the ready list, so that the next call to |
|---|
| 1688 | | - * epoll_wait() will check again the events |
|---|
| 1689 | | - * availability. At this point, no one can insert |
|---|
| 1690 | | - * into ep->rdllist besides us. The epoll_ctl() |
|---|
| 1691 | | - * callers are locked out by |
|---|
| 1692 | | - * ep_scan_ready_list() holding "mtx" and the |
|---|
| 1693 | | - * poll callback will queue them in ep->ovflist. |
|---|
| 1694 | | - */ |
|---|
| 1695 | | - list_add_tail(&epi->rdllink, &ep->rdllist); |
|---|
| 1696 | | - ep_pm_stay_awake(epi); |
|---|
| 1697 | | - } |
|---|
| 1758 | + revents = ep_item_poll(epi, &pt, 1); |
|---|
| 1759 | + if (!revents) |
|---|
| 1760 | + continue; |
|---|
| 1761 | + |
|---|
| 1762 | + if (__put_user(revents, &uevent->events) || |
|---|
| 1763 | + __put_user(epi->event.data, &uevent->data)) { |
|---|
| 1764 | + list_add(&epi->rdllink, head); |
|---|
| 1765 | + ep_pm_stay_awake(epi); |
|---|
| 1766 | + if (!esed->res) |
|---|
| 1767 | + esed->res = -EFAULT; |
|---|
| 1768 | + return 0; |
|---|
| 1769 | + } |
|---|
| 1770 | + esed->res++; |
|---|
| 1771 | + uevent++; |
|---|
| 1772 | + if (epi->event.events & EPOLLONESHOT) |
|---|
| 1773 | + epi->event.events &= EP_PRIVATE_BITS; |
|---|
| 1774 | + else if (!(epi->event.events & EPOLLET)) { |
|---|
| 1775 | + /* |
|---|
| 1776 | + * If this file has been added with Level |
|---|
| 1777 | + * Trigger mode, we need to insert back inside |
|---|
| 1778 | + * the ready list, so that the next call to |
|---|
| 1779 | + * epoll_wait() will check again the events |
|---|
| 1780 | + * availability. At this point, no one can insert |
|---|
| 1781 | + * into ep->rdllist besides us. The epoll_ctl() |
|---|
| 1782 | + * callers are locked out by |
|---|
| 1783 | + * ep_scan_ready_list() holding "mtx" and the |
|---|
| 1784 | + * poll callback will queue them in ep->ovflist. |
|---|
| 1785 | + */ |
|---|
| 1786 | + list_add_tail(&epi->rdllink, &ep->rdllist); |
|---|
| 1787 | + ep_pm_stay_awake(epi); |
|---|
| 1698 | 1788 | } |
|---|
| 1699 | 1789 | } |
|---|
| 1700 | 1790 | |
|---|
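The tail of this loop is what implements the user-visible trigger modes: level-triggered items are put back on ->rdllist so the next epoll_wait() rechecks them, EPOLLONESHOT items have their readiness bits stripped until userspace re-arms them, and EPOLLET items are neither requeued nor stripped. A small userspace sketch of the re-arm step (function and variable names are mine):

```c
#include <sys/epoll.h>

/* After servicing a one-shot event, the watch must be re-armed explicitly;
 * ep_send_events_proc() has reduced it to EP_PRIVATE_BITS, so without this
 * EPOLL_CTL_MOD the fd stays silent. */
static int rearm_oneshot(int epfd, int fd)
{
	struct epoll_event ev = {
		.events  = EPOLLIN | EPOLLONESHOT,
		.data.fd = fd,
	};

	return epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
}
```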
| .. | .. |
|---|
| 1722 | 1812 | |
|---|
| 1723 | 1813 | ktime_get_ts64(&now); |
|---|
| 1724 | 1814 | return timespec64_add_safe(now, ts); |
|---|
| 1815 | +} |
|---|
| 1816 | + |
|---|
| 1817 | +/* |
|---|
| 1818 | + * autoremove_wake_function, but remove even on failure to wake up, because we |
|---|
| 1819 | + * know that default_wake_function/ttwu will only fail if the thread is already |
|---|
| 1820 | + * woken, and in that case the ep_poll loop will remove the entry anyway, not |
|---|
| 1821 | + * try to reuse it. |
|---|
| 1822 | + */ |
|---|
| 1823 | +static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, |
|---|
| 1824 | + unsigned int mode, int sync, void *key) |
|---|
| 1825 | +{ |
|---|
| 1826 | + int ret = default_wake_function(wq_entry, mode, sync, key); |
|---|
| 1827 | + |
|---|
| 1828 | + list_del_init(&wq_entry->entry); |
|---|
| 1829 | + return ret; |
|---|
| 1725 | 1830 | } |
|---|
| 1726 | 1831 | |
|---|
| 1727 | 1832 | /** |
|---|
| .. | .. |
|---|
| 1760 | 1865 | } else if (timeout == 0) { |
|---|
| 1761 | 1866 | /* |
|---|
| 1762 | 1867 | * Avoid the unnecessary trip to the wait queue loop, if the |
|---|
| 1763 | | - * caller specified a non blocking operation. |
|---|
| 1868 | + * caller specified a non-blocking operation. We still need the |
|---|
| 1869 | + * lock because we could race with an epi being added to the |
|---|
| 1870 | + * ready list from the irq callback and then incorrectly |
|---|
| 1871 | + * return 0 back to userspace. |
|---|
| 1764 | 1872 | */ |
|---|
| 1765 | 1873 | timed_out = 1; |
|---|
| 1766 | | - spin_lock_irq(&ep->wq.lock); |
|---|
| 1767 | | - goto check_events; |
|---|
| 1874 | + |
|---|
| 1875 | + write_lock_irq(&ep->lock); |
|---|
| 1876 | + eavail = ep_events_available(ep); |
|---|
| 1877 | + write_unlock_irq(&ep->lock); |
|---|
| 1878 | + |
|---|
| 1879 | + goto send_events; |
|---|
| 1768 | 1880 | } |
|---|
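Seen from userspace, this branch is just what a zero timeout means: a non-blocking readiness check, which still has to look at the lists under the lock so an event queued from irq context is not missed. For reference, the three timeout conventions a caller can pass (sketch only, epfd assumed valid):

```c
#include <sys/epoll.h>

static void wait_examples(int epfd)
{
	struct epoll_event evs[64];

	(void)epoll_wait(epfd, evs, 64, -1);   /* block until events or a signal  */
	(void)epoll_wait(epfd, evs, 64, 0);    /* this branch: check, never sleep */
	(void)epoll_wait(epfd, evs, 64, 250);  /* sleep for at most 250 ms        */
}
```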
| 1769 | 1881 | |
|---|
| 1770 | 1882 | fetch_events: |
|---|
| .. | .. |
|---|
| 1772 | 1884 | if (!ep_events_available(ep)) |
|---|
| 1773 | 1885 | ep_busy_loop(ep, timed_out); |
|---|
| 1774 | 1886 | |
|---|
| 1775 | | - spin_lock_irq(&ep->wq.lock); |
|---|
| 1776 | | - |
|---|
| 1777 | | - if (!ep_events_available(ep)) { |
|---|
| 1778 | | - /* |
|---|
| 1779 | | - * Busy poll timed out. Drop NAPI ID for now, we can add |
|---|
| 1780 | | - * it back in when we have moved a socket with a valid NAPI |
|---|
| 1781 | | - * ID onto the ready list. |
|---|
| 1782 | | - */ |
|---|
| 1783 | | - ep_reset_busy_poll_napi_id(ep); |
|---|
| 1784 | | - |
|---|
| 1785 | | - /* |
|---|
| 1786 | | - * We don't have any available event to return to the caller. |
|---|
| 1787 | | - * We need to sleep here, and we will be wake up by |
|---|
| 1788 | | - * ep_poll_callback() when events will become available. |
|---|
| 1789 | | - */ |
|---|
| 1790 | | - init_waitqueue_entry(&wait, current); |
|---|
| 1791 | | - __add_wait_queue_exclusive(&ep->wq, &wait); |
|---|
| 1792 | | - |
|---|
| 1793 | | - for (;;) { |
|---|
| 1794 | | - /* |
|---|
| 1795 | | - * We don't want to sleep if the ep_poll_callback() sends us |
|---|
| 1796 | | - * a wakeup in between. That's why we set the task state |
|---|
| 1797 | | - * to TASK_INTERRUPTIBLE before doing the checks. |
|---|
| 1798 | | - */ |
|---|
| 1799 | | - set_current_state(TASK_INTERRUPTIBLE); |
|---|
| 1800 | | - /* |
|---|
| 1801 | | - * Always short-circuit for fatal signals to allow |
|---|
| 1802 | | - * threads to make a timely exit without the chance of |
|---|
| 1803 | | - * finding more events available and fetching |
|---|
| 1804 | | - * repeatedly. |
|---|
| 1805 | | - */ |
|---|
| 1806 | | - if (fatal_signal_pending(current)) { |
|---|
| 1807 | | - res = -EINTR; |
|---|
| 1808 | | - break; |
|---|
| 1809 | | - } |
|---|
| 1810 | | - if (ep_events_available(ep) || timed_out) |
|---|
| 1811 | | - break; |
|---|
| 1812 | | - if (signal_pending(current)) { |
|---|
| 1813 | | - res = -EINTR; |
|---|
| 1814 | | - break; |
|---|
| 1815 | | - } |
|---|
| 1816 | | - |
|---|
| 1817 | | - spin_unlock_irq(&ep->wq.lock); |
|---|
| 1818 | | - if (!freezable_schedule_hrtimeout_range(to, slack, |
|---|
| 1819 | | - HRTIMER_MODE_ABS)) |
|---|
| 1820 | | - timed_out = 1; |
|---|
| 1821 | | - |
|---|
| 1822 | | - spin_lock_irq(&ep->wq.lock); |
|---|
| 1823 | | - } |
|---|
| 1824 | | - |
|---|
| 1825 | | - __remove_wait_queue(&ep->wq, &wait); |
|---|
| 1826 | | - __set_current_state(TASK_RUNNING); |
|---|
| 1827 | | - } |
|---|
| 1828 | | -check_events: |
|---|
| 1829 | | - /* Is it worth to try to dig for events ? */ |
|---|
| 1830 | 1887 | eavail = ep_events_available(ep); |
|---|
| 1888 | + if (eavail) |
|---|
| 1889 | + goto send_events; |
|---|
| 1831 | 1890 | |
|---|
| 1832 | | - spin_unlock_irq(&ep->wq.lock); |
|---|
| 1891 | + /* |
|---|
| 1892 | + * Busy poll timed out. Drop NAPI ID for now, we can add |
|---|
| 1893 | + * it back in when we have moved a socket with a valid NAPI |
|---|
| 1894 | + * ID onto the ready list. |
|---|
| 1895 | + */ |
|---|
| 1896 | + ep_reset_busy_poll_napi_id(ep); |
|---|
| 1833 | 1897 | |
|---|
| 1898 | + do { |
|---|
| 1899 | + /* |
|---|
| 1900 | + * Internally init_wait() uses autoremove_wake_function(), |
|---|
| 1901 | + * so the wait entry is removed from the wait queue on each |
|---|
| 1902 | + * wakeup. Why is that important? With several waiters, |
|---|
| 1903 | + * each new wakeup then hits the next waiter, giving it a |
|---|
| 1904 | + * chance to harvest new events; otherwise a wakeup can be |
|---|
| 1905 | + * lost. This is also good performance-wise, because on the |
|---|
| 1906 | + * normal wakeup path there is no need to call __remove_wait_queue() |
|---|
| 1907 | + * explicitly, so ep->lock is not taken, which would stall |
|---|
| 1908 | + * event delivery. |
|---|
| 1909 | + * |
|---|
| 1910 | + * In fact, we now use an even more aggressive function that |
|---|
| 1911 | + * unconditionally removes, because we don't reuse the wait |
|---|
| 1912 | + * entry between loop iterations. This lets us also avoid the |
|---|
| 1913 | + * performance issue if a process is killed, causing all of its |
|---|
| 1914 | + * threads to wake up without being removed normally. |
|---|
| 1915 | + */ |
|---|
| 1916 | + init_wait(&wait); |
|---|
| 1917 | + wait.func = ep_autoremove_wake_function; |
|---|
| 1918 | + |
|---|
| 1919 | + write_lock_irq(&ep->lock); |
|---|
| 1920 | + /* |
|---|
| 1921 | + * Barrierless variant, waitqueue_active() is called under |
|---|
| 1922 | + * the same lock on the ep_poll_callback() wakeup side, so it |
|---|
| 1923 | + * is safe to avoid an explicit barrier. |
|---|
| 1924 | + */ |
|---|
| 1925 | + __set_current_state(TASK_INTERRUPTIBLE); |
|---|
| 1926 | + |
|---|
| 1927 | + /* |
|---|
| 1928 | + * Do the final check under the lock. ep_scan_ready_list() |
|---|
| 1929 | + * plays with two lists (->rdllist and ->ovflist) and there |
|---|
| 1930 | + * is always a race when both lists are empty for short |
|---|
| 1931 | + * period of time although events are pending, so lock is |
|---|
| 1932 | + * important. |
|---|
| 1933 | + */ |
|---|
| 1934 | + eavail = ep_events_available(ep); |
|---|
| 1935 | + if (!eavail) { |
|---|
| 1936 | + if (signal_pending(current)) |
|---|
| 1937 | + res = -EINTR; |
|---|
| 1938 | + else |
|---|
| 1939 | + __add_wait_queue_exclusive(&ep->wq, &wait); |
|---|
| 1940 | + } |
|---|
| 1941 | + write_unlock_irq(&ep->lock); |
|---|
| 1942 | + |
|---|
| 1943 | + if (!eavail && !res) |
|---|
| 1944 | + timed_out = !freezable_schedule_hrtimeout_range(to, slack, |
|---|
| 1945 | + HRTIMER_MODE_ABS); |
|---|
| 1946 | + |
|---|
| 1947 | + /* |
|---|
| 1948 | + * We were woken up, thus go and try to harvest some events. |
|---|
| 1949 | + * If timed out and still on the wait queue, recheck eavail |
|---|
| 1950 | + * carefully under lock, below. |
|---|
| 1951 | + */ |
|---|
| 1952 | + eavail = 1; |
|---|
| 1953 | + } while (0); |
|---|
| 1954 | + |
|---|
| 1955 | + __set_current_state(TASK_RUNNING); |
|---|
| 1956 | + |
|---|
| 1957 | + if (!list_empty_careful(&wait.entry)) { |
|---|
| 1958 | + write_lock_irq(&ep->lock); |
|---|
| 1959 | + /* |
|---|
| 1960 | + * If the thread timed out and is not on the wait queue, it |
|---|
| 1961 | + * means that the thread was woken up after its timeout expired |
|---|
| 1962 | + * before it could reacquire the lock. Thus, when wait.entry is |
|---|
| 1963 | + * empty, it needs to harvest events. |
|---|
| 1964 | + */ |
|---|
| 1965 | + if (timed_out) |
|---|
| 1966 | + eavail = list_empty(&wait.entry); |
|---|
| 1967 | + __remove_wait_queue(&ep->wq, &wait); |
|---|
| 1968 | + write_unlock_irq(&ep->lock); |
|---|
| 1969 | + } |
|---|
| 1970 | + |
|---|
| 1971 | +send_events: |
|---|
| 1972 | + if (fatal_signal_pending(current)) { |
|---|
| 1973 | + /* |
|---|
| 1974 | + * Always short-circuit for fatal signals to allow |
|---|
| 1975 | + * threads to make a timely exit without the chance of |
|---|
| 1976 | + * finding more events available and fetching |
|---|
| 1977 | + * repeatedly. |
|---|
| 1978 | + */ |
|---|
| 1979 | + res = -EINTR; |
|---|
| 1980 | + } |
|---|
| 1834 | 1981 | /* |
|---|
| 1835 | 1982 | * Try to transfer events to user space. In case we get 0 events and |
|---|
| 1836 | 1983 | * there's still timeout left over, we go trying again in search of |
|---|
| .. | .. |
|---|
| 1875 | 2022 | ep_tovisit = epi->ffd.file->private_data; |
|---|
| 1876 | 2023 | if (ep_tovisit->gen == loop_check_gen) |
|---|
| 1877 | 2024 | continue; |
|---|
| 1878 | | - error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, |
|---|
| 2025 | + error = ep_call_nested(&poll_loop_ncalls, |
|---|
| 1879 | 2026 | ep_loop_check_proc, epi->ffd.file, |
|---|
| 1880 | 2027 | ep_tovisit, current); |
|---|
| 1881 | 2028 | if (error != 0) |
|---|
| .. | .. |
|---|
| 1914 | 2061 | */ |
|---|
| 1915 | 2062 | static int ep_loop_check(struct eventpoll *ep, struct file *file) |
|---|
| 1916 | 2063 | { |
|---|
| 1917 | | - return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, |
|---|
| 2064 | + return ep_call_nested(&poll_loop_ncalls, |
|---|
| 1918 | 2065 | ep_loop_check_proc, file, ep, current); |
|---|
| 1919 | 2066 | } |
|---|
| 1920 | 2067 | |
|---|
| .. | .. |
|---|
| 1991 | 2138 | return do_epoll_create(0); |
|---|
| 1992 | 2139 | } |
|---|
| 1993 | 2140 | |
|---|
| 1994 | | -/* |
|---|
| 1995 | | - * The following function implements the controller interface for |
|---|
| 1996 | | - * the eventpoll file that enables the insertion/removal/change of |
|---|
| 1997 | | - * file descriptors inside the interest set. |
|---|
| 1998 | | - */ |
|---|
| 1999 | | -SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, |
|---|
| 2000 | | - struct epoll_event __user *, event) |
|---|
| 2141 | +static inline int epoll_mutex_lock(struct mutex *mutex, int depth, |
|---|
| 2142 | + bool nonblock) |
|---|
| 2143 | +{ |
|---|
| 2144 | + if (!nonblock) { |
|---|
| 2145 | + mutex_lock_nested(mutex, depth); |
|---|
| 2146 | + return 0; |
|---|
| 2147 | + } |
|---|
| 2148 | + if (mutex_trylock(mutex)) |
|---|
| 2149 | + return 0; |
|---|
| 2150 | + return -EAGAIN; |
|---|
| 2151 | +} |
|---|
| 2152 | + |
|---|
| 2153 | +int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, |
|---|
| 2154 | + bool nonblock) |
|---|
| 2001 | 2155 | { |
|---|
| 2002 | 2156 | int error; |
|---|
| 2003 | 2157 | int full_check = 0; |
|---|
| 2004 | 2158 | struct fd f, tf; |
|---|
| 2005 | 2159 | struct eventpoll *ep; |
|---|
| 2006 | 2160 | struct epitem *epi; |
|---|
| 2007 | | - struct epoll_event epds; |
|---|
| 2008 | 2161 | struct eventpoll *tep = NULL; |
|---|
| 2009 | | - |
|---|
| 2010 | | - error = -EFAULT; |
|---|
| 2011 | | - if (ep_op_has_event(op) && |
|---|
| 2012 | | - copy_from_user(&epds, event, sizeof(struct epoll_event))) |
|---|
| 2013 | | - goto error_return; |
|---|
| 2014 | 2162 | |
|---|
| 2015 | 2163 | error = -EBADF; |
|---|
| 2016 | 2164 | f = fdget(epfd); |
|---|
| .. | .. |
|---|
| 2029 | 2177 | |
|---|
| 2030 | 2178 | /* Check if EPOLLWAKEUP is allowed */ |
|---|
| 2031 | 2179 | if (ep_op_has_event(op)) |
|---|
| 2032 | | - ep_take_care_of_epollwakeup(&epds); |
|---|
| 2180 | + ep_take_care_of_epollwakeup(epds); |
|---|
| 2033 | 2181 | |
|---|
| 2034 | 2182 | /* |
|---|
| 2035 | 2183 | * We have to check that the file structure underneath the file descriptor |
|---|
| .. | .. |
|---|
| 2045 | 2193 | * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. |
|---|
| 2046 | 2194 | * Also, we do not currently support nested exclusive wakeups. |
|---|
| 2047 | 2195 | */ |
|---|
| 2048 | | - if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) { |
|---|
| 2196 | + if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { |
|---|
| 2049 | 2197 | if (op == EPOLL_CTL_MOD) |
|---|
| 2050 | 2198 | goto error_tgt_fput; |
|---|
| 2051 | 2199 | if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || |
|---|
| 2052 | | - (epds.events & ~EPOLLEXCLUSIVE_OK_BITS))) |
|---|
| 2200 | + (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) |
|---|
| 2053 | 2201 | goto error_tgt_fput; |
|---|
| 2054 | 2202 | } |
|---|
| 2055 | 2203 | |
|---|
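These checks encode the EPOLLEXCLUSIVE restrictions: it is rejected for EPOLL_CTL_MOD, for nested epoll targets, and for event bits outside EPOLLEXCLUSIVE_OK_BITS. The intended use is several workers, each with its own epoll instance, adding the same listening socket so a single incoming connection wakes only one of them. A sketch of that add (listen_fd assumed to already exist):

```c
#include <sys/epoll.h>

/* Called by each worker on its own epoll fd for the shared listening socket.
 * EPOLLEXCLUSIVE is only valid with EPOLL_CTL_ADD; changing the watch later
 * means EPOLL_CTL_DEL followed by a fresh ADD, not EPOLL_CTL_MOD. */
static int watch_exclusive(int epfd, int listen_fd)
{
	struct epoll_event ev = {
		.events  = EPOLLIN | EPOLLEXCLUSIVE,
		.data.fd = listen_fd,
	};

	return epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev);
}
```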
| .. | .. |
|---|
| 2074 | 2222 | * deep wakeup paths from forming in parallel through multiple |
|---|
| 2075 | 2223 | * EPOLL_CTL_ADD operations. |
|---|
| 2076 | 2224 | */ |
|---|
| 2077 | | - mutex_lock_nested(&ep->mtx, 0); |
|---|
| 2225 | + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); |
|---|
| 2226 | + if (error) |
|---|
| 2227 | + goto error_tgt_fput; |
|---|
| 2078 | 2228 | if (op == EPOLL_CTL_ADD) { |
|---|
| 2079 | 2229 | if (!list_empty(&f.file->f_ep_links) || |
|---|
| 2080 | 2230 | ep->gen == loop_check_gen || |
|---|
| 2081 | 2231 | is_file_epoll(tf.file)) { |
|---|
| 2082 | | - full_check = 1; |
|---|
| 2083 | 2232 | mutex_unlock(&ep->mtx); |
|---|
| 2084 | | - mutex_lock(&epmutex); |
|---|
| 2233 | + error = epoll_mutex_lock(&epmutex, 0, nonblock); |
|---|
| 2234 | + if (error) |
|---|
| 2235 | + goto error_tgt_fput; |
|---|
| 2236 | + loop_check_gen++; |
|---|
| 2237 | + full_check = 1; |
|---|
| 2085 | 2238 | if (is_file_epoll(tf.file)) { |
|---|
| 2086 | 2239 | error = -ELOOP; |
|---|
| 2087 | 2240 | if (ep_loop_check(ep, tf.file) != 0) |
|---|
| .. | .. |
|---|
| 2091 | 2244 | list_add(&tf.file->f_tfile_llink, |
|---|
| 2092 | 2245 | &tfile_check_list); |
|---|
| 2093 | 2246 | } |
|---|
| 2094 | | - mutex_lock_nested(&ep->mtx, 0); |
|---|
| 2247 | + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); |
|---|
| 2248 | + if (error) |
|---|
| 2249 | + goto error_tgt_fput; |
|---|
| 2095 | 2250 | if (is_file_epoll(tf.file)) { |
|---|
| 2096 | 2251 | tep = tf.file->private_data; |
|---|
| 2097 | | - mutex_lock_nested(&tep->mtx, 1); |
|---|
| 2252 | + error = epoll_mutex_lock(&tep->mtx, 1, nonblock); |
|---|
| 2253 | + if (error) { |
|---|
| 2254 | + mutex_unlock(&ep->mtx); |
|---|
| 2255 | + goto error_tgt_fput; |
|---|
| 2256 | + } |
|---|
| 2098 | 2257 | } |
|---|
| 2099 | 2258 | } |
|---|
| 2100 | 2259 | } |
|---|
| .. | .. |
|---|
| 2110 | 2269 | switch (op) { |
|---|
| 2111 | 2270 | case EPOLL_CTL_ADD: |
|---|
| 2112 | 2271 | if (!epi) { |
|---|
| 2113 | | - epds.events |= EPOLLERR | EPOLLHUP; |
|---|
| 2114 | | - error = ep_insert(ep, &epds, tf.file, fd, full_check); |
|---|
| 2272 | + epds->events |= EPOLLERR | EPOLLHUP; |
|---|
| 2273 | + error = ep_insert(ep, epds, tf.file, fd, full_check); |
|---|
| 2115 | 2274 | } else |
|---|
| 2116 | 2275 | error = -EEXIST; |
|---|
| 2117 | 2276 | break; |
|---|
| .. | .. |
|---|
| 2124 | 2283 | case EPOLL_CTL_MOD: |
|---|
| 2125 | 2284 | if (epi) { |
|---|
| 2126 | 2285 | if (!(epi->event.events & EPOLLEXCLUSIVE)) { |
|---|
| 2127 | | - epds.events |= EPOLLERR | EPOLLHUP; |
|---|
| 2128 | | - error = ep_modify(ep, epi, &epds); |
|---|
| 2286 | + epds->events |= EPOLLERR | EPOLLHUP; |
|---|
| 2287 | + error = ep_modify(ep, epi, epds); |
|---|
| 2129 | 2288 | } |
|---|
| 2130 | 2289 | } else |
|---|
| 2131 | 2290 | error = -ENOENT; |
|---|
| .. | .. |
|---|
| 2151 | 2310 | } |
|---|
| 2152 | 2311 | |
|---|
| 2153 | 2312 | /* |
|---|
| 2313 | + * The following function implements the controller interface for |
|---|
| 2314 | + * the eventpoll file that enables the insertion/removal/change of |
|---|
| 2315 | + * file descriptors inside the interest set. |
|---|
| 2316 | + */ |
|---|
| 2317 | +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, |
|---|
| 2318 | + struct epoll_event __user *, event) |
|---|
| 2319 | +{ |
|---|
| 2320 | + struct epoll_event epds; |
|---|
| 2321 | + |
|---|
| 2322 | + if (ep_op_has_event(op) && |
|---|
| 2323 | + copy_from_user(&epds, event, sizeof(struct epoll_event))) |
|---|
| 2324 | + return -EFAULT; |
|---|
| 2325 | + |
|---|
| 2326 | + return do_epoll_ctl(epfd, op, fd, &epds, false); |
|---|
| 2327 | +} |
|---|
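The syscall is now a thin wrapper that copies the struct epoll_event from userspace and calls do_epoll_ctl() with nonblock set to false, so the userspace contract is unchanged. A minimal end-to-end use of the interface it fronts (error handling trimmed):

```c
#include <sys/epoll.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	int epfd = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = STDIN_FILENO };
	struct epoll_event out;

	epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev);   /* reaches ep_insert() */

	if (epoll_wait(epfd, &out, 1, 5000) > 0)             /* wait up to 5 s      */
		printf("fd %d is readable\n", out.data.fd);

	epoll_ctl(epfd, EPOLL_CTL_DEL, STDIN_FILENO, NULL);  /* reaches ep_remove() */
	close(epfd);
	return 0;
}
```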
| 2328 | + |
|---|
| 2329 | +/* |
|---|
| 2154 | 2330 | * Implement the event wait interface for the eventpoll file. It is the kernel |
|---|
| 2155 | 2331 | * part of the user space epoll_wait(2). |
|---|
| 2156 | 2332 | */ |
|---|
| .. | .. |
|---|
| 2166 | 2342 | return -EINVAL; |
|---|
| 2167 | 2343 | |
|---|
| 2168 | 2344 | /* Verify that the area passed by the user is writeable */ |
|---|
| 2169 | | - if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) |
|---|
| 2345 | + if (!access_ok(events, maxevents * sizeof(struct epoll_event))) |
|---|
| 2170 | 2346 | return -EFAULT; |
|---|
| 2171 | 2347 | |
|---|
| 2172 | 2348 | /* Get the "struct file *" for the eventpoll file */ |
|---|
| .. | .. |
|---|
| 2211 | 2387 | size_t, sigsetsize) |
|---|
| 2212 | 2388 | { |
|---|
| 2213 | 2389 | int error; |
|---|
| 2214 | | - sigset_t ksigmask, sigsaved; |
|---|
| 2215 | 2390 | |
|---|
| 2216 | 2391 | /* |
|---|
| 2217 | 2392 | * If the caller wants a certain signal mask to be set during the wait, |
|---|
| 2218 | 2393 | * we apply it here. |
|---|
| 2219 | 2394 | */ |
|---|
| 2220 | | - if (sigmask) { |
|---|
| 2221 | | - if (sigsetsize != sizeof(sigset_t)) |
|---|
| 2222 | | - return -EINVAL; |
|---|
| 2223 | | - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) |
|---|
| 2224 | | - return -EFAULT; |
|---|
| 2225 | | - sigsaved = current->blocked; |
|---|
| 2226 | | - set_current_blocked(&ksigmask); |
|---|
| 2227 | | - } |
|---|
| 2395 | + error = set_user_sigmask(sigmask, sigsetsize); |
|---|
| 2396 | + if (error) |
|---|
| 2397 | + return error; |
|---|
| 2228 | 2398 | |
|---|
| 2229 | 2399 | error = do_epoll_wait(epfd, events, maxevents, timeout); |
|---|
| 2230 | | - |
|---|
| 2231 | | - /* |
|---|
| 2232 | | - * If we changed the signal mask, we need to restore the original one. |
|---|
| 2233 | | - * In case we've got a signal while waiting, we do not restore the |
|---|
| 2234 | | - * signal mask yet, and we allow do_signal() to deliver the signal on |
|---|
| 2235 | | - * the way back to userspace, before the signal mask is restored. |
|---|
| 2236 | | - */ |
|---|
| 2237 | | - if (sigmask) { |
|---|
| 2238 | | - if (error == -EINTR) { |
|---|
| 2239 | | - memcpy(¤t->saved_sigmask, &sigsaved, |
|---|
| 2240 | | - sizeof(sigsaved)); |
|---|
| 2241 | | - set_restore_sigmask(); |
|---|
| 2242 | | - } else |
|---|
| 2243 | | - set_current_blocked(&sigsaved); |
|---|
| 2244 | | - } |
|---|
| 2400 | + restore_saved_sigmask_unless(error == -EINTR); |
|---|
| 2245 | 2401 | |
|---|
| 2246 | 2402 | return error; |
|---|
| 2247 | 2403 | } |
|---|
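The open-coded mask juggling is replaced by the shared set_user_sigmask()/restore_saved_sigmask_unless() helpers; what userspace gets is the same atomic mask swap for the duration of the wait. An example of the contract from the caller's side (a sketch, assuming the thread normally keeps SIGINT blocked):

```c
#include <sys/epoll.h>
#include <signal.h>

/* Let SIGINT be delivered only while we are parked in the wait: the kernel
 * installs @during_wait before sleeping and restores the old mask on return
 * (deferring restoration when it has to deliver the signal). */
static int wait_allowing_sigint(int epfd, struct epoll_event *evs, int maxevents)
{
	sigset_t during_wait;

	sigfillset(&during_wait);
	sigdelset(&during_wait, SIGINT);

	return epoll_pwait(epfd, evs, maxevents, -1, &during_wait);
}
```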
| .. | .. |
|---|
| 2254 | 2410 | compat_size_t, sigsetsize) |
|---|
| 2255 | 2411 | { |
|---|
| 2256 | 2412 | long err; |
|---|
| 2257 | | - sigset_t ksigmask, sigsaved; |
|---|
| 2258 | 2413 | |
|---|
| 2259 | 2414 | /* |
|---|
| 2260 | 2415 | * If the caller wants a certain signal mask to be set during the wait, |
|---|
| 2261 | 2416 | * we apply it here. |
|---|
| 2262 | 2417 | */ |
|---|
| 2263 | | - if (sigmask) { |
|---|
| 2264 | | - if (sigsetsize != sizeof(compat_sigset_t)) |
|---|
| 2265 | | - return -EINVAL; |
|---|
| 2266 | | - if (get_compat_sigset(&ksigmask, sigmask)) |
|---|
| 2267 | | - return -EFAULT; |
|---|
| 2268 | | - sigsaved = current->blocked; |
|---|
| 2269 | | - set_current_blocked(&ksigmask); |
|---|
| 2270 | | - } |
|---|
| 2418 | + err = set_compat_user_sigmask(sigmask, sigsetsize); |
|---|
| 2419 | + if (err) |
|---|
| 2420 | + return err; |
|---|
| 2271 | 2421 | |
|---|
| 2272 | 2422 | err = do_epoll_wait(epfd, events, maxevents, timeout); |
|---|
| 2273 | | - |
|---|
| 2274 | | - /* |
|---|
| 2275 | | - * If we changed the signal mask, we need to restore the original one. |
|---|
| 2276 | | - * In case we've got a signal while waiting, we do not restore the |
|---|
| 2277 | | - * signal mask yet, and we allow do_signal() to deliver the signal on |
|---|
| 2278 | | - * the way back to userspace, before the signal mask is restored. |
|---|
| 2279 | | - */ |
|---|
| 2280 | | - if (sigmask) { |
|---|
| 2281 | | - if (err == -EINTR) { |
|---|
| 2282 | | - memcpy(¤t->saved_sigmask, &sigsaved, |
|---|
| 2283 | | - sizeof(sigsaved)); |
|---|
| 2284 | | - set_restore_sigmask(); |
|---|
| 2285 | | - } else |
|---|
| 2286 | | - set_current_blocked(&sigsaved); |
|---|
| 2287 | | - } |
|---|
| 2423 | + restore_saved_sigmask_unless(err == -EINTR); |
|---|
| 2288 | 2424 | |
|---|
| 2289 | 2425 | return err; |
|---|
| 2290 | 2426 | } |
|---|
| .. | .. |
|---|
| 2307 | 2443 | * inclusion loops checks. |
|---|
| 2308 | 2444 | */ |
|---|
| 2309 | 2445 | ep_nested_calls_init(&poll_loop_ncalls); |
|---|
| 2310 | | - |
|---|
| 2311 | | -#ifdef CONFIG_DEBUG_LOCK_ALLOC |
|---|
| 2312 | | - /* Initialize the structure used to perform safe poll wait head wake ups */ |
|---|
| 2313 | | - ep_nested_calls_init(&poll_safewake_ncalls); |
|---|
| 2314 | | -#endif |
|---|
| 2315 | 2446 | |
|---|
| 2316 | 2447 | /* |
|---|
| 2317 | 2448 | * We can have many thousands of epitems, so prevent this from |
|---|