From 9370bb92b2d16684ee45cf24e879c93c509162da Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Thu, 19 Dec 2024 01:47:39 +0000 Subject: [PATCH] add wifi6 8852be driver --- kernel/fs/eventpoll.c | 711 +++++++++++++++++++++++++++++++++++------------------------ 1 files changed, 423 insertions(+), 288 deletions(-) diff --git a/kernel/fs/eventpoll.c b/kernel/fs/eventpoll.c index 9a81b56..893047a 100644 --- a/kernel/fs/eventpoll.c +++ b/kernel/fs/eventpoll.c @@ -1,14 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * fs/eventpoll.c (Efficient event retrieval implementation) * Copyright (C) 2001,...,2009 Davide Libenzi * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * * Davide Libenzi <davidel@xmailserver.org> - * */ #include <linux/init.h> @@ -45,16 +40,18 @@ #include <linux/rculist.h> #include <net/busy_poll.h> +#include <trace/hooks/fs.h> + /* * LOCKING: * There are three level of locking required by epoll : * * 1) epmutex (mutex) * 2) ep->mtx (mutex) - * 3) ep->wq.lock (spinlock) + * 3) ep->lock (rwlock) * * The acquire order is the one listed above, from 1 to 3. - * We need a spinlock (ep->wq.lock) because we manipulate objects + * We need a rwlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need @@ -86,7 +83,7 @@ * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global - * mutex "epmutex" (together with "ep->wq.lock") to have it working, + * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epmutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee @@ -183,8 +180,6 @@ * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. - * - * Access to it is protected by the lock inside wq. */ struct eventpoll { /* @@ -204,13 +199,16 @@ /* List of ready file descriptors */ struct list_head rdllist; + /* Lock which protects rdllist and ovflist */ + rwlock_t lock; + /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out - * holding ->wq.lock. + * holding ->lock. 
*/ struct epitem *ovflist; @@ -228,6 +226,11 @@ #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* tracks wakeup nests for lockdep validation */ + u8 nests; #endif }; @@ -294,7 +297,7 @@ #include <linux/sysctl.h> -static long zero; +static long long_zero; static long long_max = LONG_MAX; struct ctl_table epoll_table[] = { @@ -304,7 +307,7 @@ .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero, + .extra1 = &long_zero, .extra2 = &long_max, }, { } @@ -357,12 +360,6 @@ return container_of(p, struct ep_pqueue, pt)->epi; } -/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ -static inline int ep_op_has_event(int op) -{ - return op != EPOLL_CTL_DEL; -} - /* Initialize the poll safe wake up structure */ static void ep_nested_calls_init(struct nested_calls *ncalls) { @@ -380,7 +377,8 @@ */ static inline int ep_events_available(struct eventpoll *ep) { - return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; + return !list_empty_careful(&ep->rdllist) || + READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } #ifdef CONFIG_NET_RX_BUSY_POLL @@ -470,7 +468,6 @@ * no re-entered. * * @ncalls: Pointer to the nested_calls structure to be used for this call. - * @max_nests: Maximum number of allowed nesting calls. * @nproc: Nested call core function pointer. * @priv: Opaque data to be passed to the @nproc callback. * @cookie: Cookie to be used to identify this nested call. @@ -479,7 +476,7 @@ * Returns: Returns the code returned by the @nproc callback, or -1 if * the maximum recursion limit has been exceeded. */ -static int ep_call_nested(struct nested_calls *ncalls, int max_nests, +static int ep_call_nested(struct nested_calls *ncalls, int (*nproc)(void *, void *, int), void *priv, void *cookie, void *ctx) { @@ -498,7 +495,7 @@ */ list_for_each_entry(tncur, lsthead, llink) { if (tncur->ctx == ctx && - (tncur->cookie == cookie || ++call_nests > max_nests)) { + (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) { /* * Ops ... loop detected or maximum nest level reached. * We abort this wake by breaking the cycle itself. @@ -554,35 +551,49 @@ */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct nested_calls poll_safewake_ncalls; - -static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, + unsigned pollflags) { + struct eventpoll *ep_src; unsigned long flags; - wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; + u8 nests = 0; - spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); - wake_up_locked_poll(wqueue, EPOLLIN); - spin_unlock_irqrestore(&wqueue->lock, flags); - - return 0; -} - -static void ep_poll_safewake(wait_queue_head_t *wq) -{ - int this_cpu = get_cpu(); - - ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, - ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); - - put_cpu(); + /* + * To set the subclass or nesting level for spin_lock_irqsave_nested() + * it might be natural to create a per-cpu nest count. However, since + * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can + * schedule() in the -rt kernel, the per-cpu variable are no longer + * protected. Thus, we are introducing a per eventpoll nest field. + * If we are not being call from ep_poll_callback(), epi is NULL and + * we are at the first level of nesting, 0. 
Otherwise, we are being + * called from ep_poll_callback() and if a previous wakeup source is + * not an epoll file itself, we are at depth 1 since the wakeup source + * is depth 0. If the wakeup source is a previous epoll file in the + * wakeup chain then we use its nests value and record ours as + * nests + 1. The previous epoll file nests value is stable since its + * already holding its own poll_wait.lock. + */ + if (epi) { + if ((is_file_epoll(epi->ffd.file))) { + ep_src = epi->ffd.file->private_data; + nests = ep_src->nests; + } else { + nests = 1; + } + } + spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); + ep->nests = nests + 1; + wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); + ep->nests = 0; + spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else -static void ep_poll_safewake(wait_queue_head_t *wq) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, + unsigned pollflags) { - wake_up_poll(wq, EPOLLIN); + wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif @@ -674,7 +685,6 @@ void *priv, int depth, bool ep_locked) { __poll_t res; - int pwake = 0; struct epitem *epi, *nepi; LIST_HEAD(txlist); @@ -696,23 +706,23 @@ * because we want the "sproc" callback to be able to do it * in a lockless way. */ - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, &txlist); - ep->ovflist = NULL; - spin_unlock_irq(&ep->wq.lock); + WRITE_ONCE(ep->ovflist, NULL); + write_unlock_irq(&ep->lock); /* * Now call the callback function. */ res = (*sproc)(ep, &txlist, priv); - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ - for (nepi = ep->ovflist; (epi = nepi) != NULL; + for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. @@ -721,7 +731,11 @@ * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(epi)) { - list_add_tail(&epi->rdllink, &ep->rdllist); + /* + * ->ovflist is LIFO, so we have to reverse it in order + * to keep in FIFO. + */ + list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } @@ -730,7 +744,7 @@ * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ - ep->ovflist = EP_UNACTIVE_PTR; + WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); /* * Quickly re-inject items left on "txlist". @@ -739,23 +753,14 @@ __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { - /* - * Wake up (if active) both the eventpoll wait list and - * the ->poll() wait list (delayed after we release the lock). 
- */ if (waitqueue_active(&ep->wq)) - wake_up_locked(&ep->wq); - if (waitqueue_active(&ep->poll_wait)) - pwake++; + wake_up(&ep->wq); } - spin_unlock_irq(&ep->wq.lock); + + write_unlock_irq(&ep->lock); if (!ep_locked) mutex_unlock(&ep->mtx); - - /* We have to call this outside the lock */ - if (pwake) - ep_poll_safewake(&ep->poll_wait); return res; } @@ -788,10 +793,10 @@ rb_erase_cached(&epi->rbn, &ep->rbr); - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* @@ -815,7 +820,7 @@ /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL, 0); /* * We need to lock this because we could be hit by @@ -841,7 +846,7 @@ * Walks through the whole tree by freeing each "struct epitem". At this * point we are sure no poll callbacks will be lingering around, and also by * holding "epmutex" we can be sure that no file cleanup code will hit - * us during this operation. So we can avoid the lock on "ep->wq.lock". + * us during this operation. So we can avoid the lock on "ep->lock". * We do not need to lock ep->mtx, either, we only do it to prevent * a lockdep warning. */ @@ -1022,6 +1027,7 @@ goto free_uid; mutex_init(&ep->mtx); + rwlock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); @@ -1067,7 +1073,7 @@ return epir; } -#ifdef CONFIG_CHECKPOINT_RESTORE +#ifdef CONFIG_KCMP static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) { struct rb_node *rbp; @@ -1109,23 +1115,113 @@ return file_raw; } -#endif /* CONFIG_CHECKPOINT_RESTORE */ +#endif /* CONFIG_KCMP */ + +/** + * Adds a new entry to the tail of the list in a lockless way, i.e. + * multiple CPUs are allowed to call this function concurrently. + * + * Beware: it is necessary to prevent any other modifications of the + * existing list until all changes are completed, in other words + * concurrent list_add_tail_lockless() calls should be protected + * with a read lock, where write lock acts as a barrier which + * makes sure all list_add_tail_lockless() calls are fully + * completed. + * + * Also an element can be locklessly added to the list only in one + * direction i.e. either to the tail either to the head, otherwise + * concurrent access will corrupt the list. + * + * Returns %false if element has been already added to the list, %true + * otherwise. + */ +static inline bool list_add_tail_lockless(struct list_head *new, + struct list_head *head) +{ + struct list_head *prev; + + /* + * This is simple 'new->next = head' operation, but cmpxchg() + * is used in order to detect that same element has been just + * added to the list from another CPU: the winner observes + * new->next == new. + */ + if (cmpxchg(&new->next, new, head) != new) + return false; + + /* + * Initially ->next of a new element must be updated with the head + * (we are inserting to the tail) and only then pointers are atomically + * exchanged. XCHG guarantees memory ordering, thus ->next should be + * updated before pointers are actually swapped and pointers are + * swapped before prev->next is updated. + */ + + prev = xchg(&head->prev, new); + + /* + * It is safe to modify prev->next and new->prev, because a new element + * is added only to the tail and new->next is updated before XCHG. 
+ */ + + prev->next = new; + new->prev = prev; + + return true; +} + +/** + * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, + * i.e. multiple CPUs are allowed to call this function concurrently. + * + * Returns %false if epi element has been already chained, %true otherwise. + */ +static inline bool chain_epi_lockless(struct epitem *epi) +{ + struct eventpoll *ep = epi->ep; + + /* Fast preliminary check */ + if (epi->next != EP_UNACTIVE_PTR) + return false; + + /* Check that the same epi has not been just chained from another CPU */ + if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) + return false; + + /* Atomically exchange tail */ + epi->next = xchg(&ep->ovflist, epi); + + return true; +} /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. + * + * This callback takes a read lock in order not to content with concurrent + * events from another file descriptors, thus all modifications to ->rdllist + * or ->ovflist are lockless. Read lock is paired with the write lock from + * ep_scan_ready_list(), which stops all list modifications and guarantees + * that lists state is seen correctly. + * + * Another thing worth to mention is that ep_poll_callback() can be called + * concurrently for the same @epi from different CPUs if poll table was inited + * with several wait queues entries. Plural wakeup from different CPUs of a + * single wait queue is serialized by wq.lock, but the case when multiple wait + * queues are used should be detected accordingly. This is detected using + * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; - unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; __poll_t pollflags = key_to_poll(key); + unsigned long flags; int ewake = 0; - spin_lock_irqsave(&ep->wq.lock, flags); + read_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1153,26 +1249,13 @@ * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ - if (ep->ovflist != EP_UNACTIVE_PTR) { - if (epi->next == EP_UNACTIVE_PTR) { - epi->next = ep->ovflist; - ep->ovflist = epi; - if (epi->ws) { - /* - * Activate ep->ws since epi->ws may get - * deactivated at any time. - */ - __pm_stay_awake(ep->ws); - } - - } - goto out_unlock; - } - - /* If this file is already in the ready list we exit soon */ - if (!ep_is_linked(epi)) { - list_add_tail(&epi->rdllink, &ep->rdllist); - ep_pm_stay_awake_rcu(epi); + if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { + if (chain_epi_lockless(epi)) + ep_pm_stay_awake_rcu(epi); + } else if (!ep_is_linked(epi)) { + /* In the usual case, add event to ready list. 
*/ + if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) + ep_pm_stay_awake_rcu(epi); } /* @@ -1196,17 +1279,17 @@ break; } } - wake_up_locked(&ep->wq); + wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: - spin_unlock_irqrestore(&ep->wq.lock, flags); + read_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; @@ -1332,7 +1415,6 @@ } } else { error = ep_call_nested(&poll_loop_ncalls, - EP_MAX_NESTS, reverse_path_check_proc, child_file, child_file, current); @@ -1366,7 +1448,7 @@ /* let's call this for all tfiles */ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { path_count_init(); - error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + error = ep_call_nested(&poll_loop_ncalls, reverse_path_check_proc, current_file, current_file, current); if (error) @@ -1379,15 +1461,20 @@ { struct name_snapshot n; struct wakeup_source *ws; + char ws_name[64]; + strlcpy(ws_name, "eventpoll", sizeof(ws_name)); + trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name)); if (!epi->ep->ws) { - epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); + epi->ep->ws = wakeup_source_register(NULL, ws_name); if (!epi->ep->ws) return -ENOMEM; } take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); - ws = wakeup_source_register(NULL, n.name); + strlcpy(ws_name, n.name.name, sizeof(ws_name)); + trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name)); + ws = wakeup_source_register(NULL, ws_name); release_dentry_name_snapshot(&n); if (!ws) @@ -1489,7 +1576,7 @@ goto error_unregister; /* We have to drop the new item inside our item list to keep track of it */ - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); @@ -1501,18 +1588,18 @@ /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) - wake_up_locked(&ep->wq); + wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); atomic_long_inc(&ep->user->epoll_watches); /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL, 0); return 0; @@ -1531,10 +1618,10 @@ * list, since that is used/cleaned only inside a section bound by "mtx". * And ep_insert() is called with "mtx" held. */ - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); @@ -1578,9 +1665,9 @@ * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). - * We need this because we did not take ep->wq.lock while + * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take - * ep->wq.lock). + * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also @@ -1599,23 +1686,23 @@ * list, push it inside. 
*/ if (ep_item_poll(epi, &pt, 1)) { - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) - wake_up_locked(&ep->wq); + wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL, 0); return 0; } @@ -1625,21 +1712,24 @@ { struct ep_send_events_data *esed = priv; __poll_t revents; - struct epitem *epi; - struct epoll_event __user *uevent; + struct epitem *epi, *tmp; + struct epoll_event __user *uevent = esed->events; struct wakeup_source *ws; poll_table pt; init_poll_funcptr(&pt, NULL); + esed->res = 0; /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop because ep_scan_ready_list() is * holding "mtx" during this call. */ - for (esed->res = 0, uevent = esed->events; - !list_empty(head) && esed->res < esed->maxevents;) { - epi = list_first_entry(head, struct epitem, rdllink); + lockdep_assert_held(&ep->mtx); + + list_for_each_entry_safe(epi, tmp, head, rdllink) { + if (esed->res >= esed->maxevents) + break; /* * Activate ep->ws before deactivating epi->ws to prevent @@ -1659,42 +1749,42 @@ list_del_init(&epi->rdllink); - revents = ep_item_poll(epi, &pt, 1); - /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, ep_scan_ready_list() - * is holding "mtx", so no operations coming from userspace + * is holding ep->mtx, so no operations coming from userspace * can change the item. */ - if (revents) { - if (__put_user(revents, &uevent->events) || - __put_user(epi->event.data, &uevent->data)) { - list_add(&epi->rdllink, head); - ep_pm_stay_awake(epi); - if (!esed->res) - esed->res = -EFAULT; - return 0; - } - esed->res++; - uevent++; - if (epi->event.events & EPOLLONESHOT) - epi->event.events &= EP_PRIVATE_BITS; - else if (!(epi->event.events & EPOLLET)) { - /* - * If this file has been added with Level - * Trigger mode, we need to insert back inside - * the ready list, so that the next call to - * epoll_wait() will check again the events - * availability. At this point, no one can insert - * into ep->rdllist besides us. The epoll_ctl() - * callers are locked out by - * ep_scan_ready_list() holding "mtx" and the - * poll callback will queue them in ep->ovflist. - */ - list_add_tail(&epi->rdllink, &ep->rdllist); - ep_pm_stay_awake(epi); - } + revents = ep_item_poll(epi, &pt, 1); + if (!revents) + continue; + + if (__put_user(revents, &uevent->events) || + __put_user(epi->event.data, &uevent->data)) { + list_add(&epi->rdllink, head); + ep_pm_stay_awake(epi); + if (!esed->res) + esed->res = -EFAULT; + return 0; + } + esed->res++; + uevent++; + if (epi->event.events & EPOLLONESHOT) + epi->event.events &= EP_PRIVATE_BITS; + else if (!(epi->event.events & EPOLLET)) { + /* + * If this file has been added with Level + * Trigger mode, we need to insert back inside + * the ready list, so that the next call to + * epoll_wait() will check again the events + * availability. At this point, no one can insert + * into ep->rdllist besides us. The epoll_ctl() + * callers are locked out by + * ep_scan_ready_list() holding "mtx" and the + * poll callback will queue them in ep->ovflist. 
+ */ + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake(epi); } } @@ -1722,6 +1812,25 @@ ktime_get_ts64(&now); return timespec64_add_safe(now, ts); +} + +/* + * autoremove_wake_function, but remove even on failure to wake up, because we + * know that default_wake_function/ttwu will only fail if the thread is already + * woken, and in that case the ep_poll loop will remove the entry anyways, not + * try to reuse it. + */ +static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, + unsigned int mode, int sync, void *key) +{ + int ret = default_wake_function(wq_entry, mode, sync, key); + + /* + * Pairs with list_empty_careful in ep_poll, and ensures future loop + * iterations see the cause of this wakeup. + */ + list_del_init_careful(&wq_entry->entry); + return ret; } /** @@ -1760,11 +1869,18 @@ } else if (timeout == 0) { /* * Avoid the unnecessary trip to the wait queue loop, if the - * caller specified a non blocking operation. + * caller specified a non blocking operation. We still need + * lock because we could race and not see an epi being added + * to the ready list while in irq callback. Thus incorrectly + * returning 0 back to userspace. */ timed_out = 1; - spin_lock_irq(&ep->wq.lock); - goto check_events; + + write_lock_irq(&ep->lock); + eavail = ep_events_available(ep); + write_unlock_irq(&ep->lock); + + goto send_events; } fetch_events: @@ -1772,65 +1888,100 @@ if (!ep_events_available(ep)) ep_busy_loop(ep, timed_out); - spin_lock_irq(&ep->wq.lock); - - if (!ep_events_available(ep)) { - /* - * Busy poll timed out. Drop NAPI ID for now, we can add - * it back in when we have moved a socket with a valid NAPI - * ID onto the ready list. - */ - ep_reset_busy_poll_napi_id(ep); - - /* - * We don't have any available event to return to the caller. - * We need to sleep here, and we will be wake up by - * ep_poll_callback() when events will become available. - */ - init_waitqueue_entry(&wait, current); - __add_wait_queue_exclusive(&ep->wq, &wait); - - for (;;) { - /* - * We don't want to sleep if the ep_poll_callback() sends us - * a wakeup in between. That's why we set the task state - * to TASK_INTERRUPTIBLE before doing the checks. - */ - set_current_state(TASK_INTERRUPTIBLE); - /* - * Always short-circuit for fatal signals to allow - * threads to make a timely exit without the chance of - * finding more events available and fetching - * repeatedly. - */ - if (fatal_signal_pending(current)) { - res = -EINTR; - break; - } - if (ep_events_available(ep) || timed_out) - break; - if (signal_pending(current)) { - res = -EINTR; - break; - } - - spin_unlock_irq(&ep->wq.lock); - if (!freezable_schedule_hrtimeout_range(to, slack, - HRTIMER_MODE_ABS)) - timed_out = 1; - - spin_lock_irq(&ep->wq.lock); - } - - __remove_wait_queue(&ep->wq, &wait); - __set_current_state(TASK_RUNNING); - } -check_events: - /* Is it worth to try to dig for events ? */ eavail = ep_events_available(ep); + if (eavail) + goto send_events; - spin_unlock_irq(&ep->wq.lock); + /* + * Busy poll timed out. Drop NAPI ID for now, we can add + * it back in when we have moved a socket with a valid NAPI + * ID onto the ready list. + */ + ep_reset_busy_poll_napi_id(ep); + do { + /* + * Internally init_wait() uses autoremove_wake_function(), + * thus wait entry is removed from the wait queue on each + * wakeup. Why it is important? In case of several waiters + * each new wakeup will hit the next waiter, giving it the + * chance to harvest new event. Otherwise wakeup can be + * lost. 
This is also good performance-wise, because on + * normal wakeup path no need to call __remove_wait_queue() + * explicitly, thus ep->lock is not taken, which halts the + * event delivery. + * + * In fact, we now use an even more aggressive function that + * unconditionally removes, because we don't reuse the wait + * entry between loop iterations. This lets us also avoid the + * performance issue if a process is killed, causing all of its + * threads to wake up without being removed normally. + */ + init_wait(&wait); + wait.func = ep_autoremove_wake_function; + + write_lock_irq(&ep->lock); + /* + * Barrierless variant, waitqueue_active() is called under + * the same lock on wakeup ep_poll_callback() side, so it + * is safe to avoid an explicit barrier. + */ + __set_current_state(TASK_INTERRUPTIBLE); + + /* + * Do the final check under the lock. ep_scan_ready_list() + * plays with two lists (->rdllist and ->ovflist) and there + * is always a race when both lists are empty for short + * period of time although events are pending, so lock is + * important. + */ + eavail = ep_events_available(ep); + if (!eavail) { + if (signal_pending(current)) + res = -EINTR; + else + __add_wait_queue_exclusive(&ep->wq, &wait); + } + write_unlock_irq(&ep->lock); + + if (!eavail && !res) + timed_out = !freezable_schedule_hrtimeout_range(to, slack, + HRTIMER_MODE_ABS); + + /* + * We were woken up, thus go and try to harvest some events. + * If timed out and still on the wait queue, recheck eavail + * carefully under lock, below. + */ + eavail = 1; + } while (0); + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait.entry)) { + write_lock_irq(&ep->lock); + /* + * If the thread timed out and is not on the wait queue, it + * means that the thread was woken up after its timeout expired + * before it could reacquire the lock. Thus, when wait.entry is + * empty, it needs to harvest events. + */ + if (timed_out) + eavail = list_empty(&wait.entry); + __remove_wait_queue(&ep->wq, &wait); + write_unlock_irq(&ep->lock); + } + +send_events: + if (fatal_signal_pending(current)) { + /* + * Always short-circuit for fatal signals to allow + * threads to make a timely exit without the chance of + * finding more events available and fetching + * repeatedly. + */ + res = -EINTR; + } /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of @@ -1875,7 +2026,7 @@ ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; - error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + error = ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, epi->ffd.file, ep_tovisit, current); if (error != 0) @@ -1914,7 +2065,7 @@ */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + return ep_call_nested(&poll_loop_ncalls, ep_loop_check_proc, file, ep, current); } @@ -1991,26 +2142,27 @@ return do_epoll_create(0); } -/* - * The following function implements the controller interface for - * the eventpoll file that enables the insertion/removal/change of - * file descriptors inside the interest set. 
- */ -SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, - struct epoll_event __user *, event) +static inline int epoll_mutex_lock(struct mutex *mutex, int depth, + bool nonblock) +{ + if (!nonblock) { + mutex_lock_nested(mutex, depth); + return 0; + } + if (mutex_trylock(mutex)) + return 0; + return -EAGAIN; +} + +int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, + bool nonblock) { int error; int full_check = 0; struct fd f, tf; struct eventpoll *ep; struct epitem *epi; - struct epoll_event epds; struct eventpoll *tep = NULL; - - error = -EFAULT; - if (ep_op_has_event(op) && - copy_from_user(&epds, event, sizeof(struct epoll_event))) - goto error_return; error = -EBADF; f = fdget(epfd); @@ -2029,7 +2181,7 @@ /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) - ep_take_care_of_epollwakeup(&epds); + ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor @@ -2045,11 +2197,11 @@ * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ - if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) { + if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || - (epds.events & ~EPOLLEXCLUSIVE_OK_BITS))) + (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } @@ -2074,14 +2226,19 @@ * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ - mutex_lock_nested(&ep->mtx, 0); + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); + if (error) + goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (!list_empty(&f.file->f_ep_links) || ep->gen == loop_check_gen || is_file_epoll(tf.file)) { - full_check = 1; mutex_unlock(&ep->mtx); - mutex_lock(&epmutex); + error = epoll_mutex_lock(&epmutex, 0, nonblock); + if (error) + goto error_tgt_fput; + loop_check_gen++; + full_check = 1; if (is_file_epoll(tf.file)) { error = -ELOOP; if (ep_loop_check(ep, tf.file) != 0) @@ -2091,10 +2248,16 @@ list_add(&tf.file->f_tfile_llink, &tfile_check_list); } - mutex_lock_nested(&ep->mtx, 0); + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); + if (error) + goto error_tgt_fput; if (is_file_epoll(tf.file)) { tep = tf.file->private_data; - mutex_lock_nested(&tep->mtx, 1); + error = epoll_mutex_lock(&tep->mtx, 1, nonblock); + if (error) { + mutex_unlock(&ep->mtx); + goto error_tgt_fput; + } } } } @@ -2110,8 +2273,8 @@ switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= EPOLLERR | EPOLLHUP; - error = ep_insert(ep, &epds, tf.file, fd, full_check); + epds->events |= EPOLLERR | EPOLLHUP; + error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; break; @@ -2124,8 +2287,8 @@ case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { - epds.events |= EPOLLERR | EPOLLHUP; - error = ep_modify(ep, epi, &epds); + epds->events |= EPOLLERR | EPOLLHUP; + error = ep_modify(ep, epi, epds); } } else error = -ENOENT; @@ -2151,6 +2314,23 @@ } /* + * The following function implements the controller interface for + * the eventpoll file that enables the insertion/removal/change of + * file descriptors inside the interest set. 
+ */ +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) +{ + struct epoll_event epds; + + if (ep_op_has_event(op) && + copy_from_user(&epds, event, sizeof(struct epoll_event))) + return -EFAULT; + + return do_epoll_ctl(epfd, op, fd, &epds, false); +} + +/* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ @@ -2166,7 +2346,7 @@ return -EINVAL; /* Verify that the area passed by the user is writeable */ - if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) + if (!access_ok(events, maxevents * sizeof(struct epoll_event))) return -EFAULT; /* Get the "struct file *" for the eventpoll file */ @@ -2211,37 +2391,17 @@ size_t, sigsetsize) { int error; - sigset_t ksigmask, sigsaved; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ - if (sigmask) { - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) - return -EFAULT; - sigsaved = current->blocked; - set_current_blocked(&ksigmask); - } + error = set_user_sigmask(sigmask, sigsetsize); + if (error) + return error; error = do_epoll_wait(epfd, events, maxevents, timeout); - - /* - * If we changed the signal mask, we need to restore the original one. - * In case we've got a signal while waiting, we do not restore the - * signal mask yet, and we allow do_signal() to deliver the signal on - * the way back to userspace, before the signal mask is restored. - */ - if (sigmask) { - if (error == -EINTR) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } else - set_current_blocked(&sigsaved); - } + restore_saved_sigmask_unless(error == -EINTR); return error; } @@ -2254,37 +2414,17 @@ compat_size_t, sigsetsize) { long err; - sigset_t ksigmask, sigsaved; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (get_compat_sigset(&ksigmask, sigmask)) - return -EFAULT; - sigsaved = current->blocked; - set_current_blocked(&ksigmask); - } + err = set_compat_user_sigmask(sigmask, sigsetsize); + if (err) + return err; err = do_epoll_wait(epfd, events, maxevents, timeout); - - /* - * If we changed the signal mask, we need to restore the original one. - * In case we've got a signal while waiting, we do not restore the - * signal mask yet, and we allow do_signal() to deliver the signal on - * the way back to userspace, before the signal mask is restored. - */ - if (sigmask) { - if (err == -EINTR) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } else - set_current_blocked(&sigsaved); - } + restore_saved_sigmask_unless(err == -EINTR); return err; } @@ -2307,11 +2447,6 @@ * inclusion loops checks. */ ep_nested_calls_init(&poll_loop_ncalls); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* Initialize the structure used to perform safe poll wait head wake ups */ - ep_nested_calls_init(&poll_safewake_ncalls); -#endif /* * We can have many thousands of epitems, so prevent this from -- Gitblit v1.6.2