From cde9070d9970eef1f7ec2360586c802a16230ad8 Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Fri, 10 May 2024 07:43:50 +0000 Subject: [PATCH] rtl88x2CE_WiFi_linux driver --- kernel/kernel/locking/qspinlock.c | 130 ++++++++++++++++++++++++++++++------------- 1 files changed, 90 insertions(+), 40 deletions(-) diff --git a/kernel/kernel/locking/qspinlock.c b/kernel/kernel/locking/qspinlock.c index edd75e0..cbff6ba 100644 --- a/kernel/kernel/locking/qspinlock.c +++ b/kernel/kernel/locking/qspinlock.c @@ -1,15 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Queued spinlock - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2014,2018 Red Hat, Inc. @@ -40,14 +31,15 @@ /* * The basic principle of a queue-based spinlock can best be understood * by studying a classic queue-based spinlock implementation called the - * MCS lock. The paper below provides a good description for this kind - * of lock. + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and + * Scott") is available at * - * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 * - * This queued spinlock implementation is based on the MCS lock, however to make - * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing - * API, we must modify it somehow. + * This queued spinlock implementation is based on the MCS lock, however to + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its + * existing API, we must modify it somehow. * * In particular; where the traditional MCS lock consists of a tail pointer * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to @@ -74,12 +66,24 @@ */ #include "mcs_spinlock.h" - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -#define MAX_NODES 8 -#else #define MAX_NODES 4 + +/* + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in + * size and four of them will fit nicely in one 64-byte cacheline. For + * pvqspinlock, however, we need more space for extra data. To accommodate + * that, we insert two more long words to pad it up to 32 bytes. IOW, only + * two of them can fit in a cacheline in this case. That is OK as it is rare + * to have more than 2 levels of slowpath nesting in actual use. We don't + * want to penalize pvqspinlocks to optimize for a rare case in native + * qspinlocks. + */ +struct qnode { + struct mcs_spinlock mcs; +#ifdef CONFIG_PARAVIRT_SPINLOCKS + long reserved[2]; #endif +}; /* * The pending bit spinning loop count. @@ -101,7 +105,7 @@ * * PV doubles the storage and uses the second cacheline for PV state. */ -static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); +static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]); /* * We must be able to distinguish between no-tail and the tail at 0:0, @@ -112,9 +116,6 @@ { u32 tail; -#ifdef CONFIG_DEBUG_SPINLOCK - BUG_ON(idx > 3); -#endif tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ @@ -126,7 +127,13 @@ int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; - return per_cpu_ptr(&mcs_nodes[idx], cpu); + return per_cpu_ptr(&qnodes[idx].mcs, cpu); +} + +static inline __pure +struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) +{ + return &((struct qnode *)base + idx)->mcs; } #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) @@ -340,17 +347,23 @@ /* * trylock || pending * - * 0,0,0 -> 0,0,1 ; trylock - * 0,0,1 -> 0,1,1 ; pending + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock */ val = queued_fetch_set_pending_acquire(lock); /* - * If we observe any contention; undo and queue. + * If we observe contention, there is a concurrent locker. + * + * Undo and queue; our setting of PENDING might have made the + * n,0,0 -> 0,0,0 transition fail and it will now be waiting + * on @next to become !NULL. */ if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ if (!(val & _Q_PENDING_MASK)) clear_pending(lock); + goto queue; } @@ -374,7 +387,7 @@ * 0,1,0 -> 0,0,1 */ clear_pending_set_locked(lock); - qstat_inc(qstat_lock_pending, true); + lockevent_inc(lock_pending); return; /* @@ -382,13 +395,34 @@ * queuing. */ queue: - qstat_inc(qstat_lock_slowpath, true); + lockevent_inc(lock_slowpath); pv_queue: - node = this_cpu_ptr(&mcs_nodes[0]); + node = this_cpu_ptr(&qnodes[0].mcs); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); - node += idx; + /* + * 4 nodes are allocated based on the assumption that there will + * not be nested NMIs taking spinlocks. That may not be true in + * some architectures even though the chance of needing more than + * 4 nodes will still be extremely unlikely. When that happens, + * we fall back to spinning on the lock directly without using + * any MCS node. This is not the most elegant solution, but is + * simple enough. + */ + if (unlikely(idx >= MAX_NODES)) { + lockevent_inc(lock_no_node); + while (!queued_spin_trylock(lock)) + cpu_relax(); + goto release; + } + + node = grab_mcs_node(node, idx); + + /* + * Keep counts of non-zero index values: + */ + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); /* * Ensure that we increment the head node->count before initialising @@ -489,16 +523,25 @@ */ /* - * In the PV case we might already have _Q_LOCKED_VAL set. + * In the PV case we might already have _Q_LOCKED_VAL set, because + * of lock stealing; therefore we must also allow: * - * The atomic_cond_read_acquire() call above has provided the - * necessary acquire semantics required for locking. + * n,0,1 -> 0,0,1 + * + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the + * above wait condition, therefore any concurrent setting of + * PENDING will make the uncontended transition fail. */ - if (((val & _Q_TAIL_MASK) == tail) && - atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) - goto release; /* No contention */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + } - /* Either somebody is queued behind us or _Q_PENDING_VAL is set */ + /* + * Either somebody is queued behind us or _Q_PENDING_VAL got set + * which will then detect the remaining tail and queue behind us + * ensuring we'll see a @next. + */ set_locked(lock); /* @@ -514,7 +557,7 @@ /* * release the node */ - __this_cpu_dec(mcs_nodes[0].count); + __this_cpu_dec(qnodes[0].mcs.count); } EXPORT_SYMBOL(queued_spin_lock_slowpath); @@ -538,4 +581,11 @@ #include "qspinlock_paravirt.h" #include "qspinlock.c" +bool nopvspin __initdata; +static __init int parse_nopvspin(char *arg) +{ + nopvspin = true; + return 0; +} +early_param("nopvspin", parse_nopvspin); #endif -- Gitblit v1.6.2