hc
2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/kernel/locking/qspinlock.c
@@ -1,15 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Queued spinlock
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  *
  * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
  * (C) Copyright 2013-2014,2018 Red Hat, Inc.
@@ -40,14 +31,15 @@
 /*
  * The basic principle of a queue-based spinlock can best be understood
  * by studying a classic queue-based spinlock implementation called the
- * MCS lock. The paper below provides a good description for this kind
- * of lock.
+ * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
+ * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
+ * Scott") is available at
  *
- * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206115
  *
- * This queued spinlock implementation is based on the MCS lock, however to make
- * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
- * API, we must modify it somehow.
+ * This queued spinlock implementation is based on the MCS lock, however to
+ * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
+ * existing API, we must modify it somehow.
  *
  * In particular; where the traditional MCS lock consists of a tail pointer
  * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
@@ -74,12 +66,24 @@
  */
 
 #include "mcs_spinlock.h"
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-#define MAX_NODES	8
-#else
 #define MAX_NODES	4
+
+/*
+ * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
+ * size and four of them will fit nicely in one 64-byte cacheline. For
+ * pvqspinlock, however, we need more space for extra data. To accommodate
+ * that, we insert two more long words to pad it up to 32 bytes. IOW, only
+ * two of them can fit in a cacheline in this case. That is OK as it is rare
+ * to have more than 2 levels of slowpath nesting in actual use. We don't
+ * want to penalize pvqspinlocks to optimize for a rare case in native
+ * qspinlocks.
+ */
+struct qnode {
+	struct mcs_spinlock mcs;
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+	long reserved[2];
 #endif
+};
 
 /*
  * The pending bit spinning loop count.
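The sizing argument in the new comment is easy to check mechanically. A minimal user-space sketch, assuming an LP64 target and the three-field mcs_spinlock layout from mcs_spinlock.h (next pointer plus two ints); the asserts are illustrative and not part of the patch:

#include <assert.h>

struct mcs_spinlock {			/* mirrors mcs_spinlock.h */
	struct mcs_spinlock *next;
	int locked;
	int count;
};

struct qnode {				/* mirrors the PV variant added above */
	struct mcs_spinlock mcs;
	long reserved[2];
};

int main(void)
{
	static_assert(sizeof(struct mcs_spinlock) == 16,
		      "four plain nodes per 64-byte cacheline");
	static_assert(sizeof(struct qnode) == 32,
		      "two padded PV nodes per 64-byte cacheline");
	return 0;
}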
@@ -101,7 +105,7 @@
  *
  * PV doubles the storage and uses the second cacheline for PV state.
  */
-static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
 
 /*
  * We must be able to distinguish between no-tail and the tail at 0:0,
@@ -112,9 +116,6 @@
 {
 	u32 tail;
 
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(idx > 3);
-#endif
 	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
 	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
 
@@ -126,7 +127,13 @@
 	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
 	int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
 
-	return per_cpu_ptr(&mcs_nodes[idx], cpu);
+	return per_cpu_ptr(&qnodes[idx].mcs, cpu);
+}
+
+static inline __pure
+struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
+{
+	return &((struct qnode *)base + idx)->mcs;
 }
 
 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
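A small stand-alone sketch of the tail word handling, with the _Q_* values copied from the common CONFIG_NR_CPUS < 16K layout in qspinlock_types.h (an assumption of this sketch, not something the patch changes). encode_tail() stores cpu + 1 so that a tail of 0 can mean "no queue", and decode_tail() reverses it:

#include <assert.h>
#include <stdint.h>

/* Layout for the common CONFIG_NR_CPUS < 16K case (qspinlock_types.h). */
#define _Q_TAIL_IDX_OFFSET	16
#define _Q_TAIL_IDX_MASK	(0x3u << _Q_TAIL_IDX_OFFSET)
#define _Q_TAIL_CPU_OFFSET	18

static uint32_t encode_tail(int cpu, int idx)
{
	return ((cpu + 1) << _Q_TAIL_CPU_OFFSET) | (idx << _Q_TAIL_IDX_OFFSET);
}

static void decode_tail(uint32_t tail, int *cpu, int *idx)
{
	*cpu = (int)(tail >> _Q_TAIL_CPU_OFFSET) - 1;
	*idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
}

int main(void)
{
	int cpu, idx;

	decode_tail(encode_tail(5, 2), &cpu, &idx);
	assert(cpu == 5 && idx == 2);	/* the encoding round-trips */
	assert(encode_tail(0, 0) != 0);	/* CPU 0, idx 0 still differs from "no tail" */
	return 0;
}

The new grab_mcs_node() exists for the same reason the per-CPU array type changed: node + idx would now step in mcs_spinlock-sized units while the array slots are qnode-sized, so the helper casts to struct qnode * before indexing to keep the stride correct with or without the PV padding.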
@@ -340,17 +347,23 @@
 	/*
 	 * trylock || pending
 	 *
-	 * 0,0,0 -> 0,0,1 ; trylock
-	 * 0,0,1 -> 0,1,1 ; pending
+	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
 	 */
 	val = queued_fetch_set_pending_acquire(lock);
 
 	/*
-	 * If we observe any contention; undo and queue.
+	 * If we observe contention, there is a concurrent locker.
+	 *
+	 * Undo and queue; our setting of PENDING might have made the
+	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+	 * on @next to become !NULL.
 	 */
 	if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+		/* Undo PENDING if we set it. */
 		if (!(val & _Q_PENDING_MASK))
 			clear_pending(lock);
+
 		goto queue;
 	}
 
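The digit triples in these comments are (queue tail, pending, locked), as in the state diagram earlier in the file. The undo test works because ~_Q_LOCKED_MASK covers both the pending byte and the whole tail field; a tiny illustrative check, again assuming the NR_CPUS < 16K mask values rather than anything from the patch itself:

#include <assert.h>
#include <stdint.h>

#define _Q_LOCKED_MASK	0x000000ffu	/* locked byte          */
#define _Q_PENDING_MASK	0x0000ff00u	/* pending byte         */
#define _Q_TAIL_MASK	0xffff0000u	/* tail idx + tail cpu  */

int main(void)
{
	/* Only the owner's locked byte set: no other waiter, keep PENDING. */
	assert(!(UINT32_C(0x00000001) & ~_Q_LOCKED_MASK));

	/* Pending already set, or a tail present: undo PENDING and queue. */
	assert(UINT32_C(0x00000100) & ~_Q_LOCKED_MASK);	/* pending set */
	assert(UINT32_C(0x00040000) & ~_Q_LOCKED_MASK);	/* tail: CPU 0, idx 0 */
	return 0;
}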
@@ -374,7 +387,7 @@
 	 * 0,1,0 -> 0,0,1
 	 */
 	clear_pending_set_locked(lock);
-	qstat_inc(qstat_lock_pending, true);
+	lockevent_inc(lock_pending);
 	return;
 
 	/*
@@ -382,13 +395,34 @@
 	 * queuing.
 	 */
 queue:
-	qstat_inc(qstat_lock_slowpath, true);
+	lockevent_inc(lock_slowpath);
 pv_queue:
-	node = this_cpu_ptr(&mcs_nodes[0]);
+	node = this_cpu_ptr(&qnodes[0].mcs);
 	idx = node->count++;
 	tail = encode_tail(smp_processor_id(), idx);
 
-	node += idx;
+	/*
+	 * 4 nodes are allocated based on the assumption that there will
+	 * not be nested NMIs taking spinlocks. That may not be true in
+	 * some architectures even though the chance of needing more than
+	 * 4 nodes will still be extremely unlikely. When that happens,
+	 * we fall back to spinning on the lock directly without using
+	 * any MCS node. This is not the most elegant solution, but is
+	 * simple enough.
+	 */
+	if (unlikely(idx >= MAX_NODES)) {
+		lockevent_inc(lock_no_node);
+		while (!queued_spin_trylock(lock))
+			cpu_relax();
+		goto release;
+	}
+
+	node = grab_mcs_node(node, idx);
+
+	/*
+	 * Keep counts of non-zero index values:
+	 */
+	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
 
 	/*
 	 * Ensure that we increment the head node->count before initialising
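Two notes on the new block. The "4 nodes" assumption matches the per-CPU node comment earlier in the file (task, softirq, hardirq and NMI are the only contexts expected to nest while holding a node), and lockevent_cond_inc(lock_use_node2 + idx - 1, idx) relies on lock_use_node2, lock_use_node3 and lock_use_node4 being consecutive event ids. A rough model of the index handling, with stand-in names and no kernel dependencies:

#include <assert.h>

#define MAX_NODES 4

/* Stand-in for the per-CPU qnodes[] slots; index == nesting depth. */
static int node_busy[MAX_NODES];

/*
 * Returns the slot for this locking attempt, or -1 for the "too deep,
 * spin on the lock word via trylock" fallback (idx >= MAX_NODES above).
 */
static int pick_node(int depth)
{
	if (depth >= MAX_NODES)
		return -1;
	node_busy[depth] = 1;
	return depth;
}

int main(void)
{
	/* task, softirq, hardirq and NMI each get their own slot... */
	for (int depth = 0; depth < 4; depth++)
		assert(pick_node(depth) == depth);

	/* ...a fifth nested acquisition (e.g. a nested NMI) gets none. */
	assert(pick_node(4) == -1);
	return 0;
}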
@@ -489,16 +523,25 @@
 	 */
 
 	/*
-	 * In the PV case we might already have _Q_LOCKED_VAL set.
+	 * In the PV case we might already have _Q_LOCKED_VAL set, because
+	 * of lock stealing; therefore we must also allow:
 	 *
-	 * The atomic_cond_read_acquire() call above has provided the
-	 * necessary acquire semantics required for locking.
+	 * n,0,1 -> 0,0,1
+	 *
+	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+	 *       above wait condition, therefore any concurrent setting of
+	 *       PENDING will make the uncontended transition fail.
 	 */
-	if (((val & _Q_TAIL_MASK) == tail) &&
-	    atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
-		goto release; /* No contention */
+	if ((val & _Q_TAIL_MASK) == tail) {
+		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+			goto release; /* No contention */
+	}
 
-	/* Either somebody is queued behind us or _Q_PENDING_VAL is set */
+	/*
+	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
+	 * which will then detect the remaining tail and queue behind us
+	 * ensuring we'll see a @next.
+	 */
 	set_locked(lock);
 
 	/*
@@ -514,7 +557,7 @@
 	/*
 	 * release the node
 	 */
-	__this_cpu_dec(mcs_nodes[0].count);
+	__this_cpu_dec(qnodes[0].mcs.count);
 }
 EXPORT_SYMBOL(queued_spin_lock_slowpath);
 
@@ -538,4 +581,11 @@
 #include "qspinlock_paravirt.h"
 #include "qspinlock.c"
 
+bool nopvspin __initdata;
+static __init int parse_nopvspin(char *arg)
+{
+	nopvspin = true;
+	return 0;
+}
+early_param("nopvspin", parse_nopvspin);
 #endif
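The new early_param turns "nopvspin" on the kernel command line into a boot-time opt-out of the paravirt spinlock slowpath; this file only records the flag, the hypervisor-specific init code reads it. A hypothetical consumer sketch (names are illustrative, not taken from the patch):

#include <stdbool.h>
#include <stdio.h>

static bool nopvspin;	/* stand-in for the flag set by parse_nopvspin() */

/*
 * Hypothetical consumer: a guest's spinlock setup bails out early when
 * the user booted with "nopvspin", keeping the native qspinlock slowpath.
 */
static void example_pv_spinlock_init(void)
{
	if (nopvspin) {
		printf("nopvspin: keeping native qspinlock\n");
		return;
	}
	printf("installing paravirt spinlock hooks\n");
}

int main(void)
{
	nopvspin = true;	/* as if "nopvspin" had been passed at boot */
	example_pv_spinlock_init();
	return 0;
}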