.. | .. |
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later
1 | 2 | /*
2 | 3 | * Queued spinlock
3 | | - *
4 | | - * This program is free software; you can redistribute it and/or modify
5 | | - * it under the terms of the GNU General Public License as published by
6 | | - * the Free Software Foundation; either version 2 of the License, or
7 | | - * (at your option) any later version.
8 | | - *
9 | | - * This program is distributed in the hope that it will be useful,
10 | | - * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | | - * GNU General Public License for more details.
13 | 4 | *
14 | 5 | * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
15 | 6 | * (C) Copyright 2013-2014,2018 Red Hat, Inc.
.. | .. |
40 | 31 | /*
41 | 32 | * The basic principle of a queue-based spinlock can best be understood
42 | 33 | * by studying a classic queue-based spinlock implementation called the
43 | | - * MCS lock. The paper below provides a good description for this kind
44 | | - * of lock.
| 34 | + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
| 35 | + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
| 36 | + * Scott") is available at
45 | 37 | *
46 | | - * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
| 38 | + * https://bugzilla.kernel.org/show_bug.cgi?id=206115
47 | 39 | *
48 | | - * This queued spinlock implementation is based on the MCS lock, however to make
49 | | - * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
50 | | - * API, we must modify it somehow.
| 40 | + * This queued spinlock implementation is based on the MCS lock, however to
| 41 | + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
| 42 | + * existing API, we must modify it somehow.
51 | 43 | *
52 | 44 | * In particular; where the traditional MCS lock consists of a tail pointer
53 | 45 | * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
.. | .. |
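
As an illustrative aside (not part of the patch): the "classic" MCS lock the comment above refers to keeps an 8-byte tail pointer in the lock itself, plus a next pointer and a spin flag in each waiter's node; the qspinlock below compresses that tail pointer into the upper bits of the 4-byte lock word. A minimal user-space sketch in C11 atomics, with invented names (mcs_qnode, mcs_acquire, mcs_release) rather than the kernel's primitives, looks roughly like this:

```c
#include <stdatomic.h>
#include <stddef.h>

struct mcs_qnode {
	_Atomic(struct mcs_qnode *) next;
	atomic_int locked;			/* 0 = keep spinning, 1 = go */
};

struct mcs_lock {
	_Atomic(struct mcs_qnode *) tail;	/* the 8-byte tail pointer */
};

static inline void mcs_acquire(struct mcs_lock *lock, struct mcs_qnode *node)
{
	struct mcs_qnode *prev;

	atomic_store_explicit(&node->next, NULL, memory_order_relaxed);
	atomic_store_explicit(&node->locked, 0, memory_order_relaxed);

	/* Publish ourselves as the new tail of the waiter queue. */
	prev = atomic_exchange_explicit(&lock->tail, node, memory_order_acq_rel);
	if (!prev)
		return;				/* queue was empty: lock acquired */

	/* Link in behind the old tail, then spin only on our own node. */
	atomic_store_explicit(&prev->next, node, memory_order_release);
	while (!atomic_load_explicit(&node->locked, memory_order_acquire))
		;
}

static inline void mcs_release(struct mcs_lock *lock, struct mcs_qnode *node)
{
	struct mcs_qnode *next = atomic_load_explicit(&node->next, memory_order_acquire);

	if (!next) {
		/* No visible successor: try to swing the tail back to NULL. */
		struct mcs_qnode *me = node;
		if (atomic_compare_exchange_strong_explicit(&lock->tail, &me, NULL,
				memory_order_release, memory_order_relaxed))
			return;
		/* A successor is mid-enqueue; wait until it links itself in. */
		while (!(next = atomic_load_explicit(&node->next, memory_order_acquire)))
			;
	}
	atomic_store_explicit(&next->locked, 1, memory_order_release);
}

int main(void)
{
	struct mcs_lock lock = { NULL };
	struct mcs_qnode me;

	mcs_acquire(&lock, &me);	/* uncontended: tail swings NULL -> &me */
	mcs_release(&lock, &me);	/* no successor: tail swings back to NULL */
	return 0;
}
```

Each waiter spins only on its own node's flag, which is what gives MCS-style locks their cache friendliness under contention.
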
74 | 66 | */
75 | 67 |
76 | 68 | #include "mcs_spinlock.h"
77 | | -
78 | | -#ifdef CONFIG_PARAVIRT_SPINLOCKS
79 | | -#define MAX_NODES 8
80 | | -#else
81 | 69 | #define MAX_NODES 4
| 70 | +
| 71 | +/*
| 72 | + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
| 73 | + * size and four of them will fit nicely in one 64-byte cacheline. For
| 74 | + * pvqspinlock, however, we need more space for extra data. To accommodate
| 75 | + * that, we insert two more long words to pad it up to 32 bytes. IOW, only
| 76 | + * two of them can fit in a cacheline in this case. That is OK as it is rare
| 77 | + * to have more than 2 levels of slowpath nesting in actual use. We don't
| 78 | + * want to penalize pvqspinlocks to optimize for a rare case in native
| 79 | + * qspinlocks.
| 80 | + */
| 81 | +struct qnode {
| 82 | + struct mcs_spinlock mcs;
| 83 | +#ifdef CONFIG_PARAVIRT_SPINLOCKS
| 84 | + long reserved[2];
82 | 85 | #endif
| 86 | +};
83 | 87 |
84 | 88 | /*
85 | 89 | * The pending bit spinning loop count.
.. | .. |
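
A small aside, not from the patch: the cacheline arithmetic in the new comment (16-byte nodes, four per 64-byte line natively, only two per line with the PV padding) can be checked with a user-space stand-in. The stub structs and the HAVE_PV switch below are assumptions of the sketch, mirroring the layout described above rather than including the kernel headers:

```c
#include <stdio.h>

/* Stand-in for struct mcs_spinlock on a 64-bit build: 8 + 4 + 4 = 16 bytes. */
struct mcs_stub {
	struct mcs_stub *next;
	int locked;
	int count;
};

#define HAVE_PV 1	/* pretend CONFIG_PARAVIRT_SPINLOCKS=y */

/* Stand-in for struct qnode: padded to 32 bytes when PV needs extra state. */
struct qnode_stub {
	struct mcs_stub mcs;
#if HAVE_PV
	long reserved[2];
#endif
};

int main(void)
{
	/* 16-byte nodes -> 4 per 64-byte line; 32-byte nodes -> only 2 per line. */
	printf("sizeof(qnode) = %zu, nodes per 64-byte cacheline = %zu\n",
	       sizeof(struct qnode_stub), 64 / sizeof(struct qnode_stub));
	return 0;
}
```
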
101 | 105 | *
102 | 106 | * PV doubles the storage and uses the second cacheline for PV state.
103 | 107 | */
104 | | -static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
| 108 | +static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
105 | 109 |
106 | 110 | /*
107 | 111 | * We must be able to distinguish between no-tail and the tail at 0:0,
.. | .. |
112 | 116 | {
113 | 117 | u32 tail;
114 | 118 |
115 | | -#ifdef CONFIG_DEBUG_SPINLOCK
116 | | - BUG_ON(idx > 3);
117 | | -#endif
118 | 119 | tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
119 | 120 | tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
120 | 121 |
.. | .. |
126 | 127 | int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
127 | 128 | int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
128 | 129 |
129 | | - return per_cpu_ptr(&mcs_nodes[idx], cpu);
| 130 | + return per_cpu_ptr(&qnodes[idx].mcs, cpu);
| 131 | +}
| 132 | +
| 133 | +static inline __pure
| 134 | +struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
| 135 | +{
| 136 | + return &((struct qnode *)base + idx)->mcs;
130 | 137 | }
131 | 138 |
132 | 139 | #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
.. | .. |
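
Another aside, not from the patch: decode_tail() now returns the mcs member of the per-CPU qnode array, and grab_mcs_node() steps through that array by struct qnode stride so the index arithmetic stays correct even with the PV padding. The tail word itself packs idx and cpu + 1 above the locked/pending bits; the round-trip below uses the bit offsets of the common NR_CPUS < 16K layout (idx in bits 16-17, cpu from bit 18 up), which are assumptions for the example rather than values taken from this diff:

```c
#include <stdio.h>

/* Assumed NR_CPUS < 16K layout: bits 0-7 locked byte, bit 8 pending,
 * bits 16-17 tail index, bits 18 and up tail cpu.
 */
#define TAIL_IDX_OFFSET	16
#define TAIL_IDX_MASK	(0x3U << TAIL_IDX_OFFSET)
#define TAIL_CPU_OFFSET	18

static unsigned int encode(int cpu, int idx)
{
	/* cpu is stored +1 so an all-zero tail still means "no queue" for CPU 0 */
	return ((unsigned int)(cpu + 1) << TAIL_CPU_OFFSET) |
	       ((unsigned int)idx << TAIL_IDX_OFFSET);
}

int main(void)
{
	unsigned int tail = encode(5, 2);	/* CPU 5, third nesting level */
	int cpu = (int)(tail >> TAIL_CPU_OFFSET) - 1;
	int idx = (int)((tail & TAIL_IDX_MASK) >> TAIL_IDX_OFFSET);

	printf("tail = 0x%08x -> cpu = %d, idx = %d\n", tail, cpu, idx);
	return 0;
}
```
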
340 | 347 | /*
341 | 348 | * trylock || pending
342 | 349 | *
343 | | - * 0,0,0 -> 0,0,1 ; trylock
344 | | - * 0,0,1 -> 0,1,1 ; pending
| 350 | + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
345 | 351 | */
346 | 352 | val = queued_fetch_set_pending_acquire(lock);
347 | 353 |
348 | 354 | /*
349 | | - * If we observe any contention; undo and queue.
| 355 | + * If we observe contention, there is a concurrent locker.
| 356 | + *
| 357 | + * Undo and queue; our setting of PENDING might have made the
| 358 | + * n,0,0 -> 0,0,0 transition fail and it will now be waiting
| 359 | + * on @next to become !NULL.
350 | 360 | */
351 | 361 | if (unlikely(val & ~_Q_LOCKED_MASK)) {
| 362 | +
| 363 | + /* Undo PENDING if we set it. */
352 | 364 | if (!(val & _Q_PENDING_MASK))
353 | 365 | clear_pending(lock);
| 366 | +
354 | 367 | goto queue;
355 | 368 | }
356 | 369 |
.. | .. |
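
One more illustrative aside, not from the patch: in the (queue tail, pending, locked) triples used by the comments, "observing contention" after the fetch-and-set of PENDING simply means that some bit outside the locked byte was already set in the old value. The mask values below again assume the NR_CPUS < 16K layout and are not taken from this diff:

```c
#include <stdio.h>

/* Assumed layout: locked byte in bits 0-7, pending bit at bit 8, tail above. */
#define LOCKED_MASK	0x000000ffU
#define PENDING_BIT	0x00000100U
#define TAIL_SHIFT	16

static void classify(unsigned int old_val)
{
	/* old_val models what queued_fetch_set_pending_acquire() returned:
	 * the lock word as it was before our PENDING bit went in.
	 */
	if (old_val & ~LOCKED_MASK)
		printf("0x%08x: pending or tail already set -> undo PENDING, queue\n",
		       old_val);
	else
		printf("0x%08x: we are the pending waiter -> wait for owner, then 0,0,1\n",
		       old_val);
}

int main(void)
{
	classify(0x00000000);			/* 0,0,0: lock was free */
	classify(0x00000001);			/* 0,0,1: owner only, no queue */
	classify(PENDING_BIT | 0x1);		/* 0,1,1: someone already pending */
	classify((3U << TAIL_SHIFT) | 0x1);	/* n,0,1: waiters already queued */
	return 0;
}
```
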
374 | 387 | * 0,1,0 -> 0,0,1
375 | 388 | */
376 | 389 | clear_pending_set_locked(lock);
377 | | - qstat_inc(qstat_lock_pending, true);
| 390 | + lockevent_inc(lock_pending);
378 | 391 | return;
379 | 392 |
380 | 393 | /*
.. | .. |
382 | 395 | * queuing.
383 | 396 | */
384 | 397 | queue:
385 | | - qstat_inc(qstat_lock_slowpath, true);
| 398 | + lockevent_inc(lock_slowpath);
386 | 399 | pv_queue:
387 | | - node = this_cpu_ptr(&mcs_nodes[0]);
| 400 | + node = this_cpu_ptr(&qnodes[0].mcs);
388 | 401 | idx = node->count++;
389 | 402 | tail = encode_tail(smp_processor_id(), idx);
390 | 403 |
391 | | - node += idx;
| 404 | + /*
| 405 | + * 4 nodes are allocated based on the assumption that there will
| 406 | + * not be nested NMIs taking spinlocks. That may not be true in
| 407 | + * some architectures even though the chance of needing more than
| 408 | + * 4 nodes will still be extremely unlikely. When that happens,
| 409 | + * we fall back to spinning on the lock directly without using
| 410 | + * any MCS node. This is not the most elegant solution, but is
| 411 | + * simple enough.
| 412 | + */
| 413 | + if (unlikely(idx >= MAX_NODES)) {
| 414 | + lockevent_inc(lock_no_node);
| 415 | + while (!queued_spin_trylock(lock))
| 416 | + cpu_relax();
| 417 | + goto release;
| 418 | + }
| 419 | +
| 420 | + node = grab_mcs_node(node, idx);
| 421 | +
| 422 | + /*
| 423 | + * Keep counts of non-zero index values:
| 424 | + */
| 425 | + lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
392 | 426 |
393 | 427 | /*
394 | 428 | * Ensure that we increment the head node->count before initialising
.. | .. |
489 | 523 | */
490 | 524 |
491 | 525 | /*
492 | | - * In the PV case we might already have _Q_LOCKED_VAL set.
| 526 | + * In the PV case we might already have _Q_LOCKED_VAL set, because
| 527 | + * of lock stealing; therefore we must also allow:
493 | 528 | *
494 | | - * The atomic_cond_read_acquire() call above has provided the
495 | | - * necessary acquire semantics required for locking.
| 529 | + * n,0,1 -> 0,0,1
| 530 | + *
| 531 | + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
| 532 | + * above wait condition, therefore any concurrent setting of
| 533 | + * PENDING will make the uncontended transition fail.
496 | 534 | */
497 | | - if (((val & _Q_TAIL_MASK) == tail) &&
498 | | - atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
499 | | - goto release; /* No contention */
| 535 | + if ((val & _Q_TAIL_MASK) == tail) {
| 536 | + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
| 537 | + goto release; /* No contention */
| 538 | + }
500 | 539 |
501 | | - /* Either somebody is queued behind us or _Q_PENDING_VAL is set */
| 540 | + /*
| 541 | + * Either somebody is queued behind us or _Q_PENDING_VAL got set
| 542 | + * which will then detect the remaining tail and queue behind us
| 543 | + * ensuring we'll see a @next.
| 544 | + */
502 | 545 | set_locked(lock);
503 | 546 |
504 | 547 | /*
.. | .. |
514 | 557 | /*
515 | 558 | * release the node
516 | 559 | */
517 | | - __this_cpu_dec(mcs_nodes[0].count);
| 560 | + __this_cpu_dec(qnodes[0].mcs.count);
518 | 561 | }
519 | 562 | EXPORT_SYMBOL(queued_spin_lock_slowpath);
520 | 563 |
.. | .. |
538 | 581 | #include "qspinlock_paravirt.h"
539 | 582 | #include "qspinlock.c"
540 | 583 |
| 584 | +bool nopvspin __initdata;
| 585 | +static __init int parse_nopvspin(char *arg)
| 586 | +{
| 587 | + nopvspin = true;
| 588 | + return 0;
| 589 | +}
| 590 | +early_param("nopvspin", parse_nopvspin);
541 | 591 | #endif