```diff
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Queued spinlock
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
  *
  * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
  * (C) Copyright 2013-2014,2018 Red Hat, Inc.
```
```diff
 /*
  * The basic principle of a queue-based spinlock can best be understood
  * by studying a classic queue-based spinlock implementation called the
- * MCS lock. The paper below provides a good description for this kind
- * of lock.
+ * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
+ * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
+ * Scott") is available at
  *
- * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206115
  *
- * This queued spinlock implementation is based on the MCS lock, however to make
- * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
- * API, we must modify it somehow.
+ * This queued spinlock implementation is based on the MCS lock, however to
+ * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
+ * existing API, we must modify it somehow.
  *
  * In particular; where the traditional MCS lock consists of a tail pointer
  * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
```
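For readers who want the MCS principle in front of them while reading the rest of the patch, here is a minimal userspace sketch of the classic algorithm in C11 atomics. It is illustrative only and is not the kernel's mcs_spinlock code; the property it shows is that every waiter spins solely on a flag in its own node.

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

/* Minimal MCS lock sketch (illustrative, not the kernel implementation). */
struct mcs_node {
	struct mcs_node *_Atomic next;
	atomic_bool locked;		/* true while this waiter must spin */
};

struct mcs_lock {
	struct mcs_node *_Atomic tail;	/* NULL when nobody holds or wants the lock */
};

void mcs_lock_acquire(struct mcs_lock *lock, struct mcs_node *me)
{
	struct mcs_node *prev;

	atomic_store_explicit(&me->next, NULL, memory_order_relaxed);
	atomic_store_explicit(&me->locked, true, memory_order_relaxed);

	/* Enqueue ourselves at the tail; the previous tail is our predecessor. */
	prev = atomic_exchange_explicit(&lock->tail, me, memory_order_acq_rel);
	if (!prev)
		return;			/* queue was empty: lock acquired */

	/* Tell the predecessor who we are, then spin on our own node only. */
	atomic_store_explicit(&prev->next, me, memory_order_release);
	while (atomic_load_explicit(&me->locked, memory_order_acquire))
		;			/* each waiter spins on its own cacheline */
}

void mcs_lock_release(struct mcs_lock *lock, struct mcs_node *me)
{
	struct mcs_node *next = atomic_load_explicit(&me->next, memory_order_acquire);

	if (!next) {
		struct mcs_node *expected = me;

		/* No known successor: try to swing the tail back to empty. */
		if (atomic_compare_exchange_strong_explicit(&lock->tail, &expected,
							    NULL,
							    memory_order_acq_rel,
							    memory_order_acquire))
			return;

		/* A successor is between its xchg and its next-pointer store;
		 * wait until it has linked itself in. */
		while (!(next = atomic_load_explicit(&me->next, memory_order_acquire)))
			;
	}
	/* Hand the lock over by clearing the successor's wait flag. */
	atomic_store_explicit(&next->locked, false, memory_order_release);
}
```

The qspinlock keeps that "spin on your own node" property but, as the comment goes on to explain, it must squeeze the tail pointer into the 4-byte lock word instead of keeping an 8-byte pointer.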
```diff
  */
 
 #include "mcs_spinlock.h"
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-#define MAX_NODES 8
-#else
 #define MAX_NODES 4
+
+/*
+ * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
+ * size and four of them will fit nicely in one 64-byte cacheline. For
+ * pvqspinlock, however, we need more space for extra data. To accommodate
+ * that, we insert two more long words to pad it up to 32 bytes. IOW, only
+ * two of them can fit in a cacheline in this case. That is OK as it is rare
+ * to have more than 2 levels of slowpath nesting in actual use. We don't
+ * want to penalize pvqspinlocks to optimize for a rare case in native
+ * qspinlocks.
+ */
+struct qnode {
+	struct mcs_spinlock mcs;
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+	long reserved[2];
 #endif
+};
 
 /*
  * The pending bit spinning loop count.
```
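The size arithmetic in the new comment can be sanity-checked with a standalone sketch. The struct shapes below are stand-ins that assume an LP64 ABI and the pointer-plus-two-ints MCS node implied by the comment's 16-byte figure; they are not the kernel headers.

```c
#include <stdio.h>

/* Stand-in layouts mirroring the sizes described above (assumed LP64 ABI). */
struct mcs_like {
	struct mcs_like *next;		/* 8 bytes */
	int locked;			/* 4 bytes */
	int count;			/* 4 bytes */
};					/* 16 bytes -> 4 per 64-byte cacheline */

struct qnode_like {
	struct mcs_like mcs;
	long reserved[2];		/* pvqspinlock padding: +16 bytes */
};					/* 32 bytes -> 2 per 64-byte cacheline */

int main(void)
{
	printf("mcs:   %2zu bytes, %zu per cacheline\n",
	       sizeof(struct mcs_like), 64 / sizeof(struct mcs_like));
	printf("qnode: %2zu bytes, %zu per cacheline\n",
	       sizeof(struct qnode_like), 64 / sizeof(struct qnode_like));
	return 0;
}
```

It prints 16 bytes / 4 per cacheline for the bare node and 32 bytes / 2 per cacheline once the two reserved longs are added, which is exactly the trade-off the comment argues is acceptable.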
```diff
  *
  * PV doubles the storage and uses the second cacheline for PV state.
  */
-static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
 
 /*
  * We must be able to distinguish between no-tail and the tail at 0:0,
```

```diff
 {
 	u32 tail;
 
-#ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(idx > 3);
-#endif
 	tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
 	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
 
```

```diff
 	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
 	int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
 
-	return per_cpu_ptr(&mcs_nodes[idx], cpu);
+	return per_cpu_ptr(&qnodes[idx].mcs, cpu);
+}
+
+static inline __pure
+struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
+{
+	return &((struct qnode *)base + idx)->mcs;
 }
 
 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
```
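encode_tail()/decode_tail() pack the queueing CPU and the per-CPU node index into the upper bits of the 32-bit lock word. The sketch below redoes that arithmetic in userspace with assumed offsets (index in bits 16-17, CPU+1 above it, mirroring the usual qspinlock_types.h layout for configurations with fewer than 16K CPUs); treat the constants as illustrative rather than authoritative.

```c
#include <assert.h>
#include <stdio.h>

/* Assumed field layout: bits 0-7 locked byte, bit 8 pending,
 * bits 16-17 tail index, bits 18.. tail cpu+1. */
#define _Q_TAIL_IDX_OFFSET	16
#define _Q_TAIL_IDX_MASK	(0x3U << _Q_TAIL_IDX_OFFSET)
#define _Q_TAIL_CPU_OFFSET	18

unsigned int encode_tail(int cpu, int idx)
{
	unsigned int tail;

	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;	/* +1 so tail == 0 means "no tail" */
	tail |= idx << _Q_TAIL_IDX_OFFSET;		/* assume idx < 4 */
	return tail;
}

void decode_tail(unsigned int tail, int *cpu, int *idx)
{
	*cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	*idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
}

int main(void)
{
	int cpu, idx;

	decode_tail(encode_tail(5, 2), &cpu, &idx);
	assert(cpu == 5 && idx == 2);
	printf("cpu=%d idx=%d\n", cpu, idx);
	return 0;
}
```

The new grab_mcs_node() helper exists because the per-CPU array now holds struct qnode rather than bare struct mcs_spinlock: stepping to slot idx must advance by sizeof(struct qnode), which the old `node += idx` on an mcs_spinlock pointer would no longer do.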
```diff
 	/*
 	 * trylock || pending
 	 *
-	 * 0,0,0 -> 0,0,1 ; trylock
-	 * 0,0,1 -> 0,1,1 ; pending
+	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
 	 */
 	val = queued_fetch_set_pending_acquire(lock);
 
 	/*
-	 * If we observe any contention; undo and queue.
+	 * If we observe contention, there is a concurrent locker.
+	 *
+	 * Undo and queue; our setting of PENDING might have made the
+	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+	 * on @next to become !NULL.
 	 */
 	if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+		/* Undo PENDING if we set it. */
 		if (!(val & _Q_PENDING_MASK))
 			clear_pending(lock);
+
 		goto queue;
 	}
 
```

```diff
 	 * 0,1,0 -> 0,0,1
 	 */
 	clear_pending_set_locked(lock);
-	qstat_inc(qstat_lock_pending, true);
+	lockevent_inc(lock_pending);
 	return;
 
 	/*
```
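The 0,0,* -> 0,1,* -> 0,0,1 flow above can be summarised with an illustrative userspace analogue in C11 atomics. The bit positions are assumptions (locked byte in bits 0-7, pending at bit 8) and the helpers are stand-ins for queued_fetch_set_pending_acquire(), clear_pending() and clear_pending_set_locked(), not the kernel implementations.

```c
#include <stdatomic.h>
#include <stdbool.h>

#define LOCKED_MASK	0x000000ffU	/* assumed: locked byte in bits 0-7 */
#define LOCKED_VAL	0x00000001U
#define PENDING_VAL	0x00000100U	/* assumed: pending bit at bit 8 */

/* Try to become the single "pending" waiter; returns false if the caller
 * must fall back to the MCS queue instead. */
bool pending_fastpath(atomic_uint *lock)
{
	/* Stand-in for queued_fetch_set_pending_acquire(): set PENDING, get old value. */
	unsigned int val = atomic_fetch_or_explicit(lock, PENDING_VAL,
						    memory_order_acquire);

	if (val & ~LOCKED_MASK) {
		/* Tail or pending already set: undo PENDING only if we set it. */
		if (!(val & PENDING_VAL))
			atomic_fetch_and_explicit(lock, ~PENDING_VAL,
						  memory_order_relaxed);
		return false;			/* caller queues */
	}

	/* Wait for the current owner (if any) to drop the locked byte. */
	while (atomic_load_explicit(lock, memory_order_acquire) & LOCKED_MASK)
		;

	/* clear_pending_set_locked() analogue: 0,1,0 -> 0,0,1 in one RMW. */
	atomic_fetch_xor_explicit(lock, PENDING_VAL | LOCKED_VAL,
				  memory_order_relaxed);
	return true;
}
```

In the kernel the final 0,1,0 -> 0,0,1 step can be a plain narrow store when the byte layout allows it; the xor here is only a portable stand-in for the same transition.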
```diff
 	 * queuing.
 	 */
 queue:
-	qstat_inc(qstat_lock_slowpath, true);
+	lockevent_inc(lock_slowpath);
 pv_queue:
-	node = this_cpu_ptr(&mcs_nodes[0]);
+	node = this_cpu_ptr(&qnodes[0].mcs);
 	idx = node->count++;
 	tail = encode_tail(smp_processor_id(), idx);
 
-	node += idx;
+	/*
+	 * 4 nodes are allocated based on the assumption that there will
+	 * not be nested NMIs taking spinlocks. That may not be true in
+	 * some architectures even though the chance of needing more than
+	 * 4 nodes will still be extremely unlikely. When that happens,
+	 * we fall back to spinning on the lock directly without using
+	 * any MCS node. This is not the most elegant solution, but is
+	 * simple enough.
+	 */
+	if (unlikely(idx >= MAX_NODES)) {
+		lockevent_inc(lock_no_node);
+		while (!queued_spin_trylock(lock))
+			cpu_relax();
+		goto release;
+	}
+
+	node = grab_mcs_node(node, idx);
+
+	/*
+	 * Keep counts of non-zero index values:
+	 */
+	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
 
 	/*
 	 * Ensure that we increment the head node->count before initialising
```
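The per-CPU node bookkeeping that the new overflow check protects can be reduced to a small sketch: each CPU owns MAX_NODES slots, slot 0's count records the nesting depth, and running out of slots sends the caller back to plain trylock spinning. This is a userspace illustration under those assumptions, not the kernel's per-CPU machinery.

```c
#include <stddef.h>

#define MAX_NODES 4

/* One CPU's worth of slots; slot 0's count doubles as the nesting depth. */
struct slot { int locked; int count; };
static struct slot nodes[MAX_NODES];

/* Returns the slot for this nesting level, or NULL when more than MAX_NODES
 * acquisitions (task/softirq/hardirq/NMI) are in flight on this CPU. A NULL
 * return corresponds to the idx >= MAX_NODES branch above: the caller spins
 * on the lock word itself, and still ends at release_slot(), just like the
 * goto release path. */
struct slot *grab_slot(void)
{
	int idx = nodes[0].count++;

	return (idx < MAX_NODES) ? &nodes[idx] : NULL;
}

void release_slot(void)
{
	nodes[0].count--;	/* always paired with grab_slot(), LIFO order */
}
```

The lockevent_cond_inc() call maps idx values 1, 2 and 3 onto the lock_use_node2/3/4 counters and does nothing for idx 0, since the second argument is the condition; together with lock_no_node this gives a cheap histogram of how deep the slowpath nesting actually gets.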
```diff
 	 */
 
 	/*
-	 * In the PV case we might already have _Q_LOCKED_VAL set.
+	 * In the PV case we might already have _Q_LOCKED_VAL set, because
+	 * of lock stealing; therefore we must also allow:
 	 *
-	 * The atomic_cond_read_acquire() call above has provided the
-	 * necessary acquire semantics required for locking.
+	 * n,0,1 -> 0,0,1
+	 *
+	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+	 * above wait condition, therefore any concurrent setting of
+	 * PENDING will make the uncontended transition fail.
 	 */
-	if (((val & _Q_TAIL_MASK) == tail) &&
-	    atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
-		goto release; /* No contention */
+	if ((val & _Q_TAIL_MASK) == tail) {
+		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+			goto release; /* No contention */
+	}
 
-	/* Either somebody is queued behind us or _Q_PENDING_VAL is set */
+	/*
+	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
+	 * which will then detect the remaining tail and queue behind us
+	 * ensuring we'll see a @next.
+	 */
 	set_locked(lock);
 
 	/*
```
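The restructured if above makes the two outcomes at the head of the queue easier to see. The following userspace analogue spells them out, again with assumed bit positions (tail in the upper bits, locked byte at the bottom) and with set_locked() modelled as an OR of the locked byte rather than the kernel's byte store.

```c
#include <stdatomic.h>

#define TAIL_MASK	0xffff0000U	/* assumed: tail in bits 16-31 */
#define LOCKED_VAL	0x00000001U

/* Queue head takes the lock: clear the whole tail too if we are the last
 * queued CPU, otherwise only set the locked byte and keep the tail intact. */
void head_take_lock(atomic_uint *lock, unsigned int my_tail, unsigned int val)
{
	if ((val & TAIL_MASK) == my_tail) {
		/* Uncontended handoff, n,0,1 -> 0,0,1; fails if PENDING or a
		 * newer tail appeared since @val was read. */
		if (atomic_compare_exchange_strong_explicit(lock, &val, LOCKED_VAL,
							    memory_order_relaxed,
							    memory_order_relaxed))
			return;
	}

	/* Somebody queued behind us (or is about to): just set the locked
	 * byte; the MCS handoff to @next happens separately. */
	atomic_fetch_or_explicit(lock, LOCKED_VAL, memory_order_relaxed);
}
```

If the compare-and-exchange fails, a new waiter or the pending bit has appeared, so the head only sets the locked byte and leaves the tail in place for that waiter to find.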
```diff
 	/*
 	 * release the node
 	 */
-	__this_cpu_dec(mcs_nodes[0].count);
+	__this_cpu_dec(qnodes[0].mcs.count);
 }
 EXPORT_SYMBOL(queued_spin_lock_slowpath);
 
```

```diff
 #include "qspinlock_paravirt.h"
 #include "qspinlock.c"
 
+bool nopvspin __initdata;
+static __init int parse_nopvspin(char *arg)
+{
+	nopvspin = true;
+	return 0;
+}
+early_param("nopvspin", parse_nopvspin);
 #endif
```