| .. | .. |
|  | 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
| 1 | 2 |  /* |
| 2 | 3 |   * Queued spinlock |
| 3 |  | - * |
| 4 |  | - * This program is free software; you can redistribute it and/or modify |
| 5 |  | - * it under the terms of the GNU General Public License as published by |
| 6 |  | - * the Free Software Foundation; either version 2 of the License, or |
| 7 |  | - * (at your option) any later version. |
| 8 |  | - * |
| 9 |  | - * This program is distributed in the hope that it will be useful, |
| 10 |  | - * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 |  | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |
| 12 |  | - * GNU General Public License for more details. |
| 13 | 4 |   * |
| 14 | 5 |   * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. |
| 15 | 6 |   * (C) Copyright 2013-2014,2018 Red Hat, Inc. |
| .. | .. |
| 40 | 31 |  /* |
| 41 | 32 |   * The basic principle of a queue-based spinlock can best be understood |
| 42 | 33 |   * by studying a classic queue-based spinlock implementation called the |
| 43 |  | - * MCS lock. The paper below provides a good description for this kind |
| 44 |  | - * of lock. |
|  | 34 | + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable |
|  | 35 | + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and |
|  | 36 | + * Scott") is available at |
| 45 | 37 |   * |
| 46 |  | - * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf |
|  | 38 | + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 |
| 47 | 39 |   * |
| 48 |  | - * This queued spinlock implementation is based on the MCS lock, however to make |
| 49 |  | - * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing |
| 50 |  | - * API, we must modify it somehow. |
|  | 40 | + * This queued spinlock implementation is based on the MCS lock, however to |
|  | 41 | + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its |
|  | 42 | + * existing API, we must modify it somehow. |
| 51 | 43 |   * |
| 52 | 44 |   * In particular; where the traditional MCS lock consists of a tail pointer |
| 53 | 45 |   * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to |
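As background for the MCS reference in this hunk, below is a minimal sketch of a classic MCS lock written with C11 atomics. The names (`mcs_node`, `mcs_lock()`, `mcs_unlock()`) and the freestanding style are illustrative only; they are not the kernel's `mcs_spinlock.h` API.

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

/* One node per (CPU, context); the lock itself is just a tail pointer. */
struct mcs_node {
	struct mcs_node *_Atomic next;
	atomic_bool locked;		/* becomes true when the lock is handed over */
};

static void mcs_lock(struct mcs_node *_Atomic *tail, struct mcs_node *me)
{
	struct mcs_node *prev;

	atomic_store_explicit(&me->next, NULL, memory_order_relaxed);
	atomic_store_explicit(&me->locked, false, memory_order_relaxed);

	/* Publish ourselves as the new tail of the waiter queue. */
	prev = atomic_exchange_explicit(tail, me, memory_order_acq_rel);
	if (!prev)
		return;		/* queue was empty: lock acquired */

	/* Link behind the previous waiter and spin on our own node. */
	atomic_store_explicit(&prev->next, me, memory_order_release);
	while (!atomic_load_explicit(&me->locked, memory_order_acquire))
		;		/* each waiter spins on a private cacheline */
}

static void mcs_unlock(struct mcs_node *_Atomic *tail, struct mcs_node *me)
{
	struct mcs_node *next = atomic_load_explicit(&me->next, memory_order_acquire);

	if (!next) {
		struct mcs_node *expected = me;

		/* No visible successor: try to reset the tail to "empty". */
		if (atomic_compare_exchange_strong_explicit(tail, &expected, NULL,
				memory_order_release, memory_order_relaxed))
			return;

		/* A new waiter raced in; wait for it to link itself behind us. */
		while (!(next = atomic_load_explicit(&me->next, memory_order_acquire)))
			;
	}
	atomic_store_explicit(&next->locked, true, memory_order_release);
}
```

The queued spinlock keeps this per-waiter queue discipline but, as the hunks below show, replaces the 8-byte tail pointer with a (CPU, index) pair packed into the 4-byte lock word.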
| .. | .. |
| 74 | 66 |   */ |
| 75 | 67 |   |
| 76 | 68 |  #include "mcs_spinlock.h" |
| 77 |  | - |
| 78 |  | -#ifdef CONFIG_PARAVIRT_SPINLOCKS |
| 79 |  | -#define MAX_NODES	8 |
| 80 |  | -#else |
| 81 | 69 |  #define MAX_NODES	4 |
|  | 70 | + |
|  | 71 | +/* |
|  | 72 | + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in |
|  | 73 | + * size and four of them will fit nicely in one 64-byte cacheline. For |
|  | 74 | + * pvqspinlock, however, we need more space for extra data. To accommodate |
|  | 75 | + * that, we insert two more long words to pad it up to 32 bytes. IOW, only |
|  | 76 | + * two of them can fit in a cacheline in this case. That is OK as it is rare |
|  | 77 | + * to have more than 2 levels of slowpath nesting in actual use. We don't |
|  | 78 | + * want to penalize pvqspinlocks to optimize for a rare case in native |
|  | 79 | + * qspinlocks. |
|  | 80 | + */ |
|  | 81 | +struct qnode { |
|  | 82 | +	struct mcs_spinlock mcs; |
|  | 83 | +#ifdef CONFIG_PARAVIRT_SPINLOCKS |
|  | 84 | +	long reserved[2]; |
| 82 | 85 |  #endif |
|  | 86 | +}; |
| 83 | 87 |   |
| 84 | 88 |  /* |
| 85 | 89 |   * The pending bit spinning loop count. |
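To make the sizing argument in the new comment concrete, here is a small standalone sketch with compile-time checks. The `example_*` structures are stand-ins written for this illustration; the real `struct mcs_spinlock` lives in `mcs_spinlock.h`, and the 16/32-byte figures assume a 64-bit (LP64) build.

```c
#include <assert.h>

/* Illustrative stand-in; the real definition is in mcs_spinlock.h. */
struct example_mcs_spinlock {
	struct example_mcs_spinlock *next;	/* 8 bytes on LP64 */
	int locked;				/* 4 bytes */
	int count;				/* 4 bytes: nesting depth, slot 0 only */
};

struct example_qnode {
	struct example_mcs_spinlock mcs;
#ifdef CONFIG_PARAVIRT_SPINLOCKS
	long reserved[2];			/* pad to 32 bytes for PV state */
#endif
};

/* Four native nodes, or two padded PV nodes, per 64-byte cacheline. */
#ifdef CONFIG_PARAVIRT_SPINLOCKS
static_assert(sizeof(struct example_qnode) == 32, "two PV nodes per cacheline");
#else
static_assert(sizeof(struct example_qnode) == 16, "four nodes per cacheline");
#endif
```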
| .. | .. |
| 101 | 105 |   * |
| 102 | 106 |   * PV doubles the storage and uses the second cacheline for PV state. |
| 103 | 107 |   */ |
| 104 |  | -static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); |
|  | 108 | +static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]); |
| 105 | 109 |   |
| 106 | 110 |  /* |
| 107 | 111 |   * We must be able to distinguish between no-tail and the tail at 0:0, |
| .. | .. |
| 112 | 116 |  { |
| 113 | 117 |  	u32 tail; |
| 114 | 118 |   |
| 115 |  | -#ifdef CONFIG_DEBUG_SPINLOCK |
| 116 |  | -	BUG_ON(idx > 3); |
| 117 |  | -#endif |
| 118 | 119 |  	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET; |
| 119 | 120 |  	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ |
| 120 | 121 |   |
| .. | .. |
| 126 | 127 |  	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; |
| 127 | 128 |  	int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; |
| 128 | 129 |   |
| 129 |  | -	return per_cpu_ptr(&mcs_nodes[idx], cpu); |
|  | 130 | +	return per_cpu_ptr(&qnodes[idx].mcs, cpu); |
|  | 131 | +} |
|  | 132 | + |
|  | 133 | +static inline __pure |
|  | 134 | +struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) |
|  | 135 | +{ |
|  | 136 | +	return &((struct qnode *)base + idx)->mcs; |
| 130 | 137 |  } |
| 131 | 138 |   |
| 132 | 139 |  #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) |
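The helpers above pack and unpack a (CPU, index) pair into the tail field; the standalone sketch below round-trips the same encoding. The `EX_*` constants mirror the usual layout when the pending byte is 8 bits wide (NR_CPUS < 16K), but the authoritative values are in `qspinlock_types.h`, so treat them as assumptions.

```c
#include <assert.h>

/* Illustrative copies of the tail layout; see qspinlock_types.h. */
#define EX_TAIL_IDX_OFFSET	16
#define EX_TAIL_IDX_BITS	2
#define EX_TAIL_IDX_MASK	(((1U << EX_TAIL_IDX_BITS) - 1) << EX_TAIL_IDX_OFFSET)
#define EX_TAIL_CPU_OFFSET	(EX_TAIL_IDX_OFFSET + EX_TAIL_IDX_BITS)

static unsigned int ex_encode_tail(int cpu, int idx)
{
	/* cpu + 1, so that "no tail" (0) never collides with cpu 0, idx 0 */
	return ((cpu + 1) << EX_TAIL_CPU_OFFSET) | (idx << EX_TAIL_IDX_OFFSET);
}

static void ex_decode_tail(unsigned int tail, int *cpu, int *idx)
{
	*cpu = (int)(tail >> EX_TAIL_CPU_OFFSET) - 1;
	*idx = (tail & EX_TAIL_IDX_MASK) >> EX_TAIL_IDX_OFFSET;
}

int main(void)
{
	int cpu, idx;

	ex_decode_tail(ex_encode_tail(5, 2), &cpu, &idx);
	assert(cpu == 5 && idx == 2);	/* the pair round-trips */
	return 0;
}
```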
| .. | .. |
| 340 | 347 |  	/* |
| 341 | 348 |  	 * trylock || pending |
| 342 | 349 |  	 * |
| 343 |  | -	 * 0,0,0 -> 0,0,1 ; trylock |
| 344 |  | -	 * 0,0,1 -> 0,1,1 ; pending |
|  | 350 | +	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock |
| 345 | 351 |  	 */ |
| 346 | 352 |  	val = queued_fetch_set_pending_acquire(lock); |
| 347 | 353 |   |
| 348 | 354 |  	/* |
| 349 |  | -	 * If we observe any contention; undo and queue. |
|  | 355 | +	 * If we observe contention, there is a concurrent locker. |
|  | 356 | +	 * |
|  | 357 | +	 * Undo and queue; our setting of PENDING might have made the |
|  | 358 | +	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting |
|  | 359 | +	 * on @next to become !NULL. |
| 350 | 360 |  	 */ |
| 351 | 361 |  	if (unlikely(val & ~_Q_LOCKED_MASK)) { |
|  | 362 | + |
|  | 363 | +		/* Undo PENDING if we set it. */ |
| 352 | 364 |  		if (!(val & _Q_PENDING_MASK)) |
| 353 | 365 |  			clear_pending(lock); |
|  | 366 | + |
| 354 | 367 |  		goto queue; |
| 355 | 368 |  	} |
| 356 | 369 |   |
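In the hunk above, the single test `val & ~_Q_LOCKED_MASK` asks "was any bit other than the locked byte already set?", i.e. was there a pending waiter or a queue tail. The toy helpers below restate that decision; the `EX_*` masks are illustrative copies of the usual field layout, not the kernel definitions.

```c
/* Illustrative copies of the lock word layout (locked byte, pending byte, tail). */
#define EX_LOCKED_MASK		0x000000ffU
#define EX_PENDING_MASK		0x0000ff00U
#define EX_TAIL_MASK		0xffff0000U

/* 'old_val' is what the fetch-and-set-pending operation returned. */
static int ex_should_queue(unsigned int old_val)
{
	/* Any pending bit or tail already present means a concurrent locker. */
	return (old_val & ~EX_LOCKED_MASK) != 0;
}

static int ex_must_undo_pending(unsigned int old_val)
{
	/* Only the CPU that actually set PENDING is allowed to clear it again. */
	return ex_should_queue(old_val) && !(old_val & EX_PENDING_MASK);
}
```

A locker that merely observed PENDING already set must leave it alone; only the CPU whose fetch-and-set actually turned the bit on undoes it before queuing.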
| .. | .. |
| 374 | 387 |  	 * 0,1,0 -> 0,0,1 |
| 375 | 388 |  	 */ |
| 376 | 389 |  	clear_pending_set_locked(lock); |
| 377 |  | -	qstat_inc(qstat_lock_pending, true); |
|  | 390 | +	lockevent_inc(lock_pending); |
| 378 | 391 |  	return; |
| 379 | 392 |   |
| 380 | 393 |  	/* |
| .. | .. |
| 382 | 395 |  	 * queuing. |
| 383 | 396 |  	 */ |
| 384 | 397 |  queue: |
| 385 |  | -	qstat_inc(qstat_lock_slowpath, true); |
|  | 398 | +	lockevent_inc(lock_slowpath); |
| 386 | 399 |  pv_queue: |
| 387 |  | -	node = this_cpu_ptr(&mcs_nodes[0]); |
|  | 400 | +	node = this_cpu_ptr(&qnodes[0].mcs); |
| 388 | 401 |  	idx = node->count++; |
| 389 | 402 |  	tail = encode_tail(smp_processor_id(), idx); |
| 390 | 403 |   |
| 391 |  | -	node += idx; |
|  | 404 | +	/* |
|  | 405 | +	 * 4 nodes are allocated based on the assumption that there will |
|  | 406 | +	 * not be nested NMIs taking spinlocks. That may not be true in |
|  | 407 | +	 * some architectures even though the chance of needing more than |
|  | 408 | +	 * 4 nodes will still be extremely unlikely. When that happens, |
|  | 409 | +	 * we fall back to spinning on the lock directly without using |
|  | 410 | +	 * any MCS node. This is not the most elegant solution, but is |
|  | 411 | +	 * simple enough. |
|  | 412 | +	 */ |
|  | 413 | +	if (unlikely(idx >= MAX_NODES)) { |
|  | 414 | +		lockevent_inc(lock_no_node); |
|  | 415 | +		while (!queued_spin_trylock(lock)) |
|  | 416 | +			cpu_relax(); |
|  | 417 | +		goto release; |
|  | 418 | +	} |
|  | 419 | + |
|  | 420 | +	node = grab_mcs_node(node, idx); |
|  | 421 | + |
|  | 422 | +	/* |
|  | 423 | +	 * Keep counts of non-zero index values: |
|  | 424 | +	 */ |
|  | 425 | +	lockevent_cond_inc(lock_use_node2 + idx - 1, idx); |
| 392 | 426 |   |
| 393 | 427 |  	/* |
| 394 | 428 |  	 * Ensure that we increment the head node->count before initialising |
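The per-CPU index handling above can be pictured with a small single-CPU toy model, shown below: slot 0's `count` records how deeply the slowpath is nested on this CPU (task, soft interrupt, hard interrupt, NMI), and running out of the four slots falls back to spinning with trylock. This is an illustration only, not kernel code.

```c
#define EX_MAX_NODES	4

struct ex_node {
	int locked;
	int count;	/* only slot 0's count is used, like qnodes[0].mcs.count */
};

/* One such array exists per CPU in the real code (DEFINE_PER_CPU_ALIGNED). */
static struct ex_node ex_qnodes[EX_MAX_NODES];

static struct ex_node *ex_grab_node(void)
{
	int idx = ex_qnodes[0].count++;	/* nesting depth in this context */

	if (idx >= EX_MAX_NODES)
		return NULL;		/* caller spins on trylock instead */
	return &ex_qnodes[idx];
}

static void ex_release_node(void)
{
	ex_qnodes[0].count--;		/* mirrors __this_cpu_dec(qnodes[0].mcs.count) */
}
```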
| .. | .. |
| 489 | 523 |  	 */ |
| 490 | 524 |   |
| 491 | 525 |  	/* |
| 492 |  | -	 * In the PV case we might already have _Q_LOCKED_VAL set. |
|  | 526 | +	 * In the PV case we might already have _Q_LOCKED_VAL set, because |
|  | 527 | +	 * of lock stealing; therefore we must also allow: |
| 493 | 528 |  	 * |
| 494 |  | -	 * The atomic_cond_read_acquire() call above has provided the |
| 495 |  | -	 * necessary acquire semantics required for locking. |
|  | 529 | +	 * n,0,1 -> 0,0,1 |
|  | 530 | +	 * |
|  | 531 | +	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the |
|  | 532 | +	 *       above wait condition, therefore any concurrent setting of |
|  | 533 | +	 *       PENDING will make the uncontended transition fail. |
| 496 | 534 |  	 */ |
| 497 |  | -	if (((val & _Q_TAIL_MASK) == tail) && |
| 498 |  | -	    atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) |
| 499 |  | -		goto release; /* No contention */ |
|  | 535 | +	if ((val & _Q_TAIL_MASK) == tail) { |
|  | 536 | +		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) |
|  | 537 | +			goto release; /* No contention */ |
|  | 538 | +	} |
| 500 | 539 |   |
| 501 |  | -	/* Either somebody is queued behind us or _Q_PENDING_VAL is set */ |
|  | 540 | +	/* |
|  | 541 | +	 * Either somebody is queued behind us or _Q_PENDING_VAL got set |
|  | 542 | +	 * which will then detect the remaining tail and queue behind us |
|  | 543 | +	 * ensuring we'll see a @next. |
|  | 544 | +	 */ |
| 502 | 545 |  	set_locked(lock); |
| 503 | 546 |   |
| 504 | 547 |  	/* |
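The split `if` above attempts the uncontended hand-off: if our tail is still the last one recorded in the lock word, one compare-and-swap both takes the lock and clears the tail. A toy version of that step with C11 atomics is sketched below; the names and mask values are illustrative, not the kernel helpers.

```c
#include <stdatomic.h>
#include <stdbool.h>

#define EX_TAIL_MASK	0xffff0000U
#define EX_LOCKED_VAL	0x00000001U

static bool ex_try_clear_tail_and_lock(atomic_uint *lockword,
				       unsigned int val, unsigned int my_tail)
{
	if ((val & EX_TAIL_MASK) != my_tail)
		return false;	/* someone already queued behind us */

	/*
	 * Replace the whole word with just the locked byte; on failure the
	 * compare-exchange refreshes 'val' with the current word, mirroring
	 * the semantics of a try-cmpxchg.
	 */
	return atomic_compare_exchange_strong_explicit(lockword, &val,
			EX_LOCKED_VAL, memory_order_relaxed,
			memory_order_relaxed);
}
```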
| .. | .. |
| 514 | 557 |  	/* |
| 515 | 558 |  	 * release the node |
| 516 | 559 |  	 */ |
| 517 |  | -	__this_cpu_dec(mcs_nodes[0].count); |
|  | 560 | +	__this_cpu_dec(qnodes[0].mcs.count); |
| 518 | 561 |  } |
| 519 | 562 |  EXPORT_SYMBOL(queued_spin_lock_slowpath); |
| 520 | 563 |   |
| .. | .. |
| 538 | 581 |  #include "qspinlock_paravirt.h" |
| 539 | 582 |  #include "qspinlock.c" |
| 540 | 583 |   |
|  | 584 | +bool nopvspin __initdata; |
|  | 585 | +static __init int parse_nopvspin(char *arg) |
|  | 586 | +{ |
|  | 587 | +	nopvspin = true; |
|  | 588 | +	return 0; |
|  | 589 | +} |
|  | 590 | +early_param("nopvspin", parse_nopvspin); |
| 541 | 591 |  #endif |
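The new `nopvspin` switch only records the request made on the kernel command line; its actual consumers sit in the per-architecture paravirt setup code, outside this diff. A hypothetical consumer would check it along these lines:

```c
/* Hypothetical example; the real callers are in arch paravirt init code. */
extern bool nopvspin;

static void example_pv_spinlock_setup(void)
{
	if (nopvspin) {
		/* Administrator asked to keep the native qspinlock slowpath. */
		return;
	}
	/* ...install the paravirt spinlock operations here... */
}
```

Booting a guest with `nopvspin` on the command line is thus meant to skip the paravirt spinlock optimizations and use the native queued spinlock throughout.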