2023-11-06 e3e12f52b214121840b44c91de5b3e5af5d3eb84
kernel/kernel/futex.c
@@ -962,7 +962,9 @@
 		if (head->next != next) {
 			/* retain curr->pi_lock for the loop invariant */
 			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+			raw_spin_unlock_irq(&curr->pi_lock);
 			spin_unlock(&hb->lock);
+			raw_spin_lock_irq(&curr->pi_lock);
 			put_pi_state(pi_state);
 			continue;
 		}
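On RT, hb->lock is a sleeping spinlock, so it cannot be released while curr->pi_lock, a raw spinlock held with interrupts disabled, is still held; the hunk above therefore drops pi_lock around the hb->lock release and retakes it afterwards to preserve the loop invariant. A minimal userspace analogy of that drop/release/reacquire ordering, with a pthread spinlock standing in for the raw lock and a mutex for the sleeping lock (all names below are illustrative, not from futex.c):

/*
 * Userspace analogy, not kernel code: a raw spinlock must not be held
 * across operations on a sleeping lock. Mirrors the hunk above: drop
 * the "raw" lock, release the sleeping lock, reacquire the raw lock.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t raw_lock;	/* stands in for curr->pi_lock */
static pthread_mutex_t sleeping_lock = PTHREAD_MUTEX_INITIALIZER; /* hb->lock */

static void retry_path(void)
{
	/* entered with both locks held, as in exit_pi_state_list() */
	pthread_spin_unlock(&raw_lock);		/* raw_spin_unlock_irq(&curr->pi_lock) */
	pthread_mutex_unlock(&sleeping_lock);	/* spin_unlock(&hb->lock) */
	pthread_spin_lock(&raw_lock);		/* raw_spin_lock_irq(&curr->pi_lock) */
}

int main(void)
{
	pthread_spin_init(&raw_lock, PTHREAD_PROCESS_PRIVATE);
	pthread_spin_lock(&raw_lock);
	pthread_mutex_lock(&sleeping_lock);
	retry_path();
	/* raw_lock is held again here, matching the loop invariant */
	pthread_spin_unlock(&raw_lock);
	puts("invariant restored");
	return 0;
}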
@@ -1573,6 +1575,7 @@
 	struct task_struct *new_owner;
 	bool postunlock = false;
 	DEFINE_WAKE_Q(wake_q);
+	DEFINE_WAKE_Q(wake_sleeper_q);
 	int ret = 0;
 
 	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
@@ -1622,14 +1625,15 @@
 		 * not fail.
 		 */
 		pi_state_update_owner(pi_state, new_owner);
-		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
+						     &wake_sleeper_q);
 	}
 
 out_unlock:
 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 
 	if (postunlock)
-		rt_mutex_postunlock(&wake_q);
+		rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
 
 	return ret;
 }
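On RT, __rt_mutex_futex_unlock() takes a second queue for "sleeper" wakeups, and rt_mutex_postunlock() flushes both queues only after wait_lock has been dropped. A compilable userspace sketch of that collect-under-lock, wake-after-unlock pattern (the types and helpers are illustrative stand-ins, not the kernel's wake_q implementation):

/*
 * Userspace sketch of the wake-queue pattern, illustrative only:
 * waiters are collected on a list while a lock is held and are woken
 * only after the lock has been dropped, as rt_mutex_postunlock()
 * does with wake_q and wake_sleeper_q above.
 */
#include <semaphore.h>
#include <stddef.h>

struct waiter {
	sem_t sem;
	struct waiter *next;
};

struct wake_q_head {
	struct waiter *first;
};

static void wake_q_add(struct wake_q_head *q, struct waiter *w)
{
	w->next = q->first;	/* called with the lock held: no wakeup yet */
	q->first = w;
}

static void wake_up_q(struct wake_q_head *q)
{
	struct waiter *w = q->first;	/* called after the unlock */

	while (w) {
		struct waiter *next = w->next;

		sem_post(&w->sem);	/* the actual wakeup */
		w = next;
	}
	q->first = NULL;
}

int main(void)
{
	struct wake_q_head wq = { NULL }, sleeper_wq = { NULL };
	struct waiter a, b;

	sem_init(&a.sem, 0, 0);
	sem_init(&b.sem, 0, 0);

	/* "lock held": queue one regular and one sleeper wakeup */
	wake_q_add(&wq, &a);
	wake_q_add(&sleeper_wq, &b);

	/* "lock dropped": now perform the deferred wakeups */
	wake_up_q(&wq);
	wake_up_q(&sleeper_wq);

	sem_wait(&a.sem);
	sem_wait(&b.sem);
	return 0;
}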
@@ -2253,6 +2257,16 @@
 				requeue_pi_wake_futex(this, &key2, hb2);
 				drop_count++;
 				continue;
+			} else if (ret == -EAGAIN) {
+				/*
+				 * Waiter was woken by timeout or
+				 * signal and has set pi_blocked_on to
+				 * PI_WAKEUP_INPROGRESS before we
+				 * tried to enqueue it on the rtmutex.
+				 */
+				this->pi_state = NULL;
+				put_pi_state(pi_state);
+				continue;
 			} else if (ret) {
 				/*
 				 * rt_mutex_start_proxy_lock() detected a
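For this -EAGAIN branch to trigger, the rtmutex proxy-lock path has to notice the sentinel the waiter published in its own pi_blocked_on. A sketch of the check presumed to exist on that side in the RT patchset (simplified, not verbatim kernel code):

/*
 * Presumed shape of the rtmutex-side check that produces the -EAGAIN
 * handled above (an assumption based on the RT patchset, not verbatim):
 * a waiter that woke up early has already published the sentinel, so
 * the requeue must not enqueue it on the rtmutex.
 */
if (task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
	raw_spin_unlock(&task->pi_lock);
	return -EAGAIN;		/* futex_requeue() drops pi_state and skips */
}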
@@ -2816,10 +2830,9 @@
 	if (abs_time) {
 		to = &timeout;
 
-		hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
-				      CLOCK_REALTIME : CLOCK_MONOTONIC,
-				      HRTIMER_MODE_ABS);
-		hrtimer_init_sleeper(to, current);
+		hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
+					      CLOCK_REALTIME : CLOCK_MONOTONIC,
+					      HRTIMER_MODE_ABS, current);
 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
 					     current->timer_slack_ns);
 	}
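hrtimer_init_sleeper_on_stack() folds the previous init-timer-then-init-sleeper sequence into a single call; the same substitution repeats in the two later timer hunks. A sketch of the helper's presumed shape in this tree, derived from exactly the pair of calls it replaces (the body is an assumption, not the verbatim implementation):

/*
 * Presumed shape of the consolidated helper used above: it folds the
 * old hrtimer_init_on_stack() + hrtimer_init_sleeper() pair into one
 * call (sketch only).
 */
static inline void
hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id,
			      enum hrtimer_mode mode, struct task_struct *task)
{
	hrtimer_init_on_stack(&sl->timer, clock_id, mode);
	hrtimer_init_sleeper(sl, task);
}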
@@ -2917,9 +2930,8 @@
 
 	if (time) {
 		to = &timeout;
-		hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
-				      HRTIMER_MODE_ABS);
-		hrtimer_init_sleeper(to, current);
+		hrtimer_init_sleeper_on_stack(to, CLOCK_REALTIME,
+					      HRTIMER_MODE_ABS, current);
 		hrtimer_set_expires(&to->timer, *time);
 	}
 
@@ -2982,7 +2994,7 @@
 		goto no_block;
 	}
 
-	rt_mutex_init_waiter(&rt_waiter);
+	rt_mutex_init_waiter(&rt_waiter, false);
 
 	/*
 	 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
@@ -2998,6 +3010,14 @@
 	 * before __rt_mutex_start_proxy_lock() is done.
 	 */
 	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+	/*
+	 * the migrate_disable() here disables migration in the in_atomic() fast
+	 * path which is enabled again in the following spin_unlock(). We have
+	 * one migrate_disable() pending in the slow-path which is reversed
+	 * after the raw_spin_unlock_irq() where we leave the atomic context.
+	 */
+	migrate_disable();
+
 	spin_unlock(q.lock_ptr);
 	/*
 	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
@@ -3006,6 +3026,7 @@
 	 */
 	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
 	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+	migrate_enable();
 
 	if (ret) {
 		if (ret == 1)
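The comment above boils down to counter bookkeeping: on RT, spin_lock() and spin_unlock() on a sleeping lock internally pair migrate_disable() and migrate_enable(), so a spin_unlock() issued inside an irq-off region needs an extra migrate_disable() beforehand and a matching migrate_enable() once the raw lock is dropped. A small runnable model of that per-task counter (userspace, illustrative only):

/*
 * Userspace model of the per-task migrate-disable counter, illustrative
 * only: what matters is that every disable is balanced by exactly one
 * enable, even when lock and unlock happen in different contexts.
 */
#include <assert.h>
#include <stdio.h>

static __thread int migrate_disable_count;

static void migrate_disable(void) { migrate_disable_count++; }
static void migrate_enable(void)
{
	assert(migrate_disable_count > 0);
	migrate_disable_count--;
}

/* On RT, taking/releasing a sleeping spinlock pairs these internally. */
static void spin_lock_rt(void)   { migrate_disable(); /* + actual locking */ }
static void spin_unlock_rt(void) { /* actual unlocking + */ migrate_enable(); }

int main(void)
{
	spin_lock_rt();		/* slow path: one disable pending */

	migrate_disable();	/* the extra disable from the hunk above */
	spin_unlock_rt();	/* its internal enable is consumed here */

	migrate_enable();	/* reversed after raw_spin_unlock_irq() */

	assert(migrate_disable_count == 0);
	puts("migrate-disable count balanced");
	return 0;
}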
@@ -3140,10 +3161,19 @@
 		 * rt_waiter. Also see the WARN in wake_futex_pi().
 		 */
 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+		/*
+		 * Magic trickery for now to make the RT migrate disable
+		 * logic happy. The following spin_unlock() happens with
+		 * interrupts disabled so the internal migrate_enable()
+		 * won't undo the migrate_disable() which was issued when
+		 * locking hb->lock.
+		 */
+		migrate_disable();
 		spin_unlock(&hb->lock);
 
 		/* drops pi_state->pi_mutex.wait_lock */
 		ret = wake_futex_pi(uaddr, uval, pi_state);
+		migrate_enable();
 
 		put_pi_state(pi_state);
 
@@ -3314,7 +3344,7 @@
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct rt_mutex_waiter rt_waiter;
-	struct futex_hash_bucket *hb;
+	struct futex_hash_bucket *hb, *hb2;
 	union futex_key key2 = FUTEX_KEY_INIT;
 	struct futex_q q = futex_q_init;
 	int res, ret;
@@ -3330,10 +3360,9 @@
 
 	if (abs_time) {
 		to = &timeout;
-		hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
-				      CLOCK_REALTIME : CLOCK_MONOTONIC,
-				      HRTIMER_MODE_ABS);
-		hrtimer_init_sleeper(to, current);
+		hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ?
+					      CLOCK_REALTIME : CLOCK_MONOTONIC,
+					      HRTIMER_MODE_ABS, current);
 		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
 					     current->timer_slack_ns);
 	}
@@ -3342,7 +3371,7 @@
 	 * The waiter is allocated on our stack, manipulated by the requeue
 	 * code while we sleep on uaddr.
 	 */
-	rt_mutex_init_waiter(&rt_waiter);
+	rt_mutex_init_waiter(&rt_waiter, false);
 
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
 	if (unlikely(ret != 0))
@@ -3373,20 +3402,55 @@
 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
 	futex_wait_queue_me(hb, &q, to);
 
-	spin_lock(&hb->lock);
-	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
-	spin_unlock(&hb->lock);
-	if (ret)
-		goto out_put_keys;
+	/*
+	 * On RT we must avoid races with requeue and trying to block
+	 * on two mutexes (hb->lock and uaddr2's rtmutex) by
+	 * serializing access to pi_blocked_on with pi_lock.
+	 */
+	raw_spin_lock_irq(&current->pi_lock);
+	if (current->pi_blocked_on) {
+		/*
+		 * We have been requeued or are in the process of
+		 * being requeued.
+		 */
+		raw_spin_unlock_irq(&current->pi_lock);
+	} else {
+		/*
+		 * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
+		 * prevents a concurrent requeue from moving us to the
+		 * uaddr2 rtmutex. After that we can safely acquire
+		 * (and possibly block on) hb->lock.
+		 */
+		current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
+		raw_spin_unlock_irq(&current->pi_lock);
+
+		spin_lock(&hb->lock);
+
+		/*
+		 * Clean up pi_blocked_on. We might leak it otherwise
+		 * when we succeeded with the hb->lock in the fast
+		 * path.
+		 */
+		raw_spin_lock_irq(&current->pi_lock);
+		current->pi_blocked_on = NULL;
+		raw_spin_unlock_irq(&current->pi_lock);
+
+		ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+		spin_unlock(&hb->lock);
+		if (ret)
+			goto out_put_keys;
+	}
 
 	/*
-	 * In order for us to be here, we know our q.key == key2, and since
-	 * we took the hb->lock above, we also know that futex_requeue() has
-	 * completed and we no longer have to concern ourselves with a wakeup
-	 * race with the atomic proxy lock acquisition by the requeue code. The
-	 * futex_requeue dropped our key1 reference and incremented our key2
-	 * reference count.
+	 * In order to be here, we have either been requeued, are in
+	 * the process of being requeued, or requeue successfully
+	 * acquired uaddr2 on our behalf. If pi_blocked_on was
+	 * non-null above, we may be racing with a requeue. Do not
+	 * rely on q->lock_ptr to be hb2->lock until after blocking on
+	 * hb->lock or hb2->lock. The futex_requeue dropped our key1
+	 * reference and incremented our key2 reference count.
 	 */
+	hb2 = hash_futex(&key2);
 
 	/* Check if the requeue code acquired the second futex for us. */
 	if (!q.rt_waiter) {
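The else branch above publishes PI_WAKEUP_INPROGRESS in the task's own pi_blocked_on under pi_lock before blocking on hb->lock, which is what forces a concurrent requeue into the -EAGAIN branch shown earlier. A compact runnable model of that handshake, with a mutex standing in for pi_lock (all names illustrative):

/*
 * Userspace model of the pi_blocked_on handshake, illustrative only:
 * the waiter publishes a sentinel under its own lock; a requeuer that
 * sees the sentinel backs off instead of enqueueing the waiter.
 */
#include <pthread.h>
#include <stdio.h>

#define PI_WAKEUP_INPROGRESS ((void *)1)	/* sentinel value */

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static void *pi_blocked_on;			/* NULL: not blocked, not claimed */

/* Waiter side, mirroring the else branch above. */
static int waiter_claim(void)
{
	pthread_mutex_lock(&pi_lock);
	if (pi_blocked_on) {			/* requeue already owns us */
		pthread_mutex_unlock(&pi_lock);
		return 0;
	}
	pi_blocked_on = PI_WAKEUP_INPROGRESS;	/* block concurrent requeue */
	pthread_mutex_unlock(&pi_lock);
	return 1;
}

/* Requeue side, mirroring the -EAGAIN branch in futex_requeue(). */
static int requeue_try_enqueue(void)
{
	pthread_mutex_lock(&pi_lock);
	if (pi_blocked_on == PI_WAKEUP_INPROGRESS) {
		pthread_mutex_unlock(&pi_lock);
		return -1;			/* -EAGAIN: skip this waiter */
	}
	pi_blocked_on = (void *)2;		/* "enqueued on the rtmutex" */
	pthread_mutex_unlock(&pi_lock);
	return 0;
}

int main(void)
{
	if (waiter_claim())
		printf("waiter claimed itself; requeue -> %d (backs off)\n",
		       requeue_try_enqueue());
	return 0;
}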
@@ -3395,14 +3459,15 @@
 		 * did a lock-steal - fix up the PI-state in that case.
 		 */
 		if (q.pi_state && (q.pi_state->owner != current)) {
-			spin_lock(q.lock_ptr);
+			spin_lock(&hb2->lock);
+			BUG_ON(&hb2->lock != q.lock_ptr);
 			ret = fixup_pi_state_owner(uaddr2, &q, current);
 			/*
 			 * Drop the reference to the pi state which
 			 * the requeue_pi() code acquired for us.
 			 */
 			put_pi_state(q.pi_state);
-			spin_unlock(q.lock_ptr);
+			spin_unlock(&hb2->lock);
 			/*
 			 * Adjust the return value. It's either -EFAULT or
 			 * success (1) but the caller expects 0 for success.
@@ -3421,7 +3486,8 @@
 		pi_mutex = &q.pi_state->pi_mutex;
 		ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
 
-		spin_lock(q.lock_ptr);
+		spin_lock(&hb2->lock);
+		BUG_ON(&hb2->lock != q.lock_ptr);
 		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
 			ret = 0;
 