2023-11-06 e3e12f52b214121840b44c91de5b3e5af5d3eb84
--- a/kernel/kernel/sched/core.c
+++ b/kernel/kernel/sched/core.c
@@ -45,7 +45,11 @@
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
+#ifdef CONFIG_PREEMPT_RT_FULL
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#else
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#endif
 
 /*
  * period over which we measure -rt task CPU usage in us.
@@ -317,7 +321,7 @@
 	rq->hrtick_csd.info = rq;
 #endif
 
-	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 	rq->hrtick_timer.function = hrtick;
 }
 #else	/* CONFIG_SCHED_HRTICK */
@@ -399,9 +403,15 @@
 #endif
 #endif
 
-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
+		  bool sleeper)
 {
-	struct wake_q_node *node = &task->wake_q;
+	struct wake_q_node *node;
+
+	if (sleeper)
+		node = &task->wake_q_sleeper;
+	else
+		node = &task->wake_q;
 
 	/*
 	 * Atomically grab the task, if ->wake_q is !nil already it means
@@ -429,25 +439,33 @@
 static int
 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
 	       int sibling_count_hint);
-
-void wake_up_q(struct wake_q_head *head)
+void __wake_up_q(struct wake_q_head *head, bool sleeper)
 {
 	struct wake_q_node *node = head->first;
 
 	while (node != WAKE_Q_TAIL) {
 		struct task_struct *task;
 
-		task = container_of(node, struct task_struct, wake_q);
+		if (sleeper)
+			task = container_of(node, struct task_struct, wake_q_sleeper);
+		else
+			task = container_of(node, struct task_struct, wake_q);
 		BUG_ON(!task);
 		/* Task can safely be re-inserted now: */
 		node = node->next;
-		task->wake_q.next = NULL;
-
+		if (sleeper)
+			task->wake_q_sleeper.next = NULL;
+		else
+			task->wake_q.next = NULL;
 		/*
-		 * try_to_wake_up() executes a full barrier, which pairs with
+		 * wake_up_process() executes a full barrier, which pairs with
		 * the queueing in wake_q_add() so as not to miss wakeups.
 		 */
-		try_to_wake_up(task, TASK_NORMAL, 0, head->count);
+		if (sleeper)
+			wake_up_lock_sleeper(task);
+		else
+			wake_up_process(task);
+
 		put_task_struct(task);
 	}
 }
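
The wrappers that keep the existing wake_q_add()/wake_up_q() call sites working
are not part of this hunk; they presumably live in the matching wake_q header
change of the same series. A minimal sketch of what they would look like, given
the __wake_q_add()/__wake_up_q() signatures above:

	static inline void wake_q_add(struct wake_q_head *head,
				      struct task_struct *task)
	{
		__wake_q_add(head, task, false);
	}

	static inline void wake_q_add_sleeper(struct wake_q_head *head,
					      struct task_struct *task)
	{
		__wake_q_add(head, task, true);
	}

	static inline void wake_up_q(struct wake_q_head *head)
	{
		__wake_up_q(head, false);
	}

	static inline void wake_up_q_sleeper(struct wake_q_head *head)
	{
		__wake_up_q(head, true);
	}
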
@@ -486,6 +504,48 @@
 	else
 		trace_sched_wake_idle_without_ipi(cpu);
 }
+
+#ifdef CONFIG_PREEMPT_LAZY
+
+static int tsk_is_polling(struct task_struct *p)
+{
+#ifdef TIF_POLLING_NRFLAG
+	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
+#else
+	return 0;
+#endif
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	int cpu;
+
+	if (!sched_feat(PREEMPT_LAZY)) {
+		resched_curr(rq);
+		return;
+	}
+
+	lockdep_assert_held(&rq->lock);
+
+	if (test_tsk_need_resched(curr))
+		return;
+
+	if (test_tsk_need_resched_lazy(curr))
+		return;
+
+	set_tsk_need_resched_lazy(curr);
+
+	cpu = cpu_of(rq);
+	if (cpu == smp_processor_id())
+		return;
+
+	/* NEED_RESCHED_LAZY must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(curr))
+		smp_send_reschedule(cpu);
+}
+#endif
 
 void resched_cpu(int cpu)
 {
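
Sketch (assumption, not part of this hunk): elsewhere in the series the
fair-class preemption points in kernel/sched/fair.c are switched from
resched_curr() to resched_curr_lazy(), so a SCHED_OTHER wakeup only sets the
lazy flag, while RT/DL preemption keeps using resched_curr() and
TIF_NEED_RESCHED directly. Illustrative caller:

	static void example_fair_preemption_point(struct rq *rq, bool preempt)
	{
		if (preempt)
			resched_curr_lazy(rq);	/* was: resched_curr(rq) */
	}
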
@@ -1481,10 +1541,10 @@
  */
 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 {
-	if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 		return false;
 
-	if (is_per_cpu_kthread(p))
+	if (is_per_cpu_kthread(p) || __migrate_disabled(p))
 		return cpu_online(cpu);
 
 	return cpu_active(cpu);
@@ -1533,6 +1593,7 @@
 struct migration_arg {
 	struct task_struct *task;
 	int dest_cpu;
+	bool done;
 };
 
 /*
@@ -1568,6 +1629,11 @@
 	struct task_struct *p = arg->task;
 	struct rq *rq = this_rq();
 	struct rq_flags rf;
+	int dest_cpu = arg->dest_cpu;
+
+	/* We don't look at arg after this point. */
+	smp_mb();
+	arg->done = true;
 
 	/*
 	 * The original target CPU might have gone down and we might
@@ -1576,7 +1642,7 @@
 	local_irq_disable();
 	/*
 	 * We need to explicitly wake pending tasks before running
-	 * __migrate_task() such that we will not miss enforcing cpus_allowed
+	 * __migrate_task() such that we will not miss enforcing cpus_ptr
 	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
 	 */
 	sched_ttwu_pending();
@@ -1590,9 +1656,9 @@
 	 */
 	if (task_rq(p) == rq) {
 		if (task_on_rq_queued(p))
-			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+			rq = __migrate_task(rq, &rf, p, dest_cpu);
 		else
-			p->wake_cpu = arg->dest_cpu;
+			p->wake_cpu = dest_cpu;
 	}
 	rq_unlock(rq, &rf);
 	raw_spin_unlock(&p->pi_lock);
@@ -1607,9 +1673,18 @@
  */
 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
 {
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->nr_cpus_allowed = cpumask_weight(new_mask);
+	cpumask_copy(&p->cpus_mask, new_mask);
+	if (p->cpus_ptr == &p->cpus_mask)
+		p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
+
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
+int __migrate_disabled(struct task_struct *p)
+{
+	return p->migrate_disable;
+}
+EXPORT_SYMBOL_GPL(__migrate_disabled);
+#endif
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
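
From here on the patch replaces &p->cpus_allowed with the indirection
p->cpus_ptr, which normally points at the task's own p->cpus_mask;
migrate_disable() (added at the bottom of this file) temporarily repoints it
at cpumask_of(smp_processor_id()). A hypothetical helper, only to spell out
the invariant the later hunks test:

	static inline bool task_pinned_by_migrate_disable(struct task_struct *p)
	{
		/* true only while cpus_ptr has been repointed by migrate_disable() */
		return p->cpus_ptr != &p->cpus_mask;
	}
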
@@ -1677,7 +1752,7 @@
 		goto out;
 	}
 
-	if (cpumask_equal(&p->cpus_allowed, new_mask))
+	if (cpumask_equal(&p->cpus_mask, new_mask))
 		goto out;
 
 	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
@@ -1699,7 +1774,8 @@
 	}
 
 	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
+	if (cpumask_test_cpu(task_cpu(p), new_mask) ||
+	    p->cpus_ptr != &p->cpus_mask)
 		goto out;
 
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
@@ -1840,10 +1916,10 @@
 	if (task_cpu(arg->src_task) != arg->src_cpu)
 		goto unlock;
 
-	if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
+	if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
 		goto unlock;
 
-	if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
+	if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
 		goto unlock;
 
 	__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1885,10 +1961,10 @@
 	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
 		goto out;
 
-	if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
+	if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
 		goto out;
 
-	if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
+	if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
 		goto out;
 
 	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1898,6 +1974,18 @@
 	return ret;
 }
 #endif /* CONFIG_NUMA_BALANCING */
+
+static bool check_task_state(struct task_struct *p, long match_state)
+{
+	bool match = false;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	if (p->state == match_state || p->saved_state == match_state)
+		match = true;
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	return match;
+}
 
 /*
  * wait_task_inactive - wait for a thread to unschedule.
@@ -1943,7 +2031,7 @@
 		 * is actually now running somewhere else!
 		 */
 		while (task_running(rq, p)) {
-			if (match_state && unlikely(p->state != match_state))
+			if (match_state && !check_task_state(p, match_state))
 				return 0;
 			cpu_relax();
 		}
@@ -1958,7 +2046,8 @@
 		running = task_running(rq, p);
 		queued = task_on_rq_queued(p);
 		ncsw = 0;
-		if (!match_state || p->state == match_state)
+		if (!match_state || p->state == match_state ||
+		    p->saved_state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &rf);
 
@@ -2033,7 +2122,7 @@
 EXPORT_SYMBOL_GPL(kick_process);
 
 /*
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
  *
 * A few notes on cpu_active vs cpu_online:
 *
@@ -2073,14 +2162,14 @@
 		for_each_cpu(dest_cpu, nodemask) {
 			if (!cpu_active(dest_cpu))
 				continue;
-			if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+			if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
 				return dest_cpu;
 		}
 	}
 
 	for (;;) {
 		/* Any allowed, online CPU? */
-		for_each_cpu(dest_cpu, &p->cpus_allowed) {
+		for_each_cpu(dest_cpu, p->cpus_ptr) {
 			if (!is_cpu_allowed(p, dest_cpu))
 				continue;
 
@@ -2124,7 +2213,7 @@
 }
 
 /*
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
 */
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
@@ -2136,11 +2225,11 @@
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
 						     sibling_count_hint);
 	else
-		cpu = cpumask_any(&p->cpus_allowed);
+		cpu = cpumask_any(p->cpus_ptr);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
-	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
+	 * to rely on ttwu() to place the task on a valid ->cpus_ptr
 	 * CPU.
 	 *
 	 * Since this is common to all placement strategies, this lives here.
@@ -2243,10 +2332,6 @@
 {
 	activate_task(rq, p, en_flags);
 	p->on_rq = TASK_ON_RQ_QUEUED;
-
-	/* If a worker is waking up, notify the workqueue: */
-	if (p->flags & PF_WQ_WORKER)
-		wq_worker_waking_up(p, cpu_of(rq));
 }
 
 /*
@@ -2571,8 +2656,27 @@
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	smp_mb__after_spinlock();
-	if (!(p->state & state))
+	if (!(p->state & state)) {
+		/*
+		 * The task might be running due to a spinlock sleeper
+		 * wakeup. Check the saved state and set it to running
+		 * if the wakeup condition is true.
+		 */
+		if (!(wake_flags & WF_LOCK_SLEEPER)) {
+			if (p->saved_state & state) {
+				p->saved_state = TASK_RUNNING;
+				success = 1;
+			}
+		}
 		goto out;
+	}
+
+	/*
+	 * If this is a regular wakeup, then we can unconditionally
+	 * clear the saved state of a "lock sleeper".
+	 */
+	if (!(wake_flags & WF_LOCK_SLEEPER))
+		p->saved_state = TASK_RUNNING;
 
 	trace_sched_waking(p);
 
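
Counterpart sketch (assumption; the real code lives in the RT rtmutex
implementation, not in this file): before blocking on a "sleeping spinlock"
the task parks its real sleep state in ->saved_state and waits in
TASK_UNINTERRUPTIBLE, which is what the WF_LOCK_SLEEPER handling above
restores or clears. Roughly:

	static void rt_sleeping_lock_save_state(struct task_struct *self)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&self->pi_lock, flags);
		self->saved_state = self->state;	/* remember the real state */
		self->state = TASK_UNINTERRUPTIBLE;	/* sleep for the lock itself */
		raw_spin_unlock_irqrestore(&self->pi_lock, flags);
	}
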
@@ -2672,56 +2776,6 @@
 }
 
 /**
- * try_to_wake_up_local - try to wake up a local task with rq lock held
- * @p: the thread to be awakened
- * @rf: request-queue flags for pinning
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.
- */
-static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
-{
-	struct rq *rq = task_rq(p);
-
-	if (WARN_ON_ONCE(rq != this_rq()) ||
-	    WARN_ON_ONCE(p == current))
-		return;
-
-	lockdep_assert_held(&rq->lock);
-
-	if (!raw_spin_trylock(&p->pi_lock)) {
-		/*
-		 * This is OK, because current is on_cpu, which avoids it being
-		 * picked for load-balance and preemption/IRQs are still
-		 * disabled avoiding further scheduler activity on it and we've
-		 * not yet picked a replacement task.
-		 */
-		rq_unlock(rq, rf);
-		raw_spin_lock(&p->pi_lock);
-		rq_relock(rq, rf);
-	}
-
-	if (!(p->state & TASK_NORMAL))
-		goto out;
-
-	trace_sched_waking(p);
-
-	if (!task_on_rq_queued(p)) {
-		if (p->in_iowait) {
-			delayacct_blkio_end(p);
-			atomic_dec(&rq->nr_iowait);
-		}
-		ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
-	}
-
-	ttwu_do_wakeup(rq, p, 0, rf);
-	ttwu_stat(p, smp_processor_id(), 0);
-out:
-	raw_spin_unlock(&p->pi_lock);
-}
-
-/**
  * wake_up_process - Wake up a specific process
  * @p: The process to be woken up.
  *
@@ -2737,6 +2791,18 @@
 	return try_to_wake_up(p, TASK_NORMAL, 0, 1);
 }
 EXPORT_SYMBOL(wake_up_process);
+
+/**
+ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
+ * @p: The process to be woken up.
+ *
+ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
+ * the nature of the wakeup.
+ */
+int wake_up_lock_sleeper(struct task_struct *p)
+{
+	return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER, 1);
+}
 
 int wake_up_state(struct task_struct *p, unsigned int state)
 {
@@ -2978,6 +3044,9 @@
 	p->on_cpu = 0;
 #endif
 	init_task_preempt_count(p);
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+	task_thread_info(p)->preempt_lazy_count = 0;
+#endif
 #ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
@@ -3018,7 +3087,7 @@
 #ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
-	 *  - cpus_allowed can change in the fork path
+	 *  - cpus_ptr can change in the fork path
 	 *  - any previously selected CPU might disappear through hotplug
 	 *
 	 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
@@ -3307,22 +3376,17 @@
 	 *   provided by mmdrop(),
 	 * - a sync_core for SYNC_CORE.
 	 */
+	/*
+	 * We use mmdrop_delayed() here so we don't have to do the
+	 * full __mmdrop() when we are the last user.
+	 */
 	if (mm) {
 		membarrier_mm_sync_core_before_usermode(mm);
-		mmdrop(mm);
+		mmdrop_delayed(mm);
 	}
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
-
-		/*
-		 * Remove function-return probe instances associated with this
-		 * task and put them back on the free list.
-		 */
-		kprobe_flush_task(prev);
-
-		/* Task is done with its stack. */
-		put_task_stack(prev);
 
 		put_task_struct(prev);
 	}
@@ -4001,6 +4065,8 @@
 	BUG();
 }
 
+static void migrate_disabled_sched(struct task_struct *p);
+
 /*
  * __schedule() is the main scheduler function.
  *
@@ -4071,6 +4137,9 @@
 	rq_lock(rq, &rf);
 	smp_mb__after_spinlock();
 
+	if (__migrate_disabled(prev))
+		migrate_disabled_sched(prev);
+
 	/* Promote REQ to ACT */
 	rq->clock_update_flags <<= 1;
 	update_rq_clock(rq);
@@ -4087,25 +4156,13 @@
 				atomic_inc(&rq->nr_iowait);
 				delayacct_blkio_start();
 			}
-
-			/*
-			 * If a worker went to sleep, notify and ask workqueue
-			 * whether it wants to wake up a task to maintain
-			 * concurrency.
-			 */
-			if (prev->flags & PF_WQ_WORKER) {
-				struct task_struct *to_wakeup;
-
-				to_wakeup = wq_worker_sleeping(prev);
-				if (to_wakeup)
-					try_to_wake_up_local(to_wakeup, &rf);
-			}
 		}
 		switch_count = &prev->nvcsw;
 	}
 
 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
+	clear_tsk_need_resched_lazy(prev);
 	clear_preempt_need_resched();
 
 	if (likely(prev != next)) {
@@ -4157,14 +4214,37 @@
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-	if (!tsk->state || tsk_is_pi_blocked(tsk))
+	if (!tsk->state)
 		return;
+
+	/*
+	 * If a worker went to sleep, notify and ask workqueue whether
+	 * it wants to wake up a task to maintain concurrency.
+	 * As this function is called inside the schedule() context,
+	 * we disable preemption to avoid it calling schedule() again
+	 * in the possible wakeup of a kworker.
+	 */
+	if (tsk->flags & PF_WQ_WORKER) {
+		preempt_disable();
+		wq_worker_sleeping(tsk);
+		preempt_enable_no_resched();
+	}
+
+	if (tsk_is_pi_blocked(tsk))
+		return;
+
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
 	 * make sure to submit it to avoid deadlocks.
 	 */
 	if (blk_needs_flush_plug(tsk))
 		blk_schedule_flush_plug(tsk);
+}
+
+static void sched_update_worker(struct task_struct *tsk)
+{
+	if (tsk->flags & PF_WQ_WORKER)
+		wq_worker_running(tsk);
 }
 
 asmlinkage __visible void __sched schedule(void)
@@ -4177,6 +4257,7 @@
 		__schedule(false);
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
+	sched_update_worker(tsk);
 }
 EXPORT_SYMBOL(schedule);
 
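
Context sketch (abridged from the surrounding mainline code, which this patch
leaves otherwise unchanged): sched_submit_work() runs on entry to schedule(),
before the __schedule() loop shown above, and sched_update_worker() runs once
the task is executing again:

	asmlinkage __visible void __sched schedule(void)
	{
		struct task_struct *tsk = current;

		sched_submit_work(tsk);
		do {
			preempt_disable();
			__schedule(false);
			sched_preempt_enable_no_resched();
		} while (need_resched());
		sched_update_worker(tsk);
	}
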
@@ -4265,6 +4346,30 @@
 	} while (need_resched());
 }
 
+#ifdef CONFIG_PREEMPT_LAZY
+/*
+ * If TIF_NEED_RESCHED is set, we allow being scheduled away since that flag
+ * is set by an RT task. Otherwise we try to avoid being scheduled out as
+ * long as the preempt_lazy_count counter is > 0.
+ */
+static __always_inline int preemptible_lazy(void)
+{
+	if (test_thread_flag(TIF_NEED_RESCHED))
+		return 1;
+	if (current_thread_info()->preempt_lazy_count)
+		return 0;
+	return 1;
+}
+
+#else
+
+static inline int preemptible_lazy(void)
+{
+	return 1;
+}
+
+#endif
+
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
@@ -4279,7 +4384,8 @@
 	 */
 	if (likely(!preemptible()))
 		return;
-
+	if (!preemptible_lazy())
+		return;
 	preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
@@ -4304,6 +4410,9 @@
 	enum ctx_state prev_ctx;
 
 	if (likely(!preemptible()))
+		return;
+
+	if (!preemptible_lazy())
 		return;
 
 	do {
@@ -4951,7 +5060,7 @@
 		 * the entire root_domain to become SCHED_DEADLINE. We
 		 * will also fail if there's no bandwidth available.
 		 */
-		if (!cpumask_subset(span, &p->cpus_allowed) ||
+		if (!cpumask_subset(span, p->cpus_ptr) ||
 		    rq->rd->dl_bw.bw == 0) {
 			task_rq_unlock(rq, p, &rf);
 			return -EPERM;
@@ -5569,7 +5678,7 @@
 		goto out_unlock;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+	cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 out_unlock:
@@ -6106,7 +6215,9 @@
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
-
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+	task_thread_info(idle)->preempt_lazy_count = 0;
+#endif
 	/*
 	 * The idle tasks have their own, simple scheduling class:
 	 */
@@ -6145,7 +6256,7 @@
 	 * allowed nodes is unnecessary. Thus, cpusets are not
 	 * applicable for such threads. This prevents checking for
 	 * success of set_cpus_allowed_ptr() on all attached tasks
-	 * before cpus_allowed may be changed.
+	 * before cpus_mask may be changed.
 	 */
 	if (p->flags & PF_NO_SETAFFINITY) {
 		ret = -EINVAL;
@@ -6172,7 +6283,7 @@
 	if (curr_cpu == target_cpu)
 		return 0;
 
-	if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
+	if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
 		return -EINVAL;
 
 	/* TODO: This is not properly updating schedstats */
@@ -6211,6 +6322,7 @@
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_HOTPLUG_CPU
+
 /*
  * Ensure that the idle task is using init_mm right before its CPU goes
 * offline.
@@ -6310,8 +6422,10 @@
 		BUG_ON(!next);
 		put_prev_task(rq, next);
 
+		WARN_ON_ONCE(__migrate_disabled(next));
+
 		/*
-		 * Rules for changing task_struct::cpus_allowed are holding
+		 * Rules for changing task_struct::cpus_mask are holding
 		 * both pi_lock and rq->lock, such that holding either
 		 * stabilizes the mask.
 		 *
@@ -6777,7 +6891,7 @@
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = preempt_count() + rcu_preempt_depth();
+	int nested = preempt_count() + sched_rcu_preempt_depth();
 
 	return (nested == preempt_offset);
 }
@@ -8014,3 +8128,171 @@
 };
 
 #undef CREATE_TRACE_POINTS
+
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
+
+static inline void
+update_nr_migratory(struct task_struct *p, long delta)
+{
+	if (unlikely((p->sched_class == &rt_sched_class ||
+		      p->sched_class == &dl_sched_class) &&
+		      p->nr_cpus_allowed > 1)) {
+		if (p->sched_class == &rt_sched_class)
+			task_rq(p)->rt.rt_nr_migratory += delta;
+		else
+			task_rq(p)->dl.dl_nr_migratory += delta;
+	}
+}
+
+static inline void
+migrate_disable_update_cpus_allowed(struct task_struct *p)
+{
+	p->cpus_ptr = cpumask_of(smp_processor_id());
+	update_nr_migratory(p, -1);
+	p->nr_cpus_allowed = 1;
+}
+
+static inline void
+migrate_enable_update_cpus_allowed(struct task_struct *p)
+{
+	struct rq *rq;
+	struct rq_flags rf;
+
+	rq = task_rq_lock(p, &rf);
+	p->cpus_ptr = &p->cpus_mask;
+	p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
+	update_nr_migratory(p, 1);
+	task_rq_unlock(rq, p, &rf);
+}
+
+void migrate_disable(void)
+{
+	preempt_disable();
+
+	if (++current->migrate_disable == 1) {
+		this_rq()->nr_pinned++;
+		preempt_lazy_disable();
+#ifdef CONFIG_SCHED_DEBUG
+		WARN_ON_ONCE(current->pinned_on_cpu >= 0);
+		current->pinned_on_cpu = smp_processor_id();
+#endif
+	}
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(migrate_disable);
+
+static void migrate_disabled_sched(struct task_struct *p)
+{
+	if (p->migrate_disable_scheduled)
+		return;
+
+	migrate_disable_update_cpus_allowed(p);
+	p->migrate_disable_scheduled = 1;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, migrate_work);
+static DEFINE_PER_CPU(struct migration_arg, migrate_arg);
+
+void migrate_enable(void)
+{
+	struct task_struct *p = current;
+	struct rq *rq = this_rq();
+	int cpu = task_cpu(p);
+
+	WARN_ON_ONCE(p->migrate_disable <= 0);
+	if (p->migrate_disable > 1) {
+		p->migrate_disable--;
+		return;
+	}
+
+	preempt_disable();
+
+#ifdef CONFIG_SCHED_DEBUG
+	WARN_ON_ONCE(current->pinned_on_cpu != cpu);
+	current->pinned_on_cpu = -1;
+#endif
+
+	WARN_ON_ONCE(rq->nr_pinned < 1);
+
+	p->migrate_disable = 0;
+	rq->nr_pinned--;
+#ifdef CONFIG_HOTPLUG_CPU
+	if (rq->nr_pinned == 0 && unlikely(!cpu_active(cpu)) &&
+	    takedown_cpu_task)
+		wake_up_process(takedown_cpu_task);
+#endif
+
+	if (!p->migrate_disable_scheduled)
+		goto out;
+
+	p->migrate_disable_scheduled = 0;
+
+	migrate_enable_update_cpus_allowed(p);
+
+	WARN_ON(smp_processor_id() != cpu);
+	if (!is_cpu_allowed(p, cpu)) {
+		struct migration_arg __percpu *arg;
+		struct cpu_stop_work __percpu *work;
+		struct rq_flags rf;
+
+		work = this_cpu_ptr(&migrate_work);
+		arg = this_cpu_ptr(&migrate_arg);
+		WARN_ON_ONCE(!arg->done && !work->disabled && work->arg);
+
+		arg->task = p;
+		arg->done = false;
+
+		rq = task_rq_lock(p, &rf);
+		update_rq_clock(rq);
+		arg->dest_cpu = select_fallback_rq(cpu, p);
+		task_rq_unlock(rq, p, &rf);
+
+		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+				    arg, work);
+		tlb_migrate_finish(p->mm);
+	}
+
+out:
+	preempt_lazy_enable();
+	preempt_enable();
+}
+EXPORT_SYMBOL(migrate_enable);
+
+int cpu_nr_pinned(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	return rq->nr_pinned;
+}
+
+#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
+static void migrate_disabled_sched(struct task_struct *p)
+{
+}
+
+void migrate_disable(void)
+{
+#ifdef CONFIG_SCHED_DEBUG
+	current->migrate_disable++;
+#endif
+	barrier();
+}
+EXPORT_SYMBOL(migrate_disable);
+
+void migrate_enable(void)
+{
+#ifdef CONFIG_SCHED_DEBUG
+	struct task_struct *p = current;
+
+	WARN_ON_ONCE(p->migrate_disable <= 0);
+	p->migrate_disable--;
+#endif
+	barrier();
+}
+EXPORT_SYMBOL(migrate_enable);
+#else
+static void migrate_disabled_sched(struct task_struct *p)
+{
+}
+#endif
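
Usage sketch (illustrative): inside a migrate_disable() section the task may
still be preempted and, on PREEMPT_RT, may even sleep on a sleeping spinlock,
but it cannot be migrated, so the CPU observed at the start stays valid until
migrate_enable():

	static void example_stay_on_this_cpu(void)
	{
		int cpu;

		migrate_disable();
		cpu = smp_processor_id();	/* stable until migrate_enable() */
		/* ... work that must finish on @cpu, possibly sleeping ... */
		migrate_enable();
	}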