2024-05-10 61598093bbdd283a7edc367d900f223070ead8d2
kernel/kernel/rcu/tasks.h
@@ -171,8 +171,9 @@
 static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
 {
         /* Complain if the scheduler has not started. */
-        WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
-                  "synchronize_rcu_tasks called too soon");
+        if (WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+                      "synchronize_%s() called too soon", rtp->name))
+                return;
 
         /* Wait for the grace period. */
         wait_rcu_gp(rtp->call_func);
@@ -416,11 +417,21 @@
 static void rcu_tasks_postscan(struct list_head *hop)
 {
         /*
-         * Wait for tasks that are in the process of exiting. This
-         * does only part of the job, ensuring that all tasks that were
-         * previously exiting reach the point where they have disabled
-         * preemption, allowing the later synchronize_rcu() to finish
-         * the job.
+         * Exiting tasks may escape the tasklist scan. Those are vulnerable
+         * until their final schedule() with TASK_DEAD state. To cope with
+         * this, divide the fragile exit path part in two intersecting
+         * read side critical sections:
+         *
+         * 1) An _SRCU_ read side starting before calling exit_notify(),
+         *    which may remove the task from the tasklist, and ending after
+         *    the final preempt_disable() call in do_exit().
+         *
+         * 2) An _RCU_ read side starting with the final preempt_disable()
+         *    call in do_exit() and ending with the final call to schedule()
+         *    with TASK_DEAD state.
+         *
+         * This handles the part 1). And postgp will handle part 2) with a
+         * call to synchronize_rcu().
          */
         synchronize_srcu(&tasks_rcu_exit_srcu);
 }
@@ -487,7 +498,10 @@
          *
          * In addition, this synchronize_rcu() waits for exiting tasks
          * to complete their final preempt_disable() region of execution,
-         * cleaning up after the synchronize_srcu() above.
+         * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
+         * enforcing the whole region before tasklist removal until
+         * the final schedule() with TASK_DEAD state to be an RCU TASKS
+         * read side critical section.
          */
         synchronize_rcu();
 }
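
Taken together, the two comment hunks above describe how the exit path stays covered: an SRCU read side spans the tasklist-removal window, then hands off to an RCU read side that lasts until the final schedule() with TASK_DEAD. The outline below is only an illustration of that ordering, reconstructed from those comments; it is not part of this patch, the function name is hypothetical, and everything else the exit path does is elided.

/* Illustration only: the ordering implied by the rcu_tasks_postscan()
 * and rcu_tasks_postgp() comments above, seen from the exiting task. */
void exiting_task_outline(void)
{
        exit_tasks_rcu_start();         /* Part 1): SRCU read side begins. */

        /* exit_notify() may remove the task from the tasklist here,
         * so a later tasklist scan can miss this task. */

        preempt_disable();              /* Part 2): RCU read side begins;
                                         * rcu_tasks_postgp()'s synchronize_rcu()
                                         * waits this region out. */
        exit_tasks_rcu_finish();        /* Part 1) ends; rcu_tasks_postscan()'s
                                         * synchronize_srcu() may now return. */

        /* The final schedule() with TASK_DEAD state ends part 2). */
}
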
@@ -576,28 +590,43 @@
 }
 #endif /* #ifndef CONFIG_TINY_RCU */
 
-/* Do the srcu_read_lock() for the above synchronize_srcu(). */
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
 void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
 {
-        preempt_disable();
         current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
-        preempt_enable();
 }
 
-/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
-void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu)
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
 {
         struct task_struct *t = current;
 
-        preempt_disable();
         __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
-        preempt_enable();
-        exit_tasks_rcu_finish_trace(t);
+}
+
+/*
+ * Contribute to protect against tasklist scan blind spot while the
+ * task is exiting and may be removed from the tasklist. See
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_finish(void)
+{
+        exit_tasks_rcu_stop();
+        exit_tasks_rcu_finish_trace(current);
 }
 
 #else /* #ifdef CONFIG_TASKS_RCU */
 static inline void show_rcu_tasks_classic_gp_kthread(void) { }
 void exit_tasks_rcu_start(void) { }
+void exit_tasks_rcu_stop(void) { }
 void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
 #endif /* #else #ifdef CONFIG_TASKS_RCU */
 
@@ -620,9 +649,6 @@
 // Wait for one rude RCU-tasks grace period.
 static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
 {
-        if (num_online_cpus() <= 1)
-                return; // Fastpath for only one CPU.
-
         rtp->n_ipis += cpumask_weight(cpu_online_mask);
         schedule_on_each_cpu(rcu_tasks_be_rude);
 }
@@ -707,7 +733,7 @@
 #endif /* #ifndef CONFIG_TINY_RCU */
 
 #else /* #ifdef CONFIG_TASKS_RUDE_RCU */
-static void show_rcu_tasks_rude_gp_kthread(void) {}
+static inline void show_rcu_tasks_rude_gp_kthread(void) {}
 #endif /* #else #ifdef CONFIG_TASKS_RUDE_RCU */
 
 ////////////////////////////////////////////////////////////////////////
@@ -775,7 +801,7 @@
 /* If we are the last reader, wake up the grace-period kthread. */
 void rcu_read_unlock_trace_special(struct task_struct *t, int nesting)
 {
-        int nq = t->trc_reader_special.b.need_qs;
+        int nq = READ_ONCE(t->trc_reader_special.b.need_qs);
 
         if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
             t->trc_reader_special.b.need_mb)
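
This hunk starts a pattern that repeats through the rest of the patch: fields that an IPI handler or a remote inspection path can update concurrently (trc_reader_nesting, trc_reader_special.b.need_qs) are now accessed with READ_ONCE()/WRITE_ONCE(), so the compiler cannot tear, fuse, or re-read them. Below is a minimal userspace analogue of that discipline, using C11 atomics purely as a stand-in for the kernel macros; the struct and function names here are illustrative, not from the patch. Relaxed ordering mirrors the marked-access-only intent; the kernel code relies on separate barriers for ordering.

#include <stdatomic.h>
#include <stdbool.h>

struct reader_state {
        atomic_bool need_qs;    /* models t->trc_reader_special.b.need_qs */
};

/* Models WRITE_ONCE(t->trc_reader_special.b.need_qs, true) performed
 * remotely (IPI handler or trc_inspect_reader()). */
static void remote_set_need_qs(struct reader_state *rs)
{
        atomic_store_explicit(&rs->need_qs, true, memory_order_relaxed);
}

/* Models int nq = READ_ONCE(t->trc_reader_special.b.need_qs) in the
 * rcu_read_unlock_trace() slow path. */
static bool unlock_path_need_qs(struct reader_state *rs)
{
        return atomic_load_explicit(&rs->need_qs, memory_order_relaxed);
}
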
@@ -815,33 +841,25 @@
 
         // If the task is no longer running on this CPU, leave.
         if (unlikely(texp != t)) {
-                if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
-                        wake_up(&trc_wait);
                 goto reset_ipi; // Already on holdout list, so will check later.
         }
 
         // If the task is not in a read-side critical section, and
         // if this is the last reader, awaken the grace-period kthread.
-        if (likely(!t->trc_reader_nesting)) {
-                if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
-                        wake_up(&trc_wait);
-                // Mark as checked after decrement to avoid false
-                // positives on the above WARN_ON_ONCE().
+        if (likely(!READ_ONCE(t->trc_reader_nesting))) {
                 WRITE_ONCE(t->trc_reader_checked, true);
                 goto reset_ipi;
         }
         // If we are racing with an rcu_read_unlock_trace(), try again later.
-        if (unlikely(t->trc_reader_nesting < 0)) {
-                if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
-                        wake_up(&trc_wait);
+        if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0))
                 goto reset_ipi;
-        }
         WRITE_ONCE(t->trc_reader_checked, true);
 
         // Get here if the task is in a read-side critical section. Set
         // its state so that it will awaken the grace-period kthread upon
         // exit from that critical section.
-        WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
+        atomic_inc(&trc_n_readers_need_end); // One more to wait on.
+        WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
         WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
 
 reset_ipi:
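
The net effect of this hunk is a change in counting discipline: trc_read_check_handler() no longer decrements trc_n_readers_need_end on its early-exit paths; instead the counter is incremented only where a still-active reader is actually tagged with need_qs (mirroring trc_inspect_reader() below), and that reader decrements it when it leaves its critical section. Here is a self-contained sketch of that pattern, using C11 atomics in place of the kernel primitives; the names are illustrative, not the kernel's.

#include <stdatomic.h>
#include <stdbool.h>

/* Starts at 1: the grace-period side's "safety count", dropped only
 * after the tagging pass (see the postgp hunk further down). */
static atomic_int readers_need_end = 1;

/* Tag a reader that is still inside its critical section. */
static void tag_active_reader(atomic_bool *need_qs)
{
        atomic_fetch_add(&readers_need_end, 1); /* one more to wait on */
        atomic_store(need_qs, true);
}

/* Reader side: the tagged reader exits its critical section. */
static void reader_unlock_slowpath(atomic_bool *need_qs)
{
        if (atomic_exchange(need_qs, false) &&
            atomic_fetch_sub(&readers_need_end, 1) == 1) {
                /* Last outstanding reader: wake the grace-period waiter. */
        }
}

/* Grace-period side: drop the safety count after the tagging pass. */
static void drop_safety_count(void)
{
        if (atomic_fetch_sub(&readers_need_end, 1) == 1) {
                /* No tagged readers remain. */
        }
}
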
@@ -856,7 +874,7 @@
 static bool trc_inspect_reader(struct task_struct *t, void *arg)
 {
         int cpu = task_cpu(t);
-        bool in_qs = false;
+        int nesting;
         bool ofl = cpu_is_offline(cpu);
 
         if (task_curr(t)) {
@@ -876,23 +894,24 @@
                 n_heavy_reader_updates++;
                 if (ofl)
                         n_heavy_reader_ofl_updates++;
-                in_qs = true;
+                nesting = 0;
         } else {
-                in_qs = likely(!t->trc_reader_nesting);
+                // The task is not running, so C-language access is safe.
+                nesting = t->trc_reader_nesting;
         }
 
-        // Mark as checked so that the grace-period kthread will
-        // remove it from the holdout list.
-        t->trc_reader_checked = true;
-
-        if (in_qs)
-                return true; // Already in quiescent state, done!!!
+        // If not exiting a read-side critical section, mark as checked
+        // so that the grace-period kthread will remove it from the
+        // holdout list.
+        t->trc_reader_checked = nesting >= 0;
+        if (nesting <= 0)
+                return !nesting; // If in QS, done, otherwise try again later.
 
         // The task is in a read-side critical section, so set up its
         // state so that it will awaken the grace-period kthread upon exit
         // from that critical section.
         atomic_inc(&trc_n_readers_need_end); // One more to wait on.
-        WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
+        WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
         WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
         return true;
 }
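
trc_inspect_reader() now reports the sampled trc_reader_nesting value instead of a boolean, and the two statements "t->trc_reader_checked = nesting >= 0;" and "if (nesting <= 0) return !nesting;" encode three cases at once. The small helper below merely spells those cases out for clarity; it is a restatement, not code from the patch.

/* Restates the nesting-based decision in the hunk above. */
enum inspect_outcome { IN_QS_DONE, TRY_AGAIN_LATER, STILL_IN_READER };

static enum inspect_outcome classify_nesting(int nesting)
{
        if (nesting == 0)       /* quiescent: checked = true, return true */
                return IN_QS_DONE;
        if (nesting < 0)        /* racing with rcu_read_unlock_trace():
                                 * checked = false, return false */
                return TRY_AGAIN_LATER;
        return STILL_IN_READER; /* checked = true, then need_qs is set and
                                 * trc_n_readers_need_end is incremented */
}
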
@@ -910,7 +929,7 @@
         // The current task had better be in a quiescent state.
         if (t == current) {
                 t->trc_reader_checked = true;
-                WARN_ON_ONCE(t->trc_reader_nesting);
+                WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
                 return;
         }
 
@@ -933,21 +952,17 @@
                 if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0)
                         return;
 
-                atomic_inc(&trc_n_readers_need_end);
                 per_cpu(trc_ipi_to_cpu, cpu) = true;
                 t->trc_ipi_to_cpu = cpu;
                 rcu_tasks_trace.n_ipis++;
-                if (smp_call_function_single(cpu,
-                                             trc_read_check_handler, t, 0)) {
+                if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) {
                         // Just in case there is some other reason for
                         // failure than the target CPU being offline.
+                        WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n",
+                                  __func__, cpu);
                         rcu_tasks_trace.n_ipis_fails++;
                         per_cpu(trc_ipi_to_cpu, cpu) = false;
-                        t->trc_ipi_to_cpu = cpu;
-                        if (atomic_dec_and_test(&trc_n_readers_need_end)) {
-                                WARN_ON_ONCE(1);
-                                wake_up(&trc_wait);
-                        }
+                        t->trc_ipi_to_cpu = -1;
                 }
         }
 }
@@ -1020,8 +1035,8 @@
                  ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0],
                  ".i"[is_idle_task(t)],
                  ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)],
-                 t->trc_reader_nesting,
-                 " N"[!!t->trc_reader_special.b.need_qs],
+                 READ_ONCE(t->trc_reader_nesting),
+                 " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)],
                  cpu);
         sched_show_task(t);
 }
@@ -1068,13 +1083,27 @@
         }
 }
 
+static void rcu_tasks_trace_empty_fn(void *unused)
+{
+}
+
 /* Wait for grace period to complete and provide ordering. */
 static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
 {
+        int cpu;
         bool firstreport;
         struct task_struct *g, *t;
         LIST_HEAD(holdouts);
         long ret;
+
+        // Wait for any lingering IPI handlers to complete. Note that
+        // if a CPU has gone offline or transitioned to userspace in the
+        // meantime, all IPI handlers should have been drained beforehand.
+        // Yes, this assumes that CPUs process IPIs in order. If that ever
+        // changes, there will need to be a recheck and/or timed wait.
+        for_each_online_cpu(cpu)
+                if (smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))
+                        smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
 
         // Remove the safety count.
         smp_mb__before_atomic(); // Order vs. earlier atomics
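
The new drain loop above sends a synchronous no-op IPI (rcu_tasks_trace_empty_fn) to any CPU whose trc_ipi_to_cpu flag is still observed set, relying on the stated assumption that IPIs are processed in order. The smp_load_acquire() presumably pairs with a release-style clearing of that flag in the IPI handler's reset_ipi path, which is outside this hunk, so treat that pairing as an assumption here. A userspace sketch of the same acquire/release idea, with illustrative names:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool ipi_in_flight;       /* models per-CPU trc_ipi_to_cpu */

/* IPI-handler side: publish all per-task updates before clearing the flag. */
static void handler_done(void)
{
        /* ... updates made by trc_read_check_handler() ... */
        atomic_store_explicit(&ipi_in_flight, false, memory_order_release);
}

/* Post-grace-period side: once the flag reads false with acquire
 * ordering, everything the handler did beforehand is visible here.
 * The kernel loop additionally waits out a still-set flag by queueing
 * a synchronous empty IPI behind the earlier one. */
static bool handler_drained(void)
{
        return !atomic_load_explicit(&ipi_in_flight, memory_order_acquire);
}
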
@@ -1115,7 +1144,7 @@
 static void exit_tasks_rcu_finish_trace(struct task_struct *t)
 {
         WRITE_ONCE(t->trc_reader_checked, true);
-        WARN_ON_ONCE(t->trc_reader_nesting);
+        WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
         WRITE_ONCE(t->trc_reader_nesting, 0);
         if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
                 rcu_read_unlock_trace_special(t, 0);