~hc/RK356X_SDK_RELEASE.git

..	..	@@ -1,38 +1,17 @@
	1	+/* SPDX-License-Identifier: GPL-2.0+ */
1	2	/*
2	3	* Read-Copy Update mechanism for mutual exclusion (tree-based version)
3	4	* Internal non-public definitions that provide either classic
4	5	* or preemptible semantics.
5	6	*
6		- * This program is free software; you can redistribute it and/or modify
7		- * it under the terms of the GNU General Public License as published by
8		- * the Free Software Foundation; either version 2 of the License, or
9		- * (at your option) any later version.
10		- *
11		- * This program is distributed in the hope that it will be useful,
12		- * but WITHOUT ANY WARRANTY; without even the implied warranty of
13		- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		- * GNU General Public License for more details.
15		- *
16		- * You should have received a copy of the GNU General Public License
17		- * along with this program; if not, you can access it online at
18		- * http://www.gnu.org/licenses/gpl-2.0.html.
19		- *
20	7	* Copyright Red Hat, 2009
21	8	* Copyright IBM Corporation, 2009
22	9	*
23	10	* Author: Ingo Molnar <mingo@elte.hu>
24		- * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
	11	+ * Paul E. McKenney <paulmck@linux.ibm.com>
25	12	*/
26	13
27	14	#include "../locking/rtmutex_common.h"
28		-
29		-/*
30		- * Control variables for per-CPU and per-rcu_node kthreads. These
31		- * handle all flavors of RCU.
32		- */
33		-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
34		-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
35		-DEFINE_PER_CPU(char, rcu_cpu_has_work);
36	15
37	16	#ifdef CONFIG_RCU_NOCB_CPU
38	17	static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
..	..	@@ -57,6 +36,8 @@
57	36	pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
58	37	if (IS_ENABLED(CONFIG_PROVE_RCU))
59	38	pr_info("\tRCU lockdep checking is enabled.\n");
	39	+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
	40	+ pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n");
60	41	if (RCU_NUM_LVLS >= 4)
61	42	pr_info("\tFour(or more)-level hierarchy is enabled.\n");
62	43	if (RCU_FANOUT_LEAF != 16)
..	..	@@ -77,10 +58,14 @@
77	58	pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
78	59	if (qlowmark != DEFAULT_RCU_QLOMARK)
79	60	pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
	61	+ if (qovld != DEFAULT_RCU_QOVLD)
	62	+ pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld);
80	63	if (jiffies_till_first_fqs != ULONG_MAX)
81	64	pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
82	65	if (jiffies_till_next_fqs != ULONG_MAX)
83	66	pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
	67	+ if (jiffies_till_sched_qs != ULONG_MAX)
	68	+ pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs);
84	69	if (rcu_kick_kthreads)
85	70	pr_info("\tKick kthreads if too-long grace period.\n");
86	71	if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
..	..	@@ -91,6 +76,8 @@
91	76	pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
92	77	if (gp_cleanup_delay)
93	78	pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
	79	+ if (!use_softirq)
	80	+ pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
94	81	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
95	82	pr_info("\tRCU debug extended QS entry/exit.\n");
96	83	rcupdate_announce_bootup_oddness();
..	..	@@ -98,12 +85,7 @@
98	85
99	86	#ifdef CONFIG_PREEMPT_RCU
100	87
101		-RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
102		-static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
103		-static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
104		-
105		-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
106		- bool wake);
	88	+static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake);
107	89	static void rcu_read_unlock_special(struct task_struct *t);
108	90
109	91	/*
..	..	@@ -246,7 +228,7 @@
246	228	WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
247	229	}
248	230	if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
249		- rnp->exp_tasks = &t->rcu_node_entry;
	231	+ WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry);
250	232	WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
251	233	!(rnp->qsmask & rdp->grpmask));
252	234	WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
..	..	@@ -259,13 +241,10 @@
259	241	* no need to check for a subsequent expedited GP. (Though we are
260	242	* still in a quiescent state in any case.)
261	243	*/
262		- if (blkd_state & RCU_EXP_BLKD &&
263		- t->rcu_read_unlock_special.b.exp_need_qs) {
264		- t->rcu_read_unlock_special.b.exp_need_qs = false;
265		- rcu_report_exp_rdp(rdp->rsp, rdp, true);
266		- } else {
267		- WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
268		- }
	244	+ if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
	245	+ rcu_report_exp_rdp(rdp);
	246	+ else
	247	+ WARN_ON_ONCE(rdp->exp_deferred_qs);
269	248	}
270	249
271	250	/*
..	..	@@ -281,16 +260,16 @@
281	260	*
282	261	* Callers to this function must disable preemption.
283	262	*/
284		-static void rcu_preempt_qs(void)
	263	+static void rcu_qs(void)
285	264	{
286		- RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n");
287		- if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
	265	+ RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
	266	+ if (__this_cpu_read(rcu_data.cpu_no_qs.s)) {
288	267	trace_rcu_grace_period(TPS("rcu_preempt"),
289		- __this_cpu_read(rcu_data_p->gp_seq),
	268	+ __this_cpu_read(rcu_data.gp_seq),
290	269	TPS("cpuqs"));
291		- __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false);
292		- barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
293		- current->rcu_read_unlock_special.b.need_qs = false;
	270	+ __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
	271	+ barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
	272	+ WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
294	273	}
295	274	}
296	275
..	..	@@ -307,23 +286,19 @@
307	286	*
308	287	* Caller must disable interrupts.
309	288	*/
310		-static void rcu_preempt_note_context_switch(bool preempt)
	289	+void rcu_note_context_switch(bool preempt)
311	290	{
312	291	struct task_struct *t = current;
313		- struct rcu_data *rdp;
	292	+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
314	293	struct rcu_node *rnp;
315		- int sleeping_l = 0;
316	294
	295	+ trace_rcu_utilization(TPS("Start context switch"));
317	296	lockdep_assert_irqs_disabled();
318		-#if defined(CONFIG_PREEMPT_RT_FULL)
319		- sleeping_l = t->sleeping_lock;
320		-#endif
321		- WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l);
322		- if (t->rcu_read_lock_nesting > 0 &&
	297	+ WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
	298	+ if (rcu_preempt_depth() > 0 &&
323	299	!t->rcu_read_unlock_special.b.blocked) {
324	300
325	301	/* Possibly blocking in an RCU read-side critical section. */
326		- rdp = this_cpu_ptr(rcu_state_p->rda);
327	302	rnp = rdp->mynode;
328	303	raw_spin_lock_rcu_node(rnp);
329	304	t->rcu_read_unlock_special.b.blocked = true;
..	..	@@ -336,20 +311,14 @@
336	311	*/
337	312	WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
338	313	WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
339		- trace_rcu_preempt_task(rdp->rsp->name,
	314	+ trace_rcu_preempt_task(rcu_state.name,
340	315	t->pid,
341	316	(rnp->qsmask & rdp->grpmask)
342	317	? rnp->gp_seq
343	318	: rcu_seq_snap(&rnp->gp_seq));
344	319	rcu_preempt_ctxt_queue(rnp, rdp);
345		- } else if (t->rcu_read_lock_nesting < 0 &&
346		- t->rcu_read_unlock_special.s) {
347		-
348		- /*
349		- * Complete exit from RCU read-side critical section on
350		- * behalf of preempted instance of __rcu_read_unlock().
351		- */
352		- rcu_read_unlock_special(t);
	320	+ } else {
	321	+ rcu_preempt_deferred_qs(t);
353	322	}
354	323
355	324	/*
..	..	@@ -361,8 +330,13 @@
361	330	* grace period, then the fact that the task has been enqueued
362	331	* means that we continue to block the current grace period.
363	332	*/
364		- rcu_preempt_qs();
	333	+ rcu_qs();
	334	+ if (rdp->exp_deferred_qs)
	335	+ rcu_report_exp_rdp(rdp);
	336	+ rcu_tasks_qs(current, preempt);
	337	+ trace_rcu_utilization(TPS("End context switch"));
365	338	}
	339	+EXPORT_SYMBOL_GPL(rcu_note_context_switch);
366	340
367	341	/*
368	342	* Check for preempted RCU readers blocking the current grace period
..	..	@@ -374,6 +348,24 @@
374	348	return READ_ONCE(rnp->gp_tasks) != NULL;
375	349	}
376	350
	351	+/* limit value for ->rcu_read_lock_nesting. */
	352	+#define RCU_NEST_PMAX (INT_MAX / 2)
	353	+
	354	+static void rcu_preempt_read_enter(void)
	355	+{
	356	+ current->rcu_read_lock_nesting++;
	357	+}
	358	+
	359	+static int rcu_preempt_read_exit(void)
	360	+{
	361	+ return --current->rcu_read_lock_nesting;
	362	+}
	363	+
	364	+static void rcu_preempt_depth_set(int val)
	365	+{
	366	+ current->rcu_read_lock_nesting = val;
	367	+}
	368	+
377	369	/*
378	370	* Preemptible RCU implementation for rcu_read_lock().
379	371	* Just increment ->rcu_read_lock_nesting, shared state will be updated
..	..	@@ -381,7 +373,11 @@
381	373	*/
382	374	void __rcu_read_lock(void)
383	375	{
384		- current->rcu_read_lock_nesting++;
	376	+ rcu_preempt_read_enter();
	377	+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
	378	+ WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
	379	+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread)
	380	+ WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true);
385	381	barrier(); /* critical section after entry code. */
386	382	}
387	383	EXPORT_SYMBOL_GPL(__rcu_read_lock);
..	..	@@ -397,24 +393,16 @@
397	393	{
398	394	struct task_struct *t = current;
399	395
400		- if (t->rcu_read_lock_nesting != 1) {
401		- --t->rcu_read_lock_nesting;
402		- } else {
	396	+ if (rcu_preempt_read_exit() == 0) {
403	397	barrier(); /* critical section before exit code. */
404		- t->rcu_read_lock_nesting = INT_MIN;
405		- barrier(); /* assign before ->rcu_read_unlock_special load */
406	398	if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
407	399	rcu_read_unlock_special(t);
408		- barrier(); /* ->rcu_read_unlock_special load before assign */
409		- t->rcu_read_lock_nesting = 0;
410	400	}
411		-#ifdef CONFIG_PROVE_LOCKING
412		- {
413		- int rrln = READ_ONCE(t->rcu_read_lock_nesting);
	401	+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
	402	+ int rrln = rcu_preempt_depth();
414	403
415		- WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
	404	+ WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX);
416	405	}
417		-#endif /* #ifdef CONFIG_PROVE_LOCKING */
418	406	}
419	407	EXPORT_SYMBOL_GPL(__rcu_read_unlock);
420	408
..	..	@@ -443,27 +431,21 @@
443	431	}
444	432
445	433	/*
446		- * Handle special cases during rcu_read_unlock(), such as needing to
447		- * notify RCU core processing or task having blocked during the RCU
448		- * read-side critical section.
	434	+ * Report deferred quiescent states. The deferral time can
	435	+ * be quite short, for example, in the case of the call from
	436	+ * rcu_read_unlock_special().
449	437	*/
450		-static void rcu_read_unlock_special(struct task_struct *t)
	438	+static void
	439	+rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
451	440	{
452	441	bool empty_exp;
453	442	bool empty_norm;
454	443	bool empty_exp_now;
455		- unsigned long flags;
456	444	struct list_head *np;
457	445	bool drop_boost_mutex = false;
458	446	struct rcu_data *rdp;
459	447	struct rcu_node *rnp;
460	448	union rcu_special special;
461		-
462		- /* NMI handlers cannot block and cannot safely manipulate state. */
463		- if (in_nmi())
464		- return;
465		-
466		- local_irq_save(flags);
467	449
468	450	/*
469	451	* If RCU core is waiting for this CPU to exit its critical section,
..	..	@@ -471,49 +453,32 @@
471	453	* t->rcu_read_unlock_special cannot change.
472	454	*/
473	455	special = t->rcu_read_unlock_special;
	456	+ rdp = this_cpu_ptr(&rcu_data);
	457	+ if (!special.s && !rdp->exp_deferred_qs) {
	458	+ local_irq_restore(flags);
	459	+ return;
	460	+ }
	461	+ t->rcu_read_unlock_special.s = 0;
474	462	if (special.b.need_qs) {
475		- rcu_preempt_qs();
476		- t->rcu_read_unlock_special.b.need_qs = false;
477		- if (!t->rcu_read_unlock_special.s) {
478		- local_irq_restore(flags);
479		- return;
	463	+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
	464	+ rcu_report_qs_rdp(rdp);
	465	+ udelay(rcu_unlock_delay);
	466	+ } else {
	467	+ rcu_qs();
480	468	}
481	469	}
482	470
483	471	/*
484		- * Respond to a request for an expedited grace period, but only if
485		- * we were not preempted, meaning that we were running on the same
486		- * CPU throughout. If we were preempted, the exp_need_qs flag
487		- * would have been cleared at the time of the first preemption,
488		- * and the quiescent state would be reported when we were dequeued.
	472	+ * Respond to a request by an expedited grace period for a
	473	+ * quiescent state from this CPU. Note that requests from
	474	+ * tasks are handled when removing the task from the
	475	+ * blocked-tasks list below.
489	476	*/
490		- if (special.b.exp_need_qs) {
491		- WARN_ON_ONCE(special.b.blocked);
492		- t->rcu_read_unlock_special.b.exp_need_qs = false;
493		- rdp = this_cpu_ptr(rcu_state_p->rda);
494		- rcu_report_exp_rdp(rcu_state_p, rdp, true);
495		- if (!t->rcu_read_unlock_special.s) {
496		- local_irq_restore(flags);
497		- return;
498		- }
499		- }
500		-
501		- /* Hardware IRQ handlers cannot block, complain if they get here. */
502		- if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
503		- lockdep_rcu_suspicious(__FILE__, __LINE__,
504		- "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
505		- pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
506		- t->rcu_read_unlock_special.s,
507		- t->rcu_read_unlock_special.b.blocked,
508		- t->rcu_read_unlock_special.b.exp_need_qs,
509		- t->rcu_read_unlock_special.b.need_qs);
510		- local_irq_restore(flags);
511		- return;
512		- }
	477	+ if (rdp->exp_deferred_qs)
	478	+ rcu_report_exp_rdp(rdp);
513	479
514	480	/* Clean up if blocked during RCU read-side critical section. */
515	481	if (special.b.blocked) {
516		- t->rcu_read_unlock_special.b.blocked = false;
517	482
518	483	/*
519	484	* Remove this task from the list it blocked on. The task
..	..	@@ -528,7 +493,7 @@
528	493	empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
529	494	WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
530	495	(!empty_norm || rnp->qsmask));
531		- empty_exp = sync_rcu_preempt_exp_done(rnp);
	496	+ empty_exp = sync_rcu_exp_done(rnp);
532	497	smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
533	498	np = rcu_next_node_entry(t, rnp);
534	499	list_del_init(&t->rcu_node_entry);
..	..	@@ -538,12 +503,12 @@
538	503	if (&t->rcu_node_entry == rnp->gp_tasks)
539	504	WRITE_ONCE(rnp->gp_tasks, np);
540	505	if (&t->rcu_node_entry == rnp->exp_tasks)
541		- rnp->exp_tasks = np;
	506	+ WRITE_ONCE(rnp->exp_tasks, np);
542	507	if (IS_ENABLED(CONFIG_RCU_BOOST)) {
543	508	/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
544	509	drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
545	510	if (&t->rcu_node_entry == rnp->boost_tasks)
546		- rnp->boost_tasks = np;
	511	+ WRITE_ONCE(rnp->boost_tasks, np);
547	512	}
548	513
549	514	/*
..	..	@@ -552,7 +517,7 @@
552	517	* Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
553	518	* so we must take a snapshot of the expedited state.
554	519	*/
555		- empty_exp_now = sync_rcu_preempt_exp_done(rnp);
	520	+ empty_exp_now = sync_rcu_exp_done(rnp);
556	521	if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
557	522	trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
558	523	rnp->gp_seq,
..	..	@@ -561,138 +526,141 @@
561	526	rnp->grplo,
562	527	rnp->grphi,
563	528	!!rnp->gp_tasks);
564		- rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
	529	+ rcu_report_unblock_qs_rnp(rnp, flags);
565	530	} else {
566	531	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
567	532	}
568		-
569		- /* Unboost if we were boosted. */
570		- if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
571		- rt_mutex_futex_unlock(&rnp->boost_mtx);
572	533
573	534	/*
574	535	* If this was the last task on the expedited lists,
575	536	* then we need to report up the rcu_node hierarchy.
576	537	*/
577	538	if (!empty_exp && empty_exp_now)
578		- rcu_report_exp_rnp(rcu_state_p, rnp, true);
	539	+ rcu_report_exp_rnp(rnp, true);
	540	+
	541	+ /* Unboost if we were boosted. */
	542	+ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
	543	+ rt_mutex_futex_unlock(&rnp->boost_mtx);
	544	+
579	545	} else {
580	546	local_irq_restore(flags);
581	547	}
582	548	}
583	549
584	550	/*
585		- * Dump detailed information for all tasks blocking the current RCU
586		- * grace period on the specified rcu_node structure.
	551	+ * Is a deferred quiescent-state pending, and are we also not in
	552	+ * an RCU read-side critical section? It is the caller's responsibility
	553	+ * to ensure it is otherwise safe to report any deferred quiescent
	554	+ * states. The reason for this is that it is safe to report a
	555	+ * quiescent state during context switch even though preemption
	556	+ * is disabled. This function cannot be expected to understand these
	557	+ * nuances, so the caller must handle them.
587	558	*/
588		-static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
	559	+static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
	560	+{
	561	+ return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
	562	+ READ_ONCE(t->rcu_read_unlock_special.s)) &&
	563	+ rcu_preempt_depth() == 0;
	564	+}
	565	+
	566	+/*
	567	+ * Report a deferred quiescent state if needed and safe to do so.
	568	+ * As with rcu_preempt_need_deferred_qs(), "safe" involves only
	569	+ * not being in an RCU read-side critical section. The caller must
	570	+ * evaluate safety in terms of interrupt, softirq, and preemption
	571	+ * disabling.
	572	+ */
	573	+static void rcu_preempt_deferred_qs(struct task_struct *t)
589	574	{
590	575	unsigned long flags;
591		- struct task_struct *t;
592	576
593		- raw_spin_lock_irqsave_rcu_node(rnp, flags);
594		- if (!rcu_preempt_blocked_readers_cgp(rnp)) {
595		- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	577	+ if (!rcu_preempt_need_deferred_qs(t))
	578	+ return;
	579	+ local_irq_save(flags);
	580	+ rcu_preempt_deferred_qs_irqrestore(t, flags);
	581	+}
	582	+
	583	+/*
	584	+ * Minimal handler to give the scheduler a chance to re-evaluate.
	585	+ */
	586	+static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
	587	+{
	588	+ struct rcu_data *rdp;
	589	+
	590	+ rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
	591	+ rdp->defer_qs_iw_pending = false;
	592	+}
	593	+
	594	+/*
	595	+ * Handle special cases during rcu_read_unlock(), such as needing to
	596	+ * notify RCU core processing or task having blocked during the RCU
	597	+ * read-side critical section.
	598	+ */
	599	+static void rcu_read_unlock_special(struct task_struct *t)
	600	+{
	601	+ unsigned long flags;
	602	+ bool preempt_bh_were_disabled =
	603	+ !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
	604	+ bool irqs_were_disabled;
	605	+
	606	+ /* NMI handlers cannot block and cannot safely manipulate state. */
	607	+ if (in_nmi())
	608	+ return;
	609	+
	610	+ local_irq_save(flags);
	611	+ irqs_were_disabled = irqs_disabled_flags(flags);
	612	+ if (preempt_bh_were_disabled || irqs_were_disabled) {
	613	+ bool exp;
	614	+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	615	+ struct rcu_node *rnp = rdp->mynode;
	616	+
	617	+ exp = (t->rcu_blocked_node &&
	618	+ READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
	619	+ (rdp->grpmask & READ_ONCE(rnp->expmask));
	620	+ // Need to defer quiescent state until everything is enabled.
	621	+ if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) {
	622	+ // Using softirq, safe to awaken, and either the
	623	+ // wakeup is free or there is an expedited GP.
	624	+ raise_softirq_irqoff(RCU_SOFTIRQ);
	625	+ } else {
	626	+ // Enabling BH or preempt does reschedule, so...
	627	+ // Also if no expediting, slow is OK.
	628	+ // Plus nohz_full CPUs eventually get tick enabled.
	629	+ set_tsk_need_resched(current);
	630	+ set_preempt_need_resched();
	631	+ if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
	632	+ !rdp->defer_qs_iw_pending && exp && cpu_online(rdp->cpu)) {
	633	+ // Get scheduler to re-evaluate and call hooks.
	634	+ // If !IRQ_WORK, FQS scan will eventually IPI.
	635	+ init_irq_work(&rdp->defer_qs_iw,
	636	+ rcu_preempt_deferred_qs_handler);
	637	+ rdp->defer_qs_iw_pending = true;
	638	+ irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
	639	+ }
	640	+ }
	641	+ local_irq_restore(flags);
596	642	return;
597	643	}
598		- t = list_entry(rnp->gp_tasks->prev,
599		- struct task_struct, rcu_node_entry);
600		- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
601		- /*
602		- * We could be printing a lot while holding a spinlock.
603		- * Avoid triggering hard lockup.
604		- */
605		- touch_nmi_watchdog();
606		- sched_show_task(t);
607		- }
608		- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
609		-}
610		-
611		-/*
612		- * Dump detailed information for all tasks blocking the current RCU
613		- * grace period.
614		- */
615		-static void rcu_print_detail_task_stall(struct rcu_state *rsp)
616		-{
617		- struct rcu_node *rnp = rcu_get_root(rsp);
618		-
619		- rcu_print_detail_task_stall_rnp(rnp);
620		- rcu_for_each_leaf_node(rsp, rnp)
621		- rcu_print_detail_task_stall_rnp(rnp);
622		-}
623		-
624		-static void rcu_print_task_stall_begin(struct rcu_node *rnp)
625		-{
626		- pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
627		- rnp->level, rnp->grplo, rnp->grphi);
628		-}
629		-
630		-static void rcu_print_task_stall_end(void)
631		-{
632		- pr_cont("\n");
633		-}
634		-
635		-/*
636		- * Scan the current list of tasks blocked within RCU read-side critical
637		- * sections, printing out the tid of each.
638		- */
639		-static int rcu_print_task_stall(struct rcu_node *rnp)
640		-{
641		- struct task_struct *t;
642		- int ndetected = 0;
643		-
644		- if (!rcu_preempt_blocked_readers_cgp(rnp))
645		- return 0;
646		- rcu_print_task_stall_begin(rnp);
647		- t = list_entry(rnp->gp_tasks->prev,
648		- struct task_struct, rcu_node_entry);
649		- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
650		- pr_cont(" P%d", t->pid);
651		- ndetected++;
652		- }
653		- rcu_print_task_stall_end();
654		- return ndetected;
655		-}
656		-
657		-/*
658		- * Scan the current list of tasks blocked within RCU read-side critical
659		- * sections, printing out the tid of each that is blocking the current
660		- * expedited grace period.
661		- */
662		-static int rcu_print_task_exp_stall(struct rcu_node *rnp)
663		-{
664		- struct task_struct *t;
665		- int ndetected = 0;
666		-
667		- if (!rnp->exp_tasks)
668		- return 0;
669		- t = list_entry(rnp->exp_tasks->prev,
670		- struct task_struct, rcu_node_entry);
671		- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
672		- pr_cont(" P%d", t->pid);
673		- ndetected++;
674		- }
675		- return ndetected;
	644	+ rcu_preempt_deferred_qs_irqrestore(t, flags);
676	645	}
677	646
678	647	/*
679	648	* Check that the list of blocked tasks for the newly completed grace
680	649	* period is in fact empty. It is a serious bug to complete a grace
681	650	* period that still has RCU readers blocked! This function must be
682		- * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock
683		- * must be held by the caller.
	651	+ * invoked -before- updating this rnp's ->gp_seq.
684	652	*
685	653	* Also, if there are blocked tasks on the list, they automatically
686	654	* block the newly created grace period, so set up ->gp_tasks accordingly.
687	655	*/
688		-static void
689		-rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp)
	656	+static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
690	657	{
691	658	struct task_struct *t;
692	659
693	660	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
	661	+ raw_lockdep_assert_held_rcu_node(rnp);
694	662	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
695		- dump_blkd_tasks(rsp, rnp, 10);
	663	+ dump_blkd_tasks(rnp, 10);
696	664	if (rcu_preempt_has_tasks(rnp) &&
697	665	(rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
698	666	WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
..	..	@@ -705,139 +673,67 @@
705	673	}
706	674
707	675	/*
708		- * Check for a quiescent state from the current CPU. When a task blocks,
709		- * the task is recorded in the corresponding CPU's rcu_node structure,
710		- * which is checked elsewhere.
711		- *
712		- * Caller must disable hard irqs.
	676	+ * Check for a quiescent state from the current CPU, including voluntary
	677	+ * context switches for Tasks RCU. When a task blocks, the task is
	678	+ * recorded in the corresponding CPU's rcu_node structure, which is checked
	679	+ * elsewhere, hence this function need only check for quiescent states
	680	+ * related to the current CPU, not to those related to tasks.
713	681	*/
714		-static void rcu_preempt_check_callbacks(void)
	682	+static void rcu_flavor_sched_clock_irq(int user)
715	683	{
716		- struct rcu_state *rsp = &rcu_preempt_state;
717	684	struct task_struct *t = current;
718	685
719		- if (t->rcu_read_lock_nesting == 0) {
720		- rcu_preempt_qs();
	686	+ lockdep_assert_irqs_disabled();
	687	+ if (user || rcu_is_cpu_rrupt_from_idle()) {
	688	+ rcu_note_voluntary_context_switch(current);
	689	+ }
	690	+ if (rcu_preempt_depth() > 0 ||
	691	+ (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
	692	+ /* No QS, force context switch if deferred. */
	693	+ if (rcu_preempt_need_deferred_qs(t)) {
	694	+ set_tsk_need_resched(t);
	695	+ set_preempt_need_resched();
	696	+ }
	697	+ } else if (rcu_preempt_need_deferred_qs(t)) {
	698	+ rcu_preempt_deferred_qs(t); /* Report deferred QS. */
	699	+ return;
	700	+ } else if (!WARN_ON_ONCE(rcu_preempt_depth())) {
	701	+ rcu_qs(); /* Report immediate QS. */
721	702	return;
722	703	}
723		- if (t->rcu_read_lock_nesting > 0 &&
724		- __this_cpu_read(rcu_data_p->core_needs_qs) &&
725		- __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm) &&
	704	+
	705	+ /* If GP is oldish, ask for help from rcu_read_unlock_special(). */
	706	+ if (rcu_preempt_depth() > 0 &&
	707	+ __this_cpu_read(rcu_data.core_needs_qs) &&
	708	+ __this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
726	709	!t->rcu_read_unlock_special.b.need_qs &&
727		- time_after(jiffies, rsp->gp_start + HZ))
	710	+ time_after(jiffies, rcu_state.gp_start + HZ))
728	711	t->rcu_read_unlock_special.b.need_qs = true;
729		-}
730		-
731		-/**
732		- * call_rcu() - Queue an RCU callback for invocation after a grace period.
733		- * @head: structure to be used for queueing the RCU updates.
734		- * @func: actual callback function to be invoked after the grace period
735		- *
736		- * The callback function will be invoked some time after a full grace
737		- * period elapses, in other words after all pre-existing RCU read-side
738		- * critical sections have completed. However, the callback function
739		- * might well execute concurrently with RCU read-side critical sections
740		- * that started after call_rcu() was invoked. RCU read-side critical
741		- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
742		- * and may be nested.
743		- *
744		- * Note that all CPUs must agree that the grace period extended beyond
745		- * all pre-existing RCU read-side critical section. On systems with more
746		- * than one CPU, this means that when "func()" is invoked, each CPU is
747		- * guaranteed to have executed a full memory barrier since the end of its
748		- * last RCU read-side critical section whose beginning preceded the call
749		- * to call_rcu(). It also means that each CPU executing an RCU read-side
750		- * critical section that continues beyond the start of "func()" must have
751		- * executed a memory barrier after the call_rcu() but before the beginning
752		- * of that RCU read-side critical section. Note that these guarantees
753		- * include CPUs that are offline, idle, or executing in user mode, as
754		- * well as CPUs that are executing in the kernel.
755		- *
756		- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
757		- * resulting RCU callback function "func()", then both CPU A and CPU B are
758		- * guaranteed to execute a full memory barrier during the time interval
759		- * between the call to call_rcu() and the invocation of "func()" -- even
760		- * if CPU A and CPU B are the same CPU (but again only if the system has
761		- * more than one CPU).
762		- */
763		-void call_rcu(struct rcu_head *head, rcu_callback_t func)
764		-{
765		- __call_rcu(head, func, rcu_state_p, -1, 0);
766		-}
767		-EXPORT_SYMBOL_GPL(call_rcu);
768		-
769		-/**
770		- * synchronize_rcu - wait until a grace period has elapsed.
771		- *
772		- * Control will return to the caller some time after a full grace
773		- * period has elapsed, in other words after all currently executing RCU
774		- * read-side critical sections have completed. Note, however, that
775		- * upon return from synchronize_rcu(), the caller might well be executing
776		- * concurrently with new RCU read-side critical sections that began while
777		- * synchronize_rcu() was waiting. RCU read-side critical sections are
778		- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
779		- *
780		- * See the description of synchronize_sched() for more detailed
781		- * information on memory-ordering guarantees. However, please note
782		- * that -only- the memory-ordering guarantees apply. For example,
783		- * synchronize_rcu() is -not- guaranteed to wait on things like code
784		- * protected by preempt_disable(), instead, synchronize_rcu() is -only-
785		- * guaranteed to wait on RCU read-side critical sections, that is, sections
786		- * of code protected by rcu_read_lock().
787		- */
788		-void synchronize_rcu(void)
789		-{
790		- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
791		- lock_is_held(&rcu_lock_map) ||
792		- lock_is_held(&rcu_sched_lock_map),
793		- "Illegal synchronize_rcu() in RCU read-side critical section");
794		- if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
795		- return;
796		- if (rcu_gp_is_expedited())
797		- synchronize_rcu_expedited();
798		- else
799		- wait_rcu_gp(call_rcu);
800		-}
801		-EXPORT_SYMBOL_GPL(synchronize_rcu);
802		-
803		-/**
804		- * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
805		- *
806		- * Note that this primitive does not necessarily wait for an RCU grace period
807		- * to complete. For example, if there are no RCU callbacks queued anywhere
808		- * in the system, then rcu_barrier() is within its rights to return
809		- * immediately, without waiting for anything, much less an RCU grace period.
810		- */
811		-void rcu_barrier(void)
812		-{
813		- _rcu_barrier(rcu_state_p);
814		-}
815		-EXPORT_SYMBOL_GPL(rcu_barrier);
816		-
817		-/*
818		- * Initialize preemptible RCU's state structures.
819		- */
820		-static void __init __rcu_init_preempt(void)
821		-{
822		- rcu_init_one(rcu_state_p);
823	712	}
824	713
825	714	/*
826	715	* Check for a task exiting while in a preemptible-RCU read-side
827		- * critical section, clean up if so. No need to issue warnings,
828		- * as debug_check_no_locks_held() already does this if lockdep
829		- * is enabled.
	716	+ * critical section, clean up if so. No need to issue warnings, as
	717	+ * debug_check_no_locks_held() already does this if lockdep is enabled.
	718	+ * Besides, if this function does anything other than just immediately
	719	+ * return, there was a bug of some sort. Spewing warnings from this
	720	+ * function is like as not to simply obscure important prior warnings.
830	721	*/
831	722	void exit_rcu(void)
832	723	{
833	724	struct task_struct *t = current;
834	725
835		- if (likely(list_empty(&current->rcu_node_entry)))
	726	+ if (unlikely(!list_empty(&current->rcu_node_entry))) {
	727	+ rcu_preempt_depth_set(1);
	728	+ barrier();
	729	+ WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
	730	+ } else if (unlikely(rcu_preempt_depth())) {
	731	+ rcu_preempt_depth_set(1);
	732	+ } else {
836	733	return;
837		- t->rcu_read_lock_nesting = 1;
838		- barrier();
839		- t->rcu_read_unlock_special.b.blocked = true;
	734	+ }
840	735	__rcu_read_unlock();
	736	+ rcu_preempt_deferred_qs(current);
841	737	}
842	738
843	739	/*
..	..	@@ -845,7 +741,7 @@
845	741	* specified number of elements.
846	742	*/
847	743	static void
848		-dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck)
	744	+dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
849	745	{
850	746	int cpu;
851	747	int i;
..	..	@@ -857,23 +753,23 @@
857	753	raw_lockdep_assert_held_rcu_node(rnp);
858	754	pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
859	755	__func__, rnp->grplo, rnp->grphi, rnp->level,
860		- (long)rnp->gp_seq, (long)rnp->completedqs);
	756	+ (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs);
861	757	for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
862	758	pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
863	759	__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
864	760	pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
865		- __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks,
866		- rnp->exp_tasks);
	761	+ __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks),
	762	+ READ_ONCE(rnp->exp_tasks));
867	763	pr_info("%s: ->blkd_tasks", __func__);
868	764	i = 0;
869	765	list_for_each(lhp, &rnp->blkd_tasks) {
870	766	pr_cont(" %p", lhp);
871		- if (++i >= 10)
	767	+ if (++i >= ncheck)
872	768	break;
873	769	}
874	770	pr_cont("\n");
875	771	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
876		- rdp = per_cpu_ptr(rsp->rda, cpu);
	772	+ rdp = per_cpu_ptr(&rcu_data, cpu);
877	773	onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
878	774	pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
879	775	cpu, ".o"[onl],
..	..	@@ -884,7 +780,23 @@
884	780
885	781	#else /* #ifdef CONFIG_PREEMPT_RCU */
886	782
887		-static struct rcu_state *const rcu_state_p = &rcu_sched_state;
	783	+/*
	784	+ * If strict grace periods are enabled, and if the calling
	785	+ * __rcu_read_unlock() marks the beginning of a quiescent state, immediately
	786	+ * report that quiescent state and, if requested, spin for a bit.
	787	+ */
	788	+void rcu_read_unlock_strict(void)
	789	+{
	790	+ struct rcu_data *rdp;
	791	+
	792	+ if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
	793	+ irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
	794	+ return;
	795	+ rdp = this_cpu_ptr(&rcu_data);
	796	+ rcu_report_qs_rdp(rdp);
	797	+ udelay(rcu_unlock_delay);
	798	+}
	799	+EXPORT_SYMBOL_GPL(rcu_read_unlock_strict);
888	800
889	801	/*
890	802	* Tell them what RCU they are running.
..	..	@@ -896,12 +808,73 @@
896	808	}
897	809
898	810	/*
899		- * Because preemptible RCU does not exist, we never have to check for
900		- * CPUs being in quiescent states.
	811	+ * Note a quiescent state for PREEMPTION=n. Because we do not need to know
	812	+ * how many quiescent states passed, just if there was at least one since
	813	+ * the start of the grace period, this just sets a flag. The caller must
	814	+ * have disabled preemption.
901	815	*/
902		-static void rcu_preempt_note_context_switch(bool preempt)
	816	+static void rcu_qs(void)
903	817	{
	818	+ RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!");
	819	+ if (!__this_cpu_read(rcu_data.cpu_no_qs.s))
	820	+ return;
	821	+ trace_rcu_grace_period(TPS("rcu_sched"),
	822	+ __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
	823	+ __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
	824	+ if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
	825	+ return;
	826	+ __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false);
	827	+ rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
904	828	}
	829	+
	830	+/*
	831	+ * Register an urgently needed quiescent state. If there is an
	832	+ * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
	833	+ * dyntick-idle quiescent state visible to other CPUs, which will in
	834	+ * some cases serve for expedited as well as normal grace periods.
	835	+ * Either way, register a lightweight quiescent state.
	836	+ */
	837	+void rcu_all_qs(void)
	838	+{
	839	+ unsigned long flags;
	840	+
	841	+ if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
	842	+ return;
	843	+ preempt_disable();
	844	+ /* Load rcu_urgent_qs before other flags. */
	845	+ if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
	846	+ preempt_enable();
	847	+ return;
	848	+ }
	849	+ this_cpu_write(rcu_data.rcu_urgent_qs, false);
	850	+ if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
	851	+ local_irq_save(flags);
	852	+ rcu_momentary_dyntick_idle();
	853	+ local_irq_restore(flags);
	854	+ }
	855	+ rcu_qs();
	856	+ preempt_enable();
	857	+}
	858	+EXPORT_SYMBOL_GPL(rcu_all_qs);
	859	+
	860	+/*
	861	+ * Note a PREEMPTION=n context switch. The caller must have disabled interrupts.
	862	+ */
	863	+void rcu_note_context_switch(bool preempt)
	864	+{
	865	+ trace_rcu_utilization(TPS("Start context switch"));
	866	+ rcu_qs();
	867	+ /* Load rcu_urgent_qs before other flags. */
	868	+ if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs)))
	869	+ goto out;
	870	+ this_cpu_write(rcu_data.rcu_urgent_qs, false);
	871	+ if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
	872	+ rcu_momentary_dyntick_idle();
	873	+ rcu_tasks_qs(current, preempt);
	874	+out:
	875	+ trace_rcu_utilization(TPS("End context switch"));
	876	+}
	877	+EXPORT_SYMBOL_GPL(rcu_note_context_switch);
905	878
906	879	/*
907	880	* Because preemptible RCU does not exist, there are never any preempted
..	..	@@ -921,66 +894,47 @@
921	894	}
922	895
923	896	/*
924		- * Because preemptible RCU does not exist, we never have to check for
925		- * tasks blocked within RCU read-side critical sections.
	897	+ * Because there is no preemptible RCU, there can be no deferred quiescent
	898	+ * states.
926	899	*/
927		-static void rcu_print_detail_task_stall(struct rcu_state *rsp)
	900	+static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
928	901	{
	902	+ return false;
929	903	}
930		-
931		-/*
932		- * Because preemptible RCU does not exist, we never have to check for
933		- * tasks blocked within RCU read-side critical sections.
934		- */
935		-static int rcu_print_task_stall(struct rcu_node *rnp)
936		-{
937		- return 0;
938		-}
939		-
940		-/*
941		- * Because preemptible RCU does not exist, we never have to check for
942		- * tasks blocked within RCU read-side critical sections that are
943		- * blocking the current expedited grace period.
944		- */
945		-static int rcu_print_task_exp_stall(struct rcu_node *rnp)
946		-{
947		- return 0;
948		-}
	904	+static void rcu_preempt_deferred_qs(struct task_struct *t) { }
949	905
950	906	/*
951	907	* Because there is no preemptible RCU, there can be no readers blocked,
952	908	* so there is no need to check for blocked tasks. So check only for
953	909	* bogus qsmask values.
954	910	*/
955		-static void
956		-rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp)
	911	+static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
957	912	{
958	913	WARN_ON_ONCE(rnp->qsmask);
959	914	}
960	915
961	916	/*
962		- * Because preemptible RCU does not exist, it never has any callbacks
963		- * to check.
	917	+ * Check to see if this CPU is in a non-context-switch quiescent state,
	918	+ * namely user mode and idle loop.
964	919	*/
965		-static void rcu_preempt_check_callbacks(void)
	920	+static void rcu_flavor_sched_clock_irq(int user)
966	921	{
967		-}
	922	+ if (user || rcu_is_cpu_rrupt_from_idle()) {
968	923
969		-/*
970		- * Because preemptible RCU does not exist, rcu_barrier() is just
971		- * another name for rcu_barrier_sched().
972		- */
973		-void rcu_barrier(void)
974		-{
975		- rcu_barrier_sched();
976		-}
977		-EXPORT_SYMBOL_GPL(rcu_barrier);
	924	+ /*
	925	+ * Get here if this CPU took its interrupt from user
	926	+ * mode or from the idle loop, and if this is not a
	927	+ * nested interrupt. In this case, the CPU is in
	928	+ * a quiescent state, so note it.
	929	+ *
	930	+ * No memory barrier is required here because rcu_qs()
	931	+ * references only CPU-local variables that other CPUs
	932	+ * neither access nor modify, at least not while the
	933	+ * corresponding CPU is online.
	934	+ */
978	935
979		-/*
980		- * Because preemptible RCU does not exist, it need not be initialized.
981		- */
982		-static void __init __rcu_init_preempt(void)
983		-{
	936	+ rcu_qs();
	937	+ }
984	938	}
985	939
986	940	/*
..	..	@@ -995,7 +949,7 @@
995	949	* Dump the guaranteed-empty blocked-tasks state. Trust but verify.
996	950	*/
997	951	static void
998		-dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck)
	952	+dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
999	953	{
1000	954	WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
1001	955	}
..	..	@@ -1095,20 +1049,21 @@
1095	1049
1096	1050	trace_rcu_utilization(TPS("Start boost kthread@init"));
1097	1051	for (;;) {
1098		- rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
	1052	+ WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING);
1099	1053	trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
1100		- rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
	1054	+ rcu_wait(READ_ONCE(rnp->boost_tasks) ||
	1055	+ READ_ONCE(rnp->exp_tasks));
1101	1056	trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
1102		- rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
	1057	+ WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING);
1103	1058	more2boost = rcu_boost(rnp);
1104	1059	if (more2boost)
1105	1060	spincnt++;
1106	1061	else
1107	1062	spincnt = 0;
1108	1063	if (spincnt > 10) {
1109		- rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
	1064	+ WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
1110	1065	trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
1111		- schedule_timeout_interruptible(2);
	1066	+ schedule_timeout_idle(2);
1112	1067	trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
1113	1068	spincnt = 0;
1114	1069	}
..	..	@@ -1131,8 +1086,6 @@
1131	1086	static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1132	1087	__releases(rnp->lock)
1133	1088	{
1134		- struct task_struct *t;
1135		-
1136	1089	raw_lockdep_assert_held_rcu_node(rnp);
1137	1090	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1138	1091	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
..	..	@@ -1142,13 +1095,12 @@
1142	1095	(rnp->gp_tasks != NULL &&
1143	1096	rnp->boost_tasks == NULL &&
1144	1097	rnp->qsmask == 0 &&
1145		- ULONG_CMP_GE(jiffies, rnp->boost_time))) {
	1098	+ (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) {
1146	1099	if (rnp->exp_tasks == NULL)
1147		- rnp->boost_tasks = rnp->gp_tasks;
	1100	+ WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
1148	1101	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1149		- t = rnp->boost_kthread_task;
1150		- if (t)
1151		- rcu_wake_cond(t, rnp->boost_kthread_status);
	1102	+ rcu_wake_cond(rnp->boost_kthread_task,
	1103	+ READ_ONCE(rnp->boost_kthread_status));
1152	1104	} else {
1153	1105	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1154	1106	}
..	..	@@ -1160,7 +1112,7 @@
1160	1112	*/
1161	1113	static bool rcu_is_callbacks_kthread(void)
1162	1114	{
1163		- return __this_cpu_read(rcu_cpu_kthread_task) == current;
	1115	+ return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
1164	1116	}
1165	1117
1166	1118	#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
..	..	@@ -1178,34 +1130,35 @@
1178	1130	* already exist. We only create this kthread for preemptible RCU.
1179	1131	* Returns zero if all is well, a negated errno otherwise.
1180	1132	*/
1181		-static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1182		- struct rcu_node *rnp)
	1133	+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
1183	1134	{
1184		- int rnp_index = rnp - &rsp->node[0];
	1135	+ int rnp_index = rnp - rcu_get_root();
1185	1136	unsigned long flags;
1186	1137	struct sched_param sp;
1187	1138	struct task_struct *t;
1188	1139
1189		- if (rcu_state_p != rsp)
1190		- return 0;
	1140	+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
	1141	+ return;
1191	1142
1192	1143	if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
1193		- return 0;
	1144	+ return;
1194	1145
1195		- rsp->boost = 1;
	1146	+ rcu_state.boost = 1;
	1147	+
1196	1148	if (rnp->boost_kthread_task != NULL)
1197		- return 0;
	1149	+ return;
	1150	+
1198	1151	t = kthread_create(rcu_boost_kthread, (void *)rnp,
1199	1152	"rcub/%d", rnp_index);
1200		- if (IS_ERR(t))
1201		- return PTR_ERR(t);
	1153	+ if (WARN_ON_ONCE(IS_ERR(t)))
	1154	+ return;
	1155	+
1202	1156	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1203	1157	rnp->boost_kthread_task = t;
1204	1158	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1205	1159	sp.sched_priority = kthread_prio;
1206	1160	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1207	1161	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1208		- return 0;
1209	1162	}
1210	1163
1211	1164	/*
..	..	@@ -1244,18 +1197,19 @@
1244	1197	static void __init rcu_spawn_boost_kthreads(void)
1245	1198	{
1246	1199	struct rcu_node *rnp;
1247		- rcu_for_each_leaf_node(rcu_state_p, rnp)
1248		- (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
	1200	+
	1201	+ rcu_for_each_leaf_node(rnp)
	1202	+ rcu_spawn_one_boost_kthread(rnp);
1249	1203	}
1250	1204
1251	1205	static void rcu_prepare_kthreads(int cpu)
1252	1206	{
1253		- struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
	1207	+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
1254	1208	struct rcu_node *rnp = rdp->mynode;
1255	1209
1256	1210	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1257	1211	if (rcu_scheduler_fully_active)
1258		- (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
	1212	+ rcu_spawn_one_boost_kthread(rnp);
1259	1213	}
1260	1214
1261	1215	#else /* #ifdef CONFIG_RCU_BOOST */
..	..	@@ -1289,25 +1243,24 @@
1289	1243
1290	1244	#endif /* #else #ifdef CONFIG_RCU_BOOST */
1291	1245
1292		-#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
	1246	+#if !defined(CONFIG_RCU_FAST_NO_HZ)
1293	1247
1294	1248	/*
1295		- * Check to see if any future RCU-related work will need to be done
1296		- * by the current CPU, even if none need be done immediately, returning
1297		- * 1 if so. This function is part of the RCU implementation; it is -not-
1298		- * an exported member of the RCU API.
	1249	+ * Check to see if any future non-offloaded RCU-related work will need
	1250	+ * to be done by the current CPU, even if none need be done immediately,
	1251	+ * returning 1 if so. This function is part of the RCU implementation;
	1252	+ * it is -not- an exported member of the RCU API.
1299	1253	*
1300		- * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
1301		- * any flavor of RCU.
	1254	+ * Because we not have RCU_FAST_NO_HZ, just check whether or not this
	1255	+ * CPU has RCU callbacks queued.
1302	1256	*/
1303	1257	int rcu_needs_cpu(u64 basemono, u64 *nextevt)
1304	1258	{
1305	1259	*nextevt = KTIME_MAX;
1306		- return rcu_cpu_has_callbacks(NULL);
	1260	+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
	1261	+ !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist);
1307	1262	}
1308		-#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
1309	1263
1310		-#if !defined(CONFIG_RCU_FAST_NO_HZ)
1311	1264	/*
1312	1265	* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1313	1266	* after it.
..	..	@@ -1324,23 +1277,14 @@
1324	1277	{
1325	1278	}
1326	1279
1327		-/*
1328		- * Don't bother keeping a running count of the number of RCU callbacks
1329		- * posted because CONFIG_RCU_FAST_NO_HZ=n.
1330		- */
1331		-static void rcu_idle_count_callbacks_posted(void)
1332		-{
1333		-}
1334		-
1335	1280	#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1336	1281
1337	1282	/*
1338	1283	* This code is invoked when a CPU goes idle, at which point we want
1339	1284	* to have the CPU do everything required for RCU so that it can enter
1340		- * the energy-efficient dyntick-idle mode. This is handled by a
1341		- * state machine implemented by rcu_prepare_for_idle() below.
	1285	+ * the energy-efficient dyntick-idle mode.
1342	1286	*
1343		- * The following three proprocessor symbols control this state machine:
	1287	+ * The following preprocessor symbol controls this:
1344	1288	*
1345	1289	* RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1346	1290	* to sleep in dyntick-idle mode with RCU callbacks pending. This
..	..	@@ -1349,83 +1293,67 @@
1349	1293	* number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
1350	1294	* system. And if you are -that- concerned about energy efficiency,
1351	1295	* just power the system down and be done with it!
1352		- * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
1353		- * permitted to sleep in dyntick-idle mode with only lazy RCU
1354		- * callbacks pending. Setting this too high can OOM your system.
1355	1296	*
1356		- * The values below work well in practice. If future workloads require
	1297	+ * The value below works well in practice. If future workloads require
1357	1298	* adjustment, they can be converted into kernel config parameters, though
1358	1299	* making the state machine smarter might be a better option.
1359	1300	*/
1360	1301	#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1361		-#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1362	1302
1363	1303	static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
1364	1304	module_param(rcu_idle_gp_delay, int, 0644);
1365		-static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1366		-module_param(rcu_idle_lazy_gp_delay, int, 0644);
1367	1305
1368	1306	/*
1369		- * Try to advance callbacks for all flavors of RCU on the current CPU, but
1370		- * only if it has been awhile since the last time we did so. Afterwards,
1371		- * if there are any callbacks ready for immediate invocation, return true.
	1307	+ * Try to advance callbacks on the current CPU, but only if it has been
	1308	+ * awhile since the last time we did so. Afterwards, if there are any
	1309	+ * callbacks ready for immediate invocation, return true.
1372	1310	*/
1373	1311	static bool __maybe_unused rcu_try_advance_all_cbs(void)
1374	1312	{
1375	1313	bool cbs_ready = false;
1376		- struct rcu_data *rdp;
1377		- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
	1314	+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1378	1315	struct rcu_node *rnp;
1379		- struct rcu_state *rsp;
1380	1316
1381	1317	/* Exit early if we advanced recently. */
1382		- if (jiffies == rdtp->last_advance_all)
	1318	+ if (jiffies == rdp->last_advance_all)
1383	1319	return false;
1384		- rdtp->last_advance_all = jiffies;
	1320	+ rdp->last_advance_all = jiffies;
1385	1321
1386		- for_each_rcu_flavor(rsp) {
1387		- rdp = this_cpu_ptr(rsp->rda);
1388		- rnp = rdp->mynode;
	1322	+ rnp = rdp->mynode;
1389	1323
1390		- /*
1391		- * Don't bother checking unless a grace period has
1392		- * completed since we last checked and there are
1393		- * callbacks not yet ready to invoke.
1394		- */
1395		- if ((rcu_seq_completed_gp(rdp->gp_seq,
1396		- rcu_seq_current(&rnp->gp_seq)) ||
1397		- unlikely(READ_ONCE(rdp->gpwrap))) &&
1398		- rcu_segcblist_pend_cbs(&rdp->cblist))
1399		- note_gp_changes(rsp, rdp);
	1324	+ /*
	1325	+ * Don't bother checking unless a grace period has
	1326	+ * completed since we last checked and there are
	1327	+ * callbacks not yet ready to invoke.
	1328	+ */
	1329	+ if ((rcu_seq_completed_gp(rdp->gp_seq,
	1330	+ rcu_seq_current(&rnp->gp_seq)) ||
	1331	+ unlikely(READ_ONCE(rdp->gpwrap))) &&
	1332	+ rcu_segcblist_pend_cbs(&rdp->cblist))
	1333	+ note_gp_changes(rdp);
1400	1334
1401		- if (rcu_segcblist_ready_cbs(&rdp->cblist))
1402		- cbs_ready = true;
1403		- }
	1335	+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
	1336	+ cbs_ready = true;
1404	1337	return cbs_ready;
1405	1338	}
1406		-
1407		-#ifndef CONFIG_PREEMPT_RT_FULL
1408	1339
1409	1340	/*
1410	1341	* Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
1411	1342	* to invoke. If the CPU has callbacks, try to advance them. Tell the
1412		- * caller to set the timeout based on whether or not there are non-lazy
1413		- * callbacks.
	1343	+ * caller about what to set the timeout.
1414	1344	*
1415	1345	* The caller must have disabled interrupts.
1416	1346	*/
1417	1347	int rcu_needs_cpu(u64 basemono, u64 *nextevt)
1418	1348	{
1419		- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
	1349	+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1420	1350	unsigned long dj;
1421	1351
1422	1352	lockdep_assert_irqs_disabled();
1423	1353
1424		- /* Snapshot to detect later posting of non-lazy callback. */
1425		- rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1426		-
1427		- /* If no callbacks, RCU doesn't need the CPU. */
1428		- if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
	1354	+ /* If no non-offloaded callbacks, RCU doesn't need the CPU. */
	1355	+ if (rcu_segcblist_empty(&rdp->cblist) ||
	1356	+ rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) {
1429	1357	*nextevt = KTIME_MAX;
1430	1358	return 0;
1431	1359	}
..	..	@@ -1436,84 +1364,59 @@
1436	1364	invoke_rcu_core();
1437	1365	return 1;
1438	1366	}
1439		- rdtp->last_accelerate = jiffies;
	1367	+ rdp->last_accelerate = jiffies;
1440	1368
1441		- /* Request timer delay depending on laziness, and round. */
1442		- if (!rdtp->all_lazy) {
1443		- dj = round_up(rcu_idle_gp_delay + jiffies,
1444		- rcu_idle_gp_delay) - jiffies;
1445		- } else {
1446		- dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
1447		- }
	1369	+ /* Request timer and round. */
	1370	+ dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
	1371	+
1448	1372	*nextevt = basemono + dj * TICK_NSEC;
1449	1373	return 0;
1450	1374	}
1451		-#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
1452	1375
1453	1376	/*
1454		- * Prepare a CPU for idle from an RCU perspective. The first major task
1455		- * is to sense whether nohz mode has been enabled or disabled via sysfs.
1456		- * The second major task is to check to see if a non-lazy callback has
1457		- * arrived at a CPU that previously had only lazy callbacks. The third
1458		- * major task is to accelerate (that is, assign grace-period numbers to)
1459		- * any recently arrived callbacks.
	1377	+ * Prepare a CPU for idle from an RCU perspective. The first major task is to
	1378	+ * sense whether nohz mode has been enabled or disabled via sysfs. The second
	1379	+ * major task is to accelerate (that is, assign grace-period numbers to) any
	1380	+ * recently arrived callbacks.
1460	1381	*
1461	1382	* The caller must have disabled interrupts.
1462	1383	*/
1463	1384	static void rcu_prepare_for_idle(void)
1464	1385	{
1465	1386	bool needwake;
1466		- struct rcu_data *rdp;
1467		- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
	1387	+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1468	1388	struct rcu_node *rnp;
1469		- struct rcu_state *rsp;
1470	1389	int tne;
1471	1390
1472	1391	lockdep_assert_irqs_disabled();
1473		- if (rcu_is_nocb_cpu(smp_processor_id()))
	1392	+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
1474	1393	return;
1475	1394
1476	1395	/* Handle nohz enablement switches conservatively. */
1477	1396	tne = READ_ONCE(tick_nohz_active);
1478		- if (tne != rdtp->tick_nohz_enabled_snap) {
1479		- if (rcu_cpu_has_callbacks(NULL))
	1397	+ if (tne != rdp->tick_nohz_enabled_snap) {
	1398	+ if (!rcu_segcblist_empty(&rdp->cblist))
1480	1399	invoke_rcu_core(); /* force nohz to see update. */
1481		- rdtp->tick_nohz_enabled_snap = tne;
	1400	+ rdp->tick_nohz_enabled_snap = tne;
1482	1401	return;
1483	1402	}
1484	1403	if (!tne)
1485	1404	return;
1486	1405
1487	1406	/*
1488		- * If a non-lazy callback arrived at a CPU having only lazy
1489		- * callbacks, invoke RCU core for the side-effect of recalculating
1490		- * idle duration on re-entry to idle.
1491		- */
1492		- if (rdtp->all_lazy &&
1493		- rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1494		- rdtp->all_lazy = false;
1495		- rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1496		- invoke_rcu_core();
1497		- return;
1498		- }
1499		-
1500		- /*
1501	1407	* If we have not yet accelerated this jiffy, accelerate all
1502	1408	* callbacks on this CPU.
1503	1409	*/
1504		- if (rdtp->last_accelerate == jiffies)
	1410	+ if (rdp->last_accelerate == jiffies)
1505	1411	return;
1506		- rdtp->last_accelerate = jiffies;
1507		- for_each_rcu_flavor(rsp) {
1508		- rdp = this_cpu_ptr(rsp->rda);
1509		- if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1510		- continue;
	1412	+ rdp->last_accelerate = jiffies;
	1413	+ if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
1511	1414	rnp = rdp->mynode;
1512	1415	raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
1513		- needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
	1416	+ needwake = rcu_accelerate_cbs(rnp, rdp);
1514	1417	raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
1515	1418	if (needwake)
1516		- rcu_gp_kthread_wake(rsp);
	1419	+ rcu_gp_kthread_wake();
1517	1420	}
1518	1421	}
1519	1422
..	..	@@ -1524,240 +1427,58 @@
1524	1427	*/
1525	1428	static void rcu_cleanup_after_idle(void)
1526	1429	{
	1430	+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	1431	+
1527	1432	lockdep_assert_irqs_disabled();
1528		- if (rcu_is_nocb_cpu(smp_processor_id()))
	1433	+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
1529	1434	return;
1530	1435	if (rcu_try_advance_all_cbs())
1531	1436	invoke_rcu_core();
1532	1437	}
1533	1438
1534		-/*
1535		- * Keep a running count of the number of non-lazy callbacks posted
1536		- * on this CPU. This running counter (which is never decremented) allows
1537		- * rcu_prepare_for_idle() to detect when something out of the idle loop
1538		- * posts a callback, even if an equal number of callbacks are invoked.
1539		- * Of course, callbacks should only be posted from within a trace event
1540		- * designed to be called from idle or from within RCU_NONIDLE().
1541		- */
1542		-static void rcu_idle_count_callbacks_posted(void)
1543		-{
1544		- __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
1545		-}
1546		-
1547		-/*
1548		- * Data for flushing lazy RCU callbacks at OOM time.
1549		- */
1550		-static atomic_t oom_callback_count;
1551		-static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
1552		-
1553		-/*
1554		- * RCU OOM callback -- decrement the outstanding count and deliver the
1555		- * wake-up if we are the last one.
1556		- */
1557		-static void rcu_oom_callback(struct rcu_head *rhp)
1558		-{
1559		- if (atomic_dec_and_test(&oom_callback_count))
1560		- wake_up(&oom_callback_wq);
1561		-}
1562		-
1563		-/*
1564		- * Post an rcu_oom_notify callback on the current CPU if it has at
1565		- * least one lazy callback. This will unnecessarily post callbacks
1566		- * to CPUs that already have a non-lazy callback at the end of their
1567		- * callback list, but this is an infrequent operation, so accept some
1568		- * extra overhead to keep things simple.
1569		- */
1570		-static void rcu_oom_notify_cpu(void *unused)
1571		-{
1572		- struct rcu_state *rsp;
1573		- struct rcu_data *rdp;
1574		-
1575		- for_each_rcu_flavor(rsp) {
1576		- rdp = raw_cpu_ptr(rsp->rda);
1577		- if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) {
1578		- atomic_inc(&oom_callback_count);
1579		- rsp->call(&rdp->oom_head, rcu_oom_callback);
1580		- }
1581		- }
1582		-}
1583		-
1584		-/*
1585		- * If low on memory, ensure that each CPU has a non-lazy callback.
1586		- * This will wake up CPUs that have only lazy callbacks, in turn
1587		- * ensuring that they free up the corresponding memory in a timely manner.
1588		- * Because an uncertain amount of memory will be freed in some uncertain
1589		- * timeframe, we do not claim to have freed anything.
1590		- */
1591		-static int rcu_oom_notify(struct notifier_block *self,
1592		- unsigned long notused, void *nfreed)
1593		-{
1594		- int cpu;
1595		-
1596		- /* Wait for callbacks from earlier instance to complete. */
1597		- wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1598		- smp_mb(); /* Ensure callback reuse happens after callback invocation. */
1599		-
1600		- /*
1601		- * Prevent premature wakeup: ensure that all increments happen
1602		- * before there is a chance of the counter reaching zero.
1603		- */
1604		- atomic_set(&oom_callback_count, 1);
1605		-
1606		- for_each_online_cpu(cpu) {
1607		- smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1608		- cond_resched_tasks_rcu_qs();
1609		- }
1610		-
1611		- /* Unconditionally decrement: no need to wake ourselves up. */
1612		- atomic_dec(&oom_callback_count);
1613		-
1614		- return NOTIFY_OK;
1615		-}
1616		-
1617		-static struct notifier_block rcu_oom_nb = {
1618		- .notifier_call = rcu_oom_notify
1619		-};
1620		-
1621		-static int __init rcu_register_oom_notifier(void)
1622		-{
1623		- register_oom_notifier(&rcu_oom_nb);
1624		- return 0;
1625		-}
1626		-early_initcall(rcu_register_oom_notifier);
1627		-
1628	1439	#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1629		-
1630		-#ifdef CONFIG_RCU_FAST_NO_HZ
1631		-
1632		-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1633		-{
1634		- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1635		- unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
1636		-
1637		- sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
1638		- rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
1639		- ulong2long(nlpd),
1640		- rdtp->all_lazy ? 'L' : '.',
1641		- rdtp->tick_nohz_enabled_snap ? '.' : 'D');
1642		-}
1643		-
1644		-#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
1645		-
1646		-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1647		-{
1648		- *cp = '\0';
1649		-}
1650		-
1651		-#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
1652		-
1653		-/* Initiate the stall-info list. */
1654		-static void print_cpu_stall_info_begin(void)
1655		-{
1656		- pr_cont("\n");
1657		-}
1658		-
1659		-/*
1660		- * Print out diagnostic information for the specified stalled CPU.
1661		- *
1662		- * If the specified CPU is aware of the current RCU grace period
1663		- * (flavor specified by rsp), then print the number of scheduling
1664		- * clock interrupts the CPU has taken during the time that it has
1665		- * been aware. Otherwise, print the number of RCU grace periods
1666		- * that this CPU is ignorant of, for example, "1" if the CPU was
1667		- * aware of the previous grace period.
1668		- *
1669		- * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
1670		- */
1671		-static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1672		-{
1673		- unsigned long delta;
1674		- char fast_no_hz[72];
1675		- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1676		- struct rcu_dynticks *rdtp = rdp->dynticks;
1677		- char *ticks_title;
1678		- unsigned long ticks_value;
1679		-
1680		- /*
1681		- * We could be printing a lot while holding a spinlock. Avoid
1682		- * triggering hard lockup.
1683		- */
1684		- touch_nmi_watchdog();
1685		-
1686		- ticks_value = rcu_seq_ctr(rsp->gp_seq - rdp->gp_seq);
1687		- if (ticks_value) {
1688		- ticks_title = "GPs behind";
1689		- } else {
1690		- ticks_title = "ticks this GP";
1691		- ticks_value = rdp->ticks_this_gp;
1692		- }
1693		- print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1694		- delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
1695		- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
1696		- cpu,
1697		- "O."[!!cpu_online(cpu)],
1698		- "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
1699		- "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
1700		- !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
1701		- rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
1702		- "!."[!delta],
1703		- ticks_value, ticks_title,
1704		- rcu_dynticks_snap(rdtp) & 0xfff,
1705		- rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1706		- rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
1707		- READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
1708		- fast_no_hz);
1709		-}
1710		-
1711		-/* Terminate the stall-info list. */
1712		-static void print_cpu_stall_info_end(void)
1713		-{
1714		- pr_err("\t");
1715		-}
1716		-
1717		-/* Zero ->ticks_this_gp for all flavors of RCU. */
1718		-static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1719		-{
1720		- rdp->ticks_this_gp = 0;
1721		- rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
1722		-}
1723		-
1724		-/* Increment ->ticks_this_gp for all flavors of RCU. */
1725		-static void increment_cpu_stall_ticks(void)
1726		-{
1727		- struct rcu_state *rsp;
1728		-
1729		- for_each_rcu_flavor(rsp)
1730		- raw_cpu_inc(rsp->rda->ticks_this_gp);
1731		-}
1732	1440
1733	1441	#ifdef CONFIG_RCU_NOCB_CPU
1734	1442
1735	1443	/*
1736	1444	* Offload callback processing from the boot-time-specified set of CPUs
1737		- * specified by rcu_nocb_mask. For each CPU in the set, there is a
1738		- * kthread created that pulls the callbacks from the corresponding CPU,
1739		- * waits for a grace period to elapse, and invokes the callbacks.
1740		- * The no-CBs CPUs do a wake_up() on their kthread when they insert
1741		- * a callback into any empty list, unless the rcu_nocb_poll boot parameter
1742		- * has been specified, in which case each kthread actively polls its
1743		- * CPU. (Which isn't so great for energy efficiency, but which does
1744		- * reduce RCU's overhead on that CPU.)
	1445	+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
	1446	+ * created that pull the callbacks from the corresponding CPU, wait for
	1447	+ * a grace period to elapse, and invoke the callbacks. These kthreads
	1448	+ * are organized into GP kthreads, which manage incoming callbacks, wait for
	1449	+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
	1450	+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
	1451	+ * do a wake_up() on their GP kthread when they insert a callback into any
	1452	+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
	1453	+ * in which case each kthread actively polls its CPU. (Which isn't so great
	1454	+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
1745	1455	*
1746	1456	* This is intended to be used in conjunction with Frederic Weisbecker's
1747	1457	* adaptive-idle work, which would seriously reduce OS jitter on CPUs
1748	1458	* running CPU-bound user-mode computations.
1749	1459	*
1750		- * Offloading of callback processing could also in theory be used as
1751		- * an energy-efficiency measure because CPUs with no RCU callbacks
1752		- * queued are more aggressive about entering dyntick-idle mode.
	1460	+ * Offloading of callbacks can also be used as an energy-efficiency
	1461	+ * measure because CPUs with no RCU callbacks queued are more aggressive
	1462	+ * about entering dyntick-idle mode.
1753	1463	*/
1754	1464
1755	1465
1756		-/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
	1466	+/*
	1467	+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
	1468	+ * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a
	1469	+ * comma-separated list of CPUs and/or CPU ranges. If an invalid list is
	1470	+ * given, a warning is emitted and all CPUs are offloaded.
	1471	+ */
1757	1472	static int __init rcu_nocb_setup(char *str)
1758	1473	{
1759	1474	alloc_bootmem_cpumask_var(&rcu_nocb_mask);
1760		- cpulist_parse(str, rcu_nocb_mask);
	1475	+ if (!strcasecmp(str, "all"))
	1476	+ cpumask_setall(rcu_nocb_mask);
	1477	+ else
	1478	+ if (cpulist_parse(str, rcu_nocb_mask)) {
	1479	+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
	1480	+ cpumask_setall(rcu_nocb_mask);
	1481	+ }
1761	1482	return 1;
1762	1483	}
1763	1484	__setup("rcu_nocbs=", rcu_nocb_setup);
..	..	@@ -1768,6 +1489,117 @@
1768	1489	return 0;
1769	1490	}
1770	1491	early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
	1492	+
	1493	+/*
	1494	+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
	1495	+ * After all, the main point of bypassing is to avoid lock contention
	1496	+ * on ->nocb_lock, which only can happen at high call_rcu() rates.
	1497	+ */
	1498	+int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
	1499	+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
	1500	+
	1501	+/*
	1502	+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
	1503	+ * lock isn't immediately available, increment ->nocb_lock_contended to
	1504	+ * flag the contention.
	1505	+ */
	1506	+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
	1507	+ __acquires(&rdp->nocb_bypass_lock)
	1508	+{
	1509	+ lockdep_assert_irqs_disabled();
	1510	+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
	1511	+ return;
	1512	+ atomic_inc(&rdp->nocb_lock_contended);
	1513	+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
	1514	+ smp_mb__after_atomic(); /* atomic_inc() before lock. */
	1515	+ raw_spin_lock(&rdp->nocb_bypass_lock);
	1516	+ smp_mb__before_atomic(); /* atomic_dec() after lock. */
	1517	+ atomic_dec(&rdp->nocb_lock_contended);
	1518	+}
	1519	+
	1520	+/*
	1521	+ * Spinwait until the specified rcu_data structure's ->nocb_lock is
	1522	+ * not contended. Please note that this is extremely special-purpose,
	1523	+ * relying on the fact that at most two kthreads and one CPU contend for
	1524	+ * this lock, and also that the two kthreads are guaranteed to have frequent
	1525	+ * grace-period-duration time intervals between successive acquisitions
	1526	+ * of the lock. This allows us to use an extremely simple throttling
	1527	+ * mechanism, and further to apply it only to the CPU doing floods of
	1528	+ * call_rcu() invocations. Don't try this at home!
	1529	+ */
	1530	+static void rcu_nocb_wait_contended(struct rcu_data *rdp)
	1531	+{
	1532	+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
	1533	+ while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
	1534	+ cpu_relax();
	1535	+}
	1536	+
	1537	+/*
	1538	+ * Conditionally acquire the specified rcu_data structure's
	1539	+ * ->nocb_bypass_lock.
	1540	+ */
	1541	+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
	1542	+{
	1543	+ lockdep_assert_irqs_disabled();
	1544	+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
	1545	+}
	1546	+
	1547	+/*
	1548	+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
	1549	+ */
	1550	+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
	1551	+ __releases(&rdp->nocb_bypass_lock)
	1552	+{
	1553	+ lockdep_assert_irqs_disabled();
	1554	+ raw_spin_unlock(&rdp->nocb_bypass_lock);
	1555	+}
	1556	+
	1557	+/*
	1558	+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
	1559	+ * if it corresponds to a no-CBs CPU.
	1560	+ */
	1561	+static void rcu_nocb_lock(struct rcu_data *rdp)
	1562	+{
	1563	+ lockdep_assert_irqs_disabled();
	1564	+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
	1565	+ return;
	1566	+ raw_spin_lock(&rdp->nocb_lock);
	1567	+}
	1568	+
	1569	+/*
	1570	+ * Release the specified rcu_data structure's ->nocb_lock, but only
	1571	+ * if it corresponds to a no-CBs CPU.
	1572	+ */
	1573	+static void rcu_nocb_unlock(struct rcu_data *rdp)
	1574	+{
	1575	+ if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
	1576	+ lockdep_assert_irqs_disabled();
	1577	+ raw_spin_unlock(&rdp->nocb_lock);
	1578	+ }
	1579	+}
	1580	+
	1581	+/*
	1582	+ * Release the specified rcu_data structure's ->nocb_lock and restore
	1583	+ * interrupts, but only if it corresponds to a no-CBs CPU.
	1584	+ */
	1585	+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
	1586	+ unsigned long flags)
	1587	+{
	1588	+ if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
	1589	+ lockdep_assert_irqs_disabled();
	1590	+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
	1591	+ } else {
	1592	+ local_irq_restore(flags);
	1593	+ }
	1594	+}
	1595	+
	1596	+/* Lockdep check that ->cblist may be safely accessed. */
	1597	+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
	1598	+{
	1599	+ lockdep_assert_irqs_disabled();
	1600	+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
	1601	+ lockdep_assert_held(&rdp->nocb_lock);
	1602	+}
1771	1603
1772	1604	/*
1773	1605	* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
..	..	@@ -1798,442 +1630,523 @@
1798	1630	}
1799	1631
1800	1632	/*
1801		- * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock
	1633	+ * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
1802	1634	* and this function releases it.
1803	1635	*/
1804		-static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
1805		- unsigned long flags)
	1636	+static void wake_nocb_gp(struct rcu_data *rdp, bool force,
	1637	+ unsigned long flags)
1806	1638	__releases(rdp->nocb_lock)
1807	1639	{
1808		- struct rcu_data *rdp_leader = rdp->nocb_leader;
	1640	+ bool needwake = false;
	1641	+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
1809	1642
1810	1643	lockdep_assert_held(&rdp->nocb_lock);
1811		- if (!READ_ONCE(rdp_leader->nocb_kthread)) {
1812		- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
	1644	+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
	1645	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
	1646	+ TPS("AlreadyAwake"));
	1647	+ rcu_nocb_unlock_irqrestore(rdp, flags);
1813	1648	return;
1814	1649	}
1815		- if (rdp_leader->nocb_leader_sleep || force) {
1816		- /* Prior smp_mb__after_atomic() orders against prior enqueue. */
1817		- WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
	1650	+
	1651	+ if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) {
	1652	+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
1818	1653	del_timer(&rdp->nocb_timer);
1819		- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1820		- smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
1821		- swake_up_one(&rdp_leader->nocb_wq);
1822		- } else {
1823		- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1824	1654	}
	1655	+ rcu_nocb_unlock_irqrestore(rdp, flags);
	1656	+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
	1657	+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
	1658	+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
	1659	+ needwake = true;
	1660	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
	1661	+ }
	1662	+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
	1663	+ if (needwake)
	1664	+ wake_up_process(rdp_gp->nocb_gp_kthread);
1825	1665	}
1826	1666
1827	1667	/*
1828		- * Kick the leader kthread for this NOCB group, but caller has not
1829		- * acquired locks.
	1668	+ * Arrange to wake the GP kthread for this NOCB group at some future
	1669	+ * time when it is safe to do so.
1830	1670	*/
1831		-static void wake_nocb_leader(struct rcu_data *rdp, bool force)
	1671	+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
	1672	+ const char *reason)
1832	1673	{
1833		- unsigned long flags;
1834		-
1835		- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
1836		- __wake_nocb_leader(rdp, force, flags);
1837		-}
1838		-
1839		-/*
1840		- * Arrange to wake the leader kthread for this NOCB group at some
1841		- * future time when it is safe to do so.
1842		- */
1843		-static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
1844		- const char *reason)
1845		-{
1846		- unsigned long flags;
1847		-
1848		- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
1849	1674	if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
1850	1675	mod_timer(&rdp->nocb_timer, jiffies + 1);
1851		- WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
1852		- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason);
1853		- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
	1676	+ if (rdp->nocb_defer_wakeup < waketype)
	1677	+ WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
	1678	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
1854	1679	}
1855	1680
1856	1681	/*
1857		- * Does the specified CPU need an RCU callback for the specified flavor
1858		- * of rcu_barrier()?
	1682	+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
	1683	+ * However, if there is a callback to be enqueued and if ->nocb_bypass
	1684	+ * proves to be initially empty, just return false because the no-CB GP
	1685	+ * kthread may need to be awakened in this case.
	1686	+ *
	1687	+ * Note that this function always returns true if rhp is NULL.
1859	1688	*/
1860		-static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
	1689	+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
	1690	+ unsigned long j)
1861	1691	{
1862		- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1863		- unsigned long ret;
1864		-#ifdef CONFIG_PROVE_RCU
1865		- struct rcu_head *rhp;
1866		-#endif /* #ifdef CONFIG_PROVE_RCU */
	1692	+ struct rcu_cblist rcl;
1867	1693
1868		- /*
1869		- * Check count of all no-CBs callbacks awaiting invocation.
1870		- * There needs to be a barrier before this function is called,
1871		- * but associated with a prior determination that no more
1872		- * callbacks would be posted. In the worst case, the first
1873		- * barrier in _rcu_barrier() suffices (but the caller cannot
1874		- * necessarily rely on this, not a substitute for the caller
1875		- * getting the concurrency design right!). There must also be
1876		- * a barrier between the following load and posting of a callback
1877		- * (if a callback is in fact needed). This is associated with an
1878		- * atomic_inc() in the caller.
1879		- */
1880		- ret = atomic_long_read(&rdp->nocb_q_count);
1881		-
1882		-#ifdef CONFIG_PROVE_RCU
1883		- rhp = READ_ONCE(rdp->nocb_head);
1884		- if (!rhp)
1885		- rhp = READ_ONCE(rdp->nocb_gp_head);
1886		- if (!rhp)
1887		- rhp = READ_ONCE(rdp->nocb_follower_head);
1888		-
1889		- /* Having no rcuo kthread but CBs after scheduler starts is bad! */
1890		- if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
1891		- rcu_scheduler_fully_active) {
1892		- /* RCU callback enqueued before CPU first came online??? */
1893		- pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
1894		- cpu, rhp->func);
1895		- WARN_ON_ONCE(1);
	1694	+ WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist));
	1695	+ rcu_lockdep_assert_cblist_protected(rdp);
	1696	+ lockdep_assert_held(&rdp->nocb_bypass_lock);
	1697	+ if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
	1698	+ raw_spin_unlock(&rdp->nocb_bypass_lock);
	1699	+ return false;
1896	1700	}
1897		-#endif /* #ifdef CONFIG_PROVE_RCU */
1898		-
1899		- return !!ret;
	1701	+ /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
	1702	+ if (rhp)
	1703	+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
	1704	+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
	1705	+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
	1706	+ WRITE_ONCE(rdp->nocb_bypass_first, j);
	1707	+ rcu_nocb_bypass_unlock(rdp);
	1708	+ return true;
1900	1709	}
1901	1710
1902	1711	/*
1903		- * Enqueue the specified string of rcu_head structures onto the specified
1904		- * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
1905		- * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
1906		- * counts are supplied by rhcount and rhcount_lazy.
	1712	+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
	1713	+ * However, if there is a callback to be enqueued and if ->nocb_bypass
	1714	+ * proves to be initially empty, just return false because the no-CB GP
	1715	+ * kthread may need to be awakened in this case.
	1716	+ *
	1717	+ * Note that this function always returns true if rhp is NULL.
	1718	+ */
	1719	+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
	1720	+ unsigned long j)
	1721	+{
	1722	+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
	1723	+ return true;
	1724	+ rcu_lockdep_assert_cblist_protected(rdp);
	1725	+ rcu_nocb_bypass_lock(rdp);
	1726	+ return rcu_nocb_do_flush_bypass(rdp, rhp, j);
	1727	+}
	1728	+
	1729	+/*
	1730	+ * If the ->nocb_bypass_lock is immediately available, flush the
	1731	+ * ->nocb_bypass queue into ->cblist.
	1732	+ */
	1733	+static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
	1734	+{
	1735	+ rcu_lockdep_assert_cblist_protected(rdp);
	1736	+ if (!rcu_segcblist_is_offloaded(&rdp->cblist) ||
	1737	+ !rcu_nocb_bypass_trylock(rdp))
	1738	+ return;
	1739	+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
	1740	+}
	1741	+
	1742	+/*
	1743	+ * See whether it is appropriate to use the ->nocb_bypass list in order
	1744	+ * to control contention on ->nocb_lock. A limited number of direct
	1745	+ * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
	1746	+ * is non-empty, further callbacks must be placed into ->nocb_bypass,
	1747	+ * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
	1748	+ * back to direct use of ->cblist. However, ->nocb_bypass should not be
	1749	+ * used if ->cblist is empty, because otherwise callbacks can be stranded
	1750	+ * on ->nocb_bypass because we cannot count on the current CPU ever again
	1751	+ * invoking call_rcu(). The general rule is that if ->nocb_bypass is
	1752	+ * non-empty, the corresponding no-CBs grace-period kthread must not be
	1753	+ * in an indefinite sleep state.
	1754	+ *
	1755	+ * Finally, it is not permitted to use the bypass during early boot,
	1756	+ * as doing so would confuse the auto-initialization code. Besides
	1757	+ * which, there is no point in worrying about lock contention while
	1758	+ * there is only one CPU in operation.
	1759	+ */
	1760	+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
	1761	+ bool *was_alldone, unsigned long flags)
	1762	+{
	1763	+ unsigned long c;
	1764	+ unsigned long cur_gp_seq;
	1765	+ unsigned long j = jiffies;
	1766	+ long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
	1767	+
	1768	+ if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
	1769	+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
	1770	+ return false; /* Not offloaded, no bypassing. */
	1771	+ }
	1772	+ lockdep_assert_irqs_disabled();
	1773	+
	1774	+ // Don't use ->nocb_bypass during early boot.
	1775	+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
	1776	+ rcu_nocb_lock(rdp);
	1777	+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
	1778	+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
	1779	+ return false;
	1780	+ }
	1781	+
	1782	+ // If we have advanced to a new jiffy, reset counts to allow
	1783	+ // moving back from ->nocb_bypass to ->cblist.
	1784	+ if (j == rdp->nocb_nobypass_last) {
	1785	+ c = rdp->nocb_nobypass_count + 1;
	1786	+ } else {
	1787	+ WRITE_ONCE(rdp->nocb_nobypass_last, j);
	1788	+ c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
	1789	+ if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
	1790	+ nocb_nobypass_lim_per_jiffy))
	1791	+ c = 0;
	1792	+ else if (c > nocb_nobypass_lim_per_jiffy)
	1793	+ c = nocb_nobypass_lim_per_jiffy;
	1794	+ }
	1795	+ WRITE_ONCE(rdp->nocb_nobypass_count, c);
	1796	+
	1797	+ // If there hasn't yet been all that many ->cblist enqueues
	1798	+ // this jiffy, tell the caller to enqueue onto ->cblist. But flush
	1799	+ // ->nocb_bypass first.
	1800	+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
	1801	+ rcu_nocb_lock(rdp);
	1802	+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
	1803	+ if (*was_alldone)
	1804	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
	1805	+ TPS("FirstQ"));
	1806	+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
	1807	+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
	1808	+ return false; // Caller must enqueue the callback.
	1809	+ }
	1810	+
	1811	+ // If ->nocb_bypass has been used too long or is too full,
	1812	+ // flush ->nocb_bypass to ->cblist.
	1813	+ if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
	1814	+ ncbs >= qhimark) {
	1815	+ rcu_nocb_lock(rdp);
	1816	+ if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
	1817	+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
	1818	+ if (*was_alldone)
	1819	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
	1820	+ TPS("FirstQ"));
	1821	+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
	1822	+ return false; // Caller must enqueue the callback.
	1823	+ }
	1824	+ if (j != rdp->nocb_gp_adv_time &&
	1825	+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
	1826	+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
	1827	+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
	1828	+ rdp->nocb_gp_adv_time = j;
	1829	+ }
	1830	+ rcu_nocb_unlock_irqrestore(rdp, flags);
	1831	+ return true; // Callback already enqueued.
	1832	+ }
	1833	+
	1834	+ // We need to use the bypass.
	1835	+ rcu_nocb_wait_contended(rdp);
	1836	+ rcu_nocb_bypass_lock(rdp);
	1837	+ ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
	1838	+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
	1839	+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
	1840	+ if (!ncbs) {
	1841	+ WRITE_ONCE(rdp->nocb_bypass_first, j);
	1842	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
	1843	+ }
	1844	+ rcu_nocb_bypass_unlock(rdp);
	1845	+ smp_mb(); /* Order enqueue before wake. */
	1846	+ if (ncbs) {
	1847	+ local_irq_restore(flags);
	1848	+ } else {
	1849	+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
	1850	+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
	1851	+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
	1852	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
	1853	+ TPS("FirstBQwake"));
	1854	+ __call_rcu_nocb_wake(rdp, true, flags);
	1855	+ } else {
	1856	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
	1857	+ TPS("FirstBQnoWake"));
	1858	+ rcu_nocb_unlock_irqrestore(rdp, flags);
	1859	+ }
	1860	+ }
	1861	+ return true; // Callback already enqueued.
	1862	+}
	1863	+
	1864	+/*
	1865	+ * Awaken the no-CBs grace-period kthread if needed, either due to it
	1866	+ * legitimately being asleep or due to overload conditions.
1907	1867	*
1908	1868	* If warranted, also wake up the kthread servicing this CPU's queues.
1909	1869	*/
1910		-static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1911		- struct rcu_head *rhp,
1912		- struct rcu_head **rhtp,
1913		- int rhcount, int rhcount_lazy,
1914		- unsigned long flags)
	1870	+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
	1871	+ unsigned long flags)
	1872	+ __releases(rdp->nocb_lock)
1915	1873	{
1916		- int len;
1917		- struct rcu_head **old_rhpp;
	1874	+ unsigned long cur_gp_seq;
	1875	+ unsigned long j;
	1876	+ long len;
1918	1877	struct task_struct *t;
1919	1878
1920		- /* Enqueue the callback on the nocb list and update counts. */
1921		- atomic_long_add(rhcount, &rdp->nocb_q_count);
1922		- /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
1923		- old_rhpp = xchg(&rdp->nocb_tail, rhtp);
1924		- WRITE_ONCE(*old_rhpp, rhp);
1925		- atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
1926		- smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
1927		-
1928		- /* If we are not being polled and there is a kthread, awaken it ... */
1929		- t = READ_ONCE(rdp->nocb_kthread);
	1879	+ // If we are being polled or there is no kthread, just leave.
	1880	+ t = READ_ONCE(rdp->nocb_gp_kthread);
1930	1881	if (rcu_nocb_poll || !t) {
1931		- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
	1882	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1932	1883	TPS("WakeNotPoll"));
	1884	+ rcu_nocb_unlock_irqrestore(rdp, flags);
1933	1885	return;
1934	1886	}
1935		- len = atomic_long_read(&rdp->nocb_q_count);
1936		- if (old_rhpp == &rdp->nocb_head) {
	1887	+ // Need to actually do a wakeup.
	1888	+ len = rcu_segcblist_n_cbs(&rdp->cblist);
	1889	+ if (was_alldone) {
	1890	+ rdp->qlen_last_fqs_check = len;
1937	1891	if (!irqs_disabled_flags(flags)) {
1938	1892	/* ... if queue was empty ... */
1939		- wake_nocb_leader(rdp, false);
1940		- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
	1893	+ wake_nocb_gp(rdp, false, flags);
	1894	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1941	1895	TPS("WakeEmpty"));
1942	1896	} else {
1943		- wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
1944		- TPS("WakeEmptyIsDeferred"));
	1897	+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
	1898	+ TPS("WakeEmptyIsDeferred"));
	1899	+ rcu_nocb_unlock_irqrestore(rdp, flags);
1945	1900	}
1946		- rdp->qlen_last_fqs_check = 0;
1947	1901	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
1948	1902	/* ... or if many callbacks queued. */
1949		- if (!irqs_disabled_flags(flags)) {
1950		- wake_nocb_leader(rdp, true);
1951		- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1952		- TPS("WakeOvf"));
1953		- } else {
1954		- wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE,
1955		- TPS("WakeOvfIsDeferred"));
	1903	+ rdp->qlen_last_fqs_check = len;
	1904	+ j = jiffies;
	1905	+ if (j != rdp->nocb_gp_adv_time &&
	1906	+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
	1907	+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
	1908	+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
	1909	+ rdp->nocb_gp_adv_time = j;
1956	1910	}
1957		- rdp->qlen_last_fqs_check = LONG_MAX / 2;
	1911	+ smp_mb(); /* Enqueue before timer_pending(). */
	1912	+ if ((rdp->nocb_cb_sleep ||
	1913	+ !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
	1914	+ !timer_pending(&rdp->nocb_bypass_timer))
	1915	+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
	1916	+ TPS("WakeOvfIsDeferred"));
	1917	+ rcu_nocb_unlock_irqrestore(rdp, flags);
1958	1918	} else {
1959		- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
	1919	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
	1920	+ rcu_nocb_unlock_irqrestore(rdp, flags);
1960	1921	}
1961	1922	return;
1962	1923	}
1963	1924
1964		-/*
1965		- * This is a helper for __call_rcu(), which invokes this when the normal
1966		- * callback queue is inoperable. If this is not a no-CBs CPU, this
1967		- * function returns failure back to __call_rcu(), which can complain
1968		- * appropriately.
1969		- *
1970		- * Otherwise, this function queues the callback where the corresponding
1971		- * "rcuo" kthread can find it.
1972		- */
1973		-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
1974		- bool lazy, unsigned long flags)
	1925	+/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
	1926	+static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
1975	1927	{
	1928	+ unsigned long flags;
	1929	+ struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
1976	1930
1977		- if (!rcu_is_nocb_cpu(rdp->cpu))
1978		- return false;
1979		- __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
1980		- if (__is_kfree_rcu_offset((unsigned long)rhp->func))
1981		- trace_rcu_kfree_callback(rdp->rsp->name, rhp,
1982		- (unsigned long)rhp->func,
1983		- -atomic_long_read(&rdp->nocb_q_count_lazy),
1984		- -atomic_long_read(&rdp->nocb_q_count));
1985		- else
1986		- trace_rcu_callback(rdp->rsp->name, rhp,
1987		- -atomic_long_read(&rdp->nocb_q_count_lazy),
1988		- -atomic_long_read(&rdp->nocb_q_count));
	1931	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
	1932	+ rcu_nocb_lock_irqsave(rdp, flags);
	1933	+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
	1934	+ __call_rcu_nocb_wake(rdp, true, flags);
	1935	+}
	1936	+
	1937	+/*
	1938	+ * No-CBs GP kthreads come here to wait for additional callbacks to show up
	1939	+ * or for grace periods to end.
	1940	+ */
	1941	+static void nocb_gp_wait(struct rcu_data *my_rdp)
	1942	+{
	1943	+ bool bypass = false;
	1944	+ long bypass_ncbs;
	1945	+ int __maybe_unused cpu = my_rdp->cpu;
	1946	+ unsigned long cur_gp_seq;
	1947	+ unsigned long flags;
	1948	+ bool gotcbs = false;
	1949	+ unsigned long j = jiffies;
	1950	+ bool needwait_gp = false; // This prevents actual uninitialized use.
	1951	+ bool needwake;
	1952	+ bool needwake_gp;
	1953	+ struct rcu_data *rdp;
	1954	+ struct rcu_node *rnp;
	1955	+ unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
	1956	+ bool wasempty = false;
1989	1957
1990	1958	/*
1991		- * If called from an extended quiescent state with interrupts
1992		- * disabled, invoke the RCU core in order to allow the idle-entry
1993		- * deferred-wakeup check to function.
	1959	+ * Each pass through the following loop checks for CBs and for the
	1960	+ * nearest grace period (if any) to wait for next. The CB kthreads
	1961	+ * and the global grace-period kthread are awakened if needed.
1994	1962	*/
1995		- if (irqs_disabled_flags(flags) &&
1996		- !rcu_is_watching() &&
1997		- cpu_online(smp_processor_id()))
1998		- invoke_rcu_core();
	1963	+ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
	1964	+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
	1965	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
	1966	+ rcu_nocb_lock_irqsave(rdp, flags);
	1967	+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
	1968	+ if (bypass_ncbs &&
	1969	+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
	1970	+ bypass_ncbs > 2 * qhimark)) {
	1971	+ // Bypass full or old, so flush it.
	1972	+ (void)rcu_nocb_try_flush_bypass(rdp, j);
	1973	+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
	1974	+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
	1975	+ rcu_nocb_unlock_irqrestore(rdp, flags);
	1976	+ continue; /* No callbacks here, try next. */
	1977	+ }
	1978	+ if (bypass_ncbs) {
	1979	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
	1980	+ TPS("Bypass"));
	1981	+ bypass = true;
	1982	+ }
	1983	+ rnp = rdp->mynode;
	1984	+ if (bypass) { // Avoid race with first bypass CB.
	1985	+ WRITE_ONCE(my_rdp->nocb_defer_wakeup,
	1986	+ RCU_NOCB_WAKE_NOT);
	1987	+ del_timer(&my_rdp->nocb_timer);
	1988	+ }
	1989	+ // Advance callbacks if helpful and low contention.
	1990	+ needwake_gp = false;
	1991	+ if (!rcu_segcblist_restempty(&rdp->cblist,
	1992	+ RCU_NEXT_READY_TAIL) ||
	1993	+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
	1994	+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
	1995	+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
	1996	+ needwake_gp = rcu_advance_cbs(rnp, rdp);
	1997	+ wasempty = rcu_segcblist_restempty(&rdp->cblist,
	1998	+ RCU_NEXT_READY_TAIL);
	1999	+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
	2000	+ }
	2001	+ // Need to wait on some grace period?
	2002	+ WARN_ON_ONCE(wasempty &&
	2003	+ !rcu_segcblist_restempty(&rdp->cblist,
	2004	+ RCU_NEXT_READY_TAIL));
	2005	+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
	2006	+ if (!needwait_gp ||
	2007	+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
	2008	+ wait_gp_seq = cur_gp_seq;
	2009	+ needwait_gp = true;
	2010	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
	2011	+ TPS("NeedWaitGP"));
	2012	+ }
	2013	+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
	2014	+ needwake = rdp->nocb_cb_sleep;
	2015	+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
	2016	+ smp_mb(); /* CB invocation -after- GP end. */
	2017	+ } else {
	2018	+ needwake = false;
	2019	+ }
	2020	+ rcu_nocb_unlock_irqrestore(rdp, flags);
	2021	+ if (needwake) {
	2022	+ swake_up_one(&rdp->nocb_cb_wq);
	2023	+ gotcbs = true;
	2024	+ }
	2025	+ if (needwake_gp)
	2026	+ rcu_gp_kthread_wake();
	2027	+ }
1999	2028
2000		- return true;
	2029	+ my_rdp->nocb_gp_bypass = bypass;
	2030	+ my_rdp->nocb_gp_gp = needwait_gp;
	2031	+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
	2032	+ if (bypass && !rcu_nocb_poll) {
	2033	+ // At least one child with non-empty ->nocb_bypass, so set
	2034	+ // timer in order to avoid stranding its callbacks.
	2035	+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
	2036	+ mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
	2037	+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
	2038	+ }
	2039	+ if (rcu_nocb_poll) {
	2040	+ /* Polling, so trace if first poll in the series. */
	2041	+ if (gotcbs)
	2042	+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
	2043	+ schedule_timeout_idle(1);
	2044	+ } else if (!needwait_gp) {
	2045	+ /* Wait for callbacks to appear. */
	2046	+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
	2047	+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
	2048	+ !READ_ONCE(my_rdp->nocb_gp_sleep));
	2049	+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
	2050	+ } else {
	2051	+ rnp = my_rdp->mynode;
	2052	+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
	2053	+ swait_event_interruptible_exclusive(
	2054	+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
	2055	+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) \|\|
	2056	+ !READ_ONCE(my_rdp->nocb_gp_sleep));
	2057	+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
	2058	+ }
	2059	+ if (!rcu_nocb_poll) {
	2060	+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
	2061	+ if (bypass)
	2062	+ del_timer(&my_rdp->nocb_bypass_timer);
	2063	+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
	2064	+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
	2065	+ }
	2066	+ my_rdp->nocb_gp_seq = -1;
	2067	+ WARN_ON(signal_pending(current));
2001	2068	}
2002	2069
2003	2070	/*
2004		- * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
2005		- * not a no-CBs CPU.
	2071	+ * No-CBs grace-period-wait kthread. There is one of these per group
	2072	+ * of CPUs, but only once at least one CPU in that group has come online
	2073	+ * at least once since boot. This kthread checks for newly posted
	2074	+ * callbacks from any of the CPUs it is responsible for, waits for a
	2075	+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
	2076	+ * that then have callback-invocation work to do.
2006	2077	*/
2007		-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
2008		- struct rcu_data *rdp,
2009		- unsigned long flags)
	2078	+static int rcu_nocb_gp_kthread(void *arg)
2010	2079	{
2011		- lockdep_assert_irqs_disabled();
2012		- if (!rcu_is_nocb_cpu(smp_processor_id()))
2013		- return false; /* Not NOCBs CPU, caller must migrate CBs. */
2014		- __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
2015		- rcu_segcblist_tail(&rdp->cblist),
2016		- rcu_segcblist_n_cbs(&rdp->cblist),
2017		- rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
2018		- rcu_segcblist_init(&rdp->cblist);
2019		- rcu_segcblist_disable(&rdp->cblist);
2020		- return true;
	2080	+ struct rcu_data *rdp = arg;
	2081	+
	2082	+ for (;;) {
	2083	+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
	2084	+ nocb_gp_wait(rdp);
	2085	+ cond_resched_tasks_rcu_qs();
	2086	+ }
	2087	+ return 0;
2021	2088	}
2022	2089
2023	2090	/*
2024		- * If necessary, kick off a new grace period, and either way wait
2025		- * for a subsequent grace period to complete.
	2091	+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
	2092	+ * then, if there are no more, wait for more to appear.
2026	2093	*/
2027		-static void rcu_nocb_wait_gp(struct rcu_data *rdp)
	2094	+static void nocb_cb_wait(struct rcu_data *rdp)
2028	2095	{
2029		- unsigned long c;
2030		- bool d;
	2096	+ unsigned long cur_gp_seq;
2031	2097	unsigned long flags;
2032		- bool needwake;
	2098	+ bool needwake_gp = false;
2033	2099	struct rcu_node *rnp = rdp->mynode;
2034	2100
2035	2101	local_irq_save(flags);
2036		- c = rcu_seq_snap(&rdp->rsp->gp_seq);
2037		- if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
2038		- local_irq_restore(flags);
2039		- } else {
2040		- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
2041		- needwake = rcu_start_this_gp(rnp, rdp, c);
2042		- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2043		- if (needwake)
2044		- rcu_gp_kthread_wake(rdp->rsp);
	2102	+ rcu_momentary_dyntick_idle();
	2103	+ local_irq_restore(flags);
	2104	+ local_bh_disable();
	2105	+ rcu_do_batch(rdp);
	2106	+ local_bh_enable();
	2107	+ lockdep_assert_irqs_enabled();
	2108	+ rcu_nocb_lock_irqsave(rdp, flags);
	2109	+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
	2110	+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
	2111	+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
	2112	+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
	2113	+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
	2114	+ }
	2115	+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
	2116	+ rcu_nocb_unlock_irqrestore(rdp, flags);
	2117	+ if (needwake_gp)
	2118	+ rcu_gp_kthread_wake();
	2119	+ return;
2045	2120	}
2046	2121
2047		- /*
2048		- * Wait for the grace period. Do so interruptibly to avoid messing
2049		- * up the load average.
2050		- */
2051		- trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
2052		- for (;;) {
2053		- swait_event_interruptible_exclusive(
2054		- rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1],
2055		- (d = rcu_seq_done(&rnp->gp_seq, c)));
2056		- if (likely(d))
2057		- break;
2058		- WARN_ON(signal_pending(current));
2059		- trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait"));
	2122	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
	2123	+ WRITE_ONCE(rdp->nocb_cb_sleep, true);
	2124	+ rcu_nocb_unlock_irqrestore(rdp, flags);
	2125	+ if (needwake_gp)
	2126	+ rcu_gp_kthread_wake();
	2127	+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
	2128	+ !READ_ONCE(rdp->nocb_cb_sleep));
	2129	+ if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
	2130	+ /* ^^^ Ensure CB invocation follows _sleep test. */
	2131	+ return;
2060	2132	}
2061		- trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait"));
2062		- smp_mb(); /* Ensure that CB invocation happens after GP end. */
	2133	+ WARN_ON(signal_pending(current));
	2134	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
2063	2135	}
2064	2136
/*
 * Per-rcu_data kthread, but only for no-CBs CPUs.  Repeatedly invoke
 * nocb_cb_wait() to do the dirty work.
 *
 * @arg is the rcu_data structure whose callbacks this kthread invokes.
 * The loop has no exit, so the trailing return is never reached.
 */
static int rcu_nocb_cb_kthread(void *arg)
{
	struct rcu_data *rdp = arg;

	// Each pass through this loop does one callback batch, and,
	// if there are no more ready callbacks, waits for them.
	for (;;) {
		nocb_cb_wait(rdp);
		cond_resched_tasks_rcu_qs();
	}
	return 0;
}
..	..	@@ -2250,15 +2163,14 @@
2250	2163	unsigned long flags;
2251	2164	int ndw;
2252	2165
2253		- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
	2166	+ rcu_nocb_lock_irqsave(rdp, flags);
2254	2167	if (!rcu_nocb_need_deferred_wakeup(rdp)) {
2255		- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
	2168	+ rcu_nocb_unlock_irqrestore(rdp, flags);
2256	2169	return;
2257	2170	}
2258	2171	ndw = READ_ONCE(rdp->nocb_defer_wakeup);
2259		- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
2260		- __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
2261		- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
	2172	+ wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
	2173	+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
2262	2174	}
2263	2175
2264	2176	/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
..	..	@@ -2280,11 +2192,16 @@
2280	2192	do_nocb_deferred_wakeup_common(rdp);
2281	2193	}
2282	2194
	2195	+void rcu_nocb_flush_deferred_wakeup(void)
	2196	+{
	2197	+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
	2198	+}
	2199	+
2283	2200	void __init rcu_init_nohz(void)
2284	2201	{
2285	2202	int cpu;
2286	2203	bool need_rcu_nocb_mask = false;
2287		- struct rcu_state *rsp;
	2204	+ struct rcu_data *rdp;
2288	2205
2289	2206	#if defined(CONFIG_NO_HZ_FULL)
2290	2207	if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
..	..	@@ -2318,82 +2235,73 @@
2318	2235	if (rcu_nocb_poll)
2319	2236	pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2320	2237
2321		- for_each_rcu_flavor(rsp) {
2322		- for_each_cpu(cpu, rcu_nocb_mask)
2323		- init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu));
2324		- rcu_organize_nocb_kthreads(rsp);
	2238	+ for_each_cpu(cpu, rcu_nocb_mask) {
	2239	+ rdp = per_cpu_ptr(&rcu_data, cpu);
	2240	+ if (rcu_segcblist_empty(&rdp->cblist))
	2241	+ rcu_segcblist_init(&rdp->cblist);
	2242	+ rcu_segcblist_offload(&rdp->cblist);
2325	2243	}
	2244	+ rcu_organize_nocb_kthreads();
2326	2245	}
2327	2246
2328	2247	/* Initialize per-rcu_data variables for no-CBs CPUs. */
2329	2248	static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2330	2249	{
2331		- rdp->nocb_tail = &rdp->nocb_head;
2332		- init_swait_queue_head(&rdp->nocb_wq);
2333		- rdp->nocb_follower_tail = &rdp->nocb_follower_head;
	2250	+ init_swait_queue_head(&rdp->nocb_cb_wq);
	2251	+ init_swait_queue_head(&rdp->nocb_gp_wq);
2334	2252	raw_spin_lock_init(&rdp->nocb_lock);
	2253	+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
	2254	+ raw_spin_lock_init(&rdp->nocb_gp_lock);
2335	2255	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
	2256	+ timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
	2257	+ rcu_cblist_init(&rdp->nocb_bypass);
2336	2258	}
2337	2259
2338	2260	/*
2339	2261	* If the specified CPU is a no-CBs CPU that does not already have its
2340		- * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are
2341		- * brought online out of order, this can require re-organizing the
2342		- * leader-follower relationships.
	2262	+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
	2263	+ * for this CPU's group has not yet been created, spawn it as well.
2343	2264	*/
2344		-static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
	2265	+static void rcu_spawn_one_nocb_kthread(int cpu)
2345	2266	{
2346		- struct rcu_data *rdp;
2347		- struct rcu_data *rdp_last;
2348		- struct rcu_data *rdp_old_leader;
2349		- struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu);
	2267	+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
	2268	+ struct rcu_data *rdp_gp;
2350	2269	struct task_struct *t;
2351	2270
2352	2271	/*
2353	2272	* If this isn't a no-CBs CPU or if it already has an rcuo kthread,
2354	2273	* then nothing to do.
2355	2274	*/
2356		- if (!rcu_is_nocb_cpu(cpu) \|\| rdp_spawn->nocb_kthread)
	2275	+ if (!rcu_is_nocb_cpu(cpu) \|\| rdp->nocb_cb_kthread)
2357	2276	return;
2358	2277
2359		- /* If we didn't spawn the leader first, reorganize! */
2360		- rdp_old_leader = rdp_spawn->nocb_leader;
2361		- if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
2362		- rdp_last = NULL;
2363		- rdp = rdp_old_leader;
2364		- do {
2365		- rdp->nocb_leader = rdp_spawn;
2366		- if (rdp_last && rdp != rdp_spawn)
2367		- rdp_last->nocb_next_follower = rdp;
2368		- if (rdp == rdp_spawn) {
2369		- rdp = rdp->nocb_next_follower;
2370		- } else {
2371		- rdp_last = rdp;
2372		- rdp = rdp->nocb_next_follower;
2373		- rdp_last->nocb_next_follower = NULL;
2374		- }
2375		- } while (rdp);
2376		- rdp_spawn->nocb_next_follower = rdp_old_leader;
	2278	+ /* If we didn't spawn the GP kthread first, reorganize! */
	2279	+ rdp_gp = rdp->nocb_gp_rdp;
	2280	+ if (!rdp_gp->nocb_gp_kthread) {
	2281	+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
	2282	+ "rcuog/%d", rdp_gp->cpu);
	2283	+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
	2284	+ return;
	2285	+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
2377	2286	}
2378	2287
2379		- /* Spawn the kthread for this CPU and RCU flavor. */
2380		- t = kthread_run(rcu_nocb_kthread, rdp_spawn,
2381		- "rcuo%c/%d", rsp->abbr, cpu);
2382		- BUG_ON(IS_ERR(t));
2383		- WRITE_ONCE(rdp_spawn->nocb_kthread, t);
	2288	+ /* Spawn the kthread for this CPU. */
	2289	+ t = kthread_run(rcu_nocb_cb_kthread, rdp,
	2290	+ "rcuo%c/%d", rcu_state.abbr, cpu);
	2291	+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
	2292	+ return;
	2293	+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
	2294	+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
2384	2295	}
2385	2296
2386	2297	/*
2387	2298	* If the specified CPU is a no-CBs CPU that does not already have its
2388		- * rcuo kthreads, spawn them.
	2299	+ * rcuo kthread, spawn it.
2389	2300	*/
2390		-static void rcu_spawn_all_nocb_kthreads(int cpu)
	2301	+static void rcu_spawn_cpu_nocb_kthread(int cpu)
2391	2302	{
2392		- struct rcu_state *rsp;
2393		-
2394	2303	if (rcu_scheduler_fully_active)
2395		- for_each_rcu_flavor(rsp)
2396		- rcu_spawn_one_nocb_kthread(rsp, cpu);
	2304	+ rcu_spawn_one_nocb_kthread(cpu);
2397	2305	}
2398	2306
2399	2307	/*
..	..	@@ -2407,30 +2315,33 @@
2407	2315	int cpu;
2408	2316
2409	2317	for_each_online_cpu(cpu)
2410		- rcu_spawn_all_nocb_kthreads(cpu);
	2318	+ rcu_spawn_cpu_nocb_kthread(cpu);
2411	2319	}
2412	2320
2413		-/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
2414		-static int rcu_nocb_leader_stride = -1;
2415		-module_param(rcu_nocb_leader_stride, int, 0444);
	2321	+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
	2322	+static int rcu_nocb_gp_stride = -1;
	2323	+module_param(rcu_nocb_gp_stride, int, 0444);
2416	2324
2417	2325	/*
2418		- * Initialize leader-follower relationships for all no-CBs CPU.
	2326	+ * Initialize GP-CB relationships for all no-CBs CPU.
2419	2327	*/
2420		-static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
	2328	+static void __init rcu_organize_nocb_kthreads(void)
2421	2329	{
2422	2330	int cpu;
2423		- int ls = rcu_nocb_leader_stride;
2424		- int nl = 0; /* Next leader. */
	2331	+ bool firsttime = true;
	2332	+ bool gotnocbs = false;
	2333	+ bool gotnocbscbs = true;
	2334	+ int ls = rcu_nocb_gp_stride;
	2335	+ int nl = 0; /* Next GP kthread. */
2425	2336	struct rcu_data *rdp;
2426		- struct rcu_data rdp_leader = NULL; / Suppress misguided gcc warn. */
	2337	+ struct rcu_data rdp_gp = NULL; / Suppress misguided gcc warn. */
2427	2338	struct rcu_data *rdp_prev = NULL;
2428	2339
2429	2340	if (!cpumask_available(rcu_nocb_mask))
2430	2341	return;
2431	2342	if (ls == -1) {
2432		- ls = int_sqrt(nr_cpu_ids);
2433		- rcu_nocb_leader_stride = ls;
	2343	+ ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
	2344	+ rcu_nocb_gp_stride = ls;
2434	2345	}
2435	2346
2436	2347	/*
..	..	@@ -2439,47 +2350,142 @@
2439	2350	* we will spawn the needed set of rcu_nocb_kthread() kthreads.
2440	2351	*/
2441	2352	for_each_cpu(cpu, rcu_nocb_mask) {
2442		- rdp = per_cpu_ptr(rsp->rda, cpu);
	2353	+ rdp = per_cpu_ptr(&rcu_data, cpu);
2443	2354	if (rdp->cpu >= nl) {
2444		- /* New leader, set up for followers & next leader. */
	2355	+ /* New GP kthread, set up for CBs & next GP. */
	2356	+ gotnocbs = true;
2445	2357	nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
2446		- rdp->nocb_leader = rdp;
2447		- rdp_leader = rdp;
	2358	+ rdp->nocb_gp_rdp = rdp;
	2359	+ rdp_gp = rdp;
	2360	+ if (dump_tree) {
	2361	+ if (!firsttime)
	2362	+ pr_cont("%s\n", gotnocbscbs
	2363	+ ? "" : " (self only)");
	2364	+ gotnocbscbs = false;
	2365	+ firsttime = false;
	2366	+ pr_alert("%s: No-CB GP kthread CPU %d:",
	2367	+ __func__, cpu);
	2368	+ }
2448	2369	} else {
2449		- /* Another follower, link to previous leader. */
2450		- rdp->nocb_leader = rdp_leader;
2451		- rdp_prev->nocb_next_follower = rdp;
	2370	+ /* Another CB kthread, link to previous GP kthread. */
	2371	+ gotnocbscbs = true;
	2372	+ rdp->nocb_gp_rdp = rdp_gp;
	2373	+ rdp_prev->nocb_next_cb_rdp = rdp;
	2374	+ if (dump_tree)
	2375	+ pr_cont(" %d", cpu);
2452	2376	}
2453	2377	rdp_prev = rdp;
2454	2378	}
	2379	+ if (gotnocbs && dump_tree)
	2380	+ pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
2455	2381	}
2456	2382
2457		-/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2458		-static bool init_nocb_callback_list(struct rcu_data *rdp)
	2383	+/*
	2384	+ * Bind the current task to the offloaded CPUs. If there are no offloaded
	2385	+ * CPUs, leave the task unbound. Splat if the bind attempt fails.
	2386	+ */
	2387	+void rcu_bind_current_to_nocb(void)
2459	2388	{
2460		- if (!rcu_is_nocb_cpu(rdp->cpu))
2461		- return false;
	2389	+ if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
	2390	+ WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
	2391	+}
	2392	+EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
2462	2393
2463		- /* If there are early-boot callbacks, move them to nocb lists. */
2464		- if (!rcu_segcblist_empty(&rdp->cblist)) {
2465		- rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
2466		- rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
2467		- atomic_long_set(&rdp->nocb_q_count,
2468		- rcu_segcblist_n_cbs(&rdp->cblist));
2469		- atomic_long_set(&rdp->nocb_q_count_lazy,
2470		- rcu_segcblist_n_lazy_cbs(&rdp->cblist));
2471		- rcu_segcblist_init(&rdp->cblist);
2472		- }
2473		- rcu_segcblist_disable(&rdp->cblist);
2474		- return true;
	2394	+/*
	2395	+ * Dump out nocb grace-period kthread state for the specified rcu_data
	2396	+ * structure.
	2397	+ */
	2398	+static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
	2399	+{
	2400	+ struct rcu_node *rnp = rdp->mynode;
	2401	+
	2402	+ pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
	2403	+ rdp->cpu,
	2404	+ "kK"[!!rdp->nocb_gp_kthread],
	2405	+ "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
	2406	+ "dD"[!!rdp->nocb_defer_wakeup],
	2407	+ "tT"[timer_pending(&rdp->nocb_timer)],
	2408	+ "bB"[timer_pending(&rdp->nocb_bypass_timer)],
	2409	+ "sS"[!!rdp->nocb_gp_sleep],
	2410	+ ".W"[swait_active(&rdp->nocb_gp_wq)],
	2411	+ ".W"[swait_active(&rnp->nocb_gp_wq[0])],
	2412	+ ".W"[swait_active(&rnp->nocb_gp_wq[1])],
	2413	+ ".B"[!!rdp->nocb_gp_bypass],
	2414	+ ".G"[!!rdp->nocb_gp_gp],
	2415	+ (long)rdp->nocb_gp_seq,
	2416	+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
	2417	+}
	2418	+
	2419	+/* Dump out nocb kthread state for the specified rcu_data structure. */
	2420	+static void show_rcu_nocb_state(struct rcu_data *rdp)
	2421	+{
	2422	+ struct rcu_segcblist *rsclp = &rdp->cblist;
	2423	+ bool waslocked;
	2424	+ bool wastimer;
	2425	+ bool wassleep;
	2426	+
	2427	+ if (rdp->nocb_gp_rdp == rdp)
	2428	+ show_rcu_nocb_gp_state(rdp);
	2429	+
	2430	+ pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
	2431	+ rdp->cpu, rdp->nocb_gp_rdp->cpu,
	2432	+ "kK"[!!rdp->nocb_cb_kthread],
	2433	+ "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
	2434	+ "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
	2435	+ "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
	2436	+ "sS"[!!rdp->nocb_cb_sleep],
	2437	+ ".W"[swait_active(&rdp->nocb_cb_wq)],
	2438	+ jiffies - rdp->nocb_bypass_first,
	2439	+ jiffies - rdp->nocb_nobypass_last,
	2440	+ rdp->nocb_nobypass_count,
	2441	+ ".D"[rcu_segcblist_ready_cbs(rsclp)],
	2442	+ ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
	2443	+ ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
	2444	+ ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
	2445	+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
	2446	+ rcu_segcblist_n_cbs(&rdp->cblist));
	2447	+
	2448	+ /* It is OK for GP kthreads to have GP state. */
	2449	+ if (rdp->nocb_gp_rdp == rdp)
	2450	+ return;
	2451	+
	2452	+ waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
	2453	+ wastimer = timer_pending(&rdp->nocb_bypass_timer);
	2454	+ wassleep = swait_active(&rdp->nocb_gp_wq);
	2455	+ if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep)
	2456	+ return; /* Nothing untowards. */
	2457	+
	2458	+ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n",
	2459	+ "lL"[waslocked],
	2460	+ "dD"[!!rdp->nocb_defer_wakeup],
	2461	+ "tT"[wastimer],
	2462	+ "sS"[!!rdp->nocb_gp_sleep],
	2463	+ ".W"[wassleep]);
2475	2464	}
2476	2465
2477	2466	#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2478	2467
/* No ->nocb_lock to acquire.  (Stub for !CONFIG_RCU_NOCB_CPU builds.) */
static void rcu_nocb_lock(struct rcu_data *rdp)
{
}
	2472	+
	2473	+/* No ->nocb_lock to release. */
	2474	+static void rcu_nocb_unlock(struct rcu_data *rdp)
	2475	+{
	2476	+}
	2477	+
	2478	+/* No ->nocb_lock to release. */
	2479	+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
	2480	+ unsigned long flags)
	2481	+{
	2482	+ local_irq_restore(flags);
	2483	+}
	2484	+
	2485	+/* Lockdep check that ->cblist may be safely accessed. */
	2486	+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
	2487	+{
	2488	+ lockdep_assert_irqs_disabled();
2483	2489	}
2484	2490
2485	2491	static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
..	..	@@ -2495,17 +2501,22 @@
2495	2501	{
2496	2502	}
2497	2503
2498		-static bool __call_rcu_nocb(struct rcu_data rdp, struct rcu_head rhp,
2499		- bool lazy, unsigned long flags)
	2504	+static bool rcu_nocb_flush_bypass(struct rcu_data rdp, struct rcu_head rhp,
	2505	+ unsigned long j)
	2506	+{
	2507	+ return true;
	2508	+}
	2509	+
	2510	+static bool rcu_nocb_try_bypass(struct rcu_data rdp, struct rcu_head rhp,
	2511	+ bool *was_alldone, unsigned long flags)
2500	2512	{
2501	2513	return false;
2502	2514	}
2503	2515
2504		-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
2505		- struct rcu_data *rdp,
2506		- unsigned long flags)
	2516	+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
	2517	+ unsigned long flags)
2507	2518	{
2508		- return false;
	2519	+ WARN_ON_ONCE(1); /* Should be dead code! */
2509	2520	}
2510	2521
2511	2522	static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
..	..	@@ -2521,7 +2532,7 @@
2521	2532	{
2522	2533	}
2523	2534
/* Stub for !CONFIG_RCU_NOCB_CPU builds: there are no rcuo kthreads to spawn. */
static void rcu_spawn_cpu_nocb_kthread(int cpu)
{
}
2527	2538
..	..	@@ -2529,9 +2540,8 @@
2529	2540	{
2530	2541	}
2531	2542
/* Stub for !CONFIG_RCU_NOCB_CPU builds: there is no no-CBs state to print. */
static void show_rcu_nocb_state(struct rcu_data *rdp)
{
}
2536	2546
2537	2547	#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
..	..	@@ -2545,12 +2555,12 @@
2545	2555	* This code relies on the fact that all NO_HZ_FULL CPUs are also
2546	2556	* CONFIG_RCU_NOCB_CPU CPUs.
2547	2557	*/
2548		-static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
	2558	+static bool rcu_nohz_full_cpu(void)
2549	2559	{
2550	2560	#ifdef CONFIG_NO_HZ_FULL
2551	2561	if (tick_nohz_full_cpu(smp_processor_id()) &&
2552		- (!rcu_gp_in_progress(rsp) \|\|
2553		- ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ)))
	2562	+ (!rcu_gp_in_progress() \|\|
	2563	+ time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ)))
2554	2564	return true;
2555	2565	#endif /* #ifdef CONFIG_NO_HZ_FULL */
2556	2566	return false;
..	..	@@ -2567,7 +2577,7 @@
2567	2577	}
2568	2578
2569	2579	/* Record the current task on dyntick-idle entry. */
2570		-static void rcu_dynticks_task_enter(void)
	2580	+static __always_inline void rcu_dynticks_task_enter(void)
2571	2581	{
2572	2582	#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
2573	2583	WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
..	..	@@ -2575,9 +2585,27 @@
2575	2585	}
2576	2586
2577	2587	/* Record no current task on dyntick-idle exit. */
2578		-static void rcu_dynticks_task_exit(void)
	2588	+static __always_inline void rcu_dynticks_task_exit(void)
2579	2589	{
2580	2590	#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
2581	2591	WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
2582	2592	#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
2583	2593	}
	2594	+
	2595	+/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
	2596	+static __always_inline void rcu_dynticks_task_trace_enter(void)
	2597	+{
	2598	+#ifdef CONFIG_TASKS_TRACE_RCU
	2599	+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
	2600	+ current->trc_reader_special.b.need_mb = true;
	2601	+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
	2602	+}
	2603	+
	2604	+/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
	2605	+static __always_inline void rcu_dynticks_task_trace_exit(void)
	2606	+{
	2607	+#ifdef CONFIG_TASKS_TRACE_RCU
	2608	+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
	2609	+ current->trc_reader_special.b.need_mb = false;
	2610	+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
	2611	+}