hc
2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/block/blk-cgroup.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * Common Block IO controller cgroup interface
34 *
....@@ -28,7 +29,9 @@
2829 #include <linux/ctype.h>
2930 #include <linux/blk-cgroup.h>
3031 #include <linux/tracehook.h>
32
+#include <linux/psi.h>
3133 #include "blk.h"
34
+#include "blk-ioprio.h"
3235
3336 #define MAX_KEY_LEN 100
3437
....@@ -46,12 +49,14 @@
4649 EXPORT_SYMBOL_GPL(blkcg_root);
4750
4851 struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
52
+EXPORT_SYMBOL_GPL(blkcg_root_css);
4953
5054 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
5155
5256 static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
5357
54
-static bool blkcg_debug_stats = false;
58
+bool blkcg_debug_stats = false;
59
+static struct workqueue_struct *blkcg_punt_bio_wq;
5560
5661 static bool blkcg_policy_enabled(struct request_queue *q,
5762 const struct blkcg_policy *pol)
....@@ -76,12 +81,63 @@
7681 if (blkg->pd[i])
7782 blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
7883
79
- if (blkg->blkcg != &blkcg_root)
80
- blk_exit_rl(blkg->q, &blkg->rl);
81
-
82
- blkg_rwstat_exit(&blkg->stat_ios);
83
- blkg_rwstat_exit(&blkg->stat_bytes);
84
+ free_percpu(blkg->iostat_cpu);
85
+ percpu_ref_exit(&blkg->refcnt);
8486 kfree(blkg);
87
+}
88
+
89
+static void __blkg_release(struct rcu_head *rcu)
90
+{
91
+ struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
92
+
93
+ WARN_ON(!bio_list_empty(&blkg->async_bios));
94
+
95
+ /* release the blkcg and parent blkg refs this blkg has been holding */
96
+ css_put(&blkg->blkcg->css);
97
+ if (blkg->parent)
98
+ blkg_put(blkg->parent);
99
+ blkg_free(blkg);
100
+}
101
+
102
+/*
103
+ * A group is RCU protected, but having an rcu lock does not mean that one
104
+ * can access all the fields of blkg and assume these are valid. For
105
+ * example, don't try to follow throtl_data and request queue links.
106
+ *
107
+ * Having a reference to blkg under an rcu allows accesses to only values
108
+ * local to groups like group stats and group rate limits.
109
+ */
110
+static void blkg_release(struct percpu_ref *ref)
111
+{
112
+ struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
113
+
114
+ call_rcu(&blkg->rcu_head, __blkg_release);
115
+}
116
+
117
+static void blkg_async_bio_workfn(struct work_struct *work)
118
+{
119
+ struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
120
+ async_bio_work);
121
+ struct bio_list bios = BIO_EMPTY_LIST;
122
+ struct bio *bio;
123
+ struct blk_plug plug;
124
+ bool need_plug = false;
125
+
126
+ /* as long as there are pending bios, @blkg can't go away */
127
+ spin_lock_bh(&blkg->async_bio_lock);
128
+ bio_list_merge(&bios, &blkg->async_bios);
129
+ bio_list_init(&blkg->async_bios);
130
+ spin_unlock_bh(&blkg->async_bio_lock);
131
+
132
+ /* start plug only when bio_list contains at least 2 bios */
133
+ if (bios.head && bios.head->bi_next) {
134
+ need_plug = true;
135
+ blk_start_plug(&plug);
136
+ }
137
+ while ((bio = bio_list_pop(&bios)))
138
+ submit_bio(bio);
139
+ if (need_plug)
140
+ blk_finish_plug(&plug);
85141 }
86142
87143 /**
....@@ -96,28 +152,30 @@
96152 gfp_t gfp_mask)
97153 {
98154 struct blkcg_gq *blkg;
99
- int i;
155
+ int i, cpu;
100156
101157 /* alloc and init base part */
102158 blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
103159 if (!blkg)
104160 return NULL;
105161
106
- if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
107
- blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
162
+ if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
163
+ goto err_free;
164
+
165
+ blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
166
+ if (!blkg->iostat_cpu)
108167 goto err_free;
109168
110169 blkg->q = q;
111170 INIT_LIST_HEAD(&blkg->q_node);
171
+ spin_lock_init(&blkg->async_bio_lock);
172
+ bio_list_init(&blkg->async_bios);
173
+ INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
112174 blkg->blkcg = blkcg;
113
- atomic_set(&blkg->refcnt, 1);
114175
115
- /* root blkg uses @q->root_rl, init rl only for !root blkgs */
116
- if (blkcg != &blkcg_root) {
117
- if (blk_init_rl(&blkg->rl, q, gfp_mask))
118
- goto err_free;
119
- blkg->rl.blkg = blkg;
120
- }
176
+ u64_stats_init(&blkg->iostat.sync);
177
+ for_each_possible_cpu(cpu)
178
+ u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
121179
122180 for (i = 0; i < BLKCG_MAX_POLS; i++) {
123181 struct blkcg_policy *pol = blkcg_policy[i];
....@@ -127,7 +185,7 @@
127185 continue;
128186
129187 /* alloc per-policy data and attach it to blkg */
130
- pd = pol->pd_alloc_fn(gfp_mask, q->node);
188
+ pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
131189 if (!pd)
132190 goto err_free;
133191
....@@ -157,7 +215,7 @@
157215 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
158216 if (blkg && blkg->q == q) {
159217 if (update_hint) {
160
- lockdep_assert_held(q->queue_lock);
218
+ lockdep_assert_held(&q->queue_lock);
161219 rcu_assign_pointer(blkcg->blkg_hint, blkg);
162220 }
163221 return blkg;
....@@ -176,11 +234,16 @@
176234 struct blkcg_gq *new_blkg)
177235 {
178236 struct blkcg_gq *blkg;
179
- struct bdi_writeback_congested *wb_congested;
180237 int i, ret;
181238
182239 WARN_ON_ONCE(!rcu_read_lock_held());
183
- lockdep_assert_held(q->queue_lock);
240
+ lockdep_assert_held(&q->queue_lock);
241
+
242
+ /* request_queue is dying, do not create/recreate a blkg */
243
+ if (blk_queue_dying(q)) {
244
+ ret = -ENODEV;
245
+ goto err_free_blkg;
246
+ }
184247
185248 /* blkg holds a reference to blkcg */
186249 if (!css_tryget_online(&blkcg->css)) {
....@@ -188,31 +251,22 @@
188251 goto err_free_blkg;
189252 }
190253
191
- wb_congested = wb_congested_get_create(q->backing_dev_info,
192
- blkcg->css.id,
193
- GFP_NOWAIT | __GFP_NOWARN);
194
- if (!wb_congested) {
195
- ret = -ENOMEM;
196
- goto err_put_css;
197
- }
198
-
199254 /* allocate */
200255 if (!new_blkg) {
201256 new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
202257 if (unlikely(!new_blkg)) {
203258 ret = -ENOMEM;
204
- goto err_put_congested;
259
+ goto err_put_css;
205260 }
206261 }
207262 blkg = new_blkg;
208
- blkg->wb_congested = wb_congested;
209263
210264 /* link parent */
211265 if (blkcg_parent(blkcg)) {
212266 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
213267 if (WARN_ON_ONCE(!blkg->parent)) {
214268 ret = -ENODEV;
215
- goto err_put_congested;
269
+ goto err_put_css;
216270 }
217271 blkg_get(blkg->parent);
218272 }
....@@ -249,8 +303,6 @@
249303 blkg_put(blkg);
250304 return ERR_PTR(ret);
251305
252
-err_put_congested:
253
- wb_congested_put(wb_congested);
254306 err_put_css:
255307 css_put(&blkcg->css);
256308 err_free_blkg:
....@@ -266,57 +318,69 @@
266318 * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to
267319 * create one. blkg creation is performed recursively from blkcg_root such
268320 * that all non-root blkg's have access to the parent blkg. This function
269
- * should be called under RCU read lock and @q->queue_lock.
321
+ * should be called under RCU read lock and takes @q->queue_lock.
270322 *
271
- * Returns pointer to the looked up or created blkg on success, ERR_PTR()
272
- * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not
273
- * dead and bypassing, returns ERR_PTR(-EBUSY).
323
+ * Returns the blkg or the closest blkg if blkg_create() fails as it walks
324
+ * down from root.
274325 */
275
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
276
- struct request_queue *q)
326
+static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
327
+ struct request_queue *q)
277328 {
278329 struct blkcg_gq *blkg;
330
+ unsigned long flags;
279331
280332 WARN_ON_ONCE(!rcu_read_lock_held());
281
- lockdep_assert_held(q->queue_lock);
282333
283
- /*
284
- * This could be the first entry point of blkcg implementation and
285
- * we shouldn't allow anything to go through for a bypassing queue.
286
- */
287
- if (unlikely(blk_queue_bypass(q)))
288
- return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
289
-
290
- blkg = __blkg_lookup(blkcg, q, true);
334
+ blkg = blkg_lookup(blkcg, q);
291335 if (blkg)
292336 return blkg;
293337
338
+ spin_lock_irqsave(&q->queue_lock, flags);
339
+ blkg = __blkg_lookup(blkcg, q, true);
340
+ if (blkg)
341
+ goto found;
342
+
294343 /*
295344 * Create blkgs walking down from blkcg_root to @blkcg, so that all
296
- * non-root blkgs have access to their parents.
345
+ * non-root blkgs have access to their parents. Returns the closest
346
+ * blkg to the intended blkg should blkg_create() fail.
297347 */
298348 while (true) {
299349 struct blkcg *pos = blkcg;
300350 struct blkcg *parent = blkcg_parent(blkcg);
351
+ struct blkcg_gq *ret_blkg = q->root_blkg;
301352
302
- while (parent && !__blkg_lookup(parent, q, false)) {
353
+ while (parent) {
354
+ blkg = __blkg_lookup(parent, q, false);
355
+ if (blkg) {
356
+ /* remember closest blkg */
357
+ ret_blkg = blkg;
358
+ break;
359
+ }
303360 pos = parent;
304361 parent = blkcg_parent(parent);
305362 }
306363
307364 blkg = blkg_create(pos, q, NULL);
308
- if (pos == blkcg || IS_ERR(blkg))
309
- return blkg;
365
+ if (IS_ERR(blkg)) {
366
+ blkg = ret_blkg;
367
+ break;
368
+ }
369
+ if (pos == blkcg)
370
+ break;
310371 }
372
+
373
+found:
374
+ spin_unlock_irqrestore(&q->queue_lock, flags);
375
+ return blkg;
311376 }
312377
313378 static void blkg_destroy(struct blkcg_gq *blkg)
314379 {
315380 struct blkcg *blkcg = blkg->blkcg;
316
- struct blkcg_gq *parent = blkg->parent;
317381 int i;
318382
319
- lockdep_assert_held(blkg->q->queue_lock);
383
+ lockdep_assert_held(&blkg->q->queue_lock);
320384 lockdep_assert_held(&blkcg->lock);
321385
322386 /* Something wrong if we are trying to remove same group twice */
....@@ -328,11 +392,6 @@
328392
329393 if (blkg->pd[i] && pol->pd_offline_fn)
330394 pol->pd_offline_fn(blkg->pd[i]);
331
- }
332
-
333
- if (parent) {
334
- blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
335
- blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
336395 }
337396
338397 blkg->online = false;
....@@ -353,7 +412,7 @@
353412 * Put the reference taken at the time of creation so that when all
354413 * queues are gone, group can be destroyed.
355414 */
356
- blkg_put(blkg);
415
+ percpu_ref_kill(&blkg->refcnt);
357416 }
358417
359418 /**
....@@ -366,8 +425,7 @@
366425 {
367426 struct blkcg_gq *blkg, *n;
368427
369
- lockdep_assert_held(q->queue_lock);
370
-
428
+ spin_lock_irq(&q->queue_lock);
371429 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
372430 struct blkcg *blkcg = blkg->blkcg;
373431
....@@ -377,65 +435,7 @@
377435 }
378436
379437 q->root_blkg = NULL;
380
- q->root_rl.blkg = NULL;
381
-}
382
-
383
-/*
384
- * A group is RCU protected, but having an rcu lock does not mean that one
385
- * can access all the fields of blkg and assume these are valid. For
386
- * example, don't try to follow throtl_data and request queue links.
387
- *
388
- * Having a reference to blkg under an rcu allows accesses to only values
389
- * local to groups like group stats and group rate limits.
390
- */
391
-void __blkg_release_rcu(struct rcu_head *rcu_head)
392
-{
393
- struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
394
-
395
- /* release the blkcg and parent blkg refs this blkg has been holding */
396
- css_put(&blkg->blkcg->css);
397
- if (blkg->parent)
398
- blkg_put(blkg->parent);
399
-
400
- wb_congested_put(blkg->wb_congested);
401
-
402
- blkg_free(blkg);
403
-}
404
-EXPORT_SYMBOL_GPL(__blkg_release_rcu);
405
-
406
-/*
407
- * The next function used by blk_queue_for_each_rl(). It's a bit tricky
408
- * because the root blkg uses @q->root_rl instead of its own rl.
409
- */
410
-struct request_list *__blk_queue_next_rl(struct request_list *rl,
411
- struct request_queue *q)
412
-{
413
- struct list_head *ent;
414
- struct blkcg_gq *blkg;
415
-
416
- /*
417
- * Determine the current blkg list_head. The first entry is
418
- * root_rl which is off @q->blkg_list and mapped to the head.
419
- */
420
- if (rl == &q->root_rl) {
421
- ent = &q->blkg_list;
422
- /* There are no more block groups, hence no request lists */
423
- if (list_empty(ent))
424
- return NULL;
425
- } else {
426
- blkg = container_of(rl, struct blkcg_gq, rl);
427
- ent = &blkg->q_node;
428
- }
429
-
430
- /* walk to the next list_head, skip root blkcg */
431
- ent = ent->next;
432
- if (ent == &q->root_blkg->q_node)
433
- ent = ent->next;
434
- if (ent == &q->blkg_list)
435
- return NULL;
436
-
437
- blkg = container_of(ent, struct blkcg_gq, q_node);
438
- return &blkg->rl;
438
+ spin_unlock_irq(&q->queue_lock);
439439 }
440440
441441 static int blkcg_reset_stats(struct cgroup_subsys_state *css,
....@@ -443,7 +443,7 @@
443443 {
444444 struct blkcg *blkcg = css_to_blkcg(css);
445445 struct blkcg_gq *blkg;
446
- int i;
446
+ int i, cpu;
447447
448448 mutex_lock(&blkcg_pol_mutex);
449449 spin_lock_irq(&blkcg->lock);
....@@ -454,8 +454,12 @@
454454 * anyway. If you get hit by a race, retry.
455455 */
456456 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
457
- blkg_rwstat_reset(&blkg->stat_bytes);
458
- blkg_rwstat_reset(&blkg->stat_ios);
457
+ for_each_possible_cpu(cpu) {
458
+ struct blkg_iostat_set *bis =
459
+ per_cpu_ptr(blkg->iostat_cpu, cpu);
460
+ memset(bis, 0, sizeof(*bis));
461
+ }
462
+ memset(&blkg->iostat, 0, sizeof(blkg->iostat));
459463
460464 for (i = 0; i < BLKCG_MAX_POLS; i++) {
461465 struct blkcg_policy *pol = blkcg_policy[i];
....@@ -477,7 +481,6 @@
477481 return bdi_dev_name(blkg->q->backing_dev_info);
478482 return NULL;
479483 }
480
-EXPORT_SYMBOL_GPL(blkg_dev_name);
481484
482485 /**
483486 * blkcg_print_blkgs - helper for printing per-blkg data
....@@ -508,10 +511,10 @@
508511
509512 rcu_read_lock();
510513 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
511
- spin_lock_irq(blkg->q->queue_lock);
514
+ spin_lock_irq(&blkg->q->queue_lock);
512515 if (blkcg_policy_enabled(blkg->q, pol))
513516 total += prfill(sf, blkg->pd[pol->plid], data);
514
- spin_unlock_irq(blkg->q->queue_lock);
517
+ spin_unlock_irq(&blkg->q->queue_lock);
515518 }
516519 rcu_read_unlock();
517520
....@@ -540,262 +543,55 @@
540543 }
541544 EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
542545
543
-/**
544
- * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
545
- * @sf: seq_file to print to
546
- * @pd: policy private data of interest
547
- * @rwstat: rwstat to print
548
- *
549
- * Print @rwstat to @sf for the device assocaited with @pd.
550
- */
551
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
552
- const struct blkg_rwstat *rwstat)
553
-{
554
- static const char *rwstr[] = {
555
- [BLKG_RWSTAT_READ] = "Read",
556
- [BLKG_RWSTAT_WRITE] = "Write",
557
- [BLKG_RWSTAT_SYNC] = "Sync",
558
- [BLKG_RWSTAT_ASYNC] = "Async",
559
- [BLKG_RWSTAT_DISCARD] = "Discard",
560
- };
561
- const char *dname = blkg_dev_name(pd->blkg);
562
- u64 v;
563
- int i;
564
-
565
- if (!dname)
566
- return 0;
567
-
568
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
569
- seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
570
- (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
571
-
572
- v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
573
- atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
574
- atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
575
- seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
576
- return v;
577
-}
578
-EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
579
-
580
-/**
581
- * blkg_prfill_stat - prfill callback for blkg_stat
582
- * @sf: seq_file to print to
583
- * @pd: policy private data of interest
584
- * @off: offset to the blkg_stat in @pd
585
- *
586
- * prfill callback for printing a blkg_stat.
587
- */
588
-u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
589
-{
590
- return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
591
-}
592
-EXPORT_SYMBOL_GPL(blkg_prfill_stat);
593
-
594
-/**
595
- * blkg_prfill_rwstat - prfill callback for blkg_rwstat
596
- * @sf: seq_file to print to
597
- * @pd: policy private data of interest
598
- * @off: offset to the blkg_rwstat in @pd
599
- *
600
- * prfill callback for printing a blkg_rwstat.
601
- */
602
-u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
603
- int off)
604
-{
605
- struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
606
-
607
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
608
-}
609
-EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
610
-
611
-static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
612
- struct blkg_policy_data *pd, int off)
613
-{
614
- struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
615
-
616
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
617
-}
618
-
619
-/**
620
- * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
621
- * @sf: seq_file to print to
622
- * @v: unused
623
- *
624
- * To be used as cftype->seq_show to print blkg->stat_bytes.
625
- * cftype->private must be set to the blkcg_policy.
626
- */
627
-int blkg_print_stat_bytes(struct seq_file *sf, void *v)
628
-{
629
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
630
- blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
631
- offsetof(struct blkcg_gq, stat_bytes), true);
632
- return 0;
633
-}
634
-EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
635
-
636
-/**
637
- * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios
638
- * @sf: seq_file to print to
639
- * @v: unused
640
- *
641
- * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private
642
- * must be set to the blkcg_policy.
643
- */
644
-int blkg_print_stat_ios(struct seq_file *sf, void *v)
645
-{
646
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
647
- blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
648
- offsetof(struct blkcg_gq, stat_ios), true);
649
- return 0;
650
-}
651
-EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
652
-
653
-static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
654
- struct blkg_policy_data *pd,
655
- int off)
656
-{
657
- struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
658
- NULL, off);
659
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
660
-}
661
-
662
-/**
663
- * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
664
- * @sf: seq_file to print to
665
- * @v: unused
666
- */
667
-int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
668
-{
669
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
670
- blkg_prfill_rwstat_field_recursive,
671
- (void *)seq_cft(sf)->private,
672
- offsetof(struct blkcg_gq, stat_bytes), true);
673
- return 0;
674
-}
675
-EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
676
-
677
-/**
678
- * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
679
- * @sf: seq_file to print to
680
- * @v: unused
681
- */
682
-int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
683
-{
684
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
685
- blkg_prfill_rwstat_field_recursive,
686
- (void *)seq_cft(sf)->private,
687
- offsetof(struct blkcg_gq, stat_ios), true);
688
- return 0;
689
-}
690
-EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
691
-
692
-/**
693
- * blkg_stat_recursive_sum - collect hierarchical blkg_stat
694
- * @blkg: blkg of interest
695
- * @pol: blkcg_policy which contains the blkg_stat
696
- * @off: offset to the blkg_stat in blkg_policy_data or @blkg
697
- *
698
- * Collect the blkg_stat specified by @blkg, @pol and @off and all its
699
- * online descendants and their aux counts. The caller must be holding the
700
- * queue lock for online tests.
701
- *
702
- * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
703
- * at @off bytes into @blkg's blkg_policy_data of the policy.
704
- */
705
-u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
706
- struct blkcg_policy *pol, int off)
707
-{
708
- struct blkcg_gq *pos_blkg;
709
- struct cgroup_subsys_state *pos_css;
710
- u64 sum = 0;
711
-
712
- lockdep_assert_held(blkg->q->queue_lock);
713
-
714
- rcu_read_lock();
715
- blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
716
- struct blkg_stat *stat;
717
-
718
- if (!pos_blkg->online)
719
- continue;
720
-
721
- if (pol)
722
- stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
723
- else
724
- stat = (void *)blkg + off;
725
-
726
- sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
727
- }
728
- rcu_read_unlock();
729
-
730
- return sum;
731
-}
732
-EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
733
-
734
-/**
735
- * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
736
- * @blkg: blkg of interest
737
- * @pol: blkcg_policy which contains the blkg_rwstat
738
- * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
739
- *
740
- * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
741
- * online descendants and their aux counts. The caller must be holding the
742
- * queue lock for online tests.
743
- *
744
- * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
745
- * is at @off bytes into @blkg's blkg_policy_data of the policy.
746
- */
747
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
748
- struct blkcg_policy *pol, int off)
749
-{
750
- struct blkcg_gq *pos_blkg;
751
- struct cgroup_subsys_state *pos_css;
752
- struct blkg_rwstat sum = { };
753
- int i;
754
-
755
- lockdep_assert_held(blkg->q->queue_lock);
756
-
757
- rcu_read_lock();
758
- blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
759
- struct blkg_rwstat *rwstat;
760
-
761
- if (!pos_blkg->online)
762
- continue;
763
-
764
- if (pol)
765
- rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
766
- else
767
- rwstat = (void *)pos_blkg + off;
768
-
769
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
770
- atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
771
- percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
772
- &sum.aux_cnt[i]);
773
- }
774
- rcu_read_unlock();
775
-
776
- return sum;
777
-}
778
-EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
779
-
780546 /* Performs queue bypass and policy enabled checks then looks up blkg. */
781547 static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
782548 const struct blkcg_policy *pol,
783549 struct request_queue *q)
784550 {
785551 WARN_ON_ONCE(!rcu_read_lock_held());
786
- lockdep_assert_held(q->queue_lock);
552
+ lockdep_assert_held(&q->queue_lock);
787553
788554 if (!blkcg_policy_enabled(q, pol))
789555 return ERR_PTR(-EOPNOTSUPP);
790
-
791
- /*
792
- * This could be the first entry point of blkcg implementation and
793
- * we shouldn't allow anything to go through for a bypassing queue.
794
- */
795
- if (unlikely(blk_queue_bypass(q)))
796
- return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
797
-
798556 return __blkg_lookup(blkcg, q, true /* update_hint */);
557
+}
558
+
559
+/**
560
+ * blkcg_conf_get_disk - parse and get disk from MAJ:MIN input
561
+ * @inputp: input string pointer
562
+ *
563
+ * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
564
+ * from @input and get and return the matching gendisk. *@inputp is
565
+ * updated to point past the device node prefix. Returns an ERR_PTR()
566
+ * value on error.
567
+ *
568
+ * Use this function iff blkg_conf_prep() can't be used for some reason.
569
+ */
570
+struct gendisk *blkcg_conf_get_disk(char **inputp)
571
+{
572
+ char *input = *inputp;
573
+ unsigned int major, minor;
574
+ struct gendisk *disk;
575
+ int key_len, part;
576
+
577
+ if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
578
+ return ERR_PTR(-EINVAL);
579
+
580
+ input += key_len;
581
+ if (!isspace(*input))
582
+ return ERR_PTR(-EINVAL);
583
+ input = skip_spaces(input);
584
+
585
+ disk = get_gendisk(MKDEV(major, minor), &part);
586
+ if (!disk)
587
+ return ERR_PTR(-ENODEV);
588
+ if (part) {
589
+ put_disk_and_module(disk);
590
+ return ERR_PTR(-ENODEV);
591
+ }
592
+
593
+ *inputp = input;
594
+ return disk;
799595 }
800596
801597 /**
....@@ -812,35 +608,21 @@
812608 */
813609 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
814610 char *input, struct blkg_conf_ctx *ctx)
815
- __acquires(rcu) __acquires(disk->queue->queue_lock)
611
+ __acquires(rcu) __acquires(&disk->queue->queue_lock)
816612 {
817613 struct gendisk *disk;
818614 struct request_queue *q;
819615 struct blkcg_gq *blkg;
820
- unsigned int major, minor;
821
- int key_len, part, ret;
822
- char *body;
616
+ int ret;
823617
824
- if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
825
- return -EINVAL;
826
-
827
- body = input + key_len;
828
- if (!isspace(*body))
829
- return -EINVAL;
830
- body = skip_spaces(body);
831
-
832
- disk = get_gendisk(MKDEV(major, minor), &part);
833
- if (!disk)
834
- return -ENODEV;
835
- if (part) {
836
- ret = -ENODEV;
837
- goto fail;
838
- }
618
+ disk = blkcg_conf_get_disk(&input);
619
+ if (IS_ERR(disk))
620
+ return PTR_ERR(disk);
839621
840622 q = disk->queue;
841623
842624 rcu_read_lock();
843
- spin_lock_irq(q->queue_lock);
625
+ spin_lock_irq(&q->queue_lock);
844626
845627 blkg = blkg_lookup_check(blkcg, pol, q);
846628 if (IS_ERR(blkg)) {
....@@ -867,7 +649,7 @@
867649 }
868650
869651 /* Drop locks to do new blkg allocation with GFP_KERNEL. */
870
- spin_unlock_irq(q->queue_lock);
652
+ spin_unlock_irq(&q->queue_lock);
871653 rcu_read_unlock();
872654
873655 new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
....@@ -883,7 +665,7 @@
883665 }
884666
885667 rcu_read_lock();
886
- spin_lock_irq(q->queue_lock);
668
+ spin_lock_irq(&q->queue_lock);
887669
888670 blkg = blkg_lookup_check(pos, pol, q);
889671 if (IS_ERR(blkg)) {
....@@ -896,7 +678,7 @@
896678 blkg_free(new_blkg);
897679 } else {
898680 blkg = blkg_create(pos, q, new_blkg);
899
- if (unlikely(IS_ERR(blkg))) {
681
+ if (IS_ERR(blkg)) {
900682 ret = PTR_ERR(blkg);
901683 goto fail_preloaded;
902684 }
....@@ -910,13 +692,13 @@
910692 success:
911693 ctx->disk = disk;
912694 ctx->blkg = blkg;
913
- ctx->body = body;
695
+ ctx->body = input;
914696 return 0;
915697
916698 fail_preloaded:
917699 radix_tree_preload_end();
918700 fail_unlock:
919
- spin_unlock_irq(q->queue_lock);
701
+ spin_unlock_irq(&q->queue_lock);
920702 rcu_read_unlock();
921703 fail:
922704 put_disk_and_module(disk);
....@@ -942,31 +724,159 @@
942724 * with blkg_conf_prep().
943725 */
944726 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
945
- __releases(ctx->disk->queue->queue_lock) __releases(rcu)
727
+ __releases(&ctx->disk->queue->queue_lock) __releases(rcu)
946728 {
947
- spin_unlock_irq(ctx->disk->queue->queue_lock);
729
+ spin_unlock_irq(&ctx->disk->queue->queue_lock);
948730 rcu_read_unlock();
949731 put_disk_and_module(ctx->disk);
950732 }
951733 EXPORT_SYMBOL_GPL(blkg_conf_finish);
734
+
735
+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
736
+{
737
+ int i;
738
+
739
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
740
+ dst->bytes[i] = src->bytes[i];
741
+ dst->ios[i] = src->ios[i];
742
+ }
743
+}
744
+
745
+static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
746
+{
747
+ int i;
748
+
749
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
750
+ dst->bytes[i] += src->bytes[i];
751
+ dst->ios[i] += src->ios[i];
752
+ }
753
+}
754
+
755
+static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
756
+{
757
+ int i;
758
+
759
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
760
+ dst->bytes[i] -= src->bytes[i];
761
+ dst->ios[i] -= src->ios[i];
762
+ }
763
+}
764
+
765
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
766
+{
767
+ struct blkcg *blkcg = css_to_blkcg(css);
768
+ struct blkcg_gq *blkg;
769
+
770
+ rcu_read_lock();
771
+
772
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
773
+ struct blkcg_gq *parent = blkg->parent;
774
+ struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
775
+ struct blkg_iostat cur, delta;
776
+ unsigned int seq;
777
+
778
+ /* fetch the current per-cpu values */
779
+ do {
780
+ seq = u64_stats_fetch_begin(&bisc->sync);
781
+ blkg_iostat_set(&cur, &bisc->cur);
782
+ } while (u64_stats_fetch_retry(&bisc->sync, seq));
783
+
784
+ /* propagate percpu delta to global */
785
+ u64_stats_update_begin(&blkg->iostat.sync);
786
+ blkg_iostat_set(&delta, &cur);
787
+ blkg_iostat_sub(&delta, &bisc->last);
788
+ blkg_iostat_add(&blkg->iostat.cur, &delta);
789
+ blkg_iostat_add(&bisc->last, &delta);
790
+ u64_stats_update_end(&blkg->iostat.sync);
791
+
792
+ /* propagate global delta to parent */
793
+ if (parent) {
794
+ u64_stats_update_begin(&parent->iostat.sync);
795
+ blkg_iostat_set(&delta, &blkg->iostat.cur);
796
+ blkg_iostat_sub(&delta, &blkg->iostat.last);
797
+ blkg_iostat_add(&parent->iostat.cur, &delta);
798
+ blkg_iostat_add(&blkg->iostat.last, &delta);
799
+ u64_stats_update_end(&parent->iostat.sync);
800
+ }
801
+ }
802
+
803
+ rcu_read_unlock();
804
+}
805
+
806
+/*
807
+ * The rstat algorithms intentionally don't handle the root cgroup to avoid
808
+ * incurring overhead when no cgroups are defined. For that reason,
809
+ * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the
810
+ * iostat in the root cgroup's blkcg_gq.
811
+ *
812
+ * However, we would like to re-use the printing code between the root and
813
+ * non-root cgroups to the extent possible. For that reason, we simulate
814
+ * flushing the root cgroup's stats by explicitly filling in the iostat
815
+ * with disk level statistics.
816
+ */
817
+static void blkcg_fill_root_iostats(void)
818
+{
819
+ struct class_dev_iter iter;
820
+ struct device *dev;
821
+
822
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
823
+ while ((dev = class_dev_iter_next(&iter))) {
824
+ struct gendisk *disk = dev_to_disk(dev);
825
+ struct hd_struct *part = disk_get_part(disk, 0);
826
+ struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue);
827
+ struct blkg_iostat tmp;
828
+ int cpu;
829
+
830
+ memset(&tmp, 0, sizeof(tmp));
831
+ for_each_possible_cpu(cpu) {
832
+ struct disk_stats *cpu_dkstats;
833
+
834
+ cpu_dkstats = per_cpu_ptr(part->dkstats, cpu);
835
+ tmp.ios[BLKG_IOSTAT_READ] +=
836
+ cpu_dkstats->ios[STAT_READ];
837
+ tmp.ios[BLKG_IOSTAT_WRITE] +=
838
+ cpu_dkstats->ios[STAT_WRITE];
839
+ tmp.ios[BLKG_IOSTAT_DISCARD] +=
840
+ cpu_dkstats->ios[STAT_DISCARD];
841
+ // convert sectors to bytes
842
+ tmp.bytes[BLKG_IOSTAT_READ] +=
843
+ cpu_dkstats->sectors[STAT_READ] << 9;
844
+ tmp.bytes[BLKG_IOSTAT_WRITE] +=
845
+ cpu_dkstats->sectors[STAT_WRITE] << 9;
846
+ tmp.bytes[BLKG_IOSTAT_DISCARD] +=
847
+ cpu_dkstats->sectors[STAT_DISCARD] << 9;
848
+
849
+ u64_stats_update_begin(&blkg->iostat.sync);
850
+ blkg_iostat_set(&blkg->iostat.cur, &tmp);
851
+ u64_stats_update_end(&blkg->iostat.sync);
852
+ }
853
+ disk_put_part(part);
854
+ }
855
+}
952856
953857 static int blkcg_print_stat(struct seq_file *sf, void *v)
954858 {
955859 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
956860 struct blkcg_gq *blkg;
957861
862
+ if (!seq_css(sf)->parent)
863
+ blkcg_fill_root_iostats();
864
+ else
865
+ cgroup_rstat_flush(blkcg->css.cgroup);
866
+
958867 rcu_read_lock();
959868
960869 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
870
+ struct blkg_iostat_set *bis = &blkg->iostat;
961871 const char *dname;
962872 char *buf;
963
- struct blkg_rwstat rwstat;
964873 u64 rbytes, wbytes, rios, wios, dbytes, dios;
965874 size_t size = seq_get_buf(sf, &buf), off = 0;
966875 int i;
967876 bool has_stats = false;
877
+ unsigned seq;
968878
969
- spin_lock_irq(blkg->q->queue_lock);
879
+ spin_lock_irq(&blkg->q->queue_lock);
970880
971881 if (!blkg->online)
972882 goto skip;
....@@ -983,17 +893,16 @@
983893 */
984894 off += scnprintf(buf+off, size-off, "%s ", dname);
985895
986
- rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
987
- offsetof(struct blkcg_gq, stat_bytes));
988
- rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
989
- wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
990
- dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
896
+ do {
897
+ seq = u64_stats_fetch_begin(&bis->sync);
991898
992
- rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
993
- offsetof(struct blkcg_gq, stat_ios));
994
- rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
995
- wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
996
- dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
899
+ rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
900
+ wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
901
+ dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
902
+ rios = bis->cur.ios[BLKG_IOSTAT_READ];
903
+ wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
904
+ dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
905
+ } while (u64_stats_fetch_retry(&bis->sync, seq));
997906
998907 if (rbytes || wbytes || rios || wios) {
999908 has_stats = true;
....@@ -1003,10 +912,7 @@
1003912 dbytes, dios);
1004913 }
1005914
1006
- if (!blkcg_debug_stats)
1007
- goto next;
1008
-
1009
- if (atomic_read(&blkg->use_delay)) {
915
+ if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
1010916 has_stats = true;
1011917 off += scnprintf(buf+off, size-off,
1012918 " use_delay=%d delay_nsec=%llu",
....@@ -1026,7 +932,7 @@
1026932 has_stats = true;
1027933 off += written;
1028934 }
1029
-next:
935
+
1030936 if (has_stats) {
1031937 if (off < size - 1) {
1032938 off += scnprintf(buf+off, size-off, "\n");
....@@ -1036,7 +942,7 @@
1036942 }
1037943 }
1038944 skip:
1039
- spin_unlock_irq(blkg->q->queue_lock);
945
+ spin_unlock_irq(&blkg->q->queue_lock);
1040946 }
1041947
1042948 rcu_read_unlock();
....@@ -1046,7 +952,6 @@
1046952 static struct cftype blkcg_files[] = {
1047953 {
1048954 .name = "stat",
1049
- .flags = CFTYPE_NOT_ON_ROOT,
1050955 .seq_show = blkcg_print_stat,
1051956 },
1052957 { } /* terminate */
....@@ -1096,8 +1001,8 @@
10961001 /* this prevents anyone from attaching or migrating to this blkcg */
10971002 wb_blkcg_offline(blkcg);
10981003
1099
- /* put the base cgwb reference allowing step 2 to be triggered */
1100
- blkcg_cgwb_put(blkcg);
1004
+ /* put the base online pin allowing step 2 to be triggered */
1005
+ blkcg_unpin_online(blkcg);
11011006 }
11021007
11031008 /**
....@@ -1113,6 +1018,8 @@
11131018 */
11141019 void blkcg_destroy_blkgs(struct blkcg *blkcg)
11151020 {
1021
+ might_sleep();
1022
+
11161023 spin_lock_irq(&blkcg->lock);
11171024
11181025 while (!hlist_empty(&blkcg->blkg_list)) {
....@@ -1120,14 +1027,20 @@
11201027 struct blkcg_gq, blkcg_node);
11211028 struct request_queue *q = blkg->q;
11221029
1123
- if (spin_trylock(q->queue_lock)) {
1124
- blkg_destroy(blkg);
1125
- spin_unlock(q->queue_lock);
1126
- } else {
1030
+ if (need_resched() || !spin_trylock(&q->queue_lock)) {
1031
+ /*
1032
+ * Given that the system can accumulate a huge number
1033
+ * of blkgs in pathological cases, check to see if we
1034
+ * need to reschedule to avoid softlockup.
1035
+ */
11271036 spin_unlock_irq(&blkcg->lock);
1128
- cpu_relax();
1037
+ cond_resched();
11291038 spin_lock_irq(&blkcg->lock);
1039
+ continue;
11301040 }
1041
+
1042
+ blkg_destroy(blkg);
1043
+ spin_unlock(&q->queue_lock);
11311044 }
11321045
11331046 spin_unlock_irq(&blkcg->lock);
....@@ -1196,11 +1109,11 @@
11961109 }
11971110
11981111 spin_lock_init(&blkcg->lock);
1112
+ refcount_set(&blkcg->online_pin, 1);
11991113 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
12001114 INIT_HLIST_HEAD(&blkcg->blkg_list);
12011115 #ifdef CONFIG_CGROUP_WRITEBACK
12021116 INIT_LIST_HEAD(&blkcg->cgwb_list);
1203
- refcount_set(&blkcg->cgwb_refcnt, 1);
12041117 #endif
12051118 list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
12061119
....@@ -1219,11 +1132,26 @@
12191132 return ret;
12201133 }
12211134
1135
+static int blkcg_css_online(struct cgroup_subsys_state *css)
1136
+{
1137
+ struct blkcg *blkcg = css_to_blkcg(css);
1138
+ struct blkcg *parent = blkcg_parent(blkcg);
1139
+
1140
+ /*
1141
+ * blkcg_pin_online() is used to delay blkcg offline so that blkgs
1142
+ * don't go offline while cgwbs are still active on them. Pin the
1143
+ * parent so that offline always happens towards the root.
1144
+ */
1145
+ if (parent)
1146
+ blkcg_pin_online(parent);
1147
+ return 0;
1148
+}
1149
+
12221150 /**
12231151 * blkcg_init_queue - initialize blkcg part of request queue
12241152 * @q: request_queue to initialize
12251153 *
1226
- * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
1154
+ * Called from blk_alloc_queue(). Responsible for initializing blkcg
12271155 * part of new request_queue @q.
12281156 *
12291157 * RETURNS:
....@@ -1243,36 +1171,38 @@
12431171
12441172 /* Make sure the root blkg exists. */
12451173 rcu_read_lock();
1246
- spin_lock_irq(q->queue_lock);
1174
+ spin_lock_irq(&q->queue_lock);
12471175 blkg = blkg_create(&blkcg_root, q, new_blkg);
12481176 if (IS_ERR(blkg))
12491177 goto err_unlock;
12501178 q->root_blkg = blkg;
1251
- q->root_rl.blkg = blkg;
1252
- spin_unlock_irq(q->queue_lock);
1179
+ spin_unlock_irq(&q->queue_lock);
12531180 rcu_read_unlock();
12541181
12551182 if (preloaded)
12561183 radix_tree_preload_end();
12571184
1258
- ret = blk_iolatency_init(q);
1259
- if (ret) {
1260
- spin_lock_irq(q->queue_lock);
1261
- blkg_destroy_all(q);
1262
- spin_unlock_irq(q->queue_lock);
1263
- return ret;
1264
- }
1185
+ ret = blk_ioprio_init(q);
1186
+ if (ret)
1187
+ goto err_destroy_all;
12651188
12661189 ret = blk_throtl_init(q);
1267
- if (ret) {
1268
- spin_lock_irq(q->queue_lock);
1269
- blkg_destroy_all(q);
1270
- spin_unlock_irq(q->queue_lock);
1271
- }
1272
- return ret;
1190
+ if (ret)
1191
+ goto err_destroy_all;
12731192
1193
+ ret = blk_iolatency_init(q);
1194
+ if (ret) {
1195
+ blk_throtl_exit(q);
1196
+ goto err_destroy_all;
1197
+ }
1198
+
1199
+ return 0;
1200
+
1201
+err_destroy_all:
1202
+ blkg_destroy_all(q);
1203
+ return ret;
12741204 err_unlock:
1275
- spin_unlock_irq(q->queue_lock);
1205
+ spin_unlock_irq(&q->queue_lock);
12761206 rcu_read_unlock();
12771207 if (preloaded)
12781208 radix_tree_preload_end();
....@@ -1280,37 +1210,14 @@
12801210 }
12811211
12821212 /**
1283
- * blkcg_drain_queue - drain blkcg part of request_queue
1284
- * @q: request_queue to drain
1285
- *
1286
- * Called from blk_drain_queue(). Responsible for draining blkcg part.
1287
- */
1288
-void blkcg_drain_queue(struct request_queue *q)
1289
-{
1290
- lockdep_assert_held(q->queue_lock);
1291
-
1292
- /*
1293
- * @q could be exiting and already have destroyed all blkgs as
1294
- * indicated by NULL root_blkg. If so, don't confuse policies.
1295
- */
1296
- if (!q->root_blkg)
1297
- return;
1298
-
1299
- blk_throtl_drain(q);
1300
-}
1301
-
1302
-/**
13031213 * blkcg_exit_queue - exit and release blkcg part of request_queue
13041214 * @q: request_queue being released
13051215 *
1306
- * Called from blk_release_queue(). Responsible for exiting blkcg part.
1216
+ * Called from blk_exit_queue(). Responsible for exiting blkcg part.
13071217 */
13081218 void blkcg_exit_queue(struct request_queue *q)
13091219 {
1310
- spin_lock_irq(q->queue_lock);
13111220 blkg_destroy_all(q);
1312
- spin_unlock_irq(q->queue_lock);
1313
-
13141221 blk_throtl_exit(q);
13151222 }
13161223
....@@ -1369,9 +1276,11 @@
13691276
13701277 struct cgroup_subsys io_cgrp_subsys = {
13711278 .css_alloc = blkcg_css_alloc,
1279
+ .css_online = blkcg_css_online,
13721280 .css_offline = blkcg_css_offline,
13731281 .css_free = blkcg_css_free,
13741282 .can_attach = blkcg_can_attach,
1283
+ .css_rstat_flush = blkcg_rstat_flush,
13751284 .bind = blkcg_bind,
13761285 .dfl_cftypes = blkcg_files,
13771286 .legacy_cftypes = blkcg_legacy_files,
....@@ -1408,60 +1317,98 @@
14081317 const struct blkcg_policy *pol)
14091318 {
14101319 struct blkg_policy_data *pd_prealloc = NULL;
1411
- struct blkcg_gq *blkg;
1320
+ struct blkcg_gq *blkg, *pinned_blkg = NULL;
14121321 int ret;
14131322
14141323 if (blkcg_policy_enabled(q, pol))
14151324 return 0;
14161325
1417
- if (q->mq_ops)
1326
+ if (queue_is_mq(q))
14181327 blk_mq_freeze_queue(q);
1419
- else
1420
- blk_queue_bypass_start(q);
1421
-pd_prealloc:
1422
- if (!pd_prealloc) {
1423
- pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
1424
- if (!pd_prealloc) {
1425
- ret = -ENOMEM;
1426
- goto out_bypass_end;
1427
- }
1428
- }
1328
+retry:
1329
+ spin_lock_irq(&q->queue_lock);
14291330
1430
- spin_lock_irq(q->queue_lock);
1431
-
1432
- list_for_each_entry(blkg, &q->blkg_list, q_node) {
1331
+ /* blkg_list is pushed at the head, reverse walk to allocate parents first */
1332
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
14331333 struct blkg_policy_data *pd;
14341334
14351335 if (blkg->pd[pol->plid])
14361336 continue;
14371337
1438
- pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
1439
- if (!pd)
1440
- swap(pd, pd_prealloc);
1338
+ /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
1339
+ if (blkg == pinned_blkg) {
1340
+ pd = pd_prealloc;
1341
+ pd_prealloc = NULL;
1342
+ } else {
1343
+ pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
1344
+ blkg->blkcg);
1345
+ }
1346
+
14411347 if (!pd) {
1442
- spin_unlock_irq(q->queue_lock);
1443
- goto pd_prealloc;
1348
+ /*
1349
+ * GFP_NOWAIT failed. Free the existing one and
1350
+ * prealloc for @blkg w/ GFP_KERNEL.
1351
+ */
1352
+ if (pinned_blkg)
1353
+ blkg_put(pinned_blkg);
1354
+ blkg_get(blkg);
1355
+ pinned_blkg = blkg;
1356
+
1357
+ spin_unlock_irq(&q->queue_lock);
1358
+
1359
+ if (pd_prealloc)
1360
+ pol->pd_free_fn(pd_prealloc);
1361
+ pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
1362
+ blkg->blkcg);
1363
+ if (pd_prealloc)
1364
+ goto retry;
1365
+ else
1366
+ goto enomem;
14441367 }
14451368
14461369 blkg->pd[pol->plid] = pd;
14471370 pd->blkg = blkg;
14481371 pd->plid = pol->plid;
1449
- if (pol->pd_init_fn)
1450
- pol->pd_init_fn(pd);
14511372 }
1373
+
1374
+ /* all allocated, init in the same order */
1375
+ if (pol->pd_init_fn)
1376
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
1377
+ pol->pd_init_fn(blkg->pd[pol->plid]);
1378
+
1379
+ if (pol->pd_online_fn)
1380
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
1381
+ pol->pd_online_fn(blkg->pd[pol->plid]);
14521382
14531383 __set_bit(pol->plid, q->blkcg_pols);
14541384 ret = 0;
14551385
1456
- spin_unlock_irq(q->queue_lock);
1457
-out_bypass_end:
1458
- if (q->mq_ops)
1386
+ spin_unlock_irq(&q->queue_lock);
1387
+out:
1388
+ if (queue_is_mq(q))
14591389 blk_mq_unfreeze_queue(q);
1460
- else
1461
- blk_queue_bypass_end(q);
1390
+ if (pinned_blkg)
1391
+ blkg_put(pinned_blkg);
14621392 if (pd_prealloc)
14631393 pol->pd_free_fn(pd_prealloc);
14641394 return ret;
1395
+
1396
+enomem:
1397
+ /* alloc failed, nothing's initialized yet, free everything */
1398
+ spin_lock_irq(&q->queue_lock);
1399
+ list_for_each_entry(blkg, &q->blkg_list, q_node) {
1400
+ struct blkcg *blkcg = blkg->blkcg;
1401
+
1402
+ spin_lock(&blkcg->lock);
1403
+ if (blkg->pd[pol->plid]) {
1404
+ pol->pd_free_fn(blkg->pd[pol->plid]);
1405
+ blkg->pd[pol->plid] = NULL;
1406
+ }
1407
+ spin_unlock(&blkcg->lock);
1408
+ }
1409
+ spin_unlock_irq(&q->queue_lock);
1410
+ ret = -ENOMEM;
1411
+ goto out;
14651412 }
14661413 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
14671414
....@@ -1481,30 +1428,30 @@
14811428 if (!blkcg_policy_enabled(q, pol))
14821429 return;
14831430
1484
- if (q->mq_ops)
1431
+ if (queue_is_mq(q))
14851432 blk_mq_freeze_queue(q);
1486
- else
1487
- blk_queue_bypass_start(q);
14881433
1489
- spin_lock_irq(q->queue_lock);
1434
+ spin_lock_irq(&q->queue_lock);
14901435
14911436 __clear_bit(pol->plid, q->blkcg_pols);
14921437
14931438 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1439
+ struct blkcg *blkcg = blkg->blkcg;
1440
+
1441
+ spin_lock(&blkcg->lock);
14941442 if (blkg->pd[pol->plid]) {
14951443 if (pol->pd_offline_fn)
14961444 pol->pd_offline_fn(blkg->pd[pol->plid]);
14971445 pol->pd_free_fn(blkg->pd[pol->plid]);
14981446 blkg->pd[pol->plid] = NULL;
14991447 }
1448
+ spin_unlock(&blkcg->lock);
15001449 }
15011450
1502
- spin_unlock_irq(q->queue_lock);
1451
+ spin_unlock_irq(&q->queue_lock);
15031452
1504
- if (q->mq_ops)
1453
+ if (queue_is_mq(q))
15051454 blk_mq_unfreeze_queue(q);
1506
- else
1507
- blk_queue_bypass_end(q);
15081455 }
15091456 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
15101457
....@@ -1554,7 +1501,8 @@
15541501 blkcg->cpd[pol->plid] = cpd;
15551502 cpd->blkcg = blkcg;
15561503 cpd->plid = pol->plid;
1557
- pol->cpd_init_fn(cpd);
1504
+ if (pol->cpd_init_fn)
1505
+ pol->cpd_init_fn(cpd);
15581506 }
15591507 }
15601508
....@@ -1627,6 +1575,25 @@
16271575 }
16281576 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
16291577
1578
+bool __blkcg_punt_bio_submit(struct bio *bio)
1579
+{
1580
+ struct blkcg_gq *blkg = bio->bi_blkg;
1581
+
1582
+ /* consume the flag first */
1583
+ bio->bi_opf &= ~REQ_CGROUP_PUNT;
1584
+
1585
+ /* never bounce for the root cgroup */
1586
+ if (!blkg->parent)
1587
+ return false;
1588
+
1589
+ spin_lock_bh(&blkg->async_bio_lock);
1590
+ bio_list_add(&blkg->async_bios, bio);
1591
+ spin_unlock_bh(&blkg->async_bio_lock);
1592
+
1593
+ queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
1594
+ return true;
1595
+}
1596
+
16301597 /*
16311598 * Scale the accumulated delay based on how long it has been since we updated
16321599 * the delay. We only call this when we are adding delay, in case it's been a
....@@ -1636,6 +1603,10 @@
16361603 static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
16371604 {
16381605 u64 old = atomic64_read(&blkg->delay_start);
1606
+
1607
+ /* negative use_delay means no scaling, see blkcg_set_delay() */
1608
+ if (atomic_read(&blkg->use_delay) < 0)
1609
+ return;
16391610
16401611 /*
16411612 * We only want to scale down every second. The idea here is that we
....@@ -1688,16 +1659,25 @@
16881659 */
16891660 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
16901661 {
1662
+ unsigned long pflags;
1663
+ bool clamp;
16911664 u64 now = ktime_to_ns(ktime_get());
16921665 u64 exp;
16931666 u64 delay_nsec = 0;
16941667 int tok;
16951668
16961669 while (blkg->parent) {
1697
- if (atomic_read(&blkg->use_delay)) {
1670
+ int use_delay = atomic_read(&blkg->use_delay);
1671
+
1672
+ if (use_delay) {
1673
+ u64 this_delay;
1674
+
16981675 blkcg_scale_delay(blkg, now);
1699
- delay_nsec = max_t(u64, delay_nsec,
1700
- atomic64_read(&blkg->delay_nsec));
1676
+ this_delay = atomic64_read(&blkg->delay_nsec);
1677
+ if (this_delay > delay_nsec) {
1678
+ delay_nsec = this_delay;
1679
+ clamp = use_delay > 0;
1680
+ }
17011681 }
17021682 blkg = blkg->parent;
17031683 }
....@@ -1709,16 +1689,16 @@
17091689 * Let's not sleep for all eternity if we've amassed a huge delay.
17101690 * Swapping or metadata IO can accumulate 10's of seconds worth of
17111691 * delay, and we want userspace to be able to do _something_ so cap the
1712
- * delays at 1 second. If there's 10's of seconds worth of delay then
1713
- * the tasks will be delayed for 1 second for every syscall.
1692
+ * delays at 0.25s. If there's 10's of seconds worth of delay then the
1693
+ * tasks will be delayed for 0.25 second for every syscall. If
1694
+ * blkcg_set_delay() was used as indicated by negative use_delay, the
1695
+ * caller is responsible for regulating the range.
17141696 */
1715
- delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
1697
+ if (clamp)
1698
+ delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
17161699
1717
- /*
1718
- * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
1719
- * that hasn't landed upstream yet. Once that stuff is in place we need
1720
- * to do a psi_memstall_enter/leave if memdelay is set.
1721
- */
1700
+ if (use_memdelay)
1701
+ psi_memstall_enter(&pflags);
17221702
17231703 exp = ktime_add_ns(now, delay_nsec);
17241704 tok = io_schedule_prepare();
....@@ -1728,6 +1708,9 @@
17281708 break;
17291709 } while (!fatal_signal_pending(current));
17301710 io_schedule_finish(tok);
1711
+
1712
+ if (use_memdelay)
1713
+ psi_memstall_leave(&pflags);
17311714 }
17321715
17331716 /**
....@@ -1766,8 +1749,7 @@
17661749 blkg = blkg_lookup(blkcg, q);
17671750 if (!blkg)
17681751 goto out;
1769
- blkg = blkg_try_get(blkg);
1770
- if (!blkg)
1752
+ if (!blkg_tryget(blkg))
17711753 goto out;
17721754 rcu_read_unlock();
17731755
....@@ -1779,12 +1761,11 @@
17791761 rcu_read_unlock();
17801762 blk_put_queue(q);
17811763 }
1782
-EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
17831764
17841765 /**
17851766 * blkcg_schedule_throttle - this task needs to check for throttling
1786
- * @q - the request queue IO was submitted on
1787
- * @use_memdelay - do we charge this to memory delay for PSI
1767
+ * @q: the request queue IO was submitted on
1768
+ * @use_memdelay: do we charge this to memory delay for PSI
17881769 *
17891770 * This is called by the IO controller when we know there's delay accumulated
17901771 * for the blkg for this task. We do not pass the blkg because there are places
....@@ -1817,18 +1798,160 @@
18171798
18181799 /**
18191800 * blkcg_add_delay - add delay to this blkg
1820
- * @now - the current time in nanoseconds
1821
- * @delta - how many nanoseconds of delay to add
1801
+ * @blkg: blkg of interest
1802
+ * @now: the current time in nanoseconds
1803
+ * @delta: how many nanoseconds of delay to add
18221804 *
18231805 * Charge @delta to the blkg's current delay accumulation. This is used to
18241806 * throttle tasks if an IO controller thinks we need more throttling.
18251807 */
18261808 void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
18271809 {
1810
+ if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
1811
+ return;
18281812 blkcg_scale_delay(blkg, now);
18291813 atomic64_add(delta, &blkg->delay_nsec);
18301814 }
1831
-EXPORT_SYMBOL_GPL(blkcg_add_delay);
1815
+
1816
+/**
1817
+ * blkg_tryget_closest - try and get a blkg ref on the closest blkg
1818
+ * @bio: target bio
1819
+ * @css: target css
1820
+ *
1821
+ * As the failure mode here is to walk up the blkg tree, this ensures that the
1822
+ * blkg->parent pointers are always valid. This returns the blkg that it ended
1823
+ * up taking a reference on or %NULL if no reference was taken.
1824
+ */
1825
+static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
1826
+ struct cgroup_subsys_state *css)
1827
+{
1828
+ struct blkcg_gq *blkg, *ret_blkg = NULL;
1829
+
1830
+ rcu_read_lock();
1831
+ blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
1832
+ while (blkg) {
1833
+ if (blkg_tryget(blkg)) {
1834
+ ret_blkg = blkg;
1835
+ break;
1836
+ }
1837
+ blkg = blkg->parent;
1838
+ }
1839
+ rcu_read_unlock();
1840
+
1841
+ return ret_blkg;
1842
+}
1843
+
1844
+/**
1845
+ * bio_associate_blkg_from_css - associate a bio with a specified css
1846
+ * @bio: target bio
1847
+ * @css: target css
1848
+ *
1849
+ * Associate @bio with the blkg found by combining the css's blkg and the
1850
+ * request_queue of the @bio. An association failure is handled by walking up
1851
+ * the blkg tree. Therefore, the blkg associated can be anything between @blkg
1852
+ * and q->root_blkg. This situation only happens when a cgroup is dying and
1853
+ * then the remaining bios will spill to the closest alive blkg.
1854
+ *
1855
+ * A reference will be taken on the blkg and will be released when @bio is
1856
+ * freed.
1857
+ */
1858
+void bio_associate_blkg_from_css(struct bio *bio,
1859
+ struct cgroup_subsys_state *css)
1860
+{
1861
+ if (bio->bi_blkg)
1862
+ blkg_put(bio->bi_blkg);
1863
+
1864
+ if (css && css->parent) {
1865
+ bio->bi_blkg = blkg_tryget_closest(bio, css);
1866
+ } else {
1867
+ blkg_get(bio->bi_disk->queue->root_blkg);
1868
+ bio->bi_blkg = bio->bi_disk->queue->root_blkg;
1869
+ }
1870
+}
1871
+EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
1872
+
1873
+/**
1874
+ * bio_associate_blkg - associate a bio with a blkg
1875
+ * @bio: target bio
1876
+ *
1877
+ * Associate @bio with the blkg found from the bio's css and request_queue.
1878
+ * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is
1879
+ * already associated, the css is reused and association redone as the
1880
+ * request_queue may have changed.
1881
+ */
1882
+void bio_associate_blkg(struct bio *bio)
1883
+{
1884
+ struct cgroup_subsys_state *css;
1885
+
1886
+ rcu_read_lock();
1887
+
1888
+ if (bio->bi_blkg)
1889
+ css = &bio_blkcg(bio)->css;
1890
+ else
1891
+ css = blkcg_css();
1892
+
1893
+ bio_associate_blkg_from_css(bio, css);
1894
+
1895
+ rcu_read_unlock();
1896
+}
1897
+EXPORT_SYMBOL_GPL(bio_associate_blkg);
1898
+
1899
+/**
1900
+ * bio_clone_blkg_association - clone blkg association from src to dst bio
1901
+ * @dst: destination bio
1902
+ * @src: source bio
1903
+ */
1904
+void bio_clone_blkg_association(struct bio *dst, struct bio *src)
1905
+{
1906
+ if (src->bi_blkg)
1907
+ bio_associate_blkg_from_css(dst, &bio_blkcg(src)->css);
1908
+}
1909
+EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
1910
+
1911
+static int blk_cgroup_io_type(struct bio *bio)
1912
+{
1913
+ if (op_is_discard(bio->bi_opf))
1914
+ return BLKG_IOSTAT_DISCARD;
1915
+ if (op_is_write(bio->bi_opf))
1916
+ return BLKG_IOSTAT_WRITE;
1917
+ return BLKG_IOSTAT_READ;
1918
+}
1919
+
1920
+void blk_cgroup_bio_start(struct bio *bio)
1921
+{
1922
+ int rwd = blk_cgroup_io_type(bio), cpu;
1923
+ struct blkg_iostat_set *bis;
1924
+
1925
+ cpu = get_cpu();
1926
+ bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
1927
+ u64_stats_update_begin(&bis->sync);
1928
+
1929
+ /*
1930
+ * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
1931
+ * bio and we would have already accounted for the size of the bio.
1932
+ */
1933
+ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
1934
+ bio_set_flag(bio, BIO_CGROUP_ACCT);
1935
+ bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
1936
+ }
1937
+ bis->cur.ios[rwd]++;
1938
+
1939
+ u64_stats_update_end(&bis->sync);
1940
+ if (cgroup_subsys_on_dfl(io_cgrp_subsys))
1941
+ cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
1942
+ put_cpu();
1943
+}
1944
+
1945
+static int __init blkcg_init(void)
1946
+{
1947
+ blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
1948
+ WQ_MEM_RECLAIM | WQ_FREEZABLE |
1949
+ WQ_UNBOUND | WQ_SYSFS, 0);
1950
+ if (!blkcg_punt_bio_wq)
1951
+ return -ENOMEM;
1952
+ return 0;
1953
+}
1954
+subsys_initcall(blkcg_init);
18321955
18331956 module_param(blkcg_debug_stats, bool, 0644);
18341957 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");