hc
2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/block/blk-cgroup.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0
12 /*
23 * Common Block IO controller cgroup interface
34 *
....@@ -28,7 +29,9 @@
2829 #include <linux/ctype.h>
2930 #include <linux/blk-cgroup.h>
3031 #include <linux/tracehook.h>
32
+#include <linux/psi.h>
3133 #include "blk.h"
34
+#include "blk-ioprio.h"
3235
3336 #define MAX_KEY_LEN 100
3437
....@@ -46,12 +49,14 @@
4649 EXPORT_SYMBOL_GPL(blkcg_root);
4750
4851 struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
52
+EXPORT_SYMBOL_GPL(blkcg_root_css);
4953
5054 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
5155
5256 static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
5357
54
-static bool blkcg_debug_stats = false;
58
+bool blkcg_debug_stats = false;
59
+static struct workqueue_struct *blkcg_punt_bio_wq;
5560
5661 static bool blkcg_policy_enabled(struct request_queue *q,
5762 const struct blkcg_policy *pol)
....@@ -76,12 +81,63 @@
7681 if (blkg->pd[i])
7782 blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
7883
79
- if (blkg->blkcg != &blkcg_root)
80
- blk_exit_rl(blkg->q, &blkg->rl);
81
-
82
- blkg_rwstat_exit(&blkg->stat_ios);
83
- blkg_rwstat_exit(&blkg->stat_bytes);
84
+ free_percpu(blkg->iostat_cpu);
85
+ percpu_ref_exit(&blkg->refcnt);
8486 kfree(blkg);
87
+}
88
+
89
+static void __blkg_release(struct rcu_head *rcu)
90
+{
91
+ struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
92
+
93
+ WARN_ON(!bio_list_empty(&blkg->async_bios));
94
+
95
+ /* release the blkcg and parent blkg refs this blkg has been holding */
96
+ css_put(&blkg->blkcg->css);
97
+ if (blkg->parent)
98
+ blkg_put(blkg->parent);
99
+ blkg_free(blkg);
100
+}
101
+
102
+/*
103
+ * A group is RCU protected, but having an rcu lock does not mean that one
104
+ * can access all the fields of blkg and assume these are valid. For
105
+ * example, don't try to follow throtl_data and request queue links.
106
+ *
107
+ * Having a reference to blkg under an rcu allows accesses to only values
108
+ * local to groups like group stats and group rate limits.
109
+ */
110
+static void blkg_release(struct percpu_ref *ref)
111
+{
112
+ struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
113
+
114
+ call_rcu(&blkg->rcu_head, __blkg_release);
115
+}
116
+
117
+static void blkg_async_bio_workfn(struct work_struct *work)
118
+{
119
+ struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
120
+ async_bio_work);
121
+ struct bio_list bios = BIO_EMPTY_LIST;
122
+ struct bio *bio;
123
+ struct blk_plug plug;
124
+ bool need_plug = false;
125
+
126
+ /* as long as there are pending bios, @blkg can't go away */
127
+ spin_lock_bh(&blkg->async_bio_lock);
128
+ bio_list_merge(&bios, &blkg->async_bios);
129
+ bio_list_init(&blkg->async_bios);
130
+ spin_unlock_bh(&blkg->async_bio_lock);
131
+
132
+ /* start plug only when bio_list contains at least 2 bios */
133
+ if (bios.head && bios.head->bi_next) {
134
+ need_plug = true;
135
+ blk_start_plug(&plug);
136
+ }
137
+ while ((bio = bio_list_pop(&bios)))
138
+ submit_bio(bio);
139
+ if (need_plug)
140
+ blk_finish_plug(&plug);
85141 }
86142
87143 /**
....@@ -96,28 +152,30 @@
96152 gfp_t gfp_mask)
97153 {
98154 struct blkcg_gq *blkg;
99
- int i;
155
+ int i, cpu;
100156
101157 /* alloc and init base part */
102158 blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
103159 if (!blkg)
104160 return NULL;
105161
106
- if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
107
- blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
162
+ if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
163
+ goto err_free;
164
+
165
+ blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
166
+ if (!blkg->iostat_cpu)
108167 goto err_free;
109168
110169 blkg->q = q;
111170 INIT_LIST_HEAD(&blkg->q_node);
171
+ spin_lock_init(&blkg->async_bio_lock);
172
+ bio_list_init(&blkg->async_bios);
173
+ INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
112174 blkg->blkcg = blkcg;
113
- atomic_set(&blkg->refcnt, 1);
114175
115
- /* root blkg uses @q->root_rl, init rl only for !root blkgs */
116
- if (blkcg != &blkcg_root) {
117
- if (blk_init_rl(&blkg->rl, q, gfp_mask))
118
- goto err_free;
119
- blkg->rl.blkg = blkg;
120
- }
176
+ u64_stats_init(&blkg->iostat.sync);
177
+ for_each_possible_cpu(cpu)
178
+ u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
121179
122180 for (i = 0; i < BLKCG_MAX_POLS; i++) {
123181 struct blkcg_policy *pol = blkcg_policy[i];
....@@ -127,7 +185,7 @@
127185 continue;
128186
129187 /* alloc per-policy data and attach it to blkg */
130
- pd = pol->pd_alloc_fn(gfp_mask, q->node);
188
+ pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
131189 if (!pd)
132190 goto err_free;
133191
....@@ -157,7 +215,7 @@
157215 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
158216 if (blkg && blkg->q == q) {
159217 if (update_hint) {
160
- lockdep_assert_held(q->queue_lock);
218
+ lockdep_assert_held(&q->queue_lock);
161219 rcu_assign_pointer(blkcg->blkg_hint, blkg);
162220 }
163221 return blkg;
....@@ -176,11 +234,16 @@
176234 struct blkcg_gq *new_blkg)
177235 {
178236 struct blkcg_gq *blkg;
179
- struct bdi_writeback_congested *wb_congested;
180237 int i, ret;
181238
182239 WARN_ON_ONCE(!rcu_read_lock_held());
183
- lockdep_assert_held(q->queue_lock);
240
+ lockdep_assert_held(&q->queue_lock);
241
+
242
+ /* request_queue is dying, do not create/recreate a blkg */
243
+ if (blk_queue_dying(q)) {
244
+ ret = -ENODEV;
245
+ goto err_free_blkg;
246
+ }
184247
185248 /* blkg holds a reference to blkcg */
186249 if (!css_tryget_online(&blkcg->css)) {
....@@ -188,31 +251,22 @@
188251 goto err_free_blkg;
189252 }
190253
191
- wb_congested = wb_congested_get_create(q->backing_dev_info,
192
- blkcg->css.id,
193
- GFP_NOWAIT | __GFP_NOWARN);
194
- if (!wb_congested) {
195
- ret = -ENOMEM;
196
- goto err_put_css;
197
- }
198
-
199254 /* allocate */
200255 if (!new_blkg) {
201256 new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
202257 if (unlikely(!new_blkg)) {
203258 ret = -ENOMEM;
204
- goto err_put_congested;
259
+ goto err_put_css;
205260 }
206261 }
207262 blkg = new_blkg;
208
- blkg->wb_congested = wb_congested;
209263
210264 /* link parent */
211265 if (blkcg_parent(blkcg)) {
212266 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
213267 if (WARN_ON_ONCE(!blkg->parent)) {
214268 ret = -ENODEV;
215
- goto err_put_congested;
269
+ goto err_put_css;
216270 }
217271 blkg_get(blkg->parent);
218272 }
....@@ -249,8 +303,6 @@
249303 blkg_put(blkg);
250304 return ERR_PTR(ret);
251305
252
-err_put_congested:
253
- wb_congested_put(wb_congested);
254306 err_put_css:
255307 css_put(&blkcg->css);
256308 err_free_blkg:
....@@ -266,57 +318,69 @@
266318 * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to
267319 * create one. blkg creation is performed recursively from blkcg_root such
268320 * that all non-root blkg's have access to the parent blkg. This function
269
- * should be called under RCU read lock and @q->queue_lock.
321
+ * should be called under RCU read lock and takes @q->queue_lock.
270322 *
271
- * Returns pointer to the looked up or created blkg on success, ERR_PTR()
272
- * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not
273
- * dead and bypassing, returns ERR_PTR(-EBUSY).
323
+ * Returns the blkg or the closest blkg if blkg_create() fails as it walks
324
+ * down from root.
274325 */
275
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
276
- struct request_queue *q)
326
+static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
327
+ struct request_queue *q)
277328 {
278329 struct blkcg_gq *blkg;
330
+ unsigned long flags;
279331
280332 WARN_ON_ONCE(!rcu_read_lock_held());
281
- lockdep_assert_held(q->queue_lock);
282333
283
- /*
284
- * This could be the first entry point of blkcg implementation and
285
- * we shouldn't allow anything to go through for a bypassing queue.
286
- */
287
- if (unlikely(blk_queue_bypass(q)))
288
- return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
289
-
290
- blkg = __blkg_lookup(blkcg, q, true);
334
+ blkg = blkg_lookup(blkcg, q);
291335 if (blkg)
292336 return blkg;
293337
338
+ spin_lock_irqsave(&q->queue_lock, flags);
339
+ blkg = __blkg_lookup(blkcg, q, true);
340
+ if (blkg)
341
+ goto found;
342
+
294343 /*
295344 * Create blkgs walking down from blkcg_root to @blkcg, so that all
296
- * non-root blkgs have access to their parents.
345
+ * non-root blkgs have access to their parents. Returns the closest
346
+ * blkg to the intended blkg should blkg_create() fail.
297347 */
298348 while (true) {
299349 struct blkcg *pos = blkcg;
300350 struct blkcg *parent = blkcg_parent(blkcg);
351
+ struct blkcg_gq *ret_blkg = q->root_blkg;
301352
302
- while (parent && !__blkg_lookup(parent, q, false)) {
353
+ while (parent) {
354
+ blkg = __blkg_lookup(parent, q, false);
355
+ if (blkg) {
356
+ /* remember closest blkg */
357
+ ret_blkg = blkg;
358
+ break;
359
+ }
303360 pos = parent;
304361 parent = blkcg_parent(parent);
305362 }
306363
307364 blkg = blkg_create(pos, q, NULL);
308
- if (pos == blkcg || IS_ERR(blkg))
309
- return blkg;
365
+ if (IS_ERR(blkg)) {
366
+ blkg = ret_blkg;
367
+ break;
368
+ }
369
+ if (pos == blkcg)
370
+ break;
310371 }
372
+
373
+found:
374
+ spin_unlock_irqrestore(&q->queue_lock, flags);
375
+ return blkg;
311376 }
312377
313378 static void blkg_destroy(struct blkcg_gq *blkg)
314379 {
315380 struct blkcg *blkcg = blkg->blkcg;
316
- struct blkcg_gq *parent = blkg->parent;
317381 int i;
318382
319
- lockdep_assert_held(blkg->q->queue_lock);
383
+ lockdep_assert_held(&blkg->q->queue_lock);
320384 lockdep_assert_held(&blkcg->lock);
321385
322386 /* Something wrong if we are trying to remove same group twice */
....@@ -328,11 +392,6 @@
328392
329393 if (blkg->pd[i] && pol->pd_offline_fn)
330394 pol->pd_offline_fn(blkg->pd[i]);
331
- }
332
-
333
- if (parent) {
334
- blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
335
- blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
336395 }
337396
338397 blkg->online = false;
....@@ -353,7 +412,7 @@
353412 * Put the reference taken at the time of creation so that when all
354413 * queues are gone, group can be destroyed.
355414 */
356
- blkg_put(blkg);
415
+ percpu_ref_kill(&blkg->refcnt);
357416 }
358417
359418 /**
....@@ -366,8 +425,7 @@
366425 {
367426 struct blkcg_gq *blkg, *n;
368427
369
- lockdep_assert_held(q->queue_lock);
370
-
428
+ spin_lock_irq(&q->queue_lock);
371429 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
372430 struct blkcg *blkcg = blkg->blkcg;
373431
....@@ -377,65 +435,7 @@
377435 }
378436
379437 q->root_blkg = NULL;
380
- q->root_rl.blkg = NULL;
381
-}
382
-
383
-/*
384
- * A group is RCU protected, but having an rcu lock does not mean that one
385
- * can access all the fields of blkg and assume these are valid. For
386
- * example, don't try to follow throtl_data and request queue links.
387
- *
388
- * Having a reference to blkg under an rcu allows accesses to only values
389
- * local to groups like group stats and group rate limits.
390
- */
391
-void __blkg_release_rcu(struct rcu_head *rcu_head)
392
-{
393
- struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
394
-
395
- /* release the blkcg and parent blkg refs this blkg has been holding */
396
- css_put(&blkg->blkcg->css);
397
- if (blkg->parent)
398
- blkg_put(blkg->parent);
399
-
400
- wb_congested_put(blkg->wb_congested);
401
-
402
- blkg_free(blkg);
403
-}
404
-EXPORT_SYMBOL_GPL(__blkg_release_rcu);
405
-
406
-/*
407
- * The next function used by blk_queue_for_each_rl(). It's a bit tricky
408
- * because the root blkg uses @q->root_rl instead of its own rl.
409
- */
410
-struct request_list *__blk_queue_next_rl(struct request_list *rl,
411
- struct request_queue *q)
412
-{
413
- struct list_head *ent;
414
- struct blkcg_gq *blkg;
415
-
416
- /*
417
- * Determine the current blkg list_head. The first entry is
418
- * root_rl which is off @q->blkg_list and mapped to the head.
419
- */
420
- if (rl == &q->root_rl) {
421
- ent = &q->blkg_list;
422
- /* There are no more block groups, hence no request lists */
423
- if (list_empty(ent))
424
- return NULL;
425
- } else {
426
- blkg = container_of(rl, struct blkcg_gq, rl);
427
- ent = &blkg->q_node;
428
- }
429
-
430
- /* walk to the next list_head, skip root blkcg */
431
- ent = ent->next;
432
- if (ent == &q->root_blkg->q_node)
433
- ent = ent->next;
434
- if (ent == &q->blkg_list)
435
- return NULL;
436
-
437
- blkg = container_of(ent, struct blkcg_gq, q_node);
438
- return &blkg->rl;
438
+ spin_unlock_irq(&q->queue_lock);
439439 }
440440
441441 static int blkcg_reset_stats(struct cgroup_subsys_state *css,
....@@ -443,7 +443,7 @@
443443 {
444444 struct blkcg *blkcg = css_to_blkcg(css);
445445 struct blkcg_gq *blkg;
446
- int i;
446
+ int i, cpu;
447447
448448 mutex_lock(&blkcg_pol_mutex);
449449 spin_lock_irq(&blkcg->lock);
....@@ -454,8 +454,12 @@
454454 * anyway. If you get hit by a race, retry.
455455 */
456456 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
457
- blkg_rwstat_reset(&blkg->stat_bytes);
458
- blkg_rwstat_reset(&blkg->stat_ios);
457
+ for_each_possible_cpu(cpu) {
458
+ struct blkg_iostat_set *bis =
459
+ per_cpu_ptr(blkg->iostat_cpu, cpu);
460
+ memset(bis, 0, sizeof(*bis));
461
+ }
462
+ memset(&blkg->iostat, 0, sizeof(blkg->iostat));
459463
460464 for (i = 0; i < BLKCG_MAX_POLS; i++) {
461465 struct blkcg_policy *pol = blkcg_policy[i];
....@@ -477,7 +481,6 @@
477481 return bdi_dev_name(blkg->q->backing_dev_info);
478482 return NULL;
479483 }
480
-EXPORT_SYMBOL_GPL(blkg_dev_name);
481484
482485 /**
483486 * blkcg_print_blkgs - helper for printing per-blkg data
....@@ -508,10 +511,10 @@
508511
509512 rcu_read_lock();
510513 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
511
- spin_lock_irq(blkg->q->queue_lock);
514
+ spin_lock_irq(&blkg->q->queue_lock);
512515 if (blkcg_policy_enabled(blkg->q, pol))
513516 total += prfill(sf, blkg->pd[pol->plid], data);
514
- spin_unlock_irq(blkg->q->queue_lock);
517
+ spin_unlock_irq(&blkg->q->queue_lock);
515518 }
516519 rcu_read_unlock();
517520
....@@ -540,262 +543,55 @@
540543 }
541544 EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
542545
543
-/**
544
- * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
545
- * @sf: seq_file to print to
546
- * @pd: policy private data of interest
547
- * @rwstat: rwstat to print
548
- *
549
- * Print @rwstat to @sf for the device assocaited with @pd.
550
- */
551
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
552
- const struct blkg_rwstat *rwstat)
553
-{
554
- static const char *rwstr[] = {
555
- [BLKG_RWSTAT_READ] = "Read",
556
- [BLKG_RWSTAT_WRITE] = "Write",
557
- [BLKG_RWSTAT_SYNC] = "Sync",
558
- [BLKG_RWSTAT_ASYNC] = "Async",
559
- [BLKG_RWSTAT_DISCARD] = "Discard",
560
- };
561
- const char *dname = blkg_dev_name(pd->blkg);
562
- u64 v;
563
- int i;
564
-
565
- if (!dname)
566
- return 0;
567
-
568
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
569
- seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
570
- (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
571
-
572
- v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
573
- atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
574
- atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
575
- seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
576
- return v;
577
-}
578
-EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
579
-
580
-/**
581
- * blkg_prfill_stat - prfill callback for blkg_stat
582
- * @sf: seq_file to print to
583
- * @pd: policy private data of interest
584
- * @off: offset to the blkg_stat in @pd
585
- *
586
- * prfill callback for printing a blkg_stat.
587
- */
588
-u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
589
-{
590
- return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
591
-}
592
-EXPORT_SYMBOL_GPL(blkg_prfill_stat);
593
-
594
-/**
595
- * blkg_prfill_rwstat - prfill callback for blkg_rwstat
596
- * @sf: seq_file to print to
597
- * @pd: policy private data of interest
598
- * @off: offset to the blkg_rwstat in @pd
599
- *
600
- * prfill callback for printing a blkg_rwstat.
601
- */
602
-u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
603
- int off)
604
-{
605
- struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
606
-
607
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
608
-}
609
-EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
610
-
611
-static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
612
- struct blkg_policy_data *pd, int off)
613
-{
614
- struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
615
-
616
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
617
-}
618
-
619
-/**
620
- * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
621
- * @sf: seq_file to print to
622
- * @v: unused
623
- *
624
- * To be used as cftype->seq_show to print blkg->stat_bytes.
625
- * cftype->private must be set to the blkcg_policy.
626
- */
627
-int blkg_print_stat_bytes(struct seq_file *sf, void *v)
628
-{
629
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
630
- blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
631
- offsetof(struct blkcg_gq, stat_bytes), true);
632
- return 0;
633
-}
634
-EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
635
-
636
-/**
637
- * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios
638
- * @sf: seq_file to print to
639
- * @v: unused
640
- *
641
- * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private
642
- * must be set to the blkcg_policy.
643
- */
644
-int blkg_print_stat_ios(struct seq_file *sf, void *v)
645
-{
646
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
647
- blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
648
- offsetof(struct blkcg_gq, stat_ios), true);
649
- return 0;
650
-}
651
-EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
652
-
653
-static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
654
- struct blkg_policy_data *pd,
655
- int off)
656
-{
657
- struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
658
- NULL, off);
659
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
660
-}
661
-
662
-/**
663
- * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
664
- * @sf: seq_file to print to
665
- * @v: unused
666
- */
667
-int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
668
-{
669
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
670
- blkg_prfill_rwstat_field_recursive,
671
- (void *)seq_cft(sf)->private,
672
- offsetof(struct blkcg_gq, stat_bytes), true);
673
- return 0;
674
-}
675
-EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
676
-
677
-/**
678
- * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
679
- * @sf: seq_file to print to
680
- * @v: unused
681
- */
682
-int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
683
-{
684
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
685
- blkg_prfill_rwstat_field_recursive,
686
- (void *)seq_cft(sf)->private,
687
- offsetof(struct blkcg_gq, stat_ios), true);
688
- return 0;
689
-}
690
-EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
691
-
692
-/**
693
- * blkg_stat_recursive_sum - collect hierarchical blkg_stat
694
- * @blkg: blkg of interest
695
- * @pol: blkcg_policy which contains the blkg_stat
696
- * @off: offset to the blkg_stat in blkg_policy_data or @blkg
697
- *
698
- * Collect the blkg_stat specified by @blkg, @pol and @off and all its
699
- * online descendants and their aux counts. The caller must be holding the
700
- * queue lock for online tests.
701
- *
702
- * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
703
- * at @off bytes into @blkg's blkg_policy_data of the policy.
704
- */
705
-u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
706
- struct blkcg_policy *pol, int off)
707
-{
708
- struct blkcg_gq *pos_blkg;
709
- struct cgroup_subsys_state *pos_css;
710
- u64 sum = 0;
711
-
712
- lockdep_assert_held(blkg->q->queue_lock);
713
-
714
- rcu_read_lock();
715
- blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
716
- struct blkg_stat *stat;
717
-
718
- if (!pos_blkg->online)
719
- continue;
720
-
721
- if (pol)
722
- stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
723
- else
724
- stat = (void *)blkg + off;
725
-
726
- sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
727
- }
728
- rcu_read_unlock();
729
-
730
- return sum;
731
-}
732
-EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
733
-
734
-/**
735
- * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
736
- * @blkg: blkg of interest
737
- * @pol: blkcg_policy which contains the blkg_rwstat
738
- * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
739
- *
740
- * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
741
- * online descendants and their aux counts. The caller must be holding the
742
- * queue lock for online tests.
743
- *
744
- * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
745
- * is at @off bytes into @blkg's blkg_policy_data of the policy.
746
- */
747
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
748
- struct blkcg_policy *pol, int off)
749
-{
750
- struct blkcg_gq *pos_blkg;
751
- struct cgroup_subsys_state *pos_css;
752
- struct blkg_rwstat sum = { };
753
- int i;
754
-
755
- lockdep_assert_held(blkg->q->queue_lock);
756
-
757
- rcu_read_lock();
758
- blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
759
- struct blkg_rwstat *rwstat;
760
-
761
- if (!pos_blkg->online)
762
- continue;
763
-
764
- if (pol)
765
- rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
766
- else
767
- rwstat = (void *)pos_blkg + off;
768
-
769
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
770
- atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
771
- percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
772
- &sum.aux_cnt[i]);
773
- }
774
- rcu_read_unlock();
775
-
776
- return sum;
777
-}
778
-EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
779
-
780546 /* Performs queue bypass and policy enabled checks then looks up blkg. */
781547 static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
782548 const struct blkcg_policy *pol,
783549 struct request_queue *q)
784550 {
785551 WARN_ON_ONCE(!rcu_read_lock_held());
786
- lockdep_assert_held(q->queue_lock);
552
+ lockdep_assert_held(&q->queue_lock);
787553
788554 if (!blkcg_policy_enabled(q, pol))
789555 return ERR_PTR(-EOPNOTSUPP);
790
-
791
- /*
792
- * This could be the first entry point of blkcg implementation and
793
- * we shouldn't allow anything to go through for a bypassing queue.
794
- */
795
- if (unlikely(blk_queue_bypass(q)))
796
- return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
797
-
798556 return __blkg_lookup(blkcg, q, true /* update_hint */);
557
+}
558
+
559
+/**
560
+ * blkcg_conf_get_disk - parse and get disk from MAJ:MIN input
561
+ * @inputp: input string pointer
562
+ *
563
+ * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
564
+ * from @input and get and return the matching gendisk. *@inputp is
565
+ * updated to point past the device node prefix. Returns an ERR_PTR()
566
+ * value on error.
567
+ *
568
+ * Use this function iff blkg_conf_prep() can't be used for some reason.
569
+ */
570
+struct gendisk *blkcg_conf_get_disk(char **inputp)
571
+{
572
+ char *input = *inputp;
573
+ unsigned int major, minor;
574
+ struct gendisk *disk;
575
+ int key_len, part;
576
+
577
+ if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
578
+ return ERR_PTR(-EINVAL);
579
+
580
+ input += key_len;
581
+ if (!isspace(*input))
582
+ return ERR_PTR(-EINVAL);
583
+ input = skip_spaces(input);
584
+
585
+ disk = get_gendisk(MKDEV(major, minor), &part);
586
+ if (!disk)
587
+ return ERR_PTR(-ENODEV);
588
+ if (part) {
589
+ put_disk_and_module(disk);
590
+ return ERR_PTR(-ENODEV);
591
+ }
592
+
593
+ *inputp = input;
594
+ return disk;
799595 }
800596
801597 /**
....@@ -812,35 +608,21 @@
812608 */
813609 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
814610 char *input, struct blkg_conf_ctx *ctx)
815
- __acquires(rcu) __acquires(disk->queue->queue_lock)
611
+ __acquires(rcu) __acquires(&disk->queue->queue_lock)
816612 {
817613 struct gendisk *disk;
818614 struct request_queue *q;
819615 struct blkcg_gq *blkg;
820
- unsigned int major, minor;
821
- int key_len, part, ret;
822
- char *body;
616
+ int ret;
823617
824
- if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
825
- return -EINVAL;
826
-
827
- body = input + key_len;
828
- if (!isspace(*body))
829
- return -EINVAL;
830
- body = skip_spaces(body);
831
-
832
- disk = get_gendisk(MKDEV(major, minor), &part);
833
- if (!disk)
834
- return -ENODEV;
835
- if (part) {
836
- ret = -ENODEV;
837
- goto fail;
838
- }
618
+ disk = blkcg_conf_get_disk(&input);
619
+ if (IS_ERR(disk))
620
+ return PTR_ERR(disk);
839621
840622 q = disk->queue;
841623
842624 rcu_read_lock();
843
- spin_lock_irq(q->queue_lock);
625
+ spin_lock_irq(&q->queue_lock);
844626
845627 blkg = blkg_lookup_check(blkcg, pol, q);
846628 if (IS_ERR(blkg)) {
....@@ -867,7 +649,7 @@
867649 }
868650
869651 /* Drop locks to do new blkg allocation with GFP_KERNEL. */
870
- spin_unlock_irq(q->queue_lock);
652
+ spin_unlock_irq(&q->queue_lock);
871653 rcu_read_unlock();
872654
873655 new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
....@@ -883,7 +665,7 @@
883665 }
884666
885667 rcu_read_lock();
886
- spin_lock_irq(q->queue_lock);
668
+ spin_lock_irq(&q->queue_lock);
887669
888670 blkg = blkg_lookup_check(pos, pol, q);
889671 if (IS_ERR(blkg)) {
....@@ -896,7 +678,7 @@
896678 blkg_free(new_blkg);
897679 } else {
898680 blkg = blkg_create(pos, q, new_blkg);
899
- if (unlikely(IS_ERR(blkg))) {
681
+ if (IS_ERR(blkg)) {
900682 ret = PTR_ERR(blkg);
901683 goto fail_preloaded;
902684 }
....@@ -910,13 +692,13 @@
910692 success:
911693 ctx->disk = disk;
912694 ctx->blkg = blkg;
913
- ctx->body = body;
695
+ ctx->body = input;
914696 return 0;
915697
916698 fail_preloaded:
917699 radix_tree_preload_end();
918700 fail_unlock:
919
- spin_unlock_irq(q->queue_lock);
701
+ spin_unlock_irq(&q->queue_lock);
920702 rcu_read_unlock();
921703 fail:
922704 put_disk_and_module(disk);
....@@ -942,31 +724,159 @@
942724 * with blkg_conf_prep().
943725 */
944726 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
945
- __releases(ctx->disk->queue->queue_lock) __releases(rcu)
727
+ __releases(&ctx->disk->queue->queue_lock) __releases(rcu)
946728 {
947
- spin_unlock_irq(ctx->disk->queue->queue_lock);
729
+ spin_unlock_irq(&ctx->disk->queue->queue_lock);
948730 rcu_read_unlock();
949731 put_disk_and_module(ctx->disk);
950732 }
951733 EXPORT_SYMBOL_GPL(blkg_conf_finish);
734
+
735
+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
736
+{
737
+ int i;
738
+
739
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
740
+ dst->bytes[i] = src->bytes[i];
741
+ dst->ios[i] = src->ios[i];
742
+ }
743
+}
744
+
745
+static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
746
+{
747
+ int i;
748
+
749
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
750
+ dst->bytes[i] += src->bytes[i];
751
+ dst->ios[i] += src->ios[i];
752
+ }
753
+}
754
+
755
+static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
756
+{
757
+ int i;
758
+
759
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
760
+ dst->bytes[i] -= src->bytes[i];
761
+ dst->ios[i] -= src->ios[i];
762
+ }
763
+}
764
+
765
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
766
+{
767
+ struct blkcg *blkcg = css_to_blkcg(css);
768
+ struct blkcg_gq *blkg;
769
+
770
+ rcu_read_lock();
771
+
772
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
773
+ struct blkcg_gq *parent = blkg->parent;
774
+ struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
775
+ struct blkg_iostat cur, delta;
776
+ unsigned int seq;
777
+
778
+ /* fetch the current per-cpu values */
779
+ do {
780
+ seq = u64_stats_fetch_begin(&bisc->sync);
781
+ blkg_iostat_set(&cur, &bisc->cur);
782
+ } while (u64_stats_fetch_retry(&bisc->sync, seq));
783
+
784
+ /* propagate percpu delta to global */
785
+ u64_stats_update_begin(&blkg->iostat.sync);
786
+ blkg_iostat_set(&delta, &cur);
787
+ blkg_iostat_sub(&delta, &bisc->last);
788
+ blkg_iostat_add(&blkg->iostat.cur, &delta);
789
+ blkg_iostat_add(&bisc->last, &delta);
790
+ u64_stats_update_end(&blkg->iostat.sync);
791
+
792
+ /* propagate global delta to parent */
793
+ if (parent) {
794
+ u64_stats_update_begin(&parent->iostat.sync);
795
+ blkg_iostat_set(&delta, &blkg->iostat.cur);
796
+ blkg_iostat_sub(&delta, &blkg->iostat.last);
797
+ blkg_iostat_add(&parent->iostat.cur, &delta);
798
+ blkg_iostat_add(&blkg->iostat.last, &delta);
799
+ u64_stats_update_end(&parent->iostat.sync);
800
+ }
801
+ }
802
+
803
+ rcu_read_unlock();
804
+}
805
+
806
+/*
807
+ * The rstat algorithms intentionally don't handle the root cgroup to avoid
808
+ * incurring overhead when no cgroups are defined. For that reason,
809
+ * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the
810
+ * iostat in the root cgroup's blkcg_gq.
811
+ *
812
+ * However, we would like to re-use the printing code between the root and
813
+ * non-root cgroups to the extent possible. For that reason, we simulate
814
+ * flushing the root cgroup's stats by explicitly filling in the iostat
815
+ * with disk level statistics.
816
+ */
817
+static void blkcg_fill_root_iostats(void)
818
+{
819
+ struct class_dev_iter iter;
820
+ struct device *dev;
821
+
822
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
823
+ while ((dev = class_dev_iter_next(&iter))) {
824
+ struct gendisk *disk = dev_to_disk(dev);
825
+ struct hd_struct *part = disk_get_part(disk, 0);
826
+ struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue);
827
+ struct blkg_iostat tmp;
828
+ int cpu;
829
+
830
+ memset(&tmp, 0, sizeof(tmp));
831
+ for_each_possible_cpu(cpu) {
832
+ struct disk_stats *cpu_dkstats;
833
+
834
+ cpu_dkstats = per_cpu_ptr(part->dkstats, cpu);
835
+ tmp.ios[BLKG_IOSTAT_READ] +=
836
+ cpu_dkstats->ios[STAT_READ];
837
+ tmp.ios[BLKG_IOSTAT_WRITE] +=
838
+ cpu_dkstats->ios[STAT_WRITE];
839
+ tmp.ios[BLKG_IOSTAT_DISCARD] +=
840
+ cpu_dkstats->ios[STAT_DISCARD];
841
+ // convert sectors to bytes
842
+ tmp.bytes[BLKG_IOSTAT_READ] +=
843
+ cpu_dkstats->sectors[STAT_READ] << 9;
844
+ tmp.bytes[BLKG_IOSTAT_WRITE] +=
845
+ cpu_dkstats->sectors[STAT_WRITE] << 9;
846
+ tmp.bytes[BLKG_IOSTAT_DISCARD] +=
847
+ cpu_dkstats->sectors[STAT_DISCARD] << 9;
848
+
849
+ u64_stats_update_begin(&blkg->iostat.sync);
850
+ blkg_iostat_set(&blkg->iostat.cur, &tmp);
851
+ u64_stats_update_end(&blkg->iostat.sync);
852
+ }
853
+ disk_put_part(part);
854
+ }
855
+}
952856
953857 static int blkcg_print_stat(struct seq_file *sf, void *v)
954858 {
955859 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
956860 struct blkcg_gq *blkg;
957861
862
+ if (!seq_css(sf)->parent)
863
+ blkcg_fill_root_iostats();
864
+ else
865
+ cgroup_rstat_flush(blkcg->css.cgroup);
866
+
958867 rcu_read_lock();
959868
960869 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
870
+ struct blkg_iostat_set *bis = &blkg->iostat;
961871 const char *dname;
962872 char *buf;
963
- struct blkg_rwstat rwstat;
964873 u64 rbytes, wbytes, rios, wios, dbytes, dios;
965874 size_t size = seq_get_buf(sf, &buf), off = 0;
966875 int i;
967876 bool has_stats = false;
877
+ unsigned seq;
968878
969
- spin_lock_irq(blkg->q->queue_lock);
879
+ spin_lock_irq(&blkg->q->queue_lock);
970880
971881 if (!blkg->online)
972882 goto skip;
....@@ -983,17 +893,16 @@
983893 */
984894 off += scnprintf(buf+off, size-off, "%s ", dname);
985895
986
- rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
987
- offsetof(struct blkcg_gq, stat_bytes));
988
- rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
989
- wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
990
- dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
896
+ do {
897
+ seq = u64_stats_fetch_begin(&bis->sync);
991898
992
- rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
993
- offsetof(struct blkcg_gq, stat_ios));
994
- rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
995
- wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
996
- dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
899
+ rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
900
+ wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
901
+ dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
902
+ rios = bis->cur.ios[BLKG_IOSTAT_READ];
903
+ wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
904
+ dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
905
+ } while (u64_stats_fetch_retry(&bis->sync, seq));
997906
998907 if (rbytes || wbytes || rios || wios) {
999908 has_stats = true;
....@@ -1003,10 +912,7 @@
1003912 dbytes, dios);
1004913 }
1005914
1006
- if (!blkcg_debug_stats)
1007
- goto next;
1008
-
1009
- if (atomic_read(&blkg->use_delay)) {
915
+ if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
1010916 has_stats = true;
1011917 off += scnprintf(buf+off, size-off,
1012918 " use_delay=%d delay_nsec=%llu",
....@@ -1026,7 +932,7 @@
1026932 has_stats = true;
1027933 off += written;
1028934 }
1029
-next:
935
+
1030936 if (has_stats) {
1031937 if (off < size - 1) {
1032938 off += scnprintf(buf+off, size-off, "\n");
....@@ -1036,7 +942,7 @@
1036942 }
1037943 }
1038944 skip:
1039
- spin_unlock_irq(blkg->q->queue_lock);
945
+ spin_unlock_irq(&blkg->q->queue_lock);
1040946 }
1041947
1042948 rcu_read_unlock();
....@@ -1046,7 +952,6 @@
1046952 static struct cftype blkcg_files[] = {
1047953 {
1048954 .name = "stat",
1049
- .flags = CFTYPE_NOT_ON_ROOT,
1050955 .seq_show = blkcg_print_stat,
1051956 },
1052957 { } /* terminate */
....@@ -1096,8 +1001,8 @@
10961001 /* this prevents anyone from attaching or migrating to this blkcg */
10971002 wb_blkcg_offline(blkcg);
10981003
1099
- /* put the base cgwb reference allowing step 2 to be triggered */
1100
- blkcg_cgwb_put(blkcg);
1004
+ /* put the base online pin allowing step 2 to be triggered */
1005
+ blkcg_unpin_online(blkcg);
11011006 }
11021007
11031008 /**
....@@ -1113,6 +1018,8 @@
11131018 */
11141019 void blkcg_destroy_blkgs(struct blkcg *blkcg)
11151020 {
1021
+ might_sleep();
1022
+
11161023 spin_lock_irq(&blkcg->lock);
11171024
11181025 while (!hlist_empty(&blkcg->blkg_list)) {
....@@ -1120,14 +1027,20 @@
11201027 struct blkcg_gq, blkcg_node);
11211028 struct request_queue *q = blkg->q;
11221029
1123
- if (spin_trylock(q->queue_lock)) {
1124
- blkg_destroy(blkg);
1125
- spin_unlock(q->queue_lock);
1126
- } else {
1030
+ if (need_resched() || !spin_trylock(&q->queue_lock)) {
1031
+ /*
1032
+ * Given that the system can accumulate a huge number
1033
+ * of blkgs in pathological cases, check to see if we
1034
+ * need to reschedule to avoid softlockup.
1035
+ */
11271036 spin_unlock_irq(&blkcg->lock);
1128
- cpu_relax();
1037
+ cond_resched();
11291038 spin_lock_irq(&blkcg->lock);
1039
+ continue;
11301040 }
1041
+
1042
+ blkg_destroy(blkg);
1043
+ spin_unlock(&q->queue_lock);
11311044 }
11321045
11331046 spin_unlock_irq(&blkcg->lock);
....@@ -1196,11 +1109,11 @@
11961109 }
11971110
11981111 spin_lock_init(&blkcg->lock);
1112
+ refcount_set(&blkcg->online_pin, 1);
11991113 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
12001114 INIT_HLIST_HEAD(&blkcg->blkg_list);
12011115 #ifdef CONFIG_CGROUP_WRITEBACK
12021116 INIT_LIST_HEAD(&blkcg->cgwb_list);
1203
- refcount_set(&blkcg->cgwb_refcnt, 1);
12041117 #endif
12051118 list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
12061119
....@@ -1219,11 +1132,26 @@
12191132 return ret;
12201133 }
12211134
1135
+static int blkcg_css_online(struct cgroup_subsys_state *css)
1136
+{
1137
+ struct blkcg *blkcg = css_to_blkcg(css);
1138
+ struct blkcg *parent = blkcg_parent(blkcg);
1139
+
1140
+ /*
1141
+ * blkcg_pin_online() is used to delay blkcg offline so that blkgs
1142
+ * don't go offline while cgwbs are still active on them. Pin the
1143
+ * parent so that offline always happens towards the root.
1144
+ */
1145
+ if (parent)
1146
+ blkcg_pin_online(parent);
1147
+ return 0;
1148
+}
1149
+
12221150 /**
12231151 * blkcg_init_queue - initialize blkcg part of request queue
12241152 * @q: request_queue to initialize
12251153 *
1226
- * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
1154
+ * Called from blk_alloc_queue(). Responsible for initializing blkcg
12271155 * part of new request_queue @q.
12281156 *
12291157 * RETURNS:
....@@ -1243,36 +1171,38 @@
12431171
12441172 /* Make sure the root blkg exists. */
12451173 rcu_read_lock();
1246
- spin_lock_irq(q->queue_lock);
1174
+ spin_lock_irq(&q->queue_lock);
12471175 blkg = blkg_create(&blkcg_root, q, new_blkg);
12481176 if (IS_ERR(blkg))
12491177 goto err_unlock;
12501178 q->root_blkg = blkg;
1251
- q->root_rl.blkg = blkg;
1252
- spin_unlock_irq(q->queue_lock);
1179
+ spin_unlock_irq(&q->queue_lock);
12531180 rcu_read_unlock();
12541181
12551182 if (preloaded)
12561183 radix_tree_preload_end();
12571184
1258
- ret = blk_iolatency_init(q);
1259
- if (ret) {
1260
- spin_lock_irq(q->queue_lock);
1261
- blkg_destroy_all(q);
1262
- spin_unlock_irq(q->queue_lock);
1263
- return ret;
1264
- }
1185
+ ret = blk_ioprio_init(q);
1186
+ if (ret)
1187
+ goto err_destroy_all;
12651188
12661189 ret = blk_throtl_init(q);
1267
- if (ret) {
1268
- spin_lock_irq(q->queue_lock);
1269
- blkg_destroy_all(q);
1270
- spin_unlock_irq(q->queue_lock);
1271
- }
1272
- return ret;
1190
+ if (ret)
1191
+ goto err_destroy_all;
12731192
1193
+ ret = blk_iolatency_init(q);
1194
+ if (ret) {
1195
+ blk_throtl_exit(q);
1196
+ goto err_destroy_all;
1197
+ }
1198
+
1199
+ return 0;
1200
+
1201
+err_destroy_all:
1202
+ blkg_destroy_all(q);
1203
+ return ret;
12741204 err_unlock:
1275
- spin_unlock_irq(q->queue_lock);
1205
+ spin_unlock_irq(&q->queue_lock);
12761206 rcu_read_unlock();
12771207 if (preloaded)
12781208 radix_tree_preload_end();
....@@ -1280,37 +1210,14 @@
12801210 }
12811211
12821212 /**
1283
- * blkcg_drain_queue - drain blkcg part of request_queue
1284
- * @q: request_queue to drain
1285
- *
1286
- * Called from blk_drain_queue(). Responsible for draining blkcg part.
1287
- */
1288
-void blkcg_drain_queue(struct request_queue *q)
1289
-{
1290
- lockdep_assert_held(q->queue_lock);
1291
-
1292
- /*
1293
- * @q could be exiting and already have destroyed all blkgs as
1294
- * indicated by NULL root_blkg. If so, don't confuse policies.
1295
- */
1296
- if (!q->root_blkg)
1297
- return;
1298
-
1299
- blk_throtl_drain(q);
1300
-}
1301
-
1302
-/**
13031213 * blkcg_exit_queue - exit and release blkcg part of request_queue
13041214 * @q: request_queue being released
13051215 *
1306
- * Called from blk_release_queue(). Responsible for exiting blkcg part.
1216
+ * Called from blk_exit_queue(). Responsible for exiting blkcg part.
13071217 */
13081218 void blkcg_exit_queue(struct request_queue *q)
13091219 {
1310
- spin_lock_irq(q->queue_lock);
13111220 blkg_destroy_all(q);
1312
- spin_unlock_irq(q->queue_lock);
1313
-
13141221 blk_throtl_exit(q);
13151222 }
13161223
....@@ -1369,9 +1276,11 @@
13691276
13701277 struct cgroup_subsys io_cgrp_subsys = {
13711278 .css_alloc = blkcg_css_alloc,
1279
+ .css_online = blkcg_css_online,
13721280 .css_offline = blkcg_css_offline,
13731281 .css_free = blkcg_css_free,
13741282 .can_attach = blkcg_can_attach,
1283
+ .css_rstat_flush = blkcg_rstat_flush,
13751284 .bind = blkcg_bind,
13761285 .dfl_cftypes = blkcg_files,
13771286 .legacy_cftypes = blkcg_legacy_files,
....@@ -1408,60 +1317,98 @@
14081317 const struct blkcg_policy *pol)
14091318 {
14101319 struct blkg_policy_data *pd_prealloc = NULL;
1411
- struct blkcg_gq *blkg;
1320
+ struct blkcg_gq *blkg, *pinned_blkg = NULL;
14121321 int ret;
14131322
14141323 if (blkcg_policy_enabled(q, pol))
14151324 return 0;
14161325
1417
- if (q->mq_ops)
1326
+ if (queue_is_mq(q))
14181327 blk_mq_freeze_queue(q);
1419
- else
1420
- blk_queue_bypass_start(q);
1421
-pd_prealloc:
1422
- if (!pd_prealloc) {
1423
- pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
1424
- if (!pd_prealloc) {
1425
- ret = -ENOMEM;
1426
- goto out_bypass_end;
1427
- }
1428
- }
1328
+retry:
1329
+ spin_lock_irq(&q->queue_lock);
14291330
1430
- spin_lock_irq(q->queue_lock);
1431
-
1432
- list_for_each_entry(blkg, &q->blkg_list, q_node) {
1331
+ /* blkg_list is pushed at the head, reverse walk to allocate parents first */
1332
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
14331333 struct blkg_policy_data *pd;
14341334
14351335 if (blkg->pd[pol->plid])
14361336 continue;
14371337
1438
- pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
1439
- if (!pd)
1440
- swap(pd, pd_prealloc);
1338
+ /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
1339
+ if (blkg == pinned_blkg) {
1340
+ pd = pd_prealloc;
1341
+ pd_prealloc = NULL;
1342
+ } else {
1343
+ pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
1344
+ blkg->blkcg);
1345
+ }
1346
+
14411347 if (!pd) {
1442
- spin_unlock_irq(q->queue_lock);
1443
- goto pd_prealloc;
1348
+ /*
1349
+ * GFP_NOWAIT failed. Free the existing one and
1350
+ * prealloc for @blkg w/ GFP_KERNEL.
1351
+ */
1352
+ if (pinned_blkg)
1353
+ blkg_put(pinned_blkg);
1354
+ blkg_get(blkg);
1355
+ pinned_blkg = blkg;
1356
+
1357
+ spin_unlock_irq(&q->queue_lock);
1358
+
1359
+ if (pd_prealloc)
1360
+ pol->pd_free_fn(pd_prealloc);
1361
+ pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
1362
+ blkg->blkcg);
1363
+ if (pd_prealloc)
1364
+ goto retry;
1365
+ else
1366
+ goto enomem;
14441367 }
14451368
14461369 blkg->pd[pol->plid] = pd;
14471370 pd->blkg = blkg;
14481371 pd->plid = pol->plid;
1449
- if (pol->pd_init_fn)
1450
- pol->pd_init_fn(pd);
14511372 }
1373
+
1374
+ /* all allocated, init in the same order */
1375
+ if (pol->pd_init_fn)
1376
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
1377
+ pol->pd_init_fn(blkg->pd[pol->plid]);
1378
+
1379
+ if (pol->pd_online_fn)
1380
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
1381
+ pol->pd_online_fn(blkg->pd[pol->plid]);
14521382
14531383 __set_bit(pol->plid, q->blkcg_pols);
14541384 ret = 0;
14551385
1456
- spin_unlock_irq(q->queue_lock);
1457
-out_bypass_end:
1458
- if (q->mq_ops)
1386
+ spin_unlock_irq(&q->queue_lock);
1387
+out:
1388
+ if (queue_is_mq(q))
14591389 blk_mq_unfreeze_queue(q);
1460
- else
1461
- blk_queue_bypass_end(q);
1390
+ if (pinned_blkg)
1391
+ blkg_put(pinned_blkg);
14621392 if (pd_prealloc)
14631393 pol->pd_free_fn(pd_prealloc);
14641394 return ret;
1395
+
1396
+enomem:
1397
+ /* alloc failed, nothing's initialized yet, free everything */
1398
+ spin_lock_irq(&q->queue_lock);
1399
+ list_for_each_entry(blkg, &q->blkg_list, q_node) {
1400
+ struct blkcg *blkcg = blkg->blkcg;
1401
+
1402
+ spin_lock(&blkcg->lock);
1403
+ if (blkg->pd[pol->plid]) {
1404
+ pol->pd_free_fn(blkg->pd[pol->plid]);
1405
+ blkg->pd[pol->plid] = NULL;
1406
+ }
1407
+ spin_unlock(&blkcg->lock);
1408
+ }
1409
+ spin_unlock_irq(&q->queue_lock);
1410
+ ret = -ENOMEM;
1411
+ goto out;
14651412 }
14661413 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
14671414
....@@ -1481,30 +1428,30 @@
14811428 if (!blkcg_policy_enabled(q, pol))
14821429 return;
14831430
1484
- if (q->mq_ops)
1431
+ if (queue_is_mq(q))
14851432 blk_mq_freeze_queue(q);
1486
- else
1487
- blk_queue_bypass_start(q);
14881433
1489
- spin_lock_irq(q->queue_lock);
1434
+ spin_lock_irq(&q->queue_lock);
14901435
14911436 __clear_bit(pol->plid, q->blkcg_pols);
14921437
14931438 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1439
+ struct blkcg *blkcg = blkg->blkcg;
1440
+
1441
+ spin_lock(&blkcg->lock);
14941442 if (blkg->pd[pol->plid]) {
14951443 if (pol->pd_offline_fn)
14961444 pol->pd_offline_fn(blkg->pd[pol->plid]);
14971445 pol->pd_free_fn(blkg->pd[pol->plid]);
14981446 blkg->pd[pol->plid] = NULL;
14991447 }
1448
+ spin_unlock(&blkcg->lock);
15001449 }
15011450
1502
- spin_unlock_irq(q->queue_lock);
1451
+ spin_unlock_irq(&q->queue_lock);
15031452
1504
- if (q->mq_ops)
1453
+ if (queue_is_mq(q))
15051454 blk_mq_unfreeze_queue(q);
1506
- else
1507
- blk_queue_bypass_end(q);
15081455 }
15091456 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
15101457
....@@ -1554,7 +1501,8 @@
15541501 blkcg->cpd[pol->plid] = cpd;
15551502 cpd->blkcg = blkcg;
15561503 cpd->plid = pol->plid;
1557
- pol->cpd_init_fn(cpd);
1504
+ if (pol->cpd_init_fn)
1505
+ pol->cpd_init_fn(cpd);
15581506 }
15591507 }
15601508
....@@ -1627,6 +1575,25 @@
16271575 }
16281576 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
16291577
1578
+bool __blkcg_punt_bio_submit(struct bio *bio)
1579
+{
1580
+ struct blkcg_gq *blkg = bio->bi_blkg;
1581
+
1582
+ /* consume the flag first */
1583
+ bio->bi_opf &= ~REQ_CGROUP_PUNT;
1584
+
1585
+ /* never bounce for the root cgroup */
1586
+ if (!blkg->parent)
1587
+ return false;
1588
+
1589
+ spin_lock_bh(&blkg->async_bio_lock);
1590
+ bio_list_add(&blkg->async_bios, bio);
1591
+ spin_unlock_bh(&blkg->async_bio_lock);
1592
+
1593
+ queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
1594
+ return true;
1595
+}
1596
+
16301597 /*
16311598 * Scale the accumulated delay based on how long it has been since we updated
16321599 * the delay. We only call this when we are adding delay, in case it's been a
....@@ -1636,6 +1603,10 @@
16361603 static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
16371604 {
16381605 u64 old = atomic64_read(&blkg->delay_start);
1606
+
1607
+ /* negative use_delay means no scaling, see blkcg_set_delay() */
1608
+ if (atomic_read(&blkg->use_delay) < 0)
1609
+ return;
16391610
16401611 /*
16411612 * We only want to scale down every second. The idea here is that we
....@@ -1688,16 +1659,25 @@
16881659 */
16891660 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
16901661 {
1662
+ unsigned long pflags;
1663
+ bool clamp;
16911664 u64 now = ktime_to_ns(ktime_get());
16921665 u64 exp;
16931666 u64 delay_nsec = 0;
16941667 int tok;
16951668
16961669 while (blkg->parent) {
1697
- if (atomic_read(&blkg->use_delay)) {
1670
+ int use_delay = atomic_read(&blkg->use_delay);
1671
+
1672
+ if (use_delay) {
1673
+ u64 this_delay;
1674
+
16981675 blkcg_scale_delay(blkg, now);
1699
- delay_nsec = max_t(u64, delay_nsec,
1700
- atomic64_read(&blkg->delay_nsec));
1676
+ this_delay = atomic64_read(&blkg->delay_nsec);
1677
+ if (this_delay > delay_nsec) {
1678
+ delay_nsec = this_delay;
1679
+ clamp = use_delay > 0;
1680
+ }
17011681 }
17021682 blkg = blkg->parent;
17031683 }
....@@ -1709,16 +1689,16 @@
17091689 * Let's not sleep for all eternity if we've amassed a huge delay.
17101690 * Swapping or metadata IO can accumulate 10's of seconds worth of
17111691 * delay, and we want userspace to be able to do _something_ so cap the
1712
- * delays at 1 second. If there's 10's of seconds worth of delay then
1713
- * the tasks will be delayed for 1 second for every syscall.
1692
+ * delays at 0.25s. If there's 10's of seconds worth of delay then the
1693
+ * tasks will be delayed for 0.25 second for every syscall. If
1694
+ * blkcg_set_delay() was used as indicated by negative use_delay, the
1695
+ * caller is responsible for regulating the range.
17141696 */
1715
- delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
1697
+ if (clamp)
1698
+ delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
17161699
1717
- /*
1718
- * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
1719
- * that hasn't landed upstream yet. Once that stuff is in place we need
1720
- * to do a psi_memstall_enter/leave if memdelay is set.
1721
- */
1700
+ if (use_memdelay)
1701
+ psi_memstall_enter(&pflags);
17221702
17231703 exp = ktime_add_ns(now, delay_nsec);
17241704 tok = io_schedule_prepare();
....@@ -1728,6 +1708,9 @@
17281708 break;
17291709 } while (!fatal_signal_pending(current));
17301710 io_schedule_finish(tok);
1711
+
1712
+ if (use_memdelay)
1713
+ psi_memstall_leave(&pflags);
17311714 }
17321715
17331716 /**
....@@ -1766,8 +1749,7 @@
17661749 blkg = blkg_lookup(blkcg, q);
17671750 if (!blkg)
17681751 goto out;
1769
- blkg = blkg_try_get(blkg);
1770
- if (!blkg)
1752
+ if (!blkg_tryget(blkg))
17711753 goto out;
17721754 rcu_read_unlock();
17731755
....@@ -1779,12 +1761,11 @@
17791761 rcu_read_unlock();
17801762 blk_put_queue(q);
17811763 }
1782
-EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
17831764
17841765 /**
17851766 * blkcg_schedule_throttle - this task needs to check for throttling
1786
- * @q - the request queue IO was submitted on
1787
- * @use_memdelay - do we charge this to memory delay for PSI
1767
+ * @q: the request queue IO was submitted on
1768
+ * @use_memdelay: do we charge this to memory delay for PSI
17881769 *
17891770 * This is called by the IO controller when we know there's delay accumulated
17901771 * for the blkg for this task. We do not pass the blkg because there are places
....@@ -1817,18 +1798,160 @@
18171798
18181799 /**
18191800 * blkcg_add_delay - add delay to this blkg
1820
- * @now - the current time in nanoseconds
1821
- * @delta - how many nanoseconds of delay to add
1801
+ * @blkg: blkg of interest
1802
+ * @now: the current time in nanoseconds
1803
+ * @delta: how many nanoseconds of delay to add
18221804 *
18231805 * Charge @delta to the blkg's current delay accumulation. This is used to
18241806 * throttle tasks if an IO controller thinks we need more throttling.
18251807 */
18261808 void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
18271809 {
1810
+ if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
1811
+ return;
18281812 blkcg_scale_delay(blkg, now);
18291813 atomic64_add(delta, &blkg->delay_nsec);
18301814 }
1831
-EXPORT_SYMBOL_GPL(blkcg_add_delay);
1815
+
1816
+/**
1817
+ * blkg_tryget_closest - try and get a blkg ref on the closest blkg
1818
+ * @bio: target bio
1819
+ * @css: target css
1820
+ *
1821
+ * As the failure mode here is to walk up the blkg tree, this ensures that the
1822
+ * blkg->parent pointers are always valid. This returns the blkg that it ended
1823
+ * up taking a reference on or %NULL if no reference was taken.
1824
+ */
1825
+static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
1826
+ struct cgroup_subsys_state *css)
1827
+{
1828
+ struct blkcg_gq *blkg, *ret_blkg = NULL;
1829
+
1830
+ rcu_read_lock();
1831
+ blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
1832
+ while (blkg) {
1833
+ if (blkg_tryget(blkg)) {
1834
+ ret_blkg = blkg;
1835
+ break;
1836
+ }
1837
+ blkg = blkg->parent;
1838
+ }
1839
+ rcu_read_unlock();
1840
+
1841
+ return ret_blkg;
1842
+}
1843
+
1844
+/**
1845
+ * bio_associate_blkg_from_css - associate a bio with a specified css
1846
+ * @bio: target bio
1847
+ * @css: target css
1848
+ *
1849
+ * Associate @bio with the blkg found by combining the css's blkg and the
1850
+ * request_queue of the @bio. An association failure is handled by walking up
1851
+ * the blkg tree. Therefore, the blkg associated can be anything between @blkg
1852
+ * and q->root_blkg. This situation only happens when a cgroup is dying and
1853
+ * then the remaining bios will spill to the closest alive blkg.
1854
+ *
1855
+ * A reference will be taken on the blkg and will be released when @bio is
1856
+ * freed.
1857
+ */
1858
+void bio_associate_blkg_from_css(struct bio *bio,
1859
+ struct cgroup_subsys_state *css)
1860
+{
1861
+ if (bio->bi_blkg)
1862
+ blkg_put(bio->bi_blkg);
1863
+
1864
+ if (css && css->parent) {
1865
+ bio->bi_blkg = blkg_tryget_closest(bio, css);
1866
+ } else {
1867
+ blkg_get(bio->bi_disk->queue->root_blkg);
1868
+ bio->bi_blkg = bio->bi_disk->queue->root_blkg;
1869
+ }
1870
+}
1871
+EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
1872
+
1873
+/**
1874
+ * bio_associate_blkg - associate a bio with a blkg
1875
+ * @bio: target bio
1876
+ *
1877
+ * Associate @bio with the blkg found from the bio's css and request_queue.
1878
+ * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is
1879
+ * already associated, the css is reused and association redone as the
1880
+ * request_queue may have changed.
1881
+ */
1882
+void bio_associate_blkg(struct bio *bio)
1883
+{
1884
+ struct cgroup_subsys_state *css;
1885
+
1886
+ rcu_read_lock();
1887
+
1888
+ if (bio->bi_blkg)
1889
+ css = &bio_blkcg(bio)->css;
1890
+ else
1891
+ css = blkcg_css();
1892
+
1893
+ bio_associate_blkg_from_css(bio, css);
1894
+
1895
+ rcu_read_unlock();
1896
+}
1897
+EXPORT_SYMBOL_GPL(bio_associate_blkg);
1898
+
1899
+/**
1900
+ * bio_clone_blkg_association - clone blkg association from src to dst bio
1901
+ * @dst: destination bio
1902
+ * @src: source bio
1903
+ */
1904
+void bio_clone_blkg_association(struct bio *dst, struct bio *src)
1905
+{
1906
+ if (src->bi_blkg)
1907
+ bio_associate_blkg_from_css(dst, &bio_blkcg(src)->css);
1908
+}
1909
+EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
1910
+
1911
+static int blk_cgroup_io_type(struct bio *bio)
1912
+{
1913
+ if (op_is_discard(bio->bi_opf))
1914
+ return BLKG_IOSTAT_DISCARD;
1915
+ if (op_is_write(bio->bi_opf))
1916
+ return BLKG_IOSTAT_WRITE;
1917
+ return BLKG_IOSTAT_READ;
1918
+}
1919
+
1920
+void blk_cgroup_bio_start(struct bio *bio)
1921
+{
1922
+ int rwd = blk_cgroup_io_type(bio), cpu;
1923
+ struct blkg_iostat_set *bis;
1924
+
1925
+ cpu = get_cpu();
1926
+ bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
1927
+ u64_stats_update_begin(&bis->sync);
1928
+
1929
+ /*
1930
+ * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
1931
+ * bio and we would have already accounted for the size of the bio.
1932
+ */
1933
+ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
1934
+ bio_set_flag(bio, BIO_CGROUP_ACCT);
1935
+ bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
1936
+ }
1937
+ bis->cur.ios[rwd]++;
1938
+
1939
+ u64_stats_update_end(&bis->sync);
1940
+ if (cgroup_subsys_on_dfl(io_cgrp_subsys))
1941
+ cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
1942
+ put_cpu();
1943
+}
1944
+
1945
+static int __init blkcg_init(void)
1946
+{
1947
+ blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
1948
+ WQ_MEM_RECLAIM | WQ_FREEZABLE |
1949
+ WQ_UNBOUND | WQ_SYSFS, 0);
1950
+ if (!blkcg_punt_bio_wq)
1951
+ return -ENOMEM;
1952
+ return 0;
1953
+}
1954
+subsys_initcall(blkcg_init);
18321955
18331956 module_param(blkcg_debug_stats, bool, 0644);
18341957 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");