hc
2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/block/blk-throttle.c
@@ -12,12 +12,13 @@
 #include <linux/blktrace_api.h>
 #include <linux/blk-cgroup.h>
 #include "blk.h"
+#include "blk-cgroup-rwstat.h"

 /* Max dispatch from a group in 1 round */
-static int throtl_grp_quantum = 8;
+#define THROTL_GRP_QUANTUM 8

 /* Total max dispatch from all groups in one round */
-static int throtl_quantum = 32;
+#define THROTL_QUANTUM 32

 /* Throttling is performed over a slice and after that slice is renewed */
 #define DFL_THROTL_SLICE_HD (HZ / 10)
@@ -84,8 +85,7 @@
	 * RB tree of active children throtl_grp's, which are sorted by
	 * their ->disptime.
	 */
-	struct rb_root pending_tree; /* RB tree of active tgs */
-	struct rb_node *first_pending; /* first node in the tree */
+	struct rb_root_cached pending_tree; /* RB tree of active tgs */
	unsigned int nr_pending; /* # queued in the tree */
	unsigned long first_pending_disptime; /* disptime of the first tg */
	struct timer_list pending_timer; /* fires on first_pending_disptime */
@@ -150,7 +150,7 @@
	/* user configured IOPS limits */
	unsigned int iops_conf[2][LIMIT_CNT];

-	/* Number of bytes disptached in current slice */
+	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bio's dispatched in current slice */
	unsigned int io_disp[2];
@@ -177,6 +177,12 @@
	unsigned int bio_cnt; /* total bios */
	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
	unsigned long bio_cnt_reset_time;
+
+	atomic_t io_split_cnt[2];
+	atomic_t last_io_split_cnt[2];
+
+	struct blkg_rwstat stat_bytes;
+	struct blkg_rwstat stat_ios;
 };

 /* We measure latency for request size from <= 4k to >= 1M */
@@ -420,12 +426,13 @@
  */
 static struct bio *throtl_peek_queued(struct list_head *queued)
 {
-	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+	struct throtl_qnode *qn;
	struct bio *bio;

	if (list_empty(queued))
		return NULL;

+	qn = list_first_entry(queued, struct throtl_qnode, node);
	bio = bio_list_peek(&qn->bios);
	WARN_ON_ONCE(!bio);
	return bio;
@@ -448,12 +455,13 @@
 static struct bio *throtl_pop_queued(struct list_head *queued,
				     struct throtl_grp **tg_to_put)
 {
-	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+	struct throtl_qnode *qn;
	struct bio *bio;

	if (list_empty(queued))
		return NULL;

+	qn = list_first_entry(queued, struct throtl_qnode, node);
	bio = bio_list_pop(&qn->bios);
	WARN_ON_ONCE(!bio);

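For context (not part of the patch): the two hunks above delay the list_first_entry() lookup until after the list_empty() check. list_first_entry() simply computes container_of(head->next, ...), so on an empty list it yields a bogus pointer aliased onto the list head itself; the old code never dereferenced it before returning NULL, but computing it at all is fragile. A minimal self-contained illustration of the corrected ordering, using a hypothetical demo type rather than the throtl structures:

#include <linux/list.h>

struct demo_item {
	struct list_head node;
	int payload;
};

static struct demo_item *demo_first_or_null(struct list_head *queued)
{
	struct demo_item *it;

	if (list_empty(queued))
		return NULL;

	/* safe: the list is known to be non-empty at this point */
	it = list_first_entry(queued, struct demo_item, node);
	return it;
}

The kernel also provides list_first_entry_or_null(), which folds the emptiness check into the lookup.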
@@ -475,18 +483,26 @@
 {
	INIT_LIST_HEAD(&sq->queued[0]);
	INIT_LIST_HEAD(&sq->queued[1]);
-	sq->pending_tree = RB_ROOT;
+	sq->pending_tree = RB_ROOT_CACHED;
	timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
 }

-static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
+						struct request_queue *q,
+						struct blkcg *blkcg)
 {
	struct throtl_grp *tg;
	int rw;

-	tg = kzalloc_node(sizeof(*tg), gfp, node);
+	tg = kzalloc_node(sizeof(*tg), gfp, q->node);
	if (!tg)
		return NULL;
+
+	if (blkg_rwstat_init(&tg->stat_bytes, gfp))
+		goto err_free_tg;
+
+	if (blkg_rwstat_init(&tg->stat_ios, gfp))
+		goto err_exit_stat_bytes;

	throtl_service_queue_init(&tg->service_queue);

@@ -512,6 +528,12 @@
	tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;

	return &tg->pd;
+
+err_exit_stat_bytes:
+	blkg_rwstat_exit(&tg->stat_bytes);
+err_free_tg:
+	kfree(tg);
+	return NULL;
 }

 static void throtl_pd_init(struct blkg_policy_data *pd)
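For context (not part of the patch): the new stat_bytes/stat_ios members are percpu-backed blkg_rwstat counters, so throtl_pd_alloc() gains a failure path that unwinds whatever already succeeded, and throtl_pd_free() (next hunk) must release both. A minimal sketch of that pairing with a hypothetical policy-data struct, assuming the blk-cgroup-rwstat helpers used by the patch (the relative include only resolves inside block/):

#include <linux/slab.h>
#include "blk-cgroup-rwstat.h"

struct demo_pd {
	struct blkg_rwstat bytes;
	struct blkg_rwstat ios;
};

static struct demo_pd *demo_pd_alloc(gfp_t gfp, int node)
{
	struct demo_pd *pd = kzalloc_node(sizeof(*pd), gfp, node);

	if (!pd)
		return NULL;
	/* each init allocates percpu counters and may fail */
	if (blkg_rwstat_init(&pd->bytes, gfp))
		goto err_free;
	if (blkg_rwstat_init(&pd->ios, gfp))
		goto err_exit_bytes;
	return pd;

err_exit_bytes:
	blkg_rwstat_exit(&pd->bytes);
err_free:
	kfree(pd);
	return NULL;
}

static void demo_pd_free(struct demo_pd *pd)
{
	blkg_rwstat_exit(&pd->bytes);
	blkg_rwstat_exit(&pd->ios);
	kfree(pd);
}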
@@ -610,37 +632,28 @@
	struct throtl_grp *tg = pd_to_tg(pd);

	del_timer_sync(&tg->service_queue.pending_timer);
+	blkg_rwstat_exit(&tg->stat_bytes);
+	blkg_rwstat_exit(&tg->stat_ios);
	kfree(tg);
 }

 static struct throtl_grp *
 throtl_rb_first(struct throtl_service_queue *parent_sq)
 {
-	/* Service tree is empty */
-	if (!parent_sq->nr_pending)
+	struct rb_node *n;
+
+	n = rb_first_cached(&parent_sq->pending_tree);
+	WARN_ON_ONCE(!n);
+	if (!n)
		return NULL;
-
-	if (!parent_sq->first_pending)
-		parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
-
-	if (parent_sq->first_pending)
-		return rb_entry_tg(parent_sq->first_pending);
-
-	return NULL;
-}
-
-static void rb_erase_init(struct rb_node *n, struct rb_root *root)
-{
-	rb_erase(n, root);
-	RB_CLEAR_NODE(n);
+	return rb_entry_tg(n);
 }

 static void throtl_rb_erase(struct rb_node *n,
			    struct throtl_service_queue *parent_sq)
 {
-	if (parent_sq->first_pending == n)
-		parent_sq->first_pending = NULL;
-	rb_erase_init(n, &parent_sq->pending_tree);
+	rb_erase_cached(n, &parent_sq->pending_tree);
+	RB_CLEAR_NODE(n);
	--parent_sq->nr_pending;
 }

@@ -658,11 +671,11 @@
 static void tg_service_queue_add(struct throtl_grp *tg)
 {
	struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
-	struct rb_node **node = &parent_sq->pending_tree.rb_node;
+	struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
-	int left = 1;
+	bool leftmost = true;

	while (*node != NULL) {
		parent = *node;
@@ -672,40 +685,30 @@
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
-			left = 0;
+			leftmost = false;
		}
	}

-	if (left)
-		parent_sq->first_pending = &tg->rb_node;
-
	rb_link_node(&tg->rb_node, parent, node);
-	rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
-}
-
-static void __throtl_enqueue_tg(struct throtl_grp *tg)
-{
-	tg_service_queue_add(tg);
-	tg->flags |= THROTL_TG_PENDING;
-	tg->service_queue.parent_sq->nr_pending++;
+	rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree,
+			       leftmost);
 }

 static void throtl_enqueue_tg(struct throtl_grp *tg)
 {
-	if (!(tg->flags & THROTL_TG_PENDING))
-		__throtl_enqueue_tg(tg);
-}
-
-static void __throtl_dequeue_tg(struct throtl_grp *tg)
-{
-	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
-	tg->flags &= ~THROTL_TG_PENDING;
+	if (!(tg->flags & THROTL_TG_PENDING)) {
+		tg_service_queue_add(tg);
+		tg->flags |= THROTL_TG_PENDING;
+		tg->service_queue.parent_sq->nr_pending++;
+	}
 }

 static void throtl_dequeue_tg(struct throtl_grp *tg)
 {
-	if (tg->flags & THROTL_TG_PENDING)
-		__throtl_dequeue_tg(tg);
+	if (tg->flags & THROTL_TG_PENDING) {
+		throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
+		tg->flags &= ~THROTL_TG_PENDING;
+	}
 }

 /* Call with queue lock held */
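For context (not part of the patch): the hunks above replace the open-coded ->first_pending caching with the kernel's leftmost-cached rbtree (struct rb_root_cached from <linux/rbtree.h>). The tree keeps its leftmost node cached, so the "earliest disptime" lookup becomes a pointer read instead of a tree descent, and the _cached insert/erase helpers keep that cache up to date. A minimal sketch of the API with a hypothetical demo_node type:

#include <linux/rbtree.h>

struct demo_node {
	struct rb_node rb;
	unsigned long key;	/* e.g. a disptime */
};

static void demo_insert(struct rb_root_cached *root, struct demo_node *ins)
{
	struct rb_node **link = &root->rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct demo_node *cur = rb_entry(*link, struct demo_node, rb);

		parent = *link;
		if (ins->key < cur->key) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;	/* not the new minimum */
		}
	}

	rb_link_node(&ins->rb, parent, link);
	/* the third argument asks rbtree to update the cached leftmost */
	rb_insert_color_cached(&ins->rb, root, leftmost);
}

static struct demo_node *demo_first(struct rb_root_cached *root)
{
	struct rb_node *n = rb_first_cached(root);	/* cached, no descent */

	return n ? rb_entry(n, struct demo_node, rb) : NULL;
}

static void demo_erase(struct rb_root_cached *root, struct demo_node *node)
{
	rb_erase_cached(&node->rb, root);	/* fixes up the cache if needed */
	RB_CLEAR_NODE(&node->rb);
}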
@@ -771,6 +774,8 @@
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;

+	atomic_set(&tg->io_split_cnt[rw], 0);
+
	/*
	 * Previous slice has expired. We must have trimmed it after last
	 * bio dispatch. That means since start of last slice, we never used
@@ -793,6 +798,9 @@
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
+
+	atomic_set(&tg->io_split_cnt[rw], 0);
+
	throtl_log(&tg->service_queue,
		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -808,7 +816,7 @@
 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
				       unsigned long jiffy_end)
 {
-	tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
+	throtl_set_slice_end(tg, rw, jiffy_end);
	throtl_log(&tg->service_queue,
		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -843,7 +851,7 @@
	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially cgroup limit was very low resulting in high
-	 * slice_end, but later limit was bumped up and bio was dispached
+	 * slice_end, but later limit was bumped up and bio was dispatched
	 * sooner, then we need to reduce slice_end. A high bogus slice_end
	 * is bad because it does not allow new slice to start.
	 */
@@ -885,12 +893,18 @@
 }

 static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
-				  unsigned long *wait)
+				  u32 iops_limit, unsigned long *wait)
 {
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;
+
+	if (iops_limit == UINT_MAX) {
+		if (wait)
+			*wait = 0;
+		return true;
+	}

	jiffy_elapsed = jiffies - tg->slice_start[rw];

@@ -904,7 +918,7 @@
	 * have been trimmed.
	 */

-	tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
+	tmp = (u64)iops_limit * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
@@ -927,12 +941,18 @@
 }

 static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
-				 unsigned long *wait)
+				 u64 bps_limit, unsigned long *wait)
 {
	bool rw = bio_data_dir(bio);
-	u64 bytes_allowed, extra_bytes, tmp;
+	u64 bytes_allowed, extra_bytes;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	unsigned int bio_size = throtl_bio_data_size(bio);
+
+	if (bps_limit == U64_MAX) {
+		if (wait)
+			*wait = 0;
+		return true;
+	}

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

@@ -941,10 +961,8 @@
		jiffy_elapsed_rnd = tg->td->throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
-
-	tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
-	do_div(tmp, HZ);
-	bytes_allowed = tmp;
+	bytes_allowed = mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed_rnd,
+					    (u64)HZ);

	if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
		if (wait)
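For context (not part of the patch): the old sequence multiplied the bps limit by the elapsed jiffies in plain u64 arithmetic and then used do_div(), which can wrap when the configured limit is very large. mul_u64_u64_div_u64() from <linux/math64.h> keeps the full intermediate product, so a*b/c stays exact whenever the final quotient fits in 64 bits. A minimal sketch with a hypothetical helper name:

#include <linux/jiffies.h>
#include <linux/math64.h>

/* bytes allowed after 'elapsed' jiffies at 'bps_limit' bytes per second */
static u64 demo_bytes_allowed(u64 bps_limit, unsigned long elapsed)
{
	/* computes bps_limit * elapsed / HZ without truncating the product */
	return mul_u64_u64_div_u64(bps_limit, (u64)elapsed, (u64)HZ);
}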
@@ -954,7 +972,7 @@

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
-	jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
+	jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);

	if (!jiffy_wait)
		jiffy_wait = 1;
@@ -978,6 +996,8 @@
 {
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
+	u64 bps_limit = tg_bps_limit(tg, rw);
+	u32 iops_limit = tg_iops_limit(tg, rw);

	/*
	 * Currently whole state machine of group depends on first bio
@@ -989,8 +1009,7 @@
	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));

	/* If tg->bps = -1, then BW is unlimited */
-	if (tg_bps_limit(tg, rw) == U64_MAX &&
-	    tg_iops_limit(tg, rw) == UINT_MAX) {
+	if (bps_limit == U64_MAX && iops_limit == UINT_MAX) {
		if (wait)
			*wait = 0;
		return true;
@@ -1012,8 +1031,11 @@
					jiffies + tg->td->throtl_slice);
	}

-	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
-	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
+	if (iops_limit != UINT_MAX)
+		tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0);
+
+	if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
+	    tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
		if (wait)
			*wait = 0;
		return true;
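For context (not part of the patch): the split-bio accounting added in this series only bumps atomic counters from the submission path; tg_may_dispatch() then folds them into the lock-protected io_disp[] bookkeeping with atomic_xchg(..., 0), which reads and clears in a single step so racing increments are neither lost nor counted twice. A minimal sketch of that hand-off with hypothetical names:

#include <linux/atomic.h>

struct demo_split_account {
	atomic_t split_cnt;	/* bumped locklessly when a bio is split */
	unsigned int io_disp;	/* owned by the throttler, under its lock */
};

static void demo_charge_split(struct demo_split_account *a)
{
	atomic_inc(&a->split_cnt);	/* producer side, no lock needed */
}

static void demo_fold_splits(struct demo_split_account *a)
{
	/* consumer: take whatever accumulated and reset to zero atomically */
	a->io_disp += atomic_xchg(&a->split_cnt, 0);
}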
@@ -1073,7 +1095,7 @@
	 * If @tg doesn't currently have any bios queued in the same
	 * direction, queueing @bio can change when @tg should be
	 * dispatched. Mark that @tg was empty. This is automatically
-	 * cleaered on the next tg_update_disptime().
+	 * cleared on the next tg_update_disptime().
	 */
	if (!sq->nr_queued[rw])
		tg->flags |= THROTL_TG_WAS_EMPTY;
@@ -1166,8 +1188,8 @@
 {
	struct throtl_service_queue *sq = &tg->service_queue;
	unsigned int nr_reads = 0, nr_writes = 0;
-	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
-	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
+	unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4;
+	unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */
@@ -1200,9 +1222,13 @@
	unsigned int nr_disp = 0;

	while (1) {
-		struct throtl_grp *tg = throtl_rb_first(parent_sq);
+		struct throtl_grp *tg;
		struct throtl_service_queue *sq;

+		if (!parent_sq->nr_pending)
+			break;
+
+		tg = throtl_rb_first(parent_sq);
		if (!tg)
			break;

@@ -1217,7 +1243,7 @@
		if (sq->nr_queued[0] || sq->nr_queued[1])
			tg_update_disptime(tg);

-		if (nr_disp >= throtl_quantum)
+		if (nr_disp >= THROTL_QUANTUM)
			break;
	}

@@ -1228,7 +1254,7 @@
	struct throtl_grp *this_tg);
 /**
  * throtl_pending_timer_fn - timer function for service_queue->pending_timer
- * @arg: the throtl_service_queue being serviced
+ * @t: the pending_timer member of the throtl_service_queue being serviced
  *
  * This timer is armed when a child throtl_grp with active bio's become
  * pending and queued on the service_queue's pending_tree and expires when
@@ -1251,7 +1277,7 @@
	bool dispatched;
	int ret;

-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
	if (throtl_can_upgrade(td, NULL))
		throtl_upgrade_state(td);

@@ -1274,9 +1300,9 @@
			break;

		/* this dispatch windows is still open, relax and repeat */
-		spin_unlock_irq(q->queue_lock);
+		spin_unlock_irq(&q->queue_lock);
		cpu_relax();
-		spin_lock_irq(q->queue_lock);
+		spin_lock_irq(&q->queue_lock);
	}

	if (!dispatched)
@@ -1294,19 +1320,19 @@
			}
		}
	} else {
-		/* reached the top-level, queue issueing */
+		/* reached the top-level, queue issuing */
		queue_work(kthrotld_workqueue, &td->dispatch_work);
	}
 out_unlock:
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 }

 /**
  * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
  * @work: work item being executed
  *
- * This function is queued for execution when bio's reach the bio_lists[]
- * of throtl_data->service_queue. Those bio's are ready and issued by this
+ * This function is queued for execution when bios reach the bio_lists[]
+ * of throtl_data->service_queue. Those bios are ready and issued by this
  * function.
  */
 static void blk_throtl_dispatch_work_fn(struct work_struct *work)
@@ -1322,16 +1348,16 @@

	bio_list_init(&bio_list_on_stack);

-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
	for (rw = READ; rw <= WRITE; rw++)
		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
			bio_list_add(&bio_list_on_stack, bio);
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);

	if (!bio_list_empty(&bio_list_on_stack)) {
		blk_start_plug(&plug);
-		while((bio = bio_list_pop(&bio_list_on_stack)))
-			generic_make_request(bio);
+		while ((bio = bio_list_pop(&bio_list_on_stack)))
+			submit_bio_noacct(bio);
		blk_finish_plug(&plug);
	}
 }
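For context (not part of the patch): generic_make_request() was renamed to submit_bio_noacct() in v5.9, with no behavioural change for this caller. The surrounding code is the usual pattern for reissuing deferred bios: move them off the locked queue into an on-stack bio_list, then submit outside the lock under a plug so the block layer can batch them. A minimal sketch with a hypothetical helper:

#include <linux/bio.h>
#include <linux/blkdev.h>

static void demo_reissue_bios(struct bio_list *deferred)
{
	struct blk_plug plug;
	struct bio *bio;

	blk_start_plug(&plug);
	while ((bio = bio_list_pop(deferred)))
		submit_bio_noacct(bio);	/* was generic_make_request() */
	blk_finish_plug(&plug);
}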
@@ -1419,8 +1445,8 @@
	 * that a group's limit are dropped suddenly and we don't want to
	 * account recently dispatched IO with new low rate.
	 */
-	throtl_start_new_slice(tg, 0);
-	throtl_start_new_slice(tg, 1);
+	throtl_start_new_slice(tg, READ);
+	throtl_start_new_slice(tg, WRITE);

	if (tg->flags & THROTL_TG_PENDING) {
		tg_update_disptime(tg);
@@ -1473,6 +1499,32 @@
	return tg_set_conf(of, buf, nbytes, off, false);
 }

+static int tg_print_rwstat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  blkg_prfill_rwstat, &blkcg_policy_throtl,
+			  seq_cft(sf)->private, true);
+	return 0;
+}
+
+static u64 tg_prfill_rwstat_recursive(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat_sample sum;
+
+	blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off,
+				  &sum);
+	return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int tg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  tg_prfill_rwstat_recursive, &blkcg_policy_throtl,
+			  seq_cft(sf)->private, true);
+	return 0;
+}
+
 static struct cftype throtl_legacy_files[] = {
	{
		.name = "throttle.read_bps_device",
@@ -1500,23 +1552,23 @@
	},
	{
		.name = "throttle.io_service_bytes",
-		.private = (unsigned long)&blkcg_policy_throtl,
-		.seq_show = blkg_print_stat_bytes,
+		.private = offsetof(struct throtl_grp, stat_bytes),
+		.seq_show = tg_print_rwstat,
	},
	{
		.name = "throttle.io_service_bytes_recursive",
-		.private = (unsigned long)&blkcg_policy_throtl,
-		.seq_show = blkg_print_stat_bytes_recursive,
+		.private = offsetof(struct throtl_grp, stat_bytes),
+		.seq_show = tg_print_rwstat_recursive,
	},
	{
		.name = "throttle.io_serviced",
-		.private = (unsigned long)&blkcg_policy_throtl,
-		.seq_show = blkg_print_stat_ios,
+		.private = offsetof(struct throtl_grp, stat_ios),
+		.seq_show = tg_print_rwstat,
	},
	{
		.name = "throttle.io_serviced_recursive",
-		.private = (unsigned long)&blkcg_policy_throtl,
-		.seq_show = blkg_print_stat_ios_recursive,
+		.private = offsetof(struct throtl_grp, stat_ios),
+		.seq_show = tg_print_rwstat_recursive,
	},
	{ } /* terminate */
 };
@@ -1639,13 +1691,13 @@
			goto out_finish;

		ret = -EINVAL;
-		if (!strcmp(tok, "rbps"))
+		if (!strcmp(tok, "rbps") && val > 1)
			v[0] = val;
-		else if (!strcmp(tok, "wbps"))
+		else if (!strcmp(tok, "wbps") && val > 1)
			v[1] = val;
-		else if (!strcmp(tok, "riops"))
+		else if (!strcmp(tok, "riops") && val > 1)
			v[2] = min_t(u64, val, UINT_MAX);
-		else if (!strcmp(tok, "wiops"))
+		else if (!strcmp(tok, "wiops") && val > 1)
			v[3] = min_t(u64, val, UINT_MAX);
		else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
			idle_time = val;
@@ -1922,7 +1974,7 @@
	queue_work(kthrotld_workqueue, &td->dispatch_work);
 }

-static void throtl_downgrade_state(struct throtl_data *td, int new)
+static void throtl_downgrade_state(struct throtl_data *td)
 {
	td->scale /= 2;

@@ -1932,7 +1984,7 @@
		return;
	}

-	td->limit_index = new;
+	td->limit_index = LIMIT_LOW;
	td->low_downgrade_time = jiffies;
 }

@@ -2003,12 +2055,14 @@
	}

	if (tg->iops[READ][LIMIT_LOW]) {
+		tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0);
		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
		if (iops >= tg->iops[READ][LIMIT_LOW])
			tg->last_low_overflow_time[READ] = now;
	}

	if (tg->iops[WRITE][LIMIT_LOW]) {
+		tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0);
		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
		if (iops >= tg->iops[WRITE][LIMIT_LOW])
			tg->last_low_overflow_time[WRITE] = now;
@@ -2019,7 +2073,7 @@
	 * cgroups
	 */
	if (throtl_hierarchy_can_downgrade(tg))
-		throtl_downgrade_state(tg->td, LIMIT_LOW);
+		throtl_downgrade_state(tg->td);

	tg->last_bytes_disp[READ] = 0;
	tg->last_bytes_disp[WRITE] = 0;
@@ -2029,10 +2083,14 @@

 static void blk_throtl_update_idletime(struct throtl_grp *tg)
 {
-	unsigned long now = ktime_get_ns() >> 10;
+	unsigned long now;
	unsigned long last_finish_time = tg->last_finish_time;

-	if (now <= last_finish_time || last_finish_time == 0 ||
+	if (last_finish_time == 0)
+		return;
+
+	now = ktime_get_ns() >> 10;
+	if (now <= last_finish_time ||
	    last_finish_time == tg->checked_last_finish_time)
		return;

@@ -2048,7 +2106,7 @@
	unsigned long last_latency[2] = { 0 };
	unsigned long latency[2];

-	if (!blk_queue_nonrot(td->queue))
+	if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
		return;
	if (time_before(jiffies, td->last_calculate_time + HZ))
		return;
@@ -2123,40 +2181,55 @@
 }
 #endif

-static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
+void blk_throtl_charge_bio_split(struct bio *bio)
 {
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-	/* fallback to root_blkg if we fail to get a blkg ref */
-	if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV))
-		bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg);
-	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-#endif
+	struct blkcg_gq *blkg = bio->bi_blkg;
+	struct throtl_grp *parent = blkg_to_tg(blkg);
+	struct throtl_service_queue *parent_sq;
+	bool rw = bio_data_dir(bio);
+
+	do {
+		if (!parent->has_rules[rw])
+			break;
+
+		atomic_inc(&parent->io_split_cnt[rw]);
+		atomic_inc(&parent->last_io_split_cnt[rw]);
+
+		parent_sq = parent->service_queue.parent_sq;
+		parent = sq_to_tg(parent_sq);
+	} while (parent);
 }

-bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
-		    struct bio *bio)
+bool blk_throtl_bio(struct bio *bio)
 {
+	struct request_queue *q = bio->bi_disk->queue;
+	struct blkcg_gq *blkg = bio->bi_blkg;
	struct throtl_qnode *qn = NULL;
-	struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
	struct throtl_service_queue *sq;
	bool rw = bio_data_dir(bio);
	bool throttled = false;
	struct throtl_data *td = tg->td;

-	WARN_ON_ONCE(!rcu_read_lock_held());
+	rcu_read_lock();

	/* see throtl_charge_bio() */
-	if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
+	if (bio_flagged(bio, BIO_THROTTLED))
		goto out;

-	spin_lock_irq(q->queue_lock);
+	if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
+		blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
+				bio->bi_iter.bi_size);
+		blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
+	}
+
+	if (!tg->has_rules[rw])
+		goto out;
+
+	spin_lock_irq(&q->queue_lock);

	throtl_update_latency_buckets(td);

-	if (unlikely(blk_queue_bypass(q)))
-		goto out_unlock;
-
-	blk_throtl_assoc_bio(tg, bio);
	blk_throtl_update_idletime(tg);

	sq = &tg->service_queue;
@@ -2199,7 +2272,7 @@

		/*
		 * @bio passed through this layer without being throttled.
-		 * Climb up the ladder. If we''re already at the top, it
+		 * Climb up the ladder. If we're already at the top, it
		 * can be executed directly.
		 */
		qn = &tg->qnode_on_parent[rw];
@@ -2235,7 +2308,7 @@
	}

 out_unlock:
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 out:
	bio_set_flag(bio, BIO_THROTTLED);

@@ -2243,6 +2316,7 @@
	if (throttled || !td->track_bio_latency)
		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
 #endif
+	rcu_read_unlock();
	return throttled;
 }

@@ -2271,7 +2345,8 @@
	struct request_queue *q = rq->q;
	struct throtl_data *td = q->td;

-	throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10);
+	throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
+			     time_ns >> 10);
 }

 void blk_throtl_bio_endio(struct bio *bio)
@@ -2288,6 +2363,8 @@
	if (!blkg)
		return;
	tg = blkg_to_tg(blkg);
+	if (!tg->td->limit_valid[LIMIT_LOW])
+		return;

	finish_time_ns = ktime_get_ns();
	tg->last_finish_time = finish_time_ns >> 10;
@@ -2326,70 +2403,6 @@
	}
 }
 #endif
-
-/*
- * Dispatch all bios from all children tg's queued on @parent_sq. On
- * return, @parent_sq is guaranteed to not have any active children tg's
- * and all bios from previously active tg's are on @parent_sq->bio_lists[].
- */
-static void tg_drain_bios(struct throtl_service_queue *parent_sq)
-{
-	struct throtl_grp *tg;
-
-	while ((tg = throtl_rb_first(parent_sq))) {
-		struct throtl_service_queue *sq = &tg->service_queue;
-		struct bio *bio;
-
-		throtl_dequeue_tg(tg);
-
-		while ((bio = throtl_peek_queued(&sq->queued[READ])))
-			tg_dispatch_one_bio(tg, bio_data_dir(bio));
-		while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
-			tg_dispatch_one_bio(tg, bio_data_dir(bio));
-	}
-}
-
-/**
- * blk_throtl_drain - drain throttled bios
- * @q: request_queue to drain throttled bios for
- *
- * Dispatch all currently throttled bios on @q through ->make_request_fn().
- */
-void blk_throtl_drain(struct request_queue *q)
-	__releases(q->queue_lock) __acquires(q->queue_lock)
-{
-	struct throtl_data *td = q->td;
-	struct blkcg_gq *blkg;
-	struct cgroup_subsys_state *pos_css;
-	struct bio *bio;
-	int rw;
-
-	queue_lockdep_assert_held(q);
-	rcu_read_lock();
-
-	/*
-	 * Drain each tg while doing post-order walk on the blkg tree, so
-	 * that all bios are propagated to td->service_queue. It'd be
-	 * better to walk service_queue tree directly but blkg walk is
-	 * easier.
-	 */
-	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
-		tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
-
-	/* finally, transfer bios from top-level tg's into the td */
-	tg_drain_bios(&td->service_queue);
-
-	rcu_read_unlock();
-	spin_unlock_irq(q->queue_lock);
-
-	/* all bios now should be in td->service_queue, issue them */
-	for (rw = READ; rw <= WRITE; rw++)
-		while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
-						NULL)))
-			generic_make_request(bio);
-
-	spin_lock_irq(q->queue_lock);
-}

 int blk_throtl_init(struct request_queue *q)
 {
@@ -2469,7 +2482,7 @@
	td->throtl_slice = DFL_THROTL_SLICE_HD;
 #endif

-	td->track_bio_latency = !queue_is_rq_based(q);
+	td->track_bio_latency = !queue_is_mq(q);
	if (!td->track_bio_latency)
		blk_stat_enable_accounting(q);
 }