From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/block/blk-throttle.c | 393 +++++++++++++++++++++++++++++--------------------------- 1 files changed, 203 insertions(+), 190 deletions(-) diff --git a/kernel/block/blk-throttle.c b/kernel/block/blk-throttle.c index 853b177..c526fdd 100644 --- a/kernel/block/blk-throttle.c +++ b/kernel/block/blk-throttle.c @@ -12,12 +12,13 @@ #include <linux/blktrace_api.h> #include <linux/blk-cgroup.h> #include "blk.h" +#include "blk-cgroup-rwstat.h" /* Max dispatch from a group in 1 round */ -static int throtl_grp_quantum = 8; +#define THROTL_GRP_QUANTUM 8 /* Total max dispatch from all groups in one round */ -static int throtl_quantum = 32; +#define THROTL_QUANTUM 32 /* Throttling is performed over a slice and after that slice is renewed */ #define DFL_THROTL_SLICE_HD (HZ / 10) @@ -84,8 +85,7 @@ * RB tree of active children throtl_grp's, which are sorted by * their ->disptime. */ - struct rb_root pending_tree; /* RB tree of active tgs */ - struct rb_node *first_pending; /* first node in the tree */ + struct rb_root_cached pending_tree; /* RB tree of active tgs */ unsigned int nr_pending; /* # queued in the tree */ unsigned long first_pending_disptime; /* disptime of the first tg */ struct timer_list pending_timer; /* fires on first_pending_disptime */ @@ -150,7 +150,7 @@ /* user configured IOPS limits */ unsigned int iops_conf[2][LIMIT_CNT]; - /* Number of bytes disptached in current slice */ + /* Number of bytes dispatched in current slice */ uint64_t bytes_disp[2]; /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; @@ -177,6 +177,12 @@ unsigned int bio_cnt; /* total bios */ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; + + atomic_t io_split_cnt[2]; + atomic_t last_io_split_cnt[2]; + + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; }; /* We measure latency for request size from <= 4k to >= 1M */ @@ -420,12 +426,13 @@ */ static struct bio *throtl_peek_queued(struct list_head *queued) { - struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); + struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; + qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_peek(&qn->bios); WARN_ON_ONCE(!bio); return bio; @@ -448,12 +455,13 @@ static struct bio *throtl_pop_queued(struct list_head *queued, struct throtl_grp **tg_to_put) { - struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); + struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; + qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_pop(&qn->bios); WARN_ON_ONCE(!bio); @@ -475,18 +483,26 @@ { INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[1]); - sq->pending_tree = RB_ROOT; + sq->pending_tree = RB_ROOT_CACHED; timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } -static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, + struct request_queue *q, + struct blkcg *blkcg) { struct throtl_grp *tg; int rw; - tg = kzalloc_node(sizeof(*tg), gfp, node); + tg = kzalloc_node(sizeof(*tg), gfp, q->node); if (!tg) return NULL; + + if (blkg_rwstat_init(&tg->stat_bytes, gfp)) + goto err_free_tg; + + if (blkg_rwstat_init(&tg->stat_ios, gfp)) + goto err_exit_stat_bytes; throtl_service_queue_init(&tg->service_queue); @@ -512,6 +528,12 @@ tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; return &tg->pd; + +err_exit_stat_bytes: + blkg_rwstat_exit(&tg->stat_bytes); +err_free_tg: + kfree(tg); + return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) @@ -610,37 +632,28 @@ struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); + blkg_rwstat_exit(&tg->stat_bytes); + blkg_rwstat_exit(&tg->stat_ios); kfree(tg); } static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { - /* Service tree is empty */ - if (!parent_sq->nr_pending) + struct rb_node *n; + + n = rb_first_cached(&parent_sq->pending_tree); + WARN_ON_ONCE(!n); + if (!n) return NULL; - - if (!parent_sq->first_pending) - parent_sq->first_pending = rb_first(&parent_sq->pending_tree); - - if (parent_sq->first_pending) - return rb_entry_tg(parent_sq->first_pending); - - return NULL; -} - -static void rb_erase_init(struct rb_node *n, struct rb_root *root) -{ - rb_erase(n, root); - RB_CLEAR_NODE(n); + return rb_entry_tg(n); } static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *parent_sq) { - if (parent_sq->first_pending == n) - parent_sq->first_pending = NULL; - rb_erase_init(n, &parent_sq->pending_tree); + rb_erase_cached(n, &parent_sq->pending_tree); + RB_CLEAR_NODE(n); --parent_sq->nr_pending; } @@ -658,11 +671,11 @@ static void tg_service_queue_add(struct throtl_grp *tg) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; - struct rb_node **node = &parent_sq->pending_tree.rb_node; + struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; struct rb_node *parent = NULL; struct throtl_grp *__tg; unsigned long key = tg->disptime; - int left = 1; + bool leftmost = true; while (*node != NULL) { parent = *node; @@ -672,40 +685,30 @@ node = &parent->rb_left; else { node = &parent->rb_right; - left = 0; + leftmost = false; } } - if (left) - parent_sq->first_pending = &tg->rb_node; - rb_link_node(&tg->rb_node, parent, node); - rb_insert_color(&tg->rb_node, &parent_sq->pending_tree); -} - -static void __throtl_enqueue_tg(struct throtl_grp *tg) -{ - tg_service_queue_add(tg); - tg->flags |= THROTL_TG_PENDING; - tg->service_queue.parent_sq->nr_pending++; + rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, + leftmost); } static void throtl_enqueue_tg(struct throtl_grp *tg) { - if (!(tg->flags & THROTL_TG_PENDING)) - __throtl_enqueue_tg(tg); -} - -static void __throtl_dequeue_tg(struct throtl_grp *tg) -{ - throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); - tg->flags &= ~THROTL_TG_PENDING; + if (!(tg->flags & THROTL_TG_PENDING)) { + tg_service_queue_add(tg); + tg->flags |= THROTL_TG_PENDING; + tg->service_queue.parent_sq->nr_pending++; + } } static void throtl_dequeue_tg(struct throtl_grp *tg) { - if (tg->flags & THROTL_TG_PENDING) - __throtl_dequeue_tg(tg); + if (tg->flags & THROTL_TG_PENDING) { + throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); + tg->flags &= ~THROTL_TG_PENDING; + } } /* Call with queue lock held */ @@ -771,6 +774,8 @@ tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; + atomic_set(&tg->io_split_cnt[rw], 0); + /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used @@ -793,6 +798,9 @@ tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + + atomic_set(&tg->io_split_cnt[rw], 0); + throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -808,7 +816,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { - tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); + throtl_set_slice_end(tg, rw, jiffy_end); throtl_log(&tg->service_queue, "[%c] extend slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -843,7 +851,7 @@ /* * A bio has been dispatched. Also adjust slice_end. It might happen * that initially cgroup limit was very low resulting in high - * slice_end, but later limit was bumped up and bio was dispached + * slice_end, but later limit was bumped up and bio was dispatched * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. */ @@ -885,12 +893,18 @@ } static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, - unsigned long *wait) + u32 iops_limit, unsigned long *wait) { bool rw = bio_data_dir(bio); unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp; + + if (iops_limit == UINT_MAX) { + if (wait) + *wait = 0; + return true; + } jiffy_elapsed = jiffies - tg->slice_start[rw]; @@ -904,7 +918,7 @@ * have been trimmed. */ - tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd; + tmp = (u64)iops_limit * jiffy_elapsed_rnd; do_div(tmp, HZ); if (tmp > UINT_MAX) @@ -927,12 +941,18 @@ } static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, - unsigned long *wait) + u64 bps_limit, unsigned long *wait) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes, tmp; + u64 bytes_allowed, extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); + + if (bps_limit == U64_MAX) { + if (wait) + *wait = 0; + return true; + } jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -941,10 +961,8 @@ jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - - tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd; - do_div(tmp, HZ); - bytes_allowed = tmp; + bytes_allowed = mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed_rnd, + (u64)HZ); if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { if (wait) @@ -954,7 +972,7 @@ /* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; - jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw)); + jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); if (!jiffy_wait) jiffy_wait = 1; @@ -978,6 +996,8 @@ { bool rw = bio_data_dir(bio); unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; + u64 bps_limit = tg_bps_limit(tg, rw); + u32 iops_limit = tg_iops_limit(tg, rw); /* * Currently whole state machine of group depends on first bio @@ -989,8 +1009,7 @@ bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ - if (tg_bps_limit(tg, rw) == U64_MAX && - tg_iops_limit(tg, rw) == UINT_MAX) { + if (bps_limit == U64_MAX && iops_limit == UINT_MAX) { if (wait) *wait = 0; return true; @@ -1012,8 +1031,11 @@ jiffies + tg->td->throtl_slice); } - if (tg_with_in_bps_limit(tg, bio, &bps_wait) && - tg_with_in_iops_limit(tg, bio, &iops_wait)) { + if (iops_limit != UINT_MAX) + tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0); + + if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && + tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) *wait = 0; return true; @@ -1073,7 +1095,7 @@ * If @tg doesn't currently have any bios queued in the same * direction, queueing @bio can change when @tg should be * dispatched. Mark that @tg was empty. This is automatically - * cleaered on the next tg_update_disptime(). + * cleared on the next tg_update_disptime(). */ if (!sq->nr_queued[rw]) tg->flags |= THROTL_TG_WAS_EMPTY; @@ -1166,8 +1188,8 @@ { struct throtl_service_queue *sq = &tg->service_queue; unsigned int nr_reads = 0, nr_writes = 0; - unsigned int max_nr_reads = throtl_grp_quantum*3/4; - unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; + unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; + unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; struct bio *bio; /* Try to dispatch 75% READS and 25% WRITES */ @@ -1200,9 +1222,13 @@ unsigned int nr_disp = 0; while (1) { - struct throtl_grp *tg = throtl_rb_first(parent_sq); + struct throtl_grp *tg; struct throtl_service_queue *sq; + if (!parent_sq->nr_pending) + break; + + tg = throtl_rb_first(parent_sq); if (!tg) break; @@ -1217,7 +1243,7 @@ if (sq->nr_queued[0] || sq->nr_queued[1]) tg_update_disptime(tg); - if (nr_disp >= throtl_quantum) + if (nr_disp >= THROTL_QUANTUM) break; } @@ -1228,7 +1254,7 @@ struct throtl_grp *this_tg); /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer - * @arg: the throtl_service_queue being serviced + * @t: the pending_timer member of the throtl_service_queue being serviced * * This timer is armed when a child throtl_grp with active bio's become * pending and queued on the service_queue's pending_tree and expires when @@ -1251,7 +1277,7 @@ bool dispatched; int ret; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (throtl_can_upgrade(td, NULL)) throtl_upgrade_state(td); @@ -1274,9 +1300,9 @@ break; /* this dispatch windows is still open, relax and repeat */ - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); cpu_relax(); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); } if (!dispatched) @@ -1294,19 +1320,19 @@ } } } else { - /* reached the top-level, queue issueing */ + /* reached the top-level, queue issuing */ queue_work(kthrotld_workqueue, &td->dispatch_work); } out_unlock: - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); } /** * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work * @work: work item being executed * - * This function is queued for execution when bio's reach the bio_lists[] - * of throtl_data->service_queue. Those bio's are ready and issued by this + * This function is queued for execution when bios reach the bio_lists[] + * of throtl_data->service_queue. Those bios are ready and issued by this * function. */ static void blk_throtl_dispatch_work_fn(struct work_struct *work) @@ -1322,16 +1348,16 @@ bio_list_init(&bio_list_on_stack); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); for (rw = READ; rw <= WRITE; rw++) while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) bio_list_add(&bio_list_on_stack, bio); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); - while((bio = bio_list_pop(&bio_list_on_stack))) - generic_make_request(bio); + while ((bio = bio_list_pop(&bio_list_on_stack))) + submit_bio_noacct(bio); blk_finish_plug(&plug); } } @@ -1419,8 +1445,8 @@ * that a group's limit are dropped suddenly and we don't want to * account recently dispatched IO with new low rate. */ - throtl_start_new_slice(tg, 0); - throtl_start_new_slice(tg, 1); + throtl_start_new_slice(tg, READ); + throtl_start_new_slice(tg, WRITE); if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); @@ -1473,6 +1499,32 @@ return tg_set_conf(of, buf, nbytes, off, false); } +static int tg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + +static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat_sample sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, + &sum); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + tg_prfill_rwstat_recursive, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", @@ -1500,23 +1552,23 @@ }, { .name = "throttle.io_service_bytes", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_bytes, + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_bytes_recursive, + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat_recursive, }, { .name = "throttle.io_serviced", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_ios, + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_throtl, - .seq_show = blkg_print_stat_ios_recursive, + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat_recursive, }, { } /* terminate */ }; @@ -1639,13 +1691,13 @@ goto out_finish; ret = -EINVAL; - if (!strcmp(tok, "rbps")) + if (!strcmp(tok, "rbps") && val > 1) v[0] = val; - else if (!strcmp(tok, "wbps")) + else if (!strcmp(tok, "wbps") && val > 1) v[1] = val; - else if (!strcmp(tok, "riops")) + else if (!strcmp(tok, "riops") && val > 1) v[2] = min_t(u64, val, UINT_MAX); - else if (!strcmp(tok, "wiops")) + else if (!strcmp(tok, "wiops") && val > 1) v[3] = min_t(u64, val, UINT_MAX); else if (off == LIMIT_LOW && !strcmp(tok, "idle")) idle_time = val; @@ -1922,7 +1974,7 @@ queue_work(kthrotld_workqueue, &td->dispatch_work); } -static void throtl_downgrade_state(struct throtl_data *td, int new) +static void throtl_downgrade_state(struct throtl_data *td) { td->scale /= 2; @@ -1932,7 +1984,7 @@ return; } - td->limit_index = new; + td->limit_index = LIMIT_LOW; td->low_downgrade_time = jiffies; } @@ -2003,12 +2055,14 @@ } if (tg->iops[READ][LIMIT_LOW]) { + tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0); iops = tg->last_io_disp[READ] * HZ / elapsed_time; if (iops >= tg->iops[READ][LIMIT_LOW]) tg->last_low_overflow_time[READ] = now; } if (tg->iops[WRITE][LIMIT_LOW]) { + tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0); iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; if (iops >= tg->iops[WRITE][LIMIT_LOW]) tg->last_low_overflow_time[WRITE] = now; @@ -2019,7 +2073,7 @@ * cgroups */ if (throtl_hierarchy_can_downgrade(tg)) - throtl_downgrade_state(tg->td, LIMIT_LOW); + throtl_downgrade_state(tg->td); tg->last_bytes_disp[READ] = 0; tg->last_bytes_disp[WRITE] = 0; @@ -2029,10 +2083,14 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) { - unsigned long now = ktime_get_ns() >> 10; + unsigned long now; unsigned long last_finish_time = tg->last_finish_time; - if (now <= last_finish_time || last_finish_time == 0 || + if (last_finish_time == 0) + return; + + now = ktime_get_ns() >> 10; + if (now <= last_finish_time || last_finish_time == tg->checked_last_finish_time) return; @@ -2048,7 +2106,7 @@ unsigned long last_latency[2] = { 0 }; unsigned long latency[2]; - if (!blk_queue_nonrot(td->queue)) + if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW]) return; if (time_before(jiffies, td->last_calculate_time + HZ)) return; @@ -2123,40 +2181,55 @@ } #endif -static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) +void blk_throtl_charge_bio_split(struct bio *bio) { -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - /* fallback to root_blkg if we fail to get a blkg ref */ - if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV)) - bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg); - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -#endif + struct blkcg_gq *blkg = bio->bi_blkg; + struct throtl_grp *parent = blkg_to_tg(blkg); + struct throtl_service_queue *parent_sq; + bool rw = bio_data_dir(bio); + + do { + if (!parent->has_rules[rw]) + break; + + atomic_inc(&parent->io_split_cnt[rw]); + atomic_inc(&parent->last_io_split_cnt[rw]); + + parent_sq = parent->service_queue.parent_sq; + parent = sq_to_tg(parent_sq); + } while (parent); } -bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, - struct bio *bio) +bool blk_throtl_bio(struct bio *bio) { + struct request_queue *q = bio->bi_disk->queue; + struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); + struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; struct throtl_data *td = tg->td; - WARN_ON_ONCE(!rcu_read_lock_held()); + rcu_read_lock(); /* see throtl_charge_bio() */ - if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) + if (bio_flagged(bio, BIO_THROTTLED)) goto out; - spin_lock_irq(q->queue_lock); + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); + } + + if (!tg->has_rules[rw]) + goto out; + + spin_lock_irq(&q->queue_lock); throtl_update_latency_buckets(td); - if (unlikely(blk_queue_bypass(q))) - goto out_unlock; - - blk_throtl_assoc_bio(tg, bio); blk_throtl_update_idletime(tg); sq = &tg->service_queue; @@ -2199,7 +2272,7 @@ /* * @bio passed through this layer without being throttled. - * Climb up the ladder. If we''re already at the top, it + * Climb up the ladder. If we're already at the top, it * can be executed directly. */ qn = &tg->qnode_on_parent[rw]; @@ -2235,7 +2308,7 @@ } out_unlock: - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); out: bio_set_flag(bio, BIO_THROTTLED); @@ -2243,6 +2316,7 @@ if (throttled || !td->track_bio_latency) bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; #endif + rcu_read_unlock(); return throttled; } @@ -2271,7 +2345,8 @@ struct request_queue *q = rq->q; struct throtl_data *td = q->td; - throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10); + throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq), + time_ns >> 10); } void blk_throtl_bio_endio(struct bio *bio) @@ -2288,6 +2363,8 @@ if (!blkg) return; tg = blkg_to_tg(blkg); + if (!tg->td->limit_valid[LIMIT_LOW]) + return; finish_time_ns = ktime_get_ns(); tg->last_finish_time = finish_time_ns >> 10; @@ -2326,70 +2403,6 @@ } } #endif - -/* - * Dispatch all bios from all children tg's queued on @parent_sq. On - * return, @parent_sq is guaranteed to not have any active children tg's - * and all bios from previously active tg's are on @parent_sq->bio_lists[]. - */ -static void tg_drain_bios(struct throtl_service_queue *parent_sq) -{ - struct throtl_grp *tg; - - while ((tg = throtl_rb_first(parent_sq))) { - struct throtl_service_queue *sq = &tg->service_queue; - struct bio *bio; - - throtl_dequeue_tg(tg); - - while ((bio = throtl_peek_queued(&sq->queued[READ]))) - tg_dispatch_one_bio(tg, bio_data_dir(bio)); - while ((bio = throtl_peek_queued(&sq->queued[WRITE]))) - tg_dispatch_one_bio(tg, bio_data_dir(bio)); - } -} - -/** - * blk_throtl_drain - drain throttled bios - * @q: request_queue to drain throttled bios for - * - * Dispatch all currently throttled bios on @q through ->make_request_fn(). - */ -void blk_throtl_drain(struct request_queue *q) - __releases(q->queue_lock) __acquires(q->queue_lock) -{ - struct throtl_data *td = q->td; - struct blkcg_gq *blkg; - struct cgroup_subsys_state *pos_css; - struct bio *bio; - int rw; - - queue_lockdep_assert_held(q); - rcu_read_lock(); - - /* - * Drain each tg while doing post-order walk on the blkg tree, so - * that all bios are propagated to td->service_queue. It'd be - * better to walk service_queue tree directly but blkg walk is - * easier. - */ - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) - tg_drain_bios(&blkg_to_tg(blkg)->service_queue); - - /* finally, transfer bios from top-level tg's into the td */ - tg_drain_bios(&td->service_queue); - - rcu_read_unlock(); - spin_unlock_irq(q->queue_lock); - - /* all bios now should be in td->service_queue, issue them */ - for (rw = READ; rw <= WRITE; rw++) - while ((bio = throtl_pop_queued(&td->service_queue.queued[rw], - NULL))) - generic_make_request(bio); - - spin_lock_irq(q->queue_lock); -} int blk_throtl_init(struct request_queue *q) { @@ -2469,7 +2482,7 @@ td->throtl_slice = DFL_THROTL_SLICE_HD; #endif - td->track_bio_latency = !queue_is_rq_based(q); + td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); } -- Gitblit v1.6.2