From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/block/blk-iolatency.c | 422 +++++++++++++++++++++++++++++++--------------------- 1 files changed, 250 insertions(+), 172 deletions(-) diff --git a/kernel/block/blk-iolatency.c b/kernel/block/blk-iolatency.c index ebc2bdc..74511a0 100644 --- a/kernel/block/blk-iolatency.c +++ b/kernel/block/blk-iolatency.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Block rq-qos base io controller * @@ -85,17 +86,22 @@ struct blk_iolatency { struct rq_qos rqos; struct timer_list timer; - atomic_t enabled; + + /* + * ->enabled is the master enable switch gating the throttling logic and + * inflight tracking. The number of cgroups which have iolat enabled is + * tracked in ->enable_cnt, and ->enable is flipped on/off accordingly + * from ->enable_work with the request_queue frozen. For details, See + * blkiolatency_enable_work_fn(). + */ + bool enabled; + atomic_t enable_cnt; + struct work_struct enable_work; }; static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) { return container_of(rqos, struct blk_iolatency, rqos); -} - -static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat) -{ - return atomic_read(&blkiolat->enabled) > 0; } struct child_latency_info { @@ -117,9 +123,22 @@ atomic_t scale_cookie; }; +struct percentile_stats { + u64 total; + u64 missed; +}; + +struct latency_stat { + union { + struct percentile_stats ps; + struct blk_rq_stat rqs; + }; +}; + struct iolatency_grp { struct blkg_policy_data pd; - struct blk_rq_stat __percpu *stats; + struct latency_stat __percpu *stats; + struct latency_stat cur_stat; struct blk_iolatency *blkiolat; struct rq_depth rq_depth; struct rq_wait rq_wait; @@ -134,6 +153,7 @@ /* Our current number of IO's for the last summation. */ u64 nr_samples; + bool ssd; struct child_latency_info child_lat; }; @@ -174,29 +194,101 @@ return pd_to_blkg(&iolat->pd); } -static inline bool iolatency_may_queue(struct iolatency_grp *iolat, - wait_queue_entry_t *wait, - bool first_block) +static inline void latency_stat_init(struct iolatency_grp *iolat, + struct latency_stat *stat) { - struct rq_wait *rqw = &iolat->rq_wait; + if (iolat->ssd) { + stat->ps.total = 0; + stat->ps.missed = 0; + } else + blk_rq_stat_init(&stat->rqs); +} - if (first_block && waitqueue_active(&rqw->wait) && - rqw->wait.head.next != &wait->entry) - return false; +static inline void latency_stat_sum(struct iolatency_grp *iolat, + struct latency_stat *sum, + struct latency_stat *stat) +{ + if (iolat->ssd) { + sum->ps.total += stat->ps.total; + sum->ps.missed += stat->ps.missed; + } else + blk_rq_stat_sum(&sum->rqs, &stat->rqs); +} + +static inline void latency_stat_record_time(struct iolatency_grp *iolat, + u64 req_time) +{ + struct latency_stat *stat = get_cpu_ptr(iolat->stats); + if (iolat->ssd) { + if (req_time >= iolat->min_lat_nsec) + stat->ps.missed++; + stat->ps.total++; + } else + blk_rq_stat_add(&stat->rqs, req_time); + put_cpu_ptr(stat); +} + +static inline bool latency_sum_ok(struct iolatency_grp *iolat, + struct latency_stat *stat) +{ + if (iolat->ssd) { + u64 thresh = div64_u64(stat->ps.total, 10); + thresh = max(thresh, 1ULL); + return stat->ps.missed < thresh; + } + return stat->rqs.mean <= iolat->min_lat_nsec; +} + +static inline u64 latency_stat_samples(struct iolatency_grp *iolat, + struct latency_stat *stat) +{ + if (iolat->ssd) + return stat->ps.total; + return stat->rqs.nr_samples; +} + +static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, + struct latency_stat *stat) +{ + int exp_idx; + + if (iolat->ssd) + return; + + /* + * calc_load() takes in a number stored in fixed point representation. + * Because we are using this for IO time in ns, the values stored + * are significantly larger than the FIXED_1 denominator (2048). + * Therefore, rounding errors in the calculation are negligible and + * can be ignored. + */ + exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, + div64_u64(iolat->cur_win_nsec, + BLKIOLATENCY_EXP_BUCKET_SIZE)); + iolat->lat_avg = calc_load(iolat->lat_avg, + iolatency_exp_factors[exp_idx], + stat->rqs.mean); +} + +static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) +{ + atomic_dec(&rqw->inflight); + wake_up(&rqw->wait); +} + +static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) +{ + struct iolatency_grp *iolat = private_data; return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); } static void __blkcg_iolatency_throttle(struct rq_qos *rqos, struct iolatency_grp *iolat, - spinlock_t *lock, bool issue_as_root, + bool issue_as_root, bool use_memdelay) - __releases(lock) - __acquires(lock) { struct rq_wait *rqw = &iolat->rq_wait; unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); - DEFINE_WAIT(wait); - bool first_block = true; if (use_delay) blkcg_schedule_throttle(rqos->q, use_memdelay); @@ -213,27 +305,7 @@ return; } - if (iolatency_may_queue(iolat, &wait, first_block)) - return; - - do { - prepare_to_wait_exclusive(&rqw->wait, &wait, - TASK_UNINTERRUPTIBLE); - - if (iolatency_may_queue(iolat, &wait, first_block)) - break; - first_block = false; - - if (lock) { - spin_unlock_irq(lock); - io_schedule(); - spin_lock_irq(lock); - } else { - io_schedule(); - } - } while (1); - - finish_wait(&rqw->wait, &wait); + rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb); } #define SCALE_DOWN_FACTOR 2 @@ -257,7 +329,7 @@ struct child_latency_info *lat_info, bool up) { - unsigned long qd = blk_queue_depth(blkiolat->rqos.q); + unsigned long qd = blkiolat->rqos.q->nr_requests; unsigned long scale = scale_amount(qd, up); unsigned long old = atomic_read(&lat_info->scale_cookie); unsigned long max_scale = qd << 1; @@ -297,10 +369,9 @@ */ static void scale_change(struct iolatency_grp *iolat, bool up) { - unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q); + unsigned long qd = iolat->blkiolat->rqos.q->nr_requests; unsigned long scale = scale_amount(qd, up); unsigned long old = iolat->rq_depth.max_depth; - bool changed = false; if (old > qd) old = qd; @@ -310,15 +381,13 @@ return; if (old < qd) { - changed = true; old += scale; old = min(old, qd); iolat->rq_depth.max_depth = old; wake_up_all(&iolat->rq_wait.wait); } - } else if (old > 1) { + } else { old >>= 1; - changed = true; iolat->rq_depth.max_depth = max(old, 1UL); } } @@ -371,7 +440,7 @@ * scale down event. */ samples_thresh = lat_info->nr_samples * 5; - samples_thresh = div64_u64(samples_thresh, 100); + samples_thresh = max(1ULL, div64_u64(samples_thresh, 100)); if (iolat->nr_samples <= samples_thresh) return; } @@ -393,38 +462,15 @@ scale_change(iolat, direction > 0); } -static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, - spinlock_t *lock) +static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) { struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - struct blkcg *blkcg; - struct blkcg_gq *blkg; - struct request_queue *q = rqos->q; + struct blkcg_gq *blkg = bio->bi_blkg; bool issue_as_root = bio_issue_as_root_blkg(bio); - if (!blk_iolatency_enabled(blkiolat)) + if (!blkiolat->enabled) return; - rcu_read_lock(); - blkcg = bio_blkcg(bio); - bio_associate_blkcg(bio, &blkcg->css); - blkg = blkg_lookup(blkcg, q); - if (unlikely(!blkg)) { - if (!lock) - spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q); - if (IS_ERR(blkg)) - blkg = NULL; - if (!lock) - spin_unlock_irq(q->queue_lock); - } - if (!blkg) - goto out; - - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); - bio_associate_blkg(bio, blkg); -out: - rcu_read_unlock(); while (blkg && blkg->parent) { struct iolatency_grp *iolat = blkg_to_lat(blkg); if (!iolat) { @@ -433,7 +479,7 @@ } check_scale_change(iolat); - __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root, + __blkcg_iolatency_throttle(rqos, iolat, issue_as_root, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); blkg = blkg->parent; } @@ -445,7 +491,6 @@ struct bio_issue *issue, u64 now, bool issue_as_root) { - struct blk_rq_stat *rq_stat; u64 start = bio_issue_time(issue); u64 req_time; @@ -471,9 +516,7 @@ return; } - rq_stat = get_cpu_ptr(iolat->stats); - blk_rq_stat_add(rq_stat, req_time); - put_cpu_ptr(rq_stat); + latency_stat_record_time(iolat, req_time); } #define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC) @@ -484,17 +527,17 @@ struct blkcg_gq *blkg = lat_to_blkg(iolat); struct iolatency_grp *parent; struct child_latency_info *lat_info; - struct blk_rq_stat stat; + struct latency_stat stat; unsigned long flags; - int cpu, exp_idx; + int cpu; - blk_rq_stat_init(&stat); + latency_stat_init(iolat, &stat); preempt_disable(); for_each_online_cpu(cpu) { - struct blk_rq_stat *s; + struct latency_stat *s; s = per_cpu_ptr(iolat->stats, cpu); - blk_rq_stat_sum(&stat, s); - blk_rq_stat_init(s); + latency_stat_sum(iolat, &stat, s); + latency_stat_init(iolat, s); } preempt_enable(); @@ -504,43 +547,36 @@ lat_info = &parent->child_lat; - /* - * calc_load() takes in a number stored in fixed point representation. - * Because we are using this for IO time in ns, the values stored - * are significantly larger than the FIXED_1 denominator (2048). - * Therefore, rounding errors in the calculation are negligible and - * can be ignored. - */ - exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, - div64_u64(iolat->cur_win_nsec, - BLKIOLATENCY_EXP_BUCKET_SIZE)); - iolat->lat_avg = calc_load(iolat->lat_avg, - iolatency_exp_factors[exp_idx], - stat.mean); + iolat_update_total_lat_avg(iolat, &stat); /* Everything is ok and we don't need to adjust the scale. */ - if (stat.mean <= iolat->min_lat_nsec && + if (latency_sum_ok(iolat, &stat) && atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE) return; /* Somebody beat us to the punch, just bail. */ spin_lock_irqsave(&lat_info->lock, flags); + + latency_stat_sum(iolat, &iolat->cur_stat, &stat); lat_info->nr_samples -= iolat->nr_samples; - lat_info->nr_samples += stat.nr_samples; - iolat->nr_samples = stat.nr_samples; + lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat); + iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat); if ((lat_info->last_scale_event >= now || - now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) && - lat_info->scale_lat <= iolat->min_lat_nsec) + now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME)) goto out; - if (stat.mean <= iolat->min_lat_nsec && - stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) { + if (latency_sum_ok(iolat, &iolat->cur_stat) && + latency_sum_ok(iolat, &stat)) { + if (latency_stat_samples(iolat, &iolat->cur_stat) < + BLKIOLATENCY_MIN_GOOD_SAMPLES) + goto out; if (lat_info->scale_grp == iolat) { lat_info->last_scale_event = now; scale_cookie_change(iolat->blkiolat, lat_info, true); } - } else if (stat.mean > iolat->min_lat_nsec) { + } else if (lat_info->scale_lat == 0 || + lat_info->scale_lat >= iolat->min_lat_nsec) { lat_info->last_scale_event = now; if (!lat_info->scale_grp || lat_info->scale_lat > iolat->min_lat_nsec) { @@ -549,6 +585,7 @@ } scale_cookie_change(iolat->blkiolat, lat_info, false); } + latency_stat_init(iolat, &iolat->cur_stat); out: spin_unlock_irqrestore(&lat_info->lock, flags); } @@ -559,23 +596,22 @@ struct rq_wait *rqw; struct iolatency_grp *iolat; u64 window_start; - u64 now = ktime_to_ns(ktime_get()); + u64 now; bool issue_as_root = bio_issue_as_root_blkg(bio); - bool enabled = false; int inflight = 0; blkg = bio->bi_blkg; - if (!blkg) + if (!blkg || !bio_flagged(bio, BIO_TRACKED)) return; iolat = blkg_to_lat(bio->bi_blkg); if (!iolat) return; - enabled = blk_iolatency_enabled(iolat->blkiolat); - if (!enabled) + if (!iolat->blkiolat->enabled) return; + now = ktime_to_ns(ktime_get()); while (blkg && blkg->parent) { iolat = blkg_to_lat(blkg); if (!iolat) { @@ -611,6 +647,7 @@ struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); del_timer_sync(&blkiolat->timer); + flush_work(&blkiolat->enable_work); blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency); kfree(blkiolat); } @@ -640,7 +677,7 @@ * We could be exiting, don't access the pd unless we have a * ref on the blkg. */ - if (!blkg_try_get(blkg)) + if (!blkg_tryget(blkg)) continue; iolat = blkg_to_lat(blkg); @@ -682,6 +719,44 @@ rcu_read_unlock(); } +/** + * blkiolatency_enable_work_fn - Enable or disable iolatency on the device + * @work: enable_work of the blk_iolatency of interest + * + * iolatency needs to keep track of the number of in-flight IOs per cgroup. This + * is relatively expensive as it involves walking up the hierarchy twice for + * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we + * want to disable the in-flight tracking. + * + * We have to make sure that the counting is balanced - we don't want to leak + * the in-flight counts by disabling accounting in the completion path while IOs + * are in flight. This is achieved by ensuring that no IO is in flight by + * freezing the queue while flipping ->enabled. As this requires a sleepable + * context, ->enabled flipping is punted to this work function. + */ +static void blkiolatency_enable_work_fn(struct work_struct *work) +{ + struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency, + enable_work); + bool enabled; + + /* + * There can only be one instance of this function running for @blkiolat + * and it's guaranteed to be executed at least once after the latest + * ->enabled_cnt modification. Acting on the latest ->enable_cnt is + * sufficient. + * + * Also, we know @blkiolat is safe to access as ->enable_work is flushed + * in blkcg_iolatency_exit(). + */ + enabled = atomic_read(&blkiolat->enable_cnt); + if (enabled != blkiolat->enabled) { + blk_mq_freeze_queue(blkiolat->rqos.q); + blkiolat->enabled = enabled; + blk_mq_unfreeze_queue(blkiolat->rqos.q); + } +} + int blk_iolatency_init(struct request_queue *q) { struct blk_iolatency *blkiolat; @@ -693,7 +768,7 @@ return -ENOMEM; rqos = &blkiolat->rqos; - rqos->id = RQ_QOS_CGROUP; + rqos->id = RQ_QOS_LATENCY; rqos->ops = &blkcg_iolatency_ops; rqos->q = q; @@ -707,17 +782,15 @@ } timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); + INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn); return 0; } -/* - * return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise - * return 0. - */ -static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) +static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) { struct iolatency_grp *iolat = blkg_to_lat(blkg); + struct blk_iolatency *blkiolat = iolat->blkiolat; u64 oldval = iolat->min_lat_nsec; iolat->min_lat_nsec = val; @@ -725,13 +798,15 @@ iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, BLKIOLATENCY_MAX_WIN_SIZE); - if (!oldval && val) - return 1; + if (!oldval && val) { + if (atomic_inc_return(&blkiolat->enable_cnt) == 1) + schedule_work(&blkiolat->enable_work); + } if (oldval && !val) { blkcg_clear_delay(blkg); - return -1; + if (atomic_dec_return(&blkiolat->enable_cnt) == 0) + schedule_work(&blkiolat->enable_work); } - return 0; } static void iolatency_clear_scaling(struct blkcg_gq *blkg) @@ -757,21 +832,18 @@ { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkcg_gq *blkg; - struct blk_iolatency *blkiolat; struct blkg_conf_ctx ctx; struct iolatency_grp *iolat; char *p, *tok; u64 lat_val = 0; u64 oldval; int ret; - int enable = 0; ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); if (ret) return ret; iolat = blkg_to_lat(ctx.blkg); - blkiolat = iolat->blkiolat; p = ctx.body; ret = -EINVAL; @@ -800,41 +872,12 @@ blkg = ctx.blkg; oldval = iolat->min_lat_nsec; - enable = iolatency_set_min_lat_nsec(blkg, lat_val); - if (enable) { - if (!blk_get_queue(blkg->q)) { - ret = -ENODEV; - goto out; - } - - blkg_get(blkg); - } - - if (oldval != iolat->min_lat_nsec) { + iolatency_set_min_lat_nsec(blkg, lat_val); + if (oldval != iolat->min_lat_nsec) iolatency_clear_scaling(blkg); - } - ret = 0; out: blkg_conf_finish(&ctx); - if (ret == 0 && enable) { - struct iolatency_grp *tmp = blkg_to_lat(blkg); - struct blk_iolatency *blkiolat = tmp->blkiolat; - - blk_mq_freeze_queue(blkg->q); - - if (enable == 1) - atomic_inc(&blkiolat->enabled); - else if (enable == -1) - atomic_dec(&blkiolat->enabled); - else - WARN_ON_ONCE(1); - - blk_mq_unfreeze_queue(blkg->q); - - blkg_put(blkg); - blk_put_queue(blkg->q); - } return ret ?: nbytes; } @@ -859,13 +902,46 @@ return 0; } +static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, + size_t size) +{ + struct latency_stat stat; + int cpu; + + latency_stat_init(iolat, &stat); + preempt_disable(); + for_each_online_cpu(cpu) { + struct latency_stat *s; + s = per_cpu_ptr(iolat->stats, cpu); + latency_stat_sum(iolat, &stat, s); + } + preempt_enable(); + + if (iolat->rq_depth.max_depth == UINT_MAX) + return scnprintf(buf, size, " missed=%llu total=%llu depth=max", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total); + return scnprintf(buf, size, " missed=%llu total=%llu depth=%u", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total, + iolat->rq_depth.max_depth); +} + static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) { struct iolatency_grp *iolat = pd_to_lat(pd); - unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); - unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); + unsigned long long avg_lat; + unsigned long long cur_win; + if (!blkcg_debug_stats) + return 0; + + if (iolat->ssd) + return iolatency_ssd_stat(iolat, buf, size); + + avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); + cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); if (iolat->rq_depth.max_depth == UINT_MAX) return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", avg_lat, cur_win); @@ -875,15 +951,17 @@ } -static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node) +static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, + struct request_queue *q, + struct blkcg *blkcg) { struct iolatency_grp *iolat; - iolat = kzalloc_node(sizeof(*iolat), gfp, node); + iolat = kzalloc_node(sizeof(*iolat), gfp, q->node); if (!iolat) return NULL; - iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat), - __alignof__(struct blk_rq_stat), gfp); + iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat), + __alignof__(struct latency_stat), gfp); if (!iolat->stats) { kfree(iolat); return NULL; @@ -900,15 +978,21 @@ u64 now = ktime_to_ns(ktime_get()); int cpu; + if (blk_queue_nonrot(blkg->q)) + iolat->ssd = true; + else + iolat->ssd = false; + for_each_possible_cpu(cpu) { - struct blk_rq_stat *stat; + struct latency_stat *stat; stat = per_cpu_ptr(iolat->stats, cpu); - blk_rq_stat_init(stat); + latency_stat_init(iolat, stat); } + latency_stat_init(iolat, &iolat->cur_stat); rq_wait_init(&iolat->rq_wait); spin_lock_init(&iolat->child_lat.lock); - iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q); + iolat->rq_depth.queue_depth = blkg->q->nr_requests; iolat->rq_depth.max_depth = UINT_MAX; iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth; iolat->blkiolat = blkiolat; @@ -934,14 +1018,8 @@ { struct iolatency_grp *iolat = pd_to_lat(pd); struct blkcg_gq *blkg = lat_to_blkg(iolat); - struct blk_iolatency *blkiolat = iolat->blkiolat; - int ret; - ret = iolatency_set_min_lat_nsec(blkg, 0); - if (ret == 1) - atomic_inc(&blkiolat->enabled); - if (ret == -1) - atomic_dec(&blkiolat->enabled); + iolatency_set_min_lat_nsec(blkg, 0); iolatency_clear_scaling(blkg); } @@ -978,7 +1056,7 @@ static void __exit iolatency_exit(void) { - return blkcg_policy_unregister(&blkcg_policy_iolatency); + blkcg_policy_unregister(&blkcg_policy_iolatency); } module_init(iolatency_init); -- Gitblit v1.6.2