| .. | .. |
| 1 | +// SPDX-License-Identifier: GPL-2.0 |
| 1 | 2 | /* |
| 2 | 3 | * Block rq-qos base io controller |
| 3 | 4 | * |
| .. | .. |
| 85 | 86 | struct blk_iolatency { |
| 86 | 87 | struct rq_qos rqos; |
| 87 | 88 | struct timer_list timer; |
| 88 | | - atomic_t enabled; |
| 89 | + |
| 90 | + /* |
| 91 | + * ->enabled is the master enable switch gating the throttling logic and |
| 92 | + * inflight tracking. The number of cgroups which have iolat enabled is |
| 93 | + * tracked in ->enable_cnt, and ->enabled is flipped on/off accordingly |
| 94 | + * from ->enable_work with the request_queue frozen. For details, see |
| 95 | + * blkiolatency_enable_work_fn(). |
| 96 | + */ |
| 97 | + bool enabled; |
| 98 | + atomic_t enable_cnt; |
| 99 | + struct work_struct enable_work; |
| 89 | 100 | }; |
| 90 | 101 | |
| 91 | 102 | static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) |
| 92 | 103 | { |
| 93 | 104 | return container_of(rqos, struct blk_iolatency, rqos); |
| 94 | | -} |
| 95 | | - |
| 96 | | -static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat) |
| 97 | | -{ |
| 98 | | - return atomic_read(&blkiolat->enabled) > 0; |
| 99 | 105 | } |
| 100 | 106 | |
| 101 | 107 | struct child_latency_info { |
| .. | .. |
| 117 | 123 | atomic_t scale_cookie; |
| 118 | 124 | }; |
| 119 | 125 | |
| 126 | +struct percentile_stats { |
| 127 | + u64 total; |
| 128 | + u64 missed; |
| 129 | +}; |
| 130 | + |
| 131 | +struct latency_stat { |
| 132 | + union { |
| 133 | + struct percentile_stats ps; |
| 134 | + struct blk_rq_stat rqs; |
| 135 | + }; |
| 136 | +}; |
| 137 | + |
| 120 | 138 | struct iolatency_grp { |
| 121 | 139 | struct blkg_policy_data pd; |
| 122 | | - struct blk_rq_stat __percpu *stats; |
| 140 | + struct latency_stat __percpu *stats; |
| 141 | + struct latency_stat cur_stat; |
| 123 | 142 | struct blk_iolatency *blkiolat; |
| 124 | 143 | struct rq_depth rq_depth; |
| 125 | 144 | struct rq_wait rq_wait; |
| .. | .. |
| 134 | 153 | /* Our current number of IO's for the last summation. */ |
| 135 | 154 | u64 nr_samples; |
| 136 | 155 | |
| 156 | + bool ssd; |
| 137 | 157 | struct child_latency_info child_lat; |
| 138 | 158 | }; |
| 139 | 159 | |
| .. | .. |
| 174 | 194 | return pd_to_blkg(&iolat->pd); |
| 175 | 195 | } |
| 176 | 196 | |
| 177 | | -static inline bool iolatency_may_queue(struct iolatency_grp *iolat, |
| 178 | | - wait_queue_entry_t *wait, |
| 179 | | - bool first_block) |
| 197 | +static inline void latency_stat_init(struct iolatency_grp *iolat, |
| 198 | + struct latency_stat *stat) |
| 180 | 199 | { |
| 181 | | - struct rq_wait *rqw = &iolat->rq_wait; |
| 200 | + if (iolat->ssd) { |
| 201 | + stat->ps.total = 0; |
| 202 | + stat->ps.missed = 0; |
| 203 | + } else |
| 204 | + blk_rq_stat_init(&stat->rqs); |
| 205 | +} |
| 182 | 206 | |
| 183 | | - if (first_block && waitqueue_active(&rqw->wait) && |
| 184 | | - rqw->wait.head.next != &wait->entry) |
| 185 | | - return false; |
| 207 | +static inline void latency_stat_sum(struct iolatency_grp *iolat, |
| 208 | + struct latency_stat *sum, |
| 209 | + struct latency_stat *stat) |
| 210 | +{ |
| 211 | + if (iolat->ssd) { |
| 212 | + sum->ps.total += stat->ps.total; |
| 213 | + sum->ps.missed += stat->ps.missed; |
| 214 | + } else |
| 215 | + blk_rq_stat_sum(&sum->rqs, &stat->rqs); |
| 216 | +} |
| 217 | + |
| 218 | +static inline void latency_stat_record_time(struct iolatency_grp *iolat, |
| 219 | + u64 req_time) |
| 220 | +{ |
| 221 | + struct latency_stat *stat = get_cpu_ptr(iolat->stats); |
| 222 | + if (iolat->ssd) { |
| 223 | + if (req_time >= iolat->min_lat_nsec) |
| 224 | + stat->ps.missed++; |
| 225 | + stat->ps.total++; |
| 226 | + } else |
| 227 | + blk_rq_stat_add(&stat->rqs, req_time); |
| 228 | + put_cpu_ptr(stat); |
| 229 | +} |
| 230 | + |
| 231 | +static inline bool latency_sum_ok(struct iolatency_grp *iolat, |
| 232 | + struct latency_stat *stat) |
| 233 | +{ |
| 234 | + if (iolat->ssd) { |
| 235 | + u64 thresh = div64_u64(stat->ps.total, 10); |
| 236 | + thresh = max(thresh, 1ULL); |
| 237 | + return stat->ps.missed < thresh; |
| 238 | + } |
| 239 | + return stat->rqs.mean <= iolat->min_lat_nsec; |
| 240 | +} |
| 241 | + |
| 242 | +static inline u64 latency_stat_samples(struct iolatency_grp *iolat, |
| 243 | + struct latency_stat *stat) |
| 244 | +{ |
| 245 | + if (iolat->ssd) |
| 246 | + return stat->ps.total; |
| 247 | + return stat->rqs.nr_samples; |
| 248 | +} |
| 249 | + |
| 250 | +static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, |
| 251 | + struct latency_stat *stat) |
| 252 | +{ |
| 253 | + int exp_idx; |
| 254 | + |
| 255 | + if (iolat->ssd) |
| 256 | + return; |
| 257 | + |
| 258 | + /* |
| 259 | + * calc_load() takes in a number stored in fixed point representation. |
| 260 | + * Because we are using this for IO time in ns, the values stored |
| 261 | + * are significantly larger than the FIXED_1 denominator (2048). |
| 262 | + * Therefore, rounding errors in the calculation are negligible and |
| 263 | + * can be ignored. |
| 264 | + */ |
| 265 | + exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, |
| 266 | + div64_u64(iolat->cur_win_nsec, |
| 267 | + BLKIOLATENCY_EXP_BUCKET_SIZE)); |
| 268 | + iolat->lat_avg = calc_load(iolat->lat_avg, |
| 269 | + iolatency_exp_factors[exp_idx], |
| 270 | + stat->rqs.mean); |
| 271 | +} |
| 272 | + |
| 273 | +static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) |
| 274 | +{ |
| 275 | + atomic_dec(&rqw->inflight); |
| 276 | + wake_up(&rqw->wait); |
| 277 | +} |
| 278 | + |
| 279 | +static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) |
| 280 | +{ |
| 281 | + struct iolatency_grp *iolat = private_data; |
| 186 | 282 | return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); |
| 187 | 283 | } |
| 188 | 284 | |
| 189 | 285 | static void __blkcg_iolatency_throttle(struct rq_qos *rqos, |
| 190 | 286 | struct iolatency_grp *iolat, |
| 191 | | - spinlock_t *lock, bool issue_as_root, |
| 287 | + bool issue_as_root, |
| 192 | 288 | bool use_memdelay) |
| 193 | | - __releases(lock) |
| 194 | | - __acquires(lock) |
| 195 | 289 | { |
| 196 | 290 | struct rq_wait *rqw = &iolat->rq_wait; |
| 197 | 291 | unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); |
| 198 | | - DEFINE_WAIT(wait); |
| 199 | | - bool first_block = true; |
| 200 | 292 | |
| 201 | 293 | if (use_delay) |
| 202 | 294 | blkcg_schedule_throttle(rqos->q, use_memdelay); |
| .. | .. |
| 213 | 305 | return; |
| 214 | 306 | } |
| 215 | 307 | |
| 216 | | - if (iolatency_may_queue(iolat, &wait, first_block)) |
| 217 | | - return; |
| 218 | | - |
| 219 | | - do { |
| 220 | | - prepare_to_wait_exclusive(&rqw->wait, &wait, |
| 221 | | - TASK_UNINTERRUPTIBLE); |
| 222 | | - |
| 223 | | - if (iolatency_may_queue(iolat, &wait, first_block)) |
| 224 | | - break; |
| 225 | | - first_block = false; |
| 226 | | - |
| 227 | | - if (lock) { |
| 228 | | - spin_unlock_irq(lock); |
| 229 | | - io_schedule(); |
| 230 | | - spin_lock_irq(lock); |
| 231 | | - } else { |
| 232 | | - io_schedule(); |
| 233 | | - } |
| 234 | | - } while (1); |
| 235 | | - |
| 236 | | - finish_wait(&rqw->wait, &wait); |
| 308 | + rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb); |
| 237 | 309 | } |
| 238 | 310 | |
| 239 | 311 | #define SCALE_DOWN_FACTOR 2 |
| .. | .. |
| 257 | 329 | struct child_latency_info *lat_info, |
| 258 | 330 | bool up) |
| 259 | 331 | { |
| 260 | | - unsigned long qd = blk_queue_depth(blkiolat->rqos.q); |
| 332 | + unsigned long qd = blkiolat->rqos.q->nr_requests; |
| 261 | 333 | unsigned long scale = scale_amount(qd, up); |
| 262 | 334 | unsigned long old = atomic_read(&lat_info->scale_cookie); |
| 263 | 335 | unsigned long max_scale = qd << 1; |
| .. | .. |
| 297 | 369 | */ |
| 298 | 370 | static void scale_change(struct iolatency_grp *iolat, bool up) |
| 299 | 371 | { |
| 300 | | - unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q); |
| 372 | + unsigned long qd = iolat->blkiolat->rqos.q->nr_requests; |
| 301 | 373 | unsigned long scale = scale_amount(qd, up); |
| 302 | 374 | unsigned long old = iolat->rq_depth.max_depth; |
| 303 | | - bool changed = false; |
| 304 | 375 | |
| 305 | 376 | if (old > qd) |
| 306 | 377 | old = qd; |
| .. | .. |
| 310 | 381 | return; |
| 311 | 382 | |
| 312 | 383 | if (old < qd) { |
| 313 | | - changed = true; |
| 314 | 384 | old += scale; |
| 315 | 385 | old = min(old, qd); |
| 316 | 386 | iolat->rq_depth.max_depth = old; |
| 317 | 387 | wake_up_all(&iolat->rq_wait.wait); |
| 318 | 388 | } |
| 319 | | - } else if (old > 1) { |
| 389 | + } else { |
| 320 | 390 | old >>= 1; |
| 321 | | - changed = true; |
| 322 | 391 | iolat->rq_depth.max_depth = max(old, 1UL); |
| 323 | 392 | } |
| 324 | 393 | } |
| .. | .. |
| 371 | 440 | * scale down event. |
| 372 | 441 | */ |
| 373 | 442 | samples_thresh = lat_info->nr_samples * 5; |
| 374 | | - samples_thresh = div64_u64(samples_thresh, 100); |
| 443 | + samples_thresh = max(1ULL, div64_u64(samples_thresh, 100)); |
| 375 | 444 | if (iolat->nr_samples <= samples_thresh) |
| 376 | 445 | return; |
| 377 | 446 | } |
| .. | .. |
| 393 | 462 | scale_change(iolat, direction > 0); |
| 394 | 463 | } |
| 395 | 464 | |
| 396 | | -static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, |
| 397 | | - spinlock_t *lock) |
| 465 | +static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) |
| 398 | 466 | { |
| 399 | 467 | struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); |
| 400 | | - struct blkcg *blkcg; |
| 401 | | - struct blkcg_gq *blkg; |
| 402 | | - struct request_queue *q = rqos->q; |
| 468 | + struct blkcg_gq *blkg = bio->bi_blkg; |
| 403 | 469 | bool issue_as_root = bio_issue_as_root_blkg(bio); |
| 404 | 470 | |
| 405 | | - if (!blk_iolatency_enabled(blkiolat)) |
| 471 | + if (!blkiolat->enabled) |
| 406 | 472 | return; |
| 407 | 473 | |
| 408 | | - rcu_read_lock(); |
| 409 | | - blkcg = bio_blkcg(bio); |
| 410 | | - bio_associate_blkcg(bio, &blkcg->css); |
| 411 | | - blkg = blkg_lookup(blkcg, q); |
| 412 | | - if (unlikely(!blkg)) { |
| 413 | | - if (!lock) |
| 414 | | - spin_lock_irq(q->queue_lock); |
| 415 | | - blkg = blkg_lookup_create(blkcg, q); |
| 416 | | - if (IS_ERR(blkg)) |
| 417 | | - blkg = NULL; |
| 418 | | - if (!lock) |
| 419 | | - spin_unlock_irq(q->queue_lock); |
| 420 | | - } |
| 421 | | - if (!blkg) |
| 422 | | - goto out; |
| 423 | | - |
| 424 | | - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); |
| 425 | | - bio_associate_blkg(bio, blkg); |
| 426 | | -out: |
| 427 | | - rcu_read_unlock(); |
| 428 | 474 | while (blkg && blkg->parent) { |
| 429 | 475 | struct iolatency_grp *iolat = blkg_to_lat(blkg); |
| 430 | 476 | if (!iolat) { |
| .. | .. |
| 433 | 479 | } |
| 434 | 480 | |
| 435 | 481 | check_scale_change(iolat); |
| 436 | | - __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root, |
| 482 | + __blkcg_iolatency_throttle(rqos, iolat, issue_as_root, |
| 437 | 483 | (bio->bi_opf & REQ_SWAP) == REQ_SWAP); |
| 438 | 484 | blkg = blkg->parent; |
| 439 | 485 | } |
| .. | .. |
| 445 | 491 | struct bio_issue *issue, u64 now, |
| 446 | 492 | bool issue_as_root) |
| 447 | 493 | { |
| 448 | | - struct blk_rq_stat *rq_stat; |
| 449 | 494 | u64 start = bio_issue_time(issue); |
| 450 | 495 | u64 req_time; |
| 451 | 496 | |
| .. | .. |
| 471 | 516 | return; |
| 472 | 517 | } |
| 473 | 518 | |
| 474 | | - rq_stat = get_cpu_ptr(iolat->stats); |
| 475 | | - blk_rq_stat_add(rq_stat, req_time); |
| 476 | | - put_cpu_ptr(rq_stat); |
| 519 | + latency_stat_record_time(iolat, req_time); |
| 477 | 520 | } |
| 478 | 521 | |
| 479 | 522 | #define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC) |
| .. | .. |
| 484 | 527 | struct blkcg_gq *blkg = lat_to_blkg(iolat); |
| 485 | 528 | struct iolatency_grp *parent; |
| 486 | 529 | struct child_latency_info *lat_info; |
| 487 | | - struct blk_rq_stat stat; |
| 530 | + struct latency_stat stat; |
| 488 | 531 | unsigned long flags; |
| 489 | | - int cpu, exp_idx; |
| 532 | + int cpu; |
| 490 | 533 | |
| 491 | | - blk_rq_stat_init(&stat); |
| 534 | + latency_stat_init(iolat, &stat); |
| 492 | 535 | preempt_disable(); |
| 493 | 536 | for_each_online_cpu(cpu) { |
| 494 | | - struct blk_rq_stat *s; |
| 537 | + struct latency_stat *s; |
| 495 | 538 | s = per_cpu_ptr(iolat->stats, cpu); |
| 496 | | - blk_rq_stat_sum(&stat, s); |
| 497 | | - blk_rq_stat_init(s); |
| 539 | + latency_stat_sum(iolat, &stat, s); |
| 540 | + latency_stat_init(iolat, s); |
| 498 | 541 | } |
| 499 | 542 | preempt_enable(); |
| 500 | 543 | |
| .. | .. |
| 504 | 547 | |
| 505 | 548 | lat_info = &parent->child_lat; |
| 506 | 549 | |
| 507 | | - /* |
| 508 | | - * calc_load() takes in a number stored in fixed point representation. |
| 509 | | - * Because we are using this for IO time in ns, the values stored |
| 510 | | - * are significantly larger than the FIXED_1 denominator (2048). |
| 511 | | - * Therefore, rounding errors in the calculation are negligible and |
| 512 | | - * can be ignored. |
| 513 | | - */ |
| 514 | | - exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, |
| 515 | | - div64_u64(iolat->cur_win_nsec, |
| 516 | | - BLKIOLATENCY_EXP_BUCKET_SIZE)); |
| 517 | | - iolat->lat_avg = calc_load(iolat->lat_avg, |
| 518 | | - iolatency_exp_factors[exp_idx], |
| 519 | | - stat.mean); |
| 550 | + iolat_update_total_lat_avg(iolat, &stat); |
| 520 | 551 | |
| 521 | 552 | /* Everything is ok and we don't need to adjust the scale. */ |
| 522 | | - if (stat.mean <= iolat->min_lat_nsec && |
| 553 | + if (latency_sum_ok(iolat, &stat) && |
| 523 | 554 | atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE) |
| 524 | 555 | return; |
| 525 | 556 | |
| 526 | 557 | /* Somebody beat us to the punch, just bail. */ |
| 527 | 558 | spin_lock_irqsave(&lat_info->lock, flags); |
| 559 | + |
| 560 | + latency_stat_sum(iolat, &iolat->cur_stat, &stat); |
| 528 | 561 | lat_info->nr_samples -= iolat->nr_samples; |
| 529 | | - lat_info->nr_samples += stat.nr_samples; |
| 530 | | - iolat->nr_samples = stat.nr_samples; |
| 562 | + lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat); |
| 563 | + iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat); |
| 531 | 564 | |
| 532 | 565 | if ((lat_info->last_scale_event >= now || |
| 533 | | - now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) && |
| 534 | | - lat_info->scale_lat <= iolat->min_lat_nsec) |
| 566 | + now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME)) |
| 535 | 567 | goto out; |
| 536 | 568 | |
| 537 | | - if (stat.mean <= iolat->min_lat_nsec && |
| 538 | | - stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) { |
| 569 | + if (latency_sum_ok(iolat, &iolat->cur_stat) && |
| 570 | + latency_sum_ok(iolat, &stat)) { |
| 571 | + if (latency_stat_samples(iolat, &iolat->cur_stat) < |
| 572 | + BLKIOLATENCY_MIN_GOOD_SAMPLES) |
| 573 | + goto out; |
| 539 | 574 | if (lat_info->scale_grp == iolat) { |
| 540 | 575 | lat_info->last_scale_event = now; |
| 541 | 576 | scale_cookie_change(iolat->blkiolat, lat_info, true); |
| 542 | 577 | } |
| 543 | | - } else if (stat.mean > iolat->min_lat_nsec) { |
| 578 | + } else if (lat_info->scale_lat == 0 || |
| 579 | + lat_info->scale_lat >= iolat->min_lat_nsec) { |
| 544 | 580 | lat_info->last_scale_event = now; |
| 545 | 581 | if (!lat_info->scale_grp || |
| 546 | 582 | lat_info->scale_lat > iolat->min_lat_nsec) { |
| .. | .. |
| 549 | 585 | } |
| 550 | 586 | scale_cookie_change(iolat->blkiolat, lat_info, false); |
| 551 | 587 | } |
| 588 | + latency_stat_init(iolat, &iolat->cur_stat); |
| 552 | 589 | out: |
| 553 | 590 | spin_unlock_irqrestore(&lat_info->lock, flags); |
| 554 | 591 | } |
| .. | .. |
| 559 | 596 | struct rq_wait *rqw; |
| 560 | 597 | struct iolatency_grp *iolat; |
| 561 | 598 | u64 window_start; |
| 562 | | - u64 now = ktime_to_ns(ktime_get()); |
| 599 | + u64 now; |
| 563 | 600 | bool issue_as_root = bio_issue_as_root_blkg(bio); |
| 564 | | - bool enabled = false; |
| 565 | 601 | int inflight = 0; |
| 566 | 602 | |
| 567 | 603 | blkg = bio->bi_blkg; |
| 568 | | - if (!blkg) |
| 604 | + if (!blkg || !bio_flagged(bio, BIO_TRACKED)) |
| 569 | 605 | return; |
| 570 | 606 | |
| 571 | 607 | iolat = blkg_to_lat(bio->bi_blkg); |
| 572 | 608 | if (!iolat) |
| 573 | 609 | return; |
| 574 | 610 | |
| 575 | | - enabled = blk_iolatency_enabled(iolat->blkiolat); |
| 576 | | - if (!enabled) |
| 611 | + if (!iolat->blkiolat->enabled) |
| 577 | 612 | return; |
| 578 | 613 | |
| 614 | + now = ktime_to_ns(ktime_get()); |
| 579 | 615 | while (blkg && blkg->parent) { |
| 580 | 616 | iolat = blkg_to_lat(blkg); |
| 581 | 617 | if (!iolat) { |
| .. | .. |
| 611 | 647 | struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); |
| 612 | 648 | |
| 613 | 649 | del_timer_sync(&blkiolat->timer); |
| 650 | + flush_work(&blkiolat->enable_work); |
| 614 | 651 | blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency); |
| 615 | 652 | kfree(blkiolat); |
| 616 | 653 | } |
| .. | .. |
| 640 | 677 | * We could be exiting, don't access the pd unless we have a |
| 641 | 678 | * ref on the blkg. |
| 642 | 679 | */ |
| 643 | | - if (!blkg_try_get(blkg)) |
| 680 | + if (!blkg_tryget(blkg)) |
| 644 | 681 | continue; |
| 645 | 682 | |
| 646 | 683 | iolat = blkg_to_lat(blkg); |
| .. | .. |
| 682 | 719 | rcu_read_unlock(); |
| 683 | 720 | } |
| 684 | 721 | |
| 722 | +/** |
| 723 | + * blkiolatency_enable_work_fn - Enable or disable iolatency on the device |
| 724 | + * @work: enable_work of the blk_iolatency of interest |
| 725 | + * |
| 726 | + * iolatency needs to keep track of the number of in-flight IOs per cgroup. This |
| 727 | + * is relatively expensive as it involves walking up the hierarchy twice for |
| 728 | + * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we |
| 729 | + * want to disable the in-flight tracking. |
| 730 | + * |
| 731 | + * We have to make sure that the counting is balanced - we don't want to leak |
| 732 | + * the in-flight counts by disabling accounting in the completion path while IOs |
| 733 | + * are in flight. This is achieved by ensuring that no IO is in flight by |
| 734 | + * freezing the queue while flipping ->enabled. As this requires a sleepable |
| 735 | + * context, ->enabled flipping is punted to this work function. |
| 736 | + */ |
| 737 | +static void blkiolatency_enable_work_fn(struct work_struct *work) |
| 738 | +{ |
| 739 | + struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency, |
| 740 | + enable_work); |
| 741 | + bool enabled; |
| 742 | + |
| 743 | + /* |
| 744 | + * There can only be one instance of this function running for @blkiolat |
| 745 | + * and it's guaranteed to be executed at least once after the latest |
| 746 | + * ->enable_cnt modification. Acting on the latest ->enable_cnt is |
| 747 | + * sufficient. |
| 748 | + * |
| 749 | + * Also, we know @blkiolat is safe to access as ->enable_work is flushed |
| 750 | + * in blkcg_iolatency_exit(). |
| 751 | + */ |
| 752 | + enabled = atomic_read(&blkiolat->enable_cnt); |
| 753 | + if (enabled != blkiolat->enabled) { |
| 754 | + blk_mq_freeze_queue(blkiolat->rqos.q); |
| 755 | + blkiolat->enabled = enabled; |
| 756 | + blk_mq_unfreeze_queue(blkiolat->rqos.q); |
| 757 | + } |
| 758 | +} |
| 759 | + |
| 685 | 760 | int blk_iolatency_init(struct request_queue *q) |
| 686 | 761 | { |
| 687 | 762 | struct blk_iolatency *blkiolat; |
| .. | .. |
| 693 | 768 | return -ENOMEM; |
| 694 | 769 | |
| 695 | 770 | rqos = &blkiolat->rqos; |
| 696 | | - rqos->id = RQ_QOS_CGROUP; |
| 771 | + rqos->id = RQ_QOS_LATENCY; |
| 697 | 772 | rqos->ops = &blkcg_iolatency_ops; |
| 698 | 773 | rqos->q = q; |
| 699 | 774 | |
| .. | .. |
| 707 | 782 | } |
| 708 | 783 | |
| 709 | 784 | timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); |
| 785 | + INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn); |
| 710 | 786 | |
| 711 | 787 | return 0; |
| 712 | 788 | } |
| 713 | 789 | |
| 714 | | -/* |
| 715 | | - * return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise |
| 716 | | - * return 0. |
| 717 | | - */ |
| 718 | | -static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) |
| 790 | +static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) |
| 719 | 791 | { |
| 720 | 792 | struct iolatency_grp *iolat = blkg_to_lat(blkg); |
| 793 | + struct blk_iolatency *blkiolat = iolat->blkiolat; |
| 721 | 794 | u64 oldval = iolat->min_lat_nsec; |
| 722 | 795 | |
| 723 | 796 | iolat->min_lat_nsec = val; |
| .. | .. |
| 725 | 798 | iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, |
| 726 | 799 | BLKIOLATENCY_MAX_WIN_SIZE); |
| 727 | 800 | |
| 728 | | - if (!oldval && val) |
| 729 | | - return 1; |
| 801 | + if (!oldval && val) { |
| 802 | + if (atomic_inc_return(&blkiolat->enable_cnt) == 1) |
| 803 | + schedule_work(&blkiolat->enable_work); |
| 804 | + } |
| 730 | 805 | if (oldval && !val) { |
| 731 | 806 | blkcg_clear_delay(blkg); |
| 732 | | - return -1; |
| 807 | + if (atomic_dec_return(&blkiolat->enable_cnt) == 0) |
| 808 | + schedule_work(&blkiolat->enable_work); |
| 733 | 809 | } |
| 734 | | - return 0; |
| 735 | 810 | } |
| 736 | 811 | |
| 737 | 812 | static void iolatency_clear_scaling(struct blkcg_gq *blkg) |
| .. | .. |
| 757 | 832 | { |
| 758 | 833 | struct blkcg *blkcg = css_to_blkcg(of_css(of)); |
| 759 | 834 | struct blkcg_gq *blkg; |
| 760 | | - struct blk_iolatency *blkiolat; |
| 761 | 835 | struct blkg_conf_ctx ctx; |
| 762 | 836 | struct iolatency_grp *iolat; |
| 763 | 837 | char *p, *tok; |
| 764 | 838 | u64 lat_val = 0; |
| 765 | 839 | u64 oldval; |
| 766 | 840 | int ret; |
| 767 | | - int enable = 0; |
| 768 | 841 | |
| 769 | 842 | ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); |
| 770 | 843 | if (ret) |
| 771 | 844 | return ret; |
| 772 | 845 | |
| 773 | 846 | iolat = blkg_to_lat(ctx.blkg); |
| 774 | | - blkiolat = iolat->blkiolat; |
| 775 | 847 | p = ctx.body; |
| 776 | 848 | |
| 777 | 849 | ret = -EINVAL; |
| .. | .. |
| 800 | 872 | blkg = ctx.blkg; |
| 801 | 873 | oldval = iolat->min_lat_nsec; |
| 802 | 874 | |
| 803 | | - enable = iolatency_set_min_lat_nsec(blkg, lat_val); |
| 804 | | - if (enable) { |
| 805 | | - if (!blk_get_queue(blkg->q)) { |
| 806 | | - ret = -ENODEV; |
| 807 | | - goto out; |
| 808 | | - } |
| 809 | | - |
| 810 | | - blkg_get(blkg); |
| 811 | | - } |
| 812 | | - |
| 813 | | - if (oldval != iolat->min_lat_nsec) { |
| 875 | + iolatency_set_min_lat_nsec(blkg, lat_val); |
| 876 | + if (oldval != iolat->min_lat_nsec) |
| 814 | 877 | iolatency_clear_scaling(blkg); |
| 815 | | - } |
| 816 | | - |
| 817 | 878 | ret = 0; |
| 818 | 879 | out: |
| 819 | 880 | blkg_conf_finish(&ctx); |
| 820 | | - if (ret == 0 && enable) { |
| 821 | | - struct iolatency_grp *tmp = blkg_to_lat(blkg); |
| 822 | | - struct blk_iolatency *blkiolat = tmp->blkiolat; |
| 823 | | - |
| 824 | | - blk_mq_freeze_queue(blkg->q); |
| 825 | | - |
| 826 | | - if (enable == 1) |
| 827 | | - atomic_inc(&blkiolat->enabled); |
| 828 | | - else if (enable == -1) |
| 829 | | - atomic_dec(&blkiolat->enabled); |
| 830 | | - else |
| 831 | | - WARN_ON_ONCE(1); |
| 832 | | - |
| 833 | | - blk_mq_unfreeze_queue(blkg->q); |
| 834 | | - |
| 835 | | - blkg_put(blkg); |
| 836 | | - blk_put_queue(blkg->q); |
| 837 | | - } |
| 838 | 881 | return ret ?: nbytes; |
| 839 | 882 | } |
| 840 | 883 | |
| .. | .. |
| 859 | 902 | return 0; |
| 860 | 903 | } |
| 861 | 904 | |
| 905 | +static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, |
| 906 | + size_t size) |
| 907 | +{ |
| 908 | + struct latency_stat stat; |
| 909 | + int cpu; |
| 910 | + |
| 911 | + latency_stat_init(iolat, &stat); |
| 912 | + preempt_disable(); |
| 913 | + for_each_online_cpu(cpu) { |
| 914 | + struct latency_stat *s; |
| 915 | + s = per_cpu_ptr(iolat->stats, cpu); |
| 916 | + latency_stat_sum(iolat, &stat, s); |
| 917 | + } |
| 918 | + preempt_enable(); |
| 919 | + |
| 920 | + if (iolat->rq_depth.max_depth == UINT_MAX) |
| 921 | + return scnprintf(buf, size, " missed=%llu total=%llu depth=max", |
| 922 | + (unsigned long long)stat.ps.missed, |
| 923 | + (unsigned long long)stat.ps.total); |
| 924 | + return scnprintf(buf, size, " missed=%llu total=%llu depth=%u", |
| 925 | + (unsigned long long)stat.ps.missed, |
| 926 | + (unsigned long long)stat.ps.total, |
| 927 | + iolat->rq_depth.max_depth); |
| 928 | +} |
| 929 | + |
| 862 | 930 | static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, |
| 863 | 931 | size_t size) |
| 864 | 932 | { |
| 865 | 933 | struct iolatency_grp *iolat = pd_to_lat(pd); |
| 866 | | - unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); |
| 867 | | - unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); |
| 934 | + unsigned long long avg_lat; |
| 935 | + unsigned long long cur_win; |
| 868 | 936 | |
| 937 | + if (!blkcg_debug_stats) |
| 938 | + return 0; |
| 939 | + |
| 940 | + if (iolat->ssd) |
| 941 | + return iolatency_ssd_stat(iolat, buf, size); |
| 942 | + |
| 943 | + avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); |
| 944 | + cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); |
| 869 | 945 | if (iolat->rq_depth.max_depth == UINT_MAX) |
| 870 | 946 | return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", |
| 871 | 947 | avg_lat, cur_win); |
| .. | .. |
| 875 | 951 | } |
| 876 | 952 | |
| 877 | 953 | |
| 878 | | -static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node) |
| 954 | +static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, |
| 955 | + struct request_queue *q, |
| 956 | + struct blkcg *blkcg) |
| 879 | 957 | { |
| 880 | 958 | struct iolatency_grp *iolat; |
| 881 | 959 | |
| 882 | | - iolat = kzalloc_node(sizeof(*iolat), gfp, node); |
| 960 | + iolat = kzalloc_node(sizeof(*iolat), gfp, q->node); |
| 883 | 961 | if (!iolat) |
| 884 | 962 | return NULL; |
| 885 | | - iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat), |
| 886 | | - __alignof__(struct blk_rq_stat), gfp); |
| 963 | + iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat), |
| 964 | + __alignof__(struct latency_stat), gfp); |
| 887 | 965 | if (!iolat->stats) { |
| 888 | 966 | kfree(iolat); |
| 889 | 967 | return NULL; |
| .. | .. |
| 900 | 978 | u64 now = ktime_to_ns(ktime_get()); |
| 901 | 979 | int cpu; |
| 902 | 980 | |
| 981 | + if (blk_queue_nonrot(blkg->q)) |
| 982 | + iolat->ssd = true; |
| 983 | + else |
| 984 | + iolat->ssd = false; |
| 985 | + |
| 903 | 986 | for_each_possible_cpu(cpu) { |
| 904 | | - struct blk_rq_stat *stat; |
| 987 | + struct latency_stat *stat; |
| 905 | 988 | stat = per_cpu_ptr(iolat->stats, cpu); |
| 906 | | - blk_rq_stat_init(stat); |
| 989 | + latency_stat_init(iolat, stat); |
| 907 | 990 | } |
| 908 | 991 | |
| 992 | + latency_stat_init(iolat, &iolat->cur_stat); |
| 909 | 993 | rq_wait_init(&iolat->rq_wait); |
| 910 | 994 | spin_lock_init(&iolat->child_lat.lock); |
| 911 | | - iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q); |
| 995 | + iolat->rq_depth.queue_depth = blkg->q->nr_requests; |
| 912 | 996 | iolat->rq_depth.max_depth = UINT_MAX; |
| 913 | 997 | iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth; |
| 914 | 998 | iolat->blkiolat = blkiolat; |
| .. | .. |
| 934 | 1018 | { |
| 935 | 1019 | struct iolatency_grp *iolat = pd_to_lat(pd); |
| 936 | 1020 | struct blkcg_gq *blkg = lat_to_blkg(iolat); |
| 937 | | - struct blk_iolatency *blkiolat = iolat->blkiolat; |
| 938 | | - int ret; |
| 939 | 1021 | |
| 940 | | - ret = iolatency_set_min_lat_nsec(blkg, 0); |
| 941 | | - if (ret == 1) |
| 942 | | - atomic_inc(&blkiolat->enabled); |
| 943 | | - if (ret == -1) |
| 944 | | - atomic_dec(&blkiolat->enabled); |
| 1022 | + iolatency_set_min_lat_nsec(blkg, 0); |
| 945 | 1023 | iolatency_clear_scaling(blkg); |
| 946 | 1024 | } |
| 947 | 1025 | |
| .. | .. |
| 978 | 1056 | |
| 979 | 1057 | static void __exit iolatency_exit(void) |
| 980 | 1058 | { |
| 981 | | - return blkcg_policy_unregister(&blkcg_policy_iolatency); |
| 1059 | + blkcg_policy_unregister(&blkcg_policy_iolatency); |
| 982 | 1060 | } |
| 983 | 1061 | |
| 984 | 1062 | module_init(iolatency_init); |
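
The ssd-mode window check introduced above (latency_sum_ok()) treats a sampling window as healthy when fewer than 10% of the sampled IOs missed the configured latency target, with the threshold clamped to at least one IO so that a single miss in a tiny window still counts against the group. As a quick sanity check of that arithmetic outside the kernel, here is a minimal userspace C sketch: the struct and helper names mirror the patch for readability, but nothing below is kernel API, and div64_u64()/max() are replaced with plain C.

```c
/*
 * Userspace illustration of the ssd-mode "good window" check from the
 * patch above: a window is ok when missed < max(total / 10, 1).
 * Names mirror the kernel code but this is not kernel API.
 */
#include <stdint.h>
#include <stdio.h>

struct percentile_stats {
	uint64_t total;		/* IOs sampled in this window */
	uint64_t missed;	/* IOs that exceeded the latency target */
};

static int latency_sum_ok(const struct percentile_stats *ps)
{
	uint64_t thresh = ps->total / 10;	/* 10% of the window */

	if (thresh < 1)
		thresh = 1;			/* clamp for tiny windows */
	return ps->missed < thresh;
}

int main(void)
{
	struct percentile_stats good = { .total = 1000, .missed = 42 };
	struct percentile_stats bad  = { .total = 1000, .missed = 150 };
	struct percentile_stats tiny = { .total = 5,    .missed = 1 };

	printf("good window ok: %d\n", latency_sum_ok(&good));	/* 1 */
	printf("bad window ok:  %d\n", latency_sum_ok(&bad));	/* 0 */
	printf("tiny window ok: %d\n", latency_sum_ok(&tiny));	/* 0: thresh clamps to 1 */
	return 0;
}
```

Rotational devices keep the pre-patch behaviour instead: the window's mean request time is compared against min_lat_nsec and fed into the calc_load()-based running average, while nonrotational (ssd) devices use the missed/total ratio shown here.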
|---|