+// SPDX-License-Identifier: GPL-2.0
 /*
  * Block rq-qos base io controller
  *
[...]
 struct blk_iolatency {
         struct rq_qos rqos;
         struct timer_list timer;
-        atomic_t enabled;
+
+        /*
+         * ->enabled is the master enable switch gating the throttling logic and
+         * inflight tracking. The number of cgroups which have iolat enabled is
+         * tracked in ->enable_cnt, and ->enabled is flipped on/off accordingly
+         * from ->enable_work with the request_queue frozen. For details, see
+         * blkiolatency_enable_work_fn().
+         */
+        bool enabled;
+        atomic_t enable_cnt;
+        struct work_struct enable_work;
 };

 static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
 {
         return container_of(rqos, struct blk_iolatency, rqos);
-}
-
-static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat)
-{
-        return atomic_read(&blkiolat->enabled) > 0;
 }

 struct child_latency_info {
[...]
         atomic_t scale_cookie;
 };

+struct percentile_stats {
+        u64 total;
+        u64 missed;
+};
+
+struct latency_stat {
+        union {
+                struct percentile_stats ps;
+                struct blk_rq_stat rqs;
+        };
+};
+
 struct iolatency_grp {
         struct blkg_policy_data pd;
-        struct blk_rq_stat __percpu *stats;
+        struct latency_stat __percpu *stats;
+        struct latency_stat cur_stat;
         struct blk_iolatency *blkiolat;
         struct rq_depth rq_depth;
         struct rq_wait rq_wait;
[...]
         /* Our current number of IO's for the last summation. */
         u64 nr_samples;

+        bool ssd;
         struct child_latency_info child_lat;
 };

[...]
         return pd_to_blkg(&iolat->pd);
 }

-static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
-        wait_queue_entry_t *wait,
-        bool first_block)
+static inline void latency_stat_init(struct iolatency_grp *iolat,
+        struct latency_stat *stat)
 {
-        struct rq_wait *rqw = &iolat->rq_wait;
+        if (iolat->ssd) {
+                stat->ps.total = 0;
+                stat->ps.missed = 0;
+        } else
+                blk_rq_stat_init(&stat->rqs);
+}

-        if (first_block && waitqueue_active(&rqw->wait) &&
-                rqw->wait.head.next != &wait->entry)
-                return false;
+static inline void latency_stat_sum(struct iolatency_grp *iolat,
+        struct latency_stat *sum,
+        struct latency_stat *stat)
+{
+        if (iolat->ssd) {
+                sum->ps.total += stat->ps.total;
+                sum->ps.missed += stat->ps.missed;
+        } else
+                blk_rq_stat_sum(&sum->rqs, &stat->rqs);
+}
+
+static inline void latency_stat_record_time(struct iolatency_grp *iolat,
+        u64 req_time)
+{
+        struct latency_stat *stat = get_cpu_ptr(iolat->stats);
+        if (iolat->ssd) {
+                if (req_time >= iolat->min_lat_nsec)
+                        stat->ps.missed++;
+                stat->ps.total++;
+        } else
+                blk_rq_stat_add(&stat->rqs, req_time);
+        put_cpu_ptr(stat);
+}
+
+static inline bool latency_sum_ok(struct iolatency_grp *iolat,
+        struct latency_stat *stat)
+{
+        if (iolat->ssd) {
+                u64 thresh = div64_u64(stat->ps.total, 10);
+                thresh = max(thresh, 1ULL);
+                return stat->ps.missed < thresh;
+        }
+        return stat->rqs.mean <= iolat->min_lat_nsec;
+}
+
+static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
+        struct latency_stat *stat)
+{
+        if (iolat->ssd)
+                return stat->ps.total;
+        return stat->rqs.nr_samples;
+}
+
+static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
+        struct latency_stat *stat)
+{
+        int exp_idx;
+
+        if (iolat->ssd)
+                return;
+
+        /*
+         * calc_load() takes in a number stored in fixed point representation.
+         * Because we are using this for IO time in ns, the values stored
+         * are significantly larger than the FIXED_1 denominator (2048).
+         * Therefore, rounding errors in the calculation are negligible and
+         * can be ignored.
+         */
+        exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
+                div64_u64(iolat->cur_win_nsec,
+                        BLKIOLATENCY_EXP_BUCKET_SIZE));
+        iolat->lat_avg = calc_load(iolat->lat_avg,
+                iolatency_exp_factors[exp_idx],
+                stat->rqs.mean);
+}
+
+static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
+{
+        atomic_dec(&rqw->inflight);
+        wake_up(&rqw->wait);
+}
+
+static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
+{
+        struct iolatency_grp *iolat = private_data;
         return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
 }

 static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
         struct iolatency_grp *iolat,
-        spinlock_t *lock, bool issue_as_root,
+        bool issue_as_root,
         bool use_memdelay)
-        __releases(lock)
-        __acquires(lock)
 {
         struct rq_wait *rqw = &iolat->rq_wait;
         unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
-        DEFINE_WAIT(wait);
-        bool first_block = true;

         if (use_delay)
                 blkcg_schedule_throttle(rqos->q, use_memdelay);
[...]
                 return;
         }

-        if (iolatency_may_queue(iolat, &wait, first_block))
-                return;
-
-        do {
-                prepare_to_wait_exclusive(&rqw->wait, &wait,
-                        TASK_UNINTERRUPTIBLE);
-
-                if (iolatency_may_queue(iolat, &wait, first_block))
-                        break;
-                first_block = false;
-
-                if (lock) {
-                        spin_unlock_irq(lock);
-                        io_schedule();
-                        spin_lock_irq(lock);
-                } else {
-                        io_schedule();
-                }
-        } while (1);
-
-        finish_wait(&rqw->wait, &wait);
+        rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb);
 }

 #define SCALE_DOWN_FACTOR 2
[...]
         struct child_latency_info *lat_info,
         bool up)
 {
-        unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
+        unsigned long qd = blkiolat->rqos.q->nr_requests;
         unsigned long scale = scale_amount(qd, up);
         unsigned long old = atomic_read(&lat_info->scale_cookie);
         unsigned long max_scale = qd << 1;
[...]
  */
 static void scale_change(struct iolatency_grp *iolat, bool up)
 {
-        unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
+        unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
         unsigned long scale = scale_amount(qd, up);
         unsigned long old = iolat->rq_depth.max_depth;
-        bool changed = false;

         if (old > qd)
                 old = qd;
[...]
                         return;

                 if (old < qd) {
-                        changed = true;
                         old += scale;
                         old = min(old, qd);
                         iolat->rq_depth.max_depth = old;
                         wake_up_all(&iolat->rq_wait.wait);
                 }
-        } else if (old > 1) {
+        } else {
                 old >>= 1;
-                changed = true;
                 iolat->rq_depth.max_depth = max(old, 1UL);
         }
 }
[...]
                  * scale down event.
                  */
                 samples_thresh = lat_info->nr_samples * 5;
-                samples_thresh = div64_u64(samples_thresh, 100);
+                samples_thresh = max(1ULL, div64_u64(samples_thresh, 100));
                 if (iolat->nr_samples <= samples_thresh)
                         return;
         }
[...]
         scale_change(iolat, direction > 0);
 }

-static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
-        spinlock_t *lock)
+static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
 {
         struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
-        struct blkcg *blkcg;
-        struct blkcg_gq *blkg;
-        struct request_queue *q = rqos->q;
+        struct blkcg_gq *blkg = bio->bi_blkg;
         bool issue_as_root = bio_issue_as_root_blkg(bio);

-        if (!blk_iolatency_enabled(blkiolat))
+        if (!blkiolat->enabled)
                 return;

-        rcu_read_lock();
-        blkcg = bio_blkcg(bio);
-        bio_associate_blkcg(bio, &blkcg->css);
-        blkg = blkg_lookup(blkcg, q);
-        if (unlikely(!blkg)) {
-                if (!lock)
-                        spin_lock_irq(q->queue_lock);
-                blkg = blkg_lookup_create(blkcg, q);
-                if (IS_ERR(blkg))
-                        blkg = NULL;
-                if (!lock)
-                        spin_unlock_irq(q->queue_lock);
-        }
-        if (!blkg)
-                goto out;
-
-        bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-        bio_associate_blkg(bio, blkg);
-out:
-        rcu_read_unlock();
         while (blkg && blkg->parent) {
                 struct iolatency_grp *iolat = blkg_to_lat(blkg);
                 if (!iolat) {
[...]
                 }

                 check_scale_change(iolat);
-                __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
+                __blkcg_iolatency_throttle(rqos, iolat, issue_as_root,
                         (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
                 blkg = blkg->parent;
         }
[...]
         struct bio_issue *issue, u64 now,
         bool issue_as_root)
 {
-        struct blk_rq_stat *rq_stat;
         u64 start = bio_issue_time(issue);
         u64 req_time;

[...]
                 return;
         }

-        rq_stat = get_cpu_ptr(iolat->stats);
-        blk_rq_stat_add(rq_stat, req_time);
-        put_cpu_ptr(rq_stat);
+        latency_stat_record_time(iolat, req_time);
 }

 #define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
[...]
         struct blkcg_gq *blkg = lat_to_blkg(iolat);
         struct iolatency_grp *parent;
         struct child_latency_info *lat_info;
-        struct blk_rq_stat stat;
+        struct latency_stat stat;
         unsigned long flags;
-        int cpu, exp_idx;
+        int cpu;

-        blk_rq_stat_init(&stat);
+        latency_stat_init(iolat, &stat);
         preempt_disable();
         for_each_online_cpu(cpu) {
-                struct blk_rq_stat *s;
+                struct latency_stat *s;
                 s = per_cpu_ptr(iolat->stats, cpu);
-                blk_rq_stat_sum(&stat, s);
-                blk_rq_stat_init(s);
+                latency_stat_sum(iolat, &stat, s);
+                latency_stat_init(iolat, s);
         }
         preempt_enable();

[...]

         lat_info = &parent->child_lat;

-        /*
-         * calc_load() takes in a number stored in fixed point representation.
-         * Because we are using this for IO time in ns, the values stored
-         * are significantly larger than the FIXED_1 denominator (2048).
-         * Therefore, rounding errors in the calculation are negligible and
-         * can be ignored.
-         */
-        exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
-                div64_u64(iolat->cur_win_nsec,
-                        BLKIOLATENCY_EXP_BUCKET_SIZE));
-        iolat->lat_avg = calc_load(iolat->lat_avg,
-                iolatency_exp_factors[exp_idx],
-                stat.mean);
+        iolat_update_total_lat_avg(iolat, &stat);

         /* Everything is ok and we don't need to adjust the scale. */
-        if (stat.mean <= iolat->min_lat_nsec &&
+        if (latency_sum_ok(iolat, &stat) &&
             atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
                 return;

         /* Somebody beat us to the punch, just bail. */
         spin_lock_irqsave(&lat_info->lock, flags);
+
+        latency_stat_sum(iolat, &iolat->cur_stat, &stat);
         lat_info->nr_samples -= iolat->nr_samples;
-        lat_info->nr_samples += stat.nr_samples;
-        iolat->nr_samples = stat.nr_samples;
+        lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat);
+        iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat);

         if ((lat_info->last_scale_event >= now ||
-                now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
-                lat_info->scale_lat <= iolat->min_lat_nsec)
+                now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME))
                 goto out;

-        if (stat.mean <= iolat->min_lat_nsec &&
-            stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
+        if (latency_sum_ok(iolat, &iolat->cur_stat) &&
+            latency_sum_ok(iolat, &stat)) {
+                if (latency_stat_samples(iolat, &iolat->cur_stat) <
+                        BLKIOLATENCY_MIN_GOOD_SAMPLES)
+                        goto out;
                 if (lat_info->scale_grp == iolat) {
                         lat_info->last_scale_event = now;
                         scale_cookie_change(iolat->blkiolat, lat_info, true);
                 }
-        } else if (stat.mean > iolat->min_lat_nsec) {
+        } else if (lat_info->scale_lat == 0 ||
+                   lat_info->scale_lat >= iolat->min_lat_nsec) {
                 lat_info->last_scale_event = now;
                 if (!lat_info->scale_grp ||
                     lat_info->scale_lat > iolat->min_lat_nsec) {
[...]
                 }
                 scale_cookie_change(iolat->blkiolat, lat_info, false);
         }
+        latency_stat_init(iolat, &iolat->cur_stat);
 out:
         spin_unlock_irqrestore(&lat_info->lock, flags);
 }
[...]
         struct rq_wait *rqw;
         struct iolatency_grp *iolat;
         u64 window_start;
-        u64 now = ktime_to_ns(ktime_get());
+        u64 now;
         bool issue_as_root = bio_issue_as_root_blkg(bio);
-        bool enabled = false;
         int inflight = 0;

         blkg = bio->bi_blkg;
-        if (!blkg)
+        if (!blkg || !bio_flagged(bio, BIO_TRACKED))
                 return;

         iolat = blkg_to_lat(bio->bi_blkg);
         if (!iolat)
                 return;

-        enabled = blk_iolatency_enabled(iolat->blkiolat);
-        if (!enabled)
+        if (!iolat->blkiolat->enabled)
                 return;

+        now = ktime_to_ns(ktime_get());
         while (blkg && blkg->parent) {
                 iolat = blkg_to_lat(blkg);
                 if (!iolat) {
[...]
         struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);

         del_timer_sync(&blkiolat->timer);
+        flush_work(&blkiolat->enable_work);
         blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
         kfree(blkiolat);
 }
[...]
                  * We could be exiting, don't access the pd unless we have a
                  * ref on the blkg.
                  */
-                if (!blkg_try_get(blkg))
+                if (!blkg_tryget(blkg))
                         continue;

                 iolat = blkg_to_lat(blkg);
[...]
         rcu_read_unlock();
 }

+/**
+ * blkiolatency_enable_work_fn - Enable or disable iolatency on the device
+ * @work: enable_work of the blk_iolatency of interest
+ *
+ * iolatency needs to keep track of the number of in-flight IOs per cgroup. This
+ * is relatively expensive as it involves walking up the hierarchy twice for
+ * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we
+ * want to disable the in-flight tracking.
+ *
+ * We have to make sure that the counting is balanced - we don't want to leak
+ * the in-flight counts by disabling accounting in the completion path while IOs
+ * are in flight. This is achieved by ensuring that no IO is in flight by
+ * freezing the queue while flipping ->enabled. As this requires a sleepable
+ * context, ->enabled flipping is punted to this work function.
+ */
+static void blkiolatency_enable_work_fn(struct work_struct *work)
+{
+        struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency,
+                enable_work);
+        bool enabled;
+
+        /*
+         * There can only be one instance of this function running for @blkiolat
+         * and it's guaranteed to be executed at least once after the latest
+         * ->enable_cnt modification. Acting on the latest ->enable_cnt is
+         * sufficient.
+         *
+         * Also, we know @blkiolat is safe to access as ->enable_work is flushed
+         * in blkcg_iolatency_exit().
+         */
+        enabled = atomic_read(&blkiolat->enable_cnt);
+        if (enabled != blkiolat->enabled) {
+                blk_mq_freeze_queue(blkiolat->rqos.q);
+                blkiolat->enabled = enabled;
+                blk_mq_unfreeze_queue(blkiolat->rqos.q);
+        }
+}
+
 int blk_iolatency_init(struct request_queue *q)
 {
         struct blk_iolatency *blkiolat;
[...]
                 return -ENOMEM;

         rqos = &blkiolat->rqos;
-        rqos->id = RQ_QOS_CGROUP;
+        rqos->id = RQ_QOS_LATENCY;
         rqos->ops = &blkcg_iolatency_ops;
         rqos->q = q;

[...]
         }

         timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
+        INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);

         return 0;
 }

-/*
- * return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise
- * return 0.
- */
-static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
+static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
 {
         struct iolatency_grp *iolat = blkg_to_lat(blkg);
+        struct blk_iolatency *blkiolat = iolat->blkiolat;
         u64 oldval = iolat->min_lat_nsec;

         iolat->min_lat_nsec = val;
[...]
         iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
                 BLKIOLATENCY_MAX_WIN_SIZE);

-        if (!oldval && val)
-                return 1;
+        if (!oldval && val) {
+                if (atomic_inc_return(&blkiolat->enable_cnt) == 1)
+                        schedule_work(&blkiolat->enable_work);
+        }
         if (oldval && !val) {
                 blkcg_clear_delay(blkg);
-                return -1;
+                if (atomic_dec_return(&blkiolat->enable_cnt) == 0)
+                        schedule_work(&blkiolat->enable_work);
         }
-        return 0;
 }

 static void iolatency_clear_scaling(struct blkcg_gq *blkg)
[...]
 {
         struct blkcg *blkcg = css_to_blkcg(of_css(of));
         struct blkcg_gq *blkg;
-        struct blk_iolatency *blkiolat;
         struct blkg_conf_ctx ctx;
         struct iolatency_grp *iolat;
         char *p, *tok;
         u64 lat_val = 0;
         u64 oldval;
         int ret;
-        int enable = 0;

         ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
         if (ret)
                 return ret;

         iolat = blkg_to_lat(ctx.blkg);
-        blkiolat = iolat->blkiolat;
         p = ctx.body;

         ret = -EINVAL;
[...]
         blkg = ctx.blkg;
         oldval = iolat->min_lat_nsec;

-        enable = iolatency_set_min_lat_nsec(blkg, lat_val);
-        if (enable) {
-                if (!blk_get_queue(blkg->q)) {
-                        ret = -ENODEV;
-                        goto out;
-                }
-
-                blkg_get(blkg);
-        }
-
-        if (oldval != iolat->min_lat_nsec) {
+        iolatency_set_min_lat_nsec(blkg, lat_val);
+        if (oldval != iolat->min_lat_nsec)
                 iolatency_clear_scaling(blkg);
-        }
-
         ret = 0;
 out:
         blkg_conf_finish(&ctx);
-        if (ret == 0 && enable) {
-                struct iolatency_grp *tmp = blkg_to_lat(blkg);
-                struct blk_iolatency *blkiolat = tmp->blkiolat;
-
-                blk_mq_freeze_queue(blkg->q);
-
-                if (enable == 1)
-                        atomic_inc(&blkiolat->enabled);
-                else if (enable == -1)
-                        atomic_dec(&blkiolat->enabled);
-                else
-                        WARN_ON_ONCE(1);
-
-                blk_mq_unfreeze_queue(blkg->q);
-
-                blkg_put(blkg);
-                blk_put_queue(blkg->q);
-        }
         return ret ?: nbytes;
 }

[...]
         return 0;
 }

+static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
+        size_t size)
+{
+        struct latency_stat stat;
+        int cpu;
+
+        latency_stat_init(iolat, &stat);
+        preempt_disable();
+        for_each_online_cpu(cpu) {
+                struct latency_stat *s;
+                s = per_cpu_ptr(iolat->stats, cpu);
+                latency_stat_sum(iolat, &stat, s);
+        }
+        preempt_enable();
+
+        if (iolat->rq_depth.max_depth == UINT_MAX)
+                return scnprintf(buf, size, " missed=%llu total=%llu depth=max",
+                        (unsigned long long)stat.ps.missed,
+                        (unsigned long long)stat.ps.total);
+        return scnprintf(buf, size, " missed=%llu total=%llu depth=%u",
+                (unsigned long long)stat.ps.missed,
+                (unsigned long long)stat.ps.total,
+                iolat->rq_depth.max_depth);
+}
+
 static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
         size_t size)
 {
         struct iolatency_grp *iolat = pd_to_lat(pd);
-        unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
-        unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
+        unsigned long long avg_lat;
+        unsigned long long cur_win;

+        if (!blkcg_debug_stats)
+                return 0;
+
+        if (iolat->ssd)
+                return iolatency_ssd_stat(iolat, buf, size);
+
+        avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
+        cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
         if (iolat->rq_depth.max_depth == UINT_MAX)
                 return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
                         avg_lat, cur_win);
[...]
 }


-static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
+static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
+        struct request_queue *q,
+        struct blkcg *blkcg)
 {
         struct iolatency_grp *iolat;

-        iolat = kzalloc_node(sizeof(*iolat), gfp, node);
+        iolat = kzalloc_node(sizeof(*iolat), gfp, q->node);
         if (!iolat)
                 return NULL;
-        iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
-                __alignof__(struct blk_rq_stat), gfp);
+        iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
+                __alignof__(struct latency_stat), gfp);
         if (!iolat->stats) {
                 kfree(iolat);
                 return NULL;
[...]
         u64 now = ktime_to_ns(ktime_get());
         int cpu;

+        if (blk_queue_nonrot(blkg->q))
+                iolat->ssd = true;
+        else
+                iolat->ssd = false;
+
         for_each_possible_cpu(cpu) {
-                struct blk_rq_stat *stat;
+                struct latency_stat *stat;
                 stat = per_cpu_ptr(iolat->stats, cpu);
-                blk_rq_stat_init(stat);
+                latency_stat_init(iolat, stat);
         }

+        latency_stat_init(iolat, &iolat->cur_stat);
         rq_wait_init(&iolat->rq_wait);
         spin_lock_init(&iolat->child_lat.lock);
-        iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
+        iolat->rq_depth.queue_depth = blkg->q->nr_requests;
         iolat->rq_depth.max_depth = UINT_MAX;
         iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
         iolat->blkiolat = blkiolat;
[...]
 {
         struct iolatency_grp *iolat = pd_to_lat(pd);
         struct blkcg_gq *blkg = lat_to_blkg(iolat);
-        struct blk_iolatency *blkiolat = iolat->blkiolat;
-        int ret;

-        ret = iolatency_set_min_lat_nsec(blkg, 0);
-        if (ret == 1)
-                atomic_inc(&blkiolat->enabled);
-        if (ret == -1)
-                atomic_dec(&blkiolat->enabled);
+        iolatency_set_min_lat_nsec(blkg, 0);
         iolatency_clear_scaling(blkg);
 }
[...]

 static void __exit iolatency_exit(void)
 {
-        return blkcg_policy_unregister(&blkcg_policy_iolatency);
+        blkcg_policy_unregister(&blkcg_policy_iolatency);
 }

 module_init(iolatency_init);