From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M
---
kernel/block/blk-throttle.c | 393 +++++++++++++++++++++++++++++---------------------------
1 files changed, 203 insertions(+), 190 deletions(-)
diff --git a/kernel/block/blk-throttle.c b/kernel/block/blk-throttle.c
index 853b177..c526fdd 100644
--- a/kernel/block/blk-throttle.c
+++ b/kernel/block/blk-throttle.c
@@ -12,12 +12,13 @@
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"
+#include "blk-cgroup-rwstat.h"
/* Max dispatch from a group in 1 round */
-static int throtl_grp_quantum = 8;
+#define THROTL_GRP_QUANTUM 8
/* Total max dispatch from all groups in one round */
-static int throtl_quantum = 32;
+#define THROTL_QUANTUM 32
/* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE_HD (HZ / 10)
@@ -84,8 +85,7 @@
* RB tree of active children throtl_grp's, which are sorted by
* their ->disptime.
*/
- struct rb_root pending_tree; /* RB tree of active tgs */
- struct rb_node *first_pending; /* first node in the tree */
+ struct rb_root_cached pending_tree; /* RB tree of active tgs */
unsigned int nr_pending; /* # queued in the tree */
unsigned long first_pending_disptime; /* disptime of the first tg */
struct timer_list pending_timer; /* fires on first_pending_disptime */
@@ -150,7 +150,7 @@
/* user configured IOPS limits */
unsigned int iops_conf[2][LIMIT_CNT];
- /* Number of bytes disptached in current slice */
+ /* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
@@ -177,6 +177,12 @@
unsigned int bio_cnt; /* total bios */
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
+
+ atomic_t io_split_cnt[2];
+ atomic_t last_io_split_cnt[2];
+
+ struct blkg_rwstat stat_bytes;
+ struct blkg_rwstat stat_ios;
};
/* We measure latency for request size from <= 4k to >= 1M */
@@ -420,12 +426,13 @@
*/
static struct bio *throtl_peek_queued(struct list_head *queued)
{
- struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+ struct throtl_qnode *qn;
struct bio *bio;
if (list_empty(queued))
return NULL;
+ qn = list_first_entry(queued, struct throtl_qnode, node);
bio = bio_list_peek(&qn->bios);
WARN_ON_ONCE(!bio);
return bio;
@@ -448,12 +455,13 @@
static struct bio *throtl_pop_queued(struct list_head *queued,
struct throtl_grp **tg_to_put)
{
- struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+ struct throtl_qnode *qn;
struct bio *bio;
if (list_empty(queued))
return NULL;
+ qn = list_first_entry(queued, struct throtl_qnode, node);
bio = bio_list_pop(&qn->bios);
WARN_ON_ONCE(!bio);
@@ -475,18 +483,26 @@
{
INIT_LIST_HEAD(&sq->queued[0]);
INIT_LIST_HEAD(&sq->queued[1]);
- sq->pending_tree = RB_ROOT;
+ sq->pending_tree = RB_ROOT_CACHED;
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
}
-static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
+ struct request_queue *q,
+ struct blkcg *blkcg)
{
struct throtl_grp *tg;
int rw;
- tg = kzalloc_node(sizeof(*tg), gfp, node);
+ tg = kzalloc_node(sizeof(*tg), gfp, q->node);
if (!tg)
return NULL;
+
+ if (blkg_rwstat_init(&tg->stat_bytes, gfp))
+ goto err_free_tg;
+
+ if (blkg_rwstat_init(&tg->stat_ios, gfp))
+ goto err_exit_stat_bytes;
throtl_service_queue_init(&tg->service_queue);
@@ -512,6 +528,12 @@
tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
return &tg->pd;
+
+err_exit_stat_bytes:
+ blkg_rwstat_exit(&tg->stat_bytes);
+err_free_tg:
+ kfree(tg);
+ return NULL;
}
static void throtl_pd_init(struct blkg_policy_data *pd)
@@ -610,37 +632,28 @@
struct throtl_grp *tg = pd_to_tg(pd);
del_timer_sync(&tg->service_queue.pending_timer);
+ blkg_rwstat_exit(&tg->stat_bytes);
+ blkg_rwstat_exit(&tg->stat_ios);
kfree(tg);
}
static struct throtl_grp *
throtl_rb_first(struct throtl_service_queue *parent_sq)
{
- /* Service tree is empty */
- if (!parent_sq->nr_pending)
+ struct rb_node *n;
+
+ n = rb_first_cached(&parent_sq->pending_tree);
+ WARN_ON_ONCE(!n);
+ if (!n)
return NULL;
-
- if (!parent_sq->first_pending)
- parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
-
- if (parent_sq->first_pending)
- return rb_entry_tg(parent_sq->first_pending);
-
- return NULL;
-}
-
-static void rb_erase_init(struct rb_node *n, struct rb_root *root)
-{
- rb_erase(n, root);
- RB_CLEAR_NODE(n);
+ return rb_entry_tg(n);
}
static void throtl_rb_erase(struct rb_node *n,
struct throtl_service_queue *parent_sq)
{
- if (parent_sq->first_pending == n)
- parent_sq->first_pending = NULL;
- rb_erase_init(n, &parent_sq->pending_tree);
+ rb_erase_cached(n, &parent_sq->pending_tree);
+ RB_CLEAR_NODE(n);
--parent_sq->nr_pending;
}
@@ -658,11 +671,11 @@
static void tg_service_queue_add(struct throtl_grp *tg)
{
struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
- struct rb_node **node = &parent_sq->pending_tree.rb_node;
+ struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node;
struct rb_node *parent = NULL;
struct throtl_grp *__tg;
unsigned long key = tg->disptime;
- int left = 1;
+ bool leftmost = true;
while (*node != NULL) {
parent = *node;
@@ -672,40 +685,30 @@
node = &parent->rb_left;
else {
node = &parent->rb_right;
- left = 0;
+ leftmost = false;
}
}
- if (left)
- parent_sq->first_pending = &tg->rb_node;
-
rb_link_node(&tg->rb_node, parent, node);
- rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
-}
-
-static void __throtl_enqueue_tg(struct throtl_grp *tg)
-{
- tg_service_queue_add(tg);
- tg->flags |= THROTL_TG_PENDING;
- tg->service_queue.parent_sq->nr_pending++;
+ rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree,
+ leftmost);
}
static void throtl_enqueue_tg(struct throtl_grp *tg)
{
- if (!(tg->flags & THROTL_TG_PENDING))
- __throtl_enqueue_tg(tg);
-}
-
-static void __throtl_dequeue_tg(struct throtl_grp *tg)
-{
- throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
- tg->flags &= ~THROTL_TG_PENDING;
+ if (!(tg->flags & THROTL_TG_PENDING)) {
+ tg_service_queue_add(tg);
+ tg->flags |= THROTL_TG_PENDING;
+ tg->service_queue.parent_sq->nr_pending++;
+ }
}
static void throtl_dequeue_tg(struct throtl_grp *tg)
{
- if (tg->flags & THROTL_TG_PENDING)
- __throtl_dequeue_tg(tg);
+ if (tg->flags & THROTL_TG_PENDING) {
+ throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
+ tg->flags &= ~THROTL_TG_PENDING;
+ }
}
/* Call with queue lock held */
@@ -771,6 +774,8 @@
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
+ atomic_set(&tg->io_split_cnt[rw], 0);
+
/*
* Previous slice has expired. We must have trimmed it after last
* bio dispatch. That means since start of last slice, we never used
@@ -793,6 +798,9 @@
tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
+
+ atomic_set(&tg->io_split_cnt[rw], 0);
+
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -808,7 +816,7 @@
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
- tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
+ throtl_set_slice_end(tg, rw, jiffy_end);
throtl_log(&tg->service_queue,
"[%c] extend slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -843,7 +851,7 @@
/*
* A bio has been dispatched. Also adjust slice_end. It might happen
* that initially cgroup limit was very low resulting in high
- * slice_end, but later limit was bumped up and bio was dispached
+ * slice_end, but later limit was bumped up and bio was dispatched
* sooner, then we need to reduce slice_end. A high bogus slice_end
* is bad because it does not allow new slice to start.
*/
@@ -885,12 +893,18 @@
}
static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
- unsigned long *wait)
+ u32 iops_limit, unsigned long *wait)
{
bool rw = bio_data_dir(bio);
unsigned int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;
+
+ if (iops_limit == UINT_MAX) {
+ if (wait)
+ *wait = 0;
+ return true;
+ }
jiffy_elapsed = jiffies - tg->slice_start[rw];
@@ -904,7 +918,7 @@
* have been trimmed.
*/
- tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
+ tmp = (u64)iops_limit * jiffy_elapsed_rnd;
do_div(tmp, HZ);
if (tmp > UINT_MAX)
@@ -927,12 +941,18 @@
}
static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
- unsigned long *wait)
+ u64 bps_limit, unsigned long *wait)
{
bool rw = bio_data_dir(bio);
- u64 bytes_allowed, extra_bytes, tmp;
+ u64 bytes_allowed, extra_bytes;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
+
+ if (bps_limit == U64_MAX) {
+ if (wait)
+ *wait = 0;
+ return true;
+ }
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
@@ -941,10 +961,8 @@
jiffy_elapsed_rnd = tg->td->throtl_slice;
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
-
- tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
- do_div(tmp, HZ);
- bytes_allowed = tmp;
+ bytes_allowed = mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed_rnd,
+ (u64)HZ);
if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
if (wait)
@@ -954,7 +972,7 @@
/* Calc approx time to dispatch */
extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
- jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
+ jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);
if (!jiffy_wait)
jiffy_wait = 1;
@@ -978,6 +996,8 @@
{
bool rw = bio_data_dir(bio);
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
+ u64 bps_limit = tg_bps_limit(tg, rw);
+ u32 iops_limit = tg_iops_limit(tg, rw);
/*
* Currently whole state machine of group depends on first bio
@@ -989,8 +1009,7 @@
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
- if (tg_bps_limit(tg, rw) == U64_MAX &&
- tg_iops_limit(tg, rw) == UINT_MAX) {
+ if (bps_limit == U64_MAX && iops_limit == UINT_MAX) {
if (wait)
*wait = 0;
return true;
@@ -1012,8 +1031,11 @@
jiffies + tg->td->throtl_slice);
}
- if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
- tg_with_in_iops_limit(tg, bio, &iops_wait)) {
+ if (iops_limit != UINT_MAX)
+ tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0);
+
+ if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
+ tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
if (wait)
*wait = 0;
return true;
@@ -1073,7 +1095,7 @@
* If @tg doesn't currently have any bios queued in the same
* direction, queueing @bio can change when @tg should be
* dispatched. Mark that @tg was empty. This is automatically
- * cleaered on the next tg_update_disptime().
+ * cleared on the next tg_update_disptime().
*/
if (!sq->nr_queued[rw])
tg->flags |= THROTL_TG_WAS_EMPTY;
@@ -1166,8 +1188,8 @@
{
struct throtl_service_queue *sq = &tg->service_queue;
unsigned int nr_reads = 0, nr_writes = 0;
- unsigned int max_nr_reads = throtl_grp_quantum*3/4;
- unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
+ unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4;
+ unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads;
struct bio *bio;
/* Try to dispatch 75% READS and 25% WRITES */
@@ -1200,9 +1222,13 @@
unsigned int nr_disp = 0;
while (1) {
- struct throtl_grp *tg = throtl_rb_first(parent_sq);
+ struct throtl_grp *tg;
struct throtl_service_queue *sq;
+ if (!parent_sq->nr_pending)
+ break;
+
+ tg = throtl_rb_first(parent_sq);
if (!tg)
break;
@@ -1217,7 +1243,7 @@
if (sq->nr_queued[0] || sq->nr_queued[1])
tg_update_disptime(tg);
- if (nr_disp >= throtl_quantum)
+ if (nr_disp >= THROTL_QUANTUM)
break;
}
@@ -1228,7 +1254,7 @@
struct throtl_grp *this_tg);
/**
* throtl_pending_timer_fn - timer function for service_queue->pending_timer
- * @arg: the throtl_service_queue being serviced
+ * @t: the pending_timer member of the throtl_service_queue being serviced
*
* This timer is armed when a child throtl_grp with active bio's become
* pending and queued on the service_queue's pending_tree and expires when
@@ -1251,7 +1277,7 @@
bool dispatched;
int ret;
- spin_lock_irq(q->queue_lock);
+ spin_lock_irq(&q->queue_lock);
if (throtl_can_upgrade(td, NULL))
throtl_upgrade_state(td);
@@ -1274,9 +1300,9 @@
break;
/* this dispatch windows is still open, relax and repeat */
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&q->queue_lock);
cpu_relax();
- spin_lock_irq(q->queue_lock);
+ spin_lock_irq(&q->queue_lock);
}
if (!dispatched)
@@ -1294,19 +1320,19 @@
}
}
} else {
- /* reached the top-level, queue issueing */
+ /* reached the top-level, queue issuing */
queue_work(kthrotld_workqueue, &td->dispatch_work);
}
out_unlock:
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&q->queue_lock);
}
/**
* blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
* @work: work item being executed
*
- * This function is queued for execution when bio's reach the bio_lists[]
- * of throtl_data->service_queue. Those bio's are ready and issued by this
+ * This function is queued for execution when bios reach the bio_lists[]
+ * of throtl_data->service_queue. Those bios are ready and issued by this
* function.
*/
static void blk_throtl_dispatch_work_fn(struct work_struct *work)
@@ -1322,16 +1348,16 @@
bio_list_init(&bio_list_on_stack);
- spin_lock_irq(q->queue_lock);
+ spin_lock_irq(&q->queue_lock);
for (rw = READ; rw <= WRITE; rw++)
while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
bio_list_add(&bio_list_on_stack, bio);
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&q->queue_lock);
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
- while((bio = bio_list_pop(&bio_list_on_stack)))
- generic_make_request(bio);
+ while ((bio = bio_list_pop(&bio_list_on_stack)))
+ submit_bio_noacct(bio);
blk_finish_plug(&plug);
}
}
@@ -1419,8 +1445,8 @@
* that a group's limit are dropped suddenly and we don't want to
* account recently dispatched IO with new low rate.
*/
- throtl_start_new_slice(tg, 0);
- throtl_start_new_slice(tg, 1);
+ throtl_start_new_slice(tg, READ);
+ throtl_start_new_slice(tg, WRITE);
if (tg->flags & THROTL_TG_PENDING) {
tg_update_disptime(tg);
@@ -1473,6 +1499,32 @@
return tg_set_conf(of, buf, nbytes, off, false);
}
+static int tg_print_rwstat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ blkg_prfill_rwstat, &blkcg_policy_throtl,
+ seq_cft(sf)->private, true);
+ return 0;
+}
+
+static u64 tg_prfill_rwstat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat_sample sum;
+
+ blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off,
+ &sum);
+ return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int tg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ tg_prfill_rwstat_recursive, &blkcg_policy_throtl,
+ seq_cft(sf)->private, true);
+ return 0;
+}
+
static struct cftype throtl_legacy_files[] = {
{
.name = "throttle.read_bps_device",
@@ -1500,23 +1552,23 @@
},
{
.name = "throttle.io_service_bytes",
- .private = (unsigned long)&blkcg_policy_throtl,
- .seq_show = blkg_print_stat_bytes,
+ .private = offsetof(struct throtl_grp, stat_bytes),
+ .seq_show = tg_print_rwstat,
},
{
.name = "throttle.io_service_bytes_recursive",
- .private = (unsigned long)&blkcg_policy_throtl,
- .seq_show = blkg_print_stat_bytes_recursive,
+ .private = offsetof(struct throtl_grp, stat_bytes),
+ .seq_show = tg_print_rwstat_recursive,
},
{
.name = "throttle.io_serviced",
- .private = (unsigned long)&blkcg_policy_throtl,
- .seq_show = blkg_print_stat_ios,
+ .private = offsetof(struct throtl_grp, stat_ios),
+ .seq_show = tg_print_rwstat,
},
{
.name = "throttle.io_serviced_recursive",
- .private = (unsigned long)&blkcg_policy_throtl,
- .seq_show = blkg_print_stat_ios_recursive,
+ .private = offsetof(struct throtl_grp, stat_ios),
+ .seq_show = tg_print_rwstat_recursive,
},
{ } /* terminate */
};
@@ -1639,13 +1691,13 @@
goto out_finish;
ret = -EINVAL;
- if (!strcmp(tok, "rbps"))
+ if (!strcmp(tok, "rbps") && val > 1)
v[0] = val;
- else if (!strcmp(tok, "wbps"))
+ else if (!strcmp(tok, "wbps") && val > 1)
v[1] = val;
- else if (!strcmp(tok, "riops"))
+ else if (!strcmp(tok, "riops") && val > 1)
v[2] = min_t(u64, val, UINT_MAX);
- else if (!strcmp(tok, "wiops"))
+ else if (!strcmp(tok, "wiops") && val > 1)
v[3] = min_t(u64, val, UINT_MAX);
else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
idle_time = val;
@@ -1922,7 +1974,7 @@
queue_work(kthrotld_workqueue, &td->dispatch_work);
}
-static void throtl_downgrade_state(struct throtl_data *td, int new)
+static void throtl_downgrade_state(struct throtl_data *td)
{
td->scale /= 2;
@@ -1932,7 +1984,7 @@
return;
}
- td->limit_index = new;
+ td->limit_index = LIMIT_LOW;
td->low_downgrade_time = jiffies;
}
@@ -2003,12 +2055,14 @@
}
if (tg->iops[READ][LIMIT_LOW]) {
+ tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0);
iops = tg->last_io_disp[READ] * HZ / elapsed_time;
if (iops >= tg->iops[READ][LIMIT_LOW])
tg->last_low_overflow_time[READ] = now;
}
if (tg->iops[WRITE][LIMIT_LOW]) {
+ tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0);
iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
if (iops >= tg->iops[WRITE][LIMIT_LOW])
tg->last_low_overflow_time[WRITE] = now;
@@ -2019,7 +2073,7 @@
* cgroups
*/
if (throtl_hierarchy_can_downgrade(tg))
- throtl_downgrade_state(tg->td, LIMIT_LOW);
+ throtl_downgrade_state(tg->td);
tg->last_bytes_disp[READ] = 0;
tg->last_bytes_disp[WRITE] = 0;
@@ -2029,10 +2083,14 @@
static void blk_throtl_update_idletime(struct throtl_grp *tg)
{
- unsigned long now = ktime_get_ns() >> 10;
+ unsigned long now;
unsigned long last_finish_time = tg->last_finish_time;
- if (now <= last_finish_time || last_finish_time == 0 ||
+ if (last_finish_time == 0)
+ return;
+
+ now = ktime_get_ns() >> 10;
+ if (now <= last_finish_time ||
last_finish_time == tg->checked_last_finish_time)
return;
@@ -2048,7 +2106,7 @@
unsigned long last_latency[2] = { 0 };
unsigned long latency[2];
- if (!blk_queue_nonrot(td->queue))
+ if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
return;
if (time_before(jiffies, td->last_calculate_time + HZ))
return;
@@ -2123,40 +2181,55 @@
}
#endif
-static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
+void blk_throtl_charge_bio_split(struct bio *bio)
{
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- /* fallback to root_blkg if we fail to get a blkg ref */
- if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV))
- bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg);
- bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-#endif
+ struct blkcg_gq *blkg = bio->bi_blkg;
+ struct throtl_grp *parent = blkg_to_tg(blkg);
+ struct throtl_service_queue *parent_sq;
+ bool rw = bio_data_dir(bio);
+
+ do {
+ if (!parent->has_rules[rw])
+ break;
+
+ atomic_inc(&parent->io_split_cnt[rw]);
+ atomic_inc(&parent->last_io_split_cnt[rw]);
+
+ parent_sq = parent->service_queue.parent_sq;
+ parent = sq_to_tg(parent_sq);
+ } while (parent);
}
-bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
- struct bio *bio)
+bool blk_throtl_bio(struct bio *bio)
{
+ struct request_queue *q = bio->bi_disk->queue;
+ struct blkcg_gq *blkg = bio->bi_blkg;
struct throtl_qnode *qn = NULL;
- struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
+ struct throtl_grp *tg = blkg_to_tg(blkg);
struct throtl_service_queue *sq;
bool rw = bio_data_dir(bio);
bool throttled = false;
struct throtl_data *td = tg->td;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ rcu_read_lock();
/* see throtl_charge_bio() */
- if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
+ if (bio_flagged(bio, BIO_THROTTLED))
goto out;
- spin_lock_irq(q->queue_lock);
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
+ blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
+ bio->bi_iter.bi_size);
+ blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
+ }
+
+ if (!tg->has_rules[rw])
+ goto out;
+
+ spin_lock_irq(&q->queue_lock);
throtl_update_latency_buckets(td);
- if (unlikely(blk_queue_bypass(q)))
- goto out_unlock;
-
- blk_throtl_assoc_bio(tg, bio);
blk_throtl_update_idletime(tg);
sq = &tg->service_queue;
@@ -2199,7 +2272,7 @@
/*
* @bio passed through this layer without being throttled.
- * Climb up the ladder. If we''re already at the top, it
+ * Climb up the ladder. If we're already at the top, it
* can be executed directly.
*/
qn = &tg->qnode_on_parent[rw];
@@ -2235,7 +2308,7 @@
}
out_unlock:
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&q->queue_lock);
out:
bio_set_flag(bio, BIO_THROTTLED);
@@ -2243,6 +2316,7 @@
if (throttled || !td->track_bio_latency)
bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
#endif
+ rcu_read_unlock();
return throttled;
}
@@ -2271,7 +2345,8 @@
struct request_queue *q = rq->q;
struct throtl_data *td = q->td;
- throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10);
+ throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
+ time_ns >> 10);
}
void blk_throtl_bio_endio(struct bio *bio)
@@ -2288,6 +2363,8 @@
if (!blkg)
return;
tg = blkg_to_tg(blkg);
+ if (!tg->td->limit_valid[LIMIT_LOW])
+ return;
finish_time_ns = ktime_get_ns();
tg->last_finish_time = finish_time_ns >> 10;
@@ -2326,70 +2403,6 @@
}
}
#endif
-
-/*
- * Dispatch all bios from all children tg's queued on @parent_sq. On
- * return, @parent_sq is guaranteed to not have any active children tg's
- * and all bios from previously active tg's are on @parent_sq->bio_lists[].
- */
-static void tg_drain_bios(struct throtl_service_queue *parent_sq)
-{
- struct throtl_grp *tg;
-
- while ((tg = throtl_rb_first(parent_sq))) {
- struct throtl_service_queue *sq = &tg->service_queue;
- struct bio *bio;
-
- throtl_dequeue_tg(tg);
-
- while ((bio = throtl_peek_queued(&sq->queued[READ])))
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
- while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
- }
-}
-
-/**
- * blk_throtl_drain - drain throttled bios
- * @q: request_queue to drain throttled bios for
- *
- * Dispatch all currently throttled bios on @q through ->make_request_fn().
- */
-void blk_throtl_drain(struct request_queue *q)
- __releases(q->queue_lock) __acquires(q->queue_lock)
-{
- struct throtl_data *td = q->td;
- struct blkcg_gq *blkg;
- struct cgroup_subsys_state *pos_css;
- struct bio *bio;
- int rw;
-
- queue_lockdep_assert_held(q);
- rcu_read_lock();
-
- /*
- * Drain each tg while doing post-order walk on the blkg tree, so
- * that all bios are propagated to td->service_queue. It'd be
- * better to walk service_queue tree directly but blkg walk is
- * easier.
- */
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
- tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
-
- /* finally, transfer bios from top-level tg's into the td */
- tg_drain_bios(&td->service_queue);
-
- rcu_read_unlock();
- spin_unlock_irq(q->queue_lock);
-
- /* all bios now should be in td->service_queue, issue them */
- for (rw = READ; rw <= WRITE; rw++)
- while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
- NULL)))
- generic_make_request(bio);
-
- spin_lock_irq(q->queue_lock);
-}
int blk_throtl_init(struct request_queue *q)
{
@@ -2469,7 +2482,7 @@
td->throtl_slice = DFL_THROTL_SLICE_HD;
#endif
- td->track_bio_latency = !queue_is_rq_based(q);
+ td->track_bio_latency = !queue_is_mq(q);
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
}
--
Gitblit v1.6.2