From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] add new system file
---
kernel/drivers/md/bcache/writeback.c | 295 +++++++++++++++++++++++++++++++++++++++++++++++++---------
1 files changed, 247 insertions(+), 48 deletions(-)
diff --git a/kernel/drivers/md/bcache/writeback.c b/kernel/drivers/md/bcache/writeback.c
index aa58833..6324c92 100644
--- a/kernel/drivers/md/bcache/writeback.c
+++ b/kernel/drivers/md/bcache/writeback.c
@@ -17,6 +17,15 @@
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>
+static void update_gc_after_writeback(struct cache_set *c)
+{
+ if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
+ c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
+ return;
+
+ c->gc_after_writeback |= BCH_DO_AUTO_GC;
+}
+
/* Rate limiting */
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
@@ -26,7 +35,7 @@
* This is the size of the cache, minus the amount used for
* flash-only devices
*/
- uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
+ uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size -
atomic_long_read(&c->flash_dev_dirty_sectors);
/*
@@ -110,24 +119,65 @@
dc->writeback_rate_target = target;
}
+static bool idle_counter_exceeded(struct cache_set *c)
+{
+ int counter, dev_nr;
+
+ /*
+ * If c->idle_counter is overflow (idel for really long time),
+ * reset as 0 and not set maximum rate this time for code
+ * simplicity.
+ */
+ counter = atomic_inc_return(&c->idle_counter);
+ if (counter <= 0) {
+ atomic_set(&c->idle_counter, 0);
+ return false;
+ }
+
+ dev_nr = atomic_read(&c->attached_dev_nr);
+ if (dev_nr == 0)
+ return false;
+
+ /*
+ * c->idle_counter is increased by writeback thread of all
+ * attached backing devices, in order to represent a rough
+ * time period, counter should be divided by dev_nr.
+ * Otherwise the idle time cannot be larger with more backing
+ * device attached.
+ * The following calculation equals to checking
+ * (counter / dev_nr) < (dev_nr * 6)
+ */
+ if (counter < (dev_nr * dev_nr * 6))
+ return false;
+
+ return true;
+}
+
+/*
+ * Idle_counter is increased every time when update_writeback_rate() is
+ * called. If all backing devices attached to the same cache set have
+ * identical dc->writeback_rate_update_seconds values, it is about 6
+ * rounds of update_writeback_rate() on each backing device before
+ * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
+ * to each dc->writeback_rate.rate.
+ * In order to avoid extra locking cost for counting exact dirty cached
+ * devices number, c->attached_dev_nr is used to calculate the idle
+ * throushold. It might be bigger if not all cached device are in write-
+ * back mode, but it still works well with limited extra rounds of
+ * update_writeback_rate().
+ */
static bool set_at_max_writeback_rate(struct cache_set *c,
struct cached_dev *dc)
{
- /*
- * Idle_counter is increased everytime when update_writeback_rate() is
- * called. If all backing devices attached to the same cache set have
- * identical dc->writeback_rate_update_seconds values, it is about 6
- * rounds of update_writeback_rate() on each backing device before
- * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
- * to each dc->writeback_rate.rate.
- * In order to avoid extra locking cost for counting exact dirty cached
- * devices number, c->attached_dev_nr is used to calculate the idle
- * throushold. It might be bigger if not all cached device are in write-
- * back mode, but it still works well with limited extra rounds of
- * update_writeback_rate().
- */
- if (atomic_inc_return(&c->idle_counter) <
- atomic_read(&c->attached_dev_nr) * 6)
+ /* Don't sst max writeback rate if it is disabled */
+ if (!c->idle_max_writeback_rate_enabled)
+ return false;
+
+ /* Don't set max writeback rate if gc is running */
+ if (!c->gc_mark_valid)
+ return false;
+
+ if (!idle_counter_exceeded(c))
return false;
if (atomic_read(&c->at_max_writeback_rate) != 1)
@@ -141,13 +191,10 @@
dc->writeback_rate_change = 0;
/*
- * Check c->idle_counter and c->at_max_writeback_rate agagain in case
- * new I/O arrives during before set_at_max_writeback_rate() returns.
- * Then the writeback rate is set to 1, and its new value should be
- * decided via __update_writeback_rate().
+ * In case new I/O arrives during before
+ * set_at_max_writeback_rate() returns.
*/
- if ((atomic_read(&c->idle_counter) <
- atomic_read(&c->attached_dev_nr) * 6) ||
+ if (!idle_counter_exceeded(c) ||
!atomic_read(&c->at_max_writeback_rate))
return false;
@@ -167,7 +214,7 @@
*/
set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
- smp_mb();
+ smp_mb__after_atomic();
/*
* CACHE_SET_IO_DISABLE might be set via sysfs interface,
@@ -177,7 +224,7 @@
test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
- smp_mb();
+ smp_mb__after_atomic();
return;
}
@@ -191,6 +238,7 @@
if (!set_at_max_writeback_rate(c, dc)) {
down_read(&dc->writeback_lock);
__update_writeback_rate(dc);
+ update_gc_after_writeback(c);
up_read(&dc->writeback_lock);
}
}
@@ -212,7 +260,7 @@
*/
clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
- smp_mb();
+ smp_mb__after_atomic();
}
static unsigned int writeback_delay(struct cached_dev *dc,
@@ -442,10 +490,8 @@
for (i = 0; i < nk; i++) {
w = keys[i];
- io = kzalloc(sizeof(struct dirty_io) +
- sizeof(struct bio_vec) *
- DIV_ROUND_UP(KEY_SIZE(&w->key),
- PAGE_SECTORS),
+ io = kzalloc(struct_size(io, bio.bi_inline_vecs,
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
GFP_KERNEL);
if (!io)
goto err;
@@ -693,6 +739,23 @@
up_write(&dc->writeback_lock);
break;
}
+
+ /*
+ * When dirty data rate is high (e.g. 50%+), there might
+ * be heavy buckets fragmentation after writeback
+ * finished, which hurts following write performance.
+ * If users really care about write performance they
+ * may set BCH_ENABLE_AUTO_GC via sysfs, then when
+ * BCH_DO_AUTO_GC is set, garbage collection thread
+ * will be wake up here. After moving gc, the shrunk
+ * btree and discarded free buckets SSD space may be
+ * helpful for following write requests.
+ */
+ if (c->gc_after_writeback ==
+ (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
+ c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
+ force_wake_up_gc(c);
+ }
}
up_write(&dc->writeback_lock);
@@ -724,13 +787,11 @@
/* Init */
#define INIT_KEYS_EACH_TIME 500000
-#define INIT_KEYS_SLEEP_MS 100
struct sectors_dirty_init {
struct btree_op op;
unsigned int inode;
size_t count;
- struct bkey start;
};
static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -746,16 +807,15 @@
KEY_START(k), KEY_SIZE(k));
op->count++;
- if (atomic_read(&b->c->search_inflight) &&
- !(op->count % INIT_KEYS_EACH_TIME)) {
- bkey_copy_key(&op->start, k);
- return -EAGAIN;
- }
+ if (!(op->count % INIT_KEYS_EACH_TIME))
+ cond_resched();
return MAP_CONTINUE;
}
-void bch_sectors_dirty_init(struct bcache_device *d)
+static int bch_root_node_dirty_init(struct cache_set *c,
+ struct bcache_device *d,
+ struct bkey *k)
{
struct sectors_dirty_init op;
int ret;
@@ -763,19 +823,158 @@
bch_btree_op_init(&op.op, -1);
op.inode = d->id;
op.count = 0;
- op.start = KEY(op.inode, 0, 0);
- do {
- ret = bch_btree_map_keys(&op.op, d->c, &op.start,
- sectors_dirty_init_fn, 0);
- if (ret == -EAGAIN)
- schedule_timeout_interruptible(
- msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
- else if (ret < 0) {
- pr_warn("sectors dirty init failed, ret=%d!", ret);
- break;
+ ret = bcache_btree(map_keys_recurse,
+ k,
+ c->root,
+ &op.op,
+ &KEY(op.inode, 0, 0),
+ sectors_dirty_init_fn,
+ 0);
+ if (ret < 0)
+ pr_warn("sectors dirty init failed, ret=%d!\n", ret);
+
+ /*
+ * The op may be added to cache_set's btree_cache_wait
+ * in mca_cannibalize(), must ensure it is removed from
+ * the list and release btree_cache_alloc_lock before
+ * free op memory.
+ * Otherwise, the btree_cache_wait will be damaged.
+ */
+ bch_cannibalize_unlock(c);
+ finish_wait(&c->btree_cache_wait, &(&op.op)->wait);
+
+ return ret;
+}
+
+static int bch_dirty_init_thread(void *arg)
+{
+ struct dirty_init_thrd_info *info = arg;
+ struct bch_dirty_init_state *state = info->state;
+ struct cache_set *c = state->c;
+ struct btree_iter iter;
+ struct bkey *k, *p;
+ int cur_idx, prev_idx, skip_nr;
+
+ k = p = NULL;
+ cur_idx = prev_idx = 0;
+
+ bch_btree_iter_init(&c->root->keys, &iter, NULL);
+ k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+ BUG_ON(!k);
+
+ p = k;
+
+ while (k) {
+ spin_lock(&state->idx_lock);
+ cur_idx = state->key_idx;
+ state->key_idx++;
+ spin_unlock(&state->idx_lock);
+
+ skip_nr = cur_idx - prev_idx;
+
+ while (skip_nr) {
+ k = bch_btree_iter_next_filter(&iter,
+ &c->root->keys,
+ bch_ptr_bad);
+ if (k)
+ p = k;
+ else {
+ atomic_set(&state->enough, 1);
+ /* Update state->enough earlier */
+ smp_mb__after_atomic();
+ goto out;
+ }
+ skip_nr--;
}
- } while (ret == -EAGAIN);
+
+ if (p) {
+ if (bch_root_node_dirty_init(c, state->d, p) < 0)
+ goto out;
+ }
+
+ p = NULL;
+ prev_idx = cur_idx;
+ }
+
+out:
+ /* In order to wake up state->wait in time */
+ smp_mb__before_atomic();
+ if (atomic_dec_and_test(&state->started))
+ wake_up(&state->wait);
+
+ return 0;
+}
+
+static int bch_btre_dirty_init_thread_nr(void)
+{
+ int n = num_online_cpus()/2;
+
+ if (n == 0)
+ n = 1;
+ else if (n > BCH_DIRTY_INIT_THRD_MAX)
+ n = BCH_DIRTY_INIT_THRD_MAX;
+
+ return n;
+}
+
+void bch_sectors_dirty_init(struct bcache_device *d)
+{
+ int i;
+ struct bkey *k = NULL;
+ struct btree_iter iter;
+ struct sectors_dirty_init op;
+ struct cache_set *c = d->c;
+ struct bch_dirty_init_state state;
+
+ /* Just count root keys if no leaf node */
+ rw_lock(0, c->root, c->root->level);
+ if (c->root->level == 0) {
+ bch_btree_op_init(&op.op, -1);
+ op.inode = d->id;
+ op.count = 0;
+
+ for_each_key_filter(&c->root->keys,
+ k, &iter, bch_ptr_invalid)
+ sectors_dirty_init_fn(&op.op, c->root, k);
+
+ rw_unlock(0, c->root);
+ return;
+ }
+
+ memset(&state, 0, sizeof(struct bch_dirty_init_state));
+ state.c = c;
+ state.d = d;
+ state.total_threads = bch_btre_dirty_init_thread_nr();
+ state.key_idx = 0;
+ spin_lock_init(&state.idx_lock);
+ atomic_set(&state.started, 0);
+ atomic_set(&state.enough, 0);
+ init_waitqueue_head(&state.wait);
+
+ for (i = 0; i < state.total_threads; i++) {
+ /* Fetch latest state.enough earlier */
+ smp_mb__before_atomic();
+ if (atomic_read(&state.enough))
+ break;
+
+ state.infos[i].state = &state;
+ state.infos[i].thread =
+ kthread_run(bch_dirty_init_thread, &state.infos[i],
+ "bch_dirtcnt[%d]", i);
+ if (IS_ERR(state.infos[i].thread)) {
+ pr_err("fails to run thread bch_dirty_init[%d]\n", i);
+ for (--i; i >= 0; i--)
+ kthread_stop(state.infos[i].thread);
+ goto out;
+ }
+ atomic_inc(&state.started);
+ }
+
+out:
+ /* Must wait for all threads to stop. */
+ wait_event(state.wait, atomic_read(&state.started) == 0);
+ rw_unlock(0, c->root);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
--
Gitblit v1.6.2