forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/drivers/md/bcache/writeback.c
@@ -17,6 +17,15 @@
 #include <linux/sched/clock.h>
 #include <trace/events/bcache.h>
 
+static void update_gc_after_writeback(struct cache_set *c)
+{
+        if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
+            c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
+                return;
+
+        c->gc_after_writeback |= BCH_DO_AUTO_GC;
+}
+
 /* Rate limiting */
 static uint64_t __calc_target_rate(struct cached_dev *dc)
 {
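For context on the new helper above: c->gc_after_writeback is a small bit field shared between the sysfs code and the writeback thread. The sketch below shows the layout this hunk assumes; the macro names come from the diff itself, while the numeric values and the threshold default mirror the upstream bcache writeback.h and may differ in this tree.

/* Assumed flag layout for c->gc_after_writeback (sketch of writeback.h;
 * values are illustrative and may differ in this tree).
 */
#define BCH_ENABLE_AUTO_GC              1       /* user opted in via sysfs */
#define BCH_DO_AUTO_GC                  2       /* latched once dirty use crosses the threshold */

/* Wake gc only once this much of the cache is in use (percent). */
#define BCH_AUTO_GC_DIRTY_THRESHOLD     50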
@@ -26,7 +35,7 @@
 	 * This is the size of the cache, minus the amount used for
 	 * flash-only devices
 	 */
-        uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
+        uint64_t cache_sectors = c->nbuckets * c->cache->sb.bucket_size -
                                 atomic_long_read(&c->flash_dev_dirty_sectors);
 
 	/*
@@ -110,24 +119,65 @@
         dc->writeback_rate_target = target;
 }
 
+static bool idle_counter_exceeded(struct cache_set *c)
+{
+        int counter, dev_nr;
+
+        /*
+         * If c->idle_counter overflows (idle for a really long time),
+         * reset it to 0 and do not set the maximum rate this time, for
+         * code simplicity.
+         */
+        counter = atomic_inc_return(&c->idle_counter);
+        if (counter <= 0) {
+                atomic_set(&c->idle_counter, 0);
+                return false;
+        }
+
+        dev_nr = atomic_read(&c->attached_dev_nr);
+        if (dev_nr == 0)
+                return false;
+
+        /*
+         * c->idle_counter is increased by the writeback threads of all
+         * attached backing devices. In order to represent a rough time
+         * period, the counter should be divided by dev_nr; otherwise
+         * the idle time cannot grow larger as more backing devices are
+         * attached.
+         * The following calculation is equivalent to checking
+         * (counter / dev_nr) < (dev_nr * 6)
+         */
+        if (counter < (dev_nr * dev_nr * 6))
+                return false;
+
+        return true;
+}
+
+/*
+ * c->idle_counter is increased every time update_writeback_rate() is
+ * called. If all backing devices attached to the same cache set have
+ * identical dc->writeback_rate_update_seconds values, it takes about 6
+ * rounds of update_writeback_rate() on each backing device before
+ * c->at_max_writeback_rate is set to 1, and then the max writeback
+ * rate is set for each dc->writeback_rate.rate.
+ * In order to avoid the extra locking cost of counting the exact
+ * number of dirty cached devices, c->attached_dev_nr is used to
+ * calculate the idle threshold. It might be bigger if not all cached
+ * devices are in writeback mode, but it still works well with a
+ * limited number of extra rounds of update_writeback_rate().
+ */
 static bool set_at_max_writeback_rate(struct cache_set *c,
                                       struct cached_dev *dc)
 {
-        /*
-         * Idle_counter is increased everytime when update_writeback_rate() is
-         * called. If all backing devices attached to the same cache set have
-         * identical dc->writeback_rate_update_seconds values, it is about 6
-         * rounds of update_writeback_rate() on each backing device before
-         * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
-         * to each dc->writeback_rate.rate.
-         * In order to avoid extra locking cost for counting exact dirty cached
-         * devices number, c->attached_dev_nr is used to calculate the idle
-         * throushold. It might be bigger if not all cached device are in write-
-         * back mode, but it still works well with limited extra rounds of
-         * update_writeback_rate().
-         */
-        if (atomic_inc_return(&c->idle_counter) <
-            atomic_read(&c->attached_dev_nr) * 6)
+        /* Don't set max writeback rate if it is disabled */
+        if (!c->idle_max_writeback_rate_enabled)
+                return false;
+
+        /* Don't set max writeback rate if gc is running */
+        if (!c->gc_mark_valid)
+                return false;
+
+        if (!idle_counter_exceeded(c))
                 return false;
 
         if (atomic_read(&c->at_max_writeback_rate) != 1)
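The threshold check at the end of idle_counter_exceeded() is written without a division: for non-negative counter and positive dev_nr, (counter / dev_nr) < (dev_nr * 6) holds exactly when counter < dev_nr * dev_nr * 6. A tiny standalone C check of that equivalence, using a hypothetical helper that is not part of the patch:

#include <assert.h>

/* Same predicate as idle_counter_exceeded(), isolated for illustration. */
static int idle_threshold_hit(int counter, int dev_nr)
{
        return counter >= dev_nr * dev_nr * 6;
}

int main(void)
{
        /* One backing device: the max rate kicks in after ~6 counted rounds. */
        assert(!idle_threshold_hit(5, 1));
        assert(idle_threshold_hit(6, 1));

        /* Four backing devices: the counter must reach 4 * 4 * 6 = 96. */
        assert(!idle_threshold_hit(95, 4));
        assert(idle_threshold_hit(96, 4));
        return 0;
}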
@@ -141,13 +191,10 @@
         dc->writeback_rate_change = 0;
 
         /*
-         * Check c->idle_counter and c->at_max_writeback_rate agagain in case
-         * new I/O arrives during before set_at_max_writeback_rate() returns.
-         * Then the writeback rate is set to 1, and its new value should be
-         * decided via __update_writeback_rate().
+         * In case new I/O arrives before
+         * set_at_max_writeback_rate() returns.
          */
-        if ((atomic_read(&c->idle_counter) <
-            atomic_read(&c->attached_dev_nr) * 6) ||
+        if (!idle_counter_exceeded(c) ||
             !atomic_read(&c->at_max_writeback_rate))
                 return false;
 
@@ -167,7 +214,7 @@
         */
        set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
        /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
-       smp_mb();
+       smp_mb__after_atomic();
 
        /*
         * CACHE_SET_IO_DISABLE might be set via sysfs interface,
@@ -177,7 +224,7 @@
            test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
                clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
                /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
-               smp_mb();
+               smp_mb__after_atomic();
                return;
        }
 
@@ -191,6 +238,7 @@
                if (!set_at_max_writeback_rate(c, dc)) {
                        down_read(&dc->writeback_lock);
                        __update_writeback_rate(dc);
+                       update_gc_after_writeback(c);
                        up_read(&dc->writeback_lock);
                }
        }
@@ -212,7 +260,7 @@
         */
        clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
        /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
-       smp_mb();
+       smp_mb__after_atomic();
 }
 
 static unsigned int writeback_delay(struct cached_dev *dc,
@@ -442,10 +490,8 @@
                for (i = 0; i < nk; i++) {
                        w = keys[i];
 
-                       io = kzalloc(sizeof(struct dirty_io) +
-                                    sizeof(struct bio_vec) *
-                                    DIV_ROUND_UP(KEY_SIZE(&w->key),
-                                                 PAGE_SECTORS),
+                       io = kzalloc(struct_size(io, bio.bi_inline_vecs,
+                                    DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
                                     GFP_KERNEL);
                        if (!io)
                                goto err;
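struct_size() from <linux/overflow.h> computes the same sizeof(struct dirty_io) + n * sizeof(struct bio_vec) as the removed open-coded expression, but saturates to SIZE_MAX if the multiplication or addition overflows, so kzalloc() fails instead of returning an undersized buffer. A minimal sketch of the pattern on a hypothetical structure (not the bcache types):

#include <linux/bvec.h>
#include <linux/overflow.h>
#include <linux/slab.h>

/* Hypothetical example type with a trailing flexible array member. */
struct demo_io {
        unsigned int    nr_vecs;
        struct bio_vec  vecs[];
};

static struct demo_io *demo_io_alloc(unsigned int nr)
{
        struct demo_io *io;

        /* sizeof(*io) + nr * sizeof(io->vecs[0]), overflow-checked. */
        io = kzalloc(struct_size(io, vecs, nr), GFP_KERNEL);
        if (io)
                io->nr_vecs = nr;
        return io;
}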
@@ -693,6 +739,23 @@
                                up_write(&dc->writeback_lock);
                                break;
                        }
+
+                       /*
+                        * When the dirty data ratio is high (e.g. 50%+), there might
+                        * be heavy bucket fragmentation after writeback
+                        * finishes, which hurts subsequent write performance.
+                        * If users really care about write performance they
+                        * may set BCH_ENABLE_AUTO_GC via sysfs; then, when
+                        * BCH_DO_AUTO_GC is set, the garbage collection thread
+                        * will be woken up here. After moving gc, the shrunk
+                        * btree and the discarded free bucket SSD space may be
+                        * helpful for subsequent write requests.
+                        */
+                       if (c->gc_after_writeback ==
+                           (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
+                               c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
+                               force_wake_up_gc(c);
+                       }
                }
 
                up_write(&dc->writeback_lock);
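force_wake_up_gc() is not defined in this file; in upstream bcache it is a small inline helper in btree.h. The sketch below, reproduced from the upstream helper as remembered and so possibly differing in this tree, shows why clearing BCH_DO_AUTO_GC and calling it is enough to start a gc pass:

/* Sketch of the upstream helper from btree.h; may differ in this tree. */
static inline void force_wake_up_gc(struct cache_set *c)
{
        /*
         * The gc thread only agrees to run once sectors_to_gc drops
         * below zero, so force it negative before waking the thread;
         * a bare wake_up_gc() would otherwise be ignored.
         */
        atomic_set(&c->sectors_to_gc, -1);
        wake_up_gc(c);
}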
@@ -724,13 +787,11 @@
 
 /* Init */
 #define INIT_KEYS_EACH_TIME	500000
-#define INIT_KEYS_SLEEP_MS	100
 
 struct sectors_dirty_init {
        struct btree_op op;
        unsigned int inode;
        size_t count;
-       struct bkey start;
 };
 
 static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -746,16 +807,15 @@
                                     KEY_START(k), KEY_SIZE(k));
 
        op->count++;
-       if (atomic_read(&b->c->search_inflight) &&
-           !(op->count % INIT_KEYS_EACH_TIME)) {
-               bkey_copy_key(&op->start, k);
-               return -EAGAIN;
-       }
+       if (!(op->count % INIT_KEYS_EACH_TIME))
+               cond_resched();
 
        return MAP_CONTINUE;
 }
 
-void bch_sectors_dirty_init(struct bcache_device *d)
+static int bch_root_node_dirty_init(struct cache_set *c,
+                                    struct bcache_device *d,
+                                    struct bkey *k)
 {
        struct sectors_dirty_init op;
        int ret;
@@ -763,19 +823,148 @@
        bch_btree_op_init(&op.op, -1);
        op.inode = d->id;
        op.count = 0;
-       op.start = KEY(op.inode, 0, 0);
 
-       do {
-               ret = bch_btree_map_keys(&op.op, d->c, &op.start,
-                                        sectors_dirty_init_fn, 0);
-               if (ret == -EAGAIN)
-                       schedule_timeout_interruptible(
-                               msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
-               else if (ret < 0) {
-                       pr_warn("sectors dirty init failed, ret=%d!", ret);
-                       break;
+       ret = bcache_btree(map_keys_recurse,
+                          k,
+                          c->root,
+                          &op.op,
+                          &KEY(op.inode, 0, 0),
+                          sectors_dirty_init_fn,
+                          0);
+       if (ret < 0)
+               pr_warn("sectors dirty init failed, ret=%d!\n", ret);
+
+       return ret;
+}
+
+static int bch_dirty_init_thread(void *arg)
+{
+       struct dirty_init_thrd_info *info = arg;
+       struct bch_dirty_init_state *state = info->state;
+       struct cache_set *c = state->c;
+       struct btree_iter iter;
+       struct bkey *k, *p;
+       int cur_idx, prev_idx, skip_nr;
+
+       k = p = NULL;
+       cur_idx = prev_idx = 0;
+
+       bch_btree_iter_init(&c->root->keys, &iter, NULL);
+       k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+       BUG_ON(!k);
+
+       p = k;
+
+       while (k) {
+               spin_lock(&state->idx_lock);
+               cur_idx = state->key_idx;
+               state->key_idx++;
+               spin_unlock(&state->idx_lock);
+
+               skip_nr = cur_idx - prev_idx;
+
+               while (skip_nr) {
+                       k = bch_btree_iter_next_filter(&iter,
+                                                      &c->root->keys,
+                                                      bch_ptr_bad);
+                       if (k)
+                               p = k;
+                       else {
+                               atomic_set(&state->enough, 1);
+                               /* Update state->enough earlier */
+                               smp_mb__after_atomic();
+                               goto out;
+                       }
+                       skip_nr--;
                }
-       } while (ret == -EAGAIN);
+
+               if (p) {
+                       if (bch_root_node_dirty_init(c, state->d, p) < 0)
+                               goto out;
+               }
+
+               p = NULL;
+               prev_idx = cur_idx;
+       }
+
+out:
+       /* In order to wake up state->wait in time */
+       smp_mb__before_atomic();
+       if (atomic_dec_and_test(&state->started))
+               wake_up(&state->wait);
+
+       return 0;
+}
+
+static int bch_btre_dirty_init_thread_nr(void)
+{
+       int n = num_online_cpus()/2;
+
+       if (n == 0)
+               n = 1;
+       else if (n > BCH_DIRTY_INIT_THRD_MAX)
+               n = BCH_DIRTY_INIT_THRD_MAX;
+
+       return n;
+}
+
+void bch_sectors_dirty_init(struct bcache_device *d)
+{
+       int i;
+       struct bkey *k = NULL;
+       struct btree_iter iter;
+       struct sectors_dirty_init op;
+       struct cache_set *c = d->c;
+       struct bch_dirty_init_state state;
+
+       /* Just count root keys if no leaf node */
+       rw_lock(0, c->root, c->root->level);
+       if (c->root->level == 0) {
+               bch_btree_op_init(&op.op, -1);
+               op.inode = d->id;
+               op.count = 0;
+
+               for_each_key_filter(&c->root->keys,
+                                   k, &iter, bch_ptr_invalid)
+                       sectors_dirty_init_fn(&op.op, c->root, k);
+
+               rw_unlock(0, c->root);
+               return;
+       }
+
+       memset(&state, 0, sizeof(struct bch_dirty_init_state));
+       state.c = c;
+       state.d = d;
+       state.total_threads = bch_btre_dirty_init_thread_nr();
+       state.key_idx = 0;
+       spin_lock_init(&state.idx_lock);
+       atomic_set(&state.started, 0);
+       atomic_set(&state.enough, 0);
+       init_waitqueue_head(&state.wait);
+
+       for (i = 0; i < state.total_threads; i++) {
+               /* Fetch latest state.enough earlier */
+               smp_mb__before_atomic();
+               if (atomic_read(&state.enough))
+                       break;
+
+               state.infos[i].state = &state;
+               state.infos[i].thread =
+                       kthread_run(bch_dirty_init_thread, &state.infos[i],
+                                   "bch_dirtcnt[%d]", i);
+               if (IS_ERR(state.infos[i].thread)) {
+                       pr_err("fails to run thread bch_dirty_init[%d]\n", i);
+                       for (--i; i >= 0; i--)
+                               kthread_stop(state.infos[i].thread);
+                       goto out;
+               }
+               atomic_inc(&state.started);
+       }
+
+out:
+       /* Must wait for all threads to stop. */
+       wait_event(state.wait, atomic_read(&state.started) == 0);
+       rw_unlock(0, c->root);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
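The multi-threaded dirty-sector init added above relies on two small structures declared in writeback.h. The sketch below is inferred from the fields the diff actually uses (state->c, state->d, key_idx, idx_lock, started, enough, wait, infos[i].state/thread) and mirrors the upstream declarations; the exact field order and the BCH_DIRTY_INIT_THRD_MAX value may differ in this tree.

/* Companion declarations assumed by bch_sectors_dirty_init() above
 * (sketch of writeback.h; may differ in this tree).
 */
#define BCH_DIRTY_INIT_THRD_MAX         12

struct bch_dirty_init_state;

struct dirty_init_thrd_info {
        struct bch_dirty_init_state     *state;
        struct task_struct              *thread;
};

struct bch_dirty_init_state {
        struct cache_set                *c;
        struct bcache_device            *d;
        int                             total_threads;
        int                             key_idx;        /* next root-node key to hand out */
        spinlock_t                      idx_lock;
        atomic_t                        started;        /* running worker threads */
        atomic_t                        enough;         /* set once the key space is exhausted */
        wait_queue_head_t               wait;
        struct dirty_init_thrd_info     infos[BCH_DIRTY_INIT_THRD_MAX];
};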