forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-13 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e
kernel/drivers/md/bcache/btree.c
@@ -99,70 +99,14 @@
 #define PTR_HASH(c, k) \
 	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))

+static struct workqueue_struct *btree_io_wq;
+
 #define insert_lock(s, b)	((b)->level <= (s)->lock)

-/*
- * These macros are for recursing down the btree - they handle the details of
- * locking and looking up nodes in the cache for you. They're best treated as
- * mere syntax when reading code that uses them.
- *
- * op->lock determines whether we take a read or a write lock at a given depth.
- * If you've got a read lock and find that you need a write lock (i.e. you're
- * going to have to split), set op->lock and return -EINTR; btree_root() will
- * call you again and you'll have the correct lock.
- */
-
-/**
- * btree - recurse down the btree on a specified key
- * @fn:		function to call, which will be passed the child node
- * @key:	key to recurse on
- * @b:		parent btree node
- * @op:		pointer to struct btree_op
- */
-#define btree(fn, key, b, op, ...)					\
-({									\
-	int _r, l = (b)->level - 1;					\
-	bool _w = l <= (op)->lock;					\
-	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
-						  _w, b);		\
-	if (!IS_ERR(_child)) {						\
-		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
-		rw_unlock(_w, _child);					\
-	} else								\
-		_r = PTR_ERR(_child);					\
-	_r;								\
-})
-
-/**
- * btree_root - call a function on the root of the btree
- * @fn:		function to call, which will be passed the child node
- * @c:		cache set
- * @op:		pointer to struct btree_op
- */
-#define btree_root(fn, c, op, ...)					\
-({									\
-	int _r = -EINTR;						\
-	do {								\
-		struct btree *_b = (c)->root;				\
-		bool _w = insert_lock(op, _b);				\
-		rw_lock(_w, _b, _b->level);				\
-		if (_b == (c)->root &&					\
-		    _w == insert_lock(op, _b)) {			\
-			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
-		}							\
-		rw_unlock(_w, _b);					\
-		bch_cannibalize_unlock(c);				\
-		if (_r == -EINTR)					\
-			schedule();					\
-	} while (_r == -EINTR);						\
-									\
-	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
-	_r;								\
-})

 static inline struct bset *write_block(struct btree *b)
 {
-	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
+	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache);
 }

 static void bch_btree_init_next(struct btree *b)
@@ -175,7 +119,7 @@

 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(&b->keys, write_block(b),
-				   bset_magic(&b->c->sb));
+				   bset_magic(&b->c->cache->sb));

 }

@@ -207,8 +151,13 @@
 	struct bset *i = btree_bset_first(b);
 	struct btree_iter *iter;

+	/*
+	 * c->fill_iter can allocate an iterator with more memory space
+	 * than static MAX_BSETS.
+	 * See the comment arount cache_set->fill_iter.
+	 */
 	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
-	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
+	iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
 	iter->used = 0;

 #ifdef CONFIG_BCACHE_DEBUG
@@ -226,12 +175,12 @@
 			goto err;

 		err = "bad btree header";
-		if (b->written + set_blocks(i, block_bytes(b->c)) >
+		if (b->written + set_blocks(i, block_bytes(b->c->cache)) >
 		    btree_blocks(b))
 			goto err;

 		err = "bad magic";
-		if (i->magic != bset_magic(&b->c->sb))
+		if (i->magic != bset_magic(&b->c->cache->sb))
 			goto err;

 		err = "bad checksum";
@@ -252,13 +201,13 @@

 		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));

-		b->written += set_blocks(i, block_bytes(b->c));
+		b->written += set_blocks(i, block_bytes(b->c->cache));
 	}

 	err = "corrupted btree";
 	for (i = write_block(b);
 	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
-	     i = ((void *) i) + block_bytes(b->c))
+	     i = ((void *) i) + block_bytes(b->c->cache))
 		if (i->seq == b->keys.set[0].data->seq)
 			goto err;

@@ -272,7 +221,7 @@

 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(&b->keys, write_block(b),
-				   bset_magic(&b->c->sb));
+				   bset_magic(&b->c->cache->sb));
 out:
 	mempool_free(iter, &b->c->fill_iter);
 	return;
@@ -361,7 +310,7 @@
 	btree_complete_write(b, w);

 	if (btree_node_dirty(b))
-		schedule_delayed_work(&b->work, 30 * HZ);
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);

 	closure_return_with_destructor(cl, btree_node_write_unlock);
 }
@@ -400,7 +349,7 @@

 	b->bio->bi_end_io = btree_node_write_endio;
 	b->bio->bi_private = cl;
-	b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
+	b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache));
 	b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA;
 	bch_bio_map(b->bio, i);

@@ -424,13 +373,14 @@
 		       bset_sector_offset(&b->keys, i));

 	if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
-		int j;
 		struct bio_vec *bv;
-		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+		void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+		struct bvec_iter_all iter_all;

-		bio_for_each_segment_all(bv, b->bio, j)
-			memcpy(page_address(bv->bv_page),
-			       base + j * PAGE_SIZE, PAGE_SIZE);
+		bio_for_each_segment_all(bv, b->bio, iter_all) {
+			memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
+			addr += PAGE_SIZE;
+		}

 		bch_submit_bbio(b->bio, b->c, &k.key, 0);

@@ -475,10 +425,10 @@

 	do_btree_node_write(b);

-	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
+	atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size,
 			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

-	b->written += set_blocks(i, block_bytes(b->c));
+	b->written += set_blocks(i, block_bytes(b->c->cache));
 }

 void bch_btree_node_write(struct btree *b, struct closure *parent)
@@ -533,10 +483,15 @@
 	BUG_ON(!i->keys);

 	if (!btree_node_dirty(b))
-		schedule_delayed_work(&b->work, 30 * HZ);
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);

 	set_btree_node_dirty(b);

+	/*
+	 * w->journal is always the oldest journal pin of all bkeys
+	 * in the leaf node, to make sure the oldest jset seq won't
+	 * be increased before this btree node is flushed.
+	 */
 	if (journal_ref) {
 		if (w->journal &&
 		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
@@ -561,7 +516,7 @@
 * mca -> memory cache
 */

-#define mca_reserve(c)	(((c->root && c->root->level)		\
+#define mca_reserve(c)	(((!IS_ERR_OR_NULL(c->root) && c->root->level) \
 			  ? c->root->level : 1) * 8 + 16)
 #define mca_can_free(c)						\
 	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
@@ -607,6 +562,10 @@
 static struct btree *mca_bucket_alloc(struct cache_set *c,
 				      struct bkey *k, gfp_t gfp)
 {
+	/*
+	 * kzalloc() is necessary here for initialization,
+	 * see code comments in bch_btree_keys_init().
+	 */
 	struct btree *b = kzalloc(sizeof(struct btree), gfp);

 	if (!b)
@@ -662,7 +621,7 @@
 	 * and BTREE_NODE_journal_flush bit cleared by btree_flush_write().
 	 */
 	if (btree_node_journal_flush(b)) {
-		pr_debug("bnode %p is flushing by journal, retry", b);
+		pr_debug("bnode %p is flushing by journal, retry\n", b);
 		mutex_unlock(&b->write_lock);
 		udelay(1);
 		goto retry;
@@ -719,34 +678,32 @@

 	i = 0;
 	btree_cache_used = c->btree_cache_used;
-	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+	list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) {
 		if (nr <= 0)
 			goto out;

-		if (++i > 3 &&
-		    !mca_reap(b, 0, false)) {
+		if (!mca_reap(b, 0, false)) {
 			mca_data_free(b);
 			rw_unlock(true, b);
 			freed++;
 		}
 		nr--;
+		i++;
 	}

-	for (;  (nr--) && i < btree_cache_used; i++) {
-		if (list_empty(&c->btree_cache))
+	list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+		if (nr <= 0 || i >= btree_cache_used)
 			goto out;

-		b = list_first_entry(&c->btree_cache, struct btree, list);
-		list_rotate_left(&c->btree_cache);
-
-		if (!b->accessed &&
-		    !mca_reap(b, 0, false)) {
+		if (!mca_reap(b, 0, false)) {
 			mca_bucket_free(b);
 			mca_data_free(b);
 			rw_unlock(true, b);
 			freed++;
-		} else
-			b->accessed = 0;
+		}
+
+		nr--;
+		i++;
 	}
 out:
 	mutex_unlock(&c->bucket_lock);
@@ -783,7 +740,7 @@
 	if (c->verify_data)
 		list_move(&c->verify_data->list, &c->btree_cache);

-	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
+	free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb)));
 #endif

 	list_splice(&c->btree_cache_freeable,
@@ -830,7 +787,16 @@
 	mutex_init(&c->verify_lock);

 	c->verify_ondisk = (void *)
-		__get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(bucket_pages(c)));
+		__get_free_pages(GFP_KERNEL|__GFP_COMP,
+				 ilog2(meta_bucket_pages(&c->cache->sb)));
+	if (!c->verify_ondisk) {
+		/*
+		 * Don't worry about the mca_rereserve buckets
+		 * allocated in previous for-loop, they will be
+		 * handled properly in bch_cache_set_unregister().
+		 */
+		return -ENOMEM;
+	}

 	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

@@ -847,7 +813,7 @@
 	c->shrink.batch = c->btree_pages * 2;

 	if (register_shrinker(&c->shrink))
-		pr_warn("bcache: %s: could not register shrinker",
+		pr_warn("bcache: %s: could not register shrinker\n",
 			__func__);

 	return 0;
@@ -919,7 +885,7 @@
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
-static void bch_cannibalize_unlock(struct cache_set *c)
+void bch_cannibalize_unlock(struct cache_set *c)
 {
 	spin_lock(&c->btree_cannibalize_lock);
 	if (c->btree_cache_alloc_lock == current) {
@@ -1004,7 +970,7 @@
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
- * If IO is necessary and running under generic_make_request, returns -EAGAIN.
+ * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
@@ -1054,7 +1020,6 @@
 	BUG_ON(!b->written);

 	b->parent = parent;
-	b->accessed = 1;

 	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
 		prefetch(b->keys.set[i].tree);
@@ -1100,7 +1065,7 @@
 	 */
 	if (btree_node_journal_flush(b)) {
 		mutex_unlock(&b->write_lock);
-		pr_debug("bnode %p journal_flush set, retry", b);
+		pr_debug("bnode %p journal_flush set, retry\n", b);
 		udelay(1);
 		goto retry;
 	}
@@ -1125,11 +1090,13 @@
 				     struct btree *parent)
 {
 	BKEY_PADDED(key) k;
-	struct btree *b = ERR_PTR(-EAGAIN);
+	struct btree *b;

 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+	/* return ERR_PTR(-EAGAIN) when it fails */
+	b = ERR_PTR(-EAGAIN);
+	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
 		goto err;

 	bkey_put(c, &k.key);
@@ -1145,9 +1112,8 @@
 		goto retry;
 	}

-	b->accessed = 1;
 	b->parent = parent;
-	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
+	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb));

 	mutex_unlock(&c->bucket_lock);

@@ -1174,7 +1140,7 @@
 {
 	struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);

-	if (!IS_ERR_OR_NULL(n)) {
+	if (!IS_ERR(n)) {
 		mutex_lock(&n->write_lock);
 		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
 		bkey_copy_key(&n->key, &b->key);
@@ -1206,19 +1172,18 @@
 static int btree_check_reserve(struct btree *b, struct btree_op *op)
 {
 	struct cache_set *c = b->c;
-	struct cache *ca;
-	unsigned int i, reserve = (c->root->level - b->level) * 2 + 1;
+	struct cache *ca = c->cache;
+	unsigned int reserve = (c->root->level - b->level) * 2 + 1;

 	mutex_lock(&c->bucket_lock);

-	for_each_cache(ca, c, i)
-		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
-			if (op)
-				prepare_to_wait(&c->btree_cache_wait, &op->wait,
-						TASK_UNINTERRUPTIBLE);
-			mutex_unlock(&c->bucket_lock);
-			return -EINTR;
-		}
+	if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
+		if (op)
+			prepare_to_wait(&c->btree_cache_wait, &op->wait,
+					TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&c->bucket_lock);
+		return -EINTR;
+	}

 	mutex_unlock(&c->bucket_lock);

@@ -1377,19 +1342,19 @@
 	memset(new_nodes, 0, sizeof(new_nodes));
 	closure_init_stack(&cl);

-	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
+	while (nodes < GC_MERGE_NODES && !IS_ERR(r[nodes].b))
 		keys += r[nodes++].keys;

 	blocks = btree_default_blocks(b->c) * 2 / 3;

 	if (nodes < 2 ||
 	    __set_blocks(b->keys.set[0].data, keys,
-			 block_bytes(b->c)) > blocks * (nodes - 1))
+			 block_bytes(b->c->cache)) > blocks * (nodes - 1))
 		return 0;

 	for (i = 0; i < nodes; i++) {
 		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
-		if (IS_ERR_OR_NULL(new_nodes[i]))
+		if (IS_ERR(new_nodes[i]))
 			goto out_nocoalesce;
 	}

@@ -1418,7 +1383,7 @@
 		     k = bkey_next(k)) {
 			if (__set_blocks(n1, n1->keys + keys +
 					 bkey_u64s(k),
-					 block_bytes(b->c)) > blocks)
+					 block_bytes(b->c->cache)) > blocks)
 				break;

 			last = k;
@@ -1434,7 +1399,7 @@
 			 * though)
 			 */
 			if (__set_blocks(n1, n1->keys + n2->keys,
-					 block_bytes(b->c)) >
+					 block_bytes(b->c->cache)) >
 			    btree_blocks(new_nodes[i]))
 				goto out_unlock_nocoalesce;

@@ -1443,7 +1408,7 @@
 			last = &r->b->key;
 		}

-		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
+		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) >
 		       btree_blocks(new_nodes[i]));

 		if (last)
@@ -1517,14 +1482,14 @@

 out_nocoalesce:
 	closure_sync(&cl);
-	bch_keylist_free(&keylist);

 	while ((k = bch_keylist_pop(&keylist)))
 		if (!bkey_cmp(k, &ZERO_KEY))
 			atomic_dec(&b->c->prio_blocked);
+	bch_keylist_free(&keylist);

 	for (i = 0; i < nodes; i++)
-		if (!IS_ERR_OR_NULL(new_nodes[i])) {
+		if (!IS_ERR(new_nodes[i])) {
 			btree_node_free(new_nodes[i]);
 			rw_unlock(true, new_nodes[i]);
 		}
@@ -1706,7 +1671,7 @@
 		if (should_rewrite) {
 			n = btree_node_alloc_replacement(b, NULL);

-			if (!IS_ERR_OR_NULL(n)) {
+			if (!IS_ERR(n)) {
 				bch_btree_node_write_sync(n);

 				bch_btree_set_root(n);
@@ -1734,7 +1699,6 @@
 {
 	struct cache *ca;
 	struct bucket *b;
-	unsigned int i;

 	if (!c->gc_mark_valid)
 		return;
@@ -1744,14 +1708,14 @@
 	c->gc_mark_valid = 0;
 	c->gc_done = ZERO_KEY;

-	for_each_cache(ca, c, i)
-		for_each_bucket(b, ca) {
-			b->last_gc = b->gen;
-			if (!atomic_read(&b->pin)) {
-				SET_GC_MARK(b, 0);
-				SET_GC_SECTORS_USED(b, 0);
-			}
+	ca = c->cache;
+	for_each_bucket(b, ca) {
+		b->last_gc = b->gen;
+		if (!atomic_read(&b->pin)) {
+			SET_GC_MARK(b, 0);
+			SET_GC_SECTORS_USED(b, 0);
 		}
+	}

 	mutex_unlock(&c->bucket_lock);
 }
@@ -1760,7 +1724,8 @@
 {
 	struct bucket *b;
 	struct cache *ca;
-	unsigned int i;
+	unsigned int i, j;
+	uint64_t *k;

 	mutex_lock(&c->bucket_lock);

@@ -1778,7 +1743,6 @@
 		struct bcache_device *d = c->devices[i];
 		struct cached_dev *dc;
 		struct keybuf_key *w, *n;
-		unsigned int j;

 		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
 			continue;
@@ -1795,29 +1759,27 @@
 	rcu_read_unlock();

 	c->avail_nbuckets = 0;
-	for_each_cache(ca, c, i) {
-		uint64_t *i;

-		ca->invalidate_needs_gc = 0;
+	ca = c->cache;
+	ca->invalidate_needs_gc = 0;

-		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
-			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+	for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
+		SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);

-		for (i = ca->prio_buckets;
-		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
-			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+	for (k = ca->prio_buckets;
+	     k < ca->prio_buckets + prio_buckets(ca) * 2; k++)
+		SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);

-		for_each_bucket(b, ca) {
-			c->need_gc = max(c->need_gc, bucket_gc_gen(b));
+	for_each_bucket(b, ca) {
+		c->need_gc = max(c->need_gc, bucket_gc_gen(b));

-			if (atomic_read(&b->pin))
-				continue;
+		if (atomic_read(&b->pin))
+			continue;

-			BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+		BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));

-			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
-				c->avail_nbuckets++;
-		}
+		if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
+			c->avail_nbuckets++;
 	}

 	mutex_unlock(&c->bucket_lock);
@@ -1841,7 +1803,7 @@

 	/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
 	do {
-		ret = btree_root(gc_root, c, &op, &writes, &stats);
+		ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
 		closure_sync(&writes);
 		cond_resched();

@@ -1849,7 +1811,7 @@
 			schedule_timeout_interruptible(msecs_to_jiffies
 						       (GC_SLEEP_MS));
 		else if (ret)
-			pr_warn("gc failed!");
+			pr_warn("gc failed!\n");
 	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));

 	bch_btree_gc_finish(c);
@@ -1869,12 +1831,10 @@

 static bool gc_should_run(struct cache_set *c)
 {
-	struct cache *ca;
-	unsigned int i;
+	struct cache *ca = c->cache;

-	for_each_cache(ca, c, i)
-		if (ca->invalidate_needs_gc)
-			return true;
+	if (ca->invalidate_needs_gc)
+		return true;

 	if (atomic_read(&c->sectors_to_gc) < 0)
 		return true;
@@ -1939,7 +1899,7 @@
 		}

 		if (p)
-			ret = btree(check_recurse, p, b, op);
+			ret = bcache_btree(check_recurse, p, b, op);

 		p = k;
 	} while (p && !ret);
@@ -1948,20 +1908,186 @@
 	return ret;
 }

+
+static int bch_btree_check_thread(void *arg)
+{
+	int ret;
+	struct btree_check_info *info = arg;
+	struct btree_check_state *check_state = info->state;
+	struct cache_set *c = check_state->c;
+	struct btree_iter iter;
+	struct bkey *k, *p;
+	int cur_idx, prev_idx, skip_nr;
+
+	k = p = NULL;
+	cur_idx = prev_idx = 0;
+	ret = 0;
+
+	/* root node keys are checked before thread created */
+	bch_btree_iter_init(&c->root->keys, &iter, NULL);
+	k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+	BUG_ON(!k);
+
+	p = k;
+	while (k) {
+		/*
+		 * Fetch a root node key index, skip the keys which
+		 * should be fetched by other threads, then check the
+		 * sub-tree indexed by the fetched key.
+		 */
+		spin_lock(&check_state->idx_lock);
+		cur_idx = check_state->key_idx;
+		check_state->key_idx++;
+		spin_unlock(&check_state->idx_lock);
+
+		skip_nr = cur_idx - prev_idx;
+
+		while (skip_nr) {
+			k = bch_btree_iter_next_filter(&iter,
+						       &c->root->keys,
+						       bch_ptr_bad);
+			if (k)
+				p = k;
+			else {
+				/*
+				 * No more keys to check in root node,
+				 * current checking threads are enough,
+				 * stop creating more.
+				 */
+				atomic_set(&check_state->enough, 1);
+				/* Update check_state->enough earlier */
+				smp_mb__after_atomic();
+				goto out;
+			}
+			skip_nr--;
+			cond_resched();
+		}
+
+		if (p) {
+			struct btree_op op;
+
+			btree_node_prefetch(c->root, p);
+			c->gc_stats.nodes++;
+			bch_btree_op_init(&op, 0);
+			ret = bcache_btree(check_recurse, p, c->root, &op);
+			/*
+			 * The op may be added to cache_set's btree_cache_wait
+			 * in mca_cannibalize(), must ensure it is removed from
+			 * the list and release btree_cache_alloc_lock before
+			 * free op memory.
+			 * Otherwise, the btree_cache_wait will be damaged.
+			 */
+			bch_cannibalize_unlock(c);
+			finish_wait(&c->btree_cache_wait, &(&op)->wait);
+			if (ret)
+				goto out;
+		}
+		p = NULL;
+		prev_idx = cur_idx;
+		cond_resched();
+	}

+out:
+	info->result = ret;
+	/* update check_state->started among all CPUs */
+	smp_mb__before_atomic();
+	if (atomic_dec_and_test(&check_state->started))
+		wake_up(&check_state->wait);
+
+	return ret;
+}
+
+
+
+static int bch_btree_chkthread_nr(void)
+{
+	int n = num_online_cpus()/2;
+
+	if (n == 0)
+		n = 1;
+	else if (n > BCH_BTR_CHKTHREAD_MAX)
+		n = BCH_BTR_CHKTHREAD_MAX;
+
+	return n;
+}
+
 int bch_btree_check(struct cache_set *c)
 {
-	struct btree_op op;
+	int ret = 0;
+	int i;
+	struct bkey *k = NULL;
+	struct btree_iter iter;
+	struct btree_check_state check_state;

-	bch_btree_op_init(&op, SHRT_MAX);
+	/* check and mark root node keys */
+	for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
+		bch_initial_mark_key(c, c->root->level, k);

-	return btree_root(check_recurse, c, &op);
+	bch_initial_mark_key(c, c->root->level + 1, &c->root->key);
+
+	if (c->root->level == 0)
+		return 0;
+
+	memset(&check_state, 0, sizeof(struct btree_check_state));
+	check_state.c = c;
+	check_state.total_threads = bch_btree_chkthread_nr();
+	check_state.key_idx = 0;
+	spin_lock_init(&check_state.idx_lock);
+	atomic_set(&check_state.started, 0);
+	atomic_set(&check_state.enough, 0);
+	init_waitqueue_head(&check_state.wait);
+
+	rw_lock(0, c->root, c->root->level);
+	/*
+	 * Run multiple threads to check btree nodes in parallel,
+	 * if check_state.enough is non-zero, it means current
+	 * running check threads are enough, unncessary to create
+	 * more.
+	 */
+	for (i = 0; i < check_state.total_threads; i++) {
+		/* fetch latest check_state.enough earlier */
+		smp_mb__before_atomic();
+		if (atomic_read(&check_state.enough))
+			break;
+
+		check_state.infos[i].result = 0;
+		check_state.infos[i].state = &check_state;
+
+		check_state.infos[i].thread =
+			kthread_run(bch_btree_check_thread,
+				    &check_state.infos[i],
+				    "bch_btrchk[%d]", i);
+		if (IS_ERR(check_state.infos[i].thread)) {
+			pr_err("fails to run thread bch_btrchk[%d]\n", i);
+			for (--i; i >= 0; i--)
+				kthread_stop(check_state.infos[i].thread);
+			ret = -ENOMEM;
+			goto out;
+		}
+		atomic_inc(&check_state.started);
+	}
+
+	/*
+	 * Must wait for all threads to stop.
+	 */
+	wait_event(check_state.wait, atomic_read(&check_state.started) == 0);
+
+	for (i = 0; i < check_state.total_threads; i++) {
+		if (check_state.infos[i].result) {
+			ret = check_state.infos[i].result;
+			goto out;
+		}
+	}
+
+out:
+	rw_unlock(0, c->root);
+	return ret;
 }

 void bch_initial_gc_finish(struct cache_set *c)
 {
-	struct cache *ca;
+	struct cache *ca = c->cache;
 	struct bucket *b;
-	unsigned int i;

 	bch_btree_gc_finish(c);

@@ -1976,20 +2102,18 @@
 	 * This is only safe for buckets that have no live data in them, which
 	 * there should always be some of.
 	 */
-	for_each_cache(ca, c, i) {
-		for_each_bucket(b, ca) {
-			if (fifo_full(&ca->free[RESERVE_PRIO]) &&
-			    fifo_full(&ca->free[RESERVE_BTREE]))
-				break;
+	for_each_bucket(b, ca) {
+		if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+		    fifo_full(&ca->free[RESERVE_BTREE]))
+			break;

-			if (bch_can_invalidate_bucket(ca, b) &&
-			    !GC_MARK(b)) {
-				__bch_invalidate_one_bucket(ca, b);
-				if (!fifo_push(&ca->free[RESERVE_PRIO],
-					       b - ca->buckets))
-					fifo_push(&ca->free[RESERVE_BTREE],
-						  b - ca->buckets);
-			}
+		if (bch_can_invalidate_bucket(ca, b) &&
+		    !GC_MARK(b)) {
+			__bch_invalidate_one_bucket(ca, b);
+			if (!fifo_push(&ca->free[RESERVE_PRIO],
+				       b - ca->buckets))
+				fifo_push(&ca->free[RESERVE_BTREE],
+					  b - ca->buckets);
 		}
 	}

@@ -2097,7 +2221,7 @@
 		goto err;

 	split = set_blocks(btree_bset_first(n1),
-			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
+			   block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5;

 	if (split) {
 		unsigned int keys = 0;
@@ -2344,7 +2468,7 @@
 	if (ret) {
 		struct bkey *k;

-		pr_err("error %i", ret);
+		pr_err("error %i\n", ret);

 		while ((k = bch_keylist_pop(keys)))
 			bkey_put(c, k);
@@ -2394,7 +2518,7 @@

 		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
 						       bch_ptr_bad))) {
-			ret = btree(map_nodes_recurse, k, b,
+			ret = bcache_btree(map_nodes_recurse, k, b,
 				    op, from, fn, flags);
 			from = NULL;

@@ -2412,10 +2536,10 @@
 int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
 			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
 {
-	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
+	return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags);
 }

-static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
+int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
 			       struct bkey *from, btree_map_keys_fn *fn,
 			       int flags)
 {
@@ -2428,7 +2552,8 @@
 	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
 		ret = !b->level
 			? fn(op, b, k)
-			: btree(map_keys_recurse, k, b, op, from, fn, flags);
+			: bcache_btree(map_keys_recurse, k,
+				       b, op, from, fn, flags);
 		from = NULL;

 		if (ret != MAP_CONTINUE)
@@ -2445,7 +2570,7 @@
 int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
 		       struct bkey *from, btree_map_keys_fn *fn, int flags)
 {
-	return btree_root(map_keys_recurse, c, op, from, fn, flags);
+	return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags);
 }

 /* Keybuf code */
@@ -2631,7 +2756,7 @@
 			break;

 		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
-			pr_debug("scan finished");
+			pr_debug("scan finished\n");
 			break;
 		}

@@ -2649,3 +2774,18 @@
 	spin_lock_init(&buf->lock);
 	array_allocator_init(&buf->freelist);
 }
+
+void bch_btree_exit(void)
+{
+	if (btree_io_wq)
+		destroy_workqueue(btree_io_wq);
+}
+
+int __init bch_btree_init(void)
+{
+	btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
+	if (!btree_io_wq)
+		return -ENOMEM;
+
+	return 0;
+}