forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/drivers/md/bcache/btree.c
@@ -99,70 +99,14 @@
 #define PTR_HASH(c, k)							\
 	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
 
+static struct workqueue_struct *btree_io_wq;
+
 #define insert_lock(s, b)	((b)->level <= (s)->lock)
 
-/*
- * These macros are for recursing down the btree - they handle the details of
- * locking and looking up nodes in the cache for you. They're best treated as
- * mere syntax when reading code that uses them.
- *
- * op->lock determines whether we take a read or a write lock at a given depth.
- * If you've got a read lock and find that you need a write lock (i.e. you're
- * going to have to split), set op->lock and return -EINTR; btree_root() will
- * call you again and you'll have the correct lock.
- */
-
-/**
- * btree - recurse down the btree on a specified key
- * @fn:		function to call, which will be passed the child node
- * @key:	key to recurse on
- * @b:		parent btree node
- * @op:		pointer to struct btree_op
- */
-#define btree(fn, key, b, op, ...)					\
-({									\
-	int _r, l = (b)->level - 1;					\
-	bool _w = l <= (op)->lock;					\
-	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
-						  _w, b);		\
-	if (!IS_ERR(_child)) {						\
-		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
-		rw_unlock(_w, _child);					\
-	} else								\
-		_r = PTR_ERR(_child);					\
-	_r;								\
-})
-
-/**
- * btree_root - call a function on the root of the btree
- * @fn:		function to call, which will be passed the child node
- * @c:		cache set
- * @op:		pointer to struct btree_op
- */
-#define btree_root(fn, c, op, ...)					\
-({									\
-	int _r = -EINTR;						\
-	do {								\
-		struct btree *_b = (c)->root;				\
-		bool _w = insert_lock(op, _b);				\
-		rw_lock(_w, _b, _b->level);				\
-		if (_b == (c)->root &&					\
-		    _w == insert_lock(op, _b)) {			\
-			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
-		}							\
-		rw_unlock(_w, _b);					\
-		bch_cannibalize_unlock(c);				\
-		if (_r == -EINTR)					\
-			schedule();					\
-	} while (_r == -EINTR);						\
-									\
-	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
-	_r;								\
-})
 
 static inline struct bset *write_block(struct btree *b)
 {
-	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
+	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache);
 }
 
 static void bch_btree_init_next(struct btree *b)
@@ -175,7 +119,7 @@
 
 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(&b->keys, write_block(b),
-				   bset_magic(&b->c->sb));
+				   bset_magic(&b->c->cache->sb));
 
 }
 
@@ -207,8 +151,13 @@
 	struct bset *i = btree_bset_first(b);
 	struct btree_iter *iter;
 
+	/*
+	 * c->fill_iter can allocate an iterator with more memory space
+	 * than static MAX_BSETS.
+	 * See the comment arount cache_set->fill_iter.
+	 */
 	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
-	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
+	iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
 	iter->used = 0;
 
 #ifdef CONFIG_BCACHE_DEBUG
@@ -226,12 +175,12 @@
 			goto err;
 
 		err = "bad btree header";
-		if (b->written + set_blocks(i, block_bytes(b->c)) >
+		if (b->written + set_blocks(i, block_bytes(b->c->cache)) >
 		    btree_blocks(b))
 			goto err;
 
 		err = "bad magic";
-		if (i->magic != bset_magic(&b->c->sb))
+		if (i->magic != bset_magic(&b->c->cache->sb))
 			goto err;
 
 		err = "bad checksum";
@@ -252,13 +201,13 @@
 
 		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
 
-		b->written += set_blocks(i, block_bytes(b->c));
+		b->written += set_blocks(i, block_bytes(b->c->cache));
 	}
 
 	err = "corrupted btree";
 	for (i = write_block(b);
 	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
-	     i = ((void *) i) + block_bytes(b->c))
+	     i = ((void *) i) + block_bytes(b->c->cache))
 		if (i->seq == b->keys.set[0].data->seq)
 			goto err;
 
@@ -272,7 +221,7 @@
 
 	if (b->written < btree_blocks(b))
 		bch_bset_init_next(&b->keys, write_block(b),
-				   bset_magic(&b->c->sb));
+				   bset_magic(&b->c->cache->sb));
 out:
 	mempool_free(iter, &b->c->fill_iter);
 	return;
@@ -361,7 +310,7 @@
 	btree_complete_write(b, w);
 
 	if (btree_node_dirty(b))
-		schedule_delayed_work(&b->work, 30 * HZ);
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
 
 	closure_return_with_destructor(cl, btree_node_write_unlock);
 }
@@ -400,7 +349,7 @@
 
 	b->bio->bi_end_io = btree_node_write_endio;
 	b->bio->bi_private = cl;
-	b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
+	b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache));
 	b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA;
 	bch_bio_map(b->bio, i);
 
@@ -424,13 +373,14 @@
 		       bset_sector_offset(&b->keys, i));
 
 	if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
-		int j;
 		struct bio_vec *bv;
-		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+		void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+		struct bvec_iter_all iter_all;
 
-		bio_for_each_segment_all(bv, b->bio, j)
-			memcpy(page_address(bv->bv_page),
-			       base + j * PAGE_SIZE, PAGE_SIZE);
+		bio_for_each_segment_all(bv, b->bio, iter_all) {
+			memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
+			addr += PAGE_SIZE;
+		}
 
 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
 
@@ -475,10 +425,10 @@
 
 	do_btree_node_write(b);
 
-	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
+	atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size,
 			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
 
-	b->written += set_blocks(i, block_bytes(b->c));
+	b->written += set_blocks(i, block_bytes(b->c->cache));
 }
 
 void bch_btree_node_write(struct btree *b, struct closure *parent)
@@ -533,10 +483,15 @@
 	BUG_ON(!i->keys);
 
 	if (!btree_node_dirty(b))
-		schedule_delayed_work(&b->work, 30 * HZ);
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
 
 	set_btree_node_dirty(b);
 
+	/*
+	 * w->journal is always the oldest journal pin of all bkeys
+	 * in the leaf node, to make sure the oldest jset seq won't
+	 * be increased before this btree node is flushed.
+	 */
 	if (journal_ref) {
 		if (w->journal &&
 		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
@@ -561,7 +516,7 @@
  * mca -> memory cache
  */
 
-#define mca_reserve(c)	(((c->root && c->root->level)		\
+#define mca_reserve(c)	(((!IS_ERR_OR_NULL(c->root) && c->root->level) \
 			  ? c->root->level : 1) * 8 + 16)
 #define mca_can_free(c)						\
 	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
@@ -607,6 +562,10 @@
 static struct btree *mca_bucket_alloc(struct cache_set *c,
 				      struct bkey *k, gfp_t gfp)
 {
+	/*
+	 * kzalloc() is necessary here for initialization,
+	 * see code comments in bch_btree_keys_init().
+	 */
 	struct btree *b = kzalloc(sizeof(struct btree), gfp);
 
 	if (!b)
@@ -662,7 +621,7 @@
 	 * and BTREE_NODE_journal_flush bit cleared by btree_flush_write().
 	 */
 	if (btree_node_journal_flush(b)) {
-		pr_debug("bnode %p is flushing by journal, retry", b);
+		pr_debug("bnode %p is flushing by journal, retry\n", b);
 		mutex_unlock(&b->write_lock);
 		udelay(1);
 		goto retry;
@@ -719,34 +678,32 @@
 
 	i = 0;
 	btree_cache_used = c->btree_cache_used;
-	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+	list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) {
 		if (nr <= 0)
 			goto out;
 
-		if (++i > 3 &&
-		    !mca_reap(b, 0, false)) {
+		if (!mca_reap(b, 0, false)) {
 			mca_data_free(b);
 			rw_unlock(true, b);
 			freed++;
 		}
 		nr--;
+		i++;
 	}
 
-	for (; (nr--) && i < btree_cache_used; i++) {
-		if (list_empty(&c->btree_cache))
+	list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+		if (nr <= 0 || i >= btree_cache_used)
 			goto out;
 
-		b = list_first_entry(&c->btree_cache, struct btree, list);
-		list_rotate_left(&c->btree_cache);
-
-		if (!b->accessed &&
-		    !mca_reap(b, 0, false)) {
+		if (!mca_reap(b, 0, false)) {
 			mca_bucket_free(b);
 			mca_data_free(b);
 			rw_unlock(true, b);
 			freed++;
-		} else
-			b->accessed = 0;
+		}
+
+		nr--;
+		i++;
 	}
 out:
@@ -783,7 +740,7 @@
 	if (c->verify_data)
 		list_move(&c->verify_data->list, &c->btree_cache);
 
-	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
+	free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb)));
 #endif
 
 	list_splice(&c->btree_cache_freeable,
@@ -830,7 +787,16 @@
 	mutex_init(&c->verify_lock);
 
 	c->verify_ondisk = (void *)
-		__get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(bucket_pages(c)));
+		__get_free_pages(GFP_KERNEL|__GFP_COMP,
+				 ilog2(meta_bucket_pages(&c->cache->sb)));
+	if (!c->verify_ondisk) {
+		/*
+		 * Don't worry about the mca_rereserve buckets
+		 * allocated in previous for-loop, they will be
+		 * handled properly in bch_cache_set_unregister().
+		 */
+		return -ENOMEM;
+	}
 
 	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
 
@@ -847,7 +813,7 @@
 	c->shrink.batch = c->btree_pages * 2;
 
 	if (register_shrinker(&c->shrink))
-		pr_warn("bcache: %s: could not register shrinker",
+		pr_warn("bcache: %s: could not register shrinker\n",
 			__func__);
 
 	return 0;
@@ -1004,7 +970,7 @@
  * bch_btree_node_get - find a btree node in the cache and lock it, reading it
  * in from disk if necessary.
  *
- * If IO is necessary and running under generic_make_request, returns -EAGAIN.
+ * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN.
  *
  * The btree node will have either a read or a write lock held, depending on
  * level and op->lock.
@@ -1054,7 +1020,6 @@
 	BUG_ON(!b->written);
 
 	b->parent = parent;
-	b->accessed = 1;
 
 	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
 		prefetch(b->keys.set[i].tree);
@@ -1100,7 +1065,7 @@
 	 */
 	if (btree_node_journal_flush(b)) {
 		mutex_unlock(&b->write_lock);
-		pr_debug("bnode %p journal_flush set, retry", b);
+		pr_debug("bnode %p journal_flush set, retry\n", b);
 		udelay(1);
 		goto retry;
 	}
@@ -1129,7 +1094,7 @@
 
 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
 		goto err;
 
 	bkey_put(c, &k.key);
@@ -1145,9 +1110,8 @@
 		goto retry;
 	}
 
-	b->accessed = 1;
 	b->parent = parent;
-	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
+	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb));
 
 	mutex_unlock(&c->bucket_lock);
 
@@ -1206,19 +1170,18 @@
 static int btree_check_reserve(struct btree *b, struct btree_op *op)
 {
 	struct cache_set *c = b->c;
-	struct cache *ca;
-	unsigned int i, reserve = (c->root->level - b->level) * 2 + 1;
+	struct cache *ca = c->cache;
+	unsigned int reserve = (c->root->level - b->level) * 2 + 1;
 
 	mutex_lock(&c->bucket_lock);
 
-	for_each_cache(ca, c, i)
-		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
-			if (op)
-				prepare_to_wait(&c->btree_cache_wait, &op->wait,
-						TASK_UNINTERRUPTIBLE);
-			mutex_unlock(&c->bucket_lock);
-			return -EINTR;
-		}
+	if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
+		if (op)
+			prepare_to_wait(&c->btree_cache_wait, &op->wait,
+					TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&c->bucket_lock);
+		return -EINTR;
+	}
 
 	mutex_unlock(&c->bucket_lock);
 
@@ -1384,7 +1347,7 @@
 
 	if (nodes < 2 ||
 	    __set_blocks(b->keys.set[0].data, keys,
-			 block_bytes(b->c)) > blocks * (nodes - 1))
+			 block_bytes(b->c->cache)) > blocks * (nodes - 1))
 		return 0;
 
 	for (i = 0; i < nodes; i++) {
@@ -1418,7 +1381,7 @@
 			     k = bkey_next(k)) {
 				if (__set_blocks(n1, n1->keys + keys +
 						 bkey_u64s(k),
-						 block_bytes(b->c)) > blocks)
+						 block_bytes(b->c->cache)) > blocks)
 					break;
 
 				last = k;
@@ -1434,7 +1397,7 @@
 			 * though)
 			 */
 			if (__set_blocks(n1, n1->keys + n2->keys,
-					 block_bytes(b->c)) >
+					 block_bytes(b->c->cache)) >
 			    btree_blocks(new_nodes[i]))
 				goto out_unlock_nocoalesce;
 
@@ -1443,7 +1406,7 @@
 			last = &r->b->key;
 		}
 
-		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
+		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) >
 		       btree_blocks(new_nodes[i]));
 
 		if (last)
@@ -1517,11 +1480,11 @@
 
 out_nocoalesce:
 	closure_sync(&cl);
-	bch_keylist_free(&keylist);
 
 	while ((k = bch_keylist_pop(&keylist)))
 		if (!bkey_cmp(k, &ZERO_KEY))
 			atomic_dec(&b->c->prio_blocked);
+	bch_keylist_free(&keylist);
 
 	for (i = 0; i < nodes; i++)
 		if (!IS_ERR_OR_NULL(new_nodes[i])) {
@@ -1734,7 +1697,6 @@
 {
 	struct cache *ca;
 	struct bucket *b;
-	unsigned int i;
 
 	if (!c->gc_mark_valid)
 		return;
@@ -1744,14 +1706,14 @@
 	c->gc_mark_valid = 0;
 	c->gc_done = ZERO_KEY;
 
-	for_each_cache(ca, c, i)
-		for_each_bucket(b, ca) {
-			b->last_gc = b->gen;
-			if (!atomic_read(&b->pin)) {
-				SET_GC_MARK(b, 0);
-				SET_GC_SECTORS_USED(b, 0);
-			}
+	ca = c->cache;
+	for_each_bucket(b, ca) {
+		b->last_gc = b->gen;
+		if (!atomic_read(&b->pin)) {
+			SET_GC_MARK(b, 0);
+			SET_GC_SECTORS_USED(b, 0);
 		}
+	}
 
 	mutex_unlock(&c->bucket_lock);
 }
@@ -1760,7 +1722,8 @@
 {
 	struct bucket *b;
 	struct cache *ca;
-	unsigned int i;
+	unsigned int i, j;
+	uint64_t *k;
 
 	mutex_lock(&c->bucket_lock);
 
@@ -1778,7 +1741,6 @@
 		struct bcache_device *d = c->devices[i];
 		struct cached_dev *dc;
 		struct keybuf_key *w, *n;
-		unsigned int j;
 
 		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
 			continue;
@@ -1795,29 +1757,27 @@
 	rcu_read_unlock();
 
 	c->avail_nbuckets = 0;
-	for_each_cache(ca, c, i) {
-		uint64_t *i;
 
-		ca->invalidate_needs_gc = 0;
+	ca = c->cache;
+	ca->invalidate_needs_gc = 0;
 
-		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
-			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+	for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
+		SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
 
-		for (i = ca->prio_buckets;
-		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
-			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+	for (k = ca->prio_buckets;
+	     k < ca->prio_buckets + prio_buckets(ca) * 2; k++)
+		SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA);
 
-		for_each_bucket(b, ca) {
-			c->need_gc = max(c->need_gc, bucket_gc_gen(b));
+	for_each_bucket(b, ca) {
+		c->need_gc = max(c->need_gc, bucket_gc_gen(b));
 
-			if (atomic_read(&b->pin))
-				continue;
+		if (atomic_read(&b->pin))
+			continue;
 
-			BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+		BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
 
-			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
-				c->avail_nbuckets++;
-		}
+		if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
+			c->avail_nbuckets++;
 	}
 
 	mutex_unlock(&c->bucket_lock);
@@ -1841,7 +1801,7 @@
 
 	/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
 	do {
-		ret = btree_root(gc_root, c, &op, &writes, &stats);
+		ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
 		closure_sync(&writes);
 		cond_resched();
 
@@ -1849,7 +1809,7 @@
 			schedule_timeout_interruptible(msecs_to_jiffies
 						       (GC_SLEEP_MS));
 		else if (ret)
-			pr_warn("gc failed!");
+			pr_warn("gc failed!\n");
 	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
 	bch_btree_gc_finish(c);
@@ -1869,12 +1829,10 @@
 
 static bool gc_should_run(struct cache_set *c)
 {
-	struct cache *ca;
-	unsigned int i;
+	struct cache *ca = c->cache;
 
-	for_each_cache(ca, c, i)
-		if (ca->invalidate_needs_gc)
-			return true;
+	if (ca->invalidate_needs_gc)
+		return true;
 
 	if (atomic_read(&c->sectors_to_gc) < 0)
 		return true;
@@ -1939,7 +1897,7 @@
 		}
 
 		if (p)
-			ret = btree(check_recurse, p, b, op);
+			ret = bcache_btree(check_recurse, p, b, op);
 
 		p = k;
 	} while (p && !ret);
@@ -1948,20 +1906,177 @@
 	return ret;
 }
 
+
+static int bch_btree_check_thread(void *arg)
+{
+	int ret;
+	struct btree_check_info *info = arg;
+	struct btree_check_state *check_state = info->state;
+	struct cache_set *c = check_state->c;
+	struct btree_iter iter;
+	struct bkey *k, *p;
+	int cur_idx, prev_idx, skip_nr;
+
+	k = p = NULL;
+	cur_idx = prev_idx = 0;
+	ret = 0;
+
+	/* root node keys are checked before thread created */
+	bch_btree_iter_init(&c->root->keys, &iter, NULL);
+	k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+	BUG_ON(!k);
+
+	p = k;
+	while (k) {
+		/*
+		 * Fetch a root node key index, skip the keys which
+		 * should be fetched by other threads, then check the
+		 * sub-tree indexed by the fetched key.
+		 */
+		spin_lock(&check_state->idx_lock);
+		cur_idx = check_state->key_idx;
+		check_state->key_idx++;
+		spin_unlock(&check_state->idx_lock);
+
+		skip_nr = cur_idx - prev_idx;
+
+		while (skip_nr) {
+			k = bch_btree_iter_next_filter(&iter,
+						       &c->root->keys,
+						       bch_ptr_bad);
+			if (k)
+				p = k;
+			else {
+				/*
+				 * No more keys to check in root node,
+				 * current checking threads are enough,
+				 * stop creating more.
+				 */
+				atomic_set(&check_state->enough, 1);
+				/* Update check_state->enough earlier */
+				smp_mb__after_atomic();
+				goto out;
+			}
+			skip_nr--;
+			cond_resched();
+		}
+
+		if (p) {
+			struct btree_op op;
+
+			btree_node_prefetch(c->root, p);
+			c->gc_stats.nodes++;
+			bch_btree_op_init(&op, 0);
+			ret = bcache_btree(check_recurse, p, c->root, &op);
+			if (ret)
+				goto out;
+		}
+		p = NULL;
+		prev_idx = cur_idx;
+		cond_resched();
+	}
+
+out:
+	info->result = ret;
+	/* update check_state->started among all CPUs */
+	smp_mb__before_atomic();
+	if (atomic_dec_and_test(&check_state->started))
+		wake_up(&check_state->wait);
+
+	return ret;
+}
+
+
+
+static int bch_btree_chkthread_nr(void)
+{
+	int n = num_online_cpus()/2;
+
+	if (n == 0)
+		n = 1;
+	else if (n > BCH_BTR_CHKTHREAD_MAX)
+		n = BCH_BTR_CHKTHREAD_MAX;
+
+	return n;
+}
+
 int bch_btree_check(struct cache_set *c)
 {
-	struct btree_op op;
+	int ret = 0;
+	int i;
+	struct bkey *k = NULL;
+	struct btree_iter iter;
+	struct btree_check_state check_state;
 
-	bch_btree_op_init(&op, SHRT_MAX);
+	/* check and mark root node keys */
+	for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
+		bch_initial_mark_key(c, c->root->level, k);
 
-	return btree_root(check_recurse, c, &op);
+	bch_initial_mark_key(c, c->root->level + 1, &c->root->key);
+
+	if (c->root->level == 0)
+		return 0;
+
+	memset(&check_state, 0, sizeof(struct btree_check_state));
+	check_state.c = c;
+	check_state.total_threads = bch_btree_chkthread_nr();
+	check_state.key_idx = 0;
+	spin_lock_init(&check_state.idx_lock);
+	atomic_set(&check_state.started, 0);
+	atomic_set(&check_state.enough, 0);
+	init_waitqueue_head(&check_state.wait);
+
+	rw_lock(0, c->root, c->root->level);
+	/*
+	 * Run multiple threads to check btree nodes in parallel,
+	 * if check_state.enough is non-zero, it means current
+	 * running check threads are enough, unncessary to create
+	 * more.
+	 */
+	for (i = 0; i < check_state.total_threads; i++) {
+		/* fetch latest check_state.enough earlier */
+		smp_mb__before_atomic();
+		if (atomic_read(&check_state.enough))
+			break;
+
+		check_state.infos[i].result = 0;
+		check_state.infos[i].state = &check_state;
+
+		check_state.infos[i].thread =
+			kthread_run(bch_btree_check_thread,
+				    &check_state.infos[i],
+				    "bch_btrchk[%d]", i);
+		if (IS_ERR(check_state.infos[i].thread)) {
+			pr_err("fails to run thread bch_btrchk[%d]\n", i);
+			for (--i; i >= 0; i--)
+				kthread_stop(check_state.infos[i].thread);
+			ret = -ENOMEM;
+			goto out;
+		}
+		atomic_inc(&check_state.started);
+	}
+
+	/*
+	 * Must wait for all threads to stop.
+	 */
+	wait_event(check_state.wait, atomic_read(&check_state.started) == 0);
+
+	for (i = 0; i < check_state.total_threads; i++) {
+		if (check_state.infos[i].result) {
+			ret = check_state.infos[i].result;
+			goto out;
+		}
+	}
+
+out:
+	rw_unlock(0, c->root);
+	return ret;
 }
 
 void bch_initial_gc_finish(struct cache_set *c)
 {
-	struct cache *ca;
+	struct cache *ca = c->cache;
 	struct bucket *b;
-	unsigned int i;
 
 	bch_btree_gc_finish(c);
@@ -1976,20 +2091,18 @@
 	 * This is only safe for buckets that have no live data in them, which
 	 * there should always be some of.
 	 */
-	for_each_cache(ca, c, i) {
-		for_each_bucket(b, ca) {
-			if (fifo_full(&ca->free[RESERVE_PRIO]) &&
-			    fifo_full(&ca->free[RESERVE_BTREE]))
-				break;
+	for_each_bucket(b, ca) {
+		if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+		    fifo_full(&ca->free[RESERVE_BTREE]))
+			break;
 
-			if (bch_can_invalidate_bucket(ca, b) &&
-			    !GC_MARK(b)) {
-				__bch_invalidate_one_bucket(ca, b);
-				if (!fifo_push(&ca->free[RESERVE_PRIO],
-				   b - ca->buckets))
-					fifo_push(&ca->free[RESERVE_BTREE],
-						  b - ca->buckets);
-			}
+		if (bch_can_invalidate_bucket(ca, b) &&
+		    !GC_MARK(b)) {
+			__bch_invalidate_one_bucket(ca, b);
+			if (!fifo_push(&ca->free[RESERVE_PRIO],
+			   b - ca->buckets))
+				fifo_push(&ca->free[RESERVE_BTREE],
+					  b - ca->buckets);
 		}
 	}
@@ -2097,7 +2210,7 @@
 		goto err;
 
 	split = set_blocks(btree_bset_first(n1),
-			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
+			   block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5;
 
 	if (split) {
 		unsigned int keys = 0;
@@ -2344,7 +2457,7 @@
 	if (ret) {
 		struct bkey *k;
 
-		pr_err("error %i", ret);
+		pr_err("error %i\n", ret);
 
 		while ((k = bch_keylist_pop(keys)))
 			bkey_put(c, k);
@@ -2394,7 +2507,7 @@
 
 		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
 						       bch_ptr_bad))) {
-			ret = btree(map_nodes_recurse, k, b,
+			ret = bcache_btree(map_nodes_recurse, k, b,
 				    op, from, fn, flags);
 			from = NULL;
 
@@ -2412,10 +2525,10 @@
 int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
 			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
 {
-	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
+	return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags);
 }
 
-static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
+int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
 				      struct bkey *from, btree_map_keys_fn *fn,
 				      int flags)
 {
@@ -2428,7 +2541,8 @@
 	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
 		ret = !b->level
 			? fn(op, b, k)
-			: btree(map_keys_recurse, k, b, op, from, fn, flags);
+			: bcache_btree(map_keys_recurse, k,
+				       b, op, from, fn, flags);
 		from = NULL;
 
 		if (ret != MAP_CONTINUE)
@@ -2445,7 +2559,7 @@
 int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
 		       struct bkey *from, btree_map_keys_fn *fn, int flags)
 {
-	return btree_root(map_keys_recurse, c, op, from, fn, flags);
+	return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags);
 }
 
 /* Keybuf code */
@@ -2631,7 +2745,7 @@
 			break;
 
 		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
-			pr_debug("scan finished");
+			pr_debug("scan finished\n");
 			break;
 		}
 
@@ -2649,3 +2763,18 @@
 	spin_lock_init(&buf->lock);
 	array_allocator_init(&buf->freelist);
 }
+
+void bch_btree_exit(void)
+{
+	if (btree_io_wq)
+		destroy_workqueue(btree_io_wq);
+}
+
+int __init bch_btree_init(void)
+{
+	btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
+	if (!btree_io_wq)
+		return -ENOMEM;
+
+	return 0;
+}
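
Note on the btree() and btree_root() macros removed in the first hunk: they are not dropped from the driver. Every later call site in this diff switches to bcache_btree() and bcache_btree_root(), so the recursion helpers have presumably been moved into drivers/md/bcache/btree.h under the new names (matching the equivalent mainline bcache change) so that the new multi-threaded check code can share them. A sketch of what the relocated definitions would look like, assuming they keep the bodies removed above:

/* btree.h (sketch; assumes the macro bodies match the ones removed above) */
#define bcache_btree(fn, key, b, op, ...)				\
({									\
	int _r, l = (b)->level - 1;					\
	bool _w = l <= (op)->lock;					\
	struct btree *_child = bch_btree_node_get((b)->c, op, key, l,	\
						  _w, b);		\
	if (!IS_ERR(_child)) {						\
		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
		rw_unlock(_w, _child);					\
	} else								\
		_r = PTR_ERR(_child);					\
	_r;								\
})

#define bcache_btree_root(fn, c, op, ...)				\
({									\
	int _r = -EINTR;						\
	do {								\
		struct btree *_b = (c)->root;				\
		bool _w = insert_lock(op, _b);				\
		rw_lock(_w, _b, _b->level);				\
		if (_b == (c)->root &&					\
		    _w == insert_lock(op, _b)) {			\
			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
		}							\
		rw_unlock(_w, _b);					\
		bch_cannibalize_unlock(c);				\
		if (_r == -EINTR)					\
			schedule();					\
	} while (_r == -EINTR);						\
									\
	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
	_r;								\
})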