2024-05-16 8d2a02b24d66aa359e83eebc1ed3c0f85367a1cb
kernel/fs/btrfs/raid56.c
@@ -35,6 +35,22 @@
 
 #define RBIO_CACHE_SIZE 1024
 
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
+/* Used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	spinlock_t lock;
+};
+
+/* Used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct list_head stripe_cache;
+	spinlock_t cache_lock;
+	int cache_size;
+	struct btrfs_stripe_hash table[];
+};
+
 enum btrfs_rbio_ops {
 	BTRFS_RBIO_WRITE,
 	BTRFS_RBIO_READ_REBUILD,
@@ -174,7 +190,7 @@
 
 static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
 {
-	btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL);
+	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
 	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
 }
 
@@ -190,7 +206,6 @@
 	struct btrfs_stripe_hash *h;
 	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 	int i;
-	int table_size;
 
 	if (info->stripe_hash_table)
 		return 0;
@@ -202,8 +217,7 @@
 	 * Try harder to allocate and fallback to vmalloc to lower the chance
 	 * of a failing mount.
 	 */
-	table_size = sizeof(*table) + sizeof(*h) * num_entries;
-	table = kvzalloc(table_size, GFP_KERNEL);
+	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
 	if (!table)
 		return -ENOMEM;
 
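A minimal userspace sketch of the allocation change above (not kernel code): for a struct that ends in a flexible array member, struct_size(table, table, num_entries) yields the same size as the removed open-coded sizeof arithmetic, with overflow checking added by the kernel's <linux/overflow.h>. The struct_size_sketch macro below is a simplified stand-in without the saturation semantics, and the struct layouts are illustrative only.

#include <assert.h>
#include <stddef.h>
#include <stdlib.h>

#define BTRFS_STRIPE_HASH_TABLE_BITS 11

struct stripe_hash {
	int lock;			/* stands in for spinlock_t */
};

struct stripe_hash_table {
	int cache_size;
	struct stripe_hash table[];	/* flexible array member */
};

/* Simplified stand-in for the kernel's struct_size(); no overflow saturation. */
#define struct_size_sketch(p, member, n) \
	(sizeof(*(p)) + sizeof((p)->member[0]) * (size_t)(n))

int main(void)
{
	size_t num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	struct stripe_hash_table *table = NULL;

	/* Same size as the removed "sizeof(*table) + sizeof(*h) * num_entries". */
	assert(struct_size_sketch(table, table, num_entries) ==
	       sizeof(*table) + sizeof(struct stripe_hash) * num_entries);

	/* Zeroed allocation, mirroring kvzalloc() in the patch. */
	table = calloc(1, struct_size_sketch(table, table, num_entries));
	if (!table)
		return 1;
	free(table);
	return 0;
}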
@@ -318,6 +332,9 @@
 {
 	bio_list_merge(&dest->bio_list, &victim->bio_list);
 	dest->bio_list_bytes += victim->bio_list_bytes;
+	/* Also inherit the bitmaps from @victim. */
+	bitmap_or(dest->dbitmap, victim->dbitmap, dest->dbitmap,
+		  dest->stripe_npages);
 	dest->generic_bio_cnt += victim->generic_bio_cnt;
 	bio_list_init(&victim->bio_list);
 }
@@ -655,8 +672,7 @@
 	 */
 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 {
-	int bucket = rbio_bucket(rbio);
-	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+	struct btrfs_stripe_hash *h;
 	struct btrfs_raid_bio *cur;
 	struct btrfs_raid_bio *pending;
 	unsigned long flags;
@@ -664,64 +680,63 @@
 	struct btrfs_raid_bio *cache_drop = NULL;
 	int ret = 0;
 
+	h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+
 	spin_lock_irqsave(&h->lock, flags);
 	list_for_each_entry(cur, &h->hash_list, hash_list) {
-		if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
-			spin_lock(&cur->bio_list_lock);
+		if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+			continue;
 
-			/* can we steal this cached rbio's pages? */
-			if (bio_list_empty(&cur->bio_list) &&
-			    list_empty(&cur->plug_list) &&
-			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
-			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
-				list_del_init(&cur->hash_list);
-				refcount_dec(&cur->refs);
+		spin_lock(&cur->bio_list_lock);
 
-				steal_rbio(cur, rbio);
-				cache_drop = cur;
-				spin_unlock(&cur->bio_list_lock);
+		/* Can we steal this cached rbio's pages? */
+		if (bio_list_empty(&cur->bio_list) &&
+		    list_empty(&cur->plug_list) &&
+		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+			list_del_init(&cur->hash_list);
+			refcount_dec(&cur->refs);
 
-				goto lockit;
-			}
+			steal_rbio(cur, rbio);
+			cache_drop = cur;
+			spin_unlock(&cur->bio_list_lock);
 
-			/* can we merge into the lock owner? */
-			if (rbio_can_merge(cur, rbio)) {
-				merge_rbio(cur, rbio);
+			goto lockit;
+		}
+
+		/* Can we merge into the lock owner? */
+		if (rbio_can_merge(cur, rbio)) {
+			merge_rbio(cur, rbio);
+			spin_unlock(&cur->bio_list_lock);
+			freeit = rbio;
+			ret = 1;
+			goto out;
+		}
+
+
+		/*
+		 * We couldn't merge with the running rbio, see if we can merge
+		 * with the pending ones. We don't have to check for rmw_locked
+		 * because there is no way they are inside finish_rmw right now
+		 */
+		list_for_each_entry(pending, &cur->plug_list, plug_list) {
+			if (rbio_can_merge(pending, rbio)) {
+				merge_rbio(pending, rbio);
 				spin_unlock(&cur->bio_list_lock);
 				freeit = rbio;
 				ret = 1;
 				goto out;
 			}
-
-
-			/*
-			 * we couldn't merge with the running
-			 * rbio, see if we can merge with the
-			 * pending ones. We don't have to
-			 * check for rmw_locked because there
-			 * is no way they are inside finish_rmw
-			 * right now
-			 */
-			list_for_each_entry(pending, &cur->plug_list,
-					    plug_list) {
-				if (rbio_can_merge(pending, rbio)) {
-					merge_rbio(pending, rbio);
-					spin_unlock(&cur->bio_list_lock);
-					freeit = rbio;
-					ret = 1;
-					goto out;
-				}
-			}
-
-			/* no merging, put us on the tail of the plug list,
-			 * our rbio will be started with the currently
-			 * running rbio unlocks
-			 */
-			list_add_tail(&rbio->plug_list, &cur->plug_list);
-			spin_unlock(&cur->bio_list_lock);
-			ret = 1;
-			goto out;
 		}
+
+		/*
+		 * No merging, put us on the tail of the plug list, our rbio
+		 * will be started with the currently running rbio unlocks
+		 */
+		list_add_tail(&rbio->plug_list, &cur->plug_list);
+		spin_unlock(&cur->bio_list_lock);
+		ret = 1;
+		goto out;
 	}
 lockit:
 	refcount_inc(&rbio->refs);
@@ -862,6 +877,12 @@
 
 	if (rbio->generic_bio_cnt)
 		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+	/*
+	 * Clear the data bitmap, as the rbio may be cached for later usage.
+	 * Do this before unlock_stripe() so there will be no new bio
+	 * for this bio.
+	 */
+	bitmap_clear(rbio->dbitmap, 0, rbio->stripe_npages);
 
 	/*
 	 * At this moment, rbio->bio_list is empty, however since rbio does not
@@ -1071,7 +1092,6 @@
 			    unsigned long bio_max_len)
 {
 	struct bio *last = bio_list->tail;
-	u64 last_end = 0;
 	int ret;
 	struct bio *bio;
 	struct btrfs_bio_stripe *stripe;
@@ -1086,15 +1106,14 @@
 
 	/* see if we can add this page onto our existing bio */
 	if (last) {
-		last_end = (u64)last->bi_iter.bi_sector << 9;
+		u64 last_end = (u64)last->bi_iter.bi_sector << 9;
 		last_end += last->bi_iter.bi_size;
 
 		/*
 		 * we can't merge these if they are from different
 		 * devices or if they are not contiguous
 		 */
-		if (last_end == disk_start && stripe->dev->bdev &&
-		    !last->bi_status &&
+		if (last_end == disk_start && !last->bi_status &&
 		    last->bi_disk == stripe->dev->bdev->bd_disk &&
 		    last->bi_partno == stripe->dev->bdev->bd_partno) {
 			ret = bio_add_page(last, page, PAGE_SIZE, 0);
@@ -1105,6 +1124,7 @@
 
 	/* put a new bio on the list */
 	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
+	btrfs_io_bio(bio)->device = stripe->dev;
 	bio->bi_iter.bi_size = 0;
 	bio_set_dev(bio, stripe->dev->bdev);
 	bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1196,6 +1216,9 @@
 	else
 		BUG();
 
+	/* We should have at least one data sector. */
+	ASSERT(bitmap_weight(rbio->dbitmap, rbio->stripe_npages));
+
 	/* at this point we either have a full stripe,
 	 * or we've read the full stripe from the drive.
 	 * recalculate the parity and write the new results.
@@ -1269,6 +1292,11 @@
 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
 			struct page *page;
+
+			/* This vertical stripe has no data, skip it. */
+			if (!test_bit(pagenr, rbio->dbitmap))
+				continue;
+
 			if (stripe < rbio->nr_data) {
 				page = page_in_rbio(rbio, stripe, pagenr, 1);
 				if (!page)
@@ -1293,6 +1321,11 @@
 
 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
 			struct page *page;
+
+			/* This vertical stripe has no data, skip it. */
+			if (!test_bit(pagenr, rbio->dbitmap))
+				continue;
+
 			if (stripe < rbio->nr_data) {
 				page = page_in_rbio(rbio, stripe, pagenr, 1);
 				if (!page)
@@ -1313,11 +1346,7 @@
 	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
 	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
 
-	while (1) {
-		bio = bio_list_pop(&bio_list);
-		if (!bio)
-			break;
-
+	while ((bio = bio_list_pop(&bio_list))) {
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid_write_end_io;
 		bio->bi_opf = REQ_OP_WRITE;
@@ -1342,7 +1371,6 @@
 			   struct bio *bio)
 {
 	u64 physical = bio->bi_iter.bi_sector;
-	u64 stripe_start;
 	int i;
 	struct btrfs_bio_stripe *stripe;
 
@@ -1350,9 +1378,7 @@
 
 	for (i = 0; i < rbio->bbio->num_stripes; i++) {
 		stripe = &rbio->bbio->stripes[i];
-		stripe_start = stripe->physical;
-		if (physical >= stripe_start &&
-		    physical < stripe_start + rbio->stripe_len &&
+		if (in_range(physical, stripe->physical, rbio->stripe_len) &&
 		    stripe->dev->bdev &&
 		    bio->bi_disk == stripe->dev->bdev->bd_disk &&
 		    bio->bi_partno == stripe->dev->bdev->bd_partno) {
@@ -1370,18 +1396,14 @@
 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
 				   struct bio *bio)
 {
-	u64 logical = bio->bi_iter.bi_sector;
-	u64 stripe_start;
+	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
 	int i;
 
-	logical <<= 9;
-
 	for (i = 0; i < rbio->nr_data; i++) {
-		stripe_start = rbio->bbio->raid_map[i];
-		if (logical >= stripe_start &&
-		    logical < stripe_start + rbio->stripe_len) {
+		u64 stripe_start = rbio->bbio->raid_map[i];
+
+		if (in_range(logical, stripe_start, rbio->stripe_len))
 			return i;
-		}
 	}
 	return -1;
 }
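Both find_bio_stripe() and find_logical_bio_stripe() now express the open-coded pair of comparisons through in_range(). Below is a small userspace sketch of the half-open interval semantics; the macro is an illustrative stand-in that mirrors what the helper checks, not the kernel definition itself, and the stripe geometry values are made up.

#include <assert.h>
#include <stdint.h>

/* Illustrative stand-in: "val >= start && val < start + len". */
#define in_range(val, start, len) ((val) >= (start) && (val) < (start) + (len))

int main(void)
{
	uint64_t stripe_start = 1ULL << 20;	/* example raid_map[i] */
	uint64_t stripe_len = 64ULL * 1024;	/* example stripe_len */

	assert(in_range(stripe_start, stripe_start, stripe_len));
	assert(in_range(stripe_start + stripe_len - 1, stripe_start, stripe_len));
	assert(!in_range(stripe_start + stripe_len, stripe_start, stripe_len));
	assert(!in_range(stripe_start - 1, stripe_start, stripe_len));
	return 0;
}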
@@ -1439,11 +1461,11 @@
 static void set_bio_pages_uptodate(struct bio *bio)
 {
 	struct bio_vec *bvec;
-	int i;
+	struct bvec_iter_all iter_all;
 
 	ASSERT(!bio_flagged(bio, BIO_CLONED));
 
-	bio_for_each_segment_all(bvec, bio, i)
+	bio_for_each_segment_all(bvec, bio, iter_all)
 		SetPageUptodate(bvec->bv_page);
 }
 
@@ -1555,11 +1577,7 @@
 	 * not to touch it after that
 	 */
 	atomic_set(&rbio->stripes_pending, bios_to_read);
-	while (1) {
-		bio = bio_list_pop(&bio_list);
-		if (!bio)
-			break;
-
+	while ((bio = bio_list_pop(&bio_list))) {
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid_rmw_end_io;
 		bio->bi_opf = REQ_OP_READ;
@@ -1724,13 +1742,39 @@
 	plug = container_of(cb, struct btrfs_plug_cb, cb);
 
 	if (from_schedule) {
-		btrfs_init_work(&plug->work, btrfs_rmw_helper,
-				unplug_work, NULL, NULL);
+		btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
 		btrfs_queue_work(plug->info->rmw_workers,
 				 &plug->work);
 		return;
 	}
 	run_plug(plug);
+}
+
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+	const struct btrfs_fs_info *fs_info = rbio->fs_info;
+	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+	const u64 full_stripe_start = rbio->bbio->raid_map[0];
+	const u32 orig_len = orig_bio->bi_iter.bi_size;
+	const u32 sectorsize = fs_info->sectorsize;
+	u64 cur_logical;
+
+	ASSERT(orig_logical >= full_stripe_start &&
+	       orig_logical + orig_len <= full_stripe_start +
+	       rbio->nr_data * rbio->stripe_len);
+
+	bio_list_add(&rbio->bio_list, orig_bio);
+	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+	/* Update the dbitmap. */
+	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+	     cur_logical += sectorsize) {
+		int bit = ((u32)(cur_logical - full_stripe_start) >>
+			   PAGE_SHIFT) % rbio->stripe_npages;
+
+		set_bit(bit, rbio->dbitmap);
+	}
 }
 
 /*
@@ -1749,9 +1793,8 @@
 		btrfs_put_bbio(bbio);
 		return PTR_ERR(rbio);
 	}
-	bio_list_add(&rbio->bio_list, bio);
-	rbio->bio_list_bytes = bio->bi_iter.bi_size;
 	rbio->operation = BTRFS_RBIO_WRITE;
+	rbio_add_bio(rbio, bio);
 
 	btrfs_bio_counter_inc_noblocked(fs_info);
 	rbio->generic_bio_cnt = 1;
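The write setup above (and the read-rebuild setup further down) now funnels through the new rbio_add_bio(), which besides queuing the bio records which page-sized slots of the full stripe actually carry data. A userspace sketch of that dbitmap index arithmetic follows, with made-up stripe geometry and addresses purely for illustration.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the dbitmap update in rbio_add_bio(): each sector of the
 * original bio maps to a page-sized slot inside the full stripe, modulo
 * the number of pages per stripe.  All values below are illustrative.
 */
#define PAGE_SHIFT	12
#define SECTORSIZE	4096u

int main(void)
{
	uint64_t full_stripe_start = 1ULL << 30;	/* example raid_map[0] */
	uint32_t stripe_npages = 16;			/* e.g. 64K stripe / 4K pages */
	uint64_t orig_logical = full_stripe_start + 5 * SECTORSIZE;
	uint32_t orig_len = 3 * SECTORSIZE;
	uint64_t cur;

	for (cur = orig_logical; cur < orig_logical + orig_len; cur += SECTORSIZE) {
		int bit = (int)(((uint32_t)(cur - full_stripe_start) >> PAGE_SHIFT) %
				stripe_npages);

		printf("logical %llu -> dbitmap bit %d\n",
		       (unsigned long long)cur, bit);
	}
	return 0;
}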
@@ -1867,11 +1910,8 @@
 	}
 
 	/* make sure our ps and qs are in order */
-	if (faila > failb) {
-		int tmp = failb;
-		failb = faila;
-		faila = tmp;
-	}
+	if (faila > failb)
+		swap(faila, failb);
 
 	/* if the q stripe is failed, do a pstripe reconstruction
 	 * from the xors.
@@ -1977,7 +2017,7 @@
 	 * - In case of single failure, where rbio->failb == -1:
 	 *
 	 *   Cache this rbio iff the above read reconstruction is
-	 *   excuted without problems.
+	 *   executed without problems.
 	 */
 	if (err == BLK_STS_OK && rbio->failb < 0)
 		cache_rbio_pages(rbio);
@@ -2053,9 +2093,12 @@
 	atomic_set(&rbio->error, 0);
 
 	/*
-	 * read everything that hasn't failed. Thanks to the
-	 * stripe cache, it is possible that some or all of these
-	 * pages are going to be uptodate.
+	 * Read everything that hasn't failed. However this time we will
+	 * not trust any cached sector.
+	 * The cache may hold stale data for the parts that the higher
+	 * layer is not reading.
+	 *
+	 * So here we always re-read everything in the recovery path.
 	 */
 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
 		if (rbio->faila == stripe || rbio->failb == stripe) {
@@ -2064,16 +2107,6 @@
 		}
 
 		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
-			struct page *p;
-
-			/*
-			 * the rmw code may have already read this
-			 * page in
-			 */
-			p = rbio_stripe_page(rbio, stripe, pagenr);
-			if (PageUptodate(p))
-				continue;
-
 			ret = rbio_add_io_page(rbio, &bio_list,
 					       rbio_stripe_page(rbio, stripe, pagenr),
 					       stripe, pagenr, rbio->stripe_len);
@@ -2091,7 +2124,7 @@
 	 */
 	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
 		__raid_recover_end_io(rbio);
-		goto out;
+		return 0;
 	} else {
 		goto cleanup;
 	}
@@ -2102,11 +2135,7 @@
 	 * not to touch it after that
 	 */
 	atomic_set(&rbio->stripes_pending, bios_to_read);
-	while (1) {
-		bio = bio_list_pop(&bio_list);
-		if (!bio)
-			break;
-
+	while ((bio = bio_list_pop(&bio_list))) {
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid_recover_end_io;
 		bio->bi_opf = REQ_OP_READ;
@@ -2115,7 +2144,7 @@
 
 		submit_bio(bio);
 	}
-out:
+
 	return 0;
 
 cleanup:
@@ -2155,8 +2184,7 @@
 	}
 
 	rbio->operation = BTRFS_RBIO_READ_REBUILD;
-	bio_list_add(&rbio->bio_list, bio);
-	rbio->bio_list_bytes = bio->bi_iter.bi_size;
+	rbio_add_bio(rbio, bio);
 
 	rbio->faila = find_logical_bio_stripe(rbio, bio);
 	if (rbio->faila == -1) {
@@ -2470,11 +2498,7 @@
 
 	atomic_set(&rbio->stripes_pending, nr_data);
 
-	while (1) {
-		bio = bio_list_pop(&bio_list);
-		if (!bio)
-			break;
-
+	while ((bio = bio_list_pop(&bio_list))) {
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid_write_end_io;
 		bio->bi_opf = REQ_OP_WRITE;
@@ -2652,11 +2676,7 @@
 	 * not to touch it after that
 	 */
 	atomic_set(&rbio->stripes_pending, bios_to_read);
-	while (1) {
-		bio = bio_list_pop(&bio_list);
-		if (!bio)
-			break;
-
+	while ((bio = bio_list_pop(&bio_list))) {
 		bio->bi_private = rbio;
 		bio->bi_end_io = raid56_parity_scrub_end_io;
 		bio->bi_opf = REQ_OP_READ;