2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/fs/btrfs/disk-io.c
@@ -7,7 +7,6 @@
 #include <linux/blkdev.h>
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
@@ -19,6 +18,7 @@
 #include <linux/crc32c.h>
 #include <linux/sched/mm.h>
 #include <asm/unaligned.h>
+#include <crypto/hash.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -39,10 +39,9 @@
 #include "compression.h"
 #include "tree-checker.h"
 #include "ref-verify.h"
-
-#ifdef CONFIG_X86
-#include <asm/cpufeature.h>
-#endif
+#include "block-group.h"
+#include "discard.h"
+#include "space-info.h"

 #define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
				 BTRFS_HEADER_FLAG_RELOC |\
@@ -51,7 +50,6 @@
				 BTRFS_SUPER_FLAG_METADUMP |\
				 BTRFS_SUPER_FLAG_METADUMP_V2)

-static const struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -99,6 +97,12 @@
	kmem_cache_destroy(btrfs_end_io_wq_cache);
 }

+static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
+{
+	if (fs_info->csum_shash)
+		crypto_free_shash(fs_info->csum_shash);
+}
+
 /*
  * async submit bios are used to offload expensive checksumming
  * onto the worker threads. They checksum file and metadata bios
@@ -126,8 +130,8 @@
  * Different roots are used for different purposes and may nest inside each
  * other and they require separate keysets. As lockdep keys should be
  * static, assign keysets according to the purpose of the root as indicated
- * by btrfs_root->objectid. This ensures that all special purpose roots
- * have separate keysets.
+ * by btrfs_root->root_key.objectid. This ensures that all special purpose
+ * roots have separate keysets.
  *
  * Lock-nesting across peer nodes is always done with the immediate parent
  * node locked thus preventing deadlock. As lockdep doesn't know this, use
@@ -200,118 +204,28 @@
 #endif

 /*
- * extents on the btree inode are pretty simple, there's one extent
- * that covers the entire device
+ * Compute the csum of a btree block and store the result to provided buffer.
 */
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
-		struct page *page, size_t pg_offset, u64 start, u64 len,
-		int create)
+static void csum_tree_block(struct extent_buffer *buf, u8 *result)
 {
-	struct btrfs_fs_info *fs_info = inode->root->fs_info;
-	struct extent_map_tree *em_tree = &inode->extent_tree;
-	struct extent_map *em;
-	int ret;
-
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, start, len);
-	if (em) {
-		em->bdev = fs_info->fs_devices->latest_bdev;
-		read_unlock(&em_tree->lock);
-		goto out;
-	}
-	read_unlock(&em_tree->lock);
-
-	em = alloc_extent_map();
-	if (!em) {
-		em = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-	em->start = 0;
-	em->len = (u64)-1;
-	em->block_len = (u64)-1;
-	em->block_start = 0;
-	em->bdev = fs_info->fs_devices->latest_bdev;
-
-	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
-	if (ret == -EEXIST) {
-		free_extent_map(em);
-		em = lookup_extent_mapping(em_tree, start, len);
-		if (!em)
-			em = ERR_PTR(-EIO);
-	} else if (ret) {
-		free_extent_map(em);
-		em = ERR_PTR(ret);
-	}
-	write_unlock(&em_tree->lock);
-
-out:
-	return em;
-}
-
-u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
-{
-	return crc32c(seed, data, len);
-}
-
-void btrfs_csum_final(u32 crc, u8 *result)
-{
-	put_unaligned_le32(~crc, result);
-}
-
-/*
- * compute the csum for a btree block, and either verify it or write it
- * into the csum field of the block.
- */
-static int csum_tree_block(struct btrfs_fs_info *fs_info,
-			   struct extent_buffer *buf,
-			   int verify)
-{
-	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
-	char result[BTRFS_CSUM_SIZE];
-	unsigned long len;
-	unsigned long cur_len;
-	unsigned long offset = BTRFS_CSUM_SIZE;
+	struct btrfs_fs_info *fs_info = buf->fs_info;
+	const int num_pages = fs_info->nodesize >> PAGE_SHIFT;
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	char *kaddr;
-	unsigned long map_start;
-	unsigned long map_len;
-	int err;
-	u32 crc = ~(u32)0;
+	int i;

-	len = buf->len - offset;
-	while (len > 0) {
-		err = map_private_extent_buffer(buf, offset, 32,
-					&kaddr, &map_start, &map_len);
-		if (err)
-			return err;
-		cur_len = min(len, map_len - (offset - map_start));
-		crc = btrfs_csum_data(kaddr + offset - map_start,
-				      crc, cur_len);
-		len -= cur_len;
-		offset += cur_len;
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+	kaddr = page_address(buf->pages[0]);
+	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
+			    PAGE_SIZE - BTRFS_CSUM_SIZE);
+
+	for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
+		kaddr = page_address(buf->pages[i]);
+		crypto_shash_update(shash, kaddr, PAGE_SIZE);
	}
	memset(result, 0, BTRFS_CSUM_SIZE);
-
-	btrfs_csum_final(crc, result);
-
-	if (verify) {
-		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
-			u32 val;
-			u32 found = 0;
-			memcpy(&found, result, csum_size);
-
-			read_extent_buffer(buf, &val, 0, csum_size);
-			btrfs_warn_rl(fs_info,
-				"%s checksum verify failed on %llu wanted %X found %X level %d",
-				fs_info->sb->s_id, buf->start,
-				val, found, btrfs_header_level(buf));
-			return -EUCLEAN;
-		}
-	} else {
-		write_extent_buffer(buf, result, 0, csum_size);
-	}
-
-	return 0;
+	crypto_shash_final(shash, result);
 }

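Editor's aside (not part of the patch): the rewritten csum_tree_block() above drives the kernel crypto shash API instead of the removed btrfs_csum_data()/btrfs_csum_final() crc32c helpers. A minimal self-contained sketch of that API, assuming a "crc32c" transform and a flat buffer rather than btrfs's paged extent buffers:

	#include <crypto/hash.h>

	static int example_shash_digest(const u8 *data, unsigned int len,
					u8 *result)
	{
		/* Allocate a synchronous hash transform by algorithm name. */
		struct crypto_shash *tfm = crypto_alloc_shash("crc32c", 0, 0);
		int ret;

		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		{
			/* Stack-allocated descriptor, as the patch uses. */
			SHASH_DESC_ON_STACK(shash, tfm);

			shash->tfm = tfm;
			/* One-shot init+update+final over a contiguous buffer. */
			ret = crypto_shash_digest(shash, data, len, result);
		}

		crypto_free_shash(tfm);
		return ret;
	}

The patch's per-page crypto_shash_init()/update()/final() sequence is the multi-step form of the same operation.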
 /*
@@ -336,7 +250,7 @@

	if (need_lock) {
		btrfs_tree_read_lock(eb);
-		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		btrfs_set_lock_blocking_read(eb);
	}

	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
@@ -370,6 +284,19 @@
	return ret;
 }

+static bool btrfs_supported_super_csum(u16 csum_type)
+{
+	switch (csum_type) {
+	case BTRFS_CSUM_TYPE_CRC32:
+	case BTRFS_CSUM_TYPE_XXHASH:
+	case BTRFS_CSUM_TYPE_SHA256:
+	case BTRFS_CSUM_TYPE_BLAKE2:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /*
  * Return 0 if the superblock checksum type matches the checksum value of that
  * algorithm. Pass the raw disk superblock data.
@@ -379,51 +306,40 @@
 {
	struct btrfs_super_block *disk_sb =
		(struct btrfs_super_block *)raw_disk_sb;
-	u16 csum_type = btrfs_super_csum_type(disk_sb);
-	int ret = 0;
+	char result[BTRFS_CSUM_SIZE];
+	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

-	if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
-		u32 crc = ~(u32)0;
-		char result[sizeof(crc)];
+	shash->tfm = fs_info->csum_shash;

-		/*
-		 * The super_block structure does not span the whole
-		 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
-		 * is filled with zeros and is included in the checksum.
-		 */
-		crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
-				crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
-		btrfs_csum_final(crc, result);
+	/*
+	 * The super_block structure does not span the whole
+	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
+	 * filled with zeros and is included in the checksum.
+	 */
+	crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);

-		if (memcmp(raw_disk_sb, result, sizeof(result)))
-			ret = 1;
-	}
+	if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
+		return 1;

-	if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
-		btrfs_err(fs_info, "unsupported checksum algorithm %u",
-			  csum_type);
-		ret = 1;
-	}
-
-	return ret;
+	return 0;
 }

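Editor's aside (not part of the patch): btrfs_supported_super_csum() whitelists the four on-disk checksum types, and btrfs_check_super_csum() now trusts fs_info->csum_shash to already match the superblock's type. A sketch of how such a transform could be allocated per type; the name table mirrors the driver names btrfs is known to use ("crc32c", "xxhash64", "sha256", "blake2b-256"), but treat both the table and the example_ helper as illustrative assumptions:

	static const char * const example_csum_names[] = {
		[BTRFS_CSUM_TYPE_CRC32]		= "crc32c",
		[BTRFS_CSUM_TYPE_XXHASH]	= "xxhash64",
		[BTRFS_CSUM_TYPE_SHA256]	= "sha256",
		[BTRFS_CSUM_TYPE_BLAKE2]	= "blake2b-256",
	};

	/* Hypothetical helper: allocate the per-filesystem checksum transform. */
	static int example_init_csum_hash(struct btrfs_fs_info *fs_info,
					  u16 csum_type)
	{
		struct crypto_shash *csum_shash;

		if (!btrfs_supported_super_csum(csum_type))
			return -EINVAL;

		csum_shash = crypto_alloc_shash(example_csum_names[csum_type], 0, 0);
		if (IS_ERR(csum_shash))
			return PTR_ERR(csum_shash);

		fs_info->csum_shash = csum_shash;
		return 0;
	}

The matching teardown is exactly the btrfs_free_csum_hash() added at the top of this patch.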
-int btrfs_verify_level_key(struct btrfs_fs_info *fs_info,
-			   struct extent_buffer *eb, int level,
+int btrfs_verify_level_key(struct extent_buffer *eb, int level,
			   struct btrfs_key *first_key, u64 parent_transid)
 {
+	struct btrfs_fs_info *fs_info = eb->fs_info;
	int found_level;
	struct btrfs_key found_key;
	int ret;

	found_level = btrfs_header_level(eb);
	if (found_level != level) {
-#ifdef CONFIG_BTRFS_DEBUG
-		WARN_ON(1);
+		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+		     KERN_ERR "BTRFS: tree level check failed\n");
		btrfs_err(fs_info,
 "tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
			  eb->start, level, found_level);
-#endif
		return -EIO;
	}

@@ -454,9 +370,9 @@
	btrfs_item_key_to_cpu(eb, &found_key, 0);
	ret = btrfs_comp_cpu_keys(first_key, &found_key);

-#ifdef CONFIG_BTRFS_DEBUG
	if (ret) {
-		WARN_ON(1);
+		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+		     KERN_ERR "BTRFS: tree first key check failed\n");
		btrfs_err(fs_info,
 "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
			  eb->start, parent_transid, first_key->objectid,
@@ -464,7 +380,6 @@
			  found_key.objectid, found_key.type,
			  found_key.offset);
	}
-#endif
	return ret;
 }

@@ -476,11 +391,11 @@
  * @level:	expected level, mandatory check
  * @first_key:	expected key of first slot, skip check if NULL
  */
-static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
-					  struct extent_buffer *eb,
+static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
					  u64 parent_transid, int level,
					  struct btrfs_key *first_key)
 {
+	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct extent_io_tree *io_tree;
	int failed = 0;
	int ret;
@@ -491,13 +406,12 @@
	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
	while (1) {
		clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
-		ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
-					       mirror_num);
+		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
		if (!ret) {
			if (verify_parent_transid(io_tree, eb,
						  parent_transid, 0))
				ret = -EIO;
-			else if (btrfs_verify_level_key(fs_info, eb, level,
+			else if (btrfs_verify_level_key(eb, level,
						first_key, parent_transid))
				ret = -EUCLEAN;
			else
@@ -523,7 +437,7 @@
	}

	if (failed && !ret && failed_mirror)
-		repair_eb_io_failure(fs_info, eb, failed_mirror);
+		btrfs_repair_eb_io_failure(eb, failed_mirror);

	return ret;
 }
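Editor's aside (not part of the patch): btree_read_extent_buffer_pages() retries the read on each available mirror until one copy passes both the transid and the level/first-key checks. A condensed sketch of that retry shape; btrfs_num_copies() and read_extent_buffer_pages() are real helpers, but the loop body is simplified and hypothetical:

	static int example_read_with_retries(struct btrfs_fs_info *fs_info,
					     struct extent_buffer *eb)
	{
		int num_copies = btrfs_num_copies(fs_info, eb->start, eb->len);
		int mirror_num;
		int ret = -EIO;

		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
			/* New signature from this patch: no io_tree argument. */
			ret = read_extent_buffer_pages(eb, WAIT_COMPLETE,
						       mirror_num);
			if (!ret)
				break;	/* this mirror verified clean */
		}
		return ret;
	}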
@@ -537,7 +451,10 @@
 {
	u64 start = page_offset(page);
	u64 found_start;
+	u8 result[BTRFS_CSUM_SIZE];
+	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
	struct extent_buffer *eb;
+	int ret;

	eb = (struct extent_buffer *)page->private;
	if (page != eb->pages[0])
@@ -553,51 +470,83 @@
	if (WARN_ON(!PageUptodate(page)))
		return -EUCLEAN;

-	ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
-				    btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
+	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
+				    offsetof(struct btrfs_header, fsid),
+				    BTRFS_FSID_SIZE) == 0);

-	return csum_tree_block(fs_info, eb, 0);
-}
+	csum_tree_block(eb, result);

-static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
-				 struct extent_buffer *eb)
-{
-	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
-	u8 fsid[BTRFS_FSID_SIZE];
-	int ret = 1;
+	if (btrfs_header_level(eb))
+		ret = btrfs_check_node(eb);
+	else
+		ret = btrfs_check_leaf_full(eb);

-	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
-	while (fs_devices) {
-		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
-			ret = 0;
-			break;
-		}
-		fs_devices = fs_devices->seed;
+	if (ret < 0) {
+		btrfs_print_tree(eb, 0);
+		btrfs_err(fs_info,
+			  "block=%llu write time tree block corruption detected",
+			  eb->start);
+		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+		return ret;
	}
-	return ret;
+	write_extent_buffer(eb, result, 0, csum_size);
+
+	return 0;
 }

-static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
-				      u64 phy_offset, struct page *page,
-				      u64 start, u64 end, int mirror)
+static int check_tree_block_fsid(struct extent_buffer *eb)
+{
+	struct btrfs_fs_info *fs_info = eb->fs_info;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+	u8 fsid[BTRFS_FSID_SIZE];
+	u8 *metadata_uuid;
+
+	read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
+			   BTRFS_FSID_SIZE);
+	/*
+	 * Checking the incompat flag is only valid for the current fs. For
+	 * seed devices it's forbidden to have their uuid changed so reading
+	 * ->fsid in this case is fine
+	 */
+	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+		metadata_uuid = fs_devices->metadata_uuid;
+	else
+		metadata_uuid = fs_devices->fsid;
+
+	if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
+		return 0;
+
+	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+		if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
+			return 0;
+
+	return 1;
+}
+
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+				   struct page *page, u64 start, u64 end,
+				   int mirror)
 {
	u64 found_start;
	int found_level;
	struct extent_buffer *eb;
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_fs_info *fs_info;
+	u16 csum_size;
	int ret = 0;
+	u8 result[BTRFS_CSUM_SIZE];
	int reads_done;

	if (!page->private)
		goto out;

	eb = (struct extent_buffer *)page->private;
+	fs_info = eb->fs_info;
+	csum_size = btrfs_super_csum_size(fs_info->super_copy);

	/* the pending IO might have been the only thing that kept this buffer
	 * in memory. Make sure we have a ref for all this other checks
	 */
-	extent_buffer_get(eb);
+	atomic_inc(&eb->refs);

	reads_done = atomic_dec_and_test(&eb->io_pages);
	if (!reads_done)
@@ -616,7 +565,7 @@
		ret = -EIO;
		goto err;
	}
-	if (check_tree_block_fsid(fs_info, eb)) {
+	if (check_tree_block_fsid(eb)) {
		btrfs_err_rl(fs_info, "bad fsid on block %llu",
			     eb->start);
		ret = -EIO;
@@ -633,25 +582,41 @@
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
				       eb, found_level);

-	ret = csum_tree_block(fs_info, eb, 1);
-	if (ret)
+	csum_tree_block(eb, result);
+
+	if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
+		u8 val[BTRFS_CSUM_SIZE] = { 0 };
+
+		read_extent_buffer(eb, &val, 0, csum_size);
+		btrfs_warn_rl(fs_info,
+	"%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+			      fs_info->sb->s_id, eb->start,
+			      CSUM_FMT_VALUE(csum_size, val),
+			      CSUM_FMT_VALUE(csum_size, result),
+			      btrfs_header_level(eb));
+		ret = -EUCLEAN;
		goto err;
+	}

	/*
	 * If this is a leaf block and it is corrupt, set the corrupt bit so
	 * that we don't try and read the other copies of this block, just
	 * return -EIO.
	 */
-	if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) {
+	if (found_level == 0 && btrfs_check_leaf_full(eb)) {
		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = -EIO;
	}

-	if (found_level > 0 && btrfs_check_node(fs_info, eb))
+	if (found_level > 0 && btrfs_check_node(eb))
		ret = -EIO;

	if (!ret)
		set_extent_buffer_uptodate(eb);
+	else
+		btrfs_err(fs_info,
+			  "block=%llu read time tree block corruption detected",
+			  eb->start);
 err:
	if (reads_done &&
	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -671,61 +636,34 @@
	return ret;
 }

-static int btree_io_failed_hook(struct page *page, int failed_mirror)
-{
-	struct extent_buffer *eb;
-
-	eb = (struct extent_buffer *)page->private;
-	set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
-	eb->read_mirror = failed_mirror;
-	atomic_dec(&eb->io_pages);
-	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
-		btree_readahead_hook(eb, -EIO);
-	return -EIO;	/* we fixed nothing */
-}
-
 static void end_workqueue_bio(struct bio *bio)
 {
	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
	struct btrfs_fs_info *fs_info;
	struct btrfs_workqueue *wq;
-	btrfs_work_func_t func;

	fs_info = end_io_wq->info;
	end_io_wq->status = bio->bi_status;

	if (bio_op(bio) == REQ_OP_WRITE) {
-		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
			wq = fs_info->endio_meta_write_workers;
-			func = btrfs_endio_meta_write_helper;
-		} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
			wq = fs_info->endio_freespace_worker;
-			func = btrfs_freespace_write_helper;
-		} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			wq = fs_info->endio_raid56_workers;
-			func = btrfs_endio_raid56_helper;
-		} else {
+		else
			wq = fs_info->endio_write_workers;
-			func = btrfs_endio_write_helper;
-		}
	} else {
-		if (unlikely(end_io_wq->metadata ==
-			     BTRFS_WQ_ENDIO_DIO_REPAIR)) {
-			wq = fs_info->endio_repair_workers;
-			func = btrfs_endio_repair_helper;
-		} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			wq = fs_info->endio_raid56_workers;
-			func = btrfs_endio_raid56_helper;
-		} else if (end_io_wq->metadata) {
+		else if (end_io_wq->metadata)
			wq = fs_info->endio_meta_workers;
-			func = btrfs_endio_meta_helper;
-		} else {
+		else
			wq = fs_info->endio_workers;
-			func = btrfs_endio_helper;
-		}
	}

-	btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
+	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
	btrfs_queue_work(wq, &end_io_wq->work);
 }

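Editor's aside (not part of the patch): end_workqueue_bio() only records the bio status and queues a btrfs_work; the heavy completion work runs later in end_workqueue_fn() in sleepable workqueue context. A sketch of that deferral pattern using the simplified btrfs_init_work() signature this patch introduces; everything prefixed example_ is hypothetical:

	struct example_end_io {
		struct bio *bio;
		struct btrfs_work work;
	};

	static void example_end_io_fn(struct btrfs_work *work)
	{
		struct example_end_io *e =
			container_of(work, struct example_end_io, work);

		/* Process the completion in sleepable context, then finish. */
		bio_endio(e->bio);
		kfree(e);
	}

	static void example_queue_completion(struct btrfs_fs_info *fs_info,
					     struct example_end_io *e)
	{
		/* Post-patch signature: work func plus two ordered callbacks. */
		btrfs_init_work(&e->work, example_end_io_fn, NULL, NULL);
		btrfs_queue_work(fs_info->endio_workers, &e->work);
	}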
@@ -762,11 +700,22 @@
	async->status = ret;
 }

+/*
+ * In order to insert checksums into the metadata in large chunks, we wait
+ * until bio submission time. All the pages in the bio are checksummed and
+ * sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the csums attached on the ordered extent record are
+ * inserted into the tree.
+ */
 static void run_one_async_done(struct btrfs_work *work)
 {
	struct async_submit_bio *async;
+	struct inode *inode;
+	blk_status_t ret;

	async = container_of(work, struct async_submit_bio, work);
+	inode = async->private_data;

	/* If an error occurred we just want to clean up the bio and move on */
	if (async->status) {
@@ -775,7 +724,17 @@
		return;
	}

-	btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num);
+	/*
+	 * All of the bios that pass through here are from async helpers.
+	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
+	 * This changes nothing when cgroups aren't in use.
+	 */
+	async->bio->bi_opf |= REQ_CGROUP_PUNT;
+	ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
+	if (ret) {
+		async->bio->bi_status = ret;
+		bio_endio(async->bio);
+	}
 }

 static void run_one_async_free(struct btrfs_work *work)
@@ -802,8 +761,8 @@
	async->mirror_num = mirror_num;
	async->submit_bio_start = submit_bio_start;

-	btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
-			run_one_async_done, run_one_async_free);
+	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
+			run_one_async_free);

	async->bio_offset = bio_offset;

@@ -820,10 +779,11 @@
 {
	struct bio_vec *bvec;
	struct btrfs_root *root;
-	int i, ret = 0;
+	int ret = 0;
+	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
		if (ret)
@@ -843,24 +803,21 @@
	return btree_csum_one_bio(bio);
 }

-static int check_async_write(struct btrfs_inode *bi)
+static int check_async_write(struct btrfs_fs_info *fs_info,
+			     struct btrfs_inode *bi)
 {
	if (atomic_read(&bi->sync_writers))
		return 0;
-#ifdef CONFIG_X86
-	if (static_cpu_has(X86_FEATURE_XMM4_2))
+	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
		return 0;
-#endif
	return 1;
 }

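Editor's aside (not part of the patch): the x86-only SSE4.2 probe is replaced by the algorithm-neutral BTRFS_FS_CSUM_IMPL_FAST flag, set once at mount. A sketch of how that flag is commonly derived: btrfs sets it when the crypto driver backing the checksum tfm is not the plain C "generic" implementation, i.e. a hardware/SIMD driver was selected. Treat the strstr() heuristic below as an assumption, not this patch's code:

	static void example_set_csum_impl_fast(struct btrfs_fs_info *fs_info)
	{
		/* Driver names like "crc32c-intel" indicate acceleration. */
		if (!strstr(crypto_shash_driver_name(fs_info->csum_shash),
			    "generic"))
			set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
	}

With the flag set, metadata writes skip the async checksumming workqueue because inline checksumming is cheap enough.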
-static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
-					  int mirror_num, unsigned long bio_flags,
-					  u64 bio_offset)
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+				       int mirror_num, unsigned long bio_flags)
 {
-	struct inode *inode = private_data;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	int async = check_async_write(BTRFS_I(inode));
+	int async = check_async_write(fs_info, BTRFS_I(inode));
	blk_status_t ret;

	if (bio_op(bio) != REQ_OP_WRITE) {
@@ -872,20 +829,19 @@
					  BTRFS_WQ_ENDIO_METADATA);
		if (ret)
			goto out_w_error;
-		ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+		ret = btrfs_map_bio(fs_info, bio, mirror_num);
	} else if (!async) {
		ret = btree_csum_one_bio(bio);
		if (ret)
			goto out_w_error;
-		ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+		ret = btrfs_map_bio(fs_info, bio, mirror_num);
	} else {
		/*
		 * kthread helpers are used to submit writes so that
		 * checksumming can happen in parallel across all CPUs
		 */
		ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
-					  bio_offset, private_data,
-					  btree_submit_bio_start);
+					  0, inode, btree_submit_bio_start);
	}

	if (ret)
@@ -943,13 +899,6 @@
	return btree_write_cache_pages(mapping, wbc);
 }

-static int btree_readpage(struct file *file, struct page *page)
-{
-	struct extent_io_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	return extent_read_full_page(tree, page, btree_get_extent, 0);
-}
-
 static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 {
	if (PageWriteback(page) || PageDirty(page))
@@ -969,9 +918,7 @@
		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
			   "page private not zero on page %llu",
			   (unsigned long long)page_offset(page));
-		ClearPagePrivate(page);
-		set_page_private(page, 0);
-		put_page(page);
+		detach_page_private(page);
	}
 }

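Editor's aside (not part of the patch): the three-step ClearPagePrivate()/set_page_private(0)/put_page() sequence is folded into detach_page_private(), which pairs with attach_page_private(). A sketch of the pair; both are real pagemap helpers, the example_ wrappers are illustrative:

	static void example_attach(struct page *page, void *data)
	{
		/* Takes a page reference, sets PG_private and page->private. */
		attach_page_private(page, data);
	}

	static void *example_detach(struct page *page)
	{
		/*
		 * Clears PG_private, zeroes page->private, drops the page
		 * reference, and returns the previously attached pointer.
		 */
		return detach_page_private(page);
	}

Using the helpers keeps the refcounting symmetric and avoids the open-coded variants drifting apart across filesystems.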
@@ -991,7 +938,6 @@
 }

 static const struct address_space_operations btree_aops = {
-	.readpage	= btree_readpage,
	.writepages	= btree_writepages,
	.releasepage	= btree_releasepage,
	.invalidatepage	= btree_invalidatepage,
@@ -1004,51 +950,17 @@
 void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
 {
	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = fs_info->btree_inode;
	int ret;

	buf = btrfs_find_create_tree_block(fs_info, bytenr);
	if (IS_ERR(buf))
		return;

-	ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf,
-				       WAIT_NONE, 0);
+	ret = read_extent_buffer_pages(buf, WAIT_NONE, 0);
	if (ret < 0)
		free_extent_buffer_stale(buf);
	else
		free_extent_buffer(buf);
-}
-
-int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
-			     int mirror_num, struct extent_buffer **eb)
-{
-	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = fs_info->btree_inode;
-	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
-	int ret;
-
-	buf = btrfs_find_create_tree_block(fs_info, bytenr);
-	if (IS_ERR(buf))
-		return 0;
-
-	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
-
-	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
-				       mirror_num);
-	if (ret) {
-		free_extent_buffer_stale(buf);
-		return ret;
-	}
-
-	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
-		free_extent_buffer_stale(buf);
-		return -EIO;
-	} else if (extent_buffer_uptodate(buf)) {
-		*eb = buf;
-	} else {
-		free_extent_buffer(buf);
-	}
-	return 0;
 }

 struct extent_buffer *btrfs_find_create_tree_block(
@@ -1058,19 +970,6 @@
	if (btrfs_is_testing(fs_info))
		return alloc_test_extent_buffer(fs_info, bytenr);
	return alloc_extent_buffer(fs_info, bytenr);
-}
-
-
-int btrfs_write_tree_block(struct extent_buffer *buf)
-{
-	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
-					buf->start + buf->len - 1);
-}
-
-void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
-{
-	filemap_fdatawait_range(buf->pages[0]->mapping,
-				buf->start, buf->start + buf->len - 1);
 }

 /*
@@ -1092,7 +991,7 @@
	if (IS_ERR(buf))
		return buf;

-	ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+	ret = btree_read_extent_buffer_pages(buf, parent_transid,
					     level, first_key);
	if (ret) {
		free_extent_buffer_stale(buf);
@@ -1102,9 +1001,9 @@

 }

-void clean_tree_block(struct btrfs_fs_info *fs_info,
-		      struct extent_buffer *buf)
+void btrfs_clean_tree_block(struct extent_buffer *buf)
 {
+	struct btrfs_fs_info *fs_info = buf->fs_info;
	if (btrfs_header_generation(buf) ==
	    fs_info->running_transaction->transid) {
		btrfs_assert_tree_locked(buf);
@@ -1114,48 +1013,22 @@
					 -buf->len,
					 fs_info->dirty_metadata_batch);
		/* ugh, clear_extent_buffer_dirty needs to lock the page */
-		btrfs_set_lock_blocking(buf);
+		btrfs_set_lock_blocking_write(buf);
		clear_extent_buffer_dirty(buf);
	}
 }
-}
-
-static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
-{
-	struct btrfs_subvolume_writers *writers;
-	int ret;
-
-	writers = kmalloc(sizeof(*writers), GFP_NOFS);
-	if (!writers)
-		return ERR_PTR(-ENOMEM);
-
-	ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
-	if (ret < 0) {
-		kfree(writers);
-		return ERR_PTR(ret);
-	}
-
-	init_waitqueue_head(&writers->wait);
-	return writers;
-}
-
-static void
-btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
-{
-	percpu_counter_destroy(&writers->counter);
-	kfree(writers);
 }

 static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
			 u64 objectid)
 {
	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+	root->fs_info = fs_info;
	root->node = NULL;
	root->commit_root = NULL;
	root->state = 0;
	root->orphan_cleanup_state = 0;

-	root->objectid = objectid;
	root->last_trans = 0;
	root->highest_objectid = 0;
	root->nr_delalloc_inodes = 0;
@@ -1170,6 +1043,7 @@
	INIT_LIST_HEAD(&root->delalloc_root);
	INIT_LIST_HEAD(&root->ordered_extents);
	INIT_LIST_HEAD(&root->ordered_root);
+	INIT_LIST_HEAD(&root->reloc_dirty_list);
	INIT_LIST_HEAD(&root->logged_list[0]);
	INIT_LIST_HEAD(&root->logged_list[1]);
	spin_lock_init(&root->inode_lock);
@@ -1183,6 +1057,7 @@
	mutex_init(&root->log_mutex);
	mutex_init(&root->ordered_extent_mutex);
	mutex_init(&root->delalloc_mutex);
+	init_waitqueue_head(&root->qgroup_flush_wait);
	init_waitqueue_head(&root->log_writer_wait);
	init_waitqueue_head(&root->log_commit_wait[0]);
	init_waitqueue_head(&root->log_commit_wait[1]);
@@ -1193,33 +1068,40 @@
	atomic_set(&root->log_writers, 0);
	atomic_set(&root->log_batch, 0);
	refcount_set(&root->refs, 1);
-	atomic_set(&root->will_be_snapshotted, 0);
	atomic_set(&root->snapshot_force_cow, 0);
+	atomic_set(&root->nr_swapfiles, 0);
	root->log_transid = 0;
	root->log_transid_committed = -1;
	root->last_log_commit = 0;
-	if (!dummy)
-		extent_io_tree_init(&root->dirty_log_pages, NULL);
+	if (!dummy) {
+		extent_io_tree_init(fs_info, &root->dirty_log_pages,
+				    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
+		extent_io_tree_init(fs_info, &root->log_csum_range,
+				    IO_TREE_LOG_CSUM_RANGE, NULL);
+	}

	memset(&root->root_key, 0, sizeof(root->root_key));
	memset(&root->root_item, 0, sizeof(root->root_item));
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
-	if (!dummy)
-		root->defrag_trans_start = fs_info->generation;
-	else
-		root->defrag_trans_start = 0;
	root->root_key.objectid = objectid;
	root->anon_dev = 0;

	spin_lock_init(&root->root_item_lock);
+	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
+#ifdef CONFIG_BTRFS_DEBUG
+	INIT_LIST_HEAD(&root->leak_list);
+	spin_lock(&fs_info->fs_roots_radix_lock);
+	list_add_tail(&root->leak_list, &fs_info->allocated_roots);
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+#endif
 }

 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
-					   gfp_t flags)
+					   u64 objectid, gfp_t flags)
 {
	struct btrfs_root *root = kzalloc(sizeof(*root), flags);
	if (root)
-		root->fs_info = fs_info;
+		__setup_root(root, fs_info, objectid);
	return root;
 }

@@ -1232,12 +1114,11 @@
	if (!fs_info)
		return ERR_PTR(-EINVAL);

-	root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+	root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

	/* We don't use the stripesize in selftest, set it as sectorsize */
-	__setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
	root->alloc_bytenr = 0;

	return root;
@@ -1245,33 +1126,32 @@
 #endif

 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
-				     struct btrfs_fs_info *fs_info,
				     u64 objectid)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key key;
	unsigned int nofs_flag;
	int ret = 0;
-	uuid_le uuid = NULL_UUID_LE;

	/*
	 * We're holding a transaction handle, so use a NOFS memory allocation
	 * context to avoid deadlock if reclaim happens.
	 */
	nofs_flag = memalloc_nofs_save();
-	root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+	root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!root)
		return ERR_PTR(-ENOMEM);

-	__setup_root(root, fs_info, objectid);
	root->root_key.objectid = objectid;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = 0;

-	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+				      BTRFS_NESTING_NORMAL);
	if (IS_ERR(leaf)) {
		ret = PTR_ERR(leaf);
		leaf = NULL;
@@ -1294,8 +1174,9 @@
	btrfs_set_root_last_snapshot(&root->root_item, 0);
	btrfs_set_root_dirid(&root->root_item, 0);
	if (is_fstree(objectid))
-		uuid_le_gen(&uuid);
-	memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
+		generate_random_guid(root->root_item.uuid);
+	else
+		export_guid(root->root_item.uuid, &guid_null);
	root->root_item.drop_level = 0;

	key.objectid = objectid;
@@ -1310,12 +1191,9 @@
	return root;

 fail:
-	if (leaf) {
+	if (leaf)
		btrfs_tree_unlock(leaf);
-		free_extent_buffer(root->commit_root);
-		free_extent_buffer(leaf);
-	}
-	kfree(root);
+	btrfs_put_root(root);

	return ERR_PTR(ret);
 }
@@ -1326,29 +1204,28 @@
	struct btrfs_root *root;
	struct extent_buffer *leaf;

-	root = btrfs_alloc_root(fs_info, GFP_NOFS);
+	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
	if (!root)
		return ERR_PTR(-ENOMEM);
-
-	__setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID);

	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;

	/*
-	 * DON'T set REF_COWS for log trees
+	 * DON'T set SHAREABLE bit for log trees.
	 *
-	 * log trees do not get reference counted because they go away
-	 * before a real commit is actually done. They do store pointers
-	 * to file data extents, and those reference counts still get
-	 * updated (along with back refs to the log tree).
+	 * Log trees are not exposed to user space thus can't be snapshotted,
+	 * and they go away before a real commit is actually done.
+	 *
+	 * They do store pointers to file data extents, and those reference
+	 * counts still get updated (along with back refs to the log tree).
	 */

	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
-			NULL, 0, 0, 0);
+			NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
	if (IS_ERR(leaf)) {
-		kfree(root);
+		btrfs_put_root(root);
		return ERR_CAST(leaf);
	}

@@ -1404,34 +1281,26 @@
	return 0;
 }

-static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
-					       struct btrfs_key *key)
+static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
+					      struct btrfs_path *path,
+					      struct btrfs_key *key)
 {
	struct btrfs_root *root;
	struct btrfs_fs_info *fs_info = tree_root->fs_info;
-	struct btrfs_path *path;
	u64 generation;
	int ret;
	int level;

-	path = btrfs_alloc_path();
-	if (!path)
+	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
+	if (!root)
		return ERR_PTR(-ENOMEM);
-
-	root = btrfs_alloc_root(fs_info, GFP_NOFS);
-	if (!root) {
-		ret = -ENOMEM;
-		goto alloc_fail;
-	}
-
-	__setup_root(root, fs_info, key->objectid);

	ret = btrfs_find_root(tree_root, key, path,
			      &root->root_item, &root->root_key);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
-		goto find_fail;
+		goto fail;
	}

	generation = btrfs_root_generation(&root->root_item);
@@ -1441,45 +1310,43 @@
				     generation, level, NULL);
	if (IS_ERR(root->node)) {
		ret = PTR_ERR(root->node);
-		goto find_fail;
+		root->node = NULL;
+		goto fail;
	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
		ret = -EIO;
-		free_extent_buffer(root->node);
-		goto find_fail;
+		goto fail;
	}
	root->commit_root = btrfs_root_node(root);
-out:
-	btrfs_free_path(path);
	return root;
-
-find_fail:
-	kfree(root);
-alloc_fail:
-	root = ERR_PTR(ret);
-	goto out;
+fail:
+	btrfs_put_root(root);
+	return ERR_PTR(ret);
 }

-struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
-				      struct btrfs_key *location)
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+					struct btrfs_key *key)
 {
	struct btrfs_root *root;
+	struct btrfs_path *path;

-	root = btrfs_read_tree_root(tree_root, location);
-	if (IS_ERR(root))
-		return root;
-
-	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-		set_bit(BTRFS_ROOT_REF_COWS, &root->state);
-		btrfs_check_and_init_root_item(&root->root_item);
-	}
+	path = btrfs_alloc_path();
+	if (!path)
+		return ERR_PTR(-ENOMEM);
+	root = read_tree_root_path(tree_root, path, key);
+	btrfs_free_path(path);

	return root;
 }

-int btrfs_init_fs_root(struct btrfs_root *root)
+/*
+ * Initialize subvolume root in-memory structure
+ *
+ * @anon_dev:	anonymous device to attach to the root, if zero, allocate new
+ */
+static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
 {
	int ret;
-	struct btrfs_subvolume_writers *writers;
+	unsigned int nofs_flag;

	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1489,12 +1356,21 @@
		goto fail;
	}

-	writers = btrfs_alloc_subvolume_writers();
-	if (IS_ERR(writers)) {
-		ret = PTR_ERR(writers);
+	/*
+	 * We might be called under a transaction (e.g. indirect backref
+	 * resolution) which could deadlock if it triggers memory reclaim
+	 */
+	nofs_flag = memalloc_nofs_save();
+	ret = btrfs_drew_lock_init(&root->snapshot_lock);
+	memalloc_nofs_restore(nofs_flag);
+	if (ret)
		goto fail;
+
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+	    root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+		set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
+		btrfs_check_and_init_root_item(&root->root_item);
	}
-	root->subv_writers = writers;

	btrfs_init_free_ino_ctl(root);
	spin_lock_init(&root->ino_cache_lock);
@@ -1506,9 +1382,13 @@
	 */
	if (is_fstree(root->root_key.objectid) &&
	    btrfs_root_refs(&root->root_item) > 0) {
-		ret = get_anon_bdev(&root->anon_dev);
-		if (ret)
-			goto fail;
+		if (!anon_dev) {
+			ret = get_anon_bdev(&root->anon_dev);
+			if (ret)
+				goto fail;
+		} else {
+			root->anon_dev = anon_dev;
+		}
	}

	mutex_lock(&root->objectid_mutex);
@@ -1529,16 +1409,43 @@
	return ret;
 }

-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
-					u64 root_id)
+static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+					       u64 root_id)
 {
	struct btrfs_root *root;

	spin_lock(&fs_info->fs_roots_radix_lock);
	root = radix_tree_lookup(&fs_info->fs_roots_radix,
				 (unsigned long)root_id);
+	if (root)
+		root = btrfs_grab_root(root);
	spin_unlock(&fs_info->fs_roots_radix_lock);
	return root;
+}
+
+static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
+						u64 objectid)
+{
+	if (objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->tree_root);
+	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->extent_root);
+	if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->chunk_root);
+	if (objectid == BTRFS_DEV_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->dev_root);
+	if (objectid == BTRFS_CSUM_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->csum_root);
+	if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->quota_root) ?
+			fs_info->quota_root : ERR_PTR(-ENOENT);
+	if (objectid == BTRFS_UUID_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->uuid_root) ?
+			fs_info->uuid_root : ERR_PTR(-ENOENT);
+	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->free_space_root) ?
+			fs_info->free_space_root : ERR_PTR(-ENOENT);
+	return NULL;
 }

 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
@@ -1554,51 +1461,111 @@
	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				(unsigned long)root->root_key.objectid,
				root);
-	if (ret == 0)
+	if (ret == 0) {
+		btrfs_grab_root(root);
		set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	radix_tree_preload_end();

	return ret;
 }

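Editor's aside (not part of the patch): btrfs_insert_fs_root() now takes its own reference before publishing the root in the radix tree, so the cache owns one refcount. The surrounding preload pattern, condensed into a sketch (radix_tree_preload()/radix_tree_preload_end() are the standard kernel API; the example_ function is illustrative):

	static int example_insert_root(struct btrfs_fs_info *fs_info,
				       struct btrfs_root *root)
	{
		int ret;

		/* Preallocate tree nodes while we may still sleep... */
		ret = radix_tree_preload(GFP_NOFS);
		if (ret)
			return ret;

		/* ...then insert under the spinlock without allocating. */
		spin_lock(&fs_info->fs_roots_radix_lock);
		ret = radix_tree_insert(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					root);
		if (ret == 0)
			btrfs_grab_root(root);	/* the radix tree holds a ref */
		spin_unlock(&fs_info->fs_roots_radix_lock);
		radix_tree_preload_end();

		return ret;
	}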
-struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
-				     struct btrfs_key *location,
-				     bool check_ref)
+void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
+{
+#ifdef CONFIG_BTRFS_DEBUG
+	struct btrfs_root *root;
+
+	while (!list_empty(&fs_info->allocated_roots)) {
+		char buf[BTRFS_ROOT_NAME_BUF_LEN];
+
+		root = list_first_entry(&fs_info->allocated_roots,
+					struct btrfs_root, leak_list);
+		btrfs_err(fs_info, "leaked root %s refcount %d",
+			  btrfs_root_name(&root->root_key, buf),
+			  refcount_read(&root->refs));
+		while (refcount_read(&root->refs) > 1)
+			btrfs_put_root(root);
+		btrfs_put_root(root);
+	}
+#endif
+}
+
+void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
+{
+	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+	percpu_counter_destroy(&fs_info->delalloc_bytes);
+	percpu_counter_destroy(&fs_info->dio_bytes);
+	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
+	btrfs_free_csum_hash(fs_info);
+	btrfs_free_stripe_hash_table(fs_info);
+	btrfs_free_ref_cache(fs_info);
+	kfree(fs_info->balance_ctl);
+	kfree(fs_info->delayed_root);
+	btrfs_put_root(fs_info->extent_root);
+	btrfs_put_root(fs_info->tree_root);
+	btrfs_put_root(fs_info->chunk_root);
+	btrfs_put_root(fs_info->dev_root);
+	btrfs_put_root(fs_info->csum_root);
+	btrfs_put_root(fs_info->quota_root);
+	btrfs_put_root(fs_info->uuid_root);
+	btrfs_put_root(fs_info->free_space_root);
+	btrfs_put_root(fs_info->fs_root);
+	btrfs_put_root(fs_info->data_reloc_root);
+	btrfs_check_leaked_roots(fs_info);
+	btrfs_extent_buffer_leak_debug_check(fs_info);
+	kfree(fs_info->super_copy);
+	kfree(fs_info->super_for_commit);
+	kvfree(fs_info);
+}
+
+
+/*
+ * Get an in-memory reference of a root structure.
+ *
+ * For essential trees like root/extent tree, we grab it from fs_info directly.
+ * For subvolume trees, we check the cached filesystem roots first. If not
+ * found, then read it from disk and add it to cached fs roots.
+ *
+ * Caller should release the root by calling btrfs_put_root() after the usage.
+ *
+ * NOTE: Reloc and log trees can't be read by this function as they share the
+ * same root objectid.
+ *
+ * @objectid:	root id
+ * @anon_dev:	preallocated anonymous block device number for new roots,
+ *		pass 0 for new allocation.
+ * @check_ref:	whether to check root item references, if true, return -ENOENT
+ *		for orphan roots
+ */
+static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
+					     u64 objectid, dev_t anon_dev,
+					     bool check_ref)
 {
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

-	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
-		return fs_info->tree_root;
-	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
-		return fs_info->extent_root;
-	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
-		return fs_info->chunk_root;
-	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
-		return fs_info->dev_root;
-	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
-		return fs_info->csum_root;
-	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
-		return fs_info->quota_root ? fs_info->quota_root :
-					     ERR_PTR(-ENOENT);
-	if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
-		return fs_info->uuid_root ? fs_info->uuid_root :
-					    ERR_PTR(-ENOENT);
-	if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
-		return fs_info->free_space_root ? fs_info->free_space_root :
-						  ERR_PTR(-ENOENT);
+	root = btrfs_get_global_root(fs_info, objectid);
+	if (root)
+		return root;
 again:
-	root = btrfs_lookup_fs_root(fs_info, location->objectid);
+	root = btrfs_lookup_fs_root(fs_info, objectid);
	if (root) {
-		if (check_ref && btrfs_root_refs(&root->root_item) == 0)
+		/* Shouldn't get preallocated anon_dev for cached roots */
+		ASSERT(!anon_dev);
+		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
+			btrfs_put_root(root);
			return ERR_PTR(-ENOENT);
+		}
		return root;
	}

-	root = btrfs_read_fs_root(fs_info->tree_root, location);
+	key.objectid = objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	root = btrfs_read_tree_root(fs_info->tree_root, &key);
	if (IS_ERR(root))
		return root;

@@ -1607,7 +1574,7 @@
		goto fail;
	}

-	ret = btrfs_init_fs_root(root);
+	ret = btrfs_init_fs_root(root, anon_dev);
	if (ret)
		goto fail;

@@ -1618,7 +1585,7 @@
	}
	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
-	key.offset = location->objectid;
+	key.offset = objectid;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	btrfs_free_path(path);
@@ -1630,36 +1597,96 @@
	ret = btrfs_insert_fs_root(fs_info, root);
	if (ret) {
		if (ret == -EEXIST) {
-			btrfs_free_fs_root(root);
+			btrfs_put_root(root);
			goto again;
		}
		goto fail;
	}
	return root;
 fail:
-	btrfs_free_fs_root(root);
+	/*
+	 * If our caller provided us an anonymous device, then it's his
+	 * responsibility to free it in case we fail. So we have to set our
+	 * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
+	 * and once again by our caller.
+	 */
+	if (anon_dev)
+		root->anon_dev = 0;
+	btrfs_put_root(root);
	return ERR_PTR(ret);
 }

-static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+/*
+ * Get in-memory reference of a root structure
+ *
+ * @objectid:	tree objectid
+ * @check_ref:	if set, verify that the tree exists and the item has at least
+ *		one reference
+ */
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+				     u64 objectid, bool check_ref)
 {
-	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
-	int ret = 0;
-	struct btrfs_device *device;
-	struct backing_dev_info *bdi;
+	return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+}

-	rcu_read_lock();
-	list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
-		if (!device->bdev)
-			continue;
-		bdi = device->bdev->bd_bdi;
-		if (bdi_congested(bdi, bdi_bits)) {
-			ret = 1;
-			break;
-		}
-	}
-	rcu_read_unlock();
-	return ret;
+/*
+ * Get in-memory reference of a root structure, created as new, optionally pass
+ * the anonymous block device id
+ *
+ * @objectid:	tree objectid
+ * @anon_dev:	if zero, allocate a new anonymous block device or use the
+ *		parameter value
+ */
+struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
+					 u64 objectid, dev_t anon_dev)
+{
+	return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
+}
+
+/*
+ * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * @fs_info:	the fs_info
+ * @objectid:	the objectid we need to lookup
+ *
+ * This is exclusively used for backref walking, and exists specifically because
+ * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref
+ * creation time, which means we may have to read the tree_root in order to look
+ * up a fs root that is not in memory. If the root is not in memory we will
+ * read the tree root commit root and look up the fs root from there. This is a
+ * temporary root, it will not be inserted into the radix tree as it doesn't
+ * have the most uptodate information, it'll simply be discarded once the
+ * backref code is finished using the root.
+ */
+struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
+						 struct btrfs_path *path,
+						 u64 objectid)
+{
+	struct btrfs_root *root;
+	struct btrfs_key key;
+
+	ASSERT(path->search_commit_root && path->skip_locking);
+
+	/*
+	 * This can return -ENOENT if we ask for a root that doesn't exist, but
+	 * since this is called via the backref walking code we won't be looking
+	 * up a root that doesn't exist, unless there's corruption. So if root
+	 * != NULL just return it.
+	 */
+	root = btrfs_get_global_root(fs_info, objectid);
+	if (root)
+		return root;
+
+	root = btrfs_lookup_fs_root(fs_info, objectid);
+	if (root)
+		return root;
+
+	key.objectid = objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	root = read_tree_root_path(fs_info->tree_root, path, &key);
+	btrfs_release_path(path);
+
+	return root;
 }

....@@ -1690,6 +1717,8 @@
16901717 while (1) {
16911718 again = 0;
16921719
1720
+ set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1721
+
16931722 /* Make the cleaner go to sleep early. */
16941723 if (btrfs_need_cleaner_sleep(fs_info))
16951724 goto sleep;
....@@ -1713,9 +1742,7 @@
17131742 goto sleep;
17141743 }
17151744
1716
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
17171745 btrfs_run_delayed_iputs(fs_info);
1718
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
17191746
17201747 again = btrfs_clean_one_deleted_snapshot(root);
17211748 mutex_unlock(&fs_info->cleaner_mutex);
....@@ -1736,6 +1763,7 @@
17361763 */
17371764 btrfs_delete_unused_bgs(fs_info);
17381765 sleep:
1766
+ clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
17391767 if (kthread_should_park())
17401768 kthread_parkme();
17411769 if (kthread_should_stop())
....@@ -1772,8 +1800,7 @@
17721800 }
17731801
17741802 now = ktime_get_seconds();
1775
- if (cur->state < TRANS_STATE_BLOCKED &&
1776
- !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
1803
+ if (cur->state < TRANS_STATE_COMMIT_START &&
17771804 (now < cur->start_time ||
17781805 now - cur->start_time < fs_info->commit_interval)) {
17791806 spin_unlock(&fs_info->trans_lock);
....@@ -1811,18 +1838,18 @@
18111838 }
18121839
18131840 /*
1814
- * this will find the highest generation in the array of
1815
- * root backups. The index of the highest array is returned,
1816
- * or -1 if we can't find anything.
1841
+ * This will find the highest generation in the array of root backups. The
1842
+ * index of the highest array is returned, or -EINVAL if we can't find
1843
+ * anything.
18171844 *
18181845 * We check to make sure the array is valid by comparing the
18191846 * generation of the latest root in the array with the generation
18201847 * in the super block. If they don't match we pitch it.
18211848 */
1822
-static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1849
+static int find_newest_super_backup(struct btrfs_fs_info *info)
18231850 {
1851
+ const u64 newest_gen = btrfs_super_generation(info->super_copy);
18241852 u64 cur;
1825
- int newest_index = -1;
18261853 struct btrfs_root_backup *root_backup;
18271854 int i;
18281855
....@@ -1830,37 +1857,10 @@
18301857 root_backup = info->super_copy->super_roots + i;
18311858 cur = btrfs_backup_tree_root_gen(root_backup);
18321859 if (cur == newest_gen)
1833
- newest_index = i;
1860
+ return i;
18341861 }
18351862
1836
- /* check to see if we actually wrapped around */
1837
- if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1838
- root_backup = info->super_copy->super_roots;
1839
- cur = btrfs_backup_tree_root_gen(root_backup);
1840
- if (cur == newest_gen)
1841
- newest_index = 0;
1842
- }
1843
- return newest_index;
1844
-}
1845
-
1846
-
1847
-/*
1848
- * find the oldest backup so we know where to store new entries
1849
- * in the backup array. This will set the backup_root_index
1850
- * field in the fs_info struct
1851
- */
1852
-static void find_oldest_super_backup(struct btrfs_fs_info *info,
1853
- u64 newest_gen)
1854
-{
1855
- int newest_index = -1;
1856
-
1857
- newest_index = find_newest_super_backup(info, newest_gen);
1858
- /* if there was garbage in there, just move along */
1859
- if (newest_index == -1) {
1860
- info->backup_root_index = 0;
1861
- } else {
1862
- info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1863
- }
1863
+ return -EINVAL;
18641864 }
18651865
18661866 /*
....@@ -1870,22 +1870,8 @@
18701870 */
18711871 static void backup_super_roots(struct btrfs_fs_info *info)
18721872 {
1873
- int next_backup;
1873
+ const int next_backup = info->backup_root_index;
18741874 struct btrfs_root_backup *root_backup;
1875
- int last_backup;
1876
-
1877
- next_backup = info->backup_root_index;
1878
- last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1879
- BTRFS_NUM_BACKUP_ROOTS;
1880
-
1881
- /*
1882
- * just overwrite the last backup if we're at the same generation
1883
- * this happens only at umount
1884
- */
1885
- root_backup = info->super_for_commit->super_roots + last_backup;
1886
- if (btrfs_backup_tree_root_gen(root_backup) ==
1887
- btrfs_header_generation(info->tree_root->node))
1888
- next_backup = last_backup;
18891875
18901876 root_backup = info->super_for_commit->super_roots + next_backup;
18911877
....@@ -1958,40 +1944,31 @@
19581944 }
19591945
19601946 /*
1961
- * this copies info out of the root backup array and back into
1962
- * the in-memory super block. It is meant to help iterate through
1963
- * the array, so you send it the number of backups you've already
1964
- * tried and the last backup index you used.
1947
+ * read_backup_root - Reads a backup root based on the passed priority. Prio 0
1948
+ * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
19651949 *
1966
- * this returns -1 when it has tried all the backups
1950
+ * fs_info - filesystem whose backup roots need to be read
1951
+ * priority - priority of backup root required
1952
+ *
1953
+ * Returns backup root index on success and -EINVAL otherwise.
19671954 */
1968
-static noinline int next_root_backup(struct btrfs_fs_info *info,
1969
- struct btrfs_super_block *super,
1970
- int *num_backups_tried, int *backup_index)
1955
+static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
19711956 {
1957
+ int backup_index = find_newest_super_backup(fs_info);
1958
+ struct btrfs_super_block *super = fs_info->super_copy;
19721959 struct btrfs_root_backup *root_backup;
1973
- int newest = *backup_index;
19741960
1975
- if (*num_backups_tried == 0) {
1976
- u64 gen = btrfs_super_generation(super);
1961
+ if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
1962
+ if (priority == 0)
1963
+ return backup_index;
19771964
1978
- newest = find_newest_super_backup(info, gen);
1979
- if (newest == -1)
1980
- return -1;
1981
-
1982
- *backup_index = newest;
1983
- *num_backups_tried = 1;
1984
- } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1985
- /* we've tried all the backups, all done */
1986
- return -1;
1965
+ backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
1966
+ backup_index %= BTRFS_NUM_BACKUP_ROOTS;
19871967 } else {
1988
- /* jump to the next oldest backup */
1989
- newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1990
- BTRFS_NUM_BACKUP_ROOTS;
1991
- *backup_index = newest;
1992
- *num_backups_tried += 1;
1968
+ return -EINVAL;
19931969 }
1994
- root_backup = super->super_roots + newest;
1970
+
1971
+ root_backup = super->super_roots + backup_index;
19951972
19961973 btrfs_set_super_generation(super,
19971974 btrfs_backup_tree_root_gen(root_backup));
....@@ -2001,12 +1978,13 @@
20011978 btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
20021979
20031980 /*
2004
- * fixme: the total bytes and num_devices need to match or we should
1981
+ * Fixme: the total bytes and num_devices need to match, otherwise we
20051982 * need a fsck
20061983 */
20071984 btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
20081985 btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
2009
- return 0;
1986
+
1987
+ return backup_index;
20101988 }
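For reference, the priority-to-index mapping above is plain modular arithmetic over the four-slot ring of root backups. A minimal userspace sketch of the same computation — the constant and helper names are illustrative stand-ins, not kernel identifiers:

#include <stdio.h>

#define NUM_BACKUP_ROOTS 4	/* stands in for BTRFS_NUM_BACKUP_ROOTS */

/* Slot holding the backup that is @priority generations older than the
 * newest one stored in slot @newest. Adding NUM_BACKUP_ROOTS before the
 * modulo keeps the left operand non-negative. */
static int backup_slot(int newest, int priority)
{
	return (newest + NUM_BACKUP_ROOTS - priority) % NUM_BACKUP_ROOTS;
}

int main(void)
{
	int prio;

	/* Newest backup in slot 1: priorities 0..3 map to slots 1, 0, 3, 2. */
	for (prio = 0; prio < NUM_BACKUP_ROOTS; prio++)
		printf("priority %d -> slot %d\n", prio, backup_slot(1, prio));
	return 0;
}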
20111989
20121990 /* helper to cleanup workers */
....@@ -2017,17 +1995,16 @@
20171995 btrfs_destroy_workqueue(fs_info->workers);
20181996 btrfs_destroy_workqueue(fs_info->endio_workers);
20191997 btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
2020
- btrfs_destroy_workqueue(fs_info->endio_repair_workers);
20211998 btrfs_destroy_workqueue(fs_info->rmw_workers);
20221999 btrfs_destroy_workqueue(fs_info->endio_write_workers);
20232000 btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2024
- btrfs_destroy_workqueue(fs_info->submit_workers);
20252001 btrfs_destroy_workqueue(fs_info->delayed_workers);
20262002 btrfs_destroy_workqueue(fs_info->caching_workers);
20272003 btrfs_destroy_workqueue(fs_info->readahead_workers);
20282004 btrfs_destroy_workqueue(fs_info->flush_workers);
20292005 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2030
- btrfs_destroy_workqueue(fs_info->extent_workers);
2006
+ if (fs_info->discard_ctl.discard_workers)
2007
+ destroy_workqueue(fs_info->discard_ctl.discard_workers);
20312008 /*
20322009 * Now that all other work queues are destroyed, we can safely destroy
20332010 * the queues used for metadata I/O, since tasks from those other work
....@@ -2057,9 +2034,34 @@
20572034 free_root_extent_buffers(info->csum_root);
20582035 free_root_extent_buffers(info->quota_root);
20592036 free_root_extent_buffers(info->uuid_root);
2037
+ free_root_extent_buffers(info->fs_root);
2038
+ free_root_extent_buffers(info->data_reloc_root);
20602039 if (free_chunk_root)
20612040 free_root_extent_buffers(info->chunk_root);
20622041 free_root_extent_buffers(info->free_space_root);
2042
+}
2043
+
2044
+void btrfs_put_root(struct btrfs_root *root)
2045
+{
2046
+ if (!root)
2047
+ return;
2048
+
2049
+ if (refcount_dec_and_test(&root->refs)) {
2050
+ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2051
+ WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
2052
+ if (root->anon_dev)
2053
+ free_anon_bdev(root->anon_dev);
2054
+ btrfs_drew_lock_destroy(&root->snapshot_lock);
2055
+ free_root_extent_buffers(root);
2056
+ kfree(root->free_ino_ctl);
2057
+ kfree(root->free_ino_pinned);
2058
+#ifdef CONFIG_BTRFS_DEBUG
2059
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
2060
+ list_del_init(&root->leak_list);
2061
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
2062
+#endif
2063
+ kfree(root);
2064
+ }
20632065 }
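btrfs_put_root() is the standard refcount_t release pattern: exactly one caller observes the count hit zero and performs teardown. A stripped-down sketch of the pattern, with a hypothetical object type:

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_obj {		/* hypothetical refcounted object */
	refcount_t refs;
	/* payload ... */
};

static void demo_obj_put(struct demo_obj *obj)
{
	if (!obj)
		return;
	/* refcount_dec_and_test() returns true for exactly one caller,
	 * so concurrent puts can never double-free. */
	if (refcount_dec_and_test(&obj->refs))
		kfree(obj);
}

The NULL check up front mirrors btrfs_put_root() and lets error paths call the put unconditionally.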
20642066
20652067 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
....@@ -2073,13 +2075,9 @@
20732075 struct btrfs_root, root_list);
20742076 list_del(&gang[0]->root_list);
20752077
2076
- if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
2078
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
20772079 btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2078
- } else {
2079
- free_extent_buffer(gang[0]->node);
2080
- free_extent_buffer(gang[0]->commit_root);
2081
- btrfs_put_fs_root(gang[0]);
2082
- }
2080
+ btrfs_put_root(gang[0]);
20832081 }
20842082
20852083 while (1) {
....@@ -2091,11 +2089,6 @@
20912089 for (i = 0; i < ret; i++)
20922090 btrfs_drop_and_free_fs_root(fs_info, gang[i]);
20932091 }
2094
-
2095
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
2096
- btrfs_free_log_root_tree(NULL, fs_info);
2097
- btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
2098
- }
20992092 }
21002093
21012094 static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
....@@ -2106,7 +2099,7 @@
21062099 atomic_set(&fs_info->scrubs_paused, 0);
21072100 atomic_set(&fs_info->scrub_cancel_req, 0);
21082101 init_waitqueue_head(&fs_info->scrub_pause_wait);
2109
- fs_info->scrub_workers_refcnt = 0;
2102
+ refcount_set(&fs_info->scrub_workers_refcnt, 0);
21102103 }
21112104
21122105 static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
....@@ -2134,13 +2127,12 @@
21342127 inode->i_mapping->a_ops = &btree_aops;
21352128
21362129 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
2137
- extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode);
2138
- BTRFS_I(inode)->io_tree.track_uptodate = 0;
2130
+ extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
2131
+ IO_TREE_BTREE_INODE_IO, inode);
2132
+ BTRFS_I(inode)->io_tree.track_uptodate = false;
21392133 extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
21402134
2141
- BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
2142
-
2143
- BTRFS_I(inode)->root = fs_info->tree_root;
2135
+ BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
21442136 memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
21452137 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
21462138 btrfs_insert_inode_hash(inode);
....@@ -2149,11 +2141,8 @@
21492141 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
21502142 {
21512143 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2152
- rwlock_init(&fs_info->dev_replace.lock);
2153
- atomic_set(&fs_info->dev_replace.read_locks, 0);
2154
- atomic_set(&fs_info->dev_replace.blocking_readers, 0);
2155
- init_waitqueue_head(&fs_info->replace_wait);
2156
- init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
2144
+ init_rwsem(&fs_info->dev_replace.rwsem);
2145
+ init_waitqueue_head(&fs_info->dev_replace.replace_wait);
21572146 }
21582147
21592148 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
....@@ -2161,7 +2150,6 @@
21612150 spin_lock_init(&fs_info->qgroup_lock);
21622151 mutex_init(&fs_info->qgroup_ioctl_lock);
21632152 fs_info->qgroup_tree = RB_ROOT;
2164
- fs_info->qgroup_op_tree = RB_ROOT;
21652153 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
21662154 fs_info->qgroup_seq = 1;
21672155 fs_info->qgroup_ulist = NULL;
....@@ -2190,16 +2178,6 @@
21902178 fs_info->caching_workers =
21912179 btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
21922180
2193
- /*
2194
- * a higher idle thresh on the submit workers makes it much more
2195
- * likely that bios will be send down in a sane order to the
2196
- * devices
2197
- */
2198
- fs_info->submit_workers =
2199
- btrfs_alloc_workqueue(fs_info, "submit", flags,
2200
- min_t(u64, fs_devices->num_devices,
2201
- max_active), 64);
2202
-
22032181 fs_info->fixup_workers =
22042182 btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
22052183
....@@ -2218,8 +2196,6 @@
22182196 fs_info->endio_raid56_workers =
22192197 btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
22202198 max_active, 4);
2221
- fs_info->endio_repair_workers =
2222
- btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
22232199 fs_info->rmw_workers =
22242200 btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
22252201 fs_info->endio_write_workers =
....@@ -2236,25 +2212,60 @@
22362212 max_active, 2);
22372213 fs_info->qgroup_rescan_workers =
22382214 btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
2239
- fs_info->extent_workers =
2240
- btrfs_alloc_workqueue(fs_info, "extent-refs", flags,
2241
- min_t(u64, fs_devices->num_devices,
2242
- max_active), 8);
2215
+ fs_info->discard_ctl.discard_workers =
2216
+ alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
22432217
22442218 if (!(fs_info->workers && fs_info->delalloc_workers &&
2245
- fs_info->submit_workers && fs_info->flush_workers &&
2219
+ fs_info->flush_workers &&
22462220 fs_info->endio_workers && fs_info->endio_meta_workers &&
22472221 fs_info->endio_meta_write_workers &&
2248
- fs_info->endio_repair_workers &&
22492222 fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
22502223 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
22512224 fs_info->caching_workers && fs_info->readahead_workers &&
22522225 fs_info->fixup_workers && fs_info->delayed_workers &&
2253
- fs_info->extent_workers &&
2254
- fs_info->qgroup_rescan_workers)) {
2226
+ fs_info->qgroup_rescan_workers &&
2227
+ fs_info->discard_ctl.discard_workers)) {
22552228 return -ENOMEM;
22562229 }
22572230
2231
+ return 0;
2232
+}
2233
+
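The discard worker above is allocated with the core alloc_workqueue() rather than btrfs_alloc_workqueue(), since it needs exactly one unbound, freezable worker and none of the btrfs thresholding. A minimal sketch of that allocation, with illustrative names:

#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *demo_discard_wq;	/* hypothetical */

static int demo_init_discard_wq(void)
{
	/*
	 * WQ_UNBOUND: work items are not pinned to the submitting CPU,
	 * which is fine for background maintenance. WQ_FREEZABLE: the
	 * queue drains before system suspend. max_active = 1 serializes
	 * all discard work on a single worker.
	 */
	demo_discard_wq = alloc_workqueue("demo_discard",
					  WQ_UNBOUND | WQ_FREEZABLE, 1);
	return demo_discard_wq ? 0 : -ENOMEM;
}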
2234
+static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
2235
+{
2236
+ struct crypto_shash *csum_shash;
2237
+ const char *csum_driver = btrfs_super_csum_driver(csum_type);
2238
+
2239
+ csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
2240
+
2241
+ if (IS_ERR(csum_shash)) {
2242
+ btrfs_err(fs_info, "error allocating %s hash for checksum",
2243
+ csum_driver);
2244
+ return PTR_ERR(csum_shash);
2245
+ }
2246
+
2247
+ fs_info->csum_shash = csum_shash;
2248
+
2249
+ /*
2250
+ * Check if the checksum implementation is a fast accelerated one.
2251
+ * As-is this is a bit of a hack and should be replaced once the csum
2252
+ * implementations provide that information themselves.
2253
+ */
2254
+ switch (csum_type) {
2255
+ case BTRFS_CSUM_TYPE_CRC32:
2256
+ if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
2257
+ set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
2258
+ break;
2259
+ case BTRFS_CSUM_TYPE_XXHASH:
2260
+ set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
2261
+ break;
2262
+ default:
2263
+ break;
2264
+ }
2265
+
2266
+ btrfs_info(fs_info, "using %s (%s) checksum algorithm",
2267
+ btrfs_super_csum_name(csum_type),
2268
+ crypto_shash_driver_name(csum_shash));
22582269 return 0;
22592270 }
22602271
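The shash handle cached in fs_info->csum_shash is later used for one-shot digests (see write_dev_supers() below). A minimal sketch of that usage, assuming the tfm was already allocated as above:

#include <crypto/hash.h>

/*
 * One-shot digest with a pre-allocated shash tfm. @out must be able to
 * hold crypto_shash_digestsize(tfm) bytes (4 for crc32c).
 */
static int demo_csum(struct crypto_shash *tfm, const u8 *data,
		     unsigned int len, u8 *out)
{
	SHASH_DESC_ON_STACK(desc, tfm);

	desc->tfm = tfm;
	return crypto_shash_digest(desc, data, len, out);
}

SHASH_DESC_ON_STACK() places the descriptor on the stack, so no per-checksum allocation is needed.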
....@@ -2272,11 +2283,10 @@
22722283 return -EIO;
22732284 }
22742285
2275
- log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2286
+ log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
2287
+ GFP_KERNEL);
22762288 if (!log_tree_root)
22772289 return -ENOMEM;
2278
-
2279
- __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
22802290
22812291 log_tree_root->node = read_tree_block(fs_info, bytenr,
22822292 fs_info->generation + 1,
....@@ -2284,12 +2294,12 @@
22842294 if (IS_ERR(log_tree_root->node)) {
22852295 btrfs_warn(fs_info, "failed to read log tree");
22862296 ret = PTR_ERR(log_tree_root->node);
2287
- kfree(log_tree_root);
2297
+ log_tree_root->node = NULL;
2298
+ btrfs_put_root(log_tree_root);
22882299 return ret;
22892300 } else if (!extent_buffer_uptodate(log_tree_root->node)) {
22902301 btrfs_err(fs_info, "failed to read log tree");
2291
- free_extent_buffer(log_tree_root->node);
2292
- kfree(log_tree_root);
2302
+ btrfs_put_root(log_tree_root);
22932303 return -EIO;
22942304 }
22952305 /* returns with log_tree_root freed on success */
....@@ -2297,8 +2307,7 @@
22972307 if (ret) {
22982308 btrfs_handle_fs_error(fs_info, ret,
22992309 "Failed to recover log tree");
2300
- free_extent_buffer(log_tree_root->node);
2301
- kfree(log_tree_root);
2310
+ btrfs_put_root(log_tree_root);
23022311 return ret;
23032312 }
23042313
....@@ -2350,6 +2359,19 @@
23502359 }
23512360 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
23522361 fs_info->csum_root = root;
2362
+
2363
+ /*
2364
+ * This tree can share blocks with some other fs tree during relocation
2365
+ * and we need a proper setup by btrfs_get_fs_root
2366
+ */
2367
+ root = btrfs_get_fs_root(tree_root->fs_info,
2368
+ BTRFS_DATA_RELOC_TREE_OBJECTID, true);
2369
+ if (IS_ERR(root)) {
2370
+ ret = PTR_ERR(root);
2371
+ goto out;
2372
+ }
2373
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2374
+ fs_info->data_reloc_root = root;
23532375
23542376 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
23552377 root = btrfs_read_tree_root(tree_root, &location);
....@@ -2474,10 +2496,26 @@
24742496 ret = -EINVAL;
24752497 }
24762498
2477
- if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
2499
+ if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
24782500 btrfs_err(fs_info,
2479
- "dev_item UUID does not match fsid: %pU != %pU",
2480
- fs_info->fsid, sb->dev_item.fsid);
2501
+ "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
2502
+ sb->fsid, fs_info->fs_devices->fsid);
2503
+ ret = -EINVAL;
2504
+ }
2505
+
2506
+ if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb),
2507
+ BTRFS_FSID_SIZE) != 0) {
2508
+ btrfs_err(fs_info,
2509
+"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
2510
+ btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid);
2511
+ ret = -EINVAL;
2512
+ }
2513
+
2514
+ if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
2515
+ BTRFS_FSID_SIZE) != 0) {
2516
+ btrfs_err(fs_info,
2517
+ "dev_item UUID does not match metadata fsid: %pU != %pU",
2518
+ fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
24812519 ret = -EINVAL;
24822520 }
24832521
....@@ -2572,7 +2610,7 @@
25722610 ret = validate_super(fs_info, sb, -1);
25732611 if (ret < 0)
25742612 goto out;
2575
- if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) {
2613
+ if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
25762614 ret = -EUCLEAN;
25772615 btrfs_err(fs_info, "invalid csum type, has %u want %u",
25782616 btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
....@@ -2593,61 +2631,103 @@
25932631 return ret;
25942632 }
25952633
2596
-int open_ctree(struct super_block *sb,
2597
- struct btrfs_fs_devices *fs_devices,
2598
- char *options)
2634
+static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
25992635 {
2600
- u32 sectorsize;
2601
- u32 nodesize;
2602
- u32 stripesize;
2603
- u64 generation;
2604
- u64 features;
2605
- struct btrfs_key location;
2606
- struct buffer_head *bh;
2607
- struct btrfs_super_block *disk_super;
2608
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
2609
- struct btrfs_root *tree_root;
2610
- struct btrfs_root *chunk_root;
2611
- int ret;
2612
- int err = -EINVAL;
2613
- int num_backups_tried = 0;
2614
- int backup_index = 0;
2615
- int clear_free_space_tree = 0;
2616
- int level;
2636
+ int backup_index = find_newest_super_backup(fs_info);
2637
+ struct btrfs_super_block *sb = fs_info->super_copy;
2638
+ struct btrfs_root *tree_root = fs_info->tree_root;
2639
+ bool handle_error = false;
2640
+ int ret = 0;
2641
+ int i;
26172642
2618
- tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2619
- chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2620
- if (!tree_root || !chunk_root) {
2621
- err = -ENOMEM;
2622
- goto fail;
2643
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2644
+ u64 generation;
2645
+ int level;
2646
+
2647
+ if (handle_error) {
2648
+ if (!IS_ERR(tree_root->node))
2649
+ free_extent_buffer(tree_root->node);
2650
+ tree_root->node = NULL;
2651
+
2652
+ if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
2653
+ break;
2654
+
2655
+ free_root_pointers(fs_info, 0);
2656
+
2657
+ /*
2658
+ * Don't use the log in recovery mode; it won't be
2659
+ * valid
2660
+ */
2661
+ btrfs_set_super_log_root(sb, 0);
2662
+
2663
+ /* We can't trust the free space cache either */
2664
+ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2665
+
2666
+ ret = read_backup_root(fs_info, i);
2667
+ backup_index = ret;
2668
+ if (ret < 0)
2669
+ return ret;
2670
+ }
2671
+ generation = btrfs_super_generation(sb);
2672
+ level = btrfs_super_root_level(sb);
2673
+ tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
2674
+ generation, level, NULL);
2675
+ if (IS_ERR(tree_root->node)) {
2676
+ handle_error = true;
2677
+ ret = PTR_ERR(tree_root->node);
2678
+ tree_root->node = NULL;
2679
+ btrfs_warn(fs_info, "couldn't read tree root");
2680
+ continue;
2681
+
2682
+ } else if (!extent_buffer_uptodate(tree_root->node)) {
2683
+ handle_error = true;
2684
+ ret = -EIO;
2685
+ btrfs_warn(fs_info, "error while reading tree root");
2686
+ continue;
2687
+ }
2688
+
2689
+ btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2690
+ tree_root->commit_root = btrfs_root_node(tree_root);
2691
+ btrfs_set_root_refs(&tree_root->root_item, 1);
2692
+
2693
+ /*
2694
+ * No need to hold btrfs_root::objectid_mutex since the fs
2695
+ * hasn't been fully initialised and we are the only user
2696
+ */
2697
+ ret = btrfs_find_highest_objectid(tree_root,
2698
+ &tree_root->highest_objectid);
2699
+ if (ret < 0) {
2700
+ handle_error = true;
2701
+ continue;
2702
+ }
2703
+
2704
+ ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
2705
+
2706
+ ret = btrfs_read_roots(fs_info);
2707
+ if (ret < 0) {
2708
+ handle_error = true;
2709
+ continue;
2710
+ }
2711
+
2712
+ /* All successful */
2713
+ fs_info->generation = generation;
2714
+ fs_info->last_trans_committed = generation;
2715
+
2716
+ /* Always begin writing backup roots after the one being used */
2717
+ if (backup_index < 0) {
2718
+ fs_info->backup_root_index = 0;
2719
+ } else {
2720
+ fs_info->backup_root_index = backup_index + 1;
2721
+ fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
2722
+ }
2723
+ break;
26232724 }
26242725
2625
- ret = init_srcu_struct(&fs_info->subvol_srcu);
2626
- if (ret) {
2627
- err = ret;
2628
- goto fail;
2629
- }
2726
+ return ret;
2727
+}
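init_tree_roots() replaces the old retry_root_backup goto loop with a bounded for loop: the superblock's tree root is tried first, and only with usebackuproot set does the loop walk backup priorities until one works or the ring is exhausted. The control flow reduces to roughly this sketch; all helpers below are hypothetical stand-ins:

#include <stdbool.h>

bool use_backup_root_opt(void);	/* hypothetical: mount option check */
int pick_backup(int priority);	/* hypothetical: read_backup_root() */
int try_read_roots(void);	/* hypothetical: read + validate roots */

static int read_roots_with_backups(void)
{
	bool handle_error = false;
	int ret = 0;
	int i;

	for (i = 0; i < 4 /* BTRFS_NUM_BACKUP_ROOTS */; i++) {
		if (handle_error) {
			if (!use_backup_root_opt())
				break;		/* give up on first failure */
			ret = pick_backup(i);
			if (ret < 0)
				return ret;	/* no usable backup left */
		}
		ret = try_read_roots();
		if (ret < 0) {
			handle_error = true;	/* retry with next backup */
			continue;
		}
		break;				/* all successful */
	}
	return ret;
}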
26302728
2631
- ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
2632
- if (ret) {
2633
- err = ret;
2634
- goto fail_srcu;
2635
- }
2636
- fs_info->dirty_metadata_batch = PAGE_SIZE *
2637
- (1 + ilog2(nr_cpu_ids));
2638
-
2639
- ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
2640
- if (ret) {
2641
- err = ret;
2642
- goto fail_dirty_metadata_bytes;
2643
- }
2644
-
2645
- ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
2646
- if (ret) {
2647
- err = ret;
2648
- goto fail_delalloc_bytes;
2649
- }
2650
-
2729
+void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
2730
+{
26512731 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
26522732 INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
26532733 INIT_LIST_HEAD(&fs_info->trans_list);
....@@ -2655,15 +2735,12 @@
26552735 INIT_LIST_HEAD(&fs_info->delayed_iputs);
26562736 INIT_LIST_HEAD(&fs_info->delalloc_roots);
26572737 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2658
- INIT_LIST_HEAD(&fs_info->pending_raid_kobjs);
2659
- spin_lock_init(&fs_info->pending_raid_kobjs_lock);
26602738 spin_lock_init(&fs_info->delalloc_root_lock);
26612739 spin_lock_init(&fs_info->trans_lock);
26622740 spin_lock_init(&fs_info->fs_roots_radix_lock);
26632741 spin_lock_init(&fs_info->delayed_iput_lock);
26642742 spin_lock_init(&fs_info->defrag_inodes_lock);
26652743 spin_lock_init(&fs_info->super_lock);
2666
- spin_lock_init(&fs_info->qgroup_op_lock);
26672744 spin_lock_init(&fs_info->buffer_lock);
26682745 spin_lock_init(&fs_info->unused_bgs_lock);
26692746 rwlock_init(&fs_info->tree_mod_log_lock);
....@@ -2671,14 +2748,18 @@
26712748 mutex_init(&fs_info->delete_unused_bgs_mutex);
26722749 mutex_init(&fs_info->reloc_mutex);
26732750 mutex_init(&fs_info->delalloc_root_mutex);
2674
- mutex_init(&fs_info->cleaner_delayed_iput_mutex);
26752751 seqlock_init(&fs_info->profiles_lock);
26762752
26772753 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
26782754 INIT_LIST_HEAD(&fs_info->space_info);
26792755 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
26802756 INIT_LIST_HEAD(&fs_info->unused_bgs);
2681
- btrfs_mapping_init(&fs_info->mapping_tree);
2757
+#ifdef CONFIG_BTRFS_DEBUG
2758
+ INIT_LIST_HEAD(&fs_info->allocated_roots);
2759
+ INIT_LIST_HEAD(&fs_info->allocated_ebs);
2760
+ spin_lock_init(&fs_info->eb_leak_lock);
2761
+#endif
2762
+ extent_map_tree_init(&fs_info->mapping_tree);
26822763 btrfs_init_block_rsv(&fs_info->global_block_rsv,
26832764 BTRFS_BLOCK_RSV_GLOBAL);
26842765 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
....@@ -2686,12 +2767,14 @@
26862767 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
26872768 btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
26882769 BTRFS_BLOCK_RSV_DELOPS);
2770
+ btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
2771
+ BTRFS_BLOCK_RSV_DELREFS);
2772
+
26892773 atomic_set(&fs_info->async_delalloc_pages, 0);
26902774 atomic_set(&fs_info->defrag_running, 0);
2691
- atomic_set(&fs_info->qgroup_op_seq, 0);
26922775 atomic_set(&fs_info->reada_works_cnt, 0);
2776
+ atomic_set(&fs_info->nr_delayed_iputs, 0);
26932777 atomic64_set(&fs_info->tree_mod_seq, 0);
2694
- fs_info->sb = sb;
26952778 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
26962779 fs_info->metadata_ratio = 0;
26972780 fs_info->defrag_inodes = RB_ROOT;
....@@ -2710,40 +2793,19 @@
27102793 INIT_LIST_HEAD(&fs_info->ordered_roots);
27112794 spin_lock_init(&fs_info->ordered_root_lock);
27122795
2713
- fs_info->btree_inode = new_inode(sb);
2714
- if (!fs_info->btree_inode) {
2715
- err = -ENOMEM;
2716
- goto fail_bio_counter;
2717
- }
2718
- mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2719
-
2720
- fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2721
- GFP_KERNEL);
2722
- if (!fs_info->delayed_root) {
2723
- err = -ENOMEM;
2724
- goto fail_iput;
2725
- }
2726
- btrfs_init_delayed_root(fs_info->delayed_root);
2727
-
27282796 btrfs_init_scrub(fs_info);
27292797 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
27302798 fs_info->check_integrity_print_mask = 0;
27312799 #endif
27322800 btrfs_init_balance(fs_info);
2733
- btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
2734
-
2735
- sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
2736
- sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
2737
-
2738
- btrfs_init_btree_inode(fs_info);
2801
+ btrfs_init_async_reclaim_work(fs_info);
27392802
27402803 spin_lock_init(&fs_info->block_group_cache_lock);
27412804 fs_info->block_group_cache_tree = RB_ROOT;
27422805 fs_info->first_logical_byte = (u64)-1;
27432806
2744
- extent_io_tree_init(&fs_info->freed_extents[0], NULL);
2745
- extent_io_tree_init(&fs_info->freed_extents[1], NULL);
2746
- fs_info->pinned_extents = &fs_info->freed_extents[0];
2807
+ extent_io_tree_init(fs_info, &fs_info->excluded_extents,
2808
+ IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
27472809 set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
27482810
27492811 mutex_init(&fs_info->ordered_operations_mutex);
....@@ -2759,6 +2821,7 @@
27592821
27602822 btrfs_init_dev_replace_locks(fs_info);
27612823 btrfs_init_qgroup(fs_info);
2824
+ btrfs_discard_init(fs_info);
27622825
27632826 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
27642827 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
....@@ -2767,30 +2830,165 @@
27672830 init_waitqueue_head(&fs_info->transaction_wait);
27682831 init_waitqueue_head(&fs_info->transaction_blocked_wait);
27692832 init_waitqueue_head(&fs_info->async_submit_wait);
2770
-
2771
- INIT_LIST_HEAD(&fs_info->pinned_chunks);
2833
+ init_waitqueue_head(&fs_info->delayed_iputs_wait);
27722834
27732835 /* Usable values until the real ones are cached from the superblock */
27742836 fs_info->nodesize = 4096;
27752837 fs_info->sectorsize = 4096;
27762838 fs_info->stripesize = 4096;
27772839
2778
- ret = btrfs_alloc_stripe_hash_table(fs_info);
2779
- if (ret) {
2780
- err = ret;
2781
- goto fail_alloc;
2840
+ spin_lock_init(&fs_info->swapfile_pins_lock);
2841
+ fs_info->swapfile_pins = RB_ROOT;
2842
+
2843
+ fs_info->send_in_progress = 0;
2844
+}
2845
+
2846
+static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
2847
+{
2848
+ int ret;
2849
+
2850
+ fs_info->sb = sb;
2851
+ sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
2852
+ sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
2853
+
2854
+ ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
2855
+ if (ret)
2856
+ return ret;
2857
+
2858
+ ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
2859
+ if (ret)
2860
+ return ret;
2861
+
2862
+ fs_info->dirty_metadata_batch = PAGE_SIZE *
2863
+ (1 + ilog2(nr_cpu_ids));
2864
+
2865
+ ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
2866
+ if (ret)
2867
+ return ret;
2868
+
2869
+ ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
2870
+ GFP_KERNEL);
2871
+ if (ret)
2872
+ return ret;
2873
+
2874
+ fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2875
+ GFP_KERNEL);
2876
+ if (!fs_info->delayed_root)
2877
+ return -ENOMEM;
2878
+ btrfs_init_delayed_root(fs_info->delayed_root);
2879
+
2880
+ return btrfs_alloc_stripe_hash_table(fs_info);
2881
+}
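init_mount_fs_info() may return early with only some counters initialized. That is safe because fs_info comes from a zeroing allocator and percpu_counter_destroy() is a no-op when a counter's ->counters pointer is still NULL, so the teardown side can destroy everything unconditionally. A small sketch of the pairing, with a hypothetical struct:

#include <linux/percpu_counter.h>

struct demo_info {			/* hypothetical, from kzalloc() */
	struct percpu_counter a;
	struct percpu_counter b;
};

static int demo_init(struct demo_info *info)
{
	int ret;

	ret = percpu_counter_init(&info->a, 0, GFP_KERNEL);
	if (ret)
		return ret;		/* b stays all-zero */
	return percpu_counter_init(&info->b, 0, GFP_KERNEL);
}

static void demo_free(struct demo_info *info)
{
	/* Safe even after a partial init: destroying a never-initialized
	 * (zeroed) percpu_counter is a no-op. */
	percpu_counter_destroy(&info->a);
	percpu_counter_destroy(&info->b);
}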
2882
+
2883
+static int btrfs_uuid_rescan_kthread(void *data)
2884
+{
2885
+ struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
2886
+ int ret;
2887
+
2888
+ /*
2889
+ * 1st step is to iterate through the existing UUID tree and
2890
+ * to delete all entries that contain outdated data.
2891
+ * 2nd step is to add all missing entries to the UUID tree.
2892
+ */
2893
+ ret = btrfs_uuid_tree_iterate(fs_info);
2894
+ if (ret < 0) {
2895
+ if (ret != -EINTR)
2896
+ btrfs_warn(fs_info, "iterating uuid_tree failed %d",
2897
+ ret);
2898
+ up(&fs_info->uuid_tree_rescan_sem);
2899
+ return ret;
2900
+ }
2901
+ return btrfs_uuid_scan_kthread(data);
2902
+}
2903
+
2904
+static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
2905
+{
2906
+ struct task_struct *task;
2907
+
2908
+ down(&fs_info->uuid_tree_rescan_sem);
2909
+ task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
2910
+ if (IS_ERR(task)) {
2911
+ /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
2912
+ btrfs_warn(fs_info, "failed to start uuid_rescan task");
2913
+ up(&fs_info->uuid_tree_rescan_sem);
2914
+ return PTR_ERR(task);
27822915 }
27832916
2784
- __setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
2917
+ return 0;
2918
+}
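The rescan thread is serialized against unmount with a plain semaphore: down() before kthread_run(), up() on every exit path of the thread (including the failure branch above). A minimal sketch of the gate, with hypothetical names:

#include <linux/kthread.h>
#include <linux/semaphore.h>
#include <linux/err.h>

static DEFINE_SEMAPHORE(demo_rescan_sem);	/* hypothetical gate */

static int demo_rescan_fn(void *data)
{
	/* ... long-running scan ... */
	up(&demo_rescan_sem);	/* release on every exit path */
	return 0;
}

static int demo_start_rescan(void *data)
{
	struct task_struct *task;

	down(&demo_rescan_sem);	/* taken here, released by the thread */
	task = kthread_run(demo_rescan_fn, data, "demo-rescan");
	if (IS_ERR(task)) {
		up(&demo_rescan_sem);	/* thread never ran, undo the down */
		return PTR_ERR(task);
	}
	return 0;
}

The unmount side can then do the same down()/up() pair to wait out a scan in flight.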
2919
+
2920
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
2921
+ char *options)
2922
+{
2923
+ u32 sectorsize;
2924
+ u32 nodesize;
2925
+ u32 stripesize;
2926
+ u64 generation;
2927
+ u64 features;
2928
+ u16 csum_type;
2929
+ struct btrfs_super_block *disk_super;
2930
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
2931
+ struct btrfs_root *tree_root;
2932
+ struct btrfs_root *chunk_root;
2933
+ int ret;
2934
+ int err = -EINVAL;
2935
+ int clear_free_space_tree = 0;
2936
+ int level;
2937
+
2938
+ ret = init_mount_fs_info(fs_info, sb);
2939
+ if (ret) {
2940
+ err = ret;
2941
+ goto fail;
2942
+ }
2943
+
2944
+ /* These need to be init'ed before we start creating inodes and such. */
2945
+ tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
2946
+ GFP_KERNEL);
2947
+ fs_info->tree_root = tree_root;
2948
+ chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
2949
+ GFP_KERNEL);
2950
+ fs_info->chunk_root = chunk_root;
2951
+ if (!tree_root || !chunk_root) {
2952
+ err = -ENOMEM;
2953
+ goto fail;
2954
+ }
2955
+
2956
+ fs_info->btree_inode = new_inode(sb);
2957
+ if (!fs_info->btree_inode) {
2958
+ err = -ENOMEM;
2959
+ goto fail;
2960
+ }
2961
+ mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2962
+ btrfs_init_btree_inode(fs_info);
27852963
27862964 invalidate_bdev(fs_devices->latest_bdev);
27872965
27882966 /*
27892967 * Read super block and check the signature bytes only
27902968 */
2791
- bh = btrfs_read_dev_super(fs_devices->latest_bdev);
2792
- if (IS_ERR(bh)) {
2793
- err = PTR_ERR(bh);
2969
+ disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
2970
+ if (IS_ERR(disk_super)) {
2971
+ err = PTR_ERR(disk_super);
2972
+ goto fail_alloc;
2973
+ }
2974
+
2975
+ /*
2976
+ * Verify the csum type first; if it or the checksum value is
2977
+ * corrupted, we'll find out
2978
+ */
2979
+ csum_type = btrfs_super_csum_type(disk_super);
2980
+ if (!btrfs_supported_super_csum(csum_type)) {
2981
+ btrfs_err(fs_info, "unsupported checksum algorithm: %u",
2982
+ csum_type);
2983
+ err = -EINVAL;
2984
+ btrfs_release_disk_super(disk_super);
2985
+ goto fail_alloc;
2986
+ }
2987
+
2988
+ ret = btrfs_init_csum_hash(fs_info, csum_type);
2989
+ if (ret) {
2990
+ err = ret;
2991
+ btrfs_release_disk_super(disk_super);
27942992 goto fail_alloc;
27952993 }
27962994
....@@ -2798,10 +2996,10 @@
27982996 * We want to check superblock checksum, the type is stored inside.
27992997 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
28002998 */
2801
- if (btrfs_check_super_csum(fs_info, bh->b_data)) {
2999
+ if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
28023000 btrfs_err(fs_info, "superblock checksum mismatch");
28033001 err = -EINVAL;
2804
- brelse(bh);
3002
+ btrfs_release_disk_super(disk_super);
28053003 goto fail_alloc;
28063004 }
28073005
....@@ -2810,12 +3008,22 @@
28103008 * following bytes up to INFO_SIZE, the checksum is calculated from
28113009 * the whole block of INFO_SIZE
28123010 */
2813
- memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
3011
+ memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
3012
+ btrfs_release_disk_super(disk_super);
3013
+
3014
+ disk_super = fs_info->super_copy;
3015
+
3016
+
3017
+ features = btrfs_super_flags(disk_super);
3018
+ if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
3019
+ features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
3020
+ btrfs_set_super_flags(disk_super, features);
3021
+ btrfs_info(fs_info,
3022
+ "found metadata UUID change in progress flag, clearing");
3023
+ }
3024
+
28143025 memcpy(fs_info->super_for_commit, fs_info->super_copy,
28153026 sizeof(*fs_info->super_for_commit));
2816
- brelse(bh);
2817
-
2818
- memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
28193027
28203028 ret = btrfs_validate_mount_super(fs_info);
28213029 if (ret) {
....@@ -2824,7 +3032,6 @@
28243032 goto fail_alloc;
28253033 }
28263034
2827
- disk_super = fs_info->super_copy;
28283035 if (!btrfs_super_root(disk_super))
28293036 goto fail_alloc;
28303037
....@@ -2833,17 +3040,33 @@
28333040 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
28343041
28353042 /*
2836
- * run through our array of backup supers and setup
2837
- * our ring pointer to the oldest one
2838
- */
2839
- generation = btrfs_super_generation(disk_super);
2840
- find_oldest_super_backup(fs_info, generation);
2841
-
2842
- /*
28433043 * In the long term, we'll store the compression type in the super
28443044 * block, and it'll be used for per file compression control.
28453045 */
28463046 fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
3047
+
3048
+ /*
3049
+ * Flag our filesystem as having big metadata blocks if they are bigger
3050
+ * than the page size
3051
+ */
3052
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
3053
+ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
3054
+ btrfs_info(fs_info,
3055
+ "flagging fs with big metadata feature");
3056
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
3057
+ }
3058
+
3059
+ /* Set up fs_info before parsing mount options */
3060
+ nodesize = btrfs_super_nodesize(disk_super);
3061
+ sectorsize = btrfs_super_sectorsize(disk_super);
3062
+ stripesize = sectorsize;
3063
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
3064
+ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
3065
+
3066
+ /* Cache block sizes */
3067
+ fs_info->nodesize = nodesize;
3068
+ fs_info->sectorsize = sectorsize;
3069
+ fs_info->stripesize = stripesize;
28473070
28483071 ret = btrfs_parse_options(fs_info, options, sb->s_flags);
28493072 if (ret) {
....@@ -2855,7 +3078,7 @@
28553078 ~BTRFS_FEATURE_INCOMPAT_SUPP;
28563079 if (features) {
28573080 btrfs_err(fs_info,
2858
- "cannot mount because of unsupported optional features (%llx)",
3081
+ "cannot mount because of unsupported optional features (0x%llx)",
28593082 features);
28603083 err = -EINVAL;
28613084 goto fail_alloc;
....@@ -2870,28 +3093,6 @@
28703093
28713094 if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
28723095 btrfs_info(fs_info, "has skinny extents");
2873
-
2874
- /*
2875
- * flag our filesystem as having big metadata blocks if
2876
- * they are bigger than the page size
2877
- */
2878
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
2879
- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
2880
- btrfs_info(fs_info,
2881
- "flagging fs with big metadata feature");
2882
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
2883
- }
2884
-
2885
- nodesize = btrfs_super_nodesize(disk_super);
2886
- sectorsize = btrfs_super_sectorsize(disk_super);
2887
- stripesize = sectorsize;
2888
- fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
2889
- fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
2890
-
2891
- /* Cache block sizes */
2892
- fs_info->nodesize = nodesize;
2893
- fs_info->sectorsize = sectorsize;
2894
- fs_info->stripesize = stripesize;
28953096
28963097 /*
28973098 * mixed block groups end up with duplicate but slightly offset
....@@ -2915,11 +3116,25 @@
29153116 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
29163117 if (!sb_rdonly(sb) && features) {
29173118 btrfs_err(fs_info,
2918
- "cannot mount read-write because of unsupported optional features (%llx)",
3119
+ "cannot mount read-write because of unsupported optional features (0x%llx)",
29193120 features);
29203121 err = -EINVAL;
29213122 goto fail_alloc;
29223123 }
3124
+ /*
3125
+ * We have unsupported RO compat features, although RO mounted, we
3126
+ * should not cause any metadata write, including log replay.
3127
+ * Or we could screw up whatever the new feature requires.
3128
+ */
3129
+ if (unlikely(features && btrfs_super_log_root(disk_super) &&
3130
+ !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
3131
+ btrfs_err(fs_info,
3132
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
3133
+ features);
3134
+ err = -EINVAL;
3135
+ goto fail_alloc;
3136
+ }
3137
+
29233138
29243139 ret = btrfs_init_workqueues(fs_info, fs_devices);
29253140 if (ret) {
....@@ -2927,16 +3142,12 @@
29273142 goto fail_sb_buffer;
29283143 }
29293144
2930
- sb->s_bdi->congested_fn = btrfs_congested_fn;
2931
- sb->s_bdi->congested_data = fs_info;
2932
- sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
2933
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
29343145 sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
29353146 sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
29363147
29373148 sb->s_blocksize = sectorsize;
29383149 sb->s_blocksize_bits = blksize_bits(sectorsize);
2939
- memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE);
3150
+ memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
29403151
29413152 mutex_lock(&fs_info->chunk_mutex);
29423153 ret = btrfs_read_sys_array(fs_info);
....@@ -2948,8 +3159,6 @@
29483159
29493160 generation = btrfs_super_chunk_root_generation(disk_super);
29503161 level = btrfs_super_chunk_root_level(disk_super);
2951
-
2952
- __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
29533162
29543163 chunk_root->node = read_tree_block(fs_info,
29553164 btrfs_super_chunk_root(disk_super),
....@@ -2966,7 +3175,8 @@
29663175 chunk_root->commit_root = btrfs_root_node(chunk_root);
29673176
29683177 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
2969
- btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
3178
+ offsetof(struct btrfs_header, chunk_tree_uuid),
3179
+ BTRFS_UUID_SIZE);
29703180
29713181 ret = btrfs_read_chunk_tree(fs_info);
29723182 if (ret) {
....@@ -2985,44 +3195,9 @@
29853195 goto fail_tree_roots;
29863196 }
29873197
2988
-retry_root_backup:
2989
- generation = btrfs_super_generation(disk_super);
2990
- level = btrfs_super_root_level(disk_super);
2991
-
2992
- tree_root->node = read_tree_block(fs_info,
2993
- btrfs_super_root(disk_super),
2994
- generation, level, NULL);
2995
- if (IS_ERR(tree_root->node) ||
2996
- !extent_buffer_uptodate(tree_root->node)) {
2997
- btrfs_warn(fs_info, "failed to read tree root");
2998
- if (!IS_ERR(tree_root->node))
2999
- free_extent_buffer(tree_root->node);
3000
- tree_root->node = NULL;
3001
- goto recovery_tree_root;
3002
- }
3003
-
3004
- btrfs_set_root_node(&tree_root->root_item, tree_root->node);
3005
- tree_root->commit_root = btrfs_root_node(tree_root);
3006
- btrfs_set_root_refs(&tree_root->root_item, 1);
3007
-
3008
- mutex_lock(&tree_root->objectid_mutex);
3009
- ret = btrfs_find_highest_objectid(tree_root,
3010
- &tree_root->highest_objectid);
3011
- if (ret) {
3012
- mutex_unlock(&tree_root->objectid_mutex);
3013
- goto recovery_tree_root;
3014
- }
3015
-
3016
- ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
3017
-
3018
- mutex_unlock(&tree_root->objectid_mutex);
3019
-
3020
- ret = btrfs_read_roots(fs_info);
3198
+ ret = init_tree_roots(fs_info);
30213199 if (ret)
3022
- goto recovery_tree_root;
3023
-
3024
- fs_info->generation = generation;
3025
- fs_info->last_trans_committed = generation;
3200
+ goto fail_tree_roots;
30263201
30273202 /*
30283203 * If we have a uuid root and we're not being told to rescan we need to
....@@ -3063,18 +3238,11 @@
30633238
30643239 btrfs_free_extra_devids(fs_devices, 1);
30653240
3066
- ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
3241
+ ret = btrfs_sysfs_add_fsid(fs_devices);
30673242 if (ret) {
30683243 btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
30693244 ret);
30703245 goto fail_block_groups;
3071
- }
3072
-
3073
- ret = btrfs_sysfs_add_device(fs_devices);
3074
- if (ret) {
3075
- btrfs_err(fs_info, "failed to init sysfs device interface: %d",
3076
- ret);
3077
- goto fail_fsdev_sysfs;
30783246 }
30793247
30803248 ret = btrfs_sysfs_add_mounted(fs_info);
....@@ -3098,7 +3266,7 @@
30983266 if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
30993267 !btrfs_check_rw_degradable(fs_info, NULL)) {
31003268 btrfs_warn(fs_info,
3101
- "writeable mount is not allowed due to too many missing devices");
3269
+ "writable mount is not allowed due to too many missing devices");
31023270 goto fail_sysfs;
31033271 }
31043272
....@@ -3175,11 +3343,7 @@
31753343 }
31763344 }
31773345
3178
- location.objectid = BTRFS_FS_TREE_OBJECTID;
3179
- location.type = BTRFS_ROOT_ITEM_KEY;
3180
- location.offset = 0;
3181
-
3182
- fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
3346
+ fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
31833347 if (IS_ERR(fs_info->fs_root)) {
31843348 err = PTR_ERR(fs_info->fs_root);
31853349 btrfs_warn(fs_info, "failed to read fs tree: %d", err);
....@@ -3246,6 +3410,7 @@
32463410 }
32473411
32483412 btrfs_qgroup_rescan_resume(fs_info);
3413
+ btrfs_discard_resume(fs_info);
32493414
32503415 if (!fs_info->uuid_root) {
32513416 btrfs_info(fs_info, "creating UUID tree");
....@@ -3303,6 +3468,8 @@
33033468 btrfs_put_block_group_cache(fs_info);
33043469
33053470 fail_tree_roots:
3471
+ if (fs_info->data_reloc_root)
3472
+ btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
33063473 free_root_pointers(fs_info, true);
33073474 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
33083475
....@@ -3310,103 +3477,81 @@
33103477 btrfs_stop_all_workers(fs_info);
33113478 btrfs_free_block_groups(fs_info);
33123479 fail_alloc:
3313
-fail_iput:
33143480 btrfs_mapping_tree_free(&fs_info->mapping_tree);
33153481
33163482 iput(fs_info->btree_inode);
3317
-fail_bio_counter:
3318
- percpu_counter_destroy(&fs_info->bio_counter);
3319
-fail_delalloc_bytes:
3320
- percpu_counter_destroy(&fs_info->delalloc_bytes);
3321
-fail_dirty_metadata_bytes:
3322
- percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3323
-fail_srcu:
3324
- cleanup_srcu_struct(&fs_info->subvol_srcu);
33253483 fail:
3326
- btrfs_free_stripe_hash_table(fs_info);
33273484 btrfs_close_devices(fs_info->fs_devices);
33283485 return err;
3329
-
3330
-recovery_tree_root:
3331
- if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
3332
- goto fail_tree_roots;
3333
-
3334
- free_root_pointers(fs_info, false);
3335
-
3336
- /* don't use the log in recovery mode, it won't be valid */
3337
- btrfs_set_super_log_root(disk_super, 0);
3338
-
3339
- /* we can't trust the free space cache either */
3340
- btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
3341
-
3342
- ret = next_root_backup(fs_info, fs_info->super_copy,
3343
- &num_backups_tried, &backup_index);
3344
- if (ret == -1)
3345
- goto fail_block_groups;
3346
- goto retry_root_backup;
33473486 }
33483487 ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
33493488
3350
-static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
3489
+static void btrfs_end_super_write(struct bio *bio)
33513490 {
3352
- if (uptodate) {
3353
- set_buffer_uptodate(bh);
3354
- } else {
3355
- struct btrfs_device *device = (struct btrfs_device *)
3356
- bh->b_private;
3491
+ struct btrfs_device *device = bio->bi_private;
3492
+ struct bio_vec *bvec;
3493
+ struct bvec_iter_all iter_all;
3494
+ struct page *page;
33573495
3358
- btrfs_warn_rl_in_rcu(device->fs_info,
3359
- "lost page write due to IO error on %s",
3360
- rcu_str_deref(device->name));
3361
- /* note, we don't set_buffer_write_io_error because we have
3362
- * our own ways of dealing with the IO errors
3363
- */
3364
- clear_buffer_uptodate(bh);
3365
- btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
3496
+ bio_for_each_segment_all(bvec, bio, iter_all) {
3497
+ page = bvec->bv_page;
3498
+
3499
+ if (bio->bi_status) {
3500
+ btrfs_warn_rl_in_rcu(device->fs_info,
3501
+ "lost page write due to IO error on %s (%d)",
3502
+ rcu_str_deref(device->name),
3503
+ blk_status_to_errno(bio->bi_status));
3504
+ ClearPageUptodate(page);
3505
+ SetPageError(page);
3506
+ btrfs_dev_stat_inc_and_print(device,
3507
+ BTRFS_DEV_STAT_WRITE_ERRS);
3508
+ } else {
3509
+ SetPageUptodate(page);
3510
+ }
3511
+
3512
+ put_page(page);
3513
+ unlock_page(page);
33663514 }
3367
- unlock_buffer(bh);
3368
- put_bh(bh);
3515
+
3516
+ bio_put(bio);
33693517 }
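A completion handler that must touch every page of a bio walks it with bio_for_each_segment_all(); the walk is only valid once the bio has completed. A pared-down sketch of the same shape, with a hypothetical handler name:

#include <linux/bio.h>
#include <linux/pagemap.h>

static void demo_end_io(struct bio *bio)	/* hypothetical */
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;

		/* bi_status is the error for the whole bio, so every page
		 * gets the same verdict. */
		if (bio->bi_status)
			SetPageError(page);
		else
			SetPageUptodate(page);
		unlock_page(page);
	}
	bio_put(bio);	/* handler owns the final bio reference */
}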
33703518
3371
-int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
3372
- struct buffer_head **bh_ret)
3519
+struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
3520
+ int copy_num)
33733521 {
3374
- struct buffer_head *bh;
33753522 struct btrfs_super_block *super;
3523
+ struct page *page;
33763524 u64 bytenr;
3525
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
33773526
33783527 bytenr = btrfs_sb_offset(copy_num);
33793528 if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
3380
- return -EINVAL;
3529
+ return ERR_PTR(-EINVAL);
33813530
3382
- bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
3383
- /*
3384
- * If we fail to read from the underlying devices, as of now
3385
- * the best option we have is to mark it EIO.
3386
- */
3387
- if (!bh)
3388
- return -EIO;
3531
+ page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
3532
+ if (IS_ERR(page))
3533
+ return ERR_CAST(page);
33893534
3390
- super = (struct btrfs_super_block *)bh->b_data;
3391
- if (btrfs_super_bytenr(super) != bytenr ||
3392
- btrfs_super_magic(super) != BTRFS_MAGIC) {
3393
- brelse(bh);
3394
- return -EINVAL;
3535
+ super = page_address(page);
3536
+ if (btrfs_super_magic(super) != BTRFS_MAGIC) {
3537
+ btrfs_release_disk_super(super);
3538
+ return ERR_PTR(-ENODATA);
33953539 }
33963540
3397
- *bh_ret = bh;
3398
- return 0;
3541
+ if (btrfs_super_bytenr(super) != bytenr) {
3542
+ btrfs_release_disk_super(super);
3543
+ return ERR_PTR(-EINVAL);
3544
+ }
3545
+
3546
+ return super;
33993547 }
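Super block reads now go through the block device's page cache instead of buffer heads. A hedged sketch of the core read path, assuming the offset and size fit in one directly-addressable page (no highmem):

#include <linux/blkdev.h>
#include <linux/pagemap.h>
#include <linux/err.h>

/*
 * Map one cached page of @bdev at byte offset @bytenr and return a pointer
 * into it. The caller drops the reference (put_page() on the containing
 * page) when done.
 */
static void *demo_read_bdev_super(struct block_device *bdev, u64 bytenr)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;
	struct page *page;

	/* Reads the page from disk if it is not already cached uptodate. */
	page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
	if (IS_ERR(page))
		return ERR_CAST(page);

	return page_address(page) + offset_in_page(bytenr);
}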
34003548
34013549
3402
-struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
3550
+struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
34033551 {
3404
- struct buffer_head *bh;
3405
- struct buffer_head *latest = NULL;
3406
- struct btrfs_super_block *super;
3552
+ struct btrfs_super_block *super, *latest = NULL;
34073553 int i;
34083554 u64 transid = 0;
3409
- int ret = -EINVAL;
34103555
34113556 /* we would like to check all the supers, but that would make
34123557 * a btrfs mount succeed after a mkfs from a different FS.
....@@ -3414,52 +3559,52 @@
34143559 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
34153560 */
34163561 for (i = 0; i < 1; i++) {
3417
- ret = btrfs_read_dev_one_super(bdev, i, &bh);
3418
- if (ret)
3562
+ super = btrfs_read_dev_one_super(bdev, i);
3563
+ if (IS_ERR(super))
34193564 continue;
34203565
3421
- super = (struct btrfs_super_block *)bh->b_data;
3422
-
34233566 if (!latest || btrfs_super_generation(super) > transid) {
3424
- brelse(latest);
3425
- latest = bh;
3567
+ if (latest)
3568
+ btrfs_release_disk_super(super);
3569
+
3570
+ latest = super;
34263571 transid = btrfs_super_generation(super);
3427
- } else {
3428
- brelse(bh);
34293572 }
34303573 }
34313574
3432
- if (!latest)
3433
- return ERR_PTR(ret);
3434
-
3435
- return latest;
3575
+ return super;
34363576 }
34373577
34383578 /*
34393579 * Write superblock @sb to the @device. Do not wait for completion, all the
3440
- * buffer heads we write are pinned.
3580
+ * pages we use for writing are locked.
34413581 *
34423582 * Write @max_mirrors copies of the superblock, where 0 means default that fit
34433583 * the expected device size at commit time. Note that max_mirrors must be
34443584 * same for write and wait phases.
34453585 *
3446
- * Return number of errors when buffer head is not found or submission fails.
3586
+ * Return number of errors when page is not found or submission fails.
34473587 */
34483588 static int write_dev_supers(struct btrfs_device *device,
34493589 struct btrfs_super_block *sb, int max_mirrors)
34503590 {
3451
- struct buffer_head *bh;
3591
+ struct btrfs_fs_info *fs_info = device->fs_info;
3592
+ struct address_space *mapping = device->bdev->bd_inode->i_mapping;
3593
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
34523594 int i;
3453
- int ret;
34543595 int errors = 0;
3455
- u32 crc;
34563596 u64 bytenr;
3457
- int op_flags;
34583597
34593598 if (max_mirrors == 0)
34603599 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
34613600
3601
+ shash->tfm = fs_info->csum_shash;
3602
+
34623603 for (i = 0; i < max_mirrors; i++) {
3604
+ struct page *page;
3605
+ struct bio *bio;
3606
+ struct btrfs_super_block *disk_super;
3607
+
34633608 bytenr = btrfs_sb_offset(i);
34643609 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
34653610 device->commit_total_bytes)
....@@ -3467,42 +3612,49 @@
34673612
34683613 btrfs_set_super_bytenr(sb, bytenr);
34693614
3470
- crc = ~(u32)0;
3471
- crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc,
3472
- BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
3473
- btrfs_csum_final(crc, sb->csum);
3615
+ crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
3616
+ BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
3617
+ sb->csum);
34743618
3475
- /* One reference for us, and we leave it for the caller */
3476
- bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
3477
- BTRFS_SUPER_INFO_SIZE);
3478
- if (!bh) {
3619
+ page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
3620
+ GFP_NOFS);
3621
+ if (!page) {
34793622 btrfs_err(device->fs_info,
3480
- "couldn't get super buffer head for bytenr %llu",
3623
+ "couldn't get super block page for bytenr %llu",
34813624 bytenr);
34823625 errors++;
34833626 continue;
34843627 }
34853628
3486
- memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
3629
+ /* Bump the refcount for wait_dev_supers() */
3630
+ get_page(page);
34873631
3488
- /* one reference for submit_bh */
3489
- get_bh(bh);
3490
-
3491
- set_buffer_uptodate(bh);
3492
- lock_buffer(bh);
3493
- bh->b_end_io = btrfs_end_buffer_write_sync;
3494
- bh->b_private = device;
3632
+ disk_super = page_address(page);
3633
+ memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
34953634
34963635 /*
3497
- * we fua the first super. The others we allow
3498
- * to go down lazy.
3636
+ * Directly use bios here instead of relying on the page cache
3637
+ * to do I/O, so we don't lose the ability to do integrity
3638
+ * checking.
34993639 */
3500
- op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
3640
+ bio = bio_alloc(GFP_NOFS, 1);
3641
+ bio_set_dev(bio, device->bdev);
3642
+ bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
3643
+ bio->bi_private = device;
3644
+ bio->bi_end_io = btrfs_end_super_write;
3645
+ __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
3646
+ offset_in_page(bytenr));
3647
+
3648
+ /*
3649
+ * We FUA only the first super block. The others we allow to
3650
+ * go down lazily, and there's a short window where the on-disk
3651
+ * copies might still contain the older version.
3652
+ */
3653
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
35013654 if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
3502
- op_flags |= REQ_FUA;
3503
- ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
3504
- if (ret)
3505
- errors++;
3655
+ bio->bi_opf |= REQ_FUA;
3656
+
3657
+ btrfsic_submit_bio(bio);
35063658 }
35073659 return errors < i ? 0 : -1;
35083660 }
....@@ -3511,12 +3663,11 @@
35113663 * Wait for write completion of superblocks done by write_dev_supers,
35123664 * @max_mirrors same for write and wait phases.
35133665 *
3514
- * Return number of errors when buffer head is not found or not marked up to
3666
+ * Return number of errors when page is not found or not marked up to
35153667 * date.
35163668 */
35173669 static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
35183670 {
3519
- struct buffer_head *bh;
35203671 int i;
35213672 int errors = 0;
35223673 bool primary_failed = false;
....@@ -3526,32 +3677,34 @@
35263677 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
35273678
35283679 for (i = 0; i < max_mirrors; i++) {
3680
+ struct page *page;
3681
+
35293682 bytenr = btrfs_sb_offset(i);
35303683 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
35313684 device->commit_total_bytes)
35323685 break;
35333686
3534
- bh = __find_get_block(device->bdev,
3535
- bytenr / BTRFS_BDEV_BLOCKSIZE,
3536
- BTRFS_SUPER_INFO_SIZE);
3537
- if (!bh) {
3687
+ page = find_get_page(device->bdev->bd_inode->i_mapping,
3688
+ bytenr >> PAGE_SHIFT);
3689
+ if (!page) {
35383690 errors++;
35393691 if (i == 0)
35403692 primary_failed = true;
35413693 continue;
35423694 }
3543
- wait_on_buffer(bh);
3544
- if (!buffer_uptodate(bh)) {
3695
+ /* Page is submitted locked and unlocked once the IO completes */
3696
+ wait_on_page_locked(page);
3697
+ if (PageError(page)) {
35453698 errors++;
35463699 if (i == 0)
35473700 primary_failed = true;
35483701 }
35493702
3550
- /* drop our reference */
3551
- brelse(bh);
3703
+ /* Drop our reference */
3704
+ put_page(page);
35523705
3553
- /* drop the reference from the writing run */
3554
- brelse(bh);
3706
+ /* Drop the reference from the writing run */
3707
+ put_page(page);
35553708 }
35563709
35573710 /* log error, force error return */
....@@ -3697,7 +3850,7 @@
36973850
36983851 if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
36993852 (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
3700
- min_tolerated = min(min_tolerated,
3853
+ min_tolerated = min_t(int, min_tolerated,
37013854 btrfs_raid_array[BTRFS_RAID_SINGLE].
37023855 tolerated_failures);
37033856
....@@ -3706,7 +3859,7 @@
37063859 continue;
37073860 if (!(flags & btrfs_raid_array[raid_type].bg_flag))
37083861 continue;
3709
- min_tolerated = min(min_tolerated,
3862
+ min_tolerated = min_t(int, min_tolerated,
37103863 btrfs_raid_array[raid_type].
37113864 tolerated_failures);
37123865 }
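The min() to min_t() switch above sidesteps the kernel's build-time type check: min() refuses operands of differing types, while min_t(type, a, b) casts both sides first. A one-line illustration:

#include <linux/kernel.h>

static int demo_min(int a, unsigned int b)
{
	/* min(a, b) would trip the type check (int vs unsigned int);
	 * min_t compares both as int. */
	return min_t(int, a, b);
}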
....@@ -3779,7 +3932,8 @@
37793932 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
37803933 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
37813934 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
3782
- memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE);
3935
+ memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
3936
+ BTRFS_FSID_SIZE);
37833937
37843938 flags = btrfs_super_flags(sb);
37853939 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
....@@ -3834,20 +3988,19 @@
38343988 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
38353989 struct btrfs_root *root)
38363990 {
3991
+ bool drop_ref = false;
3992
+
38373993 spin_lock(&fs_info->fs_roots_radix_lock);
38383994 radix_tree_delete(&fs_info->fs_roots_radix,
38393995 (unsigned long)root->root_key.objectid);
3996
+ if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
3997
+ drop_ref = true;
38403998 spin_unlock(&fs_info->fs_roots_radix_lock);
38413999
3842
- if (btrfs_root_refs(&root->root_item) == 0)
3843
- synchronize_srcu(&fs_info->subvol_srcu);
3844
-
38454000 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
3846
- btrfs_free_log(NULL, root);
4001
+ ASSERT(root->log_root == NULL);
38474002 if (root->reloc_root) {
3848
- free_extent_buffer(root->reloc_root->node);
3849
- free_extent_buffer(root->reloc_root->commit_root);
3850
- btrfs_put_fs_root(root->reloc_root);
4003
+ btrfs_put_root(root->reloc_root);
38514004 root->reloc_root = NULL;
38524005 }
38534006 }
....@@ -3856,22 +4009,12 @@
38564009 __btrfs_remove_free_space_cache(root->free_ino_pinned);
38574010 if (root->free_ino_ctl)
38584011 __btrfs_remove_free_space_cache(root->free_ino_ctl);
3859
- btrfs_free_fs_root(root);
3860
-}
3861
-
3862
-void btrfs_free_fs_root(struct btrfs_root *root)
3863
-{
3864
- iput(root->ino_cache_inode);
3865
- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
3866
- if (root->anon_dev)
3867
- free_anon_bdev(root->anon_dev);
3868
- if (root->subv_writers)
3869
- btrfs_free_subvolume_writers(root->subv_writers);
3870
- free_extent_buffer(root->node);
3871
- free_extent_buffer(root->commit_root);
3872
- kfree(root->free_ino_ctl);
3873
- kfree(root->free_ino_pinned);
3874
- btrfs_put_fs_root(root);
4012
+ if (root->ino_cache_inode) {
4013
+ iput(root->ino_cache_inode);
4014
+ root->ino_cache_inode = NULL;
4015
+ }
4016
+ if (drop_ref)
4017
+ btrfs_put_root(root);
38754018 }
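The drop path decides under the radix-tree spinlock whether it still owns the tree's reference (the IN_RADIX bit) and drops that reference only after unlocking, since the final put may do heavyweight teardown. A compact sketch of the pattern; every name here is hypothetical:

#include <linux/spinlock.h>
#include <linux/bitops.h>

#define DEMO_IN_INDEX 0			/* hypothetical state bit */

struct demo_root;			/* hypothetical refcounted object */
void demo_put(struct demo_root *root);	/* hypothetical final-put helper */
unsigned long *demo_state(struct demo_root *root);	/* hypothetical */

static void demo_drop_from_index(spinlock_t *lock, struct demo_root *root)
{
	bool drop_ref = false;

	spin_lock(lock);
	/* test_and_clear_bit() consumes the index's reference exactly
	 * once, even if two unlink paths race here. */
	if (test_and_clear_bit(DEMO_IN_INDEX, demo_state(root)))
		drop_ref = true;
	spin_unlock(lock);

	if (drop_ref)
		demo_put(root);	/* may free; must not run under the lock */
}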
38764019
38774020 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
....@@ -3881,15 +4024,14 @@
38814024 int i = 0;
38824025 int err = 0;
38834026 unsigned int ret = 0;
3884
- int index;
38854027
38864028 while (1) {
3887
- index = srcu_read_lock(&fs_info->subvol_srcu);
4029
+ spin_lock(&fs_info->fs_roots_radix_lock);
38884030 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
38894031 (void **)gang, root_objectid,
38904032 ARRAY_SIZE(gang));
38914033 if (!ret) {
3892
- srcu_read_unlock(&fs_info->subvol_srcu, index);
4034
+ spin_unlock(&fs_info->fs_roots_radix_lock);
38934035 break;
38944036 }
38954037 root_objectid = gang[ret - 1]->root_key.objectid + 1;
....@@ -3901,9 +4043,9 @@
39014043 continue;
39024044 }
39034045 /* grab all the search result for later use */
3904
- gang[i] = btrfs_grab_fs_root(gang[i]);
4046
+ gang[i] = btrfs_grab_root(gang[i]);
39054047 }
3906
- srcu_read_unlock(&fs_info->subvol_srcu, index);
4048
+ spin_unlock(&fs_info->fs_roots_radix_lock);
39074049
39084050 for (i = 0; i < ret; i++) {
39094051 if (!gang[i])
....@@ -3912,7 +4054,7 @@
39124054 err = btrfs_orphan_cleanup(gang[i]);
39134055 if (err)
39144056 break;
3915
- btrfs_put_fs_root(gang[i]);
4057
+ btrfs_put_root(gang[i]);
39164058 }
39174059 root_objectid++;
39184060 }
....@@ -3920,7 +4062,7 @@
39204062 /* release the uncleaned roots due to error */
39214063 for (; i < ret; i++) {
39224064 if (gang[i])
3923
- btrfs_put_fs_root(gang[i]);
4065
+ btrfs_put_root(gang[i]);
39244066 }
39254067 return err;
39264068 }
....@@ -3945,7 +4087,7 @@
39454087 return btrfs_commit_transaction(trans);
39464088 }
39474089
3948
-void close_ctree(struct btrfs_fs_info *fs_info)
4090
+void __cold close_ctree(struct btrfs_fs_info *fs_info)
39494091 {
39504092 int ret;
39514093
....@@ -3980,7 +4122,36 @@
39804122 /* clear out the rbtree of defraggable inodes */
39814123 btrfs_cleanup_defrag_inodes(fs_info);
39824124
4125
+ /*
4126
+ * After we parked the cleaner kthread, ordered extents may have
4127
+ * completed and created new delayed iputs. If one of the async reclaim
4128
+ * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
4129
+ * can hang forever trying to stop it, because if a delayed iput is
4130
+ * added after it ran btrfs_run_delayed_iputs() and before it called
4131
+ * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
4132
+ * no one else to run iputs.
4133
+ *
4134
+ * So wait for all ongoing ordered extents to complete and then run
4135
+ * delayed iputs. This works because once we reach this point no one
4136
+ * can create new ordered extents or delayed iputs
4137
+ * through some other means.
4138
+ *
4139
+ * Also note that btrfs_wait_ordered_roots() is not safe here, because
4140
+ * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
4141
+ * but the delayed iput for the respective inode is made only when doing
4142
+ * the final btrfs_put_ordered_extent() (which must happen at
4143
+ * btrfs_finish_ordered_io() when we are unmounting).
4144
+ */
4145
+ btrfs_flush_workqueue(fs_info->endio_write_workers);
4146
+ /* Ordered extents for free space inodes. */
4147
+ btrfs_flush_workqueue(fs_info->endio_freespace_worker);
4148
+ btrfs_run_delayed_iputs(fs_info);
4149
+
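/*
 * [Editor's sketch, not part of the patch] The ordering the comment
 * above describes, reduced to its skeleton. Names (struct fs_ctx,
 * run_delayed_iputs()) are hypothetical; the point is that every
 * producer of delayed iputs is flushed before the one remaining
 * consumer runs, and only then are the reclaim workers (the
 * cancel_work_sync() calls below) torn down.
 */
#include <linux/workqueue.h>

struct fs_ctx {
	struct workqueue_struct *endio_write_wq;
	struct workqueue_struct *endio_freespace_wq;
	struct work_struct reclaim_work;
};

void run_delayed_iputs(struct fs_ctx *fs);	/* assumed helper */

static void fs_shutdown(struct fs_ctx *fs)
{
	/* 1. Drain everything that can still queue a delayed iput. */
	flush_workqueue(fs->endio_write_wq);
	flush_workqueue(fs->endio_freespace_wq);

	/* 2. The delayed-iput list can now only shrink; empty it once. */
	run_delayed_iputs(fs);

	/* 3. Only now is it safe to stop the async reclaim work. */
	cancel_work_sync(&fs->reclaim_work);
}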
39834150 cancel_work_sync(&fs_info->async_reclaim_work);
4151
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
4152
+
4153
+ /* Cancel or finish ongoing discard work */
4154
+ btrfs_discard_cleanup(fs_info);
39844155
39854156 if (!sb_rdonly(fs_info->sb)) {
39864157 /*
....@@ -4014,7 +4185,13 @@
40144185 kthread_stop(fs_info->transaction_kthread);
40154186 kthread_stop(fs_info->cleaner_kthread);
40164187
4188
+ ASSERT(list_empty(&fs_info->delayed_iputs));
40174189 set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
4190
+
4191
+ if (btrfs_check_quota_leak(fs_info)) {
4192
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4193
+ btrfs_err(fs_info, "qgroup reserved space leaked");
4194
+ }
40184195
40194196 btrfs_free_qgroup_config(fs_info);
40204197 ASSERT(list_empty(&fs_info->delalloc_roots));
....@@ -4024,10 +4201,12 @@
40244201 percpu_counter_sum(&fs_info->delalloc_bytes));
40254202 }
40264203
4204
+ if (percpu_counter_sum(&fs_info->dio_bytes))
4205
+ btrfs_info(fs_info, "at unmount dio bytes count %lld",
4206
+ percpu_counter_sum(&fs_info->dio_bytes));
4207
+
40274208 btrfs_sysfs_remove_mounted(fs_info);
40284209 btrfs_sysfs_remove_fsid(fs_info->fs_devices);
4029
-
4030
- btrfs_free_fs_roots(fs_info);
40314210
40324211 btrfs_put_block_group_cache(fs_info);
40334212
....@@ -4040,6 +4219,7 @@
40404219
40414220 clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
40424221 free_root_pointers(fs_info, true);
4222
+ btrfs_free_fs_roots(fs_info);
40434223
40444224 /*
40454225 * We must free the block groups after dropping the fs_roots as we could
....@@ -4057,25 +4237,8 @@
40574237 btrfsic_unmount(fs_info->fs_devices);
40584238 #endif
40594239
4060
- btrfs_close_devices(fs_info->fs_devices);
40614240 btrfs_mapping_tree_free(&fs_info->mapping_tree);
4062
-
4063
- percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
4064
- percpu_counter_destroy(&fs_info->delalloc_bytes);
4065
- percpu_counter_destroy(&fs_info->bio_counter);
4066
- cleanup_srcu_struct(&fs_info->subvol_srcu);
4067
-
4068
- btrfs_free_stripe_hash_table(fs_info);
4069
- btrfs_free_ref_cache(fs_info);
4070
-
4071
- while (!list_empty(&fs_info->pinned_chunks)) {
4072
- struct extent_map *em;
4073
-
4074
- em = list_first_entry(&fs_info->pinned_chunks,
4075
- struct extent_map, list);
4076
- list_del_init(&em->list);
4077
- free_extent_map(em);
4078
- }
4241
+ btrfs_close_devices(fs_info->fs_devices);
40794242 }
40804243
40814244 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
....@@ -4105,7 +4268,7 @@
41054268 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
41064269 /*
41074270 * This is a fast path so only do this check if we have sanity tests
4108
- * enabled. Normal people shouldn't be using umapped buffers as dirty
4271
+ * enabled. Normal people shouldn't be using unmapped buffers as dirty
41094272 * outside of the sanity tests.
41104273 */
41114274 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
....@@ -4129,7 +4292,7 @@
41294292 * So here we should only check item pointers, not item data.
41304293 */
41314294 if (btrfs_header_level(buf) == 0 &&
4132
- btrfs_check_leaf_relaxed(fs_info, buf)) {
4295
+ btrfs_check_leaf_relaxed(buf)) {
41334296 btrfs_print_leaf(buf);
41344297 ASSERT(0);
41354298 }
....@@ -4172,10 +4335,7 @@
41724335 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
41734336 struct btrfs_key *first_key)
41744337 {
4175
- struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
4176
- struct btrfs_fs_info *fs_info = root->fs_info;
4177
-
4178
- return btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
4338
+ return btree_read_extent_buffer_pages(buf, parent_transid,
41794339 level, first_key);
41804340 }
41814341
....@@ -4190,6 +4350,36 @@
41904350
41914351 down_write(&fs_info->cleanup_work_sem);
41924352 up_write(&fs_info->cleanup_work_sem);
4353
+}
4354
+
4355
+static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
4356
+{
4357
+ struct btrfs_root *gang[8];
4358
+ u64 root_objectid = 0;
4359
+ int ret;
4360
+
4361
+ spin_lock(&fs_info->fs_roots_radix_lock);
4362
+ while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
4363
+ (void **)gang, root_objectid,
4364
+ ARRAY_SIZE(gang))) != 0) {
4365
+ int i;
4366
+
4367
+ for (i = 0; i < ret; i++)
4368
+ gang[i] = btrfs_grab_root(gang[i]);
4369
+ spin_unlock(&fs_info->fs_roots_radix_lock);
4370
+
4371
+ for (i = 0; i < ret; i++) {
4372
+ if (!gang[i])
4373
+ continue;
4374
+ root_objectid = gang[i]->root_key.objectid;
4375
+ btrfs_free_log(NULL, gang[i]);
4376
+ btrfs_put_root(gang[i]);
4377
+ }
4378
+ root_objectid++;
4379
+ spin_lock(&fs_info->fs_roots_radix_lock);
4380
+ }
4381
+ spin_unlock(&fs_info->fs_roots_radix_lock);
4382
+ btrfs_free_log_root_tree(NULL, fs_info);
41934383 }
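/*
 * [Editor's sketch, not part of the patch] btrfs_drop_all_logs() above,
 * like btrfs_cleanup_fs_roots() earlier in this patch, walks the radix
 * tree with the gang-lookup idiom: pin the results under the spinlock,
 * drop the lock for work that may sleep, then resume one key past the
 * last result. Generic form with hypothetical root_tryget()/root_put():
 */
#include <linux/kernel.h>
#include <linux/radix-tree.h>
#include <linux/spinlock.h>

struct root;					/* opaque here */
struct root *root_tryget(struct root *r);	/* NULL if already dying */
void root_put(struct root *r);
unsigned long root_id(const struct root *r);

static void for_each_root(struct radix_tree_root *tree, spinlock_t *lock,
			  void (*fn)(struct root *))
{
	struct root *gang[8];
	unsigned long next = 0;
	int ret, i;

	spin_lock(lock);
	while ((ret = radix_tree_gang_lookup(tree, (void **)gang, next,
					     ARRAY_SIZE(gang))) != 0) {
		/* Pin the results before dropping the lock. */
		for (i = 0; i < ret; i++)
			gang[i] = root_tryget(gang[i]);
		spin_unlock(lock);

		for (i = 0; i < ret; i++) {
			if (!gang[i])
				continue;
			next = root_id(gang[i]);
			fn(gang[i]);		/* may sleep */
			root_put(gang[i]);
		}
		next++;			/* resume past the last key seen */
		spin_lock(lock);
	}
	spin_unlock(lock);
}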
41944384
41954385 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
....@@ -4252,33 +4442,26 @@
42524442 spin_lock(&delayed_refs->lock);
42534443 if (atomic_read(&delayed_refs->num_entries) == 0) {
42544444 spin_unlock(&delayed_refs->lock);
4255
- btrfs_info(fs_info, "delayed_refs has NO entry");
4445
+ btrfs_debug(fs_info, "delayed_refs has NO entry");
42564446 return ret;
42574447 }
42584448
4259
- while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
4449
+ while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
42604450 struct btrfs_delayed_ref_head *head;
42614451 struct rb_node *n;
42624452 bool pin_bytes = false;
42634453
42644454 head = rb_entry(node, struct btrfs_delayed_ref_head,
42654455 href_node);
4266
- if (!mutex_trylock(&head->mutex)) {
4267
- refcount_inc(&head->refs);
4268
- spin_unlock(&delayed_refs->lock);
4269
-
4270
- mutex_lock(&head->mutex);
4271
- mutex_unlock(&head->mutex);
4272
- btrfs_put_delayed_ref_head(head);
4273
- spin_lock(&delayed_refs->lock);
4456
+ if (btrfs_delayed_ref_lock(delayed_refs, head))
42744457 continue;
4275
- }
4458
+
42764459 spin_lock(&head->lock);
4277
- while ((n = rb_first(&head->ref_tree)) != NULL) {
4460
+ while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
42784461 ref = rb_entry(n, struct btrfs_delayed_ref_node,
42794462 ref_node);
42804463 ref->in_tree = 0;
4281
- rb_erase(&ref->ref_node, &head->ref_tree);
4464
+ rb_erase_cached(&ref->ref_node, &head->ref_tree);
42824465 RB_CLEAR_NODE(&ref->ref_node);
42834466 if (!list_empty(&ref->add_list))
42844467 list_del(&ref->add_list);
....@@ -4288,23 +4471,41 @@
42884471 if (head->must_insert_reserved)
42894472 pin_bytes = true;
42904473 btrfs_free_delayed_extent_op(head->extent_op);
4291
- delayed_refs->num_heads--;
4292
- if (head->processing == 0)
4293
- delayed_refs->num_heads_ready--;
4294
- atomic_dec(&delayed_refs->num_entries);
4295
- rb_erase(&head->href_node, &delayed_refs->href_root);
4296
- RB_CLEAR_NODE(&head->href_node);
4474
+ btrfs_delete_ref_head(delayed_refs, head);
42974475 spin_unlock(&head->lock);
42984476 spin_unlock(&delayed_refs->lock);
42994477 mutex_unlock(&head->mutex);
43004478
4301
- if (pin_bytes)
4302
- btrfs_pin_extent(fs_info, head->bytenr,
4303
- head->num_bytes, 1);
4479
+ if (pin_bytes) {
4480
+ struct btrfs_block_group *cache;
4481
+
4482
+ cache = btrfs_lookup_block_group(fs_info, head->bytenr);
4483
+ BUG_ON(!cache);
4484
+
4485
+ spin_lock(&cache->space_info->lock);
4486
+ spin_lock(&cache->lock);
4487
+ cache->pinned += head->num_bytes;
4488
+ btrfs_space_info_update_bytes_pinned(fs_info,
4489
+ cache->space_info, head->num_bytes);
4490
+ cache->reserved -= head->num_bytes;
4491
+ cache->space_info->bytes_reserved -= head->num_bytes;
4492
+ spin_unlock(&cache->lock);
4493
+ spin_unlock(&cache->space_info->lock);
4494
+ percpu_counter_add_batch(
4495
+ &cache->space_info->total_bytes_pinned,
4496
+ head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
4497
+
4498
+ btrfs_put_block_group(cache);
4499
+
4500
+ btrfs_error_unpin_extent_range(fs_info, head->bytenr,
4501
+ head->bytenr + head->num_bytes - 1);
4502
+ }
4503
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
43044504 btrfs_put_delayed_ref_head(head);
43054505 cond_resched();
43064506 spin_lock(&delayed_refs->lock);
43074507 }
4508
+ btrfs_qgroup_destroy_extent_records(trans);
43084509
43094510 spin_unlock(&delayed_refs->lock);
43104511
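/*
 * [Editor's sketch, not part of the patch] The hunk above moves the
 * delayed-ref teardown to the leftmost-cached rbtree API; the drain
 * pattern in isolation (struct item is illustrative):
 */
#include <linux/rbtree.h>
#include <linux/slab.h>

struct item {
	struct rb_node node;
};

static void drain_tree(struct rb_root_cached *tree)
{
	struct rb_node *n;

	/* rb_first_cached() is O(1): the leftmost node stays cached. */
	while ((n = rb_first_cached(tree)) != NULL) {
		struct item *it = rb_entry(n, struct item, node);

		rb_erase_cached(&it->node, tree);
		RB_CLEAR_NODE(&it->node);
		kfree(it);
	}
}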
....@@ -4334,7 +4535,11 @@
43344535 */
43354536 inode = igrab(&btrfs_inode->vfs_inode);
43364537 if (inode) {
4538
+ unsigned int nofs_flag;
4539
+
4540
+ nofs_flag = memalloc_nofs_save();
43374541 invalidate_inode_pages2(inode->i_mapping);
4542
+ memalloc_nofs_restore(nofs_flag);
43384543 iput(inode);
43394544 }
43404545 spin_lock(&root->delalloc_lock);
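/*
 * [Editor's sketch, not part of the patch] This hunk and the
 * btrfs_cleanup_bg_io() hunk below both wrap invalidate_inode_pages2()
 * in a scoped-NOFS section: every allocation made inside the scope
 * implicitly behaves as GFP_NOFS, so reclaim cannot recurse back into
 * the filesystem. The pattern in isolation:
 */
#include <linux/fs.h>
#include <linux/sched/mm.h>

static void invalidate_mapping_nofs(struct address_space *mapping)
{
	unsigned int nofs_flag;

	nofs_flag = memalloc_nofs_save();
	invalidate_inode_pages2(mapping);
	memalloc_nofs_restore(nofs_flag);
}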
....@@ -4354,12 +4559,12 @@
43544559 while (!list_empty(&splice)) {
43554560 root = list_first_entry(&splice, struct btrfs_root,
43564561 delalloc_root);
4357
- root = btrfs_grab_fs_root(root);
4562
+ root = btrfs_grab_root(root);
43584563 BUG_ON(!root);
43594564 spin_unlock(&fs_info->delalloc_root_lock);
43604565
43614566 btrfs_destroy_delalloc_inodes(root);
4362
- btrfs_put_fs_root(root);
4567
+ btrfs_put_root(root);
43634568
43644569 spin_lock(&fs_info->delalloc_root_lock);
43654570 }
....@@ -4400,16 +4605,12 @@
44004605 }
44014606
44024607 static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
4403
- struct extent_io_tree *pinned_extents)
4608
+ struct extent_io_tree *unpin)
44044609 {
4405
- struct extent_io_tree *unpin;
44064610 u64 start;
44074611 u64 end;
44084612 int ret;
4409
- bool loop = true;
44104613
4411
- unpin = pinned_extents;
4412
-again:
44134614 while (1) {
44144615 struct extent_state *cached_state = NULL;
44154616
....@@ -4434,25 +4635,21 @@
44344635 cond_resched();
44354636 }
44364637
4437
- if (loop) {
4438
- if (unpin == &fs_info->freed_extents[0])
4439
- unpin = &fs_info->freed_extents[1];
4440
- else
4441
- unpin = &fs_info->freed_extents[0];
4442
- loop = false;
4443
- goto again;
4444
- }
4445
-
44464638 return 0;
44474639 }
44484640
4449
-static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
4641
+static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
44504642 {
44514643 struct inode *inode;
44524644
44534645 inode = cache->io_ctl.inode;
44544646 if (inode) {
4647
+ unsigned int nofs_flag;
4648
+
4649
+ nofs_flag = memalloc_nofs_save();
44554650 invalidate_inode_pages2(inode->i_mapping);
4651
+ memalloc_nofs_restore(nofs_flag);
4652
+
44564653 BTRFS_I(inode)->generation = 0;
44574654 cache->io_ctl.inode = NULL;
44584655 iput(inode);
....@@ -4464,12 +4661,12 @@
44644661 void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
44654662 struct btrfs_fs_info *fs_info)
44664663 {
4467
- struct btrfs_block_group_cache *cache;
4664
+ struct btrfs_block_group *cache;
44684665
44694666 spin_lock(&cur_trans->dirty_bgs_lock);
44704667 while (!list_empty(&cur_trans->dirty_bgs)) {
44714668 cache = list_first_entry(&cur_trans->dirty_bgs,
4472
- struct btrfs_block_group_cache,
4669
+ struct btrfs_block_group,
44734670 dirty_list);
44744671
44754672 if (!list_empty(&cache->io_list)) {
....@@ -4486,6 +4683,7 @@
44864683
44874684 spin_unlock(&cur_trans->dirty_bgs_lock);
44884685 btrfs_put_block_group(cache);
4686
+ btrfs_delayed_refs_rsv_release(fs_info, 1);
44894687 spin_lock(&cur_trans->dirty_bgs_lock);
44904688 }
44914689 spin_unlock(&cur_trans->dirty_bgs_lock);
....@@ -4496,7 +4694,7 @@
44964694 */
44974695 while (!list_empty(&cur_trans->io_bgs)) {
44984696 cache = list_first_entry(&cur_trans->io_bgs,
4499
- struct btrfs_block_group_cache,
4697
+ struct btrfs_block_group,
45004698 io_list);
45014699
45024700 list_del_init(&cache->io_list);
....@@ -4510,9 +4708,16 @@
45104708 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
45114709 struct btrfs_fs_info *fs_info)
45124710 {
4711
+ struct btrfs_device *dev, *tmp;
4712
+
45134713 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
45144714 ASSERT(list_empty(&cur_trans->dirty_bgs));
45154715 ASSERT(list_empty(&cur_trans->io_bgs));
4716
+
4717
+ list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
4718
+ post_commit_list) {
4719
+ list_del_init(&dev->post_commit_list);
4720
+ }
45164721
45174722 btrfs_destroy_delayed_refs(cur_trans, fs_info);
45184723
....@@ -4526,8 +4731,7 @@
45264731
45274732 btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
45284733 EXTENT_DIRTY);
4529
- btrfs_destroy_pinned_extent(fs_info,
4530
- fs_info->pinned_extents);
4734
+ btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
45314735
45324736 cur_trans->state = TRANS_STATE_COMPLETED;
45334737 wake_up(&cur_trans->commit_wait);
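/*
 * [Editor's sketch, not part of the patch] The two lines above are the
 * publish-then-wake idiom: write the final state, then wake the
 * waiters; wait_event()/wake_up() provide the ordering between the two
 * sides. Hypothetical reduced form:
 */
#include <linux/wait.h>

struct txn {
	int state;			/* 0 = running, 1 = completed */
	wait_queue_head_t commit_wait;
};

static void txn_complete(struct txn *t)
{
	t->state = 1;			/* publish the final state... */
	wake_up(&t->commit_wait);	/* ...then wake anyone waiting */
}

/* A waiter does: wait_event(t->commit_wait, t->state == 1); */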
....@@ -4579,18 +4783,64 @@
45794783 btrfs_destroy_all_ordered_extents(fs_info);
45804784 btrfs_destroy_delayed_inodes(fs_info);
45814785 btrfs_assert_delayed_root_empty(fs_info);
4582
- btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
45834786 btrfs_destroy_all_delalloc_inodes(fs_info);
4787
+ btrfs_drop_all_logs(fs_info);
45844788 mutex_unlock(&fs_info->transaction_kthread_mutex);
45854789
45864790 return 0;
45874791 }
45884792
4589
-static const struct extent_io_ops btree_extent_io_ops = {
4590
- /* mandatory callbacks */
4591
- .submit_bio_hook = btree_submit_bio_hook,
4592
- .readpage_end_io_hook = btree_readpage_end_io_hook,
4593
- .readpage_io_failed_hook = btree_io_failed_hook,
4793
+int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
4794
+{
4795
+ struct btrfs_path *path;
4796
+ int ret;
4797
+ struct extent_buffer *l;
4798
+ struct btrfs_key search_key;
4799
+ struct btrfs_key found_key;
4800
+ int slot;
45944801
4595
- /* optional callbacks */
4596
-};
4802
+ path = btrfs_alloc_path();
4803
+ if (!path)
4804
+ return -ENOMEM;
4805
+
4806
+ search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
4807
+ search_key.type = -1;
4808
+ search_key.offset = (u64)-1;
4809
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
4810
+ if (ret < 0)
4811
+ goto error;
4812
+ BUG_ON(ret == 0); /* Corruption */
4813
+ if (path->slots[0] > 0) {
4814
+ slot = path->slots[0] - 1;
4815
+ l = path->nodes[0];
4816
+ btrfs_item_key_to_cpu(l, &found_key, slot);
4817
+ *objectid = max_t(u64, found_key.objectid,
4818
+ BTRFS_FIRST_FREE_OBJECTID - 1);
4819
+ } else {
4820
+ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
4821
+ }
4822
+ ret = 0;
4823
+error:
4824
+ btrfs_free_path(path);
4825
+ return ret;
4826
+}
4827
+
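/*
 * [Editor's sketch, not part of the patch] The search above relies on a
 * classic btree trick: look up a key greater than every legal key, then
 * step back one slot; slot - 1 holds the maximum. The same idea over a
 * plain sorted array (highest_below() is illustrative):
 */
#include <linux/types.h>

/* Returns the largest element below @sentinel, or @floor if none. */
static u64 highest_below(const u64 *sorted, int n, u64 sentinel, u64 floor)
{
	int lo = 0, hi = n;

	/* Find the first slot whose key is >= sentinel... */
	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (sorted[mid] < sentinel)
			lo = mid + 1;
		else
			hi = mid;
	}
	/* ...then step back one, exactly like path->slots[0] - 1 above. */
	return lo > 0 ? sorted[lo - 1] : floor;
}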
4828
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
4829
+{
4830
+ int ret;
4831
+ mutex_lock(&root->objectid_mutex);
4832
+
4833
+ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
4834
+ btrfs_warn(root->fs_info,
4835
+ "the objectid of root %llu reaches its highest value",
4836
+ root->root_key.objectid);
4837
+ ret = -ENOSPC;
4838
+ goto out;
4839
+ }
4840
+
4841
+ *objectid = ++root->highest_objectid;
4842
+ ret = 0;
4843
+out:
4844
+ mutex_unlock(&root->objectid_mutex);
4845
+ return ret;
4846
+}
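/*
 * [Editor's note] Typical use of the function above when allocating a
 * new inode number (sketch; surrounding error handling elided):
 *
 *	u64 objectid;
 *	int ret = btrfs_find_free_objectid(root, &objectid);
 *
 *	if (ret)
 *		return ret;	/* -ENOSPC once the keyspace is exhausted */
 *
 * The returned objectid is unique for the root's lifetime, because
 * highest_objectid only ever moves forward under objectid_mutex.
 */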