From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/fs/btrfs/volumes.c | 3658 ++++++++++++++++++++++++++++++++--------------------------- 1 files changed, 1,979 insertions(+), 1,679 deletions(-) diff --git a/kernel/fs/btrfs/volumes.c b/kernel/fs/btrfs/volumes.c index 8f05e6a..389493e 100644 --- a/kernel/fs/btrfs/volumes.c +++ b/kernel/fs/btrfs/volumes.c @@ -7,7 +7,6 @@ #include <linux/sched/mm.h> #include <linux/bio.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/ratelimit.h> #include <linux/kthread.h> @@ -15,6 +14,8 @@ #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> +#include <linux/namei.h> +#include "misc.h" #include "ctree.h" #include "extent_map.h" #include "disk-io.h" @@ -25,10 +26,12 @@ #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" -#include "math.h" #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" +#include "space-info.h" +#include "block-group.h" +#include "discard.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -39,6 +42,7 @@ .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .nparity = 0, .raid_name = "raid10", .bg_flag = BTRFS_BLOCK_GROUP_RAID10, .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, @@ -51,9 +55,36 @@ .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .nparity = 0, .raid_name = "raid1", .bg_flag = BTRFS_BLOCK_GROUP_RAID1, .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, + }, + [BTRFS_RAID_RAID1C3] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 3, + .devs_min = 3, + .tolerated_failures = 2, + .devs_increment = 3, + .ncopies = 3, + .nparity = 0, + .raid_name = "raid1c3", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, + .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, + }, + [BTRFS_RAID_RAID1C4] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 4, + .devs_min = 4, + .tolerated_failures = 3, + .devs_increment = 4, + .ncopies = 4, + .nparity = 0, + .raid_name = "raid1c4", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, + .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, }, [BTRFS_RAID_DUP] = { .sub_stripes = 1, @@ -63,6 +94,7 @@ .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, + .nparity = 0, .raid_name = "dup", .bg_flag = BTRFS_BLOCK_GROUP_DUP, .mindev_error = 0, @@ -75,6 +107,7 @@ .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .nparity = 0, .raid_name = "raid0", .bg_flag = BTRFS_BLOCK_GROUP_RAID0, .mindev_error = 0, @@ -87,6 +120,7 @@ .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .nparity = 0, .raid_name = "single", .bg_flag = 0, .mindev_error = 0, @@ -99,6 +133,7 @@ .tolerated_failures = 1, .devs_increment = 1, .ncopies = 1, + .nparity = 1, .raid_name = "raid5", .bg_flag = BTRFS_BLOCK_GROUP_RAID5, .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, @@ -111,24 +146,79 @@ .tolerated_failures = 2, .devs_increment = 1, .ncopies = 1, + .nparity = 2, .raid_name = "raid6", .bg_flag = BTRFS_BLOCK_GROUP_RAID6, .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, }, }; -const char *get_raid_name(enum btrfs_raid_types type) +const char *btrfs_bg_type_to_raid_name(u64 flags) { - if (type >= BTRFS_NR_RAID_TYPES) + const int index = btrfs_bg_flags_to_raid_index(flags); + + if (index >= BTRFS_NR_RAID_TYPES) return NULL; - return btrfs_raid_array[type].raid_name; + return btrfs_raid_array[index].raid_name; } -static int init_first_rw_device(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +/* + * Fill @buf with textual description of @bg_flags, no more than @size_buf + * bytes including terminating null byte. + */ +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) +{ + int i; + int ret; + char *bp = buf; + u64 flags = bg_flags; + u32 size_bp = size_buf; + + if (!flags) { + strcpy(bp, "NONE"); + return; + } + +#define DESCRIBE_FLAG(flag, desc) \ + do { \ + if (flags & (flag)) { \ + ret = snprintf(bp, size_bp, "%s|", (desc)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + flags &= ~(flag); \ + } \ + } while (0) + + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + + DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, + btrfs_raid_array[i].raid_name); +#undef DESCRIBE_FLAG + + if (flags) { + ret = snprintf(bp, size_bp, "0x%llx|", flags); + size_bp -= ret; + } + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ + + /* + * The text is trimmed, it's up to the caller to provide sufficiently + * large buffer + */ +out_overflow:; +} + +static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); -static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, @@ -153,7 +243,7 @@ * the mutex can be very coarse and can cover long-running operations * * protects: updates to fs_devices counters like missing devices, rw devices, - * seeding, structure cloning, openning/closing devices at mount/umount time + * seeding, structure cloning, opening/closing devices at mount/umount time * * global::fs_devs - add, remove, updates to the global list * @@ -183,7 +273,9 @@ * chunk_mutex * ----------- * protects chunks, adding or removing during allocation, trim or when a new - * device is added/removed + * device is added/removed. Additionally it also protects post_commit_list of + * individual devices, since they can be added to the transaction's + * post_commit_list only with chunk_mutex held. * * cleaner_mutex * ------------- @@ -195,14 +287,13 @@ * ============ * * uuid_mutex - * volume_mutex - * device_list_mutex - * chunk_mutex - * balance_mutex + * device_list_mutex + * chunk_mutex + * balance_mutex * * - * Exclusive operations, BTRFS_FS_EXCL_OP - * ====================================== + * Exclusive operations + * ==================== * * Maintains the exclusivity of the following operations that apply to the * whole filesystem and cannot run in parallel. @@ -228,30 +319,32 @@ * - system power-cycle and filesystem mounted as read-only * - filesystem or device errors leading to forced read-only * - * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. - * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * The status of exclusive operation is set and cleared atomically. + * During the course of Paused state, fs_info::exclusive_operation remains set. * A device operation in Paused or Running state can be canceled or resumed * either by ioctl (Balance only) or when remounted as read-write. - * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * The exclusive status is cleared when the device operation is canceled or * completed. */ DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); -struct list_head *btrfs_get_fs_uuids(void) +struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) { return &fs_uuids; } /* * alloc_fs_devices - allocate struct btrfs_fs_devices - * @fsid: if not NULL, copy the uuid to fs_devices::fsid + * @fsid: if not NULL, copy the UUID to fs_devices::fsid + * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. */ -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, + const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devs; @@ -262,18 +355,25 @@ mutex_init(&fs_devs->device_list_mutex); INIT_LIST_HEAD(&fs_devs->devices); - INIT_LIST_HEAD(&fs_devs->resized_devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); + INIT_LIST_HEAD(&fs_devs->seed_list); if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + + if (metadata_fsid) + memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); + else if (fsid) + memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); return fs_devs; } void btrfs_free_device(struct btrfs_device *device) { + WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); + extent_io_tree_release(&device->alloc_state); bio_put(device->flush_bio); kfree(device); } @@ -281,6 +381,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; + WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { device = list_entry(fs_devices->devices.next, @@ -289,19 +390,6 @@ btrfs_free_device(device); } kfree(fs_devices); -} - -static void btrfs_kobject_uevent(struct block_device *bdev, - enum kobject_action action) -{ - int ret; - - ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); - if (ret) - pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", - action, - kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), - &disk_to_dev(bdev->bd_disk)->kobj); } void __exit btrfs_cleanup_fs_uuids(void) @@ -321,7 +409,7 @@ * Returned struct is not linked onto any lists and must be destroyed using * btrfs_free_device. */ -static struct btrfs_device *__alloc_device(void) +static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) { struct btrfs_device *dev; @@ -341,34 +429,86 @@ INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); - INIT_LIST_HEAD(&dev->resized_list); - - spin_lock_init(&dev->io_lock); + INIT_LIST_HEAD(&dev->post_commit_list); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + extent_io_tree_init(fs_info, &dev->alloc_state, + IO_TREE_DEVICE_ALLOC_STATE, NULL); return dev; } -static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +static noinline struct btrfs_fs_devices *find_fsid( + const u8 *fsid, const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devices; + ASSERT(fsid); + + /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) - return fs_devices; + if (metadata_fsid) { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 + && memcmp(metadata_fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) + return fs_devices; + } else { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } } return NULL; } +static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( + struct btrfs_super_block *disk_super) +{ + + struct btrfs_fs_devices *fs_devices; + + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by first scanning + * a device which didn't have its fsid/metadata_uuid changed + * at all and the CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(disk_super->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by a device that + * has an outdated pair of fsid/metadata_uuid and + * CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(fs_devices->metadata_uuid, + fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && + memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + + return find_fsid(disk_super->fsid, disk_super->metadata_uuid); +} + + static int btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, int flush, struct block_device **bdev, - struct buffer_head **bh) + struct btrfs_super_block **disk_super) { int ret; @@ -387,9 +527,9 @@ goto error; } invalidate_bdev(*bdev); - *bh = btrfs_read_dev_super(*bdev); - if (IS_ERR(*bh)) { - ret = PTR_ERR(*bh); + *disk_super = btrfs_read_dev_super(*bdev); + if (IS_ERR(*disk_super)) { + ret = PTR_ERR(*disk_super); blkdev_put(*bdev, flags); goto error; } @@ -398,214 +538,50 @@ error: *bdev = NULL; - *bh = NULL; return ret; } -static void requeue_list(struct btrfs_pending_bios *pending_bios, - struct bio *head, struct bio *tail) -{ - - struct bio *old_head; - - old_head = pending_bios->head; - pending_bios->head = head; - if (pending_bios->tail) - tail->bi_next = old_head; - else - pending_bios->tail = tail; -} - /* - * we try to collect pending bios for a device so we don't get a large - * number of procs sending bios down to the same device. This greatly - * improves the schedulers ability to collect and merge the bios. + * Check if the device in the path matches the device in the given struct device. * - * But, it also turns into a long list of bios to process and that is sure - * to eventually make the worker thread block. The solution here is to - * make some progress and then put this work struct back at the end of - * the list if the block device is congested. This way, multiple devices - * can make progress from a single worker thread. + * Returns: + * true If it is the same device. + * false If it is not the same device or on error. */ -static noinline void run_scheduled_bios(struct btrfs_device *device) +static bool device_matched(const struct btrfs_device *device, const char *path) { - struct btrfs_fs_info *fs_info = device->fs_info; - struct bio *pending; - struct backing_dev_info *bdi; - struct btrfs_pending_bios *pending_bios; - struct bio *tail; - struct bio *cur; - int again = 0; - unsigned long num_run; - unsigned long batch_run = 0; - unsigned long last_waited = 0; - int force_reg = 0; - int sync_pending = 0; - struct blk_plug plug; + char *device_name; + struct block_device *bdev_old; + struct block_device *bdev_new; /* - * this function runs all the bios we've collected for - * a particular device. We don't want to wander off to - * another device without first sending all of these down. - * So, setup a plug here and finish it off before we return + * If we are looking for a device with the matching dev_t, then skip + * device without a name (a missing device). */ - blk_start_plug(&plug); + if (!device->name) + return false; - bdi = device->bdev->bd_bdi; + device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); + if (!device_name) + return false; -loop: - spin_lock(&device->io_lock); + rcu_read_lock(); + scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name)); + rcu_read_unlock(); -loop_lock: - num_run = 0; + bdev_old = lookup_bdev(device_name); + kfree(device_name); + if (IS_ERR(bdev_old)) + return false; - /* take all the bios off the list at once and process them - * later on (without the lock held). But, remember the - * tail and other pointers so the bios can be properly reinserted - * into the list if we hit congestion - */ - if (!force_reg && device->pending_sync_bios.head) { - pending_bios = &device->pending_sync_bios; - force_reg = 1; - } else { - pending_bios = &device->pending_bios; - force_reg = 0; - } + bdev_new = lookup_bdev(path); + if (IS_ERR(bdev_new)) + return false; - pending = pending_bios->head; - tail = pending_bios->tail; - WARN_ON(pending && !tail); + if (bdev_old == bdev_new) + return true; - /* - * if pending was null this time around, no bios need processing - * at all and we can stop. Otherwise it'll loop back up again - * and do an additional check so no bios are missed. - * - * device->running_pending is used to synchronize with the - * schedule_bio code. - */ - if (device->pending_sync_bios.head == NULL && - device->pending_bios.head == NULL) { - again = 0; - device->running_pending = 0; - } else { - again = 1; - device->running_pending = 1; - } - - pending_bios->head = NULL; - pending_bios->tail = NULL; - - spin_unlock(&device->io_lock); - - while (pending) { - - rmb(); - /* we want to work on both lists, but do more bios on the - * sync list than the regular list - */ - if ((num_run > 32 && - pending_bios != &device->pending_sync_bios && - device->pending_sync_bios.head) || - (num_run > 64 && pending_bios == &device->pending_sync_bios && - device->pending_bios.head)) { - spin_lock(&device->io_lock); - requeue_list(pending_bios, pending, tail); - goto loop_lock; - } - - cur = pending; - pending = pending->bi_next; - cur->bi_next = NULL; - - BUG_ON(atomic_read(&cur->__bi_cnt) == 0); - - /* - * if we're doing the sync list, record that our - * plug has some sync requests on it - * - * If we're doing the regular list and there are - * sync requests sitting around, unplug before - * we add more - */ - if (pending_bios == &device->pending_sync_bios) { - sync_pending = 1; - } else if (sync_pending) { - blk_finish_plug(&plug); - blk_start_plug(&plug); - sync_pending = 0; - } - - btrfsic_submit_bio(cur); - num_run++; - batch_run++; - - cond_resched(); - - /* - * we made progress, there is more work to do and the bdi - * is now congested. Back off and let other work structs - * run instead - */ - if (pending && bdi_write_congested(bdi) && batch_run > 8 && - fs_info->fs_devices->open_devices > 1) { - struct io_context *ioc; - - ioc = current->io_context; - - /* - * the main goal here is that we don't want to - * block if we're going to be able to submit - * more requests without blocking. - * - * This code does two great things, it pokes into - * the elevator code from a filesystem _and_ - * it makes assumptions about how batching works. - */ - if (ioc && ioc->nr_batch_requests > 0 && - time_before(jiffies, ioc->last_waited + HZ/50UL) && - (last_waited == 0 || - ioc->last_waited == last_waited)) { - /* - * we want to go through our batch of - * requests and stop. So, we copy out - * the ioc->last_waited time and test - * against it before looping - */ - last_waited = ioc->last_waited; - cond_resched(); - continue; - } - spin_lock(&device->io_lock); - requeue_list(pending_bios, pending, tail); - device->running_pending = 1; - - spin_unlock(&device->io_lock); - btrfs_queue_work(fs_info->submit_workers, - &device->work); - goto done; - } - } - - cond_resched(); - if (again) - goto loop; - - spin_lock(&device->io_lock); - if (device->pending_bios.head || device->pending_sync_bios.head) - goto loop_lock; - spin_unlock(&device->io_lock); - -done: - blk_finish_plug(&plug); -} - -static void pending_bios_fn(struct btrfs_work *work) -{ - struct btrfs_device *device; - - device = container_of(work, struct btrfs_device, work); - run_scheduled_bios(device); + return false; } /* @@ -615,52 +591,55 @@ * matching this path only. * skip_dev: Optional. Will skip this device when searching for the stale * devices. + * Return: 0 for success or if @path is NULL. + * -EBUSY if @path is a mounted device. + * -ENOENT if @path does not match any device in the list. */ -static void btrfs_free_stale_devices(const char *path, +static int btrfs_free_stale_devices(const char *path, struct btrfs_device *skip_device) { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; + int ret = 0; + + lockdep_assert_held(&uuid_mutex); + + if (path) + ret = -ENOENT; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { - mutex_lock(&fs_devices->device_list_mutex); - if (fs_devices->opened) { - mutex_unlock(&fs_devices->device_list_mutex); - continue; - } + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { - int not_found = 0; - if (skip_device && skip_device == device) continue; - if (path && !device->name) + if (path && !device_matched(device, path)) continue; - - rcu_read_lock(); - if (path) - not_found = strcmp(rcu_str_deref(device->name), - path); - rcu_read_unlock(); - if (not_found) - continue; + if (fs_devices->opened) { + /* for an already deleted device return 0 */ + if (path && ret != 0) + ret = -EBUSY; + break; + } /* delete the stale device */ fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); - if (fs_devices->num_devices == 0) - break; + ret = 0; } mutex_unlock(&fs_devices->device_list_mutex); + if (fs_devices->num_devices == 0) { btrfs_sysfs_remove_fsid(fs_devices); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } + + return ret; } /* @@ -674,7 +653,6 @@ { struct request_queue *q; struct block_device *bdev; - struct buffer_head *bh; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -685,23 +663,29 @@ return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &bh); + &bdev, &disk_super); if (ret) return ret; - disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); if (devid != device->devid) - goto error_brelse; + goto error_free_page; if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) - goto error_brelse; + goto error_free_page; device->generation = btrfs_super_generation(disk_super); if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + if (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { + pr_err( + "BTRFS: Invalid seeding and uuid-changed device detected\n"); + goto error_free_page; + } + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - fs_devices->seeding = 1; + fs_devices->seeding = true; } else { if (bdev_read_only(bdev)) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); @@ -711,7 +695,7 @@ q = bdev_get_queue(bdev); if (!blk_queue_nonrot(q)) - fs_devices->rotating = 1; + fs_devices->rotating = true; device->bdev = bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); @@ -723,17 +707,109 @@ fs_devices->rw_devices++; list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); } - brelse(bh); + btrfs_release_disk_super(disk_super); return 0; -error_brelse: - brelse(bh); +error_free_page: + btrfs_release_disk_super(disk_super); blkdev_put(bdev, flags); return -EINVAL; } +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) +{ + bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + + return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; +} + +/* + * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices + * being created with a disk that has already completed its fsid change. Such + * disk can belong to an fs which has its FSID changed or to one which doesn't. + * Handle both cases here. + */ +static struct btrfs_fs_devices *find_fsid_inprogress( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->fsid, + BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { + return fs_devices; + } + } + + return find_fsid(disk_super->fsid, NULL); +} + + +static struct btrfs_fs_devices *find_fsid_changed( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Handles the case where scanned device is part of an fs that had + * multiple successful changes of FSID but curently device didn't + * observe it. Meaning our fsid will be different than theirs. We need + * to handle two subcases : + * 1 - The fs still continues to have different METADATA/FSID uuids. + * 2 - The fs is switched back to its original FSID (METADATA/FSID + * are equal). + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + /* Changed UUIDs */ + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE) != 0) + return fs_devices; + + /* Unchanged UUIDs */ + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, disk_super->metadata_uuid, + BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + + return NULL; +} + +static struct btrfs_fs_devices *find_fsid_reverted_metadata( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Handle the case where the scanned device is part of an fs whose last + * metadata UUID change reverted it to the original FSID. At the same + * time * fs_devices was first created by another constitutent device + * which didn't fully observe the operation. This results in an + * btrfs_fs_devices created with metadata/fsid different AND + * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the + * fs_devices equal to the FSID of the disk. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->fsid, + BTRFS_FSID_SIZE) == 0 && + fs_devices->fsid_change) + return fs_devices; + } + + return NULL; +} /* * Add new device to list of registered devices * @@ -746,16 +822,40 @@ bool *new_device_added) { struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices; + struct btrfs_fs_devices *fs_devices = NULL; struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); + bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & + BTRFS_SUPER_FLAG_CHANGING_FSID_V2); - fs_devices = find_fsid(disk_super->fsid); + if (fsid_change_in_progress) { + if (!has_metadata_uuid) + fs_devices = find_fsid_inprogress(disk_super); + else + fs_devices = find_fsid_changed(disk_super); + } else if (has_metadata_uuid) { + fs_devices = find_fsid_with_metadata_uuid(disk_super); + } else { + fs_devices = find_fsid_reverted_metadata(disk_super); + if (!fs_devices) + fs_devices = find_fsid(disk_super->fsid, NULL); + } + + if (!fs_devices) { - fs_devices = alloc_fs_devices(disk_super->fsid); + if (has_metadata_uuid) + fs_devices = alloc_fs_devices(disk_super->fsid, + disk_super->metadata_uuid); + else + fs_devices = alloc_fs_devices(disk_super->fsid, NULL); + if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); + + fs_devices->fsid_change = fsid_change_in_progress; mutex_lock(&fs_devices->device_list_mutex); list_add(&fs_devices->fs_list, &fs_uuids); @@ -765,6 +865,27 @@ mutex_lock(&fs_devices->device_list_mutex); device = btrfs_find_device(fs_devices, devid, disk_super->dev_item.uuid, NULL, false); + + /* + * If this disk has been pulled into an fs devices created by + * a device which had the CHANGING_FSID_V2 flag then replace the + * metadata_uuid/fsid values of the fs_devices. + */ + if (fs_devices->fsid_change && + found_transid > fs_devices->latest_generation) { + memcpy(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE); + + if (has_metadata_uuid) + memcpy(fs_devices->metadata_uuid, + disk_super->metadata_uuid, + BTRFS_FSID_SIZE); + else + memcpy(fs_devices->metadata_uuid, + disk_super->fsid, BTRFS_FSID_SIZE); + + fs_devices->fsid_change = false; + } } if (!device) { @@ -796,11 +917,15 @@ *new_device_added = true; if (disk_super->label[0]) - pr_info("BTRFS: device label %s devid %llu transid %llu %s\n", - disk_super->label, devid, found_transid, path); + pr_info( + "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", + disk_super->label, devid, found_transid, path, + current->comm, task_pid_nr(current)); else - pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n", - disk_super->fsid, devid, found_transid, path); + pr_info( + "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", + disk_super->fsid, devid, found_transid, path, + current->comm, task_pid_nr(current)); } else if (!device->name || strcmp(device->name->str, path)) { /* @@ -897,8 +1022,11 @@ * it back. We need it to pick the disk with largest generation * (as above). */ - if (!fs_devices->opened) + if (!fs_devices->opened) { device->generation = found_transid; + fs_devices->latest_generation = max_t(u64, found_transid, + fs_devices->latest_generation); + } fs_devices->total_devices = btrfs_super_num_devices(disk_super); @@ -911,22 +1039,25 @@ struct btrfs_fs_devices *fs_devices; struct btrfs_device *device; struct btrfs_device *orig_dev; + int ret = 0; - fs_devices = alloc_fs_devices(orig->fsid); + lockdep_assert_held(&uuid_mutex); + + fs_devices = alloc_fs_devices(orig->fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; - mutex_lock(&orig->device_list_mutex); fs_devices->total_devices = orig->total_devices; - /* We have held the volume lock, it is safe to get the devices. */ list_for_each_entry(orig_dev, &orig->devices, dev_list) { struct rcu_string *name; device = btrfs_alloc_device(NULL, &orig_dev->devid, orig_dev->uuid); - if (IS_ERR(device)) + if (IS_ERR(device)) { + ret = PTR_ERR(device); goto error; + } /* * This is ok to do without rcu read locked because we hold the @@ -937,6 +1068,7 @@ GFP_KERNEL); if (!name) { btrfs_free_device(device); + ret = -ENOMEM; goto error; } rcu_assign_pointer(device->name, name); @@ -946,36 +1078,27 @@ device->fs_devices = fs_devices; fs_devices->num_devices++; } - mutex_unlock(&orig->device_list_mutex); return fs_devices; error: - mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } -/* - * After we have read the system tree and know devids belonging to - * this filesystem, remove the device which does not belong there. - */ -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, + int step, struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; - struct btrfs_device *latest_dev = NULL; - mutex_lock(&uuid_mutex); -again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state)) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state) && + &device->dev_state) && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) && - (!latest_dev || - device->generation > latest_dev->generation)) { - latest_dev = device; + (!*latest_dev || + device->generation > (*latest_dev)->generation)) { + *latest_dev = device; } continue; } @@ -1002,22 +1125,26 @@ btrfs_free_device(device); } - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } +} + +/* + * After we have read the system tree and know devids belonging to this + * filesystem, remove the device which does not belong there. + */ +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +{ + struct btrfs_device *latest_dev = NULL; + struct btrfs_fs_devices *seed_dev; + + mutex_lock(&uuid_mutex); + __btrfs_free_extra_devids(fs_devices, step, &latest_dev); + + list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) + __btrfs_free_extra_devids(seed_dev, step, &latest_dev); fs_devices->latest_bdev = latest_dev->bdev; mutex_unlock(&uuid_mutex); -} - -static void free_device_rcu(struct rcu_head *head) -{ - struct btrfs_device *device; - - device = container_of(head, struct btrfs_device, rcu); - btrfs_free_device(device); } static void btrfs_close_bdev(struct btrfs_device *device) @@ -1036,11 +1163,6 @@ static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1057,65 +1179,85 @@ } btrfs_close_bdev(device); - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); + if (device->bdev) { + fs_devices->open_devices--; + device->bdev = NULL; } + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; + device->fs_info = NULL; + atomic_set(&device->dev_stats_ccnt, 0); + extent_io_tree_release(&device->alloc_state); - call_rcu(&device->rcu, free_device_rcu); + /* + * Reset the flush error record. We might have a transient flush error + * in this mount, and if so we aborted the current transaction and set + * the fs to an error state, guaranteeing no super blocks can be further + * committed. However that error might be transient and if we unmount the + * filesystem and mount it again, we should allow the mount to succeed + * (btrfs_check_rw_degradable() should not fail) - if after mounting the + * filesystem again we still get flush errors, then we will again abort + * any transaction and set the error state, guaranteeing no commits of + * unsafe super blocks. + */ + device->last_flush_error = 0; + + /* Verify the device is back in a pristine state */ + ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); + ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + ASSERT(list_empty(&device->dev_alloc_list)); + ASSERT(list_empty(&device->post_commit_list)); + ASSERT(atomic_read(&device->reada_in_flight) == 0); } -static int close_fs_devices(struct btrfs_fs_devices *fs_devices) +static void close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; - if (--fs_devices->opened > 0) - return 0; + lockdep_assert_held(&uuid_mutex); - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { + if (--fs_devices->opened > 0) + return; + + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) btrfs_close_one_device(device); - } - mutex_unlock(&fs_devices->device_list_mutex); WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; - fs_devices->seeding = 0; - - return 0; + fs_devices->seeding = false; + fs_devices->fs_info = NULL; } -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_fs_devices *seed_devices = NULL; - int ret; + LIST_HEAD(list); + struct btrfs_fs_devices *tmp; mutex_lock(&uuid_mutex); - ret = close_fs_devices(fs_devices); + close_fs_devices(fs_devices); if (!fs_devices->opened) { - seed_devices = fs_devices->seed; - fs_devices->seed = NULL; - } - mutex_unlock(&uuid_mutex); + list_splice_init(&fs_devices->seed_list, &list); - while (seed_devices) { - fs_devices = seed_devices; - seed_devices = fs_devices->seed; + /* + * If the struct btrfs_fs_devices is not assembled with any + * other device, it can be re-initialized during the next mount + * without the needing device-scan step. Therefore, it can be + * fully freed. + */ + if (fs_devices->num_devices == 1) { + list_del(&fs_devices->fs_list); + free_fs_devices(fs_devices); + } + } + + + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); + list_del(&fs_devices->seed_list); free_fs_devices(fs_devices); } - return ret; + mutex_unlock(&uuid_mutex); } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, @@ -1123,28 +1265,33 @@ { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; - int ret = 0; + struct btrfs_device *tmp_device; flags |= FMODE_EXCL; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - /* Just open everything we can; ignore failures here */ - if (btrfs_open_one_device(fs_devices, device, flags, holder)) - continue; + list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, + dev_list) { + int ret; - if (!latest_dev || - device->generation > latest_dev->generation) + ret = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret == 0 && + (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; + } else if (ret == -ENODATA) { + fs_devices->num_devices--; + list_del(&device->dev_list); + btrfs_free_device(device); + } } - if (fs_devices->open_devices == 0) { - ret = -EINVAL; - goto out; - } + if (fs_devices->open_devices == 0) + return -EINVAL; + fs_devices->opened = 1; fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; -out: - return ret; + fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; + + return 0; } static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -1186,55 +1333,66 @@ return ret; } -static void btrfs_release_disk_super(struct page *page) +void btrfs_release_disk_super(struct btrfs_super_block *super) { - kunmap(page); + struct page *page = virt_to_page(super); + put_page(page); } -static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, - struct page **page, - struct btrfs_super_block **disk_super) +static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, + u64 bytenr) { + struct btrfs_super_block *disk_super; + struct page *page; void *p; pgoff_t index; /* make sure our super fits in the device */ if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) - return 1; + return ERR_PTR(-EINVAL); /* make sure our super fits in the page */ - if (sizeof(**disk_super) > PAGE_SIZE) - return 1; + if (sizeof(*disk_super) > PAGE_SIZE) + return ERR_PTR(-EINVAL); /* make sure our super doesn't straddle pages on disk */ index = bytenr >> PAGE_SHIFT; - if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) - return 1; + if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) + return ERR_PTR(-EINVAL); /* pull in the page with our super */ - *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, - index, GFP_KERNEL); + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); - if (IS_ERR_OR_NULL(*page)) - return 1; + if (IS_ERR(page)) + return ERR_CAST(page); - p = kmap(*page); + p = page_address(page); /* align our pointer to the offset of the super block */ - *disk_super = p + (bytenr & ~PAGE_MASK); + disk_super = p + offset_in_page(bytenr); - if (btrfs_super_bytenr(*disk_super) != bytenr || - btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { - btrfs_release_disk_super(*page); - return 1; + if (btrfs_super_bytenr(disk_super) != bytenr || + btrfs_super_magic(disk_super) != BTRFS_MAGIC) { + btrfs_release_disk_super(p); + return ERR_PTR(-EINVAL); } - if ((*disk_super)->label[0] && - (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) - (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; + if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; - return 0; + return disk_super; +} + +int btrfs_forget_devices(const char *path) +{ + int ret; + + mutex_lock(&uuid_mutex); + ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); + mutex_unlock(&uuid_mutex); + + return ret; } /* @@ -1249,7 +1407,6 @@ bool new_device_added = false; struct btrfs_device *device = NULL; struct block_device *bdev; - struct page *page; u64 bytenr; lockdep_assert_held(&uuid_mutex); @@ -1261,14 +1418,24 @@ * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ bytenr = btrfs_sb_offset(0); - flags |= FMODE_EXCL; + /* + * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may + * initiate the device scan which may race with the user's mount + * or mkfs command, resulting in failure. + * Since the device scan is solely for reading purposes, there is + * no need for FMODE_EXCL. Additionally, the devices are read again + * during the mount process. It is ok to get some inconsistent + * values temporarily, as the device paths of the fsid are the only + * required information for assembling the volume. + */ bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) return ERR_CAST(bdev); - if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { - device = ERR_PTR(-EINVAL); + disk_super = btrfs_read_disk_super(bdev, bytenr); + if (IS_ERR(disk_super)) { + device = ERR_CAST(disk_super); goto error_bdev_put; } @@ -1278,7 +1445,7 @@ btrfs_free_stale_devices(path, device); } - btrfs_release_disk_super(page); + btrfs_release_disk_super(disk_super); error_bdev_put: blkdev_put(bdev, flags); @@ -1286,60 +1453,84 @@ return device; } -static int contains_pending_extent(struct btrfs_transaction *transaction, - struct btrfs_device *device, - u64 *start, u64 len) +/* + * Try to find a chunk that intersects [start, start + len] range and when one + * such is found, record the end of it in *start + */ +static bool contains_pending_extent(struct btrfs_device *device, u64 *start, + u64 len) { - struct btrfs_fs_info *fs_info = device->fs_info; - struct extent_map *em; - struct list_head *search_list = &fs_info->pinned_chunks; - int ret = 0; - u64 physical_start = *start; + u64 physical_start, physical_end; - if (transaction) - search_list = &transaction->pending_chunks; -again: - list_for_each_entry(em, search_list, list) { - struct map_lookup *map; - int i; + lockdep_assert_held(&device->fs_info->chunk_mutex); - map = em->map_lookup; - for (i = 0; i < map->num_stripes; i++) { - u64 end; + if (!find_first_extent_bit(&device->alloc_state, *start, + &physical_start, &physical_end, + CHUNK_ALLOCATED, NULL)) { - if (map->stripes[i].dev != device) - continue; - if (map->stripes[i].physical >= physical_start + len || - map->stripes[i].physical + em->orig_block_len <= - physical_start) - continue; - /* - * Make sure that while processing the pinned list we do - * not override our *start with a lower value, because - * we can have pinned chunks that fall within this - * device hole and that have lower physical addresses - * than the pending chunks we processed before. If we - * do not take this special care we can end up getting - * 2 pending chunks that start at the same physical - * device offsets because the end offset of a pinned - * chunk can be equal to the start offset of some - * pending chunk. - */ - end = map->stripes[i].physical + em->orig_block_len; - if (end > *start) { - *start = end; - ret = 1; - } + if (in_range(physical_start, *start, len) || + in_range(*start, physical_start, + physical_end - physical_start)) { + *start = physical_end + 1; + return true; } } - if (search_list != &fs_info->pinned_chunks) { - search_list = &fs_info->pinned_chunks; - goto again; - } - - return ret; + return false; } +static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) +{ + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* + * We don't want to overwrite the superblock on the drive nor + * any area used by the boot loader (grub for example), so we + * make sure to start at an offset of at least 1MB. + */ + return max_t(u64, start, SZ_1M); + default: + BUG(); + } +} + +/** + * dev_extent_hole_check - check if specified hole is suitable for allocation + * @device: the device which we have the hole + * @hole_start: starting position of the hole + * @hole_size: the size of the hole + * @num_bytes: the size of the free space that we need + * + * This function may modify @hole_start and @hole_end to reflect the suitable + * position for allocation. Returns 1 if hole position is updated, 0 otherwise. + */ +static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, + u64 *hole_size, u64 num_bytes) +{ + bool changed = false; + u64 hole_end = *hole_start + *hole_size; + + /* + * Check before we set max_hole_start, otherwise we could end up + * sending back this offset anyway. + */ + if (contains_pending_extent(device, hole_start, *hole_size)) { + if (hole_end >= *hole_start) + *hole_size = hole_end - *hole_start; + else + *hole_size = 0; + changed = true; + } + + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* No extra check */ + break; + default: + BUG(); + } + + return changed; +} /* * find_free_dev_extent_start - find free space in the specified device @@ -1361,10 +1552,16 @@ * @len is used to store the size of the free space that we find. * But if we don't find suitable free space, it is used to store the size of * the max free space. + * + * NOTE: This function will search *commit* root of device tree, and does extra + * check to ensure dev extents are not double allocated. + * This makes the function safe to allocate dev extents but may not report + * correct usable device space, as device extent freed in current transaction + * is not reported as avaiable. */ -int find_free_dev_extent_start(struct btrfs_transaction *transaction, - struct btrfs_device *device, u64 num_bytes, - u64 search_start, u64 *start, u64 *len) +static int find_free_dev_extent_start(struct btrfs_device *device, + u64 num_bytes, u64 search_start, u64 *start, + u64 *len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; @@ -1380,12 +1577,7 @@ int slot; struct extent_buffer *l; - /* - * We don't want to overwrite the superblock on the drive nor any area - * used by the boot loader (grub for example), so we make sure to start - * at an offset of at least 1MB. - */ - search_start = max_t(u64, search_start, SZ_1M); + search_start = dev_extent_search_start(device, search_start); path = btrfs_alloc_path(); if (!path) @@ -1418,7 +1610,7 @@ goto out; } - while (1) { + while (search_start < search_end) { l = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(l)) { @@ -1441,23 +1633,13 @@ if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; + if (key.offset > search_end) + break; + if (key.offset > search_start) { hole_size = key.offset - search_start; - - /* - * Have to check before we set max_hole_start, otherwise - * we could end up sending back this offset anyway. - */ - if (contains_pending_extent(transaction, device, - &search_start, - hole_size)) { - if (key.offset >= search_start) { - hole_size = key.offset - search_start; - } else { - WARN_ON_ONCE(1); - hole_size = 0; - } - } + dev_extent_hole_check(device, &search_start, &hole_size, + num_bytes); if (hole_size > max_hole_size) { max_hole_start = search_start; @@ -1496,9 +1678,8 @@ */ if (search_end > search_start) { hole_size = search_end - search_start; - - if (contains_pending_extent(transaction, device, &search_start, - hole_size)) { + if (dev_extent_hole_check(device, &search_start, &hole_size, + num_bytes)) { btrfs_release_path(path); goto again; } @@ -1515,6 +1696,7 @@ else ret = 0; + ASSERT(max_hole_start + max_hole_size <= search_end); out: btrfs_free_path(path); *start = max_hole_start; @@ -1523,13 +1705,11 @@ return ret; } -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { /* FIXME use last free of some kind */ - return find_free_dev_extent_start(trans->transaction, device, - num_bytes, 0, start, len); + return find_free_dev_extent_start(device, num_bytes, 0, start, len); } static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, @@ -1640,9 +1820,9 @@ struct rb_node *n; u64 ret = 0; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; read_lock(&em_tree->lock); - n = rb_last(&em_tree->map); + n = rb_last(&em_tree->map.rb_root); if (n) { em = rb_entry(n, struct extent_map, rb_node); ret = em->start + em->len; @@ -1672,7 +1852,12 @@ if (ret < 0) goto error; - BUG_ON(ret == 0); /* Corruption */ + if (ret == 0) { + /* Corruption */ + btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); + ret = -EUCLEAN; + goto error; + } ret = btrfs_previous_item(fs_info->chunk_root, path, BTRFS_DEV_ITEMS_OBJECTID, @@ -1738,7 +1923,8 @@ ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ptr = btrfs_device_fsid(dev_item); - write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE); + write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, + ptr, BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(leaf); ret = 0; @@ -1750,22 +1936,27 @@ /* * Function to update ctime/mtime for a given device path. * Mainly used for ctime/mtime based probe like libblkid. + * + * We don't care about errors here, this is just to be kind to userspace. */ -static void update_dev_time(const char *path_name) +static void update_dev_time(const char *device_path) { - struct file *filp; + struct path path; + struct timespec64 now; + int ret; - filp = filp_open(path_name, O_RDWR, 0); - if (IS_ERR(filp)) + ret = kern_path(device_path, LOOKUP_FOLLOW, &path); + if (ret) return; - file_update_time(filp); - filp_close(filp, NULL); + + now = current_time(d_inode(path.dentry)); + inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); + path_put(&path); } -static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, - struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_device *device) { - struct btrfs_root *root = fs_info->chunk_root; + struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; @@ -1862,17 +2053,14 @@ * where this function called, there should be always be another device (or * this_dev) which is active. */ -void btrfs_assign_next_active_device(struct btrfs_device *device, - struct btrfs_device *this_dev) +void __cold btrfs_assign_next_active_device(struct btrfs_device *device, + struct btrfs_device *next_device) { struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_device *next_device; - if (this_dev) - next_device = this_dev; - else + if (!next_device) next_device = btrfs_find_next_active_device(fs_info->fs_devices, - device); + device); ASSERT(next_device); if (fs_info->sb->s_bdev && @@ -1883,8 +2071,66 @@ fs_info->fs_devices->latest_bdev = next_device->bdev; } +/* + * Return btrfs_fs_devices::num_devices excluding the device that's being + * currently replaced. + */ +static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) +{ + u64 num_devices = fs_info->fs_devices->num_devices; + + down_read(&fs_info->dev_replace.rwsem); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { + ASSERT(num_devices > 1); + num_devices--; + } + up_read(&fs_info->dev_replace.rwsem); + + return num_devices; +} + +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, + struct block_device *bdev, + const char *device_path) +{ + struct btrfs_super_block *disk_super; + int copy_num; + + if (!bdev) + return; + + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { + struct page *page; + int ret; + + disk_super = btrfs_read_dev_one_super(bdev, copy_num); + if (IS_ERR(disk_super)) + continue; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + + page = virt_to_page(disk_super); + set_page_dirty(page); + lock_page(page); + /* write_on_page() unlocks the page */ + ret = write_one_page(page); + if (ret) + btrfs_warn(fs_info, + "error clearing superblock number %d (%d)", + copy_num, ret); + btrfs_release_disk_super(disk_super); + + } + + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); +} + int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid) + u64 devid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -1892,24 +2138,35 @@ u64 num_devices; int ret = 0; - mutex_lock(&uuid_mutex); - - num_devices = fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - WARN_ON(num_devices < 1); - num_devices--; - } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + /* + * The device list in fs_devices is accessed without locks (neither + * uuid_mutex nor device_list_mutex) as it won't change on a mounted + * filesystem and another device rm cannot run. + */ + num_devices = btrfs_num_devices(fs_info); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) goto out; - ret = btrfs_find_device_by_devspec(fs_info, devid, device_path, - &device); - if (ret) + device = btrfs_find_device_by_devspec(fs_info, devid, device_path); + + if (IS_ERR(device)) { + if (PTR_ERR(device) == -ENOENT && + device_path && strcmp(device_path, "missing") == 0) + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; + else + ret = PTR_ERR(device); goto out; + } + + if (btrfs_pinned_by_swapfile(fs_info, device)) { + btrfs_warn_in_rcu(fs_info, + "cannot remove device %s (devid %llu) due to active swapfile", + rcu_str_deref(device->name), device->devid); + ret = -ETXTBSY; + goto out; + } if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; @@ -1929,9 +2186,9 @@ mutex_unlock(&fs_info->chunk_mutex); } - mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); - mutex_lock(&uuid_mutex); + if (!ret) + btrfs_reada_remove_dev(device); if (ret) goto error_undo; @@ -1940,12 +2197,12 @@ * counter although write_all_supers() is not locked out. This * could give a filesystem state which requires a degraded mount. */ - ret = btrfs_rm_dev_item(fs_info, device); + ret = btrfs_rm_dev_item(device); if (ret) goto error_undo; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - btrfs_scrub_cancel_dev(fs_info, device); + btrfs_scrub_cancel_dev(device); /* * the device list mutex makes sure that we don't change @@ -1980,7 +2237,7 @@ if (device->bdev) { cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_rm_device_link(fs_devices, device); + btrfs_sysfs_remove_device(device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -1993,29 +2250,24 @@ * supers and free the device. */ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) - btrfs_scratch_superblocks(device->bdev, device->name->str); + btrfs_scratch_superblocks(fs_info, device->bdev, + device->name->str); btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device_rcu); + synchronize_rcu(); + btrfs_free_device(device); if (cur_devices->open_devices == 0) { - while (fs_devices) { - if (fs_devices->seed == cur_devices) { - fs_devices->seed = cur_devices->seed; - break; - } - fs_devices = fs_devices->seed; - } - cur_devices->seed = NULL; + list_del_init(&cur_devices->seed_list); close_fs_devices(cur_devices); free_fs_devices(cur_devices); } out: - mutex_unlock(&uuid_mutex); return ret; error_undo: + btrfs_reada_undo_remove_dev(device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, @@ -2053,23 +2305,18 @@ fs_devices->open_devices--; } -void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev) +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { - /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); - } + mutex_lock(&uuid_mutex); btrfs_close_bdev(srcdev); - call_rcu(&srcdev->rcu, free_device_rcu); + synchronize_rcu(); + btrfs_free_device(srcdev); /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { - struct btrfs_fs_devices *tmp_fs_devices; - /* * On a mounted FS, num_devices can't be zero unless it's a * seed. In case of a seed device being replaced, the replace @@ -2078,28 +2325,20 @@ */ ASSERT(fs_devices->seeding); - tmp_fs_devices = fs_info->fs_devices; - while (tmp_fs_devices) { - if (tmp_fs_devices->seed == fs_devices) { - tmp_fs_devices->seed = fs_devices->seed; - break; - } - tmp_fs_devices = tmp_fs_devices->seed; - } - fs_devices->seed = NULL; + list_del_init(&fs_devices->seed_list); close_fs_devices(fs_devices); free_fs_devices(fs_devices); } + mutex_unlock(&uuid_mutex); } void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) { struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; - WARN_ON(!tgtdev); mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_rm_device_link(fs_devices, tgtdev); + btrfs_sysfs_remove_device(tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; @@ -2119,90 +2358,77 @@ * is already out of device list, so we don't have to hold * the device_list_mutex lock. */ - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, + tgtdev->name->str); btrfs_close_bdev(tgtdev); - call_rcu(&tgtdev->rcu, free_device_rcu); + synchronize_rcu(); + btrfs_free_device(tgtdev); } -static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device **device) +static struct btrfs_device *btrfs_find_device_by_path( + struct btrfs_fs_info *fs_info, const char *device_path) { int ret = 0; struct btrfs_super_block *disk_super; u64 devid; u8 *dev_uuid; struct block_device *bdev; - struct buffer_head *bh; + struct btrfs_device *device; - *device = NULL; ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, - fs_info->bdev_holder, 0, &bdev, &bh); + fs_info->bdev_holder, 0, &bdev, &disk_super); if (ret) - return ret; - disk_super = (struct btrfs_super_block *)bh->b_data; + return ERR_PTR(ret); + devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - *device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->fsid, true); - brelse(bh); - if (!*device) - ret = -ENOENT; + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, + disk_super->metadata_uuid, true); + else + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, + disk_super->fsid, true); + + btrfs_release_disk_super(disk_super); + if (!device) + device = ERR_PTR(-ENOENT); blkdev_put(bdev, FMODE_READ); - return ret; -} - -int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, - const char *device_path, - struct btrfs_device **device) -{ - *device = NULL; - if (strcmp(device_path, "missing") == 0) { - struct list_head *devices; - struct btrfs_device *tmp; - - devices = &fs_info->fs_devices->devices; - list_for_each_entry(tmp, devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &tmp->dev_state) && !tmp->bdev) { - *device = tmp; - break; - } - } - - if (!*device) - return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; - - return 0; - } else { - return btrfs_find_device_by_path(fs_info, device_path, device); - } + return device; } /* * Lookup a device given by device id, or the path if the id is 0. */ -int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, - const char *devpath, - struct btrfs_device **device) +struct btrfs_device *btrfs_find_device_by_devspec( + struct btrfs_fs_info *fs_info, u64 devid, + const char *device_path) { - int ret; + struct btrfs_device *device; if (devid) { - ret = 0; - *device = btrfs_find_device(fs_info->fs_devices, devid, - NULL, NULL, true); - if (!*device) - ret = -ENOENT; - } else { - if (!devpath || !devpath[0]) - return -EINVAL; - - ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, - device); + device = btrfs_find_device(fs_info->fs_devices, devid, NULL, + NULL, true); + if (!device) + return ERR_PTR(-ENOENT); + return device; } - return ret; + + if (!device_path || !device_path[0]) + return ERR_PTR(-EINVAL); + + if (strcmp(device_path, "missing") == 0) { + /* Find first missing device */ + list_for_each_entry(device, &fs_info->fs_devices->devices, + dev_list) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) && !device->bdev) + return device; + } + return ERR_PTR(-ENOENT); + } + + return btrfs_find_device_by_path(fs_info, device_path); } /* @@ -2221,10 +2447,20 @@ if (!fs_devices->seeding) return -EINVAL; - seed_devices = alloc_fs_devices(NULL); + /* + * Private copy of the seed devices, anchored at + * fs_info->fs_devices->seed_list + */ + seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); + /* + * It's necessary to retain a copy of the original seed fs_devices in + * fs_uuids so that filesystems which have been seeded can successfully + * reference the seed device from open_seed_devices. This also supports + * multiple fs seed. + */ old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); @@ -2245,19 +2481,15 @@ list_for_each_entry(device, &seed_devices->devices, dev_list) device->fs_devices = seed_devices; - mutex_lock(&fs_info->chunk_mutex); - list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - mutex_unlock(&fs_info->chunk_mutex); - - fs_devices->seeding = 0; + fs_devices->seeding = false; fs_devices->num_devices = 0; fs_devices->open_devices = 0; fs_devices->missing_devices = 0; - fs_devices->rotating = 0; - fs_devices->seed = seed_devices; + fs_devices->rotating = false; + list_add(&seed_devices->seed_list, &fs_devices->seed_list); generate_random_uuid(fs_devices->fsid); - memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); mutex_unlock(&fs_devices->device_list_mutex); @@ -2271,9 +2503,9 @@ /* * Store the expected generation for seed devices in device items. */ -static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -2357,7 +2589,7 @@ u64 orig_super_num_devices; int seeding_dev = 0; int ret = 0; - bool unlocked = false; + bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; @@ -2371,20 +2603,20 @@ seeding_dev = 1; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); + locked = true; } - filemap_write_and_wait(bdev->bd_inode->i_mapping); + sync_blockdev(bdev); - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; - mutex_unlock( - &fs_devices->device_list_mutex); + rcu_read_unlock(); goto error; } } - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); device = btrfs_alloc_device(fs_info, NULL, NULL); if (IS_ERR(device)) { @@ -2448,7 +2680,7 @@ atomic64_add(device->total_bytes, &fs_info->free_chunk_space); if (!blk_queue_nonrot(q)) - fs_devices->rotating = 1; + fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); btrfs_set_super_total_bytes(fs_info->super_copy, @@ -2468,13 +2700,13 @@ mutex_unlock(&fs_info->chunk_mutex); /* Add sysfs device entry */ - btrfs_sysfs_add_device_link(fs_devices, device); + btrfs_sysfs_add_device(device); mutex_unlock(&fs_devices->device_list_mutex); if (seeding_dev) { mutex_lock(&fs_info->chunk_mutex); - ret = init_first_rw_device(trans, fs_info); + ret = init_first_rw_device(trans); mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2489,22 +2721,17 @@ } if (seeding_dev) { - char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; - - ret = btrfs_finish_sprout(trans, fs_info); + ret = btrfs_finish_sprout(trans); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; } - /* Sprouting would change fsid of the mounted root, - * so rename the fsid on the sysfs + /* + * fs_devices now represents the newly sprouted filesystem and + * its fsid has been changed by btrfs_prepare_sprout */ - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", - fs_info->fsid); - if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) - btrfs_warn(fs_info, - "sysfs: failed to create fsid for sprout"); + btrfs_sysfs_update_sprout_fsid(fs_devices); } ret = btrfs_commit_transaction(trans); @@ -2512,7 +2739,7 @@ if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); - unlocked = true; + locked = false; if (ret) /* transaction commit */ return ret; @@ -2532,12 +2759,22 @@ ret = btrfs_commit_transaction(trans); } - /* Update ctime/mtime for libblkid */ + /* + * Now that we have written a new super block to this device, check all + * other fs_devices list if device_path alienates any other scanned + * device. + * We can ignore the return value as it typically returns -EINVAL and + * only succeeds if the device was an alien. + */ + btrfs_forget_devices(device_path); + + /* Update ctime/mtime for blkid or udev */ update_dev_time(device_path); + return ret; error_sysfs: - btrfs_sysfs_rm_device_link(fs_devices, device); + btrfs_sysfs_remove_device(device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); @@ -2563,7 +2800,7 @@ btrfs_free_device(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev && !unlocked) { + if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } @@ -2621,7 +2858,6 @@ { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_super_block *super_copy = fs_info->super_copy; - struct btrfs_fs_devices *fs_devices; u64 old_total; u64 diff; @@ -2640,8 +2876,6 @@ return -EINVAL; } - fs_devices = fs_info->fs_devices; - btrfs_set_super_total_bytes(super_copy, round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; @@ -2649,9 +2883,9 @@ btrfs_device_set_total_bytes(device, new_size); btrfs_device_set_disk_total_bytes(device, new_size); btrfs_clear_space_info_full(device->fs_info); - if (list_empty(&device->resized_list)) - list_add_tail(&device->resized_list, - &fs_devices->resized_devices); + if (list_empty(&device->post_commit_list)) + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); mutex_unlock(&fs_info->chunk_mutex); return btrfs_update_device(trans, device); @@ -2739,13 +2973,20 @@ return ret; } -static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) +/* + * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. + * @logical: Logical block offset in bytes. + * @length: Length of extent in bytes. + * + * Return: Chunk mapping or ERR_PTR. + */ +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { struct extent_map_tree *em_tree; struct extent_map *em; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, length); read_unlock(&em_tree->lock); @@ -2777,7 +3018,7 @@ int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em = get_chunk_map(fs_info, chunk_offset, 1); + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) { /* * This is a logic error, but we don't want to just rely on the @@ -2818,13 +3059,11 @@ mutex_unlock(&fs_info->chunk_mutex); } - if (map->stripes[i].dev) { - ret = btrfs_update_device(trans, map->stripes[i].dev); - if (ret) { - mutex_unlock(&fs_devices->device_list_mutex); - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = btrfs_update_device(trans, device); + if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); + goto out; } } mutex_unlock(&fs_devices->device_list_mutex); @@ -2861,6 +3100,7 @@ { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; + struct btrfs_block_group *block_group; int ret; /* @@ -2877,10 +3117,6 @@ */ lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); - ret = btrfs_can_relocate(fs_info, chunk_offset); - if (ret) - return -ENOSPC; - /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); ret = btrfs_relocate_block_group(fs_info, chunk_offset); @@ -2888,15 +3124,11 @@ if (ret) return ret; - /* - * We add the kobjects here (and after forcing data chunk creation) - * since relocation is the only place we'll create chunks of a new - * type at runtime. The only place where we'll remove the last - * chunk of a type is the call immediately below this one. Even - * so, we're protected against races with the cleaner thread since - * we're covered by the delete_unused_bgs_mutex. - */ - btrfs_add_raid_kobjects(fs_info); + block_group = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!block_group) + return -ENOENT; + btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + btrfs_put_block_group(block_group); trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); @@ -2997,7 +3229,7 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; u64 bytes_used; u64 chunk_type; @@ -3006,30 +3238,28 @@ chunk_type = cache->flags; btrfs_put_block_group(cache); - if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { - spin_lock(&fs_info->data_sinfo->lock); - bytes_used = fs_info->data_sinfo->bytes_used; - spin_unlock(&fs_info->data_sinfo->lock); + if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) + return 0; - if (!bytes_used) { - struct btrfs_trans_handle *trans; - int ret; + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); - trans = btrfs_join_transaction(fs_info->tree_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (!bytes_used) { + struct btrfs_trans_handle *trans; + int ret; - ret = btrfs_force_chunk_alloc(trans, - BTRFS_BLOCK_GROUP_DATA); - btrfs_end_transaction(trans); - if (ret < 0) - return ret; + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); - btrfs_add_raid_kobjects(fs_info); - - return 1; - } + ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); + btrfs_end_transaction(trans); + if (ret < 0) + return ret; + return 1; } + return 0; } @@ -3099,7 +3329,7 @@ if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); + trans = btrfs_start_transaction_fallback_global_rsv(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); @@ -3208,28 +3438,28 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; u64 chunk_used; u64 user_thresh_min; u64 user_thresh_max; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); - chunk_used = btrfs_block_group_used(&cache->item); + chunk_used = cache->used; if (bargs->usage_min == 0) user_thresh_min = 0; else - user_thresh_min = div_factor_fine(cache->key.offset, - bargs->usage_min); + user_thresh_min = div_factor_fine(cache->length, + bargs->usage_min); if (bargs->usage_max == 0) user_thresh_max = 1; else if (bargs->usage_max > 100) - user_thresh_max = cache->key.offset; + user_thresh_max = cache->length; else - user_thresh_max = div_factor_fine(cache->key.offset, - bargs->usage_max); + user_thresh_max = div_factor_fine(cache->length, + bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) ret = 0; @@ -3241,20 +3471,19 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; u64 chunk_used, user_thresh; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); - chunk_used = btrfs_block_group_used(&cache->item); + chunk_used = cache->used; if (bargs->usage_min == 0) user_thresh = 1; else if (bargs->usage > 100) - user_thresh = cache->key.offset; + user_thresh = cache->length; else - user_thresh = div_factor_fine(cache->key.offset, - bargs->usage); + user_thresh = div_factor_fine(cache->length, bargs->usage); if (chunk_used < user_thresh) ret = 0; @@ -3280,6 +3509,18 @@ return 1; } +static u64 calc_data_stripes(u64 type, int num_stripes) +{ + const int index = btrfs_bg_flags_to_raid_index(type); + const int ncopies = btrfs_raid_array[index].ncopies; + const int nparity = btrfs_raid_array[index].nparity; + + if (nparity) + return num_stripes - nparity; + else + return num_stripes / ncopies; +} + /* [pstart, pend) */ static int chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, @@ -3289,22 +3530,15 @@ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); u64 stripe_offset; u64 stripe_length; + u64 type; int factor; int i; if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) return 0; - if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { - factor = num_stripes / 2; - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { - factor = num_stripes - 1; - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { - factor = num_stripes - 2; - } else { - factor = num_stripes; - } + type = btrfs_chunk_type(leaf, chunk); + factor = calc_data_stripes(type, num_stripes); for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); @@ -3365,10 +3599,10 @@ return 0; } -static int should_balance_chunk(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, +static int should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_balance_args *bargs = NULL; u64 chunk_type = btrfs_chunk_type(leaf, chunk); @@ -3458,17 +3692,11 @@ { struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_root *dev_root = fs_info->dev_root; - struct list_head *devices; - struct btrfs_device *device; - u64 old_size; - u64 size_to_free; u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_trans_handle *trans; struct extent_buffer *leaf; int slot; int ret; @@ -3483,53 +3711,6 @@ u32 count_sys = 0; int chunk_reserved = 0; - /* step one make some room on all the devices */ - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { - old_size = btrfs_device_get_total_bytes(device); - size_to_free = div_factor(old_size, 1); - size_to_free = min_t(u64, size_to_free, SZ_1M); - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || - btrfs_device_get_total_bytes(device) - - btrfs_device_get_bytes_used(device) > size_to_free || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) - continue; - - ret = btrfs_shrink_device(device, old_size - size_to_free); - if (ret == -ENOSPC) - break; - if (ret) { - /* btrfs_shrink_device never returns ret > 0 */ - WARN_ON(ret > 0); - goto error; - } - - trans = btrfs_start_transaction(dev_root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - btrfs_info_in_rcu(fs_info, - "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", - rcu_str_deref(device->name), ret, - old_size, old_size - size_to_free); - goto error; - } - - ret = btrfs_grow_device(trans, device, old_size); - if (ret) { - btrfs_end_transaction(trans); - /* btrfs_grow_device never returns ret > 0 */ - WARN_ON(ret > 0); - btrfs_info_in_rcu(fs_info, - "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", - rcu_str_deref(device->name), ret, - old_size, old_size - size_to_free); - goto error; - } - - btrfs_end_transaction(trans); - } - - /* step two, relocate all the chunks */ path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -3601,8 +3782,7 @@ spin_unlock(&fs_info->balance_lock); } - ret = should_balance_chunk(fs_info, leaf, chunk, - found_key.offset); + ret = should_balance_chunk(leaf, chunk, found_key.offset); btrfs_release_path(path); if (!ret) { @@ -3659,10 +3839,15 @@ ret = btrfs_relocate_chunk(fs_info, found_key.offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret && ret != -ENOSPC) - goto error; if (ret == -ENOSPC) { enospc_errors++; + } else if (ret == -ETXTBSY) { + btrfs_info(fs_info, + "skipping relocation of block group %llu due to active swapfile", + found_key.offset); + ret = 0; + } else if (ret) { + goto error; } else { spin_lock(&fs_info->balance_lock); bctl->stat.completed++; @@ -3711,8 +3896,7 @@ if (flags == 0) return !extended; /* "0" is valid for usual profiles */ - /* true if exactly one bit set */ - return (flags & (flags - 1)) == 0; + return has_single_bit_set(flags); } static inline int balance_need_close(struct btrfs_fs_info *fs_info) @@ -3723,13 +3907,179 @@ atomic_read(&fs_info->balance_cancel_req) == 0); } -/* Non-zero return value signifies invalidity */ -static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, - u64 allowed) +/* + * Validate target profile against allowed profiles and return true if it's OK. + * Otherwise print the error message and return false. + */ +static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, + const struct btrfs_balance_args *bargs, + u64 allowed, const char *type) { - return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl_arg->target, 1) || - (bctl_arg->target & ~allowed))); + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return true; + + /* Profile is valid and does not have bits outside of the allowed set */ + if (alloc_profile_is_valid(bargs->target, 1) && + (bargs->target & ~allowed) == 0) + return true; + + btrfs_err(fs_info, "balance: invalid convert %s profile %s", + type, btrfs_bg_type_to_raid_name(bargs->target)); + return false; +} + +/* + * Fill @buf with textual description of balance filter flags @bargs, up to + * @size_buf including the terminating null. The output may be trimmed if it + * does not fit into the provided buffer. + */ +static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, + u32 size_buf) +{ + int ret; + u32 size_bp = size_buf; + char *bp = buf; + u64 flags = bargs->flags; + char tmp_buf[128] = {'\0'}; + + if (!flags) + return; + +#define CHECK_APPEND_NOARG(a) \ + do { \ + ret = snprintf(bp, size_bp, (a)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_2ARG(a, v1, v2) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (flags & BTRFS_BALANCE_ARGS_CONVERT) + CHECK_APPEND_1ARG("convert=%s,", + btrfs_bg_type_to_raid_name(bargs->target)); + + if (flags & BTRFS_BALANCE_ARGS_SOFT) + CHECK_APPEND_NOARG("soft,"); + + if (flags & BTRFS_BALANCE_ARGS_PROFILES) { + btrfs_describe_block_groups(bargs->profiles, tmp_buf, + sizeof(tmp_buf)); + CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); + } + + if (flags & BTRFS_BALANCE_ARGS_USAGE) + CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); + + if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) + CHECK_APPEND_2ARG("usage=%u..%u,", + bargs->usage_min, bargs->usage_max); + + if (flags & BTRFS_BALANCE_ARGS_DEVID) + CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); + + if (flags & BTRFS_BALANCE_ARGS_DRANGE) + CHECK_APPEND_2ARG("drange=%llu..%llu,", + bargs->pstart, bargs->pend); + + if (flags & BTRFS_BALANCE_ARGS_VRANGE) + CHECK_APPEND_2ARG("vrange=%llu..%llu,", + bargs->vstart, bargs->vend); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT) + CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) + CHECK_APPEND_2ARG("limit=%u..%u,", + bargs->limit_min, bargs->limit_max); + + if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) + CHECK_APPEND_2ARG("stripes=%u..%u,", + bargs->stripes_min, bargs->stripes_max); + +#undef CHECK_APPEND_2ARG +#undef CHECK_APPEND_1ARG +#undef CHECK_APPEND_NOARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ + else + buf[0] = '\0'; +} + +static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) +{ + u32 size_buf = 1024; + char tmp_buf[192] = {'\0'}; + char *buf; + char *bp; + u32 size_bp = size_buf; + int ret; + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + buf = kzalloc(size_buf, GFP_KERNEL); + if (!buf) + return; + + bp = buf; + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (bctl->flags & BTRFS_BALANCE_FORCE) + CHECK_APPEND_1ARG("%s", "-f "); + + if (bctl->flags & BTRFS_BALANCE_DATA) { + describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-d%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_METADATA) { + describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-m%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_SYSTEM) { + describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-s%s ", tmp_buf); + } + +#undef CHECK_APPEND_1ARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ + btrfs_info(fs_info, "balance: %s %s", + (bctl->flags & BTRFS_BALANCE_RESUME) ? + "resume" : "start", buf); + + kfree(buf); } /* @@ -3745,11 +4095,12 @@ int ret; u64 num_devices; unsigned seq; - bool reducing_integrity; + bool reducing_redundancy; + int i; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || - atomic_read(&fs_info->balance_cancel_req)) { + btrfs_should_cancel_balance(fs_info)) { ret = -EINVAL; goto out; } @@ -3774,54 +4125,39 @@ } } - num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - BUG_ON(num_devices < 1); - num_devices--; - } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; - if (num_devices > 1) - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - if (num_devices > 2) - allowed |= BTRFS_BLOCK_GROUP_RAID5; - if (num_devices > 3) - allowed |= (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID6); - if (validate_convert_profile(&bctl->data, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->data.target); + /* + * rw_devices will not change at the moment, device add/delete/replace + * are exclusive + */ + num_devices = fs_info->fs_devices->rw_devices; - btrfs_err(fs_info, - "balance: invalid convert data profile %s", - get_raid_name(index)); - ret = -EINVAL; - goto out; - } - if (validate_convert_profile(&bctl->meta, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); + /* + * SINGLE profile on-disk has no profile bit, but in-memory we have a + * special bit for it, to make it easier to distinguish. Thus we need + * to set it manually, or balance would refuse the profile. + */ + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) + if (num_devices >= btrfs_raid_array[i].devs_min) + allowed |= btrfs_raid_array[i].bg_flag; - btrfs_err(fs_info, - "balance: invalid convert metadata profile %s", - get_raid_name(index)); - ret = -EINVAL; - goto out; - } - if (validate_convert_profile(&bctl->sys, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); - - btrfs_err(fs_info, - "balance: invalid convert system profile %s", - get_raid_name(index)); + if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || + !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || + !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { ret = -EINVAL; goto out; } - /* allow to reduce meta or sys integrity only if force set */ - allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6; + /* + * Allow to reduce metadata or system integrity only if force set for + * profiles with redundancy (copies, parity) + */ + allowed = 0; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { + if (btrfs_raid_array[i].ncopies >= 2 || + btrfs_raid_array[i].tolerated_failures >= 1) + allowed |= btrfs_raid_array[i].bg_flag; + } do { seq = read_seqbegin(&fs_info->profiles_lock); @@ -3831,9 +4167,9 @@ ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && !(bctl->meta.target & allowed))) - reducing_integrity = true; + reducing_redundancy = true; else - reducing_integrity = false; + reducing_redundancy = false; /* if we're not converting, the target field is uninitialized */ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? @@ -3842,13 +4178,13 @@ bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); - if (reducing_integrity) { + if (reducing_redundancy) { if (bctl->flags & BTRFS_BALANCE_FORCE) { btrfs_info(fs_info, - "balance: force reducing metadata integrity"); + "balance: force reducing metadata redundancy"); } else { btrfs_err(fs_info, - "balance: reduces metadata integrity, use --force if you want this"); + "balance: reduces metadata redundancy, use --force if you want this"); ret = -EINVAL; goto out; } @@ -3856,12 +4192,18 @@ if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { - int meta_index = btrfs_bg_flags_to_raid_index(meta_target); - int data_index = btrfs_bg_flags_to_raid_index(data_target); - btrfs_warn(fs_info, "balance: metadata profile %s has lower redundancy than data profile %s", - get_raid_name(meta_index), get_raid_name(data_index)); + btrfs_bg_type_to_raid_name(meta_target), + btrfs_bg_type_to_raid_name(data_target)); + } + + if (fs_info->send_in_progress) { + btrfs_warn_rl(fs_info, +"cannot run balance while send operations are in progress (%d in progress)", + fs_info->send_in_progress); + ret = -EAGAIN; + goto out; } ret = insert_balance_item(fs_info, bctl); @@ -3883,11 +4225,34 @@ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); + describe_balance_start_or_resume(fs_info); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) + btrfs_info(fs_info, "balance: paused"); + /* + * Balance can be canceled by: + * + * - Regular cancel request + * Then ret == -ECANCELED and balance_cancel_req > 0 + * + * - Fatal signal to "btrfs" process + * Either the signal caught by wait_reserve_ticket() and callers + * got -EINTR, or caught by btrfs_should_cancel_balance() and + * got -ECANCELED. + * Either way, in this case balance_cancel_req = 0, and + * ret == -EINTR or ret == -ECANCELED. + * + * So here we only check the return value to catch canceled balance. + */ + else if (ret == -ECANCELED || ret == -EINTR) + btrfs_info(fs_info, "balance: canceled"); + else + btrfs_info(fs_info, "balance: ended with status: %d", ret); + clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); if (bargs) { @@ -3898,7 +4263,7 @@ if ((ret && ret != -ECANCELED && ret != -ENOSPC) || balance_need_close(fs_info)) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } wake_up(&fs_info->balance_wait_q); @@ -3909,7 +4274,7 @@ reset_balance_state(fs_info); else kfree(bctl); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -3919,12 +4284,12 @@ struct btrfs_fs_info *fs_info = data; int ret = 0; + sb_start_write(fs_info->sb); mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - btrfs_info(fs_info, "balance: resuming"); + if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); - } mutex_unlock(&fs_info->balance_mutex); + sb_end_write(fs_info->sb); return ret; } @@ -4013,7 +4378,7 @@ * is in a paused state and must have fs_info::balance_ctl properly * set up. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); @@ -4097,19 +4462,18 @@ if (fs_info->balance_ctl) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); btrfs_info(fs_info, "balance: canceled"); } } - BUG_ON(fs_info->balance_ctl || - test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_cancel_req); mutex_unlock(&fs_info->balance_mutex); return 0; } -static int btrfs_uuid_scan_kthread(void *data) +int btrfs_uuid_scan_kthread(void *data) { struct btrfs_fs_info *fs_info = data; struct btrfs_root *root = fs_info->tree_root; @@ -4121,6 +4485,7 @@ struct btrfs_root_item root_item; u32 item_size; struct btrfs_trans_handle *trans = NULL; + bool closing = false; path = btrfs_alloc_path(); if (!path) { @@ -4133,6 +4498,10 @@ key.offset = 0; while (1) { + if (btrfs_fs_closing(fs_info)) { + closing = true; + break; + } ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret) { @@ -4233,74 +4602,10 @@ btrfs_end_transaction(trans); if (ret) btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); - else + else if (!closing) set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); up(&fs_info->uuid_tree_rescan_sem); return 0; -} - -/* - * Callback for btrfs_uuid_tree_iterate(). - * returns: - * 0 check succeeded, the entry is not outdated. - * < 0 if an error occurred. - * > 0 if the check failed, which means the caller shall remove the entry. - */ -static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, - u8 *uuid, u8 type, u64 subid) -{ - struct btrfs_key key; - int ret = 0; - struct btrfs_root *subvol_root; - - if (type != BTRFS_UUID_KEY_SUBVOL && - type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) - goto out; - - key.objectid = subid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(subvol_root)) { - ret = PTR_ERR(subvol_root); - if (ret == -ENOENT) - ret = 1; - goto out; - } - - switch (type) { - case BTRFS_UUID_KEY_SUBVOL: - if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) - ret = 1; - break; - case BTRFS_UUID_KEY_RECEIVED_SUBVOL: - if (memcmp(uuid, subvol_root->root_item.received_uuid, - BTRFS_UUID_SIZE)) - ret = 1; - break; - } - -out: - return ret; -} - -static int btrfs_uuid_rescan_kthread(void *data) -{ - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; - int ret; - - /* - * 1st step is to iterate through the existing UUID tree and - * to delete all entries that contain outdated data. - * 2nd step is to add all missing entries to the UUID tree. - */ - ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); - if (ret < 0) { - btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); - up(&fs_info->uuid_tree_rescan_sem); - return ret; - } - return btrfs_uuid_scan_kthread(data); } int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) @@ -4319,8 +4624,7 @@ if (IS_ERR(trans)) return PTR_ERR(trans); - uuid_root = btrfs_create_tree(trans, fs_info, - BTRFS_UUID_TREE_OBJECTID); + uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); btrfs_abort_transaction(trans, ret); @@ -4346,22 +4650,6 @@ return 0; } -int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) -{ - struct task_struct *task; - - down(&fs_info->uuid_tree_rescan_sem); - task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); - if (IS_ERR(task)) { - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - btrfs_warn(fs_info, "failed to start uuid_rescan task"); - up(&fs_info->uuid_tree_rescan_sem); - return PTR_ERR(task); - } - - return 0; -} - /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. @@ -4380,15 +4668,16 @@ int slot; int failed = 0; bool retried = false; - bool checked_pending_chunks = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = btrfs_device_get_total_bytes(device); u64 diff; + u64 start; new_size = round_down(new_size, fs_info->sectorsize); + start = new_size; diff = round_down(old_size - new_size, fs_info->sectorsize); if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) @@ -4400,6 +4689,12 @@ path->reada = READA_BACK; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, new_size); @@ -4407,7 +4702,21 @@ device->fs_devices->total_rw_bytes -= diff; atomic64_sub(diff, &fs_info->free_chunk_space); } - mutex_unlock(&fs_info->chunk_mutex); + + /* + * Once the device's size has been set to the new size, ensure all + * in-memory chunks are synced to disk so that the loop below sees them + * and relocates them accordingly. + */ + if (contains_pending_extent(device, &start, diff)) { + mutex_unlock(&fs_info->chunk_mutex); + ret = btrfs_commit_transaction(trans); + if (ret) + goto done; + } else { + mutex_unlock(&fs_info->chunk_mutex); + btrfs_end_transaction(trans); + } again: key.objectid = device->devid; @@ -4469,10 +4778,16 @@ ret = btrfs_relocate_chunk(fs_info, chunk_offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret && ret != -ENOSPC) - goto done; - if (ret == -ENOSPC) + if (ret == -ENOSPC) { failed++; + } else if (ret) { + if (ret == -ETXTBSY) { + btrfs_warn(fs_info, + "could not shrink block group %llu due to active swapfile", + chunk_offset); + } + goto done; + } } while (key.offset-- > 0); if (failed && !retried) { @@ -4492,40 +4807,14 @@ } mutex_lock(&fs_info->chunk_mutex); - - /* - * We checked in the above loop all device extents that were already in - * the device tree. However before we have updated the device's - * total_bytes to the new size, we might have had chunk allocations that - * have not complete yet (new block groups attached to transaction - * handles), and therefore their device extents were not yet in the - * device tree and we missed them in the loop above. So if we have any - * pending chunk using a device extent that overlaps the device range - * that we can not use anymore, commit the current transaction and - * repeat the search on the device tree - this way we guarantee we will - * not have chunks using device extents that end beyond 'new_size'. - */ - if (!checked_pending_chunks) { - u64 start = new_size; - u64 len = old_size - new_size; - - if (contains_pending_extent(trans->transaction, device, - &start, len)) { - mutex_unlock(&fs_info->chunk_mutex); - checked_pending_chunks = true; - failed = 0; - retried = false; - ret = btrfs_commit_transaction(trans); - if (ret) - goto done; - goto again; - } - } + /* Clear all state bits beyond the shrunk device size */ + clear_extent_bits(&device->alloc_state, new_size, (u64)-1, + CHUNK_STATE_MASK); btrfs_device_set_disk_total_bytes(device, new_size); - if (list_empty(&device->resized_list)) - list_add_tail(&device->resized_list, - &fs_info->fs_devices->resized_devices); + if (list_empty(&device->post_commit_list)) + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); WARN_ON(diff > old_total); btrfs_set_super_total_bytes(super_copy, @@ -4609,96 +4898,119 @@ btrfs_set_fs_incompat(info, RAID56); } -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - u64 start, u64 type) +static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) { - struct btrfs_fs_info *info = trans->fs_info; - struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct btrfs_device *device; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct btrfs_device_info *devices_info = NULL; - u64 total_avail; - int num_stripes; /* total number of stripes to allocate */ - int data_stripes; /* number of stripes that count for - block group size */ - int sub_stripes; /* sub_stripes info for map */ - int dev_stripes; /* stripes per dev */ - int devs_max; /* max devs to use */ - int devs_min; /* min devs needed */ - int devs_increment; /* ndevs has to be a multiple of this */ - int ncopies; /* how many copies to data has */ - int ret; + if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) + return; + + btrfs_set_fs_incompat(info, RAID1C34); +} + +/* + * Structure used internally for __btrfs_alloc_chunk() function. + * Wraps needed parameters. + */ +struct alloc_chunk_ctl { + u64 start; + u64 type; + /* Total number of stripes to allocate */ + int num_stripes; + /* sub_stripes info for map */ + int sub_stripes; + /* Stripes per device */ + int dev_stripes; + /* Maximum number of devices to use */ + int devs_max; + /* Minimum number of devices to use */ + int devs_min; + /* ndevs has to be a multiple of this */ + int devs_increment; + /* Number of copies */ + int ncopies; + /* Number of stripes worth of bytes to store parity information */ + int nparity; u64 max_stripe_size; u64 max_chunk_size; + u64 dev_extent_min; u64 stripe_size; - u64 num_bytes; + u64 chunk_size; int ndevs; - int i; - int j; - int index; +}; - BUG_ON(!alloc_profile_is_valid(type, 0)); - - if (list_empty(&fs_devices->alloc_list)) { - if (btrfs_test_opt(info, ENOSPC_DEBUG)) - btrfs_debug(info, "%s: no writable device", __func__); - return -ENOSPC; - } - - index = btrfs_bg_flags_to_raid_index(type); - - sub_stripes = btrfs_raid_array[index].sub_stripes; - dev_stripes = btrfs_raid_array[index].dev_stripes; - devs_max = btrfs_raid_array[index].devs_max; - devs_min = btrfs_raid_array[index].devs_min; - devs_increment = btrfs_raid_array[index].devs_increment; - ncopies = btrfs_raid_array[index].ncopies; +static void init_alloc_chunk_ctl_policy_regular( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + u64 type = ctl->type; if (type & BTRFS_BLOCK_GROUP_DATA) { - max_stripe_size = SZ_1G; - max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); + ctl->max_stripe_size = SZ_1G; + ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - /* for larger filesystems, use larger metadata chunks */ + /* For larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) - max_stripe_size = SZ_1G; + ctl->max_stripe_size = SZ_1G; else - max_stripe_size = SZ_256M; - max_chunk_size = max_stripe_size; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); + ctl->max_stripe_size = SZ_256M; + ctl->max_chunk_size = ctl->max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = SZ_32M; - max_chunk_size = 2 * max_stripe_size; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; + ctl->max_stripe_size = SZ_32M; + ctl->max_chunk_size = 2 * ctl->max_stripe_size; + ctl->devs_max = min_t(int, ctl->devs_max, + BTRFS_MAX_DEVS_SYS_CHUNK); } else { - btrfs_err(info, "invalid chunk type 0x%llx requested", - type); - BUG_ON(1); + BUG(); } - /* we don't want a chunk larger than 10% of writeable space */ - max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), - max_chunk_size); + /* We don't want a chunk larger than 10% of writable space */ + ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + ctl->max_chunk_size); + ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; +} - devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), - GFP_NOFS); - if (!devices_info) - return -ENOMEM; +static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + int index = btrfs_bg_flags_to_raid_index(ctl->type); + + ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; + ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; + ctl->devs_max = btrfs_raid_array[index].devs_max; + if (!ctl->devs_max) + ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); + ctl->devs_min = btrfs_raid_array[index].devs_min; + ctl->devs_increment = btrfs_raid_array[index].devs_increment; + ctl->ncopies = btrfs_raid_array[index].ncopies; + ctl->nparity = btrfs_raid_array[index].nparity; + ctl->ndevs = 0; + + switch (fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); + break; + default: + BUG(); + } +} + +static int gather_device_info(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = fs_devices->fs_info; + struct btrfs_device *device; + u64 total_avail; + u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; + int ret; + int ndevs = 0; + u64 max_avail; + u64 dev_offset; /* * in the first pass through the devices list, we gather information * about the available holes on each device. */ - ndevs = 0; list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 max_avail; - u64 dev_offset; - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { WARN(1, KERN_ERR "BTRFS: read-only device in alloc_list\n"); @@ -4716,24 +5028,23 @@ total_avail = 0; /* If there is no space on this device, skip it. */ - if (total_avail == 0) + if (total_avail < ctl->dev_extent_min) continue; - ret = find_free_dev_extent(trans, device, - max_stripe_size * dev_stripes, - &dev_offset, &max_avail); + ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, + &max_avail); if (ret && ret != -ENOSPC) - goto error; + return ret; if (ret == 0) - max_avail = max_stripe_size * dev_stripes; + max_avail = dev_extent_want; - if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { + if (max_avail < ctl->dev_extent_min) { if (btrfs_test_opt(info, ENOSPC_DEBUG)) btrfs_debug(info, - "%s: devid %llu has no free space, have=%llu want=%u", + "%s: devid %llu has no free space, have=%llu want=%llu", __func__, device->devid, max_avail, - BTRFS_STRIPE_LEN * dev_stripes); + ctl->dev_extent_min); continue; } @@ -4748,6 +5059,7 @@ devices_info[ndevs].dev = device; ++ndevs; } + ctl->ndevs = ndevs; /* * now sort the devices by hole size / available space @@ -4755,20 +5067,14 @@ sort(devices_info, ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); - /* round down to number of usable stripes */ - ndevs = round_down(ndevs, devs_increment); + return 0; +} - if (ndevs < devs_min) { - ret = -ENOSPC; - if (btrfs_test_opt(info, ENOSPC_DEBUG)) { - btrfs_debug(info, - "%s: not enough devices with free space: have=%d minimum required=%d", - __func__, ndevs, devs_min); - } - goto error; - } - - ndevs = min(ndevs, devs_max); +static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + /* Number of stripes that count for block group size */ + int data_stripes; /* * The primary goal is to maximize the number of stripes, so use as @@ -4777,109 +5083,148 @@ * The DUP profile stores more than one stripe per device, the * max_avail is the total size so we have to adjust. */ - stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); - num_stripes = ndevs * dev_stripes; + ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, + ctl->dev_stripes); + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + + /* This will have to be fixed for RAID1 and RAID10 over more drives */ + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; /* - * this will have to be fixed for RAID1 and RAID10 over - * more drives + * Use the number of data stripes to figure out how big this chunk is + * really going to be in terms of logical address space, and compare + * that answer with the max chunk size. If it's higher, we try to + * reduce stripe_size. */ - data_stripes = num_stripes / ncopies; - - if (type & BTRFS_BLOCK_GROUP_RAID5) - data_stripes = num_stripes - 1; - - if (type & BTRFS_BLOCK_GROUP_RAID6) - data_stripes = num_stripes - 2; - - /* - * Use the number of data stripes to figure out how big this chunk - * is really going to be in terms of logical address space, - * and compare that answer with the max chunk size. If it's higher, - * we try to reduce stripe_size. - */ - if (stripe_size * data_stripes > max_chunk_size) { + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { /* * Reduce stripe_size, round it up to a 16MB boundary again and * then use it, unless it ends up being even bigger than the * previous value we had already. */ - stripe_size = min(round_up(div_u64(max_chunk_size, - data_stripes), SZ_16M), - stripe_size); + ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, + data_stripes), SZ_16M), + ctl->stripe_size); } - /* align to BTRFS_STRIPE_LEN */ - stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); + /* Align to BTRFS_STRIPE_LEN */ + ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); + ctl->chunk_size = ctl->stripe_size * data_stripes; - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) { - ret = -ENOMEM; - goto error; + return 0; +} + +static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = fs_devices->fs_info; + + /* + * Round down to number of usable stripes, devs_increment can be any + * number so we can't use round_down() that requires power of 2, while + * rounddown is safe. + */ + ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); + + if (ctl->ndevs < ctl->devs_min) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) { + btrfs_debug(info, + "%s: not enough devices with free space: have=%d minimum required=%d", + __func__, ctl->ndevs, ctl->devs_min); + } + return -ENOSPC; } - map->num_stripes = num_stripes; - for (i = 0; i < ndevs; ++i) { - for (j = 0; j < dev_stripes; ++j) { - int s = i * dev_stripes + j; + ctl->ndevs = min(ctl->ndevs, ctl->devs_max); + + switch (fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + return decide_stripe_size_regular(ctl, devices_info); + default: + BUG(); + } +} + +static int create_chunk(struct btrfs_trans_handle *trans, + struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + u64 start = ctl->start; + u64 type = ctl->type; + int ret; + int i; + int j; + + map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); + if (!map) + return -ENOMEM; + map->num_stripes = ctl->num_stripes; + + for (i = 0; i < ctl->ndevs; ++i) { + for (j = 0; j < ctl->dev_stripes; ++j) { + int s = i * ctl->dev_stripes + j; map->stripes[s].dev = devices_info[i].dev; map->stripes[s].physical = devices_info[i].dev_offset + - j * stripe_size; + j * ctl->stripe_size; } } map->stripe_len = BTRFS_STRIPE_LEN; map->io_align = BTRFS_STRIPE_LEN; map->io_width = BTRFS_STRIPE_LEN; map->type = type; - map->sub_stripes = sub_stripes; + map->sub_stripes = ctl->sub_stripes; - num_bytes = stripe_size * data_stripes; - - trace_btrfs_chunk_alloc(info, map, start, num_bytes); + trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); em = alloc_extent_map(); if (!em) { kfree(map); - ret = -ENOMEM; - goto error; + return -ENOMEM; } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->map_lookup = map; em->start = start; - em->len = num_bytes; + em->len = ctl->chunk_size; em->block_start = 0; em->block_len = em->len; - em->orig_block_len = stripe_size; + em->orig_block_len = ctl->stripe_size; - em_tree = &info->mapping_tree.map_tree; + em_tree = &info->mapping_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); if (ret) { write_unlock(&em_tree->lock); free_extent_map(em); - goto error; + return ret; } - - list_add_tail(&em->list, &trans->transaction->pending_chunks); - refcount_inc(&em->refs); write_unlock(&em_tree->lock); - ret = btrfs_make_block_group(trans, 0, type, start, num_bytes); + ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); if (ret) goto error_del_extent; for (i = 0; i < map->num_stripes; i++) { - num_bytes = map->stripes[i].dev->bytes_used + stripe_size; - btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); - map->stripes[i].dev->has_pending_chunks = true; + struct btrfs_device *dev = map->stripes[i].dev; + + btrfs_device_set_bytes_used(dev, + dev->bytes_used + ctl->stripe_size); + if (list_empty(&dev->post_commit_list)) + list_add_tail(&dev->post_commit_list, + &trans->transaction->dev_update_list); } - atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); + atomic64_sub(ctl->stripe_size * map->num_stripes, + &info->free_chunk_space); free_extent_map(em); check_raid56_incompat_flag(info, type); + check_raid1c34_incompat_flag(info, type); - kfree(devices_info); return 0; error_del_extent: @@ -4891,13 +5236,68 @@ free_extent_map(em); /* One for the tree reference */ free_extent_map(em); - /* One for the pending_chunks list reference */ - free_extent_map(em); -error: + + return ret; +} + +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) +{ + struct btrfs_fs_info *info = trans->fs_info; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct btrfs_device_info *devices_info = NULL; + struct alloc_chunk_ctl ctl; + int ret; + + lockdep_assert_held(&info->chunk_mutex); + + if (!alloc_profile_is_valid(type, 0)) { + ASSERT(0); + return -EINVAL; + } + + if (list_empty(&fs_devices->alloc_list)) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, "%s: no writable device", __func__); + return -ENOSPC; + } + + if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(info, "invalid chunk type 0x%llx requested", type); + ASSERT(0); + return -EINVAL; + } + + ctl.start = find_next_chunk(info); + ctl.type = type; + init_alloc_chunk_ctl(fs_devices, &ctl); + + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + ret = gather_device_info(fs_devices, &ctl, devices_info); + if (ret < 0) + goto out; + + ret = decide_stripe_size(fs_devices, &ctl, devices_info); + if (ret < 0) + goto out; + + ret = create_chunk(trans, &ctl, devices_info); + +out: kfree(devices_info); return ret; } +/* + * Chunk allocation falls into two parts. The first part does work + * that makes the new allocated chunk usable, but does not do any operation + * that modifies the chunk tree. The second part does the work that + * requires modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 chunk_size) { @@ -4916,7 +5316,7 @@ int i = 0; int ret = 0; - em = get_chunk_map(fs_info, chunk_offset, chunk_size); + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); if (IS_ERR(em)) return PTR_ERR(em); @@ -4996,57 +5396,27 @@ return ret; } -/* - * Chunk allocation falls into two parts. The first part does works - * that make the new allocated chunk useable, but not do any operation - * that modifies the chunk tree. The second part does the works that - * require modifying the chunk tree. This division is important for the - * bootstrap process of adding storage to a seed btrfs. - */ -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) { - u64 chunk_offset; - - lockdep_assert_held(&trans->fs_info->chunk_mutex); - chunk_offset = find_next_chunk(trans->fs_info); - return __btrfs_alloc_chunk(trans, chunk_offset, type); -} - -static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - u64 chunk_offset; - u64 sys_chunk_offset; + struct btrfs_fs_info *fs_info = trans->fs_info; u64 alloc_profile; int ret; - chunk_offset = find_next_chunk(fs_info); alloc_profile = btrfs_metadata_alloc_profile(fs_info); - ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); + ret = btrfs_alloc_chunk(trans, alloc_profile); if (ret) return ret; - sys_chunk_offset = find_next_chunk(fs_info); alloc_profile = btrfs_system_alloc_profile(fs_info); - ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); + ret = btrfs_alloc_chunk(trans, alloc_profile); return ret; } static inline int btrfs_chunk_max_errors(struct map_lookup *map) { - int max_errors; + const int index = btrfs_bg_flags_to_raid_index(map->type); - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5)) { - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { - max_errors = 2; - } else { - max_errors = 0; - } - - return max_errors; + return btrfs_raid_array[index].tolerated_failures; } int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) @@ -5057,7 +5427,7 @@ int miss_ndevs = 0; int i; - em = get_chunk_map(fs_info, chunk_offset, 1); + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) return 1; @@ -5087,21 +5457,16 @@ return readonly; } -void btrfs_mapping_init(struct btrfs_mapping_tree *tree) -{ - extent_map_tree_init(&tree->map_tree); -} - -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +void btrfs_mapping_tree_free(struct extent_map_tree *tree) { struct extent_map *em; while (1) { - write_lock(&tree->map_tree.lock); - em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, 0, (u64)-1); if (em) - remove_extent_mapping(&tree->map_tree, em); - write_unlock(&tree->map_tree.lock); + remove_extent_mapping(tree, em); + write_unlock(&tree->lock); if (!em) break; /* once for us */ @@ -5117,7 +5482,7 @@ struct map_lookup *map; int ret; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(em)) /* * We could return errors for these cases, but that could get @@ -5128,7 +5493,7 @@ return 1; map = em->map_lookup; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; @@ -5147,11 +5512,11 @@ ret = 1; free_extent_map(em); - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && fs_info->dev_replace.tgtdev) ret++; - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); return ret; } @@ -5163,7 +5528,7 @@ struct map_lookup *map; unsigned long len = fs_info->sectorsize; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup; @@ -5180,7 +5545,7 @@ struct map_lookup *map; int ret = 0; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if(!WARN_ON(IS_ERR(em))) { map = em->map_lookup; @@ -5202,7 +5567,7 @@ struct btrfs_device *srcdev; ASSERT((map->type & - (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); + (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; @@ -5240,31 +5605,19 @@ return preferred_mirror; } -static inline int parity_smaller(u64 a, u64 b) -{ - return a > b; -} - /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) { - struct btrfs_bio_stripe s; int i; - u64 l; int again = 1; while (again) { again = 0; for (i = 0; i < num_stripes - 1; i++) { - if (parity_smaller(bbio->raid_map[i], - bbio->raid_map[i+1])) { - s = bbio->stripes[i]; - l = bbio->raid_map[i]; - bbio->stripes[i] = bbio->stripes[i+1]; - bbio->raid_map[i] = bbio->raid_map[i+1]; - bbio->stripes[i+1] = s; - bbio->raid_map[i+1] = l; - + /* Swap if parity is on a smaller index */ + if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { + swap(bbio->stripes[i], bbio->stripes[i + 1]); + swap(bbio->raid_map[i], bbio->raid_map[i + 1]); again = 1; } } @@ -5290,6 +5643,9 @@ atomic_set(&bbio->error, 0); refcount_set(&bbio->refs, 1); + bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); + bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); + return bbio; } @@ -5313,12 +5669,13 @@ * replace. */ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, - u64 logical, u64 length, + u64 logical, u64 *length_ret, struct btrfs_bio **bbio_ret) { struct extent_map *em; struct map_lookup *map; struct btrfs_bio *bbio; + u64 length = *length_ret; u64 offset; u64 stripe_nr; u64 stripe_nr_end; @@ -5339,7 +5696,7 @@ /* discard always return a bbio */ ASSERT(bbio_ret); - em = get_chunk_map(fs_info, logical, length); + em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) return PTR_ERR(em); @@ -5351,7 +5708,8 @@ } offset = logical - em->start; - length = min_t(u64, em->len - offset, length); + length = min_t(u64, em->start + em->len - logical, length); + *length_ret = length; stripe_len = map->stripe_len; /* @@ -5391,7 +5749,7 @@ &remaining_stripes); div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); last_stripe *= sub_stripes; - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { num_stripes = map->num_stripes; } else { @@ -5635,6 +5993,106 @@ return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); } +/* + * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) + * tuple. This information is used to calculate how big a + * particular bio can get before it straddles a stripe. + * + * @fs_info - the filesystem + * @logical - address that we want to figure out the geometry of + * @len - the length of IO we are going to perform, starting at @logical + * @op - type of operation - write or read + * @io_geom - pointer used to return values + * + * Returns < 0 in case a chunk for the given logical address cannot be found, + * usually shouldn't happen unless @logical is corrupted, 0 otherwise. + */ +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 len, struct btrfs_io_geometry *io_geom) +{ + struct extent_map *em; + struct map_lookup *map; + u64 offset; + u64 stripe_offset; + u64 stripe_nr; + u64 stripe_len; + u64 raid56_full_stripe_start = (u64)-1; + int data_stripes; + int ret = 0; + + ASSERT(op != BTRFS_MAP_DISCARD); + + em = btrfs_get_chunk_map(fs_info, logical, len); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* Offset of this logical address in the chunk */ + offset = logical - em->start; + /* Len of a stripe in a chunk */ + stripe_len = map->stripe_len; + /* Stripe wher this block falls in */ + stripe_nr = div64_u64(offset, stripe_len); + /* Offset of stripe in the chunk */ + stripe_offset = stripe_nr * stripe_len; + if (offset < stripe_offset) { + btrfs_crit(fs_info, +"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", + stripe_offset, offset, em->start, logical, stripe_len); + ret = -EINVAL; + goto out; + } + + /* stripe_offset is the offset of this block in its stripe */ + stripe_offset = offset - stripe_offset; + data_stripes = nr_data_stripes(map); + + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + u64 max_len = stripe_len - stripe_offset; + + /* + * In case of raid56, we need to know the stripe aligned start + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = stripe_len * data_stripes; + raid56_full_stripe_start = offset; + + /* + * Allow a write of a full stripe, but make sure we + * don't allow straddling of stripes + */ + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, + full_stripe_len); + raid56_full_stripe_start *= full_stripe_len; + + /* + * For writes to RAID[56], allow a full stripeset across + * all disks. For other RAID types and for RAID[56] + * reads, just allow a single stripe (on a single disk). + */ + if (op == BTRFS_MAP_WRITE) { + max_len = stripe_len * data_stripes - + (offset - raid56_full_stripe_start); + } + } + len = min_t(u64, em->len - offset, max_len); + } else { + len = em->len - offset; + } + + io_geom->len = len; + io_geom->offset = offset; + io_geom->stripe_len = stripe_len; + io_geom->stripe_nr = stripe_nr; + io_geom->stripe_offset = stripe_offset; + io_geom->raid56_stripe_offset = raid56_full_stripe_start; + +out: + /* once for us */ + free_extent_map(em); + return ret; +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5643,11 +6101,11 @@ { struct extent_map *em; struct map_lookup *map; - u64 offset; u64 stripe_offset; u64 stripe_nr; u64 stripe_len; u32 stripe_index; + int data_stripes; int i; int ret = 0; int num_stripes; @@ -5660,81 +6118,34 @@ int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; + struct btrfs_io_geometry geom; - if (op == BTRFS_MAP_DISCARD) - return __btrfs_map_block_for_discard(fs_info, logical, - *length, bbio_ret); + ASSERT(bbio_ret); + ASSERT(op != BTRFS_MAP_DISCARD); - em = get_chunk_map(fs_info, logical, *length); - if (IS_ERR(em)) - return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); + if (ret < 0) + return ret; + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(!IS_ERR(em)); map = em->map_lookup; - offset = logical - em->start; - stripe_len = map->stripe_len; - stripe_nr = offset; - /* - * stripe_nr counts the total number of stripes we have to stride - * to get to this block - */ - stripe_nr = div64_u64(stripe_nr, stripe_len); + *length = geom.len; + stripe_len = geom.stripe_len; + stripe_nr = geom.stripe_nr; + stripe_offset = geom.stripe_offset; + raid56_full_stripe_start = geom.raid56_stripe_offset; + data_stripes = nr_data_stripes(map); - stripe_offset = stripe_nr * stripe_len; - if (offset < stripe_offset) { - btrfs_crit(fs_info, - "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", - stripe_offset, offset, em->start, logical, - stripe_len); - free_extent_map(em); - return -EINVAL; - } - - /* stripe_offset is the offset of this block in its stripe*/ - stripe_offset = offset - stripe_offset; - - /* if we're here for raid56, we need to know the stripe aligned start */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); - raid56_full_stripe_start = offset; - - /* allow a write of a full stripe, but make sure we don't - * allow straddling of stripes - */ - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, - full_stripe_len); - raid56_full_stripe_start *= full_stripe_len; - } - - if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - u64 max_len; - /* For writes to RAID[56], allow a full stripeset across all disks. - For other RAID types and for RAID[56] reads, just allow a single - stripe (on a single disk). */ - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - (op == BTRFS_MAP_WRITE)) { - max_len = stripe_len * nr_data_stripes(map) - - (offset - raid56_full_stripe_start); - } else { - /* we limit the length of each bio to what fits in a stripe */ - max_len = stripe_len - stripe_offset; - } - *length = min_t(u64, em->len - offset, max_len); - } else { - *length = em->len - offset; - } - - /* This is for when we're called from btrfs_merge_bio_hook() and all - it cares about is the length */ - if (!bbio_ret) - goto out; - - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + /* + * Hold the semaphore for read during the whole operation, write is + * requested at commit time but must wait. + */ if (!dev_replace_is_ongoing) - btrfs_dev_replace_read_unlock(dev_replace); - else - btrfs_dev_replace_set_lock_blocking(dev_replace); + up_read(&dev_replace->rwsem); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && !need_full_stripe(op) && dev_replace->tgtdev != NULL) { @@ -5757,7 +6168,7 @@ &stripe_index); if (!need_full_stripe(op)) mirror_num = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { if (need_full_stripe(op)) num_stripes = map->num_stripes; else if (mirror_num) @@ -5799,7 +6210,7 @@ if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, - stripe_len * nr_data_stripes(map)); + stripe_len * data_stripes); /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -5815,10 +6226,9 @@ * Mirror #3 is RAID6 Q block. */ stripe_nr = div_u64_rem(stripe_nr, - nr_data_stripes(map), &stripe_index); + data_stripes, &stripe_index); if (mirror_num > 1) - stripe_index = nr_data_stripes(map) + - mirror_num - 2; + stripe_index = data_stripes + mirror_num - 2; /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, @@ -5858,8 +6268,13 @@ ret = -ENOMEM; goto out; } - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) - bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); + + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; + stripe_index++; + } /* build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && @@ -5867,17 +6282,12 @@ u64 tmp; unsigned rot; - bbio->raid_map = (u64 *)((void *)bbio->stripes + - sizeof(struct btrfs_bio_stripe) * - num_alloc_stripes + - sizeof(int) * tgtdev_indexes); - /* Work out the disk rotation on this stripe-set */ div_u64_rem(stripe_nr, num_stripes, &rot); /* Fill in the logical address of each stripe */ - tmp = stripe_nr * nr_data_stripes(map); - for (i = 0; i < nr_data_stripes(map); i++) + tmp = stripe_nr * data_stripes; + for (i = 0; i < data_stripes; i++) bbio->raid_map[(i+rot) % num_stripes] = em->start + (tmp + i) * map->stripe_len; @@ -5885,24 +6295,12 @@ if (map->type & BTRFS_BLOCK_GROUP_RAID6) bbio->raid_map[(i+rot+1) % num_stripes] = RAID6_Q_STRIPE; - } - - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + - stripe_nr * map->stripe_len; - bbio->stripes[i].dev = - map->stripes[stripe_index].dev; - stripe_index++; + sort_parity_stripes(bbio, num_stripes); } if (need_full_stripe(op)) max_errors = btrfs_chunk_max_errors(map); - - if (bbio->raid_map) - sort_parity_stripes(bbio, num_stripes); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { @@ -5929,8 +6327,9 @@ } out: if (dev_replace_is_ongoing) { - btrfs_dev_replace_clear_lock_blocking(dev_replace); - btrfs_dev_replace_read_unlock(dev_replace); + lockdep_assert_held(&dev_replace->rwsem); + /* Unlock and let waiting writers proceed */ + up_read(&dev_replace->rwsem); } free_extent_map(em); return ret; @@ -5940,6 +6339,10 @@ u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { + if (op == BTRFS_MAP_DISCARD) + return __btrfs_map_block_for_discard(fs_info, logical, + length, bbio_ret); + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, mirror_num, 0); } @@ -5950,75 +6353,6 @@ struct btrfs_bio **bbio_ret) { return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); -} - -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len) -{ - struct extent_map *em; - struct map_lookup *map; - u64 *buf; - u64 bytenr; - u64 length; - u64 stripe_nr; - u64 rmap_len; - int i, j, nr = 0; - - em = get_chunk_map(fs_info, chunk_start, 1); - if (IS_ERR(em)) - return -EIO; - - map = em->map_lookup; - length = em->len; - rmap_len = map->stripe_len; - - if (map->type & BTRFS_BLOCK_GROUP_RAID10) - length = div_u64(length, map->num_stripes / map->sub_stripes); - else if (map->type & BTRFS_BLOCK_GROUP_RAID0) - length = div_u64(length, map->num_stripes); - else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - length = div_u64(length, nr_data_stripes(map)); - rmap_len = map->stripe_len * nr_data_stripes(map); - } - - buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); - BUG_ON(!buf); /* -ENOMEM */ - - for (i = 0; i < map->num_stripes; i++) { - if (map->stripes[i].physical > physical || - map->stripes[i].physical + length <= physical) - continue; - - stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64(stripe_nr, map->stripe_len); - - if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - stripe_nr = stripe_nr * map->num_stripes + i; - stripe_nr = div_u64(stripe_nr, map->sub_stripes); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_nr = stripe_nr * map->num_stripes + i; - } /* else if RAID[56], multiply by nr_data_stripes(). - * Alternatively, just use rmap_len below instead of - * map->stripe_len */ - - bytenr = chunk_start + stripe_nr * rmap_len; - WARN_ON(nr >= map->num_stripes); - for (j = 0; j < nr; j++) { - if (buf[j] == bytenr) - break; - } - if (j == nr) { - WARN_ON(nr >= map->num_stripes); - buf[nr++] = bytenr; - } - } - - *logical = buf; - *naddrs = nr; - *stripe_len = rmap_len; - - free_extent_map(em); - return 0; } static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) @@ -6039,23 +6373,18 @@ atomic_inc(&bbio->error); if (bio->bi_status == BLK_STS_IOERR || bio->bi_status == BLK_STS_TARGET) { - unsigned int stripe_index = - btrfs_io_bio(bio)->stripe_index; - struct btrfs_device *dev; + struct btrfs_device *dev = btrfs_io_bio(bio)->device; - BUG_ON(stripe_index >= bbio->num_stripes); - dev = bbio->stripes[stripe_index].dev; - if (dev->bdev) { - if (bio_op(bio) == REQ_OP_WRITE) - btrfs_dev_stat_inc_and_print(dev, + ASSERT(dev->bdev); + if (bio_op(bio) == REQ_OP_WRITE) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - else if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, + else if (!(bio->bi_opf & REQ_RAHEAD)) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, + if (bio->bi_opf & REQ_PREFLUSH) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); - } } } @@ -6090,73 +6419,25 @@ } } -/* - * see run_scheduled_bios for a description of why bios are collected for - * async submit. - * - * This will add one bio to the pending list for a device and make sure - * the work struct is scheduled. - */ -static noinline void btrfs_schedule_bio(struct btrfs_device *device, - struct bio *bio) -{ - struct btrfs_fs_info *fs_info = device->fs_info; - int should_queue = 1; - struct btrfs_pending_bios *pending_bios; - - /* don't bother with additional async steps for reads, right now */ - if (bio_op(bio) == REQ_OP_READ) { - btrfsic_submit_bio(bio); - return; - } - - WARN_ON(bio->bi_next); - bio->bi_next = NULL; - - spin_lock(&device->io_lock); - if (op_is_sync(bio->bi_opf)) - pending_bios = &device->pending_sync_bios; - else - pending_bios = &device->pending_bios; - - if (pending_bios->tail) - pending_bios->tail->bi_next = bio; - - pending_bios->tail = bio; - if (!pending_bios->head) - pending_bios->head = bio; - if (device->running_pending) - should_queue = 0; - - spin_unlock(&device->io_lock); - - if (should_queue) - btrfs_queue_work(fs_info->submit_workers, &device->work); -} - static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, - u64 physical, int dev_nr, int async) + u64 physical, struct btrfs_device *dev) { - struct btrfs_device *dev = bbio->stripes[dev_nr].dev; struct btrfs_fs_info *fs_info = bbio->fs_info; bio->bi_private = bbio; - btrfs_io_bio(bio)->stripe_index = dev_nr; + btrfs_io_bio(bio)->device = dev; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; btrfs_debug_in_rcu(fs_info, "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, - (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, - bio->bi_iter.bi_size); + (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), + dev->devid, bio->bi_iter.bi_size); bio_set_dev(bio, dev->bdev); btrfs_bio_counter_inc_noblocked(fs_info); - if (async) - btrfs_schedule_bio(dev, bio); - else - btrfsic_submit_bio(bio); + btrfsic_submit_bio(bio); } static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) @@ -6177,7 +6458,7 @@ } blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, int async_submit) + int mirror_num) { struct btrfs_device *dev; struct bio *first_bio = bio; @@ -6245,8 +6526,7 @@ else bio = first_bio; - submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, - dev_nr, async_submit); + submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); } btrfs_bio_counter_dec(fs_info); return BLK_STS_OK; @@ -6262,15 +6542,25 @@ * If @seed is true, traverse through the seed devices. */ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid, - bool seed) + u64 devid, u8 *uuid, u8 *fsid, + bool seed) { struct btrfs_device *device; + struct btrfs_fs_devices *seed_devs; - while (fs_devices) { + if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->devid == devid && + (!uuid || memcmp(device->uuid, uuid, + BTRFS_UUID_SIZE) == 0)) + return device; + } + } + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { if (!fsid || - !memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) { - list_for_each_entry(device, &fs_devices->devices, + !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { if (device->devid == devid && (!uuid || memcmp(device->uuid, uuid, @@ -6278,11 +6568,8 @@ return device; } } - if (seed) - fs_devices = fs_devices->seed; - else - return NULL; } + return NULL; } @@ -6337,7 +6624,7 @@ if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); - dev = __alloc_device(); + dev = __alloc_device(fs_info); if (IS_ERR(dev)) return dev; @@ -6359,9 +6646,6 @@ else generate_random_uuid(dev->uuid); - btrfs_init_work(&dev->work, btrfs_submit_helper, - pending_bios_fn, NULL, NULL); - return dev; } @@ -6376,11 +6660,26 @@ devid, uuid); } -static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, - struct extent_buffer *leaf, +static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) +{ + int index = btrfs_bg_flags_to_raid_index(type); + int ncopies = btrfs_raid_array[index].ncopies; + const int nparity = btrfs_raid_array[index].nparity; + int data_stripes; + + if (nparity) + data_stripes = num_stripes - nparity; + else + data_stripes = num_stripes / ncopies; + + return div_u64(chunk_len, data_stripes); +} + +static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; u64 logical; @@ -6400,14 +6699,14 @@ * as chunk item in tree block is already verified by tree-checker. */ if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { - ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); + ret = btrfs_check_chunk_valid(leaf, chunk, logical); if (ret) return ret; } - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); - read_unlock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, logical, 1); + read_unlock(&map_tree->lock); /* already mapped? */ if (em && em->start <= logical && em->start + em->len > logical) { @@ -6441,6 +6740,8 @@ map->type = btrfs_chunk_type(leaf, chunk); map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); map->verified_stripes = 0; + em->orig_block_len = calc_stripe_length(map->type, em->len, + map->num_stripes); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -6449,7 +6750,7 @@ btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, - devid, uuid, NULL, true); + devid, uuid, NULL, true); if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); @@ -6474,9 +6775,9 @@ } - write_lock(&map_tree->map_tree.lock); - ret = add_extent_mapping(&map_tree->map_tree, em, 0); - write_unlock(&map_tree->map_tree.lock); + write_lock(&map_tree->lock); + ret = add_extent_mapping(map_tree, em, 0); + write_unlock(&map_tree->lock); if (ret < 0) { btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", @@ -6519,28 +6820,30 @@ lockdep_assert_held(&uuid_mutex); ASSERT(fsid); - fs_devices = fs_info->fs_devices->seed; - while (fs_devices) { + /* This will match only for multi-device seed fs */ + list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; - fs_devices = fs_devices->seed; - } - fs_devices = find_fsid(fsid); + fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { if (!btrfs_test_opt(fs_info, DEGRADED)) return ERR_PTR(-ENOENT); - fs_devices = alloc_fs_devices(fsid); + fs_devices = alloc_fs_devices(fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; - fs_devices->seeding = 1; + fs_devices->seeding = true; fs_devices->opened = 1; return fs_devices; } + /* + * Upon first call for a seed fs fsid, just create a private copy of the + * respective fs_devices and anchor it at fs_info->fs_devices->seed_list + */ fs_devices = clone_fs_devices(fs_devices); if (IS_ERR(fs_devices)) return fs_devices; @@ -6548,27 +6851,24 @@ ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); - fs_devices = ERR_PTR(ret); - goto out; + return ERR_PTR(ret); } if (!fs_devices->seeding) { close_fs_devices(fs_devices); free_fs_devices(fs_devices); - fs_devices = ERR_PTR(-EINVAL); - goto out; + return ERR_PTR(-EINVAL); } - fs_devices->seed = fs_info->fs_devices->seed; - fs_info->fs_devices->seed = fs_devices; -out: + list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); + return fs_devices; } -static int read_one_dev(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, +static int read_one_dev(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; u64 devid; @@ -6582,7 +6882,7 @@ read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); - if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { + if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); if (IS_ERR(fs_devices)) return PTR_ERR(fs_devices); @@ -6725,48 +7025,49 @@ sb_array_offset += len; cur_offset += len; - if (key.type == BTRFS_CHUNK_ITEM_KEY) { - chunk = (struct btrfs_chunk *)sb_array_offset; - /* - * At least one btrfs_chunk with one stripe must be - * present, exact stripe count check comes afterwards - */ - len = btrfs_chunk_item_size(1); - if (cur_offset + len > array_size) - goto out_short_read; - - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - if (!num_stripes) { - btrfs_err(fs_info, - "invalid number of stripes %u in sys_array at offset %u", - num_stripes, cur_offset); - ret = -EIO; - break; - } - - type = btrfs_chunk_type(sb, chunk); - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { - btrfs_err(fs_info, - "invalid chunk type %llu in sys_array at offset %u", - type, cur_offset); - ret = -EIO; - break; - } - - len = btrfs_chunk_item_size(num_stripes); - if (cur_offset + len > array_size) - goto out_short_read; - - ret = read_one_chunk(fs_info, &key, sb, chunk); - if (ret) - break; - } else { + if (key.type != BTRFS_CHUNK_ITEM_KEY) { btrfs_err(fs_info, "unexpected item type %u in sys_array at offset %u", (u32)key.type, cur_offset); ret = -EIO; break; } + + chunk = (struct btrfs_chunk *)sb_array_offset; + /* + * At least one btrfs_chunk with one stripe must be present, + * exact stripe count check comes afterwards + */ + len = btrfs_chunk_item_size(1); + if (cur_offset + len > array_size) + goto out_short_read; + + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + if (!num_stripes) { + btrfs_err(fs_info, + "invalid number of stripes %u in sys_array at offset %u", + num_stripes, cur_offset); + ret = -EIO; + break; + } + + type = btrfs_chunk_type(sb, chunk); + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { + btrfs_err(fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur_offset); + ret = -EIO; + break; + } + + len = btrfs_chunk_item_size(num_stripes); + if (cur_offset + len > array_size) + goto out_short_read; + + ret = read_one_chunk(&key, sb, chunk); + if (ret) + break; + array_ptr += len; sb_array_offset += len; cur_offset += len; @@ -6794,14 +7095,14 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; u64 next_start = 0; bool ret = true; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); - read_unlock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, 0, (u64)-1); + read_unlock(&map_tree->lock); /* No chunk at all? Return false anyway */ if (!em) { ret = false; @@ -6830,7 +7131,7 @@ if (missing > max_tolerated) { if (!failing_dev) btrfs_warn(fs_info, - "chunk %llu missing %d devices, max tolerance is %d for writeable mount", + "chunk %llu missing %d devices, max tolerance is %d for writable mount", em->start, missing, max_tolerated); free_extent_map(em); ret = false; @@ -6839,13 +7140,26 @@ next_start = extent_map_end(em); free_extent_map(em); - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, next_start, + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, next_start, (u64)(-1) - next_start); - read_unlock(&map_tree->map_tree.lock); + read_unlock(&map_tree->lock); } out: return ret; +} + +static void readahead_tree_node_children(struct extent_buffer *node) +{ + int i; + const int nr_items = btrfs_header_nritems(node); + + for (i = 0; i < nr_items; i++) { + u64 start; + + start = btrfs_node_blockptr(node, i); + readahead_tree_block(node->fs_info, start); + } } int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) @@ -6858,6 +7172,7 @@ int ret; int slot; u64 total_dev = 0; + u64 last_ra_node = 0; path = btrfs_alloc_path(); if (!path) @@ -6868,7 +7183,6 @@ * otherwise we don't need it. */ mutex_lock(&uuid_mutex); - mutex_lock(&fs_info->chunk_mutex); /* * It is possible for mount and umount to race in such a way that @@ -6891,6 +7205,8 @@ if (ret < 0) goto error; while (1) { + struct extent_buffer *node; + leaf = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(leaf)) { @@ -6901,19 +7217,32 @@ goto error; break; } + /* + * The nodes on level 1 are not locked but we don't need to do + * that during mount time as nothing else can access the tree + */ + node = path->nodes[1]; + if (node) { + if (last_ra_node != node->start) { + readahead_tree_node_children(node); + last_ra_node = node->start; + } + } btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.type == BTRFS_DEV_ITEM_KEY) { struct btrfs_dev_item *dev_item; dev_item = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); - ret = read_one_dev(fs_info, leaf, dev_item); + ret = read_one_dev(leaf, dev_item); if (ret) goto error; total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); - ret = read_one_chunk(fs_info, &found_key, leaf, chunk); + mutex_lock(&fs_info->chunk_mutex); + ret = read_one_chunk(&found_key, leaf, chunk); + mutex_unlock(&fs_info->chunk_mutex); if (ret) goto error; } @@ -6925,12 +7254,12 @@ * do another round of validation checks. */ if (total_dev != fs_info->fs_devices->total_devices) { - btrfs_err(fs_info, - "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_warn(fs_info, +"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", btrfs_super_num_devices(fs_info->super_copy), total_dev); - ret = -EINVAL; - goto error; + fs_info->fs_devices->total_devices = total_dev; + btrfs_set_super_num_devices(fs_info->super_copy, total_dev); } if (btrfs_super_total_bytes(fs_info->super_copy) < fs_info->fs_devices->total_rw_bytes) { @@ -6943,7 +7272,6 @@ } ret = 0; error: - mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&uuid_mutex); btrfs_free_path(path); @@ -6952,86 +7280,117 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; - while (fs_devices) { - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) - device->fs_info = fs_info; - mutex_unlock(&fs_devices->device_list_mutex); + fs_devices->fs_info = fs_info; - fs_devices = fs_devices->seed; + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->fs_info = fs_info; + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) + device->fs_info = fs_info; + + seed_devs->fs_info = fs_info; } + mutex_unlock(&fs_devices->device_list_mutex); } -static void __btrfs_reset_dev_stats(struct btrfs_device *dev) +static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, + const struct btrfs_dev_stats_item *ptr, + int index) { - int i; + u64 val; - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) - btrfs_dev_stat_reset(dev, i); + read_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); + return val; +} + +static void btrfs_set_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index, u64 val) +{ + write_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); +} + +static int btrfs_device_init_dev_stats(struct btrfs_device *device, + struct btrfs_path *path) +{ + struct btrfs_dev_stats_item *ptr; + struct extent_buffer *eb; + struct btrfs_key key; + int item_size; + int i, ret, slot; + + key.objectid = BTRFS_DEV_STATS_OBJECTID; + key.type = BTRFS_PERSISTENT_ITEM_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); + if (ret) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_set(device, i, 0); + device->dev_stats_valid = 1; + btrfs_release_path(path); + return ret < 0 ? ret : 0; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_set(device, i, 0); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); + + return 0; } int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) { - struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct extent_buffer *eb; - int slot; - int ret = 0; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; struct btrfs_path *path = NULL; - int i; + int ret = 0; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - int item_size; - struct btrfs_dev_stats_item *ptr; - - key.objectid = BTRFS_DEV_STATS_OBJECTID; - key.type = BTRFS_PERSISTENT_ITEM_KEY; - key.offset = device->devid; - ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); - if (ret) { - __btrfs_reset_dev_stats(device); - device->dev_stats_valid = 1; - btrfs_release_path(path); - continue; - } - slot = path->slots[0]; - eb = path->nodes[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); - item_size = btrfs_item_size_nr(eb, slot); - - ptr = btrfs_item_ptr(eb, slot, - struct btrfs_dev_stats_item); - - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { - if (item_size >= (1 + i) * sizeof(__le64)) - btrfs_dev_stat_set(device, i, - btrfs_dev_stats_value(eb, ptr, i)); - else - btrfs_dev_stat_reset(device, i); - } - - device->dev_stats_valid = 1; - btrfs_dev_stat_print_on_load(device); - btrfs_release_path(path); + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; } + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; + } + } +out: mutex_unlock(&fs_devices->device_list_mutex); -out: btrfs_free_path(path); - return ret < 0 ? ret : 0; + return ret; } static int update_dev_stat_item(struct btrfs_trans_handle *trans, @@ -7102,9 +7461,9 @@ /* * called from commit_transaction. Writes all changed device stats to disk. */ -int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; int stats_cnt; @@ -7187,8 +7546,8 @@ int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, stats->devid, - NULL, NULL, true); + dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL, + true); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -7203,7 +7562,7 @@ stats->values[i] = btrfs_dev_stat_read_and_reset(dev, i); else - btrfs_dev_stat_reset(dev, i); + btrfs_dev_stat_set(dev, i, 0); } btrfs_info(fs_info, "device stats zeroed by %s (%d)", current->comm, task_pid_nr(current)); @@ -7217,101 +7576,35 @@ return 0; } -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) -{ - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - int copy_num; - - if (!bdev) - return; - - for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; - copy_num++) { - - if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) - continue; - - disk_super = (struct btrfs_super_block *)bh->b_data; - - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); - } - - /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); - - /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); -} - /* - * Update the size of all devices, which is used for writing out the - * super blocks. + * Update the size and bytes used for each device where it changed. This is + * delayed since we would otherwise get errors while writing out the + * superblocks. + * + * Must be invoked during transaction commit. */ -void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) +void btrfs_commit_device_sizes(struct btrfs_transaction *trans) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *curr, *next; - if (list_empty(&fs_devices->resized_devices)) + ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); + + if (list_empty(&trans->dev_update_list)) return; - mutex_lock(&fs_devices->device_list_mutex); - mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, - resized_list) { - list_del_init(&curr->resized_list); + /* + * We don't need the device_list_mutex here. This list is owned by the + * transaction and the transaction must complete before the device is + * released. + */ + mutex_lock(&trans->fs_info->chunk_mutex); + list_for_each_entry_safe(curr, next, &trans->dev_update_list, + post_commit_list) { + list_del_init(&curr->post_commit_list); curr->commit_total_bytes = curr->disk_total_bytes; + curr->commit_bytes_used = curr->bytes_used; } - mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_devices->device_list_mutex); -} - -/* Must be invoked during the transaction commit */ -void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct extent_map *em; - struct map_lookup *map; - struct btrfs_device *dev; - int i; - - if (list_empty(&trans->pending_chunks)) - return; - - /* In order to kick the device replace finish process */ - mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry(em, &trans->pending_chunks, list) { - map = em->map_lookup; - - for (i = 0; i < map->num_stripes; i++) { - dev = map->stripes[i].dev; - dev->commit_bytes_used = dev->bytes_used; - dev->has_pending_chunks = false; - } - } - mutex_unlock(&fs_info->chunk_mutex); -} - -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = fs_info; - fs_devices = fs_devices->seed; - } -} - -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = NULL; - fs_devices = fs_devices->seed; - } + mutex_unlock(&trans->fs_info->chunk_mutex); } /* @@ -7319,38 +7612,18 @@ */ int btrfs_bg_type_to_factor(u64 flags) { - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - return 2; - return 1; + const int index = btrfs_bg_flags_to_raid_index(flags); + + return btrfs_raid_array[index].ncopies; } -static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) -{ - int index = btrfs_bg_flags_to_raid_index(type); - int ncopies = btrfs_raid_array[index].ncopies; - int data_stripes; - - switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - case BTRFS_BLOCK_GROUP_RAID5: - data_stripes = num_stripes - 1; - break; - case BTRFS_BLOCK_GROUP_RAID6: - data_stripes = num_stripes - 2; - break; - default: - data_stripes = num_stripes / ncopies; - break; - } - return div_u64(chunk_len, data_stripes); -} static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; struct btrfs_device *dev; @@ -7414,8 +7687,11 @@ /* It's possible this device is a dummy for seed device */ if (dev->disk_total_bytes == 0) { - dev = btrfs_find_device(fs_info->fs_devices->seed, devid, - NULL, NULL, false); + struct btrfs_fs_devices *devs; + + devs = list_first_entry(&fs_info->fs_devices->seed_list, + struct btrfs_fs_devices, seed_list); + dev = btrfs_find_device(devs, devid, NULL, NULL, false); if (!dev) { btrfs_err(fs_info, "failed to find seed devid %llu", devid); @@ -7439,13 +7715,13 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct rb_node *node; int ret = 0; read_lock(&em_tree->lock); - for (node = rb_first(&em_tree->map); node; node = rb_next(node)) { + for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { em = rb_entry(node, struct extent_map, rb_node); if (em->map_lookup->num_stripes != em->map_lookup->verified_stripes) { @@ -7551,3 +7827,27 @@ btrfs_free_path(path); return ret; } + +/* + * Check whether the given block group or device is pinned by any inode being + * used as a swapfile. + */ +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) +{ + struct btrfs_swapfile_pin *sp; + struct rb_node *node; + + spin_lock(&fs_info->swapfile_pins_lock); + node = fs_info->swapfile_pins.rb_node; + while (node) { + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (ptr < sp->ptr) + node = node->rb_left; + else if (ptr > sp->ptr) + node = node->rb_right; + else + break; + } + spin_unlock(&fs_info->swapfile_pins_lock); + return node != NULL; +} -- Gitblit v1.6.2