| .. | .. |
|---|
| 7 | 7 | #include <linux/sched/mm.h> |
|---|
| 8 | 8 | #include <linux/bio.h> |
|---|
| 9 | 9 | #include <linux/slab.h> |
|---|
| 10 | | -#include <linux/buffer_head.h> |
|---|
| 11 | 10 | #include <linux/blkdev.h> |
|---|
| 12 | 11 | #include <linux/ratelimit.h> |
|---|
| 13 | 12 | #include <linux/kthread.h> |
|---|
| .. | .. |
|---|
| 15 | 14 | #include <linux/semaphore.h> |
|---|
| 16 | 15 | #include <linux/uuid.h> |
|---|
| 17 | 16 | #include <linux/list_sort.h> |
|---|
| 17 | +#include <linux/namei.h> |
|---|
| 18 | +#include "misc.h" |
|---|
| 18 | 19 | #include "ctree.h" |
|---|
| 19 | 20 | #include "extent_map.h" |
|---|
| 20 | 21 | #include "disk-io.h" |
|---|
| .. | .. |
|---|
| 25 | 26 | #include "async-thread.h" |
|---|
| 26 | 27 | #include "check-integrity.h" |
|---|
| 27 | 28 | #include "rcu-string.h" |
|---|
| 28 | | -#include "math.h" |
|---|
| 29 | 29 | #include "dev-replace.h" |
|---|
| 30 | 30 | #include "sysfs.h" |
|---|
| 31 | 31 | #include "tree-checker.h" |
|---|
| 32 | +#include "space-info.h" |
|---|
| 33 | +#include "block-group.h" |
|---|
| 34 | +#include "discard.h" |
|---|
| 32 | 35 | |
|---|
| 33 | 36 | const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { |
|---|
| 34 | 37 | [BTRFS_RAID_RAID10] = { |
|---|
| .. | .. |
|---|
| 39 | 42 | .tolerated_failures = 1, |
|---|
| 40 | 43 | .devs_increment = 2, |
|---|
| 41 | 44 | .ncopies = 2, |
|---|
| 45 | + .nparity = 0, |
|---|
| 42 | 46 | .raid_name = "raid10", |
|---|
| 43 | 47 | .bg_flag = BTRFS_BLOCK_GROUP_RAID10, |
|---|
| 44 | 48 | .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, |
|---|
| .. | .. |
|---|
| 51 | 55 | .tolerated_failures = 1, |
|---|
| 52 | 56 | .devs_increment = 2, |
|---|
| 53 | 57 | .ncopies = 2, |
|---|
| 58 | + .nparity = 0, |
|---|
| 54 | 59 | .raid_name = "raid1", |
|---|
| 55 | 60 | .bg_flag = BTRFS_BLOCK_GROUP_RAID1, |
|---|
| 56 | 61 | .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, |
|---|
| 62 | + }, |
|---|
| 63 | + [BTRFS_RAID_RAID1C3] = { |
|---|
| 64 | + .sub_stripes = 1, |
|---|
| 65 | + .dev_stripes = 1, |
|---|
| 66 | + .devs_max = 3, |
|---|
| 67 | + .devs_min = 3, |
|---|
| 68 | + .tolerated_failures = 2, |
|---|
| 69 | + .devs_increment = 3, |
|---|
| 70 | + .ncopies = 3, |
|---|
| 71 | + .nparity = 0, |
|---|
| 72 | + .raid_name = "raid1c3", |
|---|
| 73 | + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, |
|---|
| 74 | + .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, |
|---|
| 75 | + }, |
|---|
| 76 | + [BTRFS_RAID_RAID1C4] = { |
|---|
| 77 | + .sub_stripes = 1, |
|---|
| 78 | + .dev_stripes = 1, |
|---|
| 79 | + .devs_max = 4, |
|---|
| 80 | + .devs_min = 4, |
|---|
| 81 | + .tolerated_failures = 3, |
|---|
| 82 | + .devs_increment = 4, |
|---|
| 83 | + .ncopies = 4, |
|---|
| 84 | + .nparity = 0, |
|---|
| 85 | + .raid_name = "raid1c4", |
|---|
| 86 | + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, |
|---|
| 87 | + .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, |
|---|
| 57 | 88 | }, |
|---|
| 58 | 89 | [BTRFS_RAID_DUP] = { |
|---|
| 59 | 90 | .sub_stripes = 1, |
|---|
| .. | .. |
|---|
| 63 | 94 | .tolerated_failures = 0, |
|---|
| 64 | 95 | .devs_increment = 1, |
|---|
| 65 | 96 | .ncopies = 2, |
|---|
| 97 | + .nparity = 0, |
|---|
| 66 | 98 | .raid_name = "dup", |
|---|
| 67 | 99 | .bg_flag = BTRFS_BLOCK_GROUP_DUP, |
|---|
| 68 | 100 | .mindev_error = 0, |
|---|
| .. | .. |
|---|
| 75 | 107 | .tolerated_failures = 0, |
|---|
| 76 | 108 | .devs_increment = 1, |
|---|
| 77 | 109 | .ncopies = 1, |
|---|
| 110 | + .nparity = 0, |
|---|
| 78 | 111 | .raid_name = "raid0", |
|---|
| 79 | 112 | .bg_flag = BTRFS_BLOCK_GROUP_RAID0, |
|---|
| 80 | 113 | .mindev_error = 0, |
|---|
| .. | .. |
|---|
| 87 | 120 | .tolerated_failures = 0, |
|---|
| 88 | 121 | .devs_increment = 1, |
|---|
| 89 | 122 | .ncopies = 1, |
|---|
| 123 | + .nparity = 0, |
|---|
| 90 | 124 | .raid_name = "single", |
|---|
| 91 | 125 | .bg_flag = 0, |
|---|
| 92 | 126 | .mindev_error = 0, |
|---|
| .. | .. |
|---|
| 99 | 133 | .tolerated_failures = 1, |
|---|
| 100 | 134 | .devs_increment = 1, |
|---|
| 101 | 135 | .ncopies = 1, |
|---|
| 136 | + .nparity = 1, |
|---|
| 102 | 137 | .raid_name = "raid5", |
|---|
| 103 | 138 | .bg_flag = BTRFS_BLOCK_GROUP_RAID5, |
|---|
| 104 | 139 | .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, |
|---|
| .. | .. |
|---|
| 111 | 146 | .tolerated_failures = 2, |
|---|
| 112 | 147 | .devs_increment = 1, |
|---|
| 113 | 148 | .ncopies = 1, |
|---|
| 149 | + .nparity = 2, |
|---|
| 114 | 150 | .raid_name = "raid6", |
|---|
| 115 | 151 | .bg_flag = BTRFS_BLOCK_GROUP_RAID6, |
|---|
| 116 | 152 | .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, |
|---|
| 117 | 153 | }, |
|---|
| 118 | 154 | }; |
|---|
| 119 | 155 | |
|---|
| 120 | | -const char *get_raid_name(enum btrfs_raid_types type) |
|---|
| 156 | +const char *btrfs_bg_type_to_raid_name(u64 flags) |
|---|
| 121 | 157 | { |
|---|
| 122 | | - if (type >= BTRFS_NR_RAID_TYPES) |
|---|
| 158 | + const int index = btrfs_bg_flags_to_raid_index(flags); |
|---|
| 159 | + |
|---|
| 160 | + if (index >= BTRFS_NR_RAID_TYPES) |
|---|
| 123 | 161 | return NULL; |
|---|
| 124 | 162 | |
|---|
| 125 | | - return btrfs_raid_array[type].raid_name; |
|---|
| 163 | + return btrfs_raid_array[index].raid_name; |
|---|
| 126 | 164 | } |
|---|
| 127 | 165 | |
|---|
| 128 | | -static int init_first_rw_device(struct btrfs_trans_handle *trans, |
|---|
| 129 | | - struct btrfs_fs_info *fs_info); |
|---|
| 166 | +/* |
|---|
| 167 | + * Fill @buf with textual description of @bg_flags, no more than @size_buf |
|---|
| 168 | + * bytes including terminating null byte. |
|---|
| 169 | + */ |
|---|
| 170 | +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) |
|---|
| 171 | +{ |
|---|
| 172 | + int i; |
|---|
| 173 | + int ret; |
|---|
| 174 | + char *bp = buf; |
|---|
| 175 | + u64 flags = bg_flags; |
|---|
| 176 | + u32 size_bp = size_buf; |
|---|
| 177 | + |
|---|
| 178 | + if (!flags) { |
|---|
| 179 | + strcpy(bp, "NONE"); |
|---|
| 180 | + return; |
|---|
| 181 | + } |
|---|
| 182 | + |
|---|
| 183 | +#define DESCRIBE_FLAG(flag, desc) \ |
|---|
| 184 | + do { \ |
|---|
| 185 | + if (flags & (flag)) { \ |
|---|
| 186 | + ret = snprintf(bp, size_bp, "%s|", (desc)); \ |
|---|
| 187 | + if (ret < 0 || ret >= size_bp) \ |
|---|
| 188 | + goto out_overflow; \ |
|---|
| 189 | + size_bp -= ret; \ |
|---|
| 190 | + bp += ret; \ |
|---|
| 191 | + flags &= ~(flag); \ |
|---|
| 192 | + } \ |
|---|
| 193 | + } while (0) |
|---|
| 194 | + |
|---|
| 195 | + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); |
|---|
| 196 | + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); |
|---|
| 197 | + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); |
|---|
| 198 | + |
|---|
| 199 | + DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); |
|---|
| 200 | + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) |
|---|
| 201 | + DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, |
|---|
| 202 | + btrfs_raid_array[i].raid_name); |
|---|
| 203 | +#undef DESCRIBE_FLAG |
|---|
| 204 | + |
|---|
| 205 | + if (flags) { |
|---|
| 206 | + ret = snprintf(bp, size_bp, "0x%llx|", flags); |
|---|
| 207 | + size_bp -= ret; |
|---|
| 208 | + } |
|---|
| 209 | + |
|---|
| 210 | + if (size_bp < size_buf) |
|---|
| 211 | + buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ |
|---|
| 212 | + |
|---|
| 213 | + /* |
|---|
| 214 | + * The text is trimmed, it's up to the caller to provide sufficiently |
|---|
| 215 | + * large buffer |
|---|
| 216 | + */ |
|---|
| 217 | +out_overflow:; |
|---|
| 218 | +} |
|---|
| 219 | + |
|---|
| 220 | +static int init_first_rw_device(struct btrfs_trans_handle *trans); |
|---|
| 130 | 221 | static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); |
|---|
| 131 | | -static void __btrfs_reset_dev_stats(struct btrfs_device *dev); |
|---|
| 132 | 222 | static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); |
|---|
| 133 | 223 | static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); |
|---|
| 134 | 224 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, |
|---|
| .. | .. |
|---|
| 153 | 243 | * the mutex can be very coarse and can cover long-running operations |
|---|
| 154 | 244 | * |
|---|
| 155 | 245 | * protects: updates to fs_devices counters like missing devices, rw devices, |
|---|
| 156 | | - * seeding, structure cloning, openning/closing devices at mount/umount time |
|---|
| 246 | + * seeding, structure cloning, opening/closing devices at mount/umount time |
|---|
| 157 | 247 | * |
|---|
| 158 | 248 | * global::fs_devs - add, remove, updates to the global list |
|---|
| 159 | 249 | * |
|---|
| .. | .. |
|---|
| 183 | 273 | * chunk_mutex |
|---|
| 184 | 274 | * ----------- |
|---|
| 185 | 275 | * protects chunks, adding or removing during allocation, trim or when a new |
|---|
| 186 | | - * device is added/removed |
|---|
| 276 | + * device is added/removed. Additionally it also protects post_commit_list of |
|---|
| 277 | + * individual devices, since they can be added to the transaction's |
|---|
| 278 | + * post_commit_list only with chunk_mutex held. |
|---|
| 187 | 279 | * |
|---|
| 188 | 280 | * cleaner_mutex |
|---|
| 189 | 281 | * ------------- |
|---|
| .. | .. |
|---|
| 195 | 287 | * ============ |
|---|
| 196 | 288 | * |
|---|
| 197 | 289 | * uuid_mutex |
|---|
| 198 | | - * volume_mutex |
|---|
| 199 | | - * device_list_mutex |
|---|
| 200 | | - * chunk_mutex |
|---|
| 201 | | - * balance_mutex |
|---|
| 290 | + * device_list_mutex |
|---|
| 291 | + * chunk_mutex |
|---|
| 292 | + * balance_mutex |
|---|
| 202 | 293 | * |
|---|
| 203 | 294 | * |
|---|
| 204 | | - * Exclusive operations, BTRFS_FS_EXCL_OP |
|---|
| 205 | | - * ====================================== |
|---|
| 295 | + * Exclusive operations |
|---|
| 296 | + * ==================== |
|---|
| 206 | 297 | * |
|---|
| 207 | 298 | * Maintains the exclusivity of the following operations that apply to the |
|---|
| 208 | 299 | * whole filesystem and cannot run in parallel. |
|---|
| .. | .. |
|---|
| 228 | 319 | * - system power-cycle and filesystem mounted as read-only |
|---|
| 229 | 320 | * - filesystem or device errors leading to forced read-only |
|---|
| 230 | 321 | * |
|---|
| 231 | | - * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. |
|---|
| 232 | | - * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. |
|---|
| 322 | + * The status of exclusive operation is set and cleared atomically. |
|---|
| 323 | + * During the course of Paused state, fs_info::exclusive_operation remains set. |
|---|
| 233 | 324 | * A device operation in Paused or Running state can be canceled or resumed |
|---|
| 234 | 325 | * either by ioctl (Balance only) or when remounted as read-write. |
|---|
| 235 | | - * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or |
|---|
| 326 | + * The exclusive status is cleared when the device operation is canceled or |
|---|
| 236 | 327 | * completed. |
|---|
| 237 | 328 | */ |
|---|
| 238 | 329 | |
|---|
| 239 | 330 | DEFINE_MUTEX(uuid_mutex); |
|---|
| 240 | 331 | static LIST_HEAD(fs_uuids); |
|---|
| 241 | | -struct list_head *btrfs_get_fs_uuids(void) |
|---|
| 332 | +struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) |
|---|
| 242 | 333 | { |
|---|
| 243 | 334 | return &fs_uuids; |
|---|
| 244 | 335 | } |
|---|
| 245 | 336 | |
|---|
| 246 | 337 | /* |
|---|
| 247 | 338 | * alloc_fs_devices - allocate struct btrfs_fs_devices |
|---|
| 248 | | - * @fsid: if not NULL, copy the uuid to fs_devices::fsid |
|---|
| 339 | + * @fsid: if not NULL, copy the UUID to fs_devices::fsid |
|---|
| 340 | + * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid |
|---|
| 249 | 341 | * |
|---|
| 250 | 342 | * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). |
|---|
| 251 | 343 | * The returned struct is not linked onto any lists and can be destroyed with |
|---|
| 252 | 344 | * kfree() right away. |
|---|
| 253 | 345 | */ |
|---|
| 254 | | -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) |
|---|
| 346 | +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, |
|---|
| 347 | + const u8 *metadata_fsid) |
|---|
| 255 | 348 | { |
|---|
| 256 | 349 | struct btrfs_fs_devices *fs_devs; |
|---|
| 257 | 350 | |
|---|
| .. | .. |
|---|
| 262 | 355 | mutex_init(&fs_devs->device_list_mutex); |
|---|
| 263 | 356 | |
|---|
| 264 | 357 | INIT_LIST_HEAD(&fs_devs->devices); |
|---|
| 265 | | - INIT_LIST_HEAD(&fs_devs->resized_devices); |
|---|
| 266 | 358 | INIT_LIST_HEAD(&fs_devs->alloc_list); |
|---|
| 267 | 359 | INIT_LIST_HEAD(&fs_devs->fs_list); |
|---|
| 360 | + INIT_LIST_HEAD(&fs_devs->seed_list); |
|---|
| 268 | 361 | if (fsid) |
|---|
| 269 | 362 | memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); |
|---|
| 363 | + |
|---|
| 364 | + if (metadata_fsid) |
|---|
| 365 | + memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); |
|---|
| 366 | + else if (fsid) |
|---|
| 367 | + memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); |
|---|
| 270 | 368 | |
|---|
| 271 | 369 | return fs_devs; |
|---|
| 272 | 370 | } |
|---|
| 273 | 371 | |
|---|
| 274 | 372 | void btrfs_free_device(struct btrfs_device *device) |
|---|
| 275 | 373 | { |
|---|
| 374 | + WARN_ON(!list_empty(&device->post_commit_list)); |
|---|
| 276 | 375 | rcu_string_free(device->name); |
|---|
| 376 | + extent_io_tree_release(&device->alloc_state); |
|---|
| 277 | 377 | bio_put(device->flush_bio); |
|---|
| 278 | 378 | kfree(device); |
|---|
| 279 | 379 | } |
|---|
| .. | .. |
|---|
| 281 | 381 | static void free_fs_devices(struct btrfs_fs_devices *fs_devices) |
|---|
| 282 | 382 | { |
|---|
| 283 | 383 | struct btrfs_device *device; |
|---|
| 384 | + |
|---|
| 284 | 385 | WARN_ON(fs_devices->opened); |
|---|
| 285 | 386 | while (!list_empty(&fs_devices->devices)) { |
|---|
| 286 | 387 | device = list_entry(fs_devices->devices.next, |
|---|
| .. | .. |
|---|
| 289 | 390 | btrfs_free_device(device); |
|---|
| 290 | 391 | } |
|---|
| 291 | 392 | kfree(fs_devices); |
|---|
| 292 | | -} |
|---|
| 293 | | - |
|---|
| 294 | | -static void btrfs_kobject_uevent(struct block_device *bdev, |
|---|
| 295 | | - enum kobject_action action) |
|---|
| 296 | | -{ |
|---|
| 297 | | - int ret; |
|---|
| 298 | | - |
|---|
| 299 | | - ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); |
|---|
| 300 | | - if (ret) |
|---|
| 301 | | - pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", |
|---|
| 302 | | - action, |
|---|
| 303 | | - kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), |
|---|
| 304 | | - &disk_to_dev(bdev->bd_disk)->kobj); |
|---|
| 305 | 393 | } |
|---|
| 306 | 394 | |
|---|
| 307 | 395 | void __exit btrfs_cleanup_fs_uuids(void) |
|---|
| .. | .. |
|---|
| 321 | 409 | * Returned struct is not linked onto any lists and must be destroyed using |
|---|
| 322 | 410 | * btrfs_free_device. |
|---|
| 323 | 411 | */ |
|---|
| 324 | | -static struct btrfs_device *__alloc_device(void) |
|---|
| 412 | +static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) |
|---|
| 325 | 413 | { |
|---|
| 326 | 414 | struct btrfs_device *dev; |
|---|
| 327 | 415 | |
|---|
| .. | .. |
|---|
| 341 | 429 | |
|---|
| 342 | 430 | INIT_LIST_HEAD(&dev->dev_list); |
|---|
| 343 | 431 | INIT_LIST_HEAD(&dev->dev_alloc_list); |
|---|
| 344 | | - INIT_LIST_HEAD(&dev->resized_list); |
|---|
| 345 | | - |
|---|
| 346 | | - spin_lock_init(&dev->io_lock); |
|---|
| 432 | + INIT_LIST_HEAD(&dev->post_commit_list); |
|---|
| 347 | 433 | |
|---|
| 348 | 434 | atomic_set(&dev->reada_in_flight, 0); |
|---|
| 349 | 435 | atomic_set(&dev->dev_stats_ccnt, 0); |
|---|
| 350 | 436 | btrfs_device_data_ordered_init(dev); |
|---|
| 351 | 437 | INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); |
|---|
| 352 | 438 | INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); |
|---|
| 439 | + extent_io_tree_init(fs_info, &dev->alloc_state, |
|---|
| 440 | + IO_TREE_DEVICE_ALLOC_STATE, NULL); |
|---|
| 353 | 441 | |
|---|
| 354 | 442 | return dev; |
|---|
| 355 | 443 | } |
|---|
| 356 | 444 | |
|---|
| 357 | | -static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) |
|---|
| 445 | +static noinline struct btrfs_fs_devices *find_fsid( |
|---|
| 446 | + const u8 *fsid, const u8 *metadata_fsid) |
|---|
| 358 | 447 | { |
|---|
| 359 | 448 | struct btrfs_fs_devices *fs_devices; |
|---|
| 360 | 449 | |
|---|
| 450 | + ASSERT(fsid); |
|---|
| 451 | + |
|---|
| 452 | + /* Handle non-split brain cases */ |
|---|
| 361 | 453 | list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
|---|
| 362 | | - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) |
|---|
| 363 | | - return fs_devices; |
|---|
| 454 | + if (metadata_fsid) { |
|---|
| 455 | + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 |
|---|
| 456 | + && memcmp(metadata_fsid, fs_devices->metadata_uuid, |
|---|
| 457 | + BTRFS_FSID_SIZE) == 0) |
|---|
| 458 | + return fs_devices; |
|---|
| 459 | + } else { |
|---|
| 460 | + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) |
|---|
| 461 | + return fs_devices; |
|---|
| 462 | + } |
|---|
| 364 | 463 | } |
|---|
| 365 | 464 | return NULL; |
|---|
| 366 | 465 | } |
|---|
| 367 | 466 | |
|---|
| 467 | +static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( |
|---|
| 468 | + struct btrfs_super_block *disk_super) |
|---|
| 469 | +{ |
|---|
| 470 | + |
|---|
| 471 | + struct btrfs_fs_devices *fs_devices; |
|---|
| 472 | + |
|---|
| 473 | + /* |
|---|
| 474 | + * Handle scanned device having completed its fsid change but |
|---|
| 475 | + * belonging to a fs_devices that was created by first scanning |
|---|
| 476 | + * a device which didn't have its fsid/metadata_uuid changed |
|---|
| 477 | + * at all and the CHANGING_FSID_V2 flag set. |
|---|
| 478 | + */ |
|---|
| 479 | + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
|---|
| 480 | + if (fs_devices->fsid_change && |
|---|
| 481 | + memcmp(disk_super->metadata_uuid, fs_devices->fsid, |
|---|
| 482 | + BTRFS_FSID_SIZE) == 0 && |
|---|
| 483 | + memcmp(fs_devices->fsid, fs_devices->metadata_uuid, |
|---|
| 484 | + BTRFS_FSID_SIZE) == 0) { |
|---|
| 485 | + return fs_devices; |
|---|
| 486 | + } |
|---|
| 487 | + } |
|---|
| 488 | + /* |
|---|
| 489 | + * Handle scanned device having completed its fsid change but |
|---|
| 490 | + * belonging to a fs_devices that was created by a device that |
|---|
| 491 | + * has an outdated pair of fsid/metadata_uuid and |
|---|
| 492 | + * CHANGING_FSID_V2 flag set. |
|---|
| 493 | + */ |
|---|
| 494 | + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
|---|
| 495 | + if (fs_devices->fsid_change && |
|---|
| 496 | + memcmp(fs_devices->metadata_uuid, |
|---|
| 497 | + fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && |
|---|
| 498 | + memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, |
|---|
| 499 | + BTRFS_FSID_SIZE) == 0) { |
|---|
| 500 | + return fs_devices; |
|---|
| 501 | + } |
|---|
| 502 | + } |
|---|
| 503 | + |
|---|
| 504 | + return find_fsid(disk_super->fsid, disk_super->metadata_uuid); |
|---|
| 505 | +} |
|---|
| 506 | + |
|---|
| 507 | + |
|---|
| 368 | 508 | static int |
|---|
| 369 | 509 | btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, |
|---|
| 370 | 510 | int flush, struct block_device **bdev, |
|---|
| 371 | | - struct buffer_head **bh) |
|---|
| 511 | + struct btrfs_super_block **disk_super) |
|---|
| 372 | 512 | { |
|---|
| 373 | 513 | int ret; |
|---|
| 374 | 514 | |
|---|
| .. | .. |
|---|
| 387 | 527 | goto error; |
|---|
| 388 | 528 | } |
|---|
| 389 | 529 | invalidate_bdev(*bdev); |
|---|
| 390 | | - *bh = btrfs_read_dev_super(*bdev); |
|---|
| 391 | | - if (IS_ERR(*bh)) { |
|---|
| 392 | | - ret = PTR_ERR(*bh); |
|---|
| 530 | + *disk_super = btrfs_read_dev_super(*bdev); |
|---|
| 531 | + if (IS_ERR(*disk_super)) { |
|---|
| 532 | + ret = PTR_ERR(*disk_super); |
|---|
| 393 | 533 | blkdev_put(*bdev, flags); |
|---|
| 394 | 534 | goto error; |
|---|
| 395 | 535 | } |
|---|
| .. | .. |
|---|
| 398 | 538 | |
|---|
| 399 | 539 | error: |
|---|
| 400 | 540 | *bdev = NULL; |
|---|
| 401 | | - *bh = NULL; |
|---|
| 402 | 541 | return ret; |
|---|
| 403 | 542 | } |
|---|
| 404 | 543 | |
|---|
| 405 | | -static void requeue_list(struct btrfs_pending_bios *pending_bios, |
|---|
| 406 | | - struct bio *head, struct bio *tail) |
|---|
| 407 | | -{ |
|---|
| 408 | | - |
|---|
| 409 | | - struct bio *old_head; |
|---|
| 410 | | - |
|---|
| 411 | | - old_head = pending_bios->head; |
|---|
| 412 | | - pending_bios->head = head; |
|---|
| 413 | | - if (pending_bios->tail) |
|---|
| 414 | | - tail->bi_next = old_head; |
|---|
| 415 | | - else |
|---|
| 416 | | - pending_bios->tail = tail; |
|---|
| 417 | | -} |
|---|
| 418 | | - |
|---|
| 419 | 544 | /* |
|---|
| 420 | | - * we try to collect pending bios for a device so we don't get a large |
|---|
| 421 | | - * number of procs sending bios down to the same device. This greatly |
|---|
| 422 | | - * improves the schedulers ability to collect and merge the bios. |
|---|
| 545 | + * Check if the device in the path matches the device in the given struct device. |
|---|
| 423 | 546 | * |
|---|
| 424 | | - * But, it also turns into a long list of bios to process and that is sure |
|---|
| 425 | | - * to eventually make the worker thread block. The solution here is to |
|---|
| 426 | | - * make some progress and then put this work struct back at the end of |
|---|
| 427 | | - * the list if the block device is congested. This way, multiple devices |
|---|
| 428 | | - * can make progress from a single worker thread. |
|---|
| 547 | + * Returns: |
|---|
| 548 | + * true If it is the same device. |
|---|
| 549 | + * false If it is not the same device or on error. |
|---|
| 429 | 550 | */ |
|---|
| 430 | | -static noinline void run_scheduled_bios(struct btrfs_device *device) |
|---|
| 551 | +static bool device_matched(const struct btrfs_device *device, const char *path) |
|---|
| 431 | 552 | { |
|---|
| 432 | | - struct btrfs_fs_info *fs_info = device->fs_info; |
|---|
| 433 | | - struct bio *pending; |
|---|
| 434 | | - struct backing_dev_info *bdi; |
|---|
| 435 | | - struct btrfs_pending_bios *pending_bios; |
|---|
| 436 | | - struct bio *tail; |
|---|
| 437 | | - struct bio *cur; |
|---|
| 438 | | - int again = 0; |
|---|
| 439 | | - unsigned long num_run; |
|---|
| 440 | | - unsigned long batch_run = 0; |
|---|
| 441 | | - unsigned long last_waited = 0; |
|---|
| 442 | | - int force_reg = 0; |
|---|
| 443 | | - int sync_pending = 0; |
|---|
| 444 | | - struct blk_plug plug; |
|---|
| 553 | + char *device_name; |
|---|
| 554 | + struct block_device *bdev_old; |
|---|
| 555 | + struct block_device *bdev_new; |
|---|
| 445 | 556 | |
|---|
| 446 | 557 | /* |
|---|
| 447 | | - * this function runs all the bios we've collected for |
|---|
| 448 | | - * a particular device. We don't want to wander off to |
|---|
| 449 | | - * another device without first sending all of these down. |
|---|
| 450 | | - * So, setup a plug here and finish it off before we return |
|---|
| 558 | + * If we are looking for a device with the matching dev_t, then skip |
|---|
| 559 | + * device without a name (a missing device). |
|---|
| 451 | 560 | */ |
|---|
| 452 | | - blk_start_plug(&plug); |
|---|
| 561 | + if (!device->name) |
|---|
| 562 | + return false; |
|---|
| 453 | 563 | |
|---|
| 454 | | - bdi = device->bdev->bd_bdi; |
|---|
| 564 | + device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); |
|---|
| 565 | + if (!device_name) |
|---|
| 566 | + return false; |
|---|
| 455 | 567 | |
|---|
| 456 | | -loop: |
|---|
| 457 | | - spin_lock(&device->io_lock); |
|---|
| 568 | + rcu_read_lock(); |
|---|
| 569 | + scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name)); |
|---|
| 570 | + rcu_read_unlock(); |
|---|
| 458 | 571 | |
|---|
| 459 | | -loop_lock: |
|---|
| 460 | | - num_run = 0; |
|---|
| 572 | + bdev_old = lookup_bdev(device_name); |
|---|
| 573 | + kfree(device_name); |
|---|
| 574 | + if (IS_ERR(bdev_old)) |
|---|
| 575 | + return false; |
|---|
| 461 | 576 | |
|---|
| 462 | | - /* take all the bios off the list at once and process them |
|---|
| 463 | | - * later on (without the lock held). But, remember the |
|---|
| 464 | | - * tail and other pointers so the bios can be properly reinserted |
|---|
| 465 | | - * into the list if we hit congestion |
|---|
| 466 | | - */ |
|---|
| 467 | | - if (!force_reg && device->pending_sync_bios.head) { |
|---|
| 468 | | - pending_bios = &device->pending_sync_bios; |
|---|
| 469 | | - force_reg = 1; |
|---|
| 470 | | - } else { |
|---|
| 471 | | - pending_bios = &device->pending_bios; |
|---|
| 472 | | - force_reg = 0; |
|---|
| 473 | | - } |
|---|
| 577 | + bdev_new = lookup_bdev(path); |
|---|
| 578 | + if (IS_ERR(bdev_new)) |
|---|
| 579 | + return false; |
|---|
| 474 | 580 | |
|---|
| 475 | | - pending = pending_bios->head; |
|---|
| 476 | | - tail = pending_bios->tail; |
|---|
| 477 | | - WARN_ON(pending && !tail); |
|---|
| 581 | + if (bdev_old == bdev_new) |
|---|
| 582 | + return true; |
|---|
| 478 | 583 | |
|---|
| 479 | | - /* |
|---|
| 480 | | - * if pending was null this time around, no bios need processing |
|---|
| 481 | | - * at all and we can stop. Otherwise it'll loop back up again |
|---|
| 482 | | - * and do an additional check so no bios are missed. |
|---|
| 483 | | - * |
|---|
| 484 | | - * device->running_pending is used to synchronize with the |
|---|
| 485 | | - * schedule_bio code. |
|---|
| 486 | | - */ |
|---|
| 487 | | - if (device->pending_sync_bios.head == NULL && |
|---|
| 488 | | - device->pending_bios.head == NULL) { |
|---|
| 489 | | - again = 0; |
|---|
| 490 | | - device->running_pending = 0; |
|---|
| 491 | | - } else { |
|---|
| 492 | | - again = 1; |
|---|
| 493 | | - device->running_pending = 1; |
|---|
| 494 | | - } |
|---|
| 495 | | - |
|---|
| 496 | | - pending_bios->head = NULL; |
|---|
| 497 | | - pending_bios->tail = NULL; |
|---|
| 498 | | - |
|---|
| 499 | | - spin_unlock(&device->io_lock); |
|---|
| 500 | | - |
|---|
| 501 | | - while (pending) { |
|---|
| 502 | | - |
|---|
| 503 | | - rmb(); |
|---|
| 504 | | - /* we want to work on both lists, but do more bios on the |
|---|
| 505 | | - * sync list than the regular list |
|---|
| 506 | | - */ |
|---|
| 507 | | - if ((num_run > 32 && |
|---|
| 508 | | - pending_bios != &device->pending_sync_bios && |
|---|
| 509 | | - device->pending_sync_bios.head) || |
|---|
| 510 | | - (num_run > 64 && pending_bios == &device->pending_sync_bios && |
|---|
| 511 | | - device->pending_bios.head)) { |
|---|
| 512 | | - spin_lock(&device->io_lock); |
|---|
| 513 | | - requeue_list(pending_bios, pending, tail); |
|---|
| 514 | | - goto loop_lock; |
|---|
| 515 | | - } |
|---|
| 516 | | - |
|---|
| 517 | | - cur = pending; |
|---|
| 518 | | - pending = pending->bi_next; |
|---|
| 519 | | - cur->bi_next = NULL; |
|---|
| 520 | | - |
|---|
| 521 | | - BUG_ON(atomic_read(&cur->__bi_cnt) == 0); |
|---|
| 522 | | - |
|---|
| 523 | | - /* |
|---|
| 524 | | - * if we're doing the sync list, record that our |
|---|
| 525 | | - * plug has some sync requests on it |
|---|
| 526 | | - * |
|---|
| 527 | | - * If we're doing the regular list and there are |
|---|
| 528 | | - * sync requests sitting around, unplug before |
|---|
| 529 | | - * we add more |
|---|
| 530 | | - */ |
|---|
| 531 | | - if (pending_bios == &device->pending_sync_bios) { |
|---|
| 532 | | - sync_pending = 1; |
|---|
| 533 | | - } else if (sync_pending) { |
|---|
| 534 | | - blk_finish_plug(&plug); |
|---|
| 535 | | - blk_start_plug(&plug); |
|---|
| 536 | | - sync_pending = 0; |
|---|
| 537 | | - } |
|---|
| 538 | | - |
|---|
| 539 | | - btrfsic_submit_bio(cur); |
|---|
| 540 | | - num_run++; |
|---|
| 541 | | - batch_run++; |
|---|
| 542 | | - |
|---|
| 543 | | - cond_resched(); |
|---|
| 544 | | - |
|---|
| 545 | | - /* |
|---|
| 546 | | - * we made progress, there is more work to do and the bdi |
|---|
| 547 | | - * is now congested. Back off and let other work structs |
|---|
| 548 | | - * run instead |
|---|
| 549 | | - */ |
|---|
| 550 | | - if (pending && bdi_write_congested(bdi) && batch_run > 8 && |
|---|
| 551 | | - fs_info->fs_devices->open_devices > 1) { |
|---|
| 552 | | - struct io_context *ioc; |
|---|
| 553 | | - |
|---|
| 554 | | - ioc = current->io_context; |
|---|
| 555 | | - |
|---|
| 556 | | - /* |
|---|
| 557 | | - * the main goal here is that we don't want to |
|---|
| 558 | | - * block if we're going to be able to submit |
|---|
| 559 | | - * more requests without blocking. |
|---|
| 560 | | - * |
|---|
| 561 | | - * This code does two great things, it pokes into |
|---|
| 562 | | - * the elevator code from a filesystem _and_ |
|---|
| 563 | | - * it makes assumptions about how batching works. |
|---|
| 564 | | - */ |
|---|
| 565 | | - if (ioc && ioc->nr_batch_requests > 0 && |
|---|
| 566 | | - time_before(jiffies, ioc->last_waited + HZ/50UL) && |
|---|
| 567 | | - (last_waited == 0 || |
|---|
| 568 | | - ioc->last_waited == last_waited)) { |
|---|
| 569 | | - /* |
|---|
| 570 | | - * we want to go through our batch of |
|---|
| 571 | | - * requests and stop. So, we copy out |
|---|
| 572 | | - * the ioc->last_waited time and test |
|---|
| 573 | | - * against it before looping |
|---|
| 574 | | - */ |
|---|
| 575 | | - last_waited = ioc->last_waited; |
|---|
| 576 | | - cond_resched(); |
|---|
| 577 | | - continue; |
|---|
| 578 | | - } |
|---|
| 579 | | - spin_lock(&device->io_lock); |
|---|
| 580 | | - requeue_list(pending_bios, pending, tail); |
|---|
| 581 | | - device->running_pending = 1; |
|---|
| 582 | | - |
|---|
| 583 | | - spin_unlock(&device->io_lock); |
|---|
| 584 | | - btrfs_queue_work(fs_info->submit_workers, |
|---|
| 585 | | - &device->work); |
|---|
| 586 | | - goto done; |
|---|
| 587 | | - } |
|---|
| 588 | | - } |
|---|
| 589 | | - |
|---|
| 590 | | - cond_resched(); |
|---|
| 591 | | - if (again) |
|---|
| 592 | | - goto loop; |
|---|
| 593 | | - |
|---|
| 594 | | - spin_lock(&device->io_lock); |
|---|
| 595 | | - if (device->pending_bios.head || device->pending_sync_bios.head) |
|---|
| 596 | | - goto loop_lock; |
|---|
| 597 | | - spin_unlock(&device->io_lock); |
|---|
| 598 | | - |
|---|
| 599 | | -done: |
|---|
| 600 | | - blk_finish_plug(&plug); |
|---|
| 601 | | -} |
|---|
| 602 | | - |
|---|
| 603 | | -static void pending_bios_fn(struct btrfs_work *work) |
|---|
| 604 | | -{ |
|---|
| 605 | | - struct btrfs_device *device; |
|---|
| 606 | | - |
|---|
| 607 | | - device = container_of(work, struct btrfs_device, work); |
|---|
| 608 | | - run_scheduled_bios(device); |
|---|
| 584 | + return false; |
|---|
| 609 | 585 | } |
|---|
| 610 | 586 | |
|---|
| 611 | 587 | /* |
|---|
| .. | .. |
|---|
| 615 | 591 | * matching this path only. |
|---|
| 616 | 592 | * skip_dev: Optional. Will skip this device when searching for the stale |
|---|
| 617 | 593 | * devices. |
|---|
| 594 | + * Return: 0 for success or if @path is NULL. |
|---|
| 595 | + * -EBUSY if @path is a mounted device. |
|---|
| 596 | + * -ENOENT if @path does not match any device in the list. |
|---|
| 618 | 597 | */ |
|---|
| 619 | | -static void btrfs_free_stale_devices(const char *path, |
|---|
| 598 | +static int btrfs_free_stale_devices(const char *path, |
|---|
| 620 | 599 | struct btrfs_device *skip_device) |
|---|
| 621 | 600 | { |
|---|
| 622 | 601 | struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; |
|---|
| 623 | 602 | struct btrfs_device *device, *tmp_device; |
|---|
| 603 | + int ret = 0; |
|---|
| 604 | + |
|---|
| 605 | + lockdep_assert_held(&uuid_mutex); |
|---|
| 606 | + |
|---|
| 607 | + if (path) |
|---|
| 608 | + ret = -ENOENT; |
|---|
| 624 | 609 | |
|---|
| 625 | 610 | list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { |
|---|
| 626 | | - mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 627 | | - if (fs_devices->opened) { |
|---|
| 628 | | - mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 629 | | - continue; |
|---|
| 630 | | - } |
|---|
| 631 | 611 | |
|---|
| 612 | + mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 632 | 613 | list_for_each_entry_safe(device, tmp_device, |
|---|
| 633 | 614 | &fs_devices->devices, dev_list) { |
|---|
| 634 | | - int not_found = 0; |
|---|
| 635 | | - |
|---|
| 636 | 615 | if (skip_device && skip_device == device) |
|---|
| 637 | 616 | continue; |
|---|
| 638 | | - if (path && !device->name) |
|---|
| 617 | + if (path && !device_matched(device, path)) |
|---|
| 639 | 618 | continue; |
|---|
| 640 | | - |
|---|
| 641 | | - rcu_read_lock(); |
|---|
| 642 | | - if (path) |
|---|
| 643 | | - not_found = strcmp(rcu_str_deref(device->name), |
|---|
| 644 | | - path); |
|---|
| 645 | | - rcu_read_unlock(); |
|---|
| 646 | | - if (not_found) |
|---|
| 647 | | - continue; |
|---|
| 619 | + if (fs_devices->opened) { |
|---|
| 620 | + /* for an already deleted device return 0 */ |
|---|
| 621 | + if (path && ret != 0) |
|---|
| 622 | + ret = -EBUSY; |
|---|
| 623 | + break; |
|---|
| 624 | + } |
|---|
| 648 | 625 | |
|---|
| 649 | 626 | /* delete the stale device */ |
|---|
| 650 | 627 | fs_devices->num_devices--; |
|---|
| 651 | 628 | list_del(&device->dev_list); |
|---|
| 652 | 629 | btrfs_free_device(device); |
|---|
| 653 | 630 | |
|---|
| 654 | | - if (fs_devices->num_devices == 0) |
|---|
| 655 | | - break; |
|---|
| 631 | + ret = 0; |
|---|
| 656 | 632 | } |
|---|
| 657 | 633 | mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 634 | + |
|---|
| 658 | 635 | if (fs_devices->num_devices == 0) { |
|---|
| 659 | 636 | btrfs_sysfs_remove_fsid(fs_devices); |
|---|
| 660 | 637 | list_del(&fs_devices->fs_list); |
|---|
| 661 | 638 | free_fs_devices(fs_devices); |
|---|
| 662 | 639 | } |
|---|
| 663 | 640 | } |
|---|
| 641 | + |
|---|
| 642 | + return ret; |
|---|
| 664 | 643 | } |
|---|
| 665 | 644 | |
|---|
| 666 | 645 | /* |
|---|
| .. | .. |
|---|
| 674 | 653 | { |
|---|
| 675 | 654 | struct request_queue *q; |
|---|
| 676 | 655 | struct block_device *bdev; |
|---|
| 677 | | - struct buffer_head *bh; |
|---|
| 678 | 656 | struct btrfs_super_block *disk_super; |
|---|
| 679 | 657 | u64 devid; |
|---|
| 680 | 658 | int ret; |
|---|
| .. | .. |
|---|
| 685 | 663 | return -EINVAL; |
|---|
| 686 | 664 | |
|---|
| 687 | 665 | ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, |
|---|
| 688 | | - &bdev, &bh); |
|---|
| 666 | + &bdev, &disk_super); |
|---|
| 689 | 667 | if (ret) |
|---|
| 690 | 668 | return ret; |
|---|
| 691 | 669 | |
|---|
| 692 | | - disk_super = (struct btrfs_super_block *)bh->b_data; |
|---|
| 693 | 670 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
|---|
| 694 | 671 | if (devid != device->devid) |
|---|
| 695 | | - goto error_brelse; |
|---|
| 672 | + goto error_free_page; |
|---|
| 696 | 673 | |
|---|
| 697 | 674 | if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) |
|---|
| 698 | | - goto error_brelse; |
|---|
| 675 | + goto error_free_page; |
|---|
| 699 | 676 | |
|---|
| 700 | 677 | device->generation = btrfs_super_generation(disk_super); |
|---|
| 701 | 678 | |
|---|
| 702 | 679 | if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { |
|---|
| 680 | + if (btrfs_super_incompat_flags(disk_super) & |
|---|
| 681 | + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { |
|---|
| 682 | + pr_err( |
|---|
| 683 | + "BTRFS: Invalid seeding and uuid-changed device detected\n"); |
|---|
| 684 | + goto error_free_page; |
|---|
| 685 | + } |
|---|
| 686 | + |
|---|
| 703 | 687 | clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); |
|---|
| 704 | | - fs_devices->seeding = 1; |
|---|
| 688 | + fs_devices->seeding = true; |
|---|
| 705 | 689 | } else { |
|---|
| 706 | 690 | if (bdev_read_only(bdev)) |
|---|
| 707 | 691 | clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); |
|---|
| .. | .. |
|---|
| 711 | 695 | |
|---|
| 712 | 696 | q = bdev_get_queue(bdev); |
|---|
| 713 | 697 | if (!blk_queue_nonrot(q)) |
|---|
| 714 | | - fs_devices->rotating = 1; |
|---|
| 698 | + fs_devices->rotating = true; |
|---|
| 715 | 699 | |
|---|
| 716 | 700 | device->bdev = bdev; |
|---|
| 717 | 701 | clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); |
|---|
| .. | .. |
|---|
| 723 | 707 | fs_devices->rw_devices++; |
|---|
| 724 | 708 | list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); |
|---|
| 725 | 709 | } |
|---|
| 726 | | - brelse(bh); |
|---|
| 710 | + btrfs_release_disk_super(disk_super); |
|---|
| 727 | 711 | |
|---|
| 728 | 712 | return 0; |
|---|
| 729 | 713 | |
|---|
| 730 | | -error_brelse: |
|---|
| 731 | | - brelse(bh); |
|---|
| 714 | +error_free_page: |
|---|
| 715 | + btrfs_release_disk_super(disk_super); |
|---|
| 732 | 716 | blkdev_put(bdev, flags); |
|---|
| 733 | 717 | |
|---|
| 734 | 718 | return -EINVAL; |
|---|
| 735 | 719 | } |
|---|
| 736 | 720 | |
|---|
| 721 | +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) |
|---|
| 722 | +{ |
|---|
| 723 | + bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & |
|---|
| 724 | + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); |
|---|
| 725 | + |
|---|
| 726 | + return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; |
|---|
| 727 | +} |
|---|
| 728 | + |
|---|
| 729 | +/* |
|---|
| 730 | + * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices |
|---|
| 731 | + * being created with a disk that has already completed its fsid change. Such |
|---|
| 732 | + * disk can belong to an fs which has its FSID changed or to one which doesn't. |
|---|
| 733 | + * Handle both cases here. |
|---|
| 734 | + */ |
|---|
| 735 | +static struct btrfs_fs_devices *find_fsid_inprogress( |
|---|
| 736 | + struct btrfs_super_block *disk_super) |
|---|
| 737 | +{ |
|---|
| 738 | + struct btrfs_fs_devices *fs_devices; |
|---|
| 739 | + |
|---|
| 740 | + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
|---|
| 741 | + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, |
|---|
| 742 | + BTRFS_FSID_SIZE) != 0 && |
|---|
| 743 | + memcmp(fs_devices->metadata_uuid, disk_super->fsid, |
|---|
| 744 | + BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { |
|---|
| 745 | + return fs_devices; |
|---|
| 746 | + } |
|---|
| 747 | + } |
|---|
| 748 | + |
|---|
| 749 | + return find_fsid(disk_super->fsid, NULL); |
|---|
| 750 | +} |
|---|
| 751 | + |
|---|
| 752 | + |
|---|
| 753 | +static struct btrfs_fs_devices *find_fsid_changed( |
|---|
| 754 | + struct btrfs_super_block *disk_super) |
|---|
| 755 | +{ |
|---|
| 756 | + struct btrfs_fs_devices *fs_devices; |
|---|
| 757 | + |
|---|
| 758 | + /* |
|---|
| 759 | + * Handles the case where scanned device is part of an fs that had |
|---|
| 760 | + * multiple successful changes of FSID but curently device didn't |
|---|
| 761 | + * observe it. Meaning our fsid will be different than theirs. We need |
|---|
| 762 | + * to handle two subcases : |
|---|
| 763 | + * 1 - The fs still continues to have different METADATA/FSID uuids. |
|---|
| 764 | + * 2 - The fs is switched back to its original FSID (METADATA/FSID |
|---|
| 765 | + * are equal). |
|---|
| 766 | + */ |
|---|
| 767 | + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
|---|
| 768 | + /* Changed UUIDs */ |
|---|
| 769 | + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, |
|---|
| 770 | + BTRFS_FSID_SIZE) != 0 && |
|---|
| 771 | + memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, |
|---|
| 772 | + BTRFS_FSID_SIZE) == 0 && |
|---|
| 773 | + memcmp(fs_devices->fsid, disk_super->fsid, |
|---|
| 774 | + BTRFS_FSID_SIZE) != 0) |
|---|
| 775 | + return fs_devices; |
|---|
| 776 | + |
|---|
| 777 | + /* Unchanged UUIDs */ |
|---|
| 778 | + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, |
|---|
| 779 | + BTRFS_FSID_SIZE) == 0 && |
|---|
| 780 | + memcmp(fs_devices->fsid, disk_super->metadata_uuid, |
|---|
| 781 | + BTRFS_FSID_SIZE) == 0) |
|---|
| 782 | + return fs_devices; |
|---|
| 783 | + } |
|---|
| 784 | + |
|---|
| 785 | + return NULL; |
|---|
| 786 | +} |
|---|
| 787 | + |
|---|
| 788 | +static struct btrfs_fs_devices *find_fsid_reverted_metadata( |
|---|
| 789 | + struct btrfs_super_block *disk_super) |
|---|
| 790 | +{ |
|---|
| 791 | + struct btrfs_fs_devices *fs_devices; |
|---|
| 792 | + |
|---|
| 793 | + /* |
|---|
| 794 | + * Handle the case where the scanned device is part of an fs whose last |
|---|
| 795 | + * metadata UUID change reverted it to the original FSID. At the same |
|---|
| 796 | + * time * fs_devices was first created by another constitutent device |
|---|
| 797 | + * which didn't fully observe the operation. This results in an |
|---|
| 798 | + * btrfs_fs_devices created with metadata/fsid different AND |
|---|
| 799 | + * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the |
|---|
| 800 | + * fs_devices equal to the FSID of the disk. |
|---|
| 801 | + */ |
|---|
| 802 | + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
|---|
| 803 | + if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, |
|---|
| 804 | + BTRFS_FSID_SIZE) != 0 && |
|---|
| 805 | + memcmp(fs_devices->metadata_uuid, disk_super->fsid, |
|---|
| 806 | + BTRFS_FSID_SIZE) == 0 && |
|---|
| 807 | + fs_devices->fsid_change) |
|---|
| 808 | + return fs_devices; |
|---|
| 809 | + } |
|---|
| 810 | + |
|---|
| 811 | + return NULL; |
|---|
| 812 | +} |
|---|
| 737 | 813 | /* |
|---|
| 738 | 814 | * Add new device to list of registered devices |
|---|
| 739 | 815 | * |
|---|
| .. | .. |
|---|
| 746 | 822 | bool *new_device_added) |
|---|
| 747 | 823 | { |
|---|
| 748 | 824 | struct btrfs_device *device; |
|---|
| 749 | | - struct btrfs_fs_devices *fs_devices; |
|---|
| 825 | + struct btrfs_fs_devices *fs_devices = NULL; |
|---|
| 750 | 826 | struct rcu_string *name; |
|---|
| 751 | 827 | u64 found_transid = btrfs_super_generation(disk_super); |
|---|
| 752 | 828 | u64 devid = btrfs_stack_device_id(&disk_super->dev_item); |
|---|
| 829 | + bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & |
|---|
| 830 | + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); |
|---|
| 831 | + bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & |
|---|
| 832 | + BTRFS_SUPER_FLAG_CHANGING_FSID_V2); |
|---|
| 753 | 833 | |
|---|
| 754 | | - fs_devices = find_fsid(disk_super->fsid); |
|---|
| 834 | + if (fsid_change_in_progress) { |
|---|
| 835 | + if (!has_metadata_uuid) |
|---|
| 836 | + fs_devices = find_fsid_inprogress(disk_super); |
|---|
| 837 | + else |
|---|
| 838 | + fs_devices = find_fsid_changed(disk_super); |
|---|
| 839 | + } else if (has_metadata_uuid) { |
|---|
| 840 | + fs_devices = find_fsid_with_metadata_uuid(disk_super); |
|---|
| 841 | + } else { |
|---|
| 842 | + fs_devices = find_fsid_reverted_metadata(disk_super); |
|---|
| 843 | + if (!fs_devices) |
|---|
| 844 | + fs_devices = find_fsid(disk_super->fsid, NULL); |
|---|
| 845 | + } |
|---|
| 846 | + |
|---|
| 847 | + |
|---|
| 755 | 848 | if (!fs_devices) { |
|---|
| 756 | | - fs_devices = alloc_fs_devices(disk_super->fsid); |
|---|
| 849 | + if (has_metadata_uuid) |
|---|
| 850 | + fs_devices = alloc_fs_devices(disk_super->fsid, |
|---|
| 851 | + disk_super->metadata_uuid); |
|---|
| 852 | + else |
|---|
| 853 | + fs_devices = alloc_fs_devices(disk_super->fsid, NULL); |
|---|
| 854 | + |
|---|
| 757 | 855 | if (IS_ERR(fs_devices)) |
|---|
| 758 | 856 | return ERR_CAST(fs_devices); |
|---|
| 857 | + |
|---|
| 858 | + fs_devices->fsid_change = fsid_change_in_progress; |
|---|
| 759 | 859 | |
|---|
| 760 | 860 | mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 761 | 861 | list_add(&fs_devices->fs_list, &fs_uuids); |
|---|
| .. | .. |
|---|
| 765 | 865 | mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 766 | 866 | device = btrfs_find_device(fs_devices, devid, |
|---|
| 767 | 867 | disk_super->dev_item.uuid, NULL, false); |
|---|
| 868 | + |
|---|
| 869 | + /* |
|---|
| 870 | + * If this disk has been pulled into an fs devices created by |
|---|
| 871 | + * a device which had the CHANGING_FSID_V2 flag then replace the |
|---|
| 872 | + * metadata_uuid/fsid values of the fs_devices. |
|---|
| 873 | + */ |
|---|
| 874 | + if (fs_devices->fsid_change && |
|---|
| 875 | + found_transid > fs_devices->latest_generation) { |
|---|
| 876 | + memcpy(fs_devices->fsid, disk_super->fsid, |
|---|
| 877 | + BTRFS_FSID_SIZE); |
|---|
| 878 | + |
|---|
| 879 | + if (has_metadata_uuid) |
|---|
| 880 | + memcpy(fs_devices->metadata_uuid, |
|---|
| 881 | + disk_super->metadata_uuid, |
|---|
| 882 | + BTRFS_FSID_SIZE); |
|---|
| 883 | + else |
|---|
| 884 | + memcpy(fs_devices->metadata_uuid, |
|---|
| 885 | + disk_super->fsid, BTRFS_FSID_SIZE); |
|---|
| 886 | + |
|---|
| 887 | + fs_devices->fsid_change = false; |
|---|
| 888 | + } |
|---|
| 768 | 889 | } |
|---|
| 769 | 890 | |
|---|
| 770 | 891 | if (!device) { |
|---|
| .. | .. |
|---|
| 796 | 917 | *new_device_added = true; |
|---|
| 797 | 918 | |
|---|
| 798 | 919 | if (disk_super->label[0]) |
|---|
| 799 | | - pr_info("BTRFS: device label %s devid %llu transid %llu %s\n", |
|---|
| 800 | | - disk_super->label, devid, found_transid, path); |
|---|
| 920 | + pr_info( |
|---|
| 921 | + "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", |
|---|
| 922 | + disk_super->label, devid, found_transid, path, |
|---|
| 923 | + current->comm, task_pid_nr(current)); |
|---|
| 801 | 924 | else |
|---|
| 802 | | - pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n", |
|---|
| 803 | | - disk_super->fsid, devid, found_transid, path); |
|---|
| 925 | + pr_info( |
|---|
| 926 | + "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", |
|---|
| 927 | + disk_super->fsid, devid, found_transid, path, |
|---|
| 928 | + current->comm, task_pid_nr(current)); |
|---|
| 804 | 929 | |
|---|
| 805 | 930 | } else if (!device->name || strcmp(device->name->str, path)) { |
|---|
| 806 | 931 | /* |
|---|
| .. | .. |
|---|
| 897 | 1022 | * it back. We need it to pick the disk with largest generation |
|---|
| 898 | 1023 | * (as above). |
|---|
| 899 | 1024 | */ |
|---|
| 900 | | - if (!fs_devices->opened) |
|---|
| 1025 | + if (!fs_devices->opened) { |
|---|
| 901 | 1026 | device->generation = found_transid; |
|---|
| 1027 | + fs_devices->latest_generation = max_t(u64, found_transid, |
|---|
| 1028 | + fs_devices->latest_generation); |
|---|
| 1029 | + } |
|---|
| 902 | 1030 | |
|---|
| 903 | 1031 | fs_devices->total_devices = btrfs_super_num_devices(disk_super); |
|---|
| 904 | 1032 | |
|---|
| .. | .. |
|---|
| 911 | 1039 | struct btrfs_fs_devices *fs_devices; |
|---|
| 912 | 1040 | struct btrfs_device *device; |
|---|
| 913 | 1041 | struct btrfs_device *orig_dev; |
|---|
| 1042 | + int ret = 0; |
|---|
| 914 | 1043 | |
|---|
| 915 | | - fs_devices = alloc_fs_devices(orig->fsid); |
|---|
| 1044 | + lockdep_assert_held(&uuid_mutex); |
|---|
| 1045 | + |
|---|
| 1046 | + fs_devices = alloc_fs_devices(orig->fsid, NULL); |
|---|
| 916 | 1047 | if (IS_ERR(fs_devices)) |
|---|
| 917 | 1048 | return fs_devices; |
|---|
| 918 | 1049 | |
|---|
| 919 | | - mutex_lock(&orig->device_list_mutex); |
|---|
| 920 | 1050 | fs_devices->total_devices = orig->total_devices; |
|---|
| 921 | 1051 | |
|---|
| 922 | | - /* We have held the volume lock, it is safe to get the devices. */ |
|---|
| 923 | 1052 | list_for_each_entry(orig_dev, &orig->devices, dev_list) { |
|---|
| 924 | 1053 | struct rcu_string *name; |
|---|
| 925 | 1054 | |
|---|
| 926 | 1055 | device = btrfs_alloc_device(NULL, &orig_dev->devid, |
|---|
| 927 | 1056 | orig_dev->uuid); |
|---|
| 928 | | - if (IS_ERR(device)) |
|---|
| 1057 | + if (IS_ERR(device)) { |
|---|
| 1058 | + ret = PTR_ERR(device); |
|---|
| 929 | 1059 | goto error; |
|---|
| 1060 | + } |
|---|
| 930 | 1061 | |
|---|
| 931 | 1062 | /* |
|---|
| 932 | 1063 | * This is ok to do without rcu read locked because we hold the |
|---|
| .. | .. |
|---|
| 937 | 1068 | GFP_KERNEL); |
|---|
| 938 | 1069 | if (!name) { |
|---|
| 939 | 1070 | btrfs_free_device(device); |
|---|
| 1071 | + ret = -ENOMEM; |
|---|
| 940 | 1072 | goto error; |
|---|
| 941 | 1073 | } |
|---|
| 942 | 1074 | rcu_assign_pointer(device->name, name); |
|---|
| .. | .. |
|---|
| 946 | 1078 | device->fs_devices = fs_devices; |
|---|
| 947 | 1079 | fs_devices->num_devices++; |
|---|
| 948 | 1080 | } |
|---|
| 949 | | - mutex_unlock(&orig->device_list_mutex); |
|---|
| 950 | 1081 | return fs_devices; |
|---|
| 951 | 1082 | error: |
|---|
| 952 | | - mutex_unlock(&orig->device_list_mutex); |
|---|
| 953 | 1083 | free_fs_devices(fs_devices); |
|---|
| 954 | | - return ERR_PTR(-ENOMEM); |
|---|
| 1084 | + return ERR_PTR(ret); |
|---|
| 955 | 1085 | } |
|---|
| 956 | 1086 | |
|---|
| 957 | | -/* |
|---|
| 958 | | - * After we have read the system tree and know devids belonging to |
|---|
| 959 | | - * this filesystem, remove the device which does not belong there. |
|---|
| 960 | | - */ |
|---|
| 961 | | -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) |
|---|
| 1087 | +static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, |
|---|
| 1088 | + int step, struct btrfs_device **latest_dev) |
|---|
| 962 | 1089 | { |
|---|
| 963 | 1090 | struct btrfs_device *device, *next; |
|---|
| 964 | | - struct btrfs_device *latest_dev = NULL; |
|---|
| 965 | 1091 | |
|---|
| 966 | | - mutex_lock(&uuid_mutex); |
|---|
| 967 | | -again: |
|---|
| 968 | 1092 | /* This is the initialized path, it is safe to release the devices. */ |
|---|
| 969 | 1093 | list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { |
|---|
| 970 | | - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, |
|---|
| 971 | | - &device->dev_state)) { |
|---|
| 1094 | + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { |
|---|
| 972 | 1095 | if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, |
|---|
| 973 | | - &device->dev_state) && |
|---|
| 1096 | + &device->dev_state) && |
|---|
| 974 | 1097 | !test_bit(BTRFS_DEV_STATE_MISSING, |
|---|
| 975 | 1098 | &device->dev_state) && |
|---|
| 976 | | - (!latest_dev || |
|---|
| 977 | | - device->generation > latest_dev->generation)) { |
|---|
| 978 | | - latest_dev = device; |
|---|
| 1099 | + (!*latest_dev || |
|---|
| 1100 | + device->generation > (*latest_dev)->generation)) { |
|---|
| 1101 | + *latest_dev = device; |
|---|
| 979 | 1102 | } |
|---|
| 980 | 1103 | continue; |
|---|
| 981 | 1104 | } |
|---|
| .. | .. |
|---|
| 1002 | 1125 | btrfs_free_device(device); |
|---|
| 1003 | 1126 | } |
|---|
| 1004 | 1127 | |
|---|
| 1005 | | - if (fs_devices->seed) { |
|---|
| 1006 | | - fs_devices = fs_devices->seed; |
|---|
| 1007 | | - goto again; |
|---|
| 1008 | | - } |
|---|
| 1128 | +} |
|---|
| 1129 | + |
|---|
| 1130 | +/* |
|---|
| 1131 | + * After we have read the system tree and know devids belonging to this |
|---|
| 1132 | + * filesystem, remove the device which does not belong there. |
|---|
| 1133 | + */ |
|---|
| 1134 | +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) |
|---|
| 1135 | +{ |
|---|
| 1136 | + struct btrfs_device *latest_dev = NULL; |
|---|
| 1137 | + struct btrfs_fs_devices *seed_dev; |
|---|
| 1138 | + |
|---|
| 1139 | + mutex_lock(&uuid_mutex); |
|---|
| 1140 | + __btrfs_free_extra_devids(fs_devices, step, &latest_dev); |
|---|
| 1141 | + |
|---|
| 1142 | + list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) |
|---|
| 1143 | + __btrfs_free_extra_devids(seed_dev, step, &latest_dev); |
|---|
| 1009 | 1144 | |
|---|
| 1010 | 1145 | fs_devices->latest_bdev = latest_dev->bdev; |
|---|
| 1011 | 1146 | |
|---|
| 1012 | 1147 | mutex_unlock(&uuid_mutex); |
|---|
| 1013 | | -} |
|---|
| 1014 | | - |
|---|
| 1015 | | -static void free_device_rcu(struct rcu_head *head) |
|---|
| 1016 | | -{ |
|---|
| 1017 | | - struct btrfs_device *device; |
|---|
| 1018 | | - |
|---|
| 1019 | | - device = container_of(head, struct btrfs_device, rcu); |
|---|
| 1020 | | - btrfs_free_device(device); |
|---|
| 1021 | 1148 | } |
|---|
| 1022 | 1149 | |
|---|
| 1023 | 1150 | static void btrfs_close_bdev(struct btrfs_device *device) |
|---|
| .. | .. |
|---|
| 1036 | 1163 | static void btrfs_close_one_device(struct btrfs_device *device) |
|---|
| 1037 | 1164 | { |
|---|
| 1038 | 1165 | struct btrfs_fs_devices *fs_devices = device->fs_devices; |
|---|
| 1039 | | - struct btrfs_device *new_device; |
|---|
| 1040 | | - struct rcu_string *name; |
|---|
| 1041 | | - |
|---|
| 1042 | | - if (device->bdev) |
|---|
| 1043 | | - fs_devices->open_devices--; |
|---|
| 1044 | 1166 | |
|---|
| 1045 | 1167 | if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && |
|---|
| 1046 | 1168 | device->devid != BTRFS_DEV_REPLACE_DEVID) { |
|---|
| .. | .. |
|---|
| 1057 | 1179 | } |
|---|
| 1058 | 1180 | |
|---|
| 1059 | 1181 | btrfs_close_bdev(device); |
|---|
| 1060 | | - |
|---|
| 1061 | | - new_device = btrfs_alloc_device(NULL, &device->devid, |
|---|
| 1062 | | - device->uuid); |
|---|
| 1063 | | - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ |
|---|
| 1064 | | - |
|---|
| 1065 | | - /* Safe because we are under uuid_mutex */ |
|---|
| 1066 | | - if (device->name) { |
|---|
| 1067 | | - name = rcu_string_strdup(device->name->str, GFP_NOFS); |
|---|
| 1068 | | - BUG_ON(!name); /* -ENOMEM */ |
|---|
| 1069 | | - rcu_assign_pointer(new_device->name, name); |
|---|
| 1182 | + if (device->bdev) { |
|---|
| 1183 | + fs_devices->open_devices--; |
|---|
| 1184 | + device->bdev = NULL; |
|---|
| 1070 | 1185 | } |
|---|
| 1186 | + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); |
|---|
| 1071 | 1187 | |
|---|
| 1072 | | - list_replace_rcu(&device->dev_list, &new_device->dev_list); |
|---|
| 1073 | | - new_device->fs_devices = device->fs_devices; |
|---|
| 1188 | + device->fs_info = NULL; |
|---|
| 1189 | + atomic_set(&device->dev_stats_ccnt, 0); |
|---|
| 1190 | + extent_io_tree_release(&device->alloc_state); |
|---|
| 1074 | 1191 | |
|---|
| 1075 | | - call_rcu(&device->rcu, free_device_rcu); |
|---|
| 1192 | + /* |
|---|
| 1193 | + * Reset the flush error record. We might have a transient flush error |
|---|
| 1194 | + * in this mount, and if so we aborted the current transaction and set |
|---|
| 1195 | + * the fs to an error state, guaranteeing no super blocks can be further |
|---|
| 1196 | + * committed. However that error might be transient and if we unmount the |
|---|
| 1197 | + * filesystem and mount it again, we should allow the mount to succeed |
|---|
| 1198 | + * (btrfs_check_rw_degradable() should not fail) - if after mounting the |
|---|
| 1199 | + * filesystem again we still get flush errors, then we will again abort |
|---|
| 1200 | + * any transaction and set the error state, guaranteeing no commits of |
|---|
| 1201 | + * unsafe super blocks. |
|---|
| 1202 | + */ |
|---|
| 1203 | + device->last_flush_error = 0; |
|---|
| 1204 | + |
|---|
| 1205 | + /* Verify the device is back in a pristine state */ |
|---|
| 1206 | + ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); |
|---|
| 1207 | + ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); |
|---|
| 1208 | + ASSERT(list_empty(&device->dev_alloc_list)); |
|---|
| 1209 | + ASSERT(list_empty(&device->post_commit_list)); |
|---|
| 1210 | + ASSERT(atomic_read(&device->reada_in_flight) == 0); |
|---|
| 1076 | 1211 | } |
|---|
| 1077 | 1212 | |
|---|
| 1078 | | -static int close_fs_devices(struct btrfs_fs_devices *fs_devices) |
|---|
| 1213 | +static void close_fs_devices(struct btrfs_fs_devices *fs_devices) |
|---|
| 1079 | 1214 | { |
|---|
| 1080 | 1215 | struct btrfs_device *device, *tmp; |
|---|
| 1081 | 1216 | |
|---|
| 1082 | | - if (--fs_devices->opened > 0) |
|---|
| 1083 | | - return 0; |
|---|
| 1217 | + lockdep_assert_held(&uuid_mutex); |
|---|
| 1084 | 1218 | |
|---|
| 1085 | | - mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 1086 | | - list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { |
|---|
| 1219 | + if (--fs_devices->opened > 0) |
|---|
| 1220 | + return; |
|---|
| 1221 | + |
|---|
| 1222 | + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) |
|---|
| 1087 | 1223 | btrfs_close_one_device(device); |
|---|
| 1088 | | - } |
|---|
| 1089 | | - mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 1090 | 1224 | |
|---|
| 1091 | 1225 | WARN_ON(fs_devices->open_devices); |
|---|
| 1092 | 1226 | WARN_ON(fs_devices->rw_devices); |
|---|
| 1093 | 1227 | fs_devices->opened = 0; |
|---|
| 1094 | | - fs_devices->seeding = 0; |
|---|
| 1095 | | - |
|---|
| 1096 | | - return 0; |
|---|
| 1228 | + fs_devices->seeding = false; |
|---|
| 1229 | + fs_devices->fs_info = NULL; |
|---|
| 1097 | 1230 | } |
|---|
| 1098 | 1231 | |
|---|
| 1099 | | -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) |
|---|
| 1232 | +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) |
|---|
| 1100 | 1233 | { |
|---|
| 1101 | | - struct btrfs_fs_devices *seed_devices = NULL; |
|---|
| 1102 | | - int ret; |
|---|
| 1234 | + LIST_HEAD(list); |
|---|
| 1235 | + struct btrfs_fs_devices *tmp; |
|---|
| 1103 | 1236 | |
|---|
| 1104 | 1237 | mutex_lock(&uuid_mutex); |
|---|
| 1105 | | - ret = close_fs_devices(fs_devices); |
|---|
| 1238 | + close_fs_devices(fs_devices); |
|---|
| 1106 | 1239 | if (!fs_devices->opened) { |
|---|
| 1107 | | - seed_devices = fs_devices->seed; |
|---|
| 1108 | | - fs_devices->seed = NULL; |
|---|
| 1109 | | - } |
|---|
| 1110 | | - mutex_unlock(&uuid_mutex); |
|---|
| 1240 | + list_splice_init(&fs_devices->seed_list, &list); |
|---|
| 1111 | 1241 | |
|---|
| 1112 | | - while (seed_devices) { |
|---|
| 1113 | | - fs_devices = seed_devices; |
|---|
| 1114 | | - seed_devices = fs_devices->seed; |
|---|
| 1242 | + /* |
|---|
| 1243 | + * If the struct btrfs_fs_devices is not assembled with any |
|---|
| 1244 | + * other device, it can be re-initialized during the next mount |
|---|
| 1245 | + * without the needing device-scan step. Therefore, it can be |
|---|
| 1246 | + * fully freed. |
|---|
| 1247 | + */ |
|---|
| 1248 | + if (fs_devices->num_devices == 1) { |
|---|
| 1249 | + list_del(&fs_devices->fs_list); |
|---|
| 1250 | + free_fs_devices(fs_devices); |
|---|
| 1251 | + } |
|---|
| 1252 | + } |
|---|
| 1253 | + |
|---|
| 1254 | + |
|---|
| 1255 | + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { |
|---|
| 1115 | 1256 | close_fs_devices(fs_devices); |
|---|
| 1257 | + list_del(&fs_devices->seed_list); |
|---|
| 1116 | 1258 | free_fs_devices(fs_devices); |
|---|
| 1117 | 1259 | } |
|---|
| 1118 | | - return ret; |
|---|
| 1260 | + mutex_unlock(&uuid_mutex); |
|---|
| 1119 | 1261 | } |
|---|
| 1120 | 1262 | |
|---|
| 1121 | 1263 | static int open_fs_devices(struct btrfs_fs_devices *fs_devices, |
|---|
| .. | .. |
|---|
| 1123 | 1265 | { |
|---|
| 1124 | 1266 | struct btrfs_device *device; |
|---|
| 1125 | 1267 | struct btrfs_device *latest_dev = NULL; |
|---|
| 1126 | | - int ret = 0; |
|---|
| 1268 | + struct btrfs_device *tmp_device; |
|---|
| 1127 | 1269 | |
|---|
| 1128 | 1270 | flags |= FMODE_EXCL; |
|---|
| 1129 | 1271 | |
|---|
| 1130 | | - list_for_each_entry(device, &fs_devices->devices, dev_list) { |
|---|
| 1131 | | - /* Just open everything we can; ignore failures here */ |
|---|
| 1132 | | - if (btrfs_open_one_device(fs_devices, device, flags, holder)) |
|---|
| 1133 | | - continue; |
|---|
| 1272 | + list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, |
|---|
| 1273 | + dev_list) { |
|---|
| 1274 | + int ret; |
|---|
| 1134 | 1275 | |
|---|
| 1135 | | - if (!latest_dev || |
|---|
| 1136 | | - device->generation > latest_dev->generation) |
|---|
| 1276 | + ret = btrfs_open_one_device(fs_devices, device, flags, holder); |
|---|
| 1277 | + if (ret == 0 && |
|---|
| 1278 | + (!latest_dev || device->generation > latest_dev->generation)) { |
|---|
| 1137 | 1279 | latest_dev = device; |
|---|
| 1280 | + } else if (ret == -ENODATA) { |
|---|
| 1281 | + fs_devices->num_devices--; |
|---|
| 1282 | + list_del(&device->dev_list); |
|---|
| 1283 | + btrfs_free_device(device); |
|---|
| 1284 | + } |
|---|
| 1138 | 1285 | } |
|---|
| 1139 | | - if (fs_devices->open_devices == 0) { |
|---|
| 1140 | | - ret = -EINVAL; |
|---|
| 1141 | | - goto out; |
|---|
| 1142 | | - } |
|---|
| 1286 | + if (fs_devices->open_devices == 0) |
|---|
| 1287 | + return -EINVAL; |
|---|
| 1288 | + |
|---|
| 1143 | 1289 | fs_devices->opened = 1; |
|---|
| 1144 | 1290 | fs_devices->latest_bdev = latest_dev->bdev; |
|---|
| 1145 | 1291 | fs_devices->total_rw_bytes = 0; |
|---|
| 1146 | | -out: |
|---|
| 1147 | | - return ret; |
|---|
| 1292 | + fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; |
|---|
| 1293 | + |
|---|
| 1294 | + return 0; |
|---|
| 1148 | 1295 | } |
|---|
| 1149 | 1296 | |
|---|
| 1150 | 1297 | static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) |
|---|
| .. | .. |
|---|
| 1186 | 1333 | return ret; |
|---|
| 1187 | 1334 | } |
|---|
| 1188 | 1335 | |
|---|
| 1189 | | -static void btrfs_release_disk_super(struct page *page) |
|---|
| 1336 | +void btrfs_release_disk_super(struct btrfs_super_block *super) |
|---|
| 1190 | 1337 | { |
|---|
| 1191 | | - kunmap(page); |
|---|
| 1338 | + struct page *page = virt_to_page(super); |
|---|
| 1339 | + |
|---|
| 1192 | 1340 | put_page(page); |
|---|
| 1193 | 1341 | } |
|---|
| 1194 | 1342 | |
|---|
| 1195 | | -static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, |
|---|
| 1196 | | - struct page **page, |
|---|
| 1197 | | - struct btrfs_super_block **disk_super) |
|---|
| 1343 | +static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, |
|---|
| 1344 | + u64 bytenr) |
|---|
| 1198 | 1345 | { |
|---|
| 1346 | + struct btrfs_super_block *disk_super; |
|---|
| 1347 | + struct page *page; |
|---|
| 1199 | 1348 | void *p; |
|---|
| 1200 | 1349 | pgoff_t index; |
|---|
| 1201 | 1350 | |
|---|
| 1202 | 1351 | /* make sure our super fits in the device */ |
|---|
| 1203 | 1352 | if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) |
|---|
| 1204 | | - return 1; |
|---|
| 1353 | + return ERR_PTR(-EINVAL); |
|---|
| 1205 | 1354 | |
|---|
| 1206 | 1355 | /* make sure our super fits in the page */ |
|---|
| 1207 | | - if (sizeof(**disk_super) > PAGE_SIZE) |
|---|
| 1208 | | - return 1; |
|---|
| 1356 | + if (sizeof(*disk_super) > PAGE_SIZE) |
|---|
| 1357 | + return ERR_PTR(-EINVAL); |
|---|
| 1209 | 1358 | |
|---|
| 1210 | 1359 | /* make sure our super doesn't straddle pages on disk */ |
|---|
| 1211 | 1360 | index = bytenr >> PAGE_SHIFT; |
|---|
| 1212 | | - if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) |
|---|
| 1213 | | - return 1; |
|---|
| 1361 | + if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) |
|---|
| 1362 | + return ERR_PTR(-EINVAL); |
|---|
| 1214 | 1363 | |
|---|
| 1215 | 1364 | /* pull in the page with our super */ |
|---|
| 1216 | | - *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, |
|---|
| 1217 | | - index, GFP_KERNEL); |
|---|
| 1365 | + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); |
|---|
| 1218 | 1366 | |
|---|
| 1219 | | - if (IS_ERR_OR_NULL(*page)) |
|---|
| 1220 | | - return 1; |
|---|
| 1367 | + if (IS_ERR(page)) |
|---|
| 1368 | + return ERR_CAST(page); |
|---|
| 1221 | 1369 | |
|---|
| 1222 | | - p = kmap(*page); |
|---|
| 1370 | + p = page_address(page); |
|---|
| 1223 | 1371 | |
|---|
| 1224 | 1372 | /* align our pointer to the offset of the super block */ |
|---|
| 1225 | | - *disk_super = p + (bytenr & ~PAGE_MASK); |
|---|
| 1373 | + disk_super = p + offset_in_page(bytenr); |
|---|
| 1226 | 1374 | |
|---|
| 1227 | | - if (btrfs_super_bytenr(*disk_super) != bytenr || |
|---|
| 1228 | | - btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { |
|---|
| 1229 | | - btrfs_release_disk_super(*page); |
|---|
| 1230 | | - return 1; |
|---|
| 1375 | + if (btrfs_super_bytenr(disk_super) != bytenr || |
|---|
| 1376 | + btrfs_super_magic(disk_super) != BTRFS_MAGIC) { |
|---|
| 1377 | + btrfs_release_disk_super(p); |
|---|
| 1378 | + return ERR_PTR(-EINVAL); |
|---|
| 1231 | 1379 | } |
|---|
| 1232 | 1380 | |
|---|
| 1233 | | - if ((*disk_super)->label[0] && |
|---|
| 1234 | | - (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) |
|---|
| 1235 | | - (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; |
|---|
| 1381 | + if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) |
|---|
| 1382 | + disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; |
|---|
| 1236 | 1383 | |
|---|
| 1237 | | - return 0; |
|---|
| 1384 | + return disk_super; |
|---|
| 1385 | +} |
|---|
| 1386 | + |
|---|
| 1387 | +int btrfs_forget_devices(const char *path) |
|---|
| 1388 | +{ |
|---|
| 1389 | + int ret; |
|---|
| 1390 | + |
|---|
| 1391 | + mutex_lock(&uuid_mutex); |
|---|
| 1392 | + ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); |
|---|
| 1393 | + mutex_unlock(&uuid_mutex); |
|---|
| 1394 | + |
|---|
| 1395 | + return ret; |
|---|
| 1238 | 1396 | } |
|---|
| 1239 | 1397 | |
|---|
| 1240 | 1398 | /* |
|---|
| .. | .. |
|---|
| 1249 | 1407 | bool new_device_added = false; |
|---|
| 1250 | 1408 | struct btrfs_device *device = NULL; |
|---|
| 1251 | 1409 | struct block_device *bdev; |
|---|
| 1252 | | - struct page *page; |
|---|
| 1253 | 1410 | u64 bytenr; |
|---|
| 1254 | 1411 | |
|---|
| 1255 | 1412 | lockdep_assert_held(&uuid_mutex); |
|---|
| .. | .. |
|---|
| 1261 | 1418 | * later supers, using BTRFS_SUPER_MIRROR_MAX instead |
|---|
| 1262 | 1419 | */ |
|---|
| 1263 | 1420 | bytenr = btrfs_sb_offset(0); |
|---|
| 1264 | | - flags |= FMODE_EXCL; |
|---|
| 1265 | 1421 | |
|---|
| 1422 | + /* |
|---|
| 1423 | + * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may |
|---|
| 1424 | + * initiate the device scan which may race with the user's mount |
|---|
| 1425 | + * or mkfs command, resulting in failure. |
|---|
| 1426 | + * Since the device scan is solely for reading purposes, there is |
|---|
| 1427 | + * no need for FMODE_EXCL. Additionally, the devices are read again |
|---|
| 1428 | + * during the mount process. It is ok to get some inconsistent |
|---|
| 1429 | + * values temporarily, as the device paths of the fsid are the only |
|---|
| 1430 | + * required information for assembling the volume. |
|---|
| 1431 | + */ |
|---|
| 1266 | 1432 | bdev = blkdev_get_by_path(path, flags, holder); |
|---|
| 1267 | 1433 | if (IS_ERR(bdev)) |
|---|
| 1268 | 1434 | return ERR_CAST(bdev); |
|---|
| 1269 | 1435 | |
|---|
| 1270 | | - if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { |
|---|
| 1271 | | - device = ERR_PTR(-EINVAL); |
|---|
| 1436 | + disk_super = btrfs_read_disk_super(bdev, bytenr); |
|---|
| 1437 | + if (IS_ERR(disk_super)) { |
|---|
| 1438 | + device = ERR_CAST(disk_super); |
|---|
| 1272 | 1439 | goto error_bdev_put; |
|---|
| 1273 | 1440 | } |
|---|
| 1274 | 1441 | |
|---|
| .. | .. |
|---|
| 1278 | 1445 | btrfs_free_stale_devices(path, device); |
|---|
| 1279 | 1446 | } |
|---|
| 1280 | 1447 | |
|---|
| 1281 | | - btrfs_release_disk_super(page); |
|---|
| 1448 | + btrfs_release_disk_super(disk_super); |
|---|
| 1282 | 1449 | |
|---|
| 1283 | 1450 | error_bdev_put: |
|---|
| 1284 | 1451 | blkdev_put(bdev, flags); |
|---|
| .. | .. |
|---|
| 1286 | 1453 | return device; |
|---|
| 1287 | 1454 | } |
|---|
| 1288 | 1455 | |
|---|
| 1289 | | -static int contains_pending_extent(struct btrfs_transaction *transaction, |
|---|
| 1290 | | - struct btrfs_device *device, |
|---|
| 1291 | | - u64 *start, u64 len) |
|---|
| 1456 | +/* |
|---|
| 1457 | + * Try to find a chunk that intersects [start, start + len] range and when one |
|---|
| 1458 | + * such is found, record the end of it in *start |
|---|
| 1459 | + */ |
|---|
| 1460 | +static bool contains_pending_extent(struct btrfs_device *device, u64 *start, |
|---|
| 1461 | + u64 len) |
|---|
| 1292 | 1462 | { |
|---|
| 1293 | | - struct btrfs_fs_info *fs_info = device->fs_info; |
|---|
| 1294 | | - struct extent_map *em; |
|---|
| 1295 | | - struct list_head *search_list = &fs_info->pinned_chunks; |
|---|
| 1296 | | - int ret = 0; |
|---|
| 1297 | | - u64 physical_start = *start; |
|---|
| 1463 | + u64 physical_start, physical_end; |
|---|
| 1298 | 1464 | |
|---|
| 1299 | | - if (transaction) |
|---|
| 1300 | | - search_list = &transaction->pending_chunks; |
|---|
| 1301 | | -again: |
|---|
| 1302 | | - list_for_each_entry(em, search_list, list) { |
|---|
| 1303 | | - struct map_lookup *map; |
|---|
| 1304 | | - int i; |
|---|
| 1465 | + lockdep_assert_held(&device->fs_info->chunk_mutex); |
|---|
| 1305 | 1466 | |
|---|
| 1306 | | - map = em->map_lookup; |
|---|
| 1307 | | - for (i = 0; i < map->num_stripes; i++) { |
|---|
| 1308 | | - u64 end; |
|---|
| 1467 | + if (!find_first_extent_bit(&device->alloc_state, *start, |
|---|
| 1468 | + &physical_start, &physical_end, |
|---|
| 1469 | + CHUNK_ALLOCATED, NULL)) { |
|---|
| 1309 | 1470 | |
|---|
| 1310 | | - if (map->stripes[i].dev != device) |
|---|
| 1311 | | - continue; |
|---|
| 1312 | | - if (map->stripes[i].physical >= physical_start + len || |
|---|
| 1313 | | - map->stripes[i].physical + em->orig_block_len <= |
|---|
| 1314 | | - physical_start) |
|---|
| 1315 | | - continue; |
|---|
| 1316 | | - /* |
|---|
| 1317 | | - * Make sure that while processing the pinned list we do |
|---|
| 1318 | | - * not override our *start with a lower value, because |
|---|
| 1319 | | - * we can have pinned chunks that fall within this |
|---|
| 1320 | | - * device hole and that have lower physical addresses |
|---|
| 1321 | | - * than the pending chunks we processed before. If we |
|---|
| 1322 | | - * do not take this special care we can end up getting |
|---|
| 1323 | | - * 2 pending chunks that start at the same physical |
|---|
| 1324 | | - * device offsets because the end offset of a pinned |
|---|
| 1325 | | - * chunk can be equal to the start offset of some |
|---|
| 1326 | | - * pending chunk. |
|---|
| 1327 | | - */ |
|---|
| 1328 | | - end = map->stripes[i].physical + em->orig_block_len; |
|---|
| 1329 | | - if (end > *start) { |
|---|
| 1330 | | - *start = end; |
|---|
| 1331 | | - ret = 1; |
|---|
| 1332 | | - } |
|---|
| 1471 | + if (in_range(physical_start, *start, len) || |
|---|
| 1472 | + in_range(*start, physical_start, |
|---|
| 1473 | + physical_end - physical_start)) { |
|---|
| 1474 | + *start = physical_end + 1; |
|---|
| 1475 | + return true; |
|---|
| 1333 | 1476 | } |
|---|
| 1334 | 1477 | } |
|---|
| 1335 | | - if (search_list != &fs_info->pinned_chunks) { |
|---|
| 1336 | | - search_list = &fs_info->pinned_chunks; |
|---|
| 1337 | | - goto again; |
|---|
| 1338 | | - } |
|---|
| 1339 | | - |
|---|
| 1340 | | - return ret; |
|---|
| 1478 | + return false; |
|---|
| 1341 | 1479 | } |
|---|
| 1342 | 1480 | |
|---|
| 1481 | +static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) |
|---|
| 1482 | +{ |
|---|
| 1483 | + switch (device->fs_devices->chunk_alloc_policy) { |
|---|
| 1484 | + case BTRFS_CHUNK_ALLOC_REGULAR: |
|---|
| 1485 | + /* |
|---|
| 1486 | + * We don't want to overwrite the superblock on the drive nor |
|---|
| 1487 | + * any area used by the boot loader (grub for example), so we |
|---|
| 1488 | + * make sure to start at an offset of at least 1MB. |
|---|
| 1489 | + */ |
|---|
| 1490 | + return max_t(u64, start, SZ_1M); |
|---|
| 1491 | + default: |
|---|
| 1492 | + BUG(); |
|---|
| 1493 | + } |
|---|
| 1494 | +} |
|---|
| 1495 | + |
|---|
| 1496 | +/** |
|---|
| 1497 | + * dev_extent_hole_check - check if specified hole is suitable for allocation |
|---|
| 1498 | + * @device: the device which we have the hole |
|---|
| 1499 | + * @hole_start: starting position of the hole |
|---|
| 1500 | + * @hole_size: the size of the hole |
|---|
| 1501 | + * @num_bytes: the size of the free space that we need |
|---|
| 1502 | + * |
|---|
| 1503 | + * This function may modify @hole_start and @hole_size to reflect the suitable |
|---|
| 1504 | + * position for allocation. Returns true if hole position is updated, false otherwise. |
|---|
| 1505 | + */ |
|---|
| 1506 | +static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, |
|---|
| 1507 | + u64 *hole_size, u64 num_bytes) |
|---|
| 1508 | +{ |
|---|
| 1509 | + bool changed = false; |
|---|
| 1510 | + u64 hole_end = *hole_start + *hole_size; |
|---|
| 1511 | + |
|---|
| 1512 | + /* |
|---|
| 1513 | + * Check before we set max_hole_start, otherwise we could end up |
|---|
| 1514 | + * sending back this offset anyway. |
|---|
| 1515 | + */ |
|---|
| 1516 | + if (contains_pending_extent(device, hole_start, *hole_size)) { |
|---|
| 1517 | + if (hole_end >= *hole_start) |
|---|
| 1518 | + *hole_size = hole_end - *hole_start; |
|---|
| 1519 | + else |
|---|
| 1520 | + *hole_size = 0; |
|---|
| 1521 | + changed = true; |
|---|
| 1522 | + } |
|---|
| 1523 | + |
|---|
| 1524 | + switch (device->fs_devices->chunk_alloc_policy) { |
|---|
| 1525 | + case BTRFS_CHUNK_ALLOC_REGULAR: |
|---|
| 1526 | + /* No extra check */ |
|---|
| 1527 | + break; |
|---|
| 1528 | + default: |
|---|
| 1529 | + BUG(); |
|---|
| 1530 | + } |
|---|
| 1531 | + |
|---|
| 1532 | + return changed; |
|---|
| 1533 | +} |
|---|
| 1343 | 1534 | |
|---|
| 1344 | 1535 | /* |
|---|
| 1345 | 1536 | * find_free_dev_extent_start - find free space in the specified device |
|---|
| .. | .. |
|---|
| 1361 | 1552 | * @len is used to store the size of the free space that we find. |
|---|
| 1362 | 1553 | * But if we don't find suitable free space, it is used to store the size of |
|---|
| 1363 | 1554 | * the max free space. |
|---|
| 1555 | + * |
|---|
| 1556 | + * NOTE: This function will search *commit* root of device tree, and does extra |
|---|
| 1557 | + * check to ensure dev extents are not double allocated. |
|---|
| 1558 | + * This makes the function safe to allocate dev extents but may not report |
|---|
| 1559 | + * correct usable device space, as device extent freed in current transaction |
|---|
| 1560 | + * is not reported as available. |
|---|
| 1364 | 1561 | */ |
|---|
| 1365 | | -int find_free_dev_extent_start(struct btrfs_transaction *transaction, |
|---|
| 1366 | | - struct btrfs_device *device, u64 num_bytes, |
|---|
| 1367 | | - u64 search_start, u64 *start, u64 *len) |
|---|
| 1562 | +static int find_free_dev_extent_start(struct btrfs_device *device, |
|---|
| 1563 | + u64 num_bytes, u64 search_start, u64 *start, |
|---|
| 1564 | + u64 *len) |
|---|
| 1368 | 1565 | { |
|---|
| 1369 | 1566 | struct btrfs_fs_info *fs_info = device->fs_info; |
|---|
| 1370 | 1567 | struct btrfs_root *root = fs_info->dev_root; |
|---|
| .. | .. |
|---|
| 1380 | 1577 | int slot; |
|---|
| 1381 | 1578 | struct extent_buffer *l; |
|---|
| 1382 | 1579 | |
|---|
| 1383 | | - /* |
|---|
| 1384 | | - * We don't want to overwrite the superblock on the drive nor any area |
|---|
| 1385 | | - * used by the boot loader (grub for example), so we make sure to start |
|---|
| 1386 | | - * at an offset of at least 1MB. |
|---|
| 1387 | | - */ |
|---|
| 1388 | | - search_start = max_t(u64, search_start, SZ_1M); |
|---|
| 1580 | + search_start = dev_extent_search_start(device, search_start); |
|---|
| 1389 | 1581 | |
|---|
| 1390 | 1582 | path = btrfs_alloc_path(); |
|---|
| 1391 | 1583 | if (!path) |
|---|
| .. | .. |
|---|
| 1418 | 1610 | goto out; |
|---|
| 1419 | 1611 | } |
|---|
| 1420 | 1612 | |
|---|
| 1421 | | - while (1) { |
|---|
| 1613 | + while (search_start < search_end) { |
|---|
| 1422 | 1614 | l = path->nodes[0]; |
|---|
| 1423 | 1615 | slot = path->slots[0]; |
|---|
| 1424 | 1616 | if (slot >= btrfs_header_nritems(l)) { |
|---|
| .. | .. |
|---|
| 1441 | 1633 | if (key.type != BTRFS_DEV_EXTENT_KEY) |
|---|
| 1442 | 1634 | goto next; |
|---|
| 1443 | 1635 | |
|---|
| 1636 | + if (key.offset > search_end) |
|---|
| 1637 | + break; |
|---|
| 1638 | + |
|---|
| 1444 | 1639 | if (key.offset > search_start) { |
|---|
| 1445 | 1640 | hole_size = key.offset - search_start; |
|---|
| 1446 | | - |
|---|
| 1447 | | - /* |
|---|
| 1448 | | - * Have to check before we set max_hole_start, otherwise |
|---|
| 1449 | | - * we could end up sending back this offset anyway. |
|---|
| 1450 | | - */ |
|---|
| 1451 | | - if (contains_pending_extent(transaction, device, |
|---|
| 1452 | | - &search_start, |
|---|
| 1453 | | - hole_size)) { |
|---|
| 1454 | | - if (key.offset >= search_start) { |
|---|
| 1455 | | - hole_size = key.offset - search_start; |
|---|
| 1456 | | - } else { |
|---|
| 1457 | | - WARN_ON_ONCE(1); |
|---|
| 1458 | | - hole_size = 0; |
|---|
| 1459 | | - } |
|---|
| 1460 | | - } |
|---|
| 1641 | + dev_extent_hole_check(device, &search_start, &hole_size, |
|---|
| 1642 | + num_bytes); |
|---|
| 1461 | 1643 | |
|---|
| 1462 | 1644 | if (hole_size > max_hole_size) { |
|---|
| 1463 | 1645 | max_hole_start = search_start; |
|---|
| .. | .. |
|---|
| 1496 | 1678 | */ |
|---|
| 1497 | 1679 | if (search_end > search_start) { |
|---|
| 1498 | 1680 | hole_size = search_end - search_start; |
|---|
| 1499 | | - |
|---|
| 1500 | | - if (contains_pending_extent(transaction, device, &search_start, |
|---|
| 1501 | | - hole_size)) { |
|---|
| 1681 | + if (dev_extent_hole_check(device, &search_start, &hole_size, |
|---|
| 1682 | + num_bytes)) { |
|---|
| 1502 | 1683 | btrfs_release_path(path); |
|---|
| 1503 | 1684 | goto again; |
|---|
| 1504 | 1685 | } |
|---|
| .. | .. |
|---|
| 1515 | 1696 | else |
|---|
| 1516 | 1697 | ret = 0; |
|---|
| 1517 | 1698 | |
|---|
| 1699 | + ASSERT(max_hole_start + max_hole_size <= search_end); |
|---|
| 1518 | 1700 | out: |
|---|
| 1519 | 1701 | btrfs_free_path(path); |
|---|
| 1520 | 1702 | *start = max_hole_start; |
|---|
| .. | .. |
|---|
| 1523 | 1705 | return ret; |
|---|
| 1524 | 1706 | } |
|---|
| 1525 | 1707 | |
|---|
| 1526 | | -int find_free_dev_extent(struct btrfs_trans_handle *trans, |
|---|
| 1527 | | - struct btrfs_device *device, u64 num_bytes, |
|---|
| 1708 | +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, |
|---|
| 1528 | 1709 | u64 *start, u64 *len) |
|---|
| 1529 | 1710 | { |
|---|
| 1530 | 1711 | /* FIXME use last free of some kind */ |
|---|
| 1531 | | - return find_free_dev_extent_start(trans->transaction, device, |
|---|
| 1532 | | - num_bytes, 0, start, len); |
|---|
| 1712 | + return find_free_dev_extent_start(device, num_bytes, 0, start, len); |
|---|
| 1533 | 1713 | } |
|---|
| 1534 | 1714 | |
|---|
| 1535 | 1715 | static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, |
|---|
| .. | .. |
|---|
| 1640 | 1820 | struct rb_node *n; |
|---|
| 1641 | 1821 | u64 ret = 0; |
|---|
| 1642 | 1822 | |
|---|
| 1643 | | - em_tree = &fs_info->mapping_tree.map_tree; |
|---|
| 1823 | + em_tree = &fs_info->mapping_tree; |
|---|
| 1644 | 1824 | read_lock(&em_tree->lock); |
|---|
| 1645 | | - n = rb_last(&em_tree->map); |
|---|
| 1825 | + n = rb_last(&em_tree->map.rb_root); |
|---|
| 1646 | 1826 | if (n) { |
|---|
| 1647 | 1827 | em = rb_entry(n, struct extent_map, rb_node); |
|---|
| 1648 | 1828 | ret = em->start + em->len; |
|---|
| .. | .. |
|---|
| 1672 | 1852 | if (ret < 0) |
|---|
| 1673 | 1853 | goto error; |
|---|
| 1674 | 1854 | |
|---|
| 1675 | | - BUG_ON(ret == 0); /* Corruption */ |
|---|
| 1855 | + if (ret == 0) { |
|---|
| 1856 | + /* Corruption */ |
|---|
| 1857 | + btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); |
|---|
| 1858 | + ret = -EUCLEAN; |
|---|
| 1859 | + goto error; |
|---|
| 1860 | + } |
|---|
| 1676 | 1861 | |
|---|
| 1677 | 1862 | ret = btrfs_previous_item(fs_info->chunk_root, path, |
|---|
| 1678 | 1863 | BTRFS_DEV_ITEMS_OBJECTID, |
|---|
| .. | .. |
|---|
| 1738 | 1923 | ptr = btrfs_device_uuid(dev_item); |
|---|
| 1739 | 1924 | write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); |
|---|
| 1740 | 1925 | ptr = btrfs_device_fsid(dev_item); |
|---|
| 1741 | | - write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE); |
|---|
| 1926 | + write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, |
|---|
| 1927 | + ptr, BTRFS_FSID_SIZE); |
|---|
| 1742 | 1928 | btrfs_mark_buffer_dirty(leaf); |
|---|
| 1743 | 1929 | |
|---|
| 1744 | 1930 | ret = 0; |
|---|
| .. | .. |
|---|
| 1750 | 1936 | /* |
|---|
| 1751 | 1937 | * Function to update ctime/mtime for a given device path. |
|---|
| 1752 | 1938 | * Mainly used for ctime/mtime based probe like libblkid. |
|---|
| 1939 | + * |
|---|
| 1940 | + * We don't care about errors here, this is just to be kind to userspace. |
|---|
| 1753 | 1941 | */ |
|---|
| 1754 | | -static void update_dev_time(const char *path_name) |
|---|
| 1942 | +static void update_dev_time(const char *device_path) |
|---|
| 1755 | 1943 | { |
|---|
| 1756 | | - struct file *filp; |
|---|
| 1944 | + struct path path; |
|---|
| 1945 | + struct timespec64 now; |
|---|
| 1946 | + int ret; |
|---|
| 1757 | 1947 | |
|---|
| 1758 | | - filp = filp_open(path_name, O_RDWR, 0); |
|---|
| 1759 | | - if (IS_ERR(filp)) |
|---|
| 1948 | + ret = kern_path(device_path, LOOKUP_FOLLOW, &path); |
|---|
| 1949 | + if (ret) |
|---|
| 1760 | 1950 | return; |
|---|
| 1761 | | - file_update_time(filp); |
|---|
| 1762 | | - filp_close(filp, NULL); |
|---|
| 1951 | + |
|---|
| 1952 | + now = current_time(d_inode(path.dentry)); |
|---|
| 1953 | + inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); |
|---|
| 1954 | + path_put(&path); |
|---|
| 1763 | 1955 | } |
|---|
| 1764 | 1956 | |
|---|
| 1765 | | -static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, |
|---|
| 1766 | | - struct btrfs_device *device) |
|---|
| 1957 | +static int btrfs_rm_dev_item(struct btrfs_device *device) |
|---|
| 1767 | 1958 | { |
|---|
| 1768 | | - struct btrfs_root *root = fs_info->chunk_root; |
|---|
| 1959 | + struct btrfs_root *root = device->fs_info->chunk_root; |
|---|
| 1769 | 1960 | int ret; |
|---|
| 1770 | 1961 | struct btrfs_path *path; |
|---|
| 1771 | 1962 | struct btrfs_key key; |
|---|
| .. | .. |
|---|
| 1862 | 2053 | * where this function is called, there should always be another device (or |
|---|
| 1863 | 2054 | * this_dev) which is active. |
|---|
| 1864 | 2055 | */ |
|---|
| 1865 | | -void btrfs_assign_next_active_device(struct btrfs_device *device, |
|---|
| 1866 | | - struct btrfs_device *this_dev) |
|---|
| 2056 | +void __cold btrfs_assign_next_active_device(struct btrfs_device *device, |
|---|
| 2057 | + struct btrfs_device *next_device) |
|---|
| 1867 | 2058 | { |
|---|
| 1868 | 2059 | struct btrfs_fs_info *fs_info = device->fs_info; |
|---|
| 1869 | | - struct btrfs_device *next_device; |
|---|
| 1870 | 2060 | |
|---|
| 1871 | | - if (this_dev) |
|---|
| 1872 | | - next_device = this_dev; |
|---|
| 1873 | | - else |
|---|
| 2061 | + if (!next_device) |
|---|
| 1874 | 2062 | next_device = btrfs_find_next_active_device(fs_info->fs_devices, |
|---|
| 1875 | | - device); |
|---|
| 2063 | + device); |
|---|
| 1876 | 2064 | ASSERT(next_device); |
|---|
| 1877 | 2065 | |
|---|
| 1878 | 2066 | if (fs_info->sb->s_bdev && |
|---|
| .. | .. |
|---|
| 1883 | 2071 | fs_info->fs_devices->latest_bdev = next_device->bdev; |
|---|
| 1884 | 2072 | } |
|---|
| 1885 | 2073 | |
|---|
| 2074 | +/* |
|---|
| 2075 | + * Return btrfs_fs_devices::num_devices excluding the device that's being |
|---|
| 2076 | + * currently replaced. |
|---|
| 2077 | + */ |
|---|
| 2078 | +static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) |
|---|
| 2079 | +{ |
|---|
| 2080 | + u64 num_devices = fs_info->fs_devices->num_devices; |
|---|
| 2081 | + |
|---|
| 2082 | + down_read(&fs_info->dev_replace.rwsem); |
|---|
| 2083 | + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { |
|---|
| 2084 | + ASSERT(num_devices > 1); |
|---|
| 2085 | + num_devices--; |
|---|
| 2086 | + } |
|---|
| 2087 | + up_read(&fs_info->dev_replace.rwsem); |
|---|
| 2088 | + |
|---|
| 2089 | + return num_devices; |
|---|
| 2090 | +} |
|---|
| 2091 | + |
|---|
| 2092 | +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, |
|---|
| 2093 | + struct block_device *bdev, |
|---|
| 2094 | + const char *device_path) |
|---|
| 2095 | +{ |
|---|
| 2096 | + struct btrfs_super_block *disk_super; |
|---|
| 2097 | + int copy_num; |
|---|
| 2098 | + |
|---|
| 2099 | + if (!bdev) |
|---|
| 2100 | + return; |
|---|
| 2101 | + |
|---|
| 2102 | + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { |
|---|
| 2103 | + struct page *page; |
|---|
| 2104 | + int ret; |
|---|
| 2105 | + |
|---|
| 2106 | + disk_super = btrfs_read_dev_one_super(bdev, copy_num); |
|---|
| 2107 | + if (IS_ERR(disk_super)) |
|---|
| 2108 | + continue; |
|---|
| 2109 | + |
|---|
| 2110 | + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); |
|---|
| 2111 | + |
|---|
| 2112 | + page = virt_to_page(disk_super); |
|---|
| 2113 | + set_page_dirty(page); |
|---|
| 2114 | + lock_page(page); |
|---|
| 2115 | + /* write_one_page() unlocks the page */ |
|---|
| 2116 | + ret = write_one_page(page); |
|---|
| 2117 | + if (ret) |
|---|
| 2118 | + btrfs_warn(fs_info, |
|---|
| 2119 | + "error clearing superblock number %d (%d)", |
|---|
| 2120 | + copy_num, ret); |
|---|
| 2121 | + btrfs_release_disk_super(disk_super); |
|---|
| 2122 | + |
|---|
| 2123 | + } |
|---|
| 2124 | + |
|---|
| 2125 | + /* Notify udev that device has changed */ |
|---|
| 2126 | + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); |
|---|
| 2127 | + |
|---|
| 2128 | + /* Update ctime/mtime for device path for libblkid */ |
|---|
| 2129 | + update_dev_time(device_path); |
|---|
| 2130 | +} |
|---|
| 2131 | + |
|---|
| 1886 | 2132 | int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, |
|---|
| 1887 | | - u64 devid) |
|---|
| 2133 | + u64 devid) |
|---|
| 1888 | 2134 | { |
|---|
| 1889 | 2135 | struct btrfs_device *device; |
|---|
| 1890 | 2136 | struct btrfs_fs_devices *cur_devices; |
|---|
| .. | .. |
|---|
| 1892 | 2138 | u64 num_devices; |
|---|
| 1893 | 2139 | int ret = 0; |
|---|
| 1894 | 2140 | |
|---|
| 1895 | | - mutex_lock(&uuid_mutex); |
|---|
| 1896 | | - |
|---|
| 1897 | | - num_devices = fs_devices->num_devices; |
|---|
| 1898 | | - btrfs_dev_replace_read_lock(&fs_info->dev_replace); |
|---|
| 1899 | | - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { |
|---|
| 1900 | | - WARN_ON(num_devices < 1); |
|---|
| 1901 | | - num_devices--; |
|---|
| 1902 | | - } |
|---|
| 1903 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
|---|
| 2141 | + /* |
|---|
| 2142 | + * The device list in fs_devices is accessed without locks (neither |
|---|
| 2143 | + * uuid_mutex nor device_list_mutex) as it won't change on a mounted |
|---|
| 2144 | + * filesystem and another device rm cannot run. |
|---|
| 2145 | + */ |
|---|
| 2146 | + num_devices = btrfs_num_devices(fs_info); |
|---|
| 1904 | 2147 | |
|---|
| 1905 | 2148 | ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); |
|---|
| 1906 | 2149 | if (ret) |
|---|
| 1907 | 2150 | goto out; |
|---|
| 1908 | 2151 | |
|---|
| 1909 | | - ret = btrfs_find_device_by_devspec(fs_info, devid, device_path, |
|---|
| 1910 | | - &device); |
|---|
| 1911 | | - if (ret) |
|---|
| 2152 | + device = btrfs_find_device_by_devspec(fs_info, devid, device_path); |
|---|
| 2153 | + |
|---|
| 2154 | + if (IS_ERR(device)) { |
|---|
| 2155 | + if (PTR_ERR(device) == -ENOENT && |
|---|
| 2156 | + device_path && strcmp(device_path, "missing") == 0) |
|---|
| 2157 | + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; |
|---|
| 2158 | + else |
|---|
| 2159 | + ret = PTR_ERR(device); |
|---|
| 1912 | 2160 | goto out; |
|---|
| 2161 | + } |
|---|
| 2162 | + |
|---|
| 2163 | + if (btrfs_pinned_by_swapfile(fs_info, device)) { |
|---|
| 2164 | + btrfs_warn_in_rcu(fs_info, |
|---|
| 2165 | + "cannot remove device %s (devid %llu) due to active swapfile", |
|---|
| 2166 | + rcu_str_deref(device->name), device->devid); |
|---|
| 2167 | + ret = -ETXTBSY; |
|---|
| 2168 | + goto out; |
|---|
| 2169 | + } |
|---|
| 1913 | 2170 | |
|---|
| 1914 | 2171 | if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { |
|---|
| 1915 | 2172 | ret = BTRFS_ERROR_DEV_TGT_REPLACE; |
|---|
| .. | .. |
|---|
| 1929 | 2186 | mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 1930 | 2187 | } |
|---|
| 1931 | 2188 | |
|---|
| 1932 | | - mutex_unlock(&uuid_mutex); |
|---|
| 1933 | 2189 | ret = btrfs_shrink_device(device, 0); |
|---|
| 1934 | | - mutex_lock(&uuid_mutex); |
|---|
| 2190 | + if (!ret) |
|---|
| 2191 | + btrfs_reada_remove_dev(device); |
|---|
| 1935 | 2192 | if (ret) |
|---|
| 1936 | 2193 | goto error_undo; |
|---|
| 1937 | 2194 | |
|---|
| .. | .. |
|---|
| 1940 | 2197 | * counter although write_all_supers() is not locked out. This |
|---|
| 1941 | 2198 | * could give a filesystem state which requires a degraded mount. |
|---|
| 1942 | 2199 | */ |
|---|
| 1943 | | - ret = btrfs_rm_dev_item(fs_info, device); |
|---|
| 2200 | + ret = btrfs_rm_dev_item(device); |
|---|
| 1944 | 2201 | if (ret) |
|---|
| 1945 | 2202 | goto error_undo; |
|---|
| 1946 | 2203 | |
|---|
| 1947 | 2204 | clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); |
|---|
| 1948 | | - btrfs_scrub_cancel_dev(fs_info, device); |
|---|
| 2205 | + btrfs_scrub_cancel_dev(device); |
|---|
| 1949 | 2206 | |
|---|
| 1950 | 2207 | /* |
|---|
| 1951 | 2208 | * the device list mutex makes sure that we don't change |
|---|
| .. | .. |
|---|
| 1980 | 2237 | if (device->bdev) { |
|---|
| 1981 | 2238 | cur_devices->open_devices--; |
|---|
| 1982 | 2239 | /* remove sysfs entry */ |
|---|
| 1983 | | - btrfs_sysfs_rm_device_link(fs_devices, device); |
|---|
| 2240 | + btrfs_sysfs_remove_device(device); |
|---|
| 1984 | 2241 | } |
|---|
| 1985 | 2242 | |
|---|
| 1986 | 2243 | num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; |
|---|
| .. | .. |
|---|
| 1993 | 2250 | * supers and free the device. |
|---|
| 1994 | 2251 | */ |
|---|
| 1995 | 2252 | if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) |
|---|
| 1996 | | - btrfs_scratch_superblocks(device->bdev, device->name->str); |
|---|
| 2253 | + btrfs_scratch_superblocks(fs_info, device->bdev, |
|---|
| 2254 | + device->name->str); |
|---|
| 1997 | 2255 | |
|---|
| 1998 | 2256 | btrfs_close_bdev(device); |
|---|
| 1999 | | - call_rcu(&device->rcu, free_device_rcu); |
|---|
| 2257 | + synchronize_rcu(); |
|---|
| 2258 | + btrfs_free_device(device); |
|---|
| 2000 | 2259 | |
|---|
| 2001 | 2260 | if (cur_devices->open_devices == 0) { |
|---|
| 2002 | | - while (fs_devices) { |
|---|
| 2003 | | - if (fs_devices->seed == cur_devices) { |
|---|
| 2004 | | - fs_devices->seed = cur_devices->seed; |
|---|
| 2005 | | - break; |
|---|
| 2006 | | - } |
|---|
| 2007 | | - fs_devices = fs_devices->seed; |
|---|
| 2008 | | - } |
|---|
| 2009 | | - cur_devices->seed = NULL; |
|---|
| 2261 | + list_del_init(&cur_devices->seed_list); |
|---|
| 2010 | 2262 | close_fs_devices(cur_devices); |
|---|
| 2011 | 2263 | free_fs_devices(cur_devices); |
|---|
| 2012 | 2264 | } |
|---|
| 2013 | 2265 | |
|---|
| 2014 | 2266 | out: |
|---|
| 2015 | | - mutex_unlock(&uuid_mutex); |
|---|
| 2016 | 2267 | return ret; |
|---|
| 2017 | 2268 | |
|---|
| 2018 | 2269 | error_undo: |
|---|
| 2270 | + btrfs_reada_undo_remove_dev(device); |
|---|
| 2019 | 2271 | if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { |
|---|
| 2020 | 2272 | mutex_lock(&fs_info->chunk_mutex); |
|---|
| 2021 | 2273 | list_add(&device->dev_alloc_list, |
|---|
| .. | .. |
|---|
| 2053 | 2305 | fs_devices->open_devices--; |
|---|
| 2054 | 2306 | } |
|---|
| 2055 | 2307 | |
|---|
| 2056 | | -void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, |
|---|
| 2057 | | - struct btrfs_device *srcdev) |
|---|
| 2308 | +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) |
|---|
| 2058 | 2309 | { |
|---|
| 2059 | 2310 | struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; |
|---|
| 2060 | 2311 | |
|---|
| 2061 | | - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { |
|---|
| 2062 | | - /* zero out the old super if it is writable */ |
|---|
| 2063 | | - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); |
|---|
| 2064 | | - } |
|---|
| 2312 | + mutex_lock(&uuid_mutex); |
|---|
| 2065 | 2313 | |
|---|
| 2066 | 2314 | btrfs_close_bdev(srcdev); |
|---|
| 2067 | | - call_rcu(&srcdev->rcu, free_device_rcu); |
|---|
| 2315 | + synchronize_rcu(); |
|---|
| 2316 | + btrfs_free_device(srcdev); |
|---|
| 2068 | 2317 | |
|---|
| 2069 | 2318 | /* if this is no devs we rather delete the fs_devices */ |
|---|
| 2070 | 2319 | if (!fs_devices->num_devices) { |
|---|
| 2071 | | - struct btrfs_fs_devices *tmp_fs_devices; |
|---|
| 2072 | | - |
|---|
| 2073 | 2320 | /* |
|---|
| 2074 | 2321 | * On a mounted FS, num_devices can't be zero unless it's a |
|---|
| 2075 | 2322 | * seed. In case of a seed device being replaced, the replace |
|---|
| .. | .. |
|---|
| 2078 | 2325 | */ |
|---|
| 2079 | 2326 | ASSERT(fs_devices->seeding); |
|---|
| 2080 | 2327 | |
|---|
| 2081 | | - tmp_fs_devices = fs_info->fs_devices; |
|---|
| 2082 | | - while (tmp_fs_devices) { |
|---|
| 2083 | | - if (tmp_fs_devices->seed == fs_devices) { |
|---|
| 2084 | | - tmp_fs_devices->seed = fs_devices->seed; |
|---|
| 2085 | | - break; |
|---|
| 2086 | | - } |
|---|
| 2087 | | - tmp_fs_devices = tmp_fs_devices->seed; |
|---|
| 2088 | | - } |
|---|
| 2089 | | - fs_devices->seed = NULL; |
|---|
| 2328 | + list_del_init(&fs_devices->seed_list); |
|---|
| 2090 | 2329 | close_fs_devices(fs_devices); |
|---|
| 2091 | 2330 | free_fs_devices(fs_devices); |
|---|
| 2092 | 2331 | } |
|---|
| 2332 | + mutex_unlock(&uuid_mutex); |
|---|
| 2093 | 2333 | } |
|---|
| 2094 | 2334 | |
|---|
| 2095 | 2335 | void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) |
|---|
| 2096 | 2336 | { |
|---|
| 2097 | 2337 | struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; |
|---|
| 2098 | 2338 | |
|---|
| 2099 | | - WARN_ON(!tgtdev); |
|---|
| 2100 | 2339 | mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 2101 | 2340 | |
|---|
| 2102 | | - btrfs_sysfs_rm_device_link(fs_devices, tgtdev); |
|---|
| 2341 | + btrfs_sysfs_remove_device(tgtdev); |
|---|
| 2103 | 2342 | |
|---|
| 2104 | 2343 | if (tgtdev->bdev) |
|---|
| 2105 | 2344 | fs_devices->open_devices--; |
|---|
| .. | .. |
|---|
| 2119 | 2358 | * is already out of device list, so we don't have to hold |
|---|
| 2120 | 2359 | * the device_list_mutex lock. |
|---|
| 2121 | 2360 | */ |
|---|
| 2122 | | - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); |
|---|
| 2361 | + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, |
|---|
| 2362 | + tgtdev->name->str); |
|---|
| 2123 | 2363 | |
|---|
| 2124 | 2364 | btrfs_close_bdev(tgtdev); |
|---|
| 2125 | | - call_rcu(&tgtdev->rcu, free_device_rcu); |
|---|
| 2365 | + synchronize_rcu(); |
|---|
| 2366 | + btrfs_free_device(tgtdev); |
|---|
| 2126 | 2367 | } |
|---|
| 2127 | 2368 | |
|---|
| 2128 | | -static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, |
|---|
| 2129 | | - const char *device_path, |
|---|
| 2130 | | - struct btrfs_device **device) |
|---|
| 2369 | +static struct btrfs_device *btrfs_find_device_by_path( |
|---|
| 2370 | + struct btrfs_fs_info *fs_info, const char *device_path) |
|---|
| 2131 | 2371 | { |
|---|
| 2132 | 2372 | int ret = 0; |
|---|
| 2133 | 2373 | struct btrfs_super_block *disk_super; |
|---|
| 2134 | 2374 | u64 devid; |
|---|
| 2135 | 2375 | u8 *dev_uuid; |
|---|
| 2136 | 2376 | struct block_device *bdev; |
|---|
| 2137 | | - struct buffer_head *bh; |
|---|
| 2377 | + struct btrfs_device *device; |
|---|
| 2138 | 2378 | |
|---|
| 2139 | | - *device = NULL; |
|---|
| 2140 | 2379 | ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, |
|---|
| 2141 | | - fs_info->bdev_holder, 0, &bdev, &bh); |
|---|
| 2380 | + fs_info->bdev_holder, 0, &bdev, &disk_super); |
|---|
| 2142 | 2381 | if (ret) |
|---|
| 2143 | | - return ret; |
|---|
| 2144 | | - disk_super = (struct btrfs_super_block *)bh->b_data; |
|---|
| 2382 | + return ERR_PTR(ret); |
|---|
| 2383 | + |
|---|
| 2145 | 2384 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
|---|
| 2146 | 2385 | dev_uuid = disk_super->dev_item.uuid; |
|---|
| 2147 | | - *device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, |
|---|
| 2148 | | - disk_super->fsid, true); |
|---|
| 2149 | | - brelse(bh); |
|---|
| 2150 | | - if (!*device) |
|---|
| 2151 | | - ret = -ENOENT; |
|---|
| 2386 | + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) |
|---|
| 2387 | + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, |
|---|
| 2388 | + disk_super->metadata_uuid, true); |
|---|
| 2389 | + else |
|---|
| 2390 | + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, |
|---|
| 2391 | + disk_super->fsid, true); |
|---|
| 2392 | + |
|---|
| 2393 | + btrfs_release_disk_super(disk_super); |
|---|
| 2394 | + if (!device) |
|---|
| 2395 | + device = ERR_PTR(-ENOENT); |
|---|
| 2152 | 2396 | blkdev_put(bdev, FMODE_READ); |
|---|
| 2153 | | - return ret; |
|---|
| 2154 | | -} |
|---|
| 2155 | | - |
|---|
| 2156 | | -int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, |
|---|
| 2157 | | - const char *device_path, |
|---|
| 2158 | | - struct btrfs_device **device) |
|---|
| 2159 | | -{ |
|---|
| 2160 | | - *device = NULL; |
|---|
| 2161 | | - if (strcmp(device_path, "missing") == 0) { |
|---|
| 2162 | | - struct list_head *devices; |
|---|
| 2163 | | - struct btrfs_device *tmp; |
|---|
| 2164 | | - |
|---|
| 2165 | | - devices = &fs_info->fs_devices->devices; |
|---|
| 2166 | | - list_for_each_entry(tmp, devices, dev_list) { |
|---|
| 2167 | | - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, |
|---|
| 2168 | | - &tmp->dev_state) && !tmp->bdev) { |
|---|
| 2169 | | - *device = tmp; |
|---|
| 2170 | | - break; |
|---|
| 2171 | | - } |
|---|
| 2172 | | - } |
|---|
| 2173 | | - |
|---|
| 2174 | | - if (!*device) |
|---|
| 2175 | | - return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; |
|---|
| 2176 | | - |
|---|
| 2177 | | - return 0; |
|---|
| 2178 | | - } else { |
|---|
| 2179 | | - return btrfs_find_device_by_path(fs_info, device_path, device); |
|---|
| 2180 | | - } |
|---|
| 2397 | + return device; |
|---|
| 2181 | 2398 | } |
|---|
| 2182 | 2399 | |
|---|
| 2183 | 2400 | /* |
|---|
| 2184 | 2401 | * Lookup a device given by device id, or the path if the id is 0. |
|---|
| 2185 | 2402 | */ |
|---|
| 2186 | | -int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, |
|---|
| 2187 | | - const char *devpath, |
|---|
| 2188 | | - struct btrfs_device **device) |
|---|
| 2403 | +struct btrfs_device *btrfs_find_device_by_devspec( |
|---|
| 2404 | + struct btrfs_fs_info *fs_info, u64 devid, |
|---|
| 2405 | + const char *device_path) |
|---|
| 2189 | 2406 | { |
|---|
| 2190 | | - int ret; |
|---|
| 2407 | + struct btrfs_device *device; |
|---|
| 2191 | 2408 | |
|---|
| 2192 | 2409 | if (devid) { |
|---|
| 2193 | | - ret = 0; |
|---|
| 2194 | | - *device = btrfs_find_device(fs_info->fs_devices, devid, |
|---|
| 2195 | | - NULL, NULL, true); |
|---|
| 2196 | | - if (!*device) |
|---|
| 2197 | | - ret = -ENOENT; |
|---|
| 2198 | | - } else { |
|---|
| 2199 | | - if (!devpath || !devpath[0]) |
|---|
| 2200 | | - return -EINVAL; |
|---|
| 2201 | | - |
|---|
| 2202 | | - ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, |
|---|
| 2203 | | - device); |
|---|
| 2410 | + device = btrfs_find_device(fs_info->fs_devices, devid, NULL, |
|---|
| 2411 | + NULL, true); |
|---|
| 2412 | + if (!device) |
|---|
| 2413 | + return ERR_PTR(-ENOENT); |
|---|
| 2414 | + return device; |
|---|
| 2204 | 2415 | } |
|---|
| 2205 | | - return ret; |
|---|
| 2416 | + |
|---|
| 2417 | + if (!device_path || !device_path[0]) |
|---|
| 2418 | + return ERR_PTR(-EINVAL); |
|---|
| 2419 | + |
|---|
| 2420 | + if (strcmp(device_path, "missing") == 0) { |
|---|
| 2421 | + /* Find first missing device */ |
|---|
| 2422 | + list_for_each_entry(device, &fs_info->fs_devices->devices, |
|---|
| 2423 | + dev_list) { |
|---|
| 2424 | + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, |
|---|
| 2425 | + &device->dev_state) && !device->bdev) |
|---|
| 2426 | + return device; |
|---|
| 2427 | + } |
|---|
| 2428 | + return ERR_PTR(-ENOENT); |
|---|
| 2429 | + } |
|---|
| 2430 | + |
|---|
| 2431 | + return btrfs_find_device_by_path(fs_info, device_path); |
|---|
| 2206 | 2432 | } |
|---|
| 2207 | 2433 | |
|---|
| 2208 | 2434 | /* |
|---|
| .. | .. |
|---|
| 2221 | 2447 | if (!fs_devices->seeding) |
|---|
| 2222 | 2448 | return -EINVAL; |
|---|
| 2223 | 2449 | |
|---|
| 2224 | | - seed_devices = alloc_fs_devices(NULL); |
|---|
| 2450 | + /* |
|---|
| 2451 | + * Private copy of the seed devices, anchored at |
|---|
| 2452 | + * fs_info->fs_devices->seed_list |
|---|
| 2453 | + */ |
|---|
| 2454 | + seed_devices = alloc_fs_devices(NULL, NULL); |
|---|
| 2225 | 2455 | if (IS_ERR(seed_devices)) |
|---|
| 2226 | 2456 | return PTR_ERR(seed_devices); |
|---|
| 2227 | 2457 | |
|---|
| 2458 | + /* |
|---|
| 2459 | + * It's necessary to retain a copy of the original seed fs_devices in |
|---|
| 2460 | + * fs_uuids so that filesystems which have been seeded can successfully |
|---|
| 2461 | + * reference the seed device from open_seed_devices. This also supports |
|---|
| 2462 | + * multiple fs seed. |
|---|
| 2463 | + */ |
|---|
| 2228 | 2464 | old_devices = clone_fs_devices(fs_devices); |
|---|
| 2229 | 2465 | if (IS_ERR(old_devices)) { |
|---|
| 2230 | 2466 | kfree(seed_devices); |
|---|
| .. | .. |
|---|
| 2245 | 2481 | list_for_each_entry(device, &seed_devices->devices, dev_list) |
|---|
| 2246 | 2482 | device->fs_devices = seed_devices; |
|---|
| 2247 | 2483 | |
|---|
| 2248 | | - mutex_lock(&fs_info->chunk_mutex); |
|---|
| 2249 | | - list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); |
|---|
| 2250 | | - mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 2251 | | - |
|---|
| 2252 | | - fs_devices->seeding = 0; |
|---|
| 2484 | + fs_devices->seeding = false; |
|---|
| 2253 | 2485 | fs_devices->num_devices = 0; |
|---|
| 2254 | 2486 | fs_devices->open_devices = 0; |
|---|
| 2255 | 2487 | fs_devices->missing_devices = 0; |
|---|
| 2256 | | - fs_devices->rotating = 0; |
|---|
| 2257 | | - fs_devices->seed = seed_devices; |
|---|
| 2488 | + fs_devices->rotating = false; |
|---|
| 2489 | + list_add(&seed_devices->seed_list, &fs_devices->seed_list); |
|---|
| 2258 | 2490 | |
|---|
| 2259 | 2491 | generate_random_uuid(fs_devices->fsid); |
|---|
| 2260 | | - memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); |
|---|
| 2492 | + memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); |
|---|
| 2261 | 2493 | memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); |
|---|
| 2262 | 2494 | mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 2263 | 2495 | |
|---|
| .. | .. |
|---|
| 2271 | 2503 | /* |
|---|
| 2272 | 2504 | * Store the expected generation for seed devices in device items. |
|---|
| 2273 | 2505 | */ |
|---|
| 2274 | | -static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, |
|---|
| 2275 | | - struct btrfs_fs_info *fs_info) |
|---|
| 2506 | +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) |
|---|
| 2276 | 2507 | { |
|---|
| 2508 | + struct btrfs_fs_info *fs_info = trans->fs_info; |
|---|
| 2277 | 2509 | struct btrfs_root *root = fs_info->chunk_root; |
|---|
| 2278 | 2510 | struct btrfs_path *path; |
|---|
| 2279 | 2511 | struct extent_buffer *leaf; |
|---|
| .. | .. |
|---|
| 2357 | 2589 | u64 orig_super_num_devices; |
|---|
| 2358 | 2590 | int seeding_dev = 0; |
|---|
| 2359 | 2591 | int ret = 0; |
|---|
| 2360 | | - bool unlocked = false; |
|---|
| 2592 | + bool locked = false; |
|---|
| 2361 | 2593 | |
|---|
| 2362 | 2594 | if (sb_rdonly(sb) && !fs_devices->seeding) |
|---|
| 2363 | 2595 | return -EROFS; |
|---|
| .. | .. |
|---|
| 2371 | 2603 | seeding_dev = 1; |
|---|
| 2372 | 2604 | down_write(&sb->s_umount); |
|---|
| 2373 | 2605 | mutex_lock(&uuid_mutex); |
|---|
| 2606 | + locked = true; |
|---|
| 2374 | 2607 | } |
|---|
| 2375 | 2608 | |
|---|
| 2376 | | - filemap_write_and_wait(bdev->bd_inode->i_mapping); |
|---|
| 2609 | + sync_blockdev(bdev); |
|---|
| 2377 | 2610 | |
|---|
| 2378 | | - mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 2379 | | - list_for_each_entry(device, &fs_devices->devices, dev_list) { |
|---|
| 2611 | + rcu_read_lock(); |
|---|
| 2612 | + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { |
|---|
| 2380 | 2613 | if (device->bdev == bdev) { |
|---|
| 2381 | 2614 | ret = -EEXIST; |
|---|
| 2382 | | - mutex_unlock( |
|---|
| 2383 | | - &fs_devices->device_list_mutex); |
|---|
| 2615 | + rcu_read_unlock(); |
|---|
| 2384 | 2616 | goto error; |
|---|
| 2385 | 2617 | } |
|---|
| 2386 | 2618 | } |
|---|
| 2387 | | - mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 2619 | + rcu_read_unlock(); |
|---|
| 2388 | 2620 | |
|---|
| 2389 | 2621 | device = btrfs_alloc_device(fs_info, NULL, NULL); |
|---|
| 2390 | 2622 | if (IS_ERR(device)) { |
|---|
| .. | .. |
|---|
| 2448 | 2680 | atomic64_add(device->total_bytes, &fs_info->free_chunk_space); |
|---|
| 2449 | 2681 | |
|---|
| 2450 | 2682 | if (!blk_queue_nonrot(q)) |
|---|
| 2451 | | - fs_devices->rotating = 1; |
|---|
| 2683 | + fs_devices->rotating = true; |
|---|
| 2452 | 2684 | |
|---|
| 2453 | 2685 | orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); |
|---|
| 2454 | 2686 | btrfs_set_super_total_bytes(fs_info->super_copy, |
|---|
| .. | .. |
|---|
| 2468 | 2700 | mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 2469 | 2701 | |
|---|
| 2470 | 2702 | /* Add sysfs device entry */ |
|---|
| 2471 | | - btrfs_sysfs_add_device_link(fs_devices, device); |
|---|
| 2703 | + btrfs_sysfs_add_device(device); |
|---|
| 2472 | 2704 | |
|---|
| 2473 | 2705 | mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 2474 | 2706 | |
|---|
| 2475 | 2707 | if (seeding_dev) { |
|---|
| 2476 | 2708 | mutex_lock(&fs_info->chunk_mutex); |
|---|
| 2477 | | - ret = init_first_rw_device(trans, fs_info); |
|---|
| 2709 | + ret = init_first_rw_device(trans); |
|---|
| 2478 | 2710 | mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 2479 | 2711 | if (ret) { |
|---|
| 2480 | 2712 | btrfs_abort_transaction(trans, ret); |
|---|
| .. | .. |
|---|
| 2489 | 2721 | } |
|---|
| 2490 | 2722 | |
|---|
| 2491 | 2723 | if (seeding_dev) { |
|---|
| 2492 | | - char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; |
|---|
| 2493 | | - |
|---|
| 2494 | | - ret = btrfs_finish_sprout(trans, fs_info); |
|---|
| 2724 | + ret = btrfs_finish_sprout(trans); |
|---|
| 2495 | 2725 | if (ret) { |
|---|
| 2496 | 2726 | btrfs_abort_transaction(trans, ret); |
|---|
| 2497 | 2727 | goto error_sysfs; |
|---|
| 2498 | 2728 | } |
|---|
| 2499 | 2729 | |
|---|
| 2500 | | - /* Sprouting would change fsid of the mounted root, |
|---|
| 2501 | | - * so rename the fsid on the sysfs |
|---|
| 2730 | + /* |
|---|
| 2731 | + * fs_devices now represents the newly sprouted filesystem and |
|---|
| 2732 | + * its fsid has been changed by btrfs_prepare_sprout |
|---|
| 2502 | 2733 | */ |
|---|
| 2503 | | - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", |
|---|
| 2504 | | - fs_info->fsid); |
|---|
| 2505 | | - if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) |
|---|
| 2506 | | - btrfs_warn(fs_info, |
|---|
| 2507 | | - "sysfs: failed to create fsid for sprout"); |
|---|
| 2734 | + btrfs_sysfs_update_sprout_fsid(fs_devices); |
|---|
| 2508 | 2735 | } |
|---|
| 2509 | 2736 | |
|---|
| 2510 | 2737 | ret = btrfs_commit_transaction(trans); |
|---|
| .. | .. |
|---|
| 2512 | 2739 | if (seeding_dev) { |
|---|
| 2513 | 2740 | mutex_unlock(&uuid_mutex); |
|---|
| 2514 | 2741 | up_write(&sb->s_umount); |
|---|
| 2515 | | - unlocked = true; |
|---|
| 2742 | + locked = false; |
|---|
| 2516 | 2743 | |
|---|
| 2517 | 2744 | if (ret) /* transaction commit */ |
|---|
| 2518 | 2745 | return ret; |
|---|
| .. | .. |
|---|
| 2532 | 2759 | ret = btrfs_commit_transaction(trans); |
|---|
| 2533 | 2760 | } |
|---|
| 2534 | 2761 | |
|---|
| 2535 | | - /* Update ctime/mtime for libblkid */ |
|---|
| 2762 | + /* |
|---|
| 2763 | + * Now that we have written a new super block to this device, check all |
|---|
| 2764 | + * other fs_devices list if device_path alienates any other scanned |
|---|
| 2765 | + * device. |
|---|
| 2766 | + * We can ignore the return value as it typically returns -EINVAL and |
|---|
| 2767 | + * only succeeds if the device was an alien. |
|---|
| 2768 | + */ |
|---|
| 2769 | + btrfs_forget_devices(device_path); |
|---|
| 2770 | + |
|---|
| 2771 | + /* Update ctime/mtime for blkid or udev */ |
|---|
| 2536 | 2772 | update_dev_time(device_path); |
|---|
| 2773 | + |
|---|
| 2537 | 2774 | return ret; |
|---|
| 2538 | 2775 | |
|---|
| 2539 | 2776 | error_sysfs: |
|---|
| 2540 | | - btrfs_sysfs_rm_device_link(fs_devices, device); |
|---|
| 2777 | + btrfs_sysfs_remove_device(device); |
|---|
| 2541 | 2778 | mutex_lock(&fs_info->fs_devices->device_list_mutex); |
|---|
| 2542 | 2779 | mutex_lock(&fs_info->chunk_mutex); |
|---|
| 2543 | 2780 | list_del_rcu(&device->dev_list); |
|---|
| .. | .. |
|---|
| 2563 | 2800 | btrfs_free_device(device); |
|---|
| 2564 | 2801 | error: |
|---|
| 2565 | 2802 | blkdev_put(bdev, FMODE_EXCL); |
|---|
| 2566 | | - if (seeding_dev && !unlocked) { |
|---|
| 2803 | + if (locked) { |
|---|
| 2567 | 2804 | mutex_unlock(&uuid_mutex); |
|---|
| 2568 | 2805 | up_write(&sb->s_umount); |
|---|
| 2569 | 2806 | } |
|---|
| .. | .. |
|---|
| 2621 | 2858 | { |
|---|
| 2622 | 2859 | struct btrfs_fs_info *fs_info = device->fs_info; |
|---|
| 2623 | 2860 | struct btrfs_super_block *super_copy = fs_info->super_copy; |
|---|
| 2624 | | - struct btrfs_fs_devices *fs_devices; |
|---|
| 2625 | 2861 | u64 old_total; |
|---|
| 2626 | 2862 | u64 diff; |
|---|
| 2627 | 2863 | |
|---|
| .. | .. |
|---|
| 2640 | 2876 | return -EINVAL; |
|---|
| 2641 | 2877 | } |
|---|
| 2642 | 2878 | |
|---|
| 2643 | | - fs_devices = fs_info->fs_devices; |
|---|
| 2644 | | - |
|---|
| 2645 | 2879 | btrfs_set_super_total_bytes(super_copy, |
|---|
| 2646 | 2880 | round_down(old_total + diff, fs_info->sectorsize)); |
|---|
| 2647 | 2881 | device->fs_devices->total_rw_bytes += diff; |
|---|
| .. | .. |
|---|
| 2649 | 2883 | btrfs_device_set_total_bytes(device, new_size); |
|---|
| 2650 | 2884 | btrfs_device_set_disk_total_bytes(device, new_size); |
|---|
| 2651 | 2885 | btrfs_clear_space_info_full(device->fs_info); |
|---|
| 2652 | | - if (list_empty(&device->resized_list)) |
|---|
| 2653 | | - list_add_tail(&device->resized_list, |
|---|
| 2654 | | - &fs_devices->resized_devices); |
|---|
| 2886 | + if (list_empty(&device->post_commit_list)) |
|---|
| 2887 | + list_add_tail(&device->post_commit_list, |
|---|
| 2888 | + &trans->transaction->dev_update_list); |
|---|
| 2655 | 2889 | mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 2656 | 2890 | |
|---|
| 2657 | 2891 | return btrfs_update_device(trans, device); |
|---|
| .. | .. |
|---|
| 2739 | 2973 | return ret; |
|---|
| 2740 | 2974 | } |
|---|
| 2741 | 2975 | |
|---|
| 2742 | | -static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, |
|---|
| 2743 | | - u64 logical, u64 length) |
|---|
| 2976 | +/* |
|---|
| 2977 | + * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. |
|---|
| 2978 | + * @logical: Logical block offset in bytes. |
|---|
| 2979 | + * @length: Length of extent in bytes. |
|---|
| 2980 | + * |
|---|
| 2981 | + * Return: Chunk mapping or ERR_PTR. |
|---|
| 2982 | + */ |
|---|
| 2983 | +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, |
|---|
| 2984 | + u64 logical, u64 length) |
|---|
| 2744 | 2985 | { |
|---|
| 2745 | 2986 | struct extent_map_tree *em_tree; |
|---|
| 2746 | 2987 | struct extent_map *em; |
|---|
| 2747 | 2988 | |
|---|
| 2748 | | - em_tree = &fs_info->mapping_tree.map_tree; |
|---|
| 2989 | + em_tree = &fs_info->mapping_tree; |
|---|
| 2749 | 2990 | read_lock(&em_tree->lock); |
|---|
| 2750 | 2991 | em = lookup_extent_mapping(em_tree, logical, length); |
|---|
| 2751 | 2992 | read_unlock(&em_tree->lock); |
|---|
| .. | .. |
|---|
| 2777 | 3018 | int i, ret = 0; |
|---|
| 2778 | 3019 | struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
|---|
| 2779 | 3020 | |
|---|
| 2780 | | - em = get_chunk_map(fs_info, chunk_offset, 1); |
|---|
| 3021 | + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); |
|---|
| 2781 | 3022 | if (IS_ERR(em)) { |
|---|
| 2782 | 3023 | /* |
|---|
| 2783 | 3024 | * This is a logic error, but we don't want to just rely on the |
|---|
| .. | .. |
|---|
| 2818 | 3059 | mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 2819 | 3060 | } |
|---|
| 2820 | 3061 | |
|---|
| 2821 | | - if (map->stripes[i].dev) { |
|---|
| 2822 | | - ret = btrfs_update_device(trans, map->stripes[i].dev); |
|---|
| 2823 | | - if (ret) { |
|---|
| 2824 | | - mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 2825 | | - btrfs_abort_transaction(trans, ret); |
|---|
| 2826 | | - goto out; |
|---|
| 2827 | | - } |
|---|
| 3062 | + ret = btrfs_update_device(trans, device); |
|---|
| 3063 | + if (ret) { |
|---|
| 3064 | + mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 3065 | + btrfs_abort_transaction(trans, ret); |
|---|
| 3066 | + goto out; |
|---|
| 2828 | 3067 | } |
|---|
| 2829 | 3068 | } |
|---|
| 2830 | 3069 | mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| .. | .. |
|---|
| 2861 | 3100 | { |
|---|
| 2862 | 3101 | struct btrfs_root *root = fs_info->chunk_root; |
|---|
| 2863 | 3102 | struct btrfs_trans_handle *trans; |
|---|
| 3103 | + struct btrfs_block_group *block_group; |
|---|
| 2864 | 3104 | int ret; |
|---|
| 2865 | 3105 | |
|---|
| 2866 | 3106 | /* |
|---|
| .. | .. |
|---|
| 2877 | 3117 | */ |
|---|
| 2878 | 3118 | lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); |
|---|
| 2879 | 3119 | |
|---|
| 2880 | | - ret = btrfs_can_relocate(fs_info, chunk_offset); |
|---|
| 2881 | | - if (ret) |
|---|
| 2882 | | - return -ENOSPC; |
|---|
| 2883 | | - |
|---|
| 2884 | 3120 | /* step one, relocate all the extents inside this chunk */ |
|---|
| 2885 | 3121 | btrfs_scrub_pause(fs_info); |
|---|
| 2886 | 3122 | ret = btrfs_relocate_block_group(fs_info, chunk_offset); |
|---|
| .. | .. |
|---|
| 2888 | 3124 | if (ret) |
|---|
| 2889 | 3125 | return ret; |
|---|
| 2890 | 3126 | |
|---|
| 2891 | | - /* |
|---|
| 2892 | | - * We add the kobjects here (and after forcing data chunk creation) |
|---|
| 2893 | | - * since relocation is the only place we'll create chunks of a new |
|---|
| 2894 | | - * type at runtime. The only place where we'll remove the last |
|---|
| 2895 | | - * chunk of a type is the call immediately below this one. Even |
|---|
| 2896 | | - * so, we're protected against races with the cleaner thread since |
|---|
| 2897 | | - * we're covered by the delete_unused_bgs_mutex. |
|---|
| 2898 | | - */ |
|---|
| 2899 | | - btrfs_add_raid_kobjects(fs_info); |
|---|
| 3127 | + block_group = btrfs_lookup_block_group(fs_info, chunk_offset); |
|---|
| 3128 | + if (!block_group) |
|---|
| 3129 | + return -ENOENT; |
|---|
| 3130 | + btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); |
|---|
| 3131 | + btrfs_put_block_group(block_group); |
|---|
| 2900 | 3132 | |
|---|
| 2901 | 3133 | trans = btrfs_start_trans_remove_block_group(root->fs_info, |
|---|
| 2902 | 3134 | chunk_offset); |
|---|
| .. | .. |
|---|
| 2997 | 3229 | static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, |
|---|
| 2998 | 3230 | u64 chunk_offset) |
|---|
| 2999 | 3231 | { |
|---|
| 3000 | | - struct btrfs_block_group_cache *cache; |
|---|
| 3232 | + struct btrfs_block_group *cache; |
|---|
| 3001 | 3233 | u64 bytes_used; |
|---|
| 3002 | 3234 | u64 chunk_type; |
|---|
| 3003 | 3235 | |
|---|
| .. | .. |
|---|
| 3006 | 3238 | chunk_type = cache->flags; |
|---|
| 3007 | 3239 | btrfs_put_block_group(cache); |
|---|
| 3008 | 3240 | |
|---|
| 3009 | | - if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { |
|---|
| 3010 | | - spin_lock(&fs_info->data_sinfo->lock); |
|---|
| 3011 | | - bytes_used = fs_info->data_sinfo->bytes_used; |
|---|
| 3012 | | - spin_unlock(&fs_info->data_sinfo->lock); |
|---|
| 3241 | + if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) |
|---|
| 3242 | + return 0; |
|---|
| 3013 | 3243 | |
|---|
| 3014 | | - if (!bytes_used) { |
|---|
| 3015 | | - struct btrfs_trans_handle *trans; |
|---|
| 3016 | | - int ret; |
|---|
| 3244 | + spin_lock(&fs_info->data_sinfo->lock); |
|---|
| 3245 | + bytes_used = fs_info->data_sinfo->bytes_used; |
|---|
| 3246 | + spin_unlock(&fs_info->data_sinfo->lock); |
|---|
| 3017 | 3247 | |
|---|
| 3018 | | - trans = btrfs_join_transaction(fs_info->tree_root); |
|---|
| 3019 | | - if (IS_ERR(trans)) |
|---|
| 3020 | | - return PTR_ERR(trans); |
|---|
| 3248 | + if (!bytes_used) { |
|---|
| 3249 | + struct btrfs_trans_handle *trans; |
|---|
| 3250 | + int ret; |
|---|
| 3021 | 3251 | |
|---|
| 3022 | | - ret = btrfs_force_chunk_alloc(trans, |
|---|
| 3023 | | - BTRFS_BLOCK_GROUP_DATA); |
|---|
| 3024 | | - btrfs_end_transaction(trans); |
|---|
| 3025 | | - if (ret < 0) |
|---|
| 3026 | | - return ret; |
|---|
| 3252 | + trans = btrfs_join_transaction(fs_info->tree_root); |
|---|
| 3253 | + if (IS_ERR(trans)) |
|---|
| 3254 | + return PTR_ERR(trans); |
|---|
| 3027 | 3255 | |
|---|
| 3028 | | - btrfs_add_raid_kobjects(fs_info); |
|---|
| 3029 | | - |
|---|
| 3030 | | - return 1; |
|---|
| 3031 | | - } |
|---|
| 3256 | + ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); |
|---|
| 3257 | + btrfs_end_transaction(trans); |
|---|
| 3258 | + if (ret < 0) |
|---|
| 3259 | + return ret; |
|---|
| 3260 | + return 1; |
|---|
| 3032 | 3261 | } |
|---|
| 3262 | + |
|---|
| 3033 | 3263 | return 0; |
|---|
| 3034 | 3264 | } |
|---|
| 3035 | 3265 | |
|---|
| .. | .. |
|---|
| 3099 | 3329 | if (!path) |
|---|
| 3100 | 3330 | return -ENOMEM; |
|---|
| 3101 | 3331 | |
|---|
| 3102 | | - trans = btrfs_start_transaction(root, 0); |
|---|
| 3332 | + trans = btrfs_start_transaction_fallback_global_rsv(root, 0); |
|---|
| 3103 | 3333 | if (IS_ERR(trans)) { |
|---|
| 3104 | 3334 | btrfs_free_path(path); |
|---|
| 3105 | 3335 | return PTR_ERR(trans); |
|---|
| .. | .. |
|---|
| 3208 | 3438 | static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, |
|---|
| 3209 | 3439 | struct btrfs_balance_args *bargs) |
|---|
| 3210 | 3440 | { |
|---|
| 3211 | | - struct btrfs_block_group_cache *cache; |
|---|
| 3441 | + struct btrfs_block_group *cache; |
|---|
| 3212 | 3442 | u64 chunk_used; |
|---|
| 3213 | 3443 | u64 user_thresh_min; |
|---|
| 3214 | 3444 | u64 user_thresh_max; |
|---|
| 3215 | 3445 | int ret = 1; |
|---|
| 3216 | 3446 | |
|---|
| 3217 | 3447 | cache = btrfs_lookup_block_group(fs_info, chunk_offset); |
|---|
| 3218 | | - chunk_used = btrfs_block_group_used(&cache->item); |
|---|
| 3448 | + chunk_used = cache->used; |
|---|
| 3219 | 3449 | |
|---|
| 3220 | 3450 | if (bargs->usage_min == 0) |
|---|
| 3221 | 3451 | user_thresh_min = 0; |
|---|
| 3222 | 3452 | else |
|---|
| 3223 | | - user_thresh_min = div_factor_fine(cache->key.offset, |
|---|
| 3224 | | - bargs->usage_min); |
|---|
| 3453 | + user_thresh_min = div_factor_fine(cache->length, |
|---|
| 3454 | + bargs->usage_min); |
|---|
| 3225 | 3455 | |
|---|
| 3226 | 3456 | if (bargs->usage_max == 0) |
|---|
| 3227 | 3457 | user_thresh_max = 1; |
|---|
| 3228 | 3458 | else if (bargs->usage_max > 100) |
|---|
| 3229 | | - user_thresh_max = cache->key.offset; |
|---|
| 3459 | + user_thresh_max = cache->length; |
|---|
| 3230 | 3460 | else |
|---|
| 3231 | | - user_thresh_max = div_factor_fine(cache->key.offset, |
|---|
| 3232 | | - bargs->usage_max); |
|---|
| 3461 | + user_thresh_max = div_factor_fine(cache->length, |
|---|
| 3462 | + bargs->usage_max); |
|---|
| 3233 | 3463 | |
|---|
| 3234 | 3464 | if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) |
|---|
| 3235 | 3465 | ret = 0; |
|---|
| .. | .. |
|---|
| 3241 | 3471 | static int chunk_usage_filter(struct btrfs_fs_info *fs_info, |
|---|
| 3242 | 3472 | u64 chunk_offset, struct btrfs_balance_args *bargs) |
|---|
| 3243 | 3473 | { |
|---|
| 3244 | | - struct btrfs_block_group_cache *cache; |
|---|
| 3474 | + struct btrfs_block_group *cache; |
|---|
| 3245 | 3475 | u64 chunk_used, user_thresh; |
|---|
| 3246 | 3476 | int ret = 1; |
|---|
| 3247 | 3477 | |
|---|
| 3248 | 3478 | cache = btrfs_lookup_block_group(fs_info, chunk_offset); |
|---|
| 3249 | | - chunk_used = btrfs_block_group_used(&cache->item); |
|---|
| 3479 | + chunk_used = cache->used; |
|---|
| 3250 | 3480 | |
|---|
| 3251 | 3481 | if (bargs->usage_min == 0) |
|---|
| 3252 | 3482 | user_thresh = 1; |
|---|
| 3253 | 3483 | else if (bargs->usage > 100) |
|---|
| 3254 | | - user_thresh = cache->key.offset; |
|---|
| 3484 | + user_thresh = cache->length; |
|---|
| 3255 | 3485 | else |
|---|
| 3256 | | - user_thresh = div_factor_fine(cache->key.offset, |
|---|
| 3257 | | - bargs->usage); |
|---|
| 3486 | + user_thresh = div_factor_fine(cache->length, bargs->usage); |
|---|
| 3258 | 3487 | |
|---|
| 3259 | 3488 | if (chunk_used < user_thresh) |
|---|
| 3260 | 3489 | ret = 0; |
|---|
| .. | .. |
|---|
| 3280 | 3509 | return 1; |
|---|
| 3281 | 3510 | } |
|---|
| 3282 | 3511 | |
|---|
| 3512 | +static u64 calc_data_stripes(u64 type, int num_stripes) |
|---|
| 3513 | +{ |
|---|
| 3514 | + const int index = btrfs_bg_flags_to_raid_index(type); |
|---|
| 3515 | + const int ncopies = btrfs_raid_array[index].ncopies; |
|---|
| 3516 | + const int nparity = btrfs_raid_array[index].nparity; |
|---|
| 3517 | + |
|---|
| 3518 | + if (nparity) |
|---|
| 3519 | + return num_stripes - nparity; |
|---|
| 3520 | + else |
|---|
| 3521 | + return num_stripes / ncopies; |
|---|
| 3522 | +} |
|---|
| 3523 | + |
|---|
| 3283 | 3524 | /* [pstart, pend) */ |
|---|
| 3284 | 3525 | static int chunk_drange_filter(struct extent_buffer *leaf, |
|---|
| 3285 | 3526 | struct btrfs_chunk *chunk, |
|---|
| .. | .. |
|---|
| 3289 | 3530 | int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); |
|---|
| 3290 | 3531 | u64 stripe_offset; |
|---|
| 3291 | 3532 | u64 stripe_length; |
|---|
| 3533 | + u64 type; |
|---|
| 3292 | 3534 | int factor; |
|---|
| 3293 | 3535 | int i; |
|---|
| 3294 | 3536 | |
|---|
| 3295 | 3537 | if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) |
|---|
| 3296 | 3538 | return 0; |
|---|
| 3297 | 3539 | |
|---|
| 3298 | | - if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | |
|---|
| 3299 | | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { |
|---|
| 3300 | | - factor = num_stripes / 2; |
|---|
| 3301 | | - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { |
|---|
| 3302 | | - factor = num_stripes - 1; |
|---|
| 3303 | | - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { |
|---|
| 3304 | | - factor = num_stripes - 2; |
|---|
| 3305 | | - } else { |
|---|
| 3306 | | - factor = num_stripes; |
|---|
| 3307 | | - } |
|---|
| 3540 | + type = btrfs_chunk_type(leaf, chunk); |
|---|
| 3541 | + factor = calc_data_stripes(type, num_stripes); |
|---|
| 3308 | 3542 | |
|---|
| 3309 | 3543 | for (i = 0; i < num_stripes; i++) { |
|---|
| 3310 | 3544 | stripe = btrfs_stripe_nr(chunk, i); |
|---|
| .. | .. |
|---|
| 3365 | 3599 | return 0; |
|---|
| 3366 | 3600 | } |
|---|
| 3367 | 3601 | |
|---|
| 3368 | | -static int should_balance_chunk(struct btrfs_fs_info *fs_info, |
|---|
| 3369 | | - struct extent_buffer *leaf, |
|---|
| 3602 | +static int should_balance_chunk(struct extent_buffer *leaf, |
|---|
| 3370 | 3603 | struct btrfs_chunk *chunk, u64 chunk_offset) |
|---|
| 3371 | 3604 | { |
|---|
| 3605 | + struct btrfs_fs_info *fs_info = leaf->fs_info; |
|---|
| 3372 | 3606 | struct btrfs_balance_control *bctl = fs_info->balance_ctl; |
|---|
| 3373 | 3607 | struct btrfs_balance_args *bargs = NULL; |
|---|
| 3374 | 3608 | u64 chunk_type = btrfs_chunk_type(leaf, chunk); |
|---|
| .. | .. |
|---|
| 3458 | 3692 | { |
|---|
| 3459 | 3693 | struct btrfs_balance_control *bctl = fs_info->balance_ctl; |
|---|
| 3460 | 3694 | struct btrfs_root *chunk_root = fs_info->chunk_root; |
|---|
| 3461 | | - struct btrfs_root *dev_root = fs_info->dev_root; |
|---|
| 3462 | | - struct list_head *devices; |
|---|
| 3463 | | - struct btrfs_device *device; |
|---|
| 3464 | | - u64 old_size; |
|---|
| 3465 | | - u64 size_to_free; |
|---|
| 3466 | 3695 | u64 chunk_type; |
|---|
| 3467 | 3696 | struct btrfs_chunk *chunk; |
|---|
| 3468 | 3697 | struct btrfs_path *path = NULL; |
|---|
| 3469 | 3698 | struct btrfs_key key; |
|---|
| 3470 | 3699 | struct btrfs_key found_key; |
|---|
| 3471 | | - struct btrfs_trans_handle *trans; |
|---|
| 3472 | 3700 | struct extent_buffer *leaf; |
|---|
| 3473 | 3701 | int slot; |
|---|
| 3474 | 3702 | int ret; |
|---|
| .. | .. |
|---|
| 3483 | 3711 | u32 count_sys = 0; |
|---|
| 3484 | 3712 | int chunk_reserved = 0; |
|---|
| 3485 | 3713 | |
|---|
| 3486 | | - /* step one make some room on all the devices */ |
|---|
| 3487 | | - devices = &fs_info->fs_devices->devices; |
|---|
| 3488 | | - list_for_each_entry(device, devices, dev_list) { |
|---|
| 3489 | | - old_size = btrfs_device_get_total_bytes(device); |
|---|
| 3490 | | - size_to_free = div_factor(old_size, 1); |
|---|
| 3491 | | - size_to_free = min_t(u64, size_to_free, SZ_1M); |
|---|
| 3492 | | - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || |
|---|
| 3493 | | - btrfs_device_get_total_bytes(device) - |
|---|
| 3494 | | - btrfs_device_get_bytes_used(device) > size_to_free || |
|---|
| 3495 | | - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) |
|---|
| 3496 | | - continue; |
|---|
| 3497 | | - |
|---|
| 3498 | | - ret = btrfs_shrink_device(device, old_size - size_to_free); |
|---|
| 3499 | | - if (ret == -ENOSPC) |
|---|
| 3500 | | - break; |
|---|
| 3501 | | - if (ret) { |
|---|
| 3502 | | - /* btrfs_shrink_device never returns ret > 0 */ |
|---|
| 3503 | | - WARN_ON(ret > 0); |
|---|
| 3504 | | - goto error; |
|---|
| 3505 | | - } |
|---|
| 3506 | | - |
|---|
| 3507 | | - trans = btrfs_start_transaction(dev_root, 0); |
|---|
| 3508 | | - if (IS_ERR(trans)) { |
|---|
| 3509 | | - ret = PTR_ERR(trans); |
|---|
| 3510 | | - btrfs_info_in_rcu(fs_info, |
|---|
| 3511 | | - "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", |
|---|
| 3512 | | - rcu_str_deref(device->name), ret, |
|---|
| 3513 | | - old_size, old_size - size_to_free); |
|---|
| 3514 | | - goto error; |
|---|
| 3515 | | - } |
|---|
| 3516 | | - |
|---|
| 3517 | | - ret = btrfs_grow_device(trans, device, old_size); |
|---|
| 3518 | | - if (ret) { |
|---|
| 3519 | | - btrfs_end_transaction(trans); |
|---|
| 3520 | | - /* btrfs_grow_device never returns ret > 0 */ |
|---|
| 3521 | | - WARN_ON(ret > 0); |
|---|
| 3522 | | - btrfs_info_in_rcu(fs_info, |
|---|
| 3523 | | - "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", |
|---|
| 3524 | | - rcu_str_deref(device->name), ret, |
|---|
| 3525 | | - old_size, old_size - size_to_free); |
|---|
| 3526 | | - goto error; |
|---|
| 3527 | | - } |
|---|
| 3528 | | - |
|---|
| 3529 | | - btrfs_end_transaction(trans); |
|---|
| 3530 | | - } |
|---|
| 3531 | | - |
|---|
| 3532 | | - /* step two, relocate all the chunks */ |
|---|
| 3533 | 3714 | path = btrfs_alloc_path(); |
|---|
| 3534 | 3715 | if (!path) { |
|---|
| 3535 | 3716 | ret = -ENOMEM; |
|---|
| .. | .. |
|---|
| 3601 | 3782 | spin_unlock(&fs_info->balance_lock); |
|---|
| 3602 | 3783 | } |
|---|
| 3603 | 3784 | |
|---|
| 3604 | | - ret = should_balance_chunk(fs_info, leaf, chunk, |
|---|
| 3605 | | - found_key.offset); |
|---|
| 3785 | + ret = should_balance_chunk(leaf, chunk, found_key.offset); |
|---|
| 3606 | 3786 | |
|---|
| 3607 | 3787 | btrfs_release_path(path); |
|---|
| 3608 | 3788 | if (!ret) { |
|---|
| .. | .. |
|---|
| 3659 | 3839 | |
|---|
| 3660 | 3840 | ret = btrfs_relocate_chunk(fs_info, found_key.offset); |
|---|
| 3661 | 3841 | mutex_unlock(&fs_info->delete_unused_bgs_mutex); |
|---|
| 3662 | | - if (ret && ret != -ENOSPC) |
|---|
| 3663 | | - goto error; |
|---|
| 3664 | 3842 | if (ret == -ENOSPC) { |
|---|
| 3665 | 3843 | enospc_errors++; |
|---|
| 3844 | + } else if (ret == -ETXTBSY) { |
|---|
| 3845 | + btrfs_info(fs_info, |
|---|
| 3846 | + "skipping relocation of block group %llu due to active swapfile", |
|---|
| 3847 | + found_key.offset); |
|---|
| 3848 | + ret = 0; |
|---|
| 3849 | + } else if (ret) { |
|---|
| 3850 | + goto error; |
|---|
| 3666 | 3851 | } else { |
|---|
| 3667 | 3852 | spin_lock(&fs_info->balance_lock); |
|---|
| 3668 | 3853 | bctl->stat.completed++; |
|---|
| .. | .. |
|---|
| 3711 | 3896 | if (flags == 0) |
|---|
| 3712 | 3897 | return !extended; /* "0" is valid for usual profiles */ |
|---|
| 3713 | 3898 | |
|---|
| 3714 | | - /* true if exactly one bit set */ |
|---|
| 3715 | | - return (flags & (flags - 1)) == 0; |
|---|
| 3899 | + return has_single_bit_set(flags); |
|---|
| 3716 | 3900 | } |
|---|
| 3717 | 3901 | |
|---|
| 3718 | 3902 | static inline int balance_need_close(struct btrfs_fs_info *fs_info) |
|---|
| .. | .. |
|---|
| 3723 | 3907 | atomic_read(&fs_info->balance_cancel_req) == 0); |
|---|
| 3724 | 3908 | } |
|---|
| 3725 | 3909 | |
|---|
| 3726 | | -/* Non-zero return value signifies invalidity */ |
|---|
| 3727 | | -static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, |
|---|
| 3728 | | - u64 allowed) |
|---|
| 3910 | +/* |
|---|
| 3911 | + * Validate target profile against allowed profiles and return true if it's OK. |
|---|
| 3912 | + * Otherwise print the error message and return false. |
|---|
| 3913 | + */ |
|---|
| 3914 | +static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, |
|---|
| 3915 | + const struct btrfs_balance_args *bargs, |
|---|
| 3916 | + u64 allowed, const char *type) |
|---|
| 3729 | 3917 | { |
|---|
| 3730 | | - return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && |
|---|
| 3731 | | - (!alloc_profile_is_valid(bctl_arg->target, 1) || |
|---|
| 3732 | | - (bctl_arg->target & ~allowed))); |
|---|
| 3918 | + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) |
|---|
| 3919 | + return true; |
|---|
| 3920 | + |
|---|
| 3921 | + /* Profile is valid and does not have bits outside of the allowed set */ |
|---|
| 3922 | + if (alloc_profile_is_valid(bargs->target, 1) && |
|---|
| 3923 | + (bargs->target & ~allowed) == 0) |
|---|
| 3924 | + return true; |
|---|
| 3925 | + |
|---|
| 3926 | + btrfs_err(fs_info, "balance: invalid convert %s profile %s", |
|---|
| 3927 | + type, btrfs_bg_type_to_raid_name(bargs->target)); |
|---|
| 3928 | + return false; |
|---|
| 3929 | +} |
|---|
| 3930 | + |
|---|
/*
 * Fill @buf with textual description of balance filter flags @bargs, up to
 * @size_buf including the terminating null. The output may be trimmed if it
 * does not fit into the provided buffer.
 */
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
				  u32 size_buf)
{
	int ret;
	/* Remaining space in @buf, including the terminating null */
	u32 size_bp = size_buf;
	/* Write cursor into @buf */
	char *bp = buf;
	u64 flags = bargs->flags;
	/* Scratch space for the profile list of the "profiles" filter */
	char tmp_buf[128] = {'\0'};

	/* No filters set, leave @buf untouched */
	if (!flags)
		return;

	/*
	 * The CHECK_APPEND_* helpers snprintf one "name=value," chunk at the
	 * cursor and advance it; if the chunk does not fit (or snprintf
	 * fails) they bail out to out_overflow, which trims the output to
	 * what was written so far.
	 */
#define CHECK_APPEND_NOARG(a)						\
	do {								\
		ret = snprintf(bp, size_bp, (a));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_2ARG(a, v1, v2)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
		CHECK_APPEND_1ARG("convert=%s,",
				  btrfs_bg_type_to_raid_name(bargs->target));

	if (flags & BTRFS_BALANCE_ARGS_SOFT)
		CHECK_APPEND_NOARG("soft,");

	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
					    sizeof(tmp_buf));
		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
	}

	if (flags & BTRFS_BALANCE_ARGS_USAGE)
		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);

	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
		CHECK_APPEND_2ARG("usage=%u..%u,",
				  bargs->usage_min, bargs->usage_max);

	if (flags & BTRFS_BALANCE_ARGS_DEVID)
		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);

	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
		CHECK_APPEND_2ARG("drange=%llu..%llu,",
				  bargs->pstart, bargs->pend);

	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
				  bargs->vstart, bargs->vend);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
		CHECK_APPEND_2ARG("limit=%u..%u,",
				  bargs->limit_min, bargs->limit_max);

	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
		CHECK_APPEND_2ARG("stripes=%u..%u,",
				  bargs->stripes_min, bargs->stripes_max);

#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG

out_overflow:

	/*
	 * If anything was appended, overwrite the trailing separator with
	 * the terminator; if nothing fit at all, return an empty string.
	 */
	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
	else
		buf[0] = '\0';
}
|---|
| 4028 | + |
|---|
/*
 * Log a one-line description of the balance operation that is about to start
 * or resume, including the force flag and the per-type (-d/-m/-s) filter
 * arguments rendered by describe_balance_args().
 */
static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
	u32 size_buf = 1024;
	/* Scratch buffer, reused for one filter description at a time */
	char tmp_buf[192] = {'\0'};
	char *buf;
	/* Write cursor into @buf */
	char *bp;
	/* Remaining space in @buf, same accounting as describe_balance_args */
	u32 size_bp = size_buf;
	int ret;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	/* Logging is best effort, silently skip it on allocation failure */
	buf = kzalloc(size_buf, GFP_KERNEL);
	if (!buf)
		return;

	bp = buf;

	/* Append one formatted chunk; bail out to out_overflow if it won't fit */
#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (bctl->flags & BTRFS_BALANCE_FORCE)
		CHECK_APPEND_1ARG("%s", "-f ");

	if (bctl->flags & BTRFS_BALANCE_DATA) {
		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_METADATA) {
		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
	}

#undef CHECK_APPEND_1ARG

out_overflow:

	/* Trim the trailing separator if anything was written */
	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
	btrfs_info(fs_info, "balance: %s %s",
		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
		   "resume" : "start", buf);

	kfree(buf);
}
|---|
| 3734 | 4084 | |
|---|
| 3735 | 4085 | /* |
|---|
| .. | .. |
|---|
| 3745 | 4095 | int ret; |
|---|
| 3746 | 4096 | u64 num_devices; |
|---|
| 3747 | 4097 | unsigned seq; |
|---|
| 3748 | | - bool reducing_integrity; |
|---|
| 4098 | + bool reducing_redundancy; |
|---|
| 4099 | + int i; |
|---|
| 3749 | 4100 | |
|---|
| 3750 | 4101 | if (btrfs_fs_closing(fs_info) || |
|---|
| 3751 | 4102 | atomic_read(&fs_info->balance_pause_req) || |
|---|
| 3752 | | - atomic_read(&fs_info->balance_cancel_req)) { |
|---|
| 4103 | + btrfs_should_cancel_balance(fs_info)) { |
|---|
| 3753 | 4104 | ret = -EINVAL; |
|---|
| 3754 | 4105 | goto out; |
|---|
| 3755 | 4106 | } |
|---|
| .. | .. |
|---|
| 3774 | 4125 | } |
|---|
| 3775 | 4126 | } |
|---|
| 3776 | 4127 | |
|---|
| 3777 | | - num_devices = fs_info->fs_devices->num_devices; |
|---|
| 3778 | | - btrfs_dev_replace_read_lock(&fs_info->dev_replace); |
|---|
| 3779 | | - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { |
|---|
| 3780 | | - BUG_ON(num_devices < 1); |
|---|
| 3781 | | - num_devices--; |
|---|
| 3782 | | - } |
|---|
| 3783 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
|---|
| 3784 | | - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; |
|---|
| 3785 | | - if (num_devices > 1) |
|---|
| 3786 | | - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); |
|---|
| 3787 | | - if (num_devices > 2) |
|---|
| 3788 | | - allowed |= BTRFS_BLOCK_GROUP_RAID5; |
|---|
| 3789 | | - if (num_devices > 3) |
|---|
| 3790 | | - allowed |= (BTRFS_BLOCK_GROUP_RAID10 | |
|---|
| 3791 | | - BTRFS_BLOCK_GROUP_RAID6); |
|---|
| 3792 | | - if (validate_convert_profile(&bctl->data, allowed)) { |
|---|
| 3793 | | - int index = btrfs_bg_flags_to_raid_index(bctl->data.target); |
|---|
| 4128 | + /* |
|---|
| 4129 | + * rw_devices will not change at the moment, device add/delete/replace |
|---|
| 4130 | + * are exclusive |
|---|
| 4131 | + */ |
|---|
| 4132 | + num_devices = fs_info->fs_devices->rw_devices; |
|---|
| 3794 | 4133 | |
|---|
| 3795 | | - btrfs_err(fs_info, |
|---|
| 3796 | | - "balance: invalid convert data profile %s", |
|---|
| 3797 | | - get_raid_name(index)); |
|---|
| 3798 | | - ret = -EINVAL; |
|---|
| 3799 | | - goto out; |
|---|
| 3800 | | - } |
|---|
| 3801 | | - if (validate_convert_profile(&bctl->meta, allowed)) { |
|---|
| 3802 | | - int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); |
|---|
| 4134 | + /* |
|---|
| 4135 | + * SINGLE profile on-disk has no profile bit, but in-memory we have a |
|---|
| 4136 | + * special bit for it, to make it easier to distinguish. Thus we need |
|---|
| 4137 | + * to set it manually, or balance would refuse the profile. |
|---|
| 4138 | + */ |
|---|
| 4139 | + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; |
|---|
| 4140 | + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) |
|---|
| 4141 | + if (num_devices >= btrfs_raid_array[i].devs_min) |
|---|
| 4142 | + allowed |= btrfs_raid_array[i].bg_flag; |
|---|
| 3803 | 4143 | |
|---|
| 3804 | | - btrfs_err(fs_info, |
|---|
| 3805 | | - "balance: invalid convert metadata profile %s", |
|---|
| 3806 | | - get_raid_name(index)); |
|---|
| 3807 | | - ret = -EINVAL; |
|---|
| 3808 | | - goto out; |
|---|
| 3809 | | - } |
|---|
| 3810 | | - if (validate_convert_profile(&bctl->sys, allowed)) { |
|---|
| 3811 | | - int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); |
|---|
| 3812 | | - |
|---|
| 3813 | | - btrfs_err(fs_info, |
|---|
| 3814 | | - "balance: invalid convert system profile %s", |
|---|
| 3815 | | - get_raid_name(index)); |
|---|
| 4144 | + if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || |
|---|
| 4145 | + !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || |
|---|
| 4146 | + !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { |
|---|
| 3816 | 4147 | ret = -EINVAL; |
|---|
| 3817 | 4148 | goto out; |
|---|
| 3818 | 4149 | } |
|---|
| 3819 | 4150 | |
|---|
| 3820 | | - /* allow to reduce meta or sys integrity only if force set */ |
|---|
| 3821 | | - allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | |
|---|
| 3822 | | - BTRFS_BLOCK_GROUP_RAID10 | |
|---|
| 3823 | | - BTRFS_BLOCK_GROUP_RAID5 | |
|---|
| 3824 | | - BTRFS_BLOCK_GROUP_RAID6; |
|---|
| 4151 | + /* |
|---|
| 4152 | + * Allow to reduce metadata or system integrity only if force set for |
|---|
| 4153 | + * profiles with redundancy (copies, parity) |
|---|
| 4154 | + */ |
|---|
| 4155 | + allowed = 0; |
|---|
| 4156 | + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { |
|---|
| 4157 | + if (btrfs_raid_array[i].ncopies >= 2 || |
|---|
| 4158 | + btrfs_raid_array[i].tolerated_failures >= 1) |
|---|
| 4159 | + allowed |= btrfs_raid_array[i].bg_flag; |
|---|
| 4160 | + } |
|---|
| 3825 | 4161 | do { |
|---|
| 3826 | 4162 | seq = read_seqbegin(&fs_info->profiles_lock); |
|---|
| 3827 | 4163 | |
|---|
| .. | .. |
|---|
| 3831 | 4167 | ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
|---|
| 3832 | 4168 | (fs_info->avail_metadata_alloc_bits & allowed) && |
|---|
| 3833 | 4169 | !(bctl->meta.target & allowed))) |
|---|
| 3834 | | - reducing_integrity = true; |
|---|
| 4170 | + reducing_redundancy = true; |
|---|
| 3835 | 4171 | else |
|---|
| 3836 | | - reducing_integrity = false; |
|---|
| 4172 | + reducing_redundancy = false; |
|---|
| 3837 | 4173 | |
|---|
| 3838 | 4174 | /* if we're not converting, the target field is uninitialized */ |
|---|
| 3839 | 4175 | meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? |
|---|
| .. | .. |
|---|
| 3842 | 4178 | bctl->data.target : fs_info->avail_data_alloc_bits; |
|---|
| 3843 | 4179 | } while (read_seqretry(&fs_info->profiles_lock, seq)); |
|---|
| 3844 | 4180 | |
|---|
| 3845 | | - if (reducing_integrity) { |
|---|
| 4181 | + if (reducing_redundancy) { |
|---|
| 3846 | 4182 | if (bctl->flags & BTRFS_BALANCE_FORCE) { |
|---|
| 3847 | 4183 | btrfs_info(fs_info, |
|---|
| 3848 | | - "balance: force reducing metadata integrity"); |
|---|
| 4184 | + "balance: force reducing metadata redundancy"); |
|---|
| 3849 | 4185 | } else { |
|---|
| 3850 | 4186 | btrfs_err(fs_info, |
|---|
| 3851 | | - "balance: reduces metadata integrity, use --force if you want this"); |
|---|
| 4187 | + "balance: reduces metadata redundancy, use --force if you want this"); |
|---|
| 3852 | 4188 | ret = -EINVAL; |
|---|
| 3853 | 4189 | goto out; |
|---|
| 3854 | 4190 | } |
|---|
| .. | .. |
|---|
| 3856 | 4192 | |
|---|
| 3857 | 4193 | if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < |
|---|
| 3858 | 4194 | btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { |
|---|
| 3859 | | - int meta_index = btrfs_bg_flags_to_raid_index(meta_target); |
|---|
| 3860 | | - int data_index = btrfs_bg_flags_to_raid_index(data_target); |
|---|
| 3861 | | - |
|---|
| 3862 | 4195 | btrfs_warn(fs_info, |
|---|
| 3863 | 4196 | "balance: metadata profile %s has lower redundancy than data profile %s", |
|---|
| 3864 | | - get_raid_name(meta_index), get_raid_name(data_index)); |
|---|
| 4197 | + btrfs_bg_type_to_raid_name(meta_target), |
|---|
| 4198 | + btrfs_bg_type_to_raid_name(data_target)); |
|---|
| 4199 | + } |
|---|
| 4200 | + |
|---|
| 4201 | + if (fs_info->send_in_progress) { |
|---|
| 4202 | + btrfs_warn_rl(fs_info, |
|---|
| 4203 | +"cannot run balance while send operations are in progress (%d in progress)", |
|---|
| 4204 | + fs_info->send_in_progress); |
|---|
| 4205 | + ret = -EAGAIN; |
|---|
| 4206 | + goto out; |
|---|
| 3865 | 4207 | } |
|---|
| 3866 | 4208 | |
|---|
| 3867 | 4209 | ret = insert_balance_item(fs_info, bctl); |
|---|
| .. | .. |
|---|
| 3883 | 4225 | |
|---|
| 3884 | 4226 | ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); |
|---|
| 3885 | 4227 | set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); |
|---|
| 4228 | + describe_balance_start_or_resume(fs_info); |
|---|
| 3886 | 4229 | mutex_unlock(&fs_info->balance_mutex); |
|---|
| 3887 | 4230 | |
|---|
| 3888 | 4231 | ret = __btrfs_balance(fs_info); |
|---|
| 3889 | 4232 | |
|---|
| 3890 | 4233 | mutex_lock(&fs_info->balance_mutex); |
|---|
| 4234 | + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) |
|---|
| 4235 | + btrfs_info(fs_info, "balance: paused"); |
|---|
| 4236 | + /* |
|---|
| 4237 | + * Balance can be canceled by: |
|---|
| 4238 | + * |
|---|
| 4239 | + * - Regular cancel request |
|---|
| 4240 | + * Then ret == -ECANCELED and balance_cancel_req > 0 |
|---|
| 4241 | + * |
|---|
| 4242 | + * - Fatal signal to "btrfs" process |
|---|
| 4243 | + * Either the signal caught by wait_reserve_ticket() and callers |
|---|
| 4244 | + * got -EINTR, or caught by btrfs_should_cancel_balance() and |
|---|
| 4245 | + * got -ECANCELED. |
|---|
| 4246 | + * Either way, in this case balance_cancel_req = 0, and |
|---|
| 4247 | + * ret == -EINTR or ret == -ECANCELED. |
|---|
| 4248 | + * |
|---|
| 4249 | + * So here we only check the return value to catch canceled balance. |
|---|
| 4250 | + */ |
|---|
| 4251 | + else if (ret == -ECANCELED || ret == -EINTR) |
|---|
| 4252 | + btrfs_info(fs_info, "balance: canceled"); |
|---|
| 4253 | + else |
|---|
| 4254 | + btrfs_info(fs_info, "balance: ended with status: %d", ret); |
|---|
| 4255 | + |
|---|
| 3891 | 4256 | clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); |
|---|
| 3892 | 4257 | |
|---|
| 3893 | 4258 | if (bargs) { |
|---|
| .. | .. |
|---|
| 3898 | 4263 | if ((ret && ret != -ECANCELED && ret != -ENOSPC) || |
|---|
| 3899 | 4264 | balance_need_close(fs_info)) { |
|---|
| 3900 | 4265 | reset_balance_state(fs_info); |
|---|
| 3901 | | - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); |
|---|
| 4266 | + btrfs_exclop_finish(fs_info); |
|---|
| 3902 | 4267 | } |
|---|
| 3903 | 4268 | |
|---|
| 3904 | 4269 | wake_up(&fs_info->balance_wait_q); |
|---|
| .. | .. |
|---|
| 3909 | 4274 | reset_balance_state(fs_info); |
|---|
| 3910 | 4275 | else |
|---|
| 3911 | 4276 | kfree(bctl); |
|---|
| 3912 | | - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); |
|---|
| 4277 | + btrfs_exclop_finish(fs_info); |
|---|
| 3913 | 4278 | |
|---|
| 3914 | 4279 | return ret; |
|---|
| 3915 | 4280 | } |
|---|
| .. | .. |
|---|
| 3919 | 4284 | struct btrfs_fs_info *fs_info = data; |
|---|
| 3920 | 4285 | int ret = 0; |
|---|
| 3921 | 4286 | |
|---|
| 4287 | + sb_start_write(fs_info->sb); |
|---|
| 3922 | 4288 | mutex_lock(&fs_info->balance_mutex); |
|---|
| 3923 | | - if (fs_info->balance_ctl) { |
|---|
| 3924 | | - btrfs_info(fs_info, "balance: resuming"); |
|---|
| 4289 | + if (fs_info->balance_ctl) |
|---|
| 3925 | 4290 | ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); |
|---|
| 3926 | | - } |
|---|
| 3927 | 4291 | mutex_unlock(&fs_info->balance_mutex); |
|---|
| 4292 | + sb_end_write(fs_info->sb); |
|---|
| 3928 | 4293 | |
|---|
| 3929 | 4294 | return ret; |
|---|
| 3930 | 4295 | } |
|---|
| .. | .. |
|---|
| 4013 | 4378 | * is in a paused state and must have fs_info::balance_ctl properly |
|---|
| 4014 | 4379 | * set up. |
|---|
| 4015 | 4380 | */ |
|---|
| 4016 | | - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) |
|---|
| 4381 | + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) |
|---|
| 4017 | 4382 | btrfs_warn(fs_info, |
|---|
| 4018 | 4383 | "balance: cannot set exclusive op status, resume manually"); |
|---|
| 4019 | 4384 | |
|---|
| .. | .. |
|---|
| 4097 | 4462 | |
|---|
| 4098 | 4463 | if (fs_info->balance_ctl) { |
|---|
| 4099 | 4464 | reset_balance_state(fs_info); |
|---|
| 4100 | | - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); |
|---|
| 4465 | + btrfs_exclop_finish(fs_info); |
|---|
| 4101 | 4466 | btrfs_info(fs_info, "balance: canceled"); |
|---|
| 4102 | 4467 | } |
|---|
| 4103 | 4468 | } |
|---|
| 4104 | 4469 | |
|---|
| 4105 | | - BUG_ON(fs_info->balance_ctl || |
|---|
| 4106 | | - test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); |
|---|
| 4470 | + ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); |
|---|
| 4107 | 4471 | atomic_dec(&fs_info->balance_cancel_req); |
|---|
| 4108 | 4472 | mutex_unlock(&fs_info->balance_mutex); |
|---|
| 4109 | 4473 | return 0; |
|---|
| 4110 | 4474 | } |
|---|
| 4111 | 4475 | |
|---|
| 4112 | | -static int btrfs_uuid_scan_kthread(void *data) |
|---|
| 4476 | +int btrfs_uuid_scan_kthread(void *data) |
|---|
| 4113 | 4477 | { |
|---|
| 4114 | 4478 | struct btrfs_fs_info *fs_info = data; |
|---|
| 4115 | 4479 | struct btrfs_root *root = fs_info->tree_root; |
|---|
| .. | .. |
|---|
| 4121 | 4485 | struct btrfs_root_item root_item; |
|---|
| 4122 | 4486 | u32 item_size; |
|---|
| 4123 | 4487 | struct btrfs_trans_handle *trans = NULL; |
|---|
| 4488 | + bool closing = false; |
|---|
| 4124 | 4489 | |
|---|
| 4125 | 4490 | path = btrfs_alloc_path(); |
|---|
| 4126 | 4491 | if (!path) { |
|---|
| .. | .. |
|---|
| 4133 | 4498 | key.offset = 0; |
|---|
| 4134 | 4499 | |
|---|
| 4135 | 4500 | while (1) { |
|---|
| 4501 | + if (btrfs_fs_closing(fs_info)) { |
|---|
| 4502 | + closing = true; |
|---|
| 4503 | + break; |
|---|
| 4504 | + } |
|---|
| 4136 | 4505 | ret = btrfs_search_forward(root, &key, path, |
|---|
| 4137 | 4506 | BTRFS_OLDEST_GENERATION); |
|---|
| 4138 | 4507 | if (ret) { |
|---|
| .. | .. |
|---|
| 4233 | 4602 | btrfs_end_transaction(trans); |
|---|
| 4234 | 4603 | if (ret) |
|---|
| 4235 | 4604 | btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); |
|---|
| 4236 | | - else |
|---|
| 4605 | + else if (!closing) |
|---|
| 4237 | 4606 | set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); |
|---|
| 4238 | 4607 | up(&fs_info->uuid_tree_rescan_sem); |
|---|
| 4239 | 4608 | return 0; |
|---|
| 4240 | | -} |
|---|
| 4241 | | - |
|---|
| 4242 | | -/* |
|---|
| 4243 | | - * Callback for btrfs_uuid_tree_iterate(). |
|---|
| 4244 | | - * returns: |
|---|
| 4245 | | - * 0 check succeeded, the entry is not outdated. |
|---|
| 4246 | | - * < 0 if an error occurred. |
|---|
| 4247 | | - * > 0 if the check failed, which means the caller shall remove the entry. |
|---|
| 4248 | | - */ |
|---|
| 4249 | | -static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, |
|---|
| 4250 | | - u8 *uuid, u8 type, u64 subid) |
|---|
| 4251 | | -{ |
|---|
| 4252 | | - struct btrfs_key key; |
|---|
| 4253 | | - int ret = 0; |
|---|
| 4254 | | - struct btrfs_root *subvol_root; |
|---|
| 4255 | | - |
|---|
| 4256 | | - if (type != BTRFS_UUID_KEY_SUBVOL && |
|---|
| 4257 | | - type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) |
|---|
| 4258 | | - goto out; |
|---|
| 4259 | | - |
|---|
| 4260 | | - key.objectid = subid; |
|---|
| 4261 | | - key.type = BTRFS_ROOT_ITEM_KEY; |
|---|
| 4262 | | - key.offset = (u64)-1; |
|---|
| 4263 | | - subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); |
|---|
| 4264 | | - if (IS_ERR(subvol_root)) { |
|---|
| 4265 | | - ret = PTR_ERR(subvol_root); |
|---|
| 4266 | | - if (ret == -ENOENT) |
|---|
| 4267 | | - ret = 1; |
|---|
| 4268 | | - goto out; |
|---|
| 4269 | | - } |
|---|
| 4270 | | - |
|---|
| 4271 | | - switch (type) { |
|---|
| 4272 | | - case BTRFS_UUID_KEY_SUBVOL: |
|---|
| 4273 | | - if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) |
|---|
| 4274 | | - ret = 1; |
|---|
| 4275 | | - break; |
|---|
| 4276 | | - case BTRFS_UUID_KEY_RECEIVED_SUBVOL: |
|---|
| 4277 | | - if (memcmp(uuid, subvol_root->root_item.received_uuid, |
|---|
| 4278 | | - BTRFS_UUID_SIZE)) |
|---|
| 4279 | | - ret = 1; |
|---|
| 4280 | | - break; |
|---|
| 4281 | | - } |
|---|
| 4282 | | - |
|---|
| 4283 | | -out: |
|---|
| 4284 | | - return ret; |
|---|
| 4285 | | -} |
|---|
| 4286 | | - |
|---|
| 4287 | | -static int btrfs_uuid_rescan_kthread(void *data) |
|---|
| 4288 | | -{ |
|---|
| 4289 | | - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; |
|---|
| 4290 | | - int ret; |
|---|
| 4291 | | - |
|---|
| 4292 | | - /* |
|---|
| 4293 | | - * 1st step is to iterate through the existing UUID tree and |
|---|
| 4294 | | - * to delete all entries that contain outdated data. |
|---|
| 4295 | | - * 2nd step is to add all missing entries to the UUID tree. |
|---|
| 4296 | | - */ |
|---|
| 4297 | | - ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); |
|---|
| 4298 | | - if (ret < 0) { |
|---|
| 4299 | | - btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); |
|---|
| 4300 | | - up(&fs_info->uuid_tree_rescan_sem); |
|---|
| 4301 | | - return ret; |
|---|
| 4302 | | - } |
|---|
| 4303 | | - return btrfs_uuid_scan_kthread(data); |
|---|
| 4304 | 4609 | } |
|---|
| 4305 | 4610 | |
|---|
| 4306 | 4611 | int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) |
|---|
| .. | .. |
|---|
| 4319 | 4624 | if (IS_ERR(trans)) |
|---|
| 4320 | 4625 | return PTR_ERR(trans); |
|---|
| 4321 | 4626 | |
|---|
| 4322 | | - uuid_root = btrfs_create_tree(trans, fs_info, |
|---|
| 4323 | | - BTRFS_UUID_TREE_OBJECTID); |
|---|
| 4627 | + uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); |
|---|
| 4324 | 4628 | if (IS_ERR(uuid_root)) { |
|---|
| 4325 | 4629 | ret = PTR_ERR(uuid_root); |
|---|
| 4326 | 4630 | btrfs_abort_transaction(trans, ret); |
|---|
| .. | .. |
|---|
| 4346 | 4650 | return 0; |
|---|
| 4347 | 4651 | } |
|---|
| 4348 | 4652 | |
|---|
| 4349 | | -int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) |
|---|
| 4350 | | -{ |
|---|
| 4351 | | - struct task_struct *task; |
|---|
| 4352 | | - |
|---|
| 4353 | | - down(&fs_info->uuid_tree_rescan_sem); |
|---|
| 4354 | | - task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); |
|---|
| 4355 | | - if (IS_ERR(task)) { |
|---|
| 4356 | | - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ |
|---|
| 4357 | | - btrfs_warn(fs_info, "failed to start uuid_rescan task"); |
|---|
| 4358 | | - up(&fs_info->uuid_tree_rescan_sem); |
|---|
| 4359 | | - return PTR_ERR(task); |
|---|
| 4360 | | - } |
|---|
| 4361 | | - |
|---|
| 4362 | | - return 0; |
|---|
| 4363 | | -} |
|---|
| 4364 | | - |
|---|
| 4365 | 4653 | /* |
|---|
| 4366 | 4654 | * shrinking a device means finding all of the device extents past |
|---|
| 4367 | 4655 | * the new size, and then following the back refs to the chunks. |
|---|
| .. | .. |
|---|
| 4380 | 4668 | int slot; |
|---|
| 4381 | 4669 | int failed = 0; |
|---|
| 4382 | 4670 | bool retried = false; |
|---|
| 4383 | | - bool checked_pending_chunks = false; |
|---|
| 4384 | 4671 | struct extent_buffer *l; |
|---|
| 4385 | 4672 | struct btrfs_key key; |
|---|
| 4386 | 4673 | struct btrfs_super_block *super_copy = fs_info->super_copy; |
|---|
| 4387 | 4674 | u64 old_total = btrfs_super_total_bytes(super_copy); |
|---|
| 4388 | 4675 | u64 old_size = btrfs_device_get_total_bytes(device); |
|---|
| 4389 | 4676 | u64 diff; |
|---|
| 4677 | + u64 start; |
|---|
| 4390 | 4678 | |
|---|
| 4391 | 4679 | new_size = round_down(new_size, fs_info->sectorsize); |
|---|
| 4680 | + start = new_size; |
|---|
| 4392 | 4681 | diff = round_down(old_size - new_size, fs_info->sectorsize); |
|---|
| 4393 | 4682 | |
|---|
| 4394 | 4683 | if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) |
|---|
| .. | .. |
|---|
| 4400 | 4689 | |
|---|
| 4401 | 4690 | path->reada = READA_BACK; |
|---|
| 4402 | 4691 | |
|---|
| 4692 | + trans = btrfs_start_transaction(root, 0); |
|---|
| 4693 | + if (IS_ERR(trans)) { |
|---|
| 4694 | + btrfs_free_path(path); |
|---|
| 4695 | + return PTR_ERR(trans); |
|---|
| 4696 | + } |
|---|
| 4697 | + |
|---|
| 4403 | 4698 | mutex_lock(&fs_info->chunk_mutex); |
|---|
| 4404 | 4699 | |
|---|
| 4405 | 4700 | btrfs_device_set_total_bytes(device, new_size); |
|---|
| .. | .. |
|---|
| 4407 | 4702 | device->fs_devices->total_rw_bytes -= diff; |
|---|
| 4408 | 4703 | atomic64_sub(diff, &fs_info->free_chunk_space); |
|---|
| 4409 | 4704 | } |
|---|
| 4410 | | - mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 4705 | + |
|---|
| 4706 | + /* |
|---|
| 4707 | + * Once the device's size has been set to the new size, ensure all |
|---|
| 4708 | + * in-memory chunks are synced to disk so that the loop below sees them |
|---|
| 4709 | + * and relocates them accordingly. |
|---|
| 4710 | + */ |
|---|
| 4711 | + if (contains_pending_extent(device, &start, diff)) { |
|---|
| 4712 | + mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 4713 | + ret = btrfs_commit_transaction(trans); |
|---|
| 4714 | + if (ret) |
|---|
| 4715 | + goto done; |
|---|
| 4716 | + } else { |
|---|
| 4717 | + mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 4718 | + btrfs_end_transaction(trans); |
|---|
| 4719 | + } |
|---|
| 4411 | 4720 | |
|---|
| 4412 | 4721 | again: |
|---|
| 4413 | 4722 | key.objectid = device->devid; |
|---|
| .. | .. |
|---|
| 4469 | 4778 | |
|---|
| 4470 | 4779 | ret = btrfs_relocate_chunk(fs_info, chunk_offset); |
|---|
| 4471 | 4780 | mutex_unlock(&fs_info->delete_unused_bgs_mutex); |
|---|
| 4472 | | - if (ret && ret != -ENOSPC) |
|---|
| 4473 | | - goto done; |
|---|
| 4474 | | - if (ret == -ENOSPC) |
|---|
| 4781 | + if (ret == -ENOSPC) { |
|---|
| 4475 | 4782 | failed++; |
|---|
| 4783 | + } else if (ret) { |
|---|
| 4784 | + if (ret == -ETXTBSY) { |
|---|
| 4785 | + btrfs_warn(fs_info, |
|---|
| 4786 | + "could not shrink block group %llu due to active swapfile", |
|---|
| 4787 | + chunk_offset); |
|---|
| 4788 | + } |
|---|
| 4789 | + goto done; |
|---|
| 4790 | + } |
|---|
| 4476 | 4791 | } while (key.offset-- > 0); |
|---|
| 4477 | 4792 | |
|---|
| 4478 | 4793 | if (failed && !retried) { |
|---|
| .. | .. |
|---|
| 4492 | 4807 | } |
|---|
| 4493 | 4808 | |
|---|
| 4494 | 4809 | mutex_lock(&fs_info->chunk_mutex); |
|---|
| 4495 | | - |
|---|
| 4496 | | - /* |
|---|
| 4497 | | - * We checked in the above loop all device extents that were already in |
|---|
| 4498 | | - * the device tree. However before we have updated the device's |
|---|
| 4499 | | - * total_bytes to the new size, we might have had chunk allocations that |
|---|
| 4500 | | - * have not complete yet (new block groups attached to transaction |
|---|
| 4501 | | - * handles), and therefore their device extents were not yet in the |
|---|
| 4502 | | - * device tree and we missed them in the loop above. So if we have any |
|---|
| 4503 | | - * pending chunk using a device extent that overlaps the device range |
|---|
| 4504 | | - * that we can not use anymore, commit the current transaction and |
|---|
| 4505 | | - * repeat the search on the device tree - this way we guarantee we will |
|---|
| 4506 | | - * not have chunks using device extents that end beyond 'new_size'. |
|---|
| 4507 | | - */ |
|---|
| 4508 | | - if (!checked_pending_chunks) { |
|---|
| 4509 | | - u64 start = new_size; |
|---|
| 4510 | | - u64 len = old_size - new_size; |
|---|
| 4511 | | - |
|---|
| 4512 | | - if (contains_pending_extent(trans->transaction, device, |
|---|
| 4513 | | - &start, len)) { |
|---|
| 4514 | | - mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 4515 | | - checked_pending_chunks = true; |
|---|
| 4516 | | - failed = 0; |
|---|
| 4517 | | - retried = false; |
|---|
| 4518 | | - ret = btrfs_commit_transaction(trans); |
|---|
| 4519 | | - if (ret) |
|---|
| 4520 | | - goto done; |
|---|
| 4521 | | - goto again; |
|---|
| 4522 | | - } |
|---|
| 4523 | | - } |
|---|
| 4810 | + /* Clear all state bits beyond the shrunk device size */ |
|---|
| 4811 | + clear_extent_bits(&device->alloc_state, new_size, (u64)-1, |
|---|
| 4812 | + CHUNK_STATE_MASK); |
|---|
| 4524 | 4813 | |
|---|
| 4525 | 4814 | btrfs_device_set_disk_total_bytes(device, new_size); |
|---|
| 4526 | | - if (list_empty(&device->resized_list)) |
|---|
| 4527 | | - list_add_tail(&device->resized_list, |
|---|
| 4528 | | - &fs_info->fs_devices->resized_devices); |
|---|
| 4815 | + if (list_empty(&device->post_commit_list)) |
|---|
| 4816 | + list_add_tail(&device->post_commit_list, |
|---|
| 4817 | + &trans->transaction->dev_update_list); |
|---|
| 4529 | 4818 | |
|---|
| 4530 | 4819 | WARN_ON(diff > old_total); |
|---|
| 4531 | 4820 | btrfs_set_super_total_bytes(super_copy, |
|---|
| .. | .. |
|---|
| 4609 | 4898 | btrfs_set_fs_incompat(info, RAID56); |
|---|
| 4610 | 4899 | } |
|---|
| 4611 | 4900 | |
|---|
| 4612 | | -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, |
|---|
| 4613 | | - u64 start, u64 type) |
|---|
| 4901 | +static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) |
|---|
| 4614 | 4902 | { |
|---|
| 4615 | | - struct btrfs_fs_info *info = trans->fs_info; |
|---|
| 4616 | | - struct btrfs_fs_devices *fs_devices = info->fs_devices; |
|---|
| 4617 | | - struct btrfs_device *device; |
|---|
| 4618 | | - struct map_lookup *map = NULL; |
|---|
| 4619 | | - struct extent_map_tree *em_tree; |
|---|
| 4620 | | - struct extent_map *em; |
|---|
| 4621 | | - struct btrfs_device_info *devices_info = NULL; |
|---|
| 4622 | | - u64 total_avail; |
|---|
| 4623 | | - int num_stripes; /* total number of stripes to allocate */ |
|---|
| 4624 | | - int data_stripes; /* number of stripes that count for |
|---|
| 4625 | | - block group size */ |
|---|
| 4626 | | - int sub_stripes; /* sub_stripes info for map */ |
|---|
| 4627 | | - int dev_stripes; /* stripes per dev */ |
|---|
| 4628 | | - int devs_max; /* max devs to use */ |
|---|
| 4629 | | - int devs_min; /* min devs needed */ |
|---|
| 4630 | | - int devs_increment; /* ndevs has to be a multiple of this */ |
|---|
| 4631 | | - int ncopies; /* how many copies to data has */ |
|---|
| 4632 | | - int ret; |
|---|
| 4903 | + if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) |
|---|
| 4904 | + return; |
|---|
| 4905 | + |
|---|
| 4906 | + btrfs_set_fs_incompat(info, RAID1C34); |
|---|
| 4907 | +} |
|---|
| 4908 | + |
|---|
| 4909 | +/* |
|---|
| 4910 | + * Structure used internally for __btrfs_alloc_chunk() function. |
|---|
| 4911 | + * Wraps needed parameters. |
|---|
| 4912 | + */ |
|---|
| 4913 | +struct alloc_chunk_ctl { |
|---|
| 4914 | + u64 start; |
|---|
| 4915 | + u64 type; |
|---|
| 4916 | + /* Total number of stripes to allocate */ |
|---|
| 4917 | + int num_stripes; |
|---|
| 4918 | + /* sub_stripes info for map */ |
|---|
| 4919 | + int sub_stripes; |
|---|
| 4920 | + /* Stripes per device */ |
|---|
| 4921 | + int dev_stripes; |
|---|
| 4922 | + /* Maximum number of devices to use */ |
|---|
| 4923 | + int devs_max; |
|---|
| 4924 | + /* Minimum number of devices to use */ |
|---|
| 4925 | + int devs_min; |
|---|
| 4926 | + /* ndevs has to be a multiple of this */ |
|---|
| 4927 | + int devs_increment; |
|---|
| 4928 | + /* Number of copies */ |
|---|
| 4929 | + int ncopies; |
|---|
| 4930 | + /* Number of stripes worth of bytes to store parity information */ |
|---|
| 4931 | + int nparity; |
|---|
| 4633 | 4932 | u64 max_stripe_size; |
|---|
| 4634 | 4933 | u64 max_chunk_size; |
|---|
| 4934 | + u64 dev_extent_min; |
|---|
| 4635 | 4935 | u64 stripe_size; |
|---|
| 4636 | | - u64 num_bytes; |
|---|
| 4936 | + u64 chunk_size; |
|---|
| 4637 | 4937 | int ndevs; |
|---|
| 4638 | | - int i; |
|---|
| 4639 | | - int j; |
|---|
| 4640 | | - int index; |
|---|
| 4938 | +}; |
|---|
| 4641 | 4939 | |
|---|
| 4642 | | - BUG_ON(!alloc_profile_is_valid(type, 0)); |
|---|
| 4643 | | - |
|---|
| 4644 | | - if (list_empty(&fs_devices->alloc_list)) { |
|---|
| 4645 | | - if (btrfs_test_opt(info, ENOSPC_DEBUG)) |
|---|
| 4646 | | - btrfs_debug(info, "%s: no writable device", __func__); |
|---|
| 4647 | | - return -ENOSPC; |
|---|
| 4648 | | - } |
|---|
| 4649 | | - |
|---|
| 4650 | | - index = btrfs_bg_flags_to_raid_index(type); |
|---|
| 4651 | | - |
|---|
| 4652 | | - sub_stripes = btrfs_raid_array[index].sub_stripes; |
|---|
| 4653 | | - dev_stripes = btrfs_raid_array[index].dev_stripes; |
|---|
| 4654 | | - devs_max = btrfs_raid_array[index].devs_max; |
|---|
| 4655 | | - devs_min = btrfs_raid_array[index].devs_min; |
|---|
| 4656 | | - devs_increment = btrfs_raid_array[index].devs_increment; |
|---|
| 4657 | | - ncopies = btrfs_raid_array[index].ncopies; |
|---|
| 4940 | +static void init_alloc_chunk_ctl_policy_regular( |
|---|
| 4941 | + struct btrfs_fs_devices *fs_devices, |
|---|
| 4942 | + struct alloc_chunk_ctl *ctl) |
|---|
| 4943 | +{ |
|---|
| 4944 | + u64 type = ctl->type; |
|---|
| 4658 | 4945 | |
|---|
| 4659 | 4946 | if (type & BTRFS_BLOCK_GROUP_DATA) { |
|---|
| 4660 | | - max_stripe_size = SZ_1G; |
|---|
| 4661 | | - max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; |
|---|
| 4662 | | - if (!devs_max) |
|---|
| 4663 | | - devs_max = BTRFS_MAX_DEVS(info); |
|---|
| 4947 | + ctl->max_stripe_size = SZ_1G; |
|---|
| 4948 | + ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; |
|---|
| 4664 | 4949 | } else if (type & BTRFS_BLOCK_GROUP_METADATA) { |
|---|
| 4665 | | - /* for larger filesystems, use larger metadata chunks */ |
|---|
| 4950 | + /* For larger filesystems, use larger metadata chunks */ |
|---|
| 4666 | 4951 | if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) |
|---|
| 4667 | | - max_stripe_size = SZ_1G; |
|---|
| 4952 | + ctl->max_stripe_size = SZ_1G; |
|---|
| 4668 | 4953 | else |
|---|
| 4669 | | - max_stripe_size = SZ_256M; |
|---|
| 4670 | | - max_chunk_size = max_stripe_size; |
|---|
| 4671 | | - if (!devs_max) |
|---|
| 4672 | | - devs_max = BTRFS_MAX_DEVS(info); |
|---|
| 4954 | + ctl->max_stripe_size = SZ_256M; |
|---|
| 4955 | + ctl->max_chunk_size = ctl->max_stripe_size; |
|---|
| 4673 | 4956 | } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { |
|---|
| 4674 | | - max_stripe_size = SZ_32M; |
|---|
| 4675 | | - max_chunk_size = 2 * max_stripe_size; |
|---|
| 4676 | | - if (!devs_max) |
|---|
| 4677 | | - devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; |
|---|
| 4957 | + ctl->max_stripe_size = SZ_32M; |
|---|
| 4958 | + ctl->max_chunk_size = 2 * ctl->max_stripe_size; |
|---|
| 4959 | + ctl->devs_max = min_t(int, ctl->devs_max, |
|---|
| 4960 | + BTRFS_MAX_DEVS_SYS_CHUNK); |
|---|
| 4678 | 4961 | } else { |
|---|
| 4679 | | - btrfs_err(info, "invalid chunk type 0x%llx requested", |
|---|
| 4680 | | - type); |
|---|
| 4681 | | - BUG_ON(1); |
|---|
| 4962 | + BUG(); |
|---|
| 4682 | 4963 | } |
|---|
| 4683 | 4964 | |
|---|
| 4684 | | - /* we don't want a chunk larger than 10% of writeable space */ |
|---|
| 4685 | | - max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), |
|---|
| 4686 | | - max_chunk_size); |
|---|
| 4965 | + /* We don't want a chunk larger than 10% of writable space */ |
|---|
| 4966 | + ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), |
|---|
| 4967 | + ctl->max_chunk_size); |
|---|
| 4968 | + ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; |
|---|
| 4969 | +} |
|---|
| 4687 | 4970 | |
|---|
| 4688 | | - devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), |
|---|
| 4689 | | - GFP_NOFS); |
|---|
| 4690 | | - if (!devices_info) |
|---|
| 4691 | | - return -ENOMEM; |
|---|
| 4971 | +static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, |
|---|
| 4972 | + struct alloc_chunk_ctl *ctl) |
|---|
| 4973 | +{ |
|---|
| 4974 | + int index = btrfs_bg_flags_to_raid_index(ctl->type); |
|---|
| 4975 | + |
|---|
| 4976 | + ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; |
|---|
| 4977 | + ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; |
|---|
| 4978 | + ctl->devs_max = btrfs_raid_array[index].devs_max; |
|---|
| 4979 | + if (!ctl->devs_max) |
|---|
| 4980 | + ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); |
|---|
| 4981 | + ctl->devs_min = btrfs_raid_array[index].devs_min; |
|---|
| 4982 | + ctl->devs_increment = btrfs_raid_array[index].devs_increment; |
|---|
| 4983 | + ctl->ncopies = btrfs_raid_array[index].ncopies; |
|---|
| 4984 | + ctl->nparity = btrfs_raid_array[index].nparity; |
|---|
| 4985 | + ctl->ndevs = 0; |
|---|
| 4986 | + |
|---|
| 4987 | + switch (fs_devices->chunk_alloc_policy) { |
|---|
| 4988 | + case BTRFS_CHUNK_ALLOC_REGULAR: |
|---|
| 4989 | + init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); |
|---|
| 4990 | + break; |
|---|
| 4991 | + default: |
|---|
| 4992 | + BUG(); |
|---|
| 4993 | + } |
|---|
| 4994 | +} |
|---|
| 4995 | + |
|---|
| 4996 | +static int gather_device_info(struct btrfs_fs_devices *fs_devices, |
|---|
| 4997 | + struct alloc_chunk_ctl *ctl, |
|---|
| 4998 | + struct btrfs_device_info *devices_info) |
|---|
| 4999 | +{ |
|---|
| 5000 | + struct btrfs_fs_info *info = fs_devices->fs_info; |
|---|
| 5001 | + struct btrfs_device *device; |
|---|
| 5002 | + u64 total_avail; |
|---|
| 5003 | + u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; |
|---|
| 5004 | + int ret; |
|---|
| 5005 | + int ndevs = 0; |
|---|
| 5006 | + u64 max_avail; |
|---|
| 5007 | + u64 dev_offset; |
|---|
| 4692 | 5008 | |
|---|
| 4693 | 5009 | /* |
|---|
| 4694 | 5010 | * in the first pass through the devices list, we gather information |
|---|
| 4695 | 5011 | * about the available holes on each device. |
|---|
| 4696 | 5012 | */ |
|---|
| 4697 | | - ndevs = 0; |
|---|
| 4698 | 5013 | list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { |
|---|
| 4699 | | - u64 max_avail; |
|---|
| 4700 | | - u64 dev_offset; |
|---|
| 4701 | | - |
|---|
| 4702 | 5014 | if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { |
|---|
| 4703 | 5015 | WARN(1, KERN_ERR |
|---|
| 4704 | 5016 | "BTRFS: read-only device in alloc_list\n"); |
|---|
| .. | .. |
|---|
| 4716 | 5028 | total_avail = 0; |
|---|
| 4717 | 5029 | |
|---|
| 4718 | 5030 | /* If there is no space on this device, skip it. */ |
|---|
| 4719 | | - if (total_avail == 0) |
|---|
| 5031 | + if (total_avail < ctl->dev_extent_min) |
|---|
| 4720 | 5032 | continue; |
|---|
| 4721 | 5033 | |
|---|
| 4722 | | - ret = find_free_dev_extent(trans, device, |
|---|
| 4723 | | - max_stripe_size * dev_stripes, |
|---|
| 4724 | | - &dev_offset, &max_avail); |
|---|
| 5034 | + ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, |
|---|
| 5035 | + &max_avail); |
|---|
| 4725 | 5036 | if (ret && ret != -ENOSPC) |
|---|
| 4726 | | - goto error; |
|---|
| 5037 | + return ret; |
|---|
| 4727 | 5038 | |
|---|
| 4728 | 5039 | if (ret == 0) |
|---|
| 4729 | | - max_avail = max_stripe_size * dev_stripes; |
|---|
| 5040 | + max_avail = dev_extent_want; |
|---|
| 4730 | 5041 | |
|---|
| 4731 | | - if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { |
|---|
| 5042 | + if (max_avail < ctl->dev_extent_min) { |
|---|
| 4732 | 5043 | if (btrfs_test_opt(info, ENOSPC_DEBUG)) |
|---|
| 4733 | 5044 | btrfs_debug(info, |
|---|
| 4734 | | - "%s: devid %llu has no free space, have=%llu want=%u", |
|---|
| 5045 | + "%s: devid %llu has no free space, have=%llu want=%llu", |
|---|
| 4735 | 5046 | __func__, device->devid, max_avail, |
|---|
| 4736 | | - BTRFS_STRIPE_LEN * dev_stripes); |
|---|
| 5047 | + ctl->dev_extent_min); |
|---|
| 4737 | 5048 | continue; |
|---|
| 4738 | 5049 | } |
|---|
| 4739 | 5050 | |
|---|
| .. | .. |
|---|
| 4748 | 5059 | devices_info[ndevs].dev = device; |
|---|
| 4749 | 5060 | ++ndevs; |
|---|
| 4750 | 5061 | } |
|---|
| 5062 | + ctl->ndevs = ndevs; |
|---|
| 4751 | 5063 | |
|---|
| 4752 | 5064 | /* |
|---|
| 4753 | 5065 | * now sort the devices by hole size / available space |
|---|
| .. | .. |
|---|
| 4755 | 5067 | sort(devices_info, ndevs, sizeof(struct btrfs_device_info), |
|---|
| 4756 | 5068 | btrfs_cmp_device_info, NULL); |
|---|
| 4757 | 5069 | |
|---|
| 4758 | | - /* round down to number of usable stripes */ |
|---|
| 4759 | | - ndevs = round_down(ndevs, devs_increment); |
|---|
| 5070 | + return 0; |
|---|
| 5071 | +} |
|---|
| 4760 | 5072 | |
|---|
| 4761 | | - if (ndevs < devs_min) { |
|---|
| 4762 | | - ret = -ENOSPC; |
|---|
| 4763 | | - if (btrfs_test_opt(info, ENOSPC_DEBUG)) { |
|---|
| 4764 | | - btrfs_debug(info, |
|---|
| 4765 | | - "%s: not enough devices with free space: have=%d minimum required=%d", |
|---|
| 4766 | | - __func__, ndevs, devs_min); |
|---|
| 4767 | | - } |
|---|
| 4768 | | - goto error; |
|---|
| 4769 | | - } |
|---|
| 4770 | | - |
|---|
| 4771 | | - ndevs = min(ndevs, devs_max); |
|---|
| 5073 | +static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, |
|---|
| 5074 | + struct btrfs_device_info *devices_info) |
|---|
| 5075 | +{ |
|---|
| 5076 | + /* Number of stripes that count for block group size */ |
|---|
| 5077 | + int data_stripes; |
|---|
| 4772 | 5078 | |
|---|
| 4773 | 5079 | /* |
|---|
| 4774 | 5080 | * The primary goal is to maximize the number of stripes, so use as |
|---|
| .. | .. |
|---|
| 4777 | 5083 | * The DUP profile stores more than one stripe per device, the |
|---|
| 4778 | 5084 | * max_avail is the total size so we have to adjust. |
|---|
| 4779 | 5085 | */ |
|---|
| 4780 | | - stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); |
|---|
| 4781 | | - num_stripes = ndevs * dev_stripes; |
|---|
| 5086 | + ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, |
|---|
| 5087 | + ctl->dev_stripes); |
|---|
| 5088 | + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; |
|---|
| 5089 | + |
|---|
| 5090 | + /* This will have to be fixed for RAID1 and RAID10 over more drives */ |
|---|
| 5091 | + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; |
|---|
| 4782 | 5092 | |
|---|
| 4783 | 5093 | /* |
|---|
| 4784 | | - * this will have to be fixed for RAID1 and RAID10 over |
|---|
| 4785 | | - * more drives |
|---|
| 5094 | + * Use the number of data stripes to figure out how big this chunk is |
|---|
| 5095 | + * really going to be in terms of logical address space, and compare |
|---|
| 5096 | + * that answer with the max chunk size. If it's higher, we try to |
|---|
| 5097 | + * reduce stripe_size. |
|---|
| 4786 | 5098 | */ |
|---|
| 4787 | | - data_stripes = num_stripes / ncopies; |
|---|
| 4788 | | - |
|---|
| 4789 | | - if (type & BTRFS_BLOCK_GROUP_RAID5) |
|---|
| 4790 | | - data_stripes = num_stripes - 1; |
|---|
| 4791 | | - |
|---|
| 4792 | | - if (type & BTRFS_BLOCK_GROUP_RAID6) |
|---|
| 4793 | | - data_stripes = num_stripes - 2; |
|---|
| 4794 | | - |
|---|
| 4795 | | - /* |
|---|
| 4796 | | - * Use the number of data stripes to figure out how big this chunk |
|---|
| 4797 | | - * is really going to be in terms of logical address space, |
|---|
| 4798 | | - * and compare that answer with the max chunk size. If it's higher, |
|---|
| 4799 | | - * we try to reduce stripe_size. |
|---|
| 4800 | | - */ |
|---|
| 4801 | | - if (stripe_size * data_stripes > max_chunk_size) { |
|---|
| 5099 | + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { |
|---|
| 4802 | 5100 | /* |
|---|
| 4803 | 5101 | * Reduce stripe_size, round it up to a 16MB boundary again and |
|---|
| 4804 | 5102 | * then use it, unless it ends up being even bigger than the |
|---|
| 4805 | 5103 | * previous value we had already. |
|---|
| 4806 | 5104 | */ |
|---|
| 4807 | | - stripe_size = min(round_up(div_u64(max_chunk_size, |
|---|
| 4808 | | - data_stripes), SZ_16M), |
|---|
| 4809 | | - stripe_size); |
|---|
| 5105 | + ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, |
|---|
| 5106 | + data_stripes), SZ_16M), |
|---|
| 5107 | + ctl->stripe_size); |
|---|
| 4810 | 5108 | } |
|---|
| 4811 | 5109 | |
|---|
| 4812 | | - /* align to BTRFS_STRIPE_LEN */ |
|---|
| 4813 | | - stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); |
|---|
| 5110 | + /* Align to BTRFS_STRIPE_LEN */ |
|---|
| 5111 | + ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); |
|---|
| 5112 | + ctl->chunk_size = ctl->stripe_size * data_stripes; |
|---|
| 4814 | 5113 | |
|---|
| 4815 | | - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); |
|---|
| 4816 | | - if (!map) { |
|---|
| 4817 | | - ret = -ENOMEM; |
|---|
| 4818 | | - goto error; |
|---|
| 5114 | + return 0; |
|---|
| 5115 | +} |
|---|
| 5116 | + |
|---|
| 5117 | +static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, |
|---|
| 5118 | + struct alloc_chunk_ctl *ctl, |
|---|
| 5119 | + struct btrfs_device_info *devices_info) |
|---|
| 5120 | +{ |
|---|
| 5121 | + struct btrfs_fs_info *info = fs_devices->fs_info; |
|---|
| 5122 | + |
|---|
| 5123 | + /* |
|---|
| 5124 | + * Round down to number of usable stripes, devs_increment can be any |
|---|
| 5125 | + * number so we can't use round_down() that requires power of 2, while |
|---|
| 5126 | + * rounddown is safe. |
|---|
| 5127 | + */ |
|---|
| 5128 | + ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); |
|---|
| 5129 | + |
|---|
| 5130 | + if (ctl->ndevs < ctl->devs_min) { |
|---|
| 5131 | + if (btrfs_test_opt(info, ENOSPC_DEBUG)) { |
|---|
| 5132 | + btrfs_debug(info, |
|---|
| 5133 | + "%s: not enough devices with free space: have=%d minimum required=%d", |
|---|
| 5134 | + __func__, ctl->ndevs, ctl->devs_min); |
|---|
| 5135 | + } |
|---|
| 5136 | + return -ENOSPC; |
|---|
| 4819 | 5137 | } |
|---|
| 4820 | | - map->num_stripes = num_stripes; |
|---|
| 4821 | 5138 | |
|---|
| 4822 | | - for (i = 0; i < ndevs; ++i) { |
|---|
| 4823 | | - for (j = 0; j < dev_stripes; ++j) { |
|---|
| 4824 | | - int s = i * dev_stripes + j; |
|---|
| 5139 | + ctl->ndevs = min(ctl->ndevs, ctl->devs_max); |
|---|
| 5140 | + |
|---|
| 5141 | + switch (fs_devices->chunk_alloc_policy) { |
|---|
| 5142 | + case BTRFS_CHUNK_ALLOC_REGULAR: |
|---|
| 5143 | + return decide_stripe_size_regular(ctl, devices_info); |
|---|
| 5144 | + default: |
|---|
| 5145 | + BUG(); |
|---|
| 5146 | + } |
|---|
| 5147 | +} |
|---|
| 5148 | + |
|---|
| 5149 | +static int create_chunk(struct btrfs_trans_handle *trans, |
|---|
| 5150 | + struct alloc_chunk_ctl *ctl, |
|---|
| 5151 | + struct btrfs_device_info *devices_info) |
|---|
| 5152 | +{ |
|---|
| 5153 | + struct btrfs_fs_info *info = trans->fs_info; |
|---|
| 5154 | + struct map_lookup *map = NULL; |
|---|
| 5155 | + struct extent_map_tree *em_tree; |
|---|
| 5156 | + struct extent_map *em; |
|---|
| 5157 | + u64 start = ctl->start; |
|---|
| 5158 | + u64 type = ctl->type; |
|---|
| 5159 | + int ret; |
|---|
| 5160 | + int i; |
|---|
| 5161 | + int j; |
|---|
| 5162 | + |
|---|
| 5163 | + map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); |
|---|
| 5164 | + if (!map) |
|---|
| 5165 | + return -ENOMEM; |
|---|
| 5166 | + map->num_stripes = ctl->num_stripes; |
|---|
| 5167 | + |
|---|
| 5168 | + for (i = 0; i < ctl->ndevs; ++i) { |
|---|
| 5169 | + for (j = 0; j < ctl->dev_stripes; ++j) { |
|---|
| 5170 | + int s = i * ctl->dev_stripes + j; |
|---|
| 4825 | 5171 | map->stripes[s].dev = devices_info[i].dev; |
|---|
| 4826 | 5172 | map->stripes[s].physical = devices_info[i].dev_offset + |
|---|
| 4827 | | - j * stripe_size; |
|---|
| 5173 | + j * ctl->stripe_size; |
|---|
| 4828 | 5174 | } |
|---|
| 4829 | 5175 | } |
|---|
| 4830 | 5176 | map->stripe_len = BTRFS_STRIPE_LEN; |
|---|
| 4831 | 5177 | map->io_align = BTRFS_STRIPE_LEN; |
|---|
| 4832 | 5178 | map->io_width = BTRFS_STRIPE_LEN; |
|---|
| 4833 | 5179 | map->type = type; |
|---|
| 4834 | | - map->sub_stripes = sub_stripes; |
|---|
| 5180 | + map->sub_stripes = ctl->sub_stripes; |
|---|
| 4835 | 5181 | |
|---|
| 4836 | | - num_bytes = stripe_size * data_stripes; |
|---|
| 4837 | | - |
|---|
| 4838 | | - trace_btrfs_chunk_alloc(info, map, start, num_bytes); |
|---|
| 5182 | + trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); |
|---|
| 4839 | 5183 | |
|---|
| 4840 | 5184 | em = alloc_extent_map(); |
|---|
| 4841 | 5185 | if (!em) { |
|---|
| 4842 | 5186 | kfree(map); |
|---|
| 4843 | | - ret = -ENOMEM; |
|---|
| 4844 | | - goto error; |
|---|
| 5187 | + return -ENOMEM; |
|---|
| 4845 | 5188 | } |
|---|
| 4846 | 5189 | set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); |
|---|
| 4847 | 5190 | em->map_lookup = map; |
|---|
| 4848 | 5191 | em->start = start; |
|---|
| 4849 | | - em->len = num_bytes; |
|---|
| 5192 | + em->len = ctl->chunk_size; |
|---|
| 4850 | 5193 | em->block_start = 0; |
|---|
| 4851 | 5194 | em->block_len = em->len; |
|---|
| 4852 | | - em->orig_block_len = stripe_size; |
|---|
| 5195 | + em->orig_block_len = ctl->stripe_size; |
|---|
| 4853 | 5196 | |
|---|
| 4854 | | - em_tree = &info->mapping_tree.map_tree; |
|---|
| 5197 | + em_tree = &info->mapping_tree; |
|---|
| 4855 | 5198 | write_lock(&em_tree->lock); |
|---|
| 4856 | 5199 | ret = add_extent_mapping(em_tree, em, 0); |
|---|
| 4857 | 5200 | if (ret) { |
|---|
| 4858 | 5201 | write_unlock(&em_tree->lock); |
|---|
| 4859 | 5202 | free_extent_map(em); |
|---|
| 4860 | | - goto error; |
|---|
| 5203 | + return ret; |
|---|
| 4861 | 5204 | } |
|---|
| 4862 | | - |
|---|
| 4863 | | - list_add_tail(&em->list, &trans->transaction->pending_chunks); |
|---|
| 4864 | | - refcount_inc(&em->refs); |
|---|
| 4865 | 5205 | write_unlock(&em_tree->lock); |
|---|
| 4866 | 5206 | |
|---|
| 4867 | | - ret = btrfs_make_block_group(trans, 0, type, start, num_bytes); |
|---|
| 5207 | + ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); |
|---|
| 4868 | 5208 | if (ret) |
|---|
| 4869 | 5209 | goto error_del_extent; |
|---|
| 4870 | 5210 | |
|---|
| 4871 | 5211 | for (i = 0; i < map->num_stripes; i++) { |
|---|
| 4872 | | - num_bytes = map->stripes[i].dev->bytes_used + stripe_size; |
|---|
| 4873 | | - btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); |
|---|
| 4874 | | - map->stripes[i].dev->has_pending_chunks = true; |
|---|
| 5212 | + struct btrfs_device *dev = map->stripes[i].dev; |
|---|
| 5213 | + |
|---|
| 5214 | + btrfs_device_set_bytes_used(dev, |
|---|
| 5215 | + dev->bytes_used + ctl->stripe_size); |
|---|
| 5216 | + if (list_empty(&dev->post_commit_list)) |
|---|
| 5217 | + list_add_tail(&dev->post_commit_list, |
|---|
| 5218 | + &trans->transaction->dev_update_list); |
|---|
| 4875 | 5219 | } |
|---|
| 4876 | 5220 | |
|---|
| 4877 | | - atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); |
|---|
| 5221 | + atomic64_sub(ctl->stripe_size * map->num_stripes, |
|---|
| 5222 | + &info->free_chunk_space); |
|---|
| 4878 | 5223 | |
|---|
| 4879 | 5224 | free_extent_map(em); |
|---|
| 4880 | 5225 | check_raid56_incompat_flag(info, type); |
|---|
| 5226 | + check_raid1c34_incompat_flag(info, type); |
|---|
| 4881 | 5227 | |
|---|
| 4882 | | - kfree(devices_info); |
|---|
| 4883 | 5228 | return 0; |
|---|
| 4884 | 5229 | |
|---|
| 4885 | 5230 | error_del_extent: |
|---|
| .. | .. |
|---|
| 4891 | 5236 | free_extent_map(em); |
|---|
| 4892 | 5237 | /* One for the tree reference */ |
|---|
| 4893 | 5238 | free_extent_map(em); |
|---|
| 4894 | | - /* One for the pending_chunks list reference */ |
|---|
| 4895 | | - free_extent_map(em); |
|---|
| 4896 | | -error: |
|---|
| 5239 | + |
|---|
| 5240 | + return ret; |
|---|
| 5241 | +} |
|---|
| 5242 | + |
|---|
| 5243 | +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) |
|---|
| 5244 | +{ |
|---|
| 5245 | + struct btrfs_fs_info *info = trans->fs_info; |
|---|
| 5246 | + struct btrfs_fs_devices *fs_devices = info->fs_devices; |
|---|
| 5247 | + struct btrfs_device_info *devices_info = NULL; |
|---|
| 5248 | + struct alloc_chunk_ctl ctl; |
|---|
| 5249 | + int ret; |
|---|
| 5250 | + |
|---|
| 5251 | + lockdep_assert_held(&info->chunk_mutex); |
|---|
| 5252 | + |
|---|
| 5253 | + if (!alloc_profile_is_valid(type, 0)) { |
|---|
| 5254 | + ASSERT(0); |
|---|
| 5255 | + return -EINVAL; |
|---|
| 5256 | + } |
|---|
| 5257 | + |
|---|
| 5258 | + if (list_empty(&fs_devices->alloc_list)) { |
|---|
| 5259 | + if (btrfs_test_opt(info, ENOSPC_DEBUG)) |
|---|
| 5260 | + btrfs_debug(info, "%s: no writable device", __func__); |
|---|
| 5261 | + return -ENOSPC; |
|---|
| 5262 | + } |
|---|
| 5263 | + |
|---|
| 5264 | + if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { |
|---|
| 5265 | + btrfs_err(info, "invalid chunk type 0x%llx requested", type); |
|---|
| 5266 | + ASSERT(0); |
|---|
| 5267 | + return -EINVAL; |
|---|
| 5268 | + } |
|---|
| 5269 | + |
|---|
| 5270 | + ctl.start = find_next_chunk(info); |
|---|
| 5271 | + ctl.type = type; |
|---|
| 5272 | + init_alloc_chunk_ctl(fs_devices, &ctl); |
|---|
| 5273 | + |
|---|
| 5274 | + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), |
|---|
| 5275 | + GFP_NOFS); |
|---|
| 5276 | + if (!devices_info) |
|---|
| 5277 | + return -ENOMEM; |
|---|
| 5278 | + |
|---|
| 5279 | + ret = gather_device_info(fs_devices, &ctl, devices_info); |
|---|
| 5280 | + if (ret < 0) |
|---|
| 5281 | + goto out; |
|---|
| 5282 | + |
|---|
| 5283 | + ret = decide_stripe_size(fs_devices, &ctl, devices_info); |
|---|
| 5284 | + if (ret < 0) |
|---|
| 5285 | + goto out; |
|---|
| 5286 | + |
|---|
| 5287 | + ret = create_chunk(trans, &ctl, devices_info); |
|---|
| 5288 | + |
|---|
| 5289 | +out: |
|---|
| 4897 | 5290 | kfree(devices_info); |
|---|
| 4898 | 5291 | return ret; |
|---|
| 4899 | 5292 | } |
|---|
| 4900 | 5293 | |
|---|
| 5294 | +/* |
|---|
| 5295 | + * Chunk allocation falls into two parts. The first part does work |
|---|
| 5296 | + * that makes the new allocated chunk usable, but does not do any operation |
|---|
| 5297 | + * that modifies the chunk tree. The second part does the work that |
|---|
| 5298 | + * requires modifying the chunk tree. This division is important for the |
|---|
| 5299 | + * bootstrap process of adding storage to a seed btrfs. |
|---|
| 5300 | + */ |
|---|
| 4901 | 5301 | int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, |
|---|
| 4902 | 5302 | u64 chunk_offset, u64 chunk_size) |
|---|
| 4903 | 5303 | { |
|---|
| .. | .. |
|---|
| 4916 | 5316 | int i = 0; |
|---|
| 4917 | 5317 | int ret = 0; |
|---|
| 4918 | 5318 | |
|---|
| 4919 | | - em = get_chunk_map(fs_info, chunk_offset, chunk_size); |
|---|
| 5319 | + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); |
|---|
| 4920 | 5320 | if (IS_ERR(em)) |
|---|
| 4921 | 5321 | return PTR_ERR(em); |
|---|
| 4922 | 5322 | |
|---|
| .. | .. |
|---|
| 4996 | 5396 | return ret; |
|---|
| 4997 | 5397 | } |
|---|
| 4998 | 5398 | |
|---|
| 4999 | | -/* |
|---|
| 5000 | | - * Chunk allocation falls into two parts. The first part does works |
|---|
| 5001 | | - * that make the new allocated chunk useable, but not do any operation |
|---|
| 5002 | | - * that modifies the chunk tree. The second part does the works that |
|---|
| 5003 | | - * require modifying the chunk tree. This division is important for the |
|---|
| 5004 | | - * bootstrap process of adding storage to a seed btrfs. |
|---|
| 5005 | | - */ |
|---|
| 5006 | | -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) |
|---|
| 5399 | +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) |
|---|
| 5007 | 5400 | { |
|---|
| 5008 | | - u64 chunk_offset; |
|---|
| 5009 | | - |
|---|
| 5010 | | - lockdep_assert_held(&trans->fs_info->chunk_mutex); |
|---|
| 5011 | | - chunk_offset = find_next_chunk(trans->fs_info); |
|---|
| 5012 | | - return __btrfs_alloc_chunk(trans, chunk_offset, type); |
|---|
| 5013 | | -} |
|---|
| 5014 | | - |
|---|
| 5015 | | -static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, |
|---|
| 5016 | | - struct btrfs_fs_info *fs_info) |
|---|
| 5017 | | -{ |
|---|
| 5018 | | - u64 chunk_offset; |
|---|
| 5019 | | - u64 sys_chunk_offset; |
|---|
| 5401 | + struct btrfs_fs_info *fs_info = trans->fs_info; |
|---|
| 5020 | 5402 | u64 alloc_profile; |
|---|
| 5021 | 5403 | int ret; |
|---|
| 5022 | 5404 | |
|---|
| 5023 | | - chunk_offset = find_next_chunk(fs_info); |
|---|
| 5024 | 5405 | alloc_profile = btrfs_metadata_alloc_profile(fs_info); |
|---|
| 5025 | | - ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); |
|---|
| 5406 | + ret = btrfs_alloc_chunk(trans, alloc_profile); |
|---|
| 5026 | 5407 | if (ret) |
|---|
| 5027 | 5408 | return ret; |
|---|
| 5028 | 5409 | |
|---|
| 5029 | | - sys_chunk_offset = find_next_chunk(fs_info); |
|---|
| 5030 | 5410 | alloc_profile = btrfs_system_alloc_profile(fs_info); |
|---|
| 5031 | | - ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); |
|---|
| 5411 | + ret = btrfs_alloc_chunk(trans, alloc_profile); |
|---|
| 5032 | 5412 | return ret; |
|---|
| 5033 | 5413 | } |
|---|
| 5034 | 5414 | |
|---|
| 5035 | 5415 | static inline int btrfs_chunk_max_errors(struct map_lookup *map) |
|---|
| 5036 | 5416 | { |
|---|
| 5037 | | - int max_errors; |
|---|
| 5417 | + const int index = btrfs_bg_flags_to_raid_index(map->type); |
|---|
| 5038 | 5418 | |
|---|
| 5039 | | - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
|---|
| 5040 | | - BTRFS_BLOCK_GROUP_RAID10 | |
|---|
| 5041 | | - BTRFS_BLOCK_GROUP_RAID5)) { |
|---|
| 5042 | | - max_errors = 1; |
|---|
| 5043 | | - } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { |
|---|
| 5044 | | - max_errors = 2; |
|---|
| 5045 | | - } else { |
|---|
| 5046 | | - max_errors = 0; |
|---|
| 5047 | | - } |
|---|
| 5048 | | - |
|---|
| 5049 | | - return max_errors; |
|---|
| 5419 | + return btrfs_raid_array[index].tolerated_failures; |
|---|
| 5050 | 5420 | } |
|---|
| 5051 | 5421 | |
|---|
| 5052 | 5422 | int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) |
|---|
| .. | .. |
|---|
| 5057 | 5427 | int miss_ndevs = 0; |
|---|
| 5058 | 5428 | int i; |
|---|
| 5059 | 5429 | |
|---|
| 5060 | | - em = get_chunk_map(fs_info, chunk_offset, 1); |
|---|
| 5430 | + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); |
|---|
| 5061 | 5431 | if (IS_ERR(em)) |
|---|
| 5062 | 5432 | return 1; |
|---|
| 5063 | 5433 | |
|---|
| .. | .. |
|---|
| 5087 | 5457 | return readonly; |
|---|
| 5088 | 5458 | } |
|---|
| 5089 | 5459 | |
|---|
| 5090 | | -void btrfs_mapping_init(struct btrfs_mapping_tree *tree) |
|---|
| 5091 | | -{ |
|---|
| 5092 | | - extent_map_tree_init(&tree->map_tree); |
|---|
| 5093 | | -} |
|---|
| 5094 | | - |
|---|
| 5095 | | -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) |
|---|
| 5460 | +void btrfs_mapping_tree_free(struct extent_map_tree *tree) |
|---|
| 5096 | 5461 | { |
|---|
| 5097 | 5462 | struct extent_map *em; |
|---|
| 5098 | 5463 | |
|---|
| 5099 | 5464 | while (1) { |
|---|
| 5100 | | - write_lock(&tree->map_tree.lock); |
|---|
| 5101 | | - em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); |
|---|
| 5465 | + write_lock(&tree->lock); |
|---|
| 5466 | + em = lookup_extent_mapping(tree, 0, (u64)-1); |
|---|
| 5102 | 5467 | if (em) |
|---|
| 5103 | | - remove_extent_mapping(&tree->map_tree, em); |
|---|
| 5104 | | - write_unlock(&tree->map_tree.lock); |
|---|
| 5468 | + remove_extent_mapping(tree, em); |
|---|
| 5469 | + write_unlock(&tree->lock); |
|---|
| 5105 | 5470 | if (!em) |
|---|
| 5106 | 5471 | break; |
|---|
| 5107 | 5472 | /* once for us */ |
|---|
| .. | .. |
|---|
| 5117 | 5482 | struct map_lookup *map; |
|---|
| 5118 | 5483 | int ret; |
|---|
| 5119 | 5484 | |
|---|
| 5120 | | - em = get_chunk_map(fs_info, logical, len); |
|---|
| 5485 | + em = btrfs_get_chunk_map(fs_info, logical, len); |
|---|
| 5121 | 5486 | if (IS_ERR(em)) |
|---|
| 5122 | 5487 | /* |
|---|
| 5123 | 5488 | * We could return errors for these cases, but that could get |
|---|
| .. | .. |
|---|
| 5128 | 5493 | return 1; |
|---|
| 5129 | 5494 | |
|---|
| 5130 | 5495 | map = em->map_lookup; |
|---|
| 5131 | | - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) |
|---|
| 5496 | + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) |
|---|
| 5132 | 5497 | ret = map->num_stripes; |
|---|
| 5133 | 5498 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
|---|
| 5134 | 5499 | ret = map->sub_stripes; |
|---|
| .. | .. |
|---|
| 5147 | 5512 | ret = 1; |
|---|
| 5148 | 5513 | free_extent_map(em); |
|---|
| 5149 | 5514 | |
|---|
| 5150 | | - btrfs_dev_replace_read_lock(&fs_info->dev_replace); |
|---|
| 5515 | + down_read(&fs_info->dev_replace.rwsem); |
|---|
| 5151 | 5516 | if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && |
|---|
| 5152 | 5517 | fs_info->dev_replace.tgtdev) |
|---|
| 5153 | 5518 | ret++; |
|---|
| 5154 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
|---|
| 5519 | + up_read(&fs_info->dev_replace.rwsem); |
|---|
| 5155 | 5520 | |
|---|
| 5156 | 5521 | return ret; |
|---|
| 5157 | 5522 | } |
|---|
| .. | .. |
|---|
| 5163 | 5528 | struct map_lookup *map; |
|---|
| 5164 | 5529 | unsigned long len = fs_info->sectorsize; |
|---|
| 5165 | 5530 | |
|---|
| 5166 | | - em = get_chunk_map(fs_info, logical, len); |
|---|
| 5531 | + em = btrfs_get_chunk_map(fs_info, logical, len); |
|---|
| 5167 | 5532 | |
|---|
| 5168 | 5533 | if (!WARN_ON(IS_ERR(em))) { |
|---|
| 5169 | 5534 | map = em->map_lookup; |
|---|
| .. | .. |
|---|
| 5180 | 5545 | struct map_lookup *map; |
|---|
| 5181 | 5546 | int ret = 0; |
|---|
| 5182 | 5547 | |
|---|
| 5183 | | - em = get_chunk_map(fs_info, logical, len); |
|---|
| 5548 | + em = btrfs_get_chunk_map(fs_info, logical, len); |
|---|
| 5184 | 5549 | |
|---|
| 5185 | 5550 | if(!WARN_ON(IS_ERR(em))) { |
|---|
| 5186 | 5551 | map = em->map_lookup; |
|---|
| .. | .. |
|---|
| 5202 | 5567 | struct btrfs_device *srcdev; |
|---|
| 5203 | 5568 | |
|---|
| 5204 | 5569 | ASSERT((map->type & |
|---|
| 5205 | | - (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); |
|---|
| 5570 | + (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); |
|---|
| 5206 | 5571 | |
|---|
| 5207 | 5572 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
|---|
| 5208 | 5573 | num_stripes = map->sub_stripes; |
|---|
| .. | .. |
|---|
| 5240 | 5605 | return preferred_mirror; |
|---|
| 5241 | 5606 | } |
|---|
| 5242 | 5607 | |
|---|
| 5243 | | -static inline int parity_smaller(u64 a, u64 b) |
|---|
| 5244 | | -{ |
|---|
| 5245 | | - return a > b; |
|---|
| 5246 | | -} |
|---|
| 5247 | | - |
|---|
| 5248 | 5608 | /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ |
|---|
| 5249 | 5609 | static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) |
|---|
| 5250 | 5610 | { |
|---|
| 5251 | | - struct btrfs_bio_stripe s; |
|---|
| 5252 | 5611 | int i; |
|---|
| 5253 | | - u64 l; |
|---|
| 5254 | 5612 | int again = 1; |
|---|
| 5255 | 5613 | |
|---|
| 5256 | 5614 | while (again) { |
|---|
| 5257 | 5615 | again = 0; |
|---|
| 5258 | 5616 | for (i = 0; i < num_stripes - 1; i++) { |
|---|
| 5259 | | - if (parity_smaller(bbio->raid_map[i], |
|---|
| 5260 | | - bbio->raid_map[i+1])) { |
|---|
| 5261 | | - s = bbio->stripes[i]; |
|---|
| 5262 | | - l = bbio->raid_map[i]; |
|---|
| 5263 | | - bbio->stripes[i] = bbio->stripes[i+1]; |
|---|
| 5264 | | - bbio->raid_map[i] = bbio->raid_map[i+1]; |
|---|
| 5265 | | - bbio->stripes[i+1] = s; |
|---|
| 5266 | | - bbio->raid_map[i+1] = l; |
|---|
| 5267 | | - |
|---|
| 5617 | + /* Swap if parity is on a smaller index */ |
|---|
| 5618 | + if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { |
|---|
| 5619 | + swap(bbio->stripes[i], bbio->stripes[i + 1]); |
|---|
| 5620 | + swap(bbio->raid_map[i], bbio->raid_map[i + 1]); |
|---|
| 5268 | 5621 | again = 1; |
|---|
| 5269 | 5622 | } |
|---|
| 5270 | 5623 | } |
|---|
| .. | .. |
|---|
| 5290 | 5643 | atomic_set(&bbio->error, 0); |
|---|
| 5291 | 5644 | refcount_set(&bbio->refs, 1); |
|---|
| 5292 | 5645 | |
|---|
| 5646 | + bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); |
|---|
| 5647 | + bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); |
|---|
| 5648 | + |
|---|
| 5293 | 5649 | return bbio; |
|---|
| 5294 | 5650 | } |
|---|
| 5295 | 5651 | |
|---|
| .. | .. |
|---|
| 5313 | 5669 | * replace. |
|---|
| 5314 | 5670 | */ |
|---|
| 5315 | 5671 | static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, |
|---|
| 5316 | | - u64 logical, u64 length, |
|---|
| 5672 | + u64 logical, u64 *length_ret, |
|---|
| 5317 | 5673 | struct btrfs_bio **bbio_ret) |
|---|
| 5318 | 5674 | { |
|---|
| 5319 | 5675 | struct extent_map *em; |
|---|
| 5320 | 5676 | struct map_lookup *map; |
|---|
| 5321 | 5677 | struct btrfs_bio *bbio; |
|---|
| 5678 | + u64 length = *length_ret; |
|---|
| 5322 | 5679 | u64 offset; |
|---|
| 5323 | 5680 | u64 stripe_nr; |
|---|
| 5324 | 5681 | u64 stripe_nr_end; |
|---|
| .. | .. |
|---|
| 5339 | 5696 | /* discard always return a bbio */ |
|---|
| 5340 | 5697 | ASSERT(bbio_ret); |
|---|
| 5341 | 5698 | |
|---|
| 5342 | | - em = get_chunk_map(fs_info, logical, length); |
|---|
| 5699 | + em = btrfs_get_chunk_map(fs_info, logical, length); |
|---|
| 5343 | 5700 | if (IS_ERR(em)) |
|---|
| 5344 | 5701 | return PTR_ERR(em); |
|---|
| 5345 | 5702 | |
|---|
| .. | .. |
|---|
| 5351 | 5708 | } |
|---|
| 5352 | 5709 | |
|---|
| 5353 | 5710 | offset = logical - em->start; |
|---|
| 5354 | | - length = min_t(u64, em->len - offset, length); |
|---|
| 5711 | + length = min_t(u64, em->start + em->len - logical, length); |
|---|
| 5712 | + *length_ret = length; |
|---|
| 5355 | 5713 | |
|---|
| 5356 | 5714 | stripe_len = map->stripe_len; |
|---|
| 5357 | 5715 | /* |
|---|
| .. | .. |
|---|
| 5391 | 5749 | &remaining_stripes); |
|---|
| 5392 | 5750 | div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); |
|---|
| 5393 | 5751 | last_stripe *= sub_stripes; |
|---|
| 5394 | | - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
|---|
| 5752 | + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | |
|---|
| 5395 | 5753 | BTRFS_BLOCK_GROUP_DUP)) { |
|---|
| 5396 | 5754 | num_stripes = map->num_stripes; |
|---|
| 5397 | 5755 | } else { |
|---|
| .. | .. |
|---|
| 5635 | 5993 | return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); |
|---|
| 5636 | 5994 | } |
|---|
| 5637 | 5995 | |
|---|
| 5996 | +/* |
|---|
| 5997 | + * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) |
|---|
| 5998 | + * tuple. This information is used to calculate how big a |
|---|
| 5999 | + * particular bio can get before it straddles a stripe. |
|---|
| 6000 | + * |
|---|
| 6001 | + * @fs_info - the filesystem |
|---|
| 6002 | + * @logical - address that we want to figure out the geometry of |
|---|
| 6003 | + * @len - the length of IO we are going to perform, starting at @logical |
|---|
| 6004 | + * @op - type of operation - write or read |
|---|
| 6005 | + * @io_geom - pointer used to return values |
|---|
| 6006 | + * |
|---|
| 6007 | + * Returns < 0 in case a chunk for the given logical address cannot be found, |
|---|
| 6008 | + * usually shouldn't happen unless @logical is corrupted, 0 otherwise. |
|---|
| 6009 | + */ |
|---|
| 6010 | +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, |
|---|
| 6011 | + u64 logical, u64 len, struct btrfs_io_geometry *io_geom) |
|---|
| 6012 | +{ |
|---|
| 6013 | + struct extent_map *em; |
|---|
| 6014 | + struct map_lookup *map; |
|---|
| 6015 | + u64 offset; |
|---|
| 6016 | + u64 stripe_offset; |
|---|
| 6017 | + u64 stripe_nr; |
|---|
| 6018 | + u64 stripe_len; |
|---|
| 6019 | + u64 raid56_full_stripe_start = (u64)-1; |
|---|
| 6020 | + int data_stripes; |
|---|
| 6021 | + int ret = 0; |
|---|
| 6022 | + |
|---|
| 6023 | + ASSERT(op != BTRFS_MAP_DISCARD); |
|---|
| 6024 | + |
|---|
| 6025 | + em = btrfs_get_chunk_map(fs_info, logical, len); |
|---|
| 6026 | + if (IS_ERR(em)) |
|---|
| 6027 | + return PTR_ERR(em); |
|---|
| 6028 | + |
|---|
| 6029 | + map = em->map_lookup; |
|---|
| 6030 | + /* Offset of this logical address in the chunk */ |
|---|
| 6031 | + offset = logical - em->start; |
|---|
| 6032 | + /* Len of a stripe in a chunk */ |
|---|
| 6033 | + stripe_len = map->stripe_len; |
|---|
| 6034 | + /* Stripe wher this block falls in */ |
|---|
| 6035 | + stripe_nr = div64_u64(offset, stripe_len); |
|---|
| 6036 | + /* Offset of stripe in the chunk */ |
|---|
| 6037 | + stripe_offset = stripe_nr * stripe_len; |
|---|
| 6038 | + if (offset < stripe_offset) { |
|---|
| 6039 | + btrfs_crit(fs_info, |
|---|
| 6040 | +"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", |
|---|
| 6041 | + stripe_offset, offset, em->start, logical, stripe_len); |
|---|
| 6042 | + ret = -EINVAL; |
|---|
| 6043 | + goto out; |
|---|
| 6044 | + } |
|---|
| 6045 | + |
|---|
| 6046 | + /* stripe_offset is the offset of this block in its stripe */ |
|---|
| 6047 | + stripe_offset = offset - stripe_offset; |
|---|
| 6048 | + data_stripes = nr_data_stripes(map); |
|---|
| 6049 | + |
|---|
| 6050 | + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
|---|
| 6051 | + u64 max_len = stripe_len - stripe_offset; |
|---|
| 6052 | + |
|---|
| 6053 | + /* |
|---|
| 6054 | + * In case of raid56, we need to know the stripe aligned start |
|---|
| 6055 | + */ |
|---|
| 6056 | + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { |
|---|
| 6057 | + unsigned long full_stripe_len = stripe_len * data_stripes; |
|---|
| 6058 | + raid56_full_stripe_start = offset; |
|---|
| 6059 | + |
|---|
| 6060 | + /* |
|---|
| 6061 | + * Allow a write of a full stripe, but make sure we |
|---|
| 6062 | + * don't allow straddling of stripes |
|---|
| 6063 | + */ |
|---|
| 6064 | + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, |
|---|
| 6065 | + full_stripe_len); |
|---|
| 6066 | + raid56_full_stripe_start *= full_stripe_len; |
|---|
| 6067 | + |
|---|
| 6068 | + /* |
|---|
| 6069 | + * For writes to RAID[56], allow a full stripeset across |
|---|
| 6070 | + * all disks. For other RAID types and for RAID[56] |
|---|
| 6071 | + * reads, just allow a single stripe (on a single disk). |
|---|
| 6072 | + */ |
|---|
| 6073 | + if (op == BTRFS_MAP_WRITE) { |
|---|
| 6074 | + max_len = stripe_len * data_stripes - |
|---|
| 6075 | + (offset - raid56_full_stripe_start); |
|---|
| 6076 | + } |
|---|
| 6077 | + } |
|---|
| 6078 | + len = min_t(u64, em->len - offset, max_len); |
|---|
| 6079 | + } else { |
|---|
| 6080 | + len = em->len - offset; |
|---|
| 6081 | + } |
|---|
| 6082 | + |
|---|
| 6083 | + io_geom->len = len; |
|---|
| 6084 | + io_geom->offset = offset; |
|---|
| 6085 | + io_geom->stripe_len = stripe_len; |
|---|
| 6086 | + io_geom->stripe_nr = stripe_nr; |
|---|
| 6087 | + io_geom->stripe_offset = stripe_offset; |
|---|
| 6088 | + io_geom->raid56_stripe_offset = raid56_full_stripe_start; |
|---|
| 6089 | + |
|---|
| 6090 | +out: |
|---|
| 6091 | + /* once for us */ |
|---|
| 6092 | + free_extent_map(em); |
|---|
| 6093 | + return ret; |
|---|
| 6094 | +} |
|---|
| 6095 | + |
|---|
| 5638 | 6096 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, |
|---|
| 5639 | 6097 | enum btrfs_map_op op, |
|---|
| 5640 | 6098 | u64 logical, u64 *length, |
|---|
| .. | .. |
|---|
| 5643 | 6101 | { |
|---|
| 5644 | 6102 | struct extent_map *em; |
|---|
| 5645 | 6103 | struct map_lookup *map; |
|---|
| 5646 | | - u64 offset; |
|---|
| 5647 | 6104 | u64 stripe_offset; |
|---|
| 5648 | 6105 | u64 stripe_nr; |
|---|
| 5649 | 6106 | u64 stripe_len; |
|---|
| 5650 | 6107 | u32 stripe_index; |
|---|
| 6108 | + int data_stripes; |
|---|
| 5651 | 6109 | int i; |
|---|
| 5652 | 6110 | int ret = 0; |
|---|
| 5653 | 6111 | int num_stripes; |
|---|
| .. | .. |
|---|
| 5660 | 6118 | int patch_the_first_stripe_for_dev_replace = 0; |
|---|
| 5661 | 6119 | u64 physical_to_patch_in_first_stripe = 0; |
|---|
| 5662 | 6120 | u64 raid56_full_stripe_start = (u64)-1; |
|---|
| 6121 | + struct btrfs_io_geometry geom; |
|---|
| 5663 | 6122 | |
|---|
| 5664 | | - if (op == BTRFS_MAP_DISCARD) |
|---|
| 5665 | | - return __btrfs_map_block_for_discard(fs_info, logical, |
|---|
| 5666 | | - *length, bbio_ret); |
|---|
| 6123 | + ASSERT(bbio_ret); |
|---|
| 6124 | + ASSERT(op != BTRFS_MAP_DISCARD); |
|---|
| 5667 | 6125 | |
|---|
| 5668 | | - em = get_chunk_map(fs_info, logical, *length); |
|---|
| 5669 | | - if (IS_ERR(em)) |
|---|
| 5670 | | - return PTR_ERR(em); |
|---|
| 6126 | + ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); |
|---|
| 6127 | + if (ret < 0) |
|---|
| 6128 | + return ret; |
|---|
| 5671 | 6129 | |
|---|
| 6130 | + em = btrfs_get_chunk_map(fs_info, logical, *length); |
|---|
| 6131 | + ASSERT(!IS_ERR(em)); |
|---|
| 5672 | 6132 | map = em->map_lookup; |
|---|
| 5673 | | - offset = logical - em->start; |
|---|
| 5674 | 6133 | |
|---|
| 5675 | | - stripe_len = map->stripe_len; |
|---|
| 5676 | | - stripe_nr = offset; |
|---|
| 5677 | | - /* |
|---|
| 5678 | | - * stripe_nr counts the total number of stripes we have to stride |
|---|
| 5679 | | - * to get to this block |
|---|
| 5680 | | - */ |
|---|
| 5681 | | - stripe_nr = div64_u64(stripe_nr, stripe_len); |
|---|
| 6134 | + *length = geom.len; |
|---|
| 6135 | + stripe_len = geom.stripe_len; |
|---|
| 6136 | + stripe_nr = geom.stripe_nr; |
|---|
| 6137 | + stripe_offset = geom.stripe_offset; |
|---|
| 6138 | + raid56_full_stripe_start = geom.raid56_stripe_offset; |
|---|
| 6139 | + data_stripes = nr_data_stripes(map); |
|---|
| 5682 | 6140 | |
|---|
| 5683 | | - stripe_offset = stripe_nr * stripe_len; |
|---|
| 5684 | | - if (offset < stripe_offset) { |
|---|
| 5685 | | - btrfs_crit(fs_info, |
|---|
| 5686 | | - "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", |
|---|
| 5687 | | - stripe_offset, offset, em->start, logical, |
|---|
| 5688 | | - stripe_len); |
|---|
| 5689 | | - free_extent_map(em); |
|---|
| 5690 | | - return -EINVAL; |
|---|
| 5691 | | - } |
|---|
| 5692 | | - |
|---|
| 5693 | | - /* stripe_offset is the offset of this block in its stripe*/ |
|---|
| 5694 | | - stripe_offset = offset - stripe_offset; |
|---|
| 5695 | | - |
|---|
| 5696 | | - /* if we're here for raid56, we need to know the stripe aligned start */ |
|---|
| 5697 | | - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { |
|---|
| 5698 | | - unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); |
|---|
| 5699 | | - raid56_full_stripe_start = offset; |
|---|
| 5700 | | - |
|---|
| 5701 | | - /* allow a write of a full stripe, but make sure we don't |
|---|
| 5702 | | - * allow straddling of stripes |
|---|
| 5703 | | - */ |
|---|
| 5704 | | - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, |
|---|
| 5705 | | - full_stripe_len); |
|---|
| 5706 | | - raid56_full_stripe_start *= full_stripe_len; |
|---|
| 5707 | | - } |
|---|
| 5708 | | - |
|---|
| 5709 | | - if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
|---|
| 5710 | | - u64 max_len; |
|---|
| 5711 | | - /* For writes to RAID[56], allow a full stripeset across all disks. |
|---|
| 5712 | | - For other RAID types and for RAID[56] reads, just allow a single |
|---|
| 5713 | | - stripe (on a single disk). */ |
|---|
| 5714 | | - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && |
|---|
| 5715 | | - (op == BTRFS_MAP_WRITE)) { |
|---|
| 5716 | | - max_len = stripe_len * nr_data_stripes(map) - |
|---|
| 5717 | | - (offset - raid56_full_stripe_start); |
|---|
| 5718 | | - } else { |
|---|
| 5719 | | - /* we limit the length of each bio to what fits in a stripe */ |
|---|
| 5720 | | - max_len = stripe_len - stripe_offset; |
|---|
| 5721 | | - } |
|---|
| 5722 | | - *length = min_t(u64, em->len - offset, max_len); |
|---|
| 5723 | | - } else { |
|---|
| 5724 | | - *length = em->len - offset; |
|---|
| 5725 | | - } |
|---|
| 5726 | | - |
|---|
| 5727 | | - /* This is for when we're called from btrfs_merge_bio_hook() and all |
|---|
| 5728 | | - it cares about is the length */ |
|---|
| 5729 | | - if (!bbio_ret) |
|---|
| 5730 | | - goto out; |
|---|
| 5731 | | - |
|---|
| 5732 | | - btrfs_dev_replace_read_lock(dev_replace); |
|---|
| 6141 | + down_read(&dev_replace->rwsem); |
|---|
| 5733 | 6142 | dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); |
|---|
| 6143 | + /* |
|---|
| 6144 | + * Hold the semaphore for read during the whole operation, write is |
|---|
| 6145 | + * requested at commit time but must wait. |
|---|
| 6146 | + */ |
|---|
| 5734 | 6147 | if (!dev_replace_is_ongoing) |
|---|
| 5735 | | - btrfs_dev_replace_read_unlock(dev_replace); |
|---|
| 5736 | | - else |
|---|
| 5737 | | - btrfs_dev_replace_set_lock_blocking(dev_replace); |
|---|
| 6148 | + up_read(&dev_replace->rwsem); |
|---|
| 5738 | 6149 | |
|---|
| 5739 | 6150 | if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && |
|---|
| 5740 | 6151 | !need_full_stripe(op) && dev_replace->tgtdev != NULL) { |
|---|
| .. | .. |
|---|
| 5757 | 6168 | &stripe_index); |
|---|
| 5758 | 6169 | if (!need_full_stripe(op)) |
|---|
| 5759 | 6170 | mirror_num = 1; |
|---|
| 5760 | | - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { |
|---|
| 6171 | + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { |
|---|
| 5761 | 6172 | if (need_full_stripe(op)) |
|---|
| 5762 | 6173 | num_stripes = map->num_stripes; |
|---|
| 5763 | 6174 | else if (mirror_num) |
|---|
| .. | .. |
|---|
| 5799 | 6210 | if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { |
|---|
| 5800 | 6211 | /* push stripe_nr back to the start of the full stripe */ |
|---|
| 5801 | 6212 | stripe_nr = div64_u64(raid56_full_stripe_start, |
|---|
| 5802 | | - stripe_len * nr_data_stripes(map)); |
|---|
| 6213 | + stripe_len * data_stripes); |
|---|
| 5803 | 6214 | |
|---|
| 5804 | 6215 | /* RAID[56] write or recovery. Return all stripes */ |
|---|
| 5805 | 6216 | num_stripes = map->num_stripes; |
|---|
| .. | .. |
|---|
| 5815 | 6226 | * Mirror #3 is RAID6 Q block. |
|---|
| 5816 | 6227 | */ |
|---|
| 5817 | 6228 | stripe_nr = div_u64_rem(stripe_nr, |
|---|
| 5818 | | - nr_data_stripes(map), &stripe_index); |
|---|
| 6229 | + data_stripes, &stripe_index); |
|---|
| 5819 | 6230 | if (mirror_num > 1) |
|---|
| 5820 | | - stripe_index = nr_data_stripes(map) + |
|---|
| 5821 | | - mirror_num - 2; |
|---|
| 6231 | + stripe_index = data_stripes + mirror_num - 2; |
|---|
| 5822 | 6232 | |
|---|
| 5823 | 6233 | /* We distribute the parity blocks across stripes */ |
|---|
| 5824 | 6234 | div_u64_rem(stripe_nr + stripe_index, map->num_stripes, |
|---|
| .. | .. |
|---|
| 5858 | 6268 | ret = -ENOMEM; |
|---|
| 5859 | 6269 | goto out; |
|---|
| 5860 | 6270 | } |
|---|
| 5861 | | - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) |
|---|
| 5862 | | - bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); |
|---|
| 6271 | + |
|---|
| 6272 | + for (i = 0; i < num_stripes; i++) { |
|---|
| 6273 | + bbio->stripes[i].physical = map->stripes[stripe_index].physical + |
|---|
| 6274 | + stripe_offset + stripe_nr * map->stripe_len; |
|---|
| 6275 | + bbio->stripes[i].dev = map->stripes[stripe_index].dev; |
|---|
| 6276 | + stripe_index++; |
|---|
| 6277 | + } |
|---|
| 5863 | 6278 | |
|---|
| 5864 | 6279 | /* build raid_map */ |
|---|
| 5865 | 6280 | if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && |
|---|
| .. | .. |
|---|
| 5867 | 6282 | u64 tmp; |
|---|
| 5868 | 6283 | unsigned rot; |
|---|
| 5869 | 6284 | |
|---|
| 5870 | | - bbio->raid_map = (u64 *)((void *)bbio->stripes + |
|---|
| 5871 | | - sizeof(struct btrfs_bio_stripe) * |
|---|
| 5872 | | - num_alloc_stripes + |
|---|
| 5873 | | - sizeof(int) * tgtdev_indexes); |
|---|
| 5874 | | - |
|---|
| 5875 | 6285 | /* Work out the disk rotation on this stripe-set */ |
|---|
| 5876 | 6286 | div_u64_rem(stripe_nr, num_stripes, &rot); |
|---|
| 5877 | 6287 | |
|---|
| 5878 | 6288 | /* Fill in the logical address of each stripe */ |
|---|
| 5879 | | - tmp = stripe_nr * nr_data_stripes(map); |
|---|
| 5880 | | - for (i = 0; i < nr_data_stripes(map); i++) |
|---|
| 6289 | + tmp = stripe_nr * data_stripes; |
|---|
| 6290 | + for (i = 0; i < data_stripes; i++) |
|---|
| 5881 | 6291 | bbio->raid_map[(i+rot) % num_stripes] = |
|---|
| 5882 | 6292 | em->start + (tmp + i) * map->stripe_len; |
|---|
| 5883 | 6293 | |
|---|
| .. | .. |
|---|
| 5885 | 6295 | if (map->type & BTRFS_BLOCK_GROUP_RAID6) |
|---|
| 5886 | 6296 | bbio->raid_map[(i+rot+1) % num_stripes] = |
|---|
| 5887 | 6297 | RAID6_Q_STRIPE; |
|---|
| 5888 | | - } |
|---|
| 5889 | 6298 | |
|---|
| 5890 | | - |
|---|
| 5891 | | - for (i = 0; i < num_stripes; i++) { |
|---|
| 5892 | | - bbio->stripes[i].physical = |
|---|
| 5893 | | - map->stripes[stripe_index].physical + |
|---|
| 5894 | | - stripe_offset + |
|---|
| 5895 | | - stripe_nr * map->stripe_len; |
|---|
| 5896 | | - bbio->stripes[i].dev = |
|---|
| 5897 | | - map->stripes[stripe_index].dev; |
|---|
| 5898 | | - stripe_index++; |
|---|
| 6299 | + sort_parity_stripes(bbio, num_stripes); |
|---|
| 5899 | 6300 | } |
|---|
| 5900 | 6301 | |
|---|
| 5901 | 6302 | if (need_full_stripe(op)) |
|---|
| 5902 | 6303 | max_errors = btrfs_chunk_max_errors(map); |
|---|
| 5903 | | - |
|---|
| 5904 | | - if (bbio->raid_map) |
|---|
| 5905 | | - sort_parity_stripes(bbio, num_stripes); |
|---|
| 5906 | 6304 | |
|---|
| 5907 | 6305 | if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && |
|---|
| 5908 | 6306 | need_full_stripe(op)) { |
|---|
| .. | .. |
|---|
| 5929 | 6327 | } |
|---|
| 5930 | 6328 | out: |
|---|
| 5931 | 6329 | if (dev_replace_is_ongoing) { |
|---|
| 5932 | | - btrfs_dev_replace_clear_lock_blocking(dev_replace); |
|---|
| 5933 | | - btrfs_dev_replace_read_unlock(dev_replace); |
|---|
| 6330 | + lockdep_assert_held(&dev_replace->rwsem); |
|---|
| 6331 | + /* Unlock and let waiting writers proceed */ |
|---|
| 6332 | + up_read(&dev_replace->rwsem); |
|---|
| 5934 | 6333 | } |
|---|
| 5935 | 6334 | free_extent_map(em); |
|---|
| 5936 | 6335 | return ret; |
|---|
| .. | .. |
|---|
| 5940 | 6339 | u64 logical, u64 *length, |
|---|
| 5941 | 6340 | struct btrfs_bio **bbio_ret, int mirror_num) |
|---|
| 5942 | 6341 | { |
|---|
| 6342 | + if (op == BTRFS_MAP_DISCARD) |
|---|
| 6343 | + return __btrfs_map_block_for_discard(fs_info, logical, |
|---|
| 6344 | + length, bbio_ret); |
|---|
| 6345 | + |
|---|
| 5943 | 6346 | return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, |
|---|
| 5944 | 6347 | mirror_num, 0); |
|---|
| 5945 | 6348 | } |
|---|
| .. | .. |
|---|
| 5950 | 6353 | struct btrfs_bio **bbio_ret) |
|---|
| 5951 | 6354 | { |
|---|
| 5952 | 6355 | return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); |
|---|
| 5953 | | -} |
|---|
| 5954 | | - |
|---|
| 5955 | | -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, |
|---|
| 5956 | | - u64 physical, u64 **logical, int *naddrs, int *stripe_len) |
|---|
| 5957 | | -{ |
|---|
| 5958 | | - struct extent_map *em; |
|---|
| 5959 | | - struct map_lookup *map; |
|---|
| 5960 | | - u64 *buf; |
|---|
| 5961 | | - u64 bytenr; |
|---|
| 5962 | | - u64 length; |
|---|
| 5963 | | - u64 stripe_nr; |
|---|
| 5964 | | - u64 rmap_len; |
|---|
| 5965 | | - int i, j, nr = 0; |
|---|
| 5966 | | - |
|---|
| 5967 | | - em = get_chunk_map(fs_info, chunk_start, 1); |
|---|
| 5968 | | - if (IS_ERR(em)) |
|---|
| 5969 | | - return -EIO; |
|---|
| 5970 | | - |
|---|
| 5971 | | - map = em->map_lookup; |
|---|
| 5972 | | - length = em->len; |
|---|
| 5973 | | - rmap_len = map->stripe_len; |
|---|
| 5974 | | - |
|---|
| 5975 | | - if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
|---|
| 5976 | | - length = div_u64(length, map->num_stripes / map->sub_stripes); |
|---|
| 5977 | | - else if (map->type & BTRFS_BLOCK_GROUP_RAID0) |
|---|
| 5978 | | - length = div_u64(length, map->num_stripes); |
|---|
| 5979 | | - else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { |
|---|
| 5980 | | - length = div_u64(length, nr_data_stripes(map)); |
|---|
| 5981 | | - rmap_len = map->stripe_len * nr_data_stripes(map); |
|---|
| 5982 | | - } |
|---|
| 5983 | | - |
|---|
| 5984 | | - buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); |
|---|
| 5985 | | - BUG_ON(!buf); /* -ENOMEM */ |
|---|
| 5986 | | - |
|---|
| 5987 | | - for (i = 0; i < map->num_stripes; i++) { |
|---|
| 5988 | | - if (map->stripes[i].physical > physical || |
|---|
| 5989 | | - map->stripes[i].physical + length <= physical) |
|---|
| 5990 | | - continue; |
|---|
| 5991 | | - |
|---|
| 5992 | | - stripe_nr = physical - map->stripes[i].physical; |
|---|
| 5993 | | - stripe_nr = div64_u64(stripe_nr, map->stripe_len); |
|---|
| 5994 | | - |
|---|
| 5995 | | - if (map->type & BTRFS_BLOCK_GROUP_RAID10) { |
|---|
| 5996 | | - stripe_nr = stripe_nr * map->num_stripes + i; |
|---|
| 5997 | | - stripe_nr = div_u64(stripe_nr, map->sub_stripes); |
|---|
| 5998 | | - } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
|---|
| 5999 | | - stripe_nr = stripe_nr * map->num_stripes + i; |
|---|
| 6000 | | - } /* else if RAID[56], multiply by nr_data_stripes(). |
|---|
| 6001 | | - * Alternatively, just use rmap_len below instead of |
|---|
| 6002 | | - * map->stripe_len */ |
|---|
| 6003 | | - |
|---|
| 6004 | | - bytenr = chunk_start + stripe_nr * rmap_len; |
|---|
| 6005 | | - WARN_ON(nr >= map->num_stripes); |
|---|
| 6006 | | - for (j = 0; j < nr; j++) { |
|---|
| 6007 | | - if (buf[j] == bytenr) |
|---|
| 6008 | | - break; |
|---|
| 6009 | | - } |
|---|
| 6010 | | - if (j == nr) { |
|---|
| 6011 | | - WARN_ON(nr >= map->num_stripes); |
|---|
| 6012 | | - buf[nr++] = bytenr; |
|---|
| 6013 | | - } |
|---|
| 6014 | | - } |
|---|
| 6015 | | - |
|---|
| 6016 | | - *logical = buf; |
|---|
| 6017 | | - *naddrs = nr; |
|---|
| 6018 | | - *stripe_len = rmap_len; |
|---|
| 6019 | | - |
|---|
| 6020 | | - free_extent_map(em); |
|---|
| 6021 | | - return 0; |
|---|
| 6022 | 6356 | } |
|---|
| 6023 | 6357 | |
|---|
| 6024 | 6358 | static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) |
|---|
| .. | .. |
|---|
| 6039 | 6373 | atomic_inc(&bbio->error); |
|---|
| 6040 | 6374 | if (bio->bi_status == BLK_STS_IOERR || |
|---|
| 6041 | 6375 | bio->bi_status == BLK_STS_TARGET) { |
|---|
| 6042 | | - unsigned int stripe_index = |
|---|
| 6043 | | - btrfs_io_bio(bio)->stripe_index; |
|---|
| 6044 | | - struct btrfs_device *dev; |
|---|
| 6376 | + struct btrfs_device *dev = btrfs_io_bio(bio)->device; |
|---|
| 6045 | 6377 | |
|---|
| 6046 | | - BUG_ON(stripe_index >= bbio->num_stripes); |
|---|
| 6047 | | - dev = bbio->stripes[stripe_index].dev; |
|---|
| 6048 | | - if (dev->bdev) { |
|---|
| 6049 | | - if (bio_op(bio) == REQ_OP_WRITE) |
|---|
| 6050 | | - btrfs_dev_stat_inc_and_print(dev, |
|---|
| 6378 | + ASSERT(dev->bdev); |
|---|
| 6379 | + if (bio_op(bio) == REQ_OP_WRITE) |
|---|
| 6380 | + btrfs_dev_stat_inc_and_print(dev, |
|---|
| 6051 | 6381 | BTRFS_DEV_STAT_WRITE_ERRS); |
|---|
| 6052 | | - else if (!(bio->bi_opf & REQ_RAHEAD)) |
|---|
| 6053 | | - btrfs_dev_stat_inc_and_print(dev, |
|---|
| 6382 | + else if (!(bio->bi_opf & REQ_RAHEAD)) |
|---|
| 6383 | + btrfs_dev_stat_inc_and_print(dev, |
|---|
| 6054 | 6384 | BTRFS_DEV_STAT_READ_ERRS); |
|---|
| 6055 | | - if (bio->bi_opf & REQ_PREFLUSH) |
|---|
| 6056 | | - btrfs_dev_stat_inc_and_print(dev, |
|---|
| 6385 | + if (bio->bi_opf & REQ_PREFLUSH) |
|---|
| 6386 | + btrfs_dev_stat_inc_and_print(dev, |
|---|
| 6057 | 6387 | BTRFS_DEV_STAT_FLUSH_ERRS); |
|---|
| 6058 | | - } |
|---|
| 6059 | 6388 | } |
|---|
| 6060 | 6389 | } |
|---|
| 6061 | 6390 | |
|---|
| .. | .. |
|---|
| 6090 | 6419 | } |
|---|
| 6091 | 6420 | } |
|---|
| 6092 | 6421 | |
|---|
| 6093 | | -/* |
|---|
| 6094 | | - * see run_scheduled_bios for a description of why bios are collected for |
|---|
| 6095 | | - * async submit. |
|---|
| 6096 | | - * |
|---|
| 6097 | | - * This will add one bio to the pending list for a device and make sure |
|---|
| 6098 | | - * the work struct is scheduled. |
|---|
| 6099 | | - */ |
|---|
| 6100 | | -static noinline void btrfs_schedule_bio(struct btrfs_device *device, |
|---|
| 6101 | | - struct bio *bio) |
|---|
| 6102 | | -{ |
|---|
| 6103 | | - struct btrfs_fs_info *fs_info = device->fs_info; |
|---|
| 6104 | | - int should_queue = 1; |
|---|
| 6105 | | - struct btrfs_pending_bios *pending_bios; |
|---|
| 6106 | | - |
|---|
| 6107 | | - /* don't bother with additional async steps for reads, right now */ |
|---|
| 6108 | | - if (bio_op(bio) == REQ_OP_READ) { |
|---|
| 6109 | | - btrfsic_submit_bio(bio); |
|---|
| 6110 | | - return; |
|---|
| 6111 | | - } |
|---|
| 6112 | | - |
|---|
| 6113 | | - WARN_ON(bio->bi_next); |
|---|
| 6114 | | - bio->bi_next = NULL; |
|---|
| 6115 | | - |
|---|
| 6116 | | - spin_lock(&device->io_lock); |
|---|
| 6117 | | - if (op_is_sync(bio->bi_opf)) |
|---|
| 6118 | | - pending_bios = &device->pending_sync_bios; |
|---|
| 6119 | | - else |
|---|
| 6120 | | - pending_bios = &device->pending_bios; |
|---|
| 6121 | | - |
|---|
| 6122 | | - if (pending_bios->tail) |
|---|
| 6123 | | - pending_bios->tail->bi_next = bio; |
|---|
| 6124 | | - |
|---|
| 6125 | | - pending_bios->tail = bio; |
|---|
| 6126 | | - if (!pending_bios->head) |
|---|
| 6127 | | - pending_bios->head = bio; |
|---|
| 6128 | | - if (device->running_pending) |
|---|
| 6129 | | - should_queue = 0; |
|---|
| 6130 | | - |
|---|
| 6131 | | - spin_unlock(&device->io_lock); |
|---|
| 6132 | | - |
|---|
| 6133 | | - if (should_queue) |
|---|
| 6134 | | - btrfs_queue_work(fs_info->submit_workers, &device->work); |
|---|
| 6135 | | -} |
|---|
| 6136 | | - |
|---|
| 6137 | 6422 | static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, |
|---|
| 6138 | | - u64 physical, int dev_nr, int async) |
|---|
| 6423 | + u64 physical, struct btrfs_device *dev) |
|---|
| 6139 | 6424 | { |
|---|
| 6140 | | - struct btrfs_device *dev = bbio->stripes[dev_nr].dev; |
|---|
| 6141 | 6425 | struct btrfs_fs_info *fs_info = bbio->fs_info; |
|---|
| 6142 | 6426 | |
|---|
| 6143 | 6427 | bio->bi_private = bbio; |
|---|
| 6144 | | - btrfs_io_bio(bio)->stripe_index = dev_nr; |
|---|
| 6428 | + btrfs_io_bio(bio)->device = dev; |
|---|
| 6145 | 6429 | bio->bi_end_io = btrfs_end_bio; |
|---|
| 6146 | 6430 | bio->bi_iter.bi_sector = physical >> 9; |
|---|
| 6147 | 6431 | btrfs_debug_in_rcu(fs_info, |
|---|
| 6148 | 6432 | "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", |
|---|
| 6149 | 6433 | bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, |
|---|
| 6150 | | - (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, |
|---|
| 6151 | | - bio->bi_iter.bi_size); |
|---|
| 6434 | + (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), |
|---|
| 6435 | + dev->devid, bio->bi_iter.bi_size); |
|---|
| 6152 | 6436 | bio_set_dev(bio, dev->bdev); |
|---|
| 6153 | 6437 | |
|---|
| 6154 | 6438 | btrfs_bio_counter_inc_noblocked(fs_info); |
|---|
| 6155 | 6439 | |
|---|
| 6156 | | - if (async) |
|---|
| 6157 | | - btrfs_schedule_bio(dev, bio); |
|---|
| 6158 | | - else |
|---|
| 6159 | | - btrfsic_submit_bio(bio); |
|---|
| 6440 | + btrfsic_submit_bio(bio); |
|---|
| 6160 | 6441 | } |
|---|
| 6161 | 6442 | |
|---|
| 6162 | 6443 | static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) |
|---|
| .. | .. |
|---|
| 6177 | 6458 | } |
|---|
| 6178 | 6459 | |
|---|
| 6179 | 6460 | blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, |
|---|
| 6180 | | - int mirror_num, int async_submit) |
|---|
| 6461 | + int mirror_num) |
|---|
| 6181 | 6462 | { |
|---|
| 6182 | 6463 | struct btrfs_device *dev; |
|---|
| 6183 | 6464 | struct bio *first_bio = bio; |
|---|
| .. | .. |
|---|
| 6245 | 6526 | else |
|---|
| 6246 | 6527 | bio = first_bio; |
|---|
| 6247 | 6528 | |
|---|
| 6248 | | - submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, |
|---|
| 6249 | | - dev_nr, async_submit); |
|---|
| 6529 | + submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); |
|---|
| 6250 | 6530 | } |
|---|
| 6251 | 6531 | btrfs_bio_counter_dec(fs_info); |
|---|
| 6252 | 6532 | return BLK_STS_OK; |
|---|
| .. | .. |
|---|
| 6262 | 6542 | * If @seed is true, traverse through the seed devices. |
|---|
| 6263 | 6543 | */ |
|---|
| 6264 | 6544 | struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, |
|---|
| 6265 | | - u64 devid, u8 *uuid, u8 *fsid, |
|---|
| 6266 | | - bool seed) |
|---|
| 6545 | + u64 devid, u8 *uuid, u8 *fsid, |
|---|
| 6546 | + bool seed) |
|---|
| 6267 | 6547 | { |
|---|
| 6268 | 6548 | struct btrfs_device *device; |
|---|
| 6549 | + struct btrfs_fs_devices *seed_devs; |
|---|
| 6269 | 6550 | |
|---|
| 6270 | | - while (fs_devices) { |
|---|
| 6551 | + if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { |
|---|
| 6552 | + list_for_each_entry(device, &fs_devices->devices, dev_list) { |
|---|
| 6553 | + if (device->devid == devid && |
|---|
| 6554 | + (!uuid || memcmp(device->uuid, uuid, |
|---|
| 6555 | + BTRFS_UUID_SIZE) == 0)) |
|---|
| 6556 | + return device; |
|---|
| 6557 | + } |
|---|
| 6558 | + } |
|---|
| 6559 | + |
|---|
| 6560 | + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { |
|---|
| 6271 | 6561 | if (!fsid || |
|---|
| 6272 | | - !memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) { |
|---|
| 6273 | | - list_for_each_entry(device, &fs_devices->devices, |
|---|
| 6562 | + !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { |
|---|
| 6563 | + list_for_each_entry(device, &seed_devs->devices, |
|---|
| 6274 | 6564 | dev_list) { |
|---|
| 6275 | 6565 | if (device->devid == devid && |
|---|
| 6276 | 6566 | (!uuid || memcmp(device->uuid, uuid, |
|---|
| .. | .. |
|---|
| 6278 | 6568 | return device; |
|---|
| 6279 | 6569 | } |
|---|
| 6280 | 6570 | } |
|---|
| 6281 | | - if (seed) |
|---|
| 6282 | | - fs_devices = fs_devices->seed; |
|---|
| 6283 | | - else |
|---|
| 6284 | | - return NULL; |
|---|
| 6285 | 6571 | } |
|---|
| 6572 | + |
|---|
| 6286 | 6573 | return NULL; |
|---|
| 6287 | 6574 | } |
|---|
| 6288 | 6575 | |
|---|
| .. | .. |
|---|
| 6337 | 6624 | if (WARN_ON(!devid && !fs_info)) |
|---|
| 6338 | 6625 | return ERR_PTR(-EINVAL); |
|---|
| 6339 | 6626 | |
|---|
| 6340 | | - dev = __alloc_device(); |
|---|
| 6627 | + dev = __alloc_device(fs_info); |
|---|
| 6341 | 6628 | if (IS_ERR(dev)) |
|---|
| 6342 | 6629 | return dev; |
|---|
| 6343 | 6630 | |
|---|
| .. | .. |
|---|
| 6359 | 6646 | else |
|---|
| 6360 | 6647 | generate_random_uuid(dev->uuid); |
|---|
| 6361 | 6648 | |
|---|
| 6362 | | - btrfs_init_work(&dev->work, btrfs_submit_helper, |
|---|
| 6363 | | - pending_bios_fn, NULL, NULL); |
|---|
| 6364 | | - |
|---|
| 6365 | 6649 | return dev; |
|---|
| 6366 | 6650 | } |
|---|
| 6367 | 6651 | |
|---|
| .. | .. |
|---|
| 6376 | 6660 | devid, uuid); |
|---|
| 6377 | 6661 | } |
|---|
| 6378 | 6662 | |
|---|
| 6379 | | -static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, |
|---|
| 6380 | | - struct extent_buffer *leaf, |
|---|
| 6663 | +static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) |
|---|
| 6664 | +{ |
|---|
| 6665 | + int index = btrfs_bg_flags_to_raid_index(type); |
|---|
| 6666 | + int ncopies = btrfs_raid_array[index].ncopies; |
|---|
| 6667 | + const int nparity = btrfs_raid_array[index].nparity; |
|---|
| 6668 | + int data_stripes; |
|---|
| 6669 | + |
|---|
| 6670 | + if (nparity) |
|---|
| 6671 | + data_stripes = num_stripes - nparity; |
|---|
| 6672 | + else |
|---|
| 6673 | + data_stripes = num_stripes / ncopies; |
|---|
| 6674 | + |
|---|
| 6675 | + return div_u64(chunk_len, data_stripes); |
|---|
| 6676 | +} |
|---|
| 6677 | + |
|---|
| 6678 | +static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, |
|---|
| 6381 | 6679 | struct btrfs_chunk *chunk) |
|---|
| 6382 | 6680 | { |
|---|
| 6383 | | - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; |
|---|
| 6681 | + struct btrfs_fs_info *fs_info = leaf->fs_info; |
|---|
| 6682 | + struct extent_map_tree *map_tree = &fs_info->mapping_tree; |
|---|
| 6384 | 6683 | struct map_lookup *map; |
|---|
| 6385 | 6684 | struct extent_map *em; |
|---|
| 6386 | 6685 | u64 logical; |
|---|
| .. | .. |
|---|
| 6400 | 6699 | * as chunk item in tree block is already verified by tree-checker. |
|---|
| 6401 | 6700 | */ |
|---|
| 6402 | 6701 | if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { |
|---|
| 6403 | | - ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); |
|---|
| 6702 | + ret = btrfs_check_chunk_valid(leaf, chunk, logical); |
|---|
| 6404 | 6703 | if (ret) |
|---|
| 6405 | 6704 | return ret; |
|---|
| 6406 | 6705 | } |
|---|
| 6407 | 6706 | |
|---|
| 6408 | | - read_lock(&map_tree->map_tree.lock); |
|---|
| 6409 | | - em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); |
|---|
| 6410 | | - read_unlock(&map_tree->map_tree.lock); |
|---|
| 6707 | + read_lock(&map_tree->lock); |
|---|
| 6708 | + em = lookup_extent_mapping(map_tree, logical, 1); |
|---|
| 6709 | + read_unlock(&map_tree->lock); |
|---|
| 6411 | 6710 | |
|---|
| 6412 | 6711 | /* already mapped? */ |
|---|
| 6413 | 6712 | if (em && em->start <= logical && em->start + em->len > logical) { |
|---|
| .. | .. |
|---|
| 6441 | 6740 | map->type = btrfs_chunk_type(leaf, chunk); |
|---|
| 6442 | 6741 | map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); |
|---|
| 6443 | 6742 | map->verified_stripes = 0; |
|---|
| 6743 | + em->orig_block_len = calc_stripe_length(map->type, em->len, |
|---|
| 6744 | + map->num_stripes); |
|---|
| 6444 | 6745 | for (i = 0; i < num_stripes; i++) { |
|---|
| 6445 | 6746 | map->stripes[i].physical = |
|---|
| 6446 | 6747 | btrfs_stripe_offset_nr(leaf, chunk, i); |
|---|
| .. | .. |
|---|
| 6449 | 6750 | btrfs_stripe_dev_uuid_nr(chunk, i), |
|---|
| 6450 | 6751 | BTRFS_UUID_SIZE); |
|---|
| 6451 | 6752 | map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, |
|---|
| 6452 | | - devid, uuid, NULL, true); |
|---|
| 6753 | + devid, uuid, NULL, true); |
|---|
| 6453 | 6754 | if (!map->stripes[i].dev && |
|---|
| 6454 | 6755 | !btrfs_test_opt(fs_info, DEGRADED)) { |
|---|
| 6455 | 6756 | free_extent_map(em); |
|---|
| .. | .. |
|---|
| 6474 | 6775 | |
|---|
| 6475 | 6776 | } |
|---|
| 6476 | 6777 | |
|---|
| 6477 | | - write_lock(&map_tree->map_tree.lock); |
|---|
| 6478 | | - ret = add_extent_mapping(&map_tree->map_tree, em, 0); |
|---|
| 6479 | | - write_unlock(&map_tree->map_tree.lock); |
|---|
| 6778 | + write_lock(&map_tree->lock); |
|---|
| 6779 | + ret = add_extent_mapping(map_tree, em, 0); |
|---|
| 6780 | + write_unlock(&map_tree->lock); |
|---|
| 6480 | 6781 | if (ret < 0) { |
|---|
| 6481 | 6782 | btrfs_err(fs_info, |
|---|
| 6482 | 6783 | "failed to add chunk map, start=%llu len=%llu: %d", |
|---|
| .. | .. |
|---|
| 6519 | 6820 | lockdep_assert_held(&uuid_mutex); |
|---|
| 6520 | 6821 | ASSERT(fsid); |
|---|
| 6521 | 6822 | |
|---|
| 6522 | | - fs_devices = fs_info->fs_devices->seed; |
|---|
| 6523 | | - while (fs_devices) { |
|---|
| 6823 | + /* This will match only for multi-device seed fs */ |
|---|
| 6824 | + list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) |
|---|
| 6524 | 6825 | if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) |
|---|
| 6525 | 6826 | return fs_devices; |
|---|
| 6526 | 6827 | |
|---|
| 6527 | | - fs_devices = fs_devices->seed; |
|---|
| 6528 | | - } |
|---|
| 6529 | 6828 | |
|---|
| 6530 | | - fs_devices = find_fsid(fsid); |
|---|
| 6829 | + fs_devices = find_fsid(fsid, NULL); |
|---|
| 6531 | 6830 | if (!fs_devices) { |
|---|
| 6532 | 6831 | if (!btrfs_test_opt(fs_info, DEGRADED)) |
|---|
| 6533 | 6832 | return ERR_PTR(-ENOENT); |
|---|
| 6534 | 6833 | |
|---|
| 6535 | | - fs_devices = alloc_fs_devices(fsid); |
|---|
| 6834 | + fs_devices = alloc_fs_devices(fsid, NULL); |
|---|
| 6536 | 6835 | if (IS_ERR(fs_devices)) |
|---|
| 6537 | 6836 | return fs_devices; |
|---|
| 6538 | 6837 | |
|---|
| 6539 | | - fs_devices->seeding = 1; |
|---|
| 6838 | + fs_devices->seeding = true; |
|---|
| 6540 | 6839 | fs_devices->opened = 1; |
|---|
| 6541 | 6840 | return fs_devices; |
|---|
| 6542 | 6841 | } |
|---|
| 6543 | 6842 | |
|---|
| 6843 | + /* |
|---|
| 6844 | + * Upon first call for a seed fs fsid, just create a private copy of the |
|---|
| 6845 | + * respective fs_devices and anchor it at fs_info->fs_devices->seed_list |
|---|
| 6846 | + */ |
|---|
| 6544 | 6847 | fs_devices = clone_fs_devices(fs_devices); |
|---|
| 6545 | 6848 | if (IS_ERR(fs_devices)) |
|---|
| 6546 | 6849 | return fs_devices; |
|---|
| .. | .. |
|---|
| 6548 | 6851 | ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); |
|---|
| 6549 | 6852 | if (ret) { |
|---|
| 6550 | 6853 | free_fs_devices(fs_devices); |
|---|
| 6551 | | - fs_devices = ERR_PTR(ret); |
|---|
| 6552 | | - goto out; |
|---|
| 6854 | + return ERR_PTR(ret); |
|---|
| 6553 | 6855 | } |
|---|
| 6554 | 6856 | |
|---|
| 6555 | 6857 | if (!fs_devices->seeding) { |
|---|
| 6556 | 6858 | close_fs_devices(fs_devices); |
|---|
| 6557 | 6859 | free_fs_devices(fs_devices); |
|---|
| 6558 | | - fs_devices = ERR_PTR(-EINVAL); |
|---|
| 6559 | | - goto out; |
|---|
| 6860 | + return ERR_PTR(-EINVAL); |
|---|
| 6560 | 6861 | } |
|---|
| 6561 | 6862 | |
|---|
| 6562 | | - fs_devices->seed = fs_info->fs_devices->seed; |
|---|
| 6563 | | - fs_info->fs_devices->seed = fs_devices; |
|---|
| 6564 | | -out: |
|---|
| 6863 | + list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); |
|---|
| 6864 | + |
|---|
| 6565 | 6865 | return fs_devices; |
|---|
| 6566 | 6866 | } |
|---|
| 6567 | 6867 | |
|---|
| 6568 | | -static int read_one_dev(struct btrfs_fs_info *fs_info, |
|---|
| 6569 | | - struct extent_buffer *leaf, |
|---|
| 6868 | +static int read_one_dev(struct extent_buffer *leaf, |
|---|
| 6570 | 6869 | struct btrfs_dev_item *dev_item) |
|---|
| 6571 | 6870 | { |
|---|
| 6871 | + struct btrfs_fs_info *fs_info = leaf->fs_info; |
|---|
| 6572 | 6872 | struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
|---|
| 6573 | 6873 | struct btrfs_device *device; |
|---|
| 6574 | 6874 | u64 devid; |
|---|
| .. | .. |
|---|
| 6582 | 6882 | read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), |
|---|
| 6583 | 6883 | BTRFS_FSID_SIZE); |
|---|
| 6584 | 6884 | |
|---|
| 6585 | | - if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { |
|---|
| 6885 | + if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { |
|---|
| 6586 | 6886 | fs_devices = open_seed_devices(fs_info, fs_uuid); |
|---|
| 6587 | 6887 | if (IS_ERR(fs_devices)) |
|---|
| 6588 | 6888 | return PTR_ERR(fs_devices); |
|---|
| .. | .. |
|---|
| 6725 | 7025 | sb_array_offset += len; |
|---|
| 6726 | 7026 | cur_offset += len; |
|---|
| 6727 | 7027 | |
|---|
| 6728 | | - if (key.type == BTRFS_CHUNK_ITEM_KEY) { |
|---|
| 6729 | | - chunk = (struct btrfs_chunk *)sb_array_offset; |
|---|
| 6730 | | - /* |
|---|
| 6731 | | - * At least one btrfs_chunk with one stripe must be |
|---|
| 6732 | | - * present, exact stripe count check comes afterwards |
|---|
| 6733 | | - */ |
|---|
| 6734 | | - len = btrfs_chunk_item_size(1); |
|---|
| 6735 | | - if (cur_offset + len > array_size) |
|---|
| 6736 | | - goto out_short_read; |
|---|
| 6737 | | - |
|---|
| 6738 | | - num_stripes = btrfs_chunk_num_stripes(sb, chunk); |
|---|
| 6739 | | - if (!num_stripes) { |
|---|
| 6740 | | - btrfs_err(fs_info, |
|---|
| 6741 | | - "invalid number of stripes %u in sys_array at offset %u", |
|---|
| 6742 | | - num_stripes, cur_offset); |
|---|
| 6743 | | - ret = -EIO; |
|---|
| 6744 | | - break; |
|---|
| 6745 | | - } |
|---|
| 6746 | | - |
|---|
| 6747 | | - type = btrfs_chunk_type(sb, chunk); |
|---|
| 6748 | | - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { |
|---|
| 6749 | | - btrfs_err(fs_info, |
|---|
| 6750 | | - "invalid chunk type %llu in sys_array at offset %u", |
|---|
| 6751 | | - type, cur_offset); |
|---|
| 6752 | | - ret = -EIO; |
|---|
| 6753 | | - break; |
|---|
| 6754 | | - } |
|---|
| 6755 | | - |
|---|
| 6756 | | - len = btrfs_chunk_item_size(num_stripes); |
|---|
| 6757 | | - if (cur_offset + len > array_size) |
|---|
| 6758 | | - goto out_short_read; |
|---|
| 6759 | | - |
|---|
| 6760 | | - ret = read_one_chunk(fs_info, &key, sb, chunk); |
|---|
| 6761 | | - if (ret) |
|---|
| 6762 | | - break; |
|---|
| 6763 | | - } else { |
|---|
| 7028 | + if (key.type != BTRFS_CHUNK_ITEM_KEY) { |
|---|
| 6764 | 7029 | btrfs_err(fs_info, |
|---|
| 6765 | 7030 | "unexpected item type %u in sys_array at offset %u", |
|---|
| 6766 | 7031 | (u32)key.type, cur_offset); |
|---|
| 6767 | 7032 | ret = -EIO; |
|---|
| 6768 | 7033 | break; |
|---|
| 6769 | 7034 | } |
|---|
| 7035 | + |
|---|
| 7036 | + chunk = (struct btrfs_chunk *)sb_array_offset; |
|---|
| 7037 | + /* |
|---|
| 7038 | + * At least one btrfs_chunk with one stripe must be present, |
|---|
| 7039 | + * exact stripe count check comes afterwards |
|---|
| 7040 | + */ |
|---|
| 7041 | + len = btrfs_chunk_item_size(1); |
|---|
| 7042 | + if (cur_offset + len > array_size) |
|---|
| 7043 | + goto out_short_read; |
|---|
| 7044 | + |
|---|
| 7045 | + num_stripes = btrfs_chunk_num_stripes(sb, chunk); |
|---|
| 7046 | + if (!num_stripes) { |
|---|
| 7047 | + btrfs_err(fs_info, |
|---|
| 7048 | + "invalid number of stripes %u in sys_array at offset %u", |
|---|
| 7049 | + num_stripes, cur_offset); |
|---|
| 7050 | + ret = -EIO; |
|---|
| 7051 | + break; |
|---|
| 7052 | + } |
|---|
| 7053 | + |
|---|
| 7054 | + type = btrfs_chunk_type(sb, chunk); |
|---|
| 7055 | + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { |
|---|
| 7056 | + btrfs_err(fs_info, |
|---|
| 7057 | + "invalid chunk type %llu in sys_array at offset %u", |
|---|
| 7058 | + type, cur_offset); |
|---|
| 7059 | + ret = -EIO; |
|---|
| 7060 | + break; |
|---|
| 7061 | + } |
|---|
| 7062 | + |
|---|
| 7063 | + len = btrfs_chunk_item_size(num_stripes); |
|---|
| 7064 | + if (cur_offset + len > array_size) |
|---|
| 7065 | + goto out_short_read; |
|---|
| 7066 | + |
|---|
| 7067 | + ret = read_one_chunk(&key, sb, chunk); |
|---|
| 7068 | + if (ret) |
|---|
| 7069 | + break; |
|---|
| 7070 | + |
|---|
| 6770 | 7071 | array_ptr += len; |
|---|
| 6771 | 7072 | sb_array_offset += len; |
|---|
| 6772 | 7073 | cur_offset += len; |
|---|
| .. | .. |
|---|
| 6794 | 7095 | bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, |
|---|
| 6795 | 7096 | struct btrfs_device *failing_dev) |
|---|
| 6796 | 7097 | { |
|---|
| 6797 | | - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; |
|---|
| 7098 | + struct extent_map_tree *map_tree = &fs_info->mapping_tree; |
|---|
| 6798 | 7099 | struct extent_map *em; |
|---|
| 6799 | 7100 | u64 next_start = 0; |
|---|
| 6800 | 7101 | bool ret = true; |
|---|
| 6801 | 7102 | |
|---|
| 6802 | | - read_lock(&map_tree->map_tree.lock); |
|---|
| 6803 | | - em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); |
|---|
| 6804 | | - read_unlock(&map_tree->map_tree.lock); |
|---|
| 7103 | + read_lock(&map_tree->lock); |
|---|
| 7104 | + em = lookup_extent_mapping(map_tree, 0, (u64)-1); |
|---|
| 7105 | + read_unlock(&map_tree->lock); |
|---|
| 6805 | 7106 | /* No chunk at all? Return false anyway */ |
|---|
| 6806 | 7107 | if (!em) { |
|---|
| 6807 | 7108 | ret = false; |
|---|
| .. | .. |
|---|
| 6830 | 7131 | if (missing > max_tolerated) { |
|---|
| 6831 | 7132 | if (!failing_dev) |
|---|
| 6832 | 7133 | btrfs_warn(fs_info, |
|---|
| 6833 | | - "chunk %llu missing %d devices, max tolerance is %d for writeable mount", |
|---|
| 7134 | + "chunk %llu missing %d devices, max tolerance is %d for writable mount", |
|---|
| 6834 | 7135 | em->start, missing, max_tolerated); |
|---|
| 6835 | 7136 | free_extent_map(em); |
|---|
| 6836 | 7137 | ret = false; |
|---|
| .. | .. |
|---|
| 6839 | 7140 | next_start = extent_map_end(em); |
|---|
| 6840 | 7141 | free_extent_map(em); |
|---|
| 6841 | 7142 | |
|---|
| 6842 | | - read_lock(&map_tree->map_tree.lock); |
|---|
| 6843 | | - em = lookup_extent_mapping(&map_tree->map_tree, next_start, |
|---|
| 7143 | + read_lock(&map_tree->lock); |
|---|
| 7144 | + em = lookup_extent_mapping(map_tree, next_start, |
|---|
| 6844 | 7145 | (u64)(-1) - next_start); |
|---|
| 6845 | | - read_unlock(&map_tree->map_tree.lock); |
|---|
| 7146 | + read_unlock(&map_tree->lock); |
|---|
| 6846 | 7147 | } |
|---|
| 6847 | 7148 | out: |
|---|
| 6848 | 7149 | return ret; |
|---|
| 7150 | +} |
|---|
| 7151 | + |
|---|
| 7152 | +static void readahead_tree_node_children(struct extent_buffer *node) |
|---|
| 7153 | +{ |
|---|
| 7154 | + int i; |
|---|
| 7155 | + const int nr_items = btrfs_header_nritems(node); |
|---|
| 7156 | + |
|---|
| 7157 | + for (i = 0; i < nr_items; i++) { |
|---|
| 7158 | + u64 start; |
|---|
| 7159 | + |
|---|
| 7160 | + start = btrfs_node_blockptr(node, i); |
|---|
| 7161 | + readahead_tree_block(node->fs_info, start); |
|---|
| 7162 | + } |
|---|
| 6849 | 7163 | } |
|---|
| 6850 | 7164 | |
|---|
| 6851 | 7165 | int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) |
|---|
| .. | .. |
|---|
| 6858 | 7172 | int ret; |
|---|
| 6859 | 7173 | int slot; |
|---|
| 6860 | 7174 | u64 total_dev = 0; |
|---|
| 7175 | + u64 last_ra_node = 0; |
|---|
| 6861 | 7176 | |
|---|
| 6862 | 7177 | path = btrfs_alloc_path(); |
|---|
| 6863 | 7178 | if (!path) |
|---|
| .. | .. |
|---|
| 6868 | 7183 | * otherwise we don't need it. |
|---|
| 6869 | 7184 | */ |
|---|
| 6870 | 7185 | mutex_lock(&uuid_mutex); |
|---|
| 6871 | | - mutex_lock(&fs_info->chunk_mutex); |
|---|
| 6872 | 7186 | |
|---|
| 6873 | 7187 | /* |
|---|
| 6874 | 7188 | * It is possible for mount and umount to race in such a way that |
|---|
| .. | .. |
|---|
| 6891 | 7205 | if (ret < 0) |
|---|
| 6892 | 7206 | goto error; |
|---|
| 6893 | 7207 | while (1) { |
|---|
| 7208 | + struct extent_buffer *node; |
|---|
| 7209 | + |
|---|
| 6894 | 7210 | leaf = path->nodes[0]; |
|---|
| 6895 | 7211 | slot = path->slots[0]; |
|---|
| 6896 | 7212 | if (slot >= btrfs_header_nritems(leaf)) { |
|---|
| .. | .. |
|---|
| 6901 | 7217 | goto error; |
|---|
| 6902 | 7218 | break; |
|---|
| 6903 | 7219 | } |
|---|
| 7220 | + /* |
|---|
| 7221 | + * The nodes on level 1 are not locked but we don't need to do |
|---|
| 7222 | + * that during mount time as nothing else can access the tree |
|---|
| 7223 | + */ |
|---|
| 7224 | + node = path->nodes[1]; |
|---|
| 7225 | + if (node) { |
|---|
| 7226 | + if (last_ra_node != node->start) { |
|---|
| 7227 | + readahead_tree_node_children(node); |
|---|
| 7228 | + last_ra_node = node->start; |
|---|
| 7229 | + } |
|---|
| 7230 | + } |
|---|
| 6904 | 7231 | btrfs_item_key_to_cpu(leaf, &found_key, slot); |
|---|
| 6905 | 7232 | if (found_key.type == BTRFS_DEV_ITEM_KEY) { |
|---|
| 6906 | 7233 | struct btrfs_dev_item *dev_item; |
|---|
| 6907 | 7234 | dev_item = btrfs_item_ptr(leaf, slot, |
|---|
| 6908 | 7235 | struct btrfs_dev_item); |
|---|
| 6909 | | - ret = read_one_dev(fs_info, leaf, dev_item); |
|---|
| 7236 | + ret = read_one_dev(leaf, dev_item); |
|---|
| 6910 | 7237 | if (ret) |
|---|
| 6911 | 7238 | goto error; |
|---|
| 6912 | 7239 | total_dev++; |
|---|
| 6913 | 7240 | } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { |
|---|
| 6914 | 7241 | struct btrfs_chunk *chunk; |
|---|
| 6915 | 7242 | chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); |
|---|
| 6916 | | - ret = read_one_chunk(fs_info, &found_key, leaf, chunk); |
|---|
| 7243 | + mutex_lock(&fs_info->chunk_mutex); |
|---|
| 7244 | + ret = read_one_chunk(&found_key, leaf, chunk); |
|---|
| 7245 | + mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 6917 | 7246 | if (ret) |
|---|
| 6918 | 7247 | goto error; |
|---|
| 6919 | 7248 | } |
|---|
| .. | .. |
|---|
| 6925 | 7254 | * do another round of validation checks. |
|---|
| 6926 | 7255 | */ |
|---|
| 6927 | 7256 | if (total_dev != fs_info->fs_devices->total_devices) { |
|---|
| 6928 | | - btrfs_err(fs_info, |
|---|
| 6929 | | - "super_num_devices %llu mismatch with num_devices %llu found here", |
|---|
| 7257 | + btrfs_warn(fs_info, |
|---|
| 7258 | +"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", |
|---|
| 6930 | 7259 | btrfs_super_num_devices(fs_info->super_copy), |
|---|
| 6931 | 7260 | total_dev); |
|---|
| 6932 | | - ret = -EINVAL; |
|---|
| 6933 | | - goto error; |
|---|
| 7261 | + fs_info->fs_devices->total_devices = total_dev; |
|---|
| 7262 | + btrfs_set_super_num_devices(fs_info->super_copy, total_dev); |
|---|
| 6934 | 7263 | } |
|---|
| 6935 | 7264 | if (btrfs_super_total_bytes(fs_info->super_copy) < |
|---|
| 6936 | 7265 | fs_info->fs_devices->total_rw_bytes) { |
|---|
| .. | .. |
|---|
| 6943 | 7272 | } |
|---|
| 6944 | 7273 | ret = 0; |
|---|
| 6945 | 7274 | error: |
|---|
| 6946 | | - mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 6947 | 7275 | mutex_unlock(&uuid_mutex); |
|---|
| 6948 | 7276 | |
|---|
| 6949 | 7277 | btrfs_free_path(path); |
|---|
| .. | .. |
|---|
| 6952 | 7280 | |
|---|
| 6953 | 7281 | void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) |
|---|
| 6954 | 7282 | { |
|---|
| 6955 | | - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
|---|
| 7283 | + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; |
|---|
| 6956 | 7284 | struct btrfs_device *device; |
|---|
| 6957 | 7285 | |
|---|
| 6958 | | - while (fs_devices) { |
|---|
| 6959 | | - mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 6960 | | - list_for_each_entry(device, &fs_devices->devices, dev_list) |
|---|
| 6961 | | - device->fs_info = fs_info; |
|---|
| 6962 | | - mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 7286 | + fs_devices->fs_info = fs_info; |
|---|
| 6963 | 7287 | |
|---|
| 6964 | | - fs_devices = fs_devices->seed; |
|---|
| 7288 | + mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 7289 | + list_for_each_entry(device, &fs_devices->devices, dev_list) |
|---|
| 7290 | + device->fs_info = fs_info; |
|---|
| 7291 | + |
|---|
| 7292 | + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { |
|---|
| 7293 | + list_for_each_entry(device, &seed_devs->devices, dev_list) |
|---|
| 7294 | + device->fs_info = fs_info; |
|---|
| 7295 | + |
|---|
| 7296 | + seed_devs->fs_info = fs_info; |
|---|
| 6965 | 7297 | } |
|---|
| 7298 | + mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 6966 | 7299 | } |
|---|
| 6967 | 7300 | |
|---|
| 6968 | | -static void __btrfs_reset_dev_stats(struct btrfs_device *dev) |
|---|
| 7301 | +static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, |
|---|
| 7302 | + const struct btrfs_dev_stats_item *ptr, |
|---|
| 7303 | + int index) |
|---|
| 6969 | 7304 | { |
|---|
| 6970 | | - int i; |
|---|
| 7305 | + u64 val; |
|---|
| 6971 | 7306 | |
|---|
| 6972 | | - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) |
|---|
| 6973 | | - btrfs_dev_stat_reset(dev, i); |
|---|
| 7307 | + read_extent_buffer(eb, &val, |
|---|
| 7308 | + offsetof(struct btrfs_dev_stats_item, values) + |
|---|
| 7309 | + ((unsigned long)ptr) + (index * sizeof(u64)), |
|---|
| 7310 | + sizeof(val)); |
|---|
| 7311 | + return val; |
|---|
| 7312 | +} |
|---|
| 7313 | + |
|---|
| 7314 | +static void btrfs_set_dev_stats_value(struct extent_buffer *eb, |
|---|
| 7315 | + struct btrfs_dev_stats_item *ptr, |
|---|
| 7316 | + int index, u64 val) |
|---|
| 7317 | +{ |
|---|
| 7318 | + write_extent_buffer(eb, &val, |
|---|
| 7319 | + offsetof(struct btrfs_dev_stats_item, values) + |
|---|
| 7320 | + ((unsigned long)ptr) + (index * sizeof(u64)), |
|---|
| 7321 | + sizeof(val)); |
|---|
| 7322 | +} |
|---|
| 7323 | + |
|---|
| 7324 | +static int btrfs_device_init_dev_stats(struct btrfs_device *device, |
|---|
| 7325 | + struct btrfs_path *path) |
|---|
| 7326 | +{ |
|---|
| 7327 | + struct btrfs_dev_stats_item *ptr; |
|---|
| 7328 | + struct extent_buffer *eb; |
|---|
| 7329 | + struct btrfs_key key; |
|---|
| 7330 | + int item_size; |
|---|
| 7331 | + int i, ret, slot; |
|---|
| 7332 | + |
|---|
| 7333 | + key.objectid = BTRFS_DEV_STATS_OBJECTID; |
|---|
| 7334 | + key.type = BTRFS_PERSISTENT_ITEM_KEY; |
|---|
| 7335 | + key.offset = device->devid; |
|---|
| 7336 | + ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); |
|---|
| 7337 | + if (ret) { |
|---|
| 7338 | + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) |
|---|
| 7339 | + btrfs_dev_stat_set(device, i, 0); |
|---|
| 7340 | + device->dev_stats_valid = 1; |
|---|
| 7341 | + btrfs_release_path(path); |
|---|
| 7342 | + return ret < 0 ? ret : 0; |
|---|
| 7343 | + } |
|---|
| 7344 | + slot = path->slots[0]; |
|---|
| 7345 | + eb = path->nodes[0]; |
|---|
| 7346 | + item_size = btrfs_item_size_nr(eb, slot); |
|---|
| 7347 | + |
|---|
| 7348 | + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); |
|---|
| 7349 | + |
|---|
| 7350 | + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { |
|---|
| 7351 | + if (item_size >= (1 + i) * sizeof(__le64)) |
|---|
| 7352 | + btrfs_dev_stat_set(device, i, |
|---|
| 7353 | + btrfs_dev_stats_value(eb, ptr, i)); |
|---|
| 7354 | + else |
|---|
| 7355 | + btrfs_dev_stat_set(device, i, 0); |
|---|
| 7356 | + } |
|---|
| 7357 | + |
|---|
| 7358 | + device->dev_stats_valid = 1; |
|---|
| 7359 | + btrfs_dev_stat_print_on_load(device); |
|---|
| 7360 | + btrfs_release_path(path); |
|---|
| 7361 | + |
|---|
| 7362 | + return 0; |
|---|
| 6974 | 7363 | } |
|---|
| 6975 | 7364 | |
|---|
| 6976 | 7365 | int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) |
|---|
| 6977 | 7366 | { |
|---|
| 6978 | | - struct btrfs_key key; |
|---|
| 6979 | | - struct btrfs_key found_key; |
|---|
| 6980 | | - struct btrfs_root *dev_root = fs_info->dev_root; |
|---|
| 6981 | | - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
|---|
| 6982 | | - struct extent_buffer *eb; |
|---|
| 6983 | | - int slot; |
|---|
| 6984 | | - int ret = 0; |
|---|
| 7367 | + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; |
|---|
| 6985 | 7368 | struct btrfs_device *device; |
|---|
| 6986 | 7369 | struct btrfs_path *path = NULL; |
|---|
| 6987 | | - int i; |
|---|
| 7370 | + int ret = 0; |
|---|
| 6988 | 7371 | |
|---|
| 6989 | 7372 | path = btrfs_alloc_path(); |
|---|
| 6990 | | - if (!path) { |
|---|
| 6991 | | - ret = -ENOMEM; |
|---|
| 6992 | | - goto out; |
|---|
| 6993 | | - } |
|---|
| 7373 | + if (!path) |
|---|
| 7374 | + return -ENOMEM; |
|---|
| 6994 | 7375 | |
|---|
| 6995 | 7376 | mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 6996 | 7377 | list_for_each_entry(device, &fs_devices->devices, dev_list) { |
|---|
| 6997 | | - int item_size; |
|---|
| 6998 | | - struct btrfs_dev_stats_item *ptr; |
|---|
| 6999 | | - |
|---|
| 7000 | | - key.objectid = BTRFS_DEV_STATS_OBJECTID; |
|---|
| 7001 | | - key.type = BTRFS_PERSISTENT_ITEM_KEY; |
|---|
| 7002 | | - key.offset = device->devid; |
|---|
| 7003 | | - ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); |
|---|
| 7004 | | - if (ret) { |
|---|
| 7005 | | - __btrfs_reset_dev_stats(device); |
|---|
| 7006 | | - device->dev_stats_valid = 1; |
|---|
| 7007 | | - btrfs_release_path(path); |
|---|
| 7008 | | - continue; |
|---|
| 7009 | | - } |
|---|
| 7010 | | - slot = path->slots[0]; |
|---|
| 7011 | | - eb = path->nodes[0]; |
|---|
| 7012 | | - btrfs_item_key_to_cpu(eb, &found_key, slot); |
|---|
| 7013 | | - item_size = btrfs_item_size_nr(eb, slot); |
|---|
| 7014 | | - |
|---|
| 7015 | | - ptr = btrfs_item_ptr(eb, slot, |
|---|
| 7016 | | - struct btrfs_dev_stats_item); |
|---|
| 7017 | | - |
|---|
| 7018 | | - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { |
|---|
| 7019 | | - if (item_size >= (1 + i) * sizeof(__le64)) |
|---|
| 7020 | | - btrfs_dev_stat_set(device, i, |
|---|
| 7021 | | - btrfs_dev_stats_value(eb, ptr, i)); |
|---|
| 7022 | | - else |
|---|
| 7023 | | - btrfs_dev_stat_reset(device, i); |
|---|
| 7024 | | - } |
|---|
| 7025 | | - |
|---|
| 7026 | | - device->dev_stats_valid = 1; |
|---|
| 7027 | | - btrfs_dev_stat_print_on_load(device); |
|---|
| 7028 | | - btrfs_release_path(path); |
|---|
| 7378 | + ret = btrfs_device_init_dev_stats(device, path); |
|---|
| 7379 | + if (ret) |
|---|
| 7380 | + goto out; |
|---|
| 7029 | 7381 | } |
|---|
| 7382 | + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { |
|---|
| 7383 | + list_for_each_entry(device, &seed_devs->devices, dev_list) { |
|---|
| 7384 | + ret = btrfs_device_init_dev_stats(device, path); |
|---|
| 7385 | + if (ret) |
|---|
| 7386 | + goto out; |
|---|
| 7387 | + } |
|---|
| 7388 | + } |
|---|
| 7389 | +out: |
|---|
| 7030 | 7390 | mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 7031 | 7391 | |
|---|
| 7032 | | -out: |
|---|
| 7033 | 7392 | btrfs_free_path(path); |
|---|
| 7034 | | - return ret < 0 ? ret : 0; |
|---|
| 7393 | + return ret; |
|---|
| 7035 | 7394 | } |
|---|
| 7036 | 7395 | |
|---|
| 7037 | 7396 | static int update_dev_stat_item(struct btrfs_trans_handle *trans, |
|---|
| .. | .. |
|---|
| 7102 | 7461 | /* |
|---|
| 7103 | 7462 | * called from commit_transaction. Writes all changed device stats to disk. |
|---|
| 7104 | 7463 | */ |
|---|
| 7105 | | -int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, |
|---|
| 7106 | | - struct btrfs_fs_info *fs_info) |
|---|
| 7464 | +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) |
|---|
| 7107 | 7465 | { |
|---|
| 7466 | + struct btrfs_fs_info *fs_info = trans->fs_info; |
|---|
| 7108 | 7467 | struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
|---|
| 7109 | 7468 | struct btrfs_device *device; |
|---|
| 7110 | 7469 | int stats_cnt; |
|---|
| .. | .. |
|---|
| 7187 | 7546 | int i; |
|---|
| 7188 | 7547 | |
|---|
| 7189 | 7548 | mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 7190 | | - dev = btrfs_find_device(fs_info->fs_devices, stats->devid, |
|---|
| 7191 | | - NULL, NULL, true); |
|---|
| 7549 | + dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL, |
|---|
| 7550 | + true); |
|---|
| 7192 | 7551 | mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 7193 | 7552 | |
|---|
| 7194 | 7553 | if (!dev) { |
|---|
| .. | .. |
|---|
| 7203 | 7562 | stats->values[i] = |
|---|
| 7204 | 7563 | btrfs_dev_stat_read_and_reset(dev, i); |
|---|
| 7205 | 7564 | else |
|---|
| 7206 | | - btrfs_dev_stat_reset(dev, i); |
|---|
| 7565 | + btrfs_dev_stat_set(dev, i, 0); |
|---|
| 7207 | 7566 | } |
|---|
| 7208 | 7567 | btrfs_info(fs_info, "device stats zeroed by %s (%d)", |
|---|
| 7209 | 7568 | current->comm, task_pid_nr(current)); |
|---|
| .. | .. |
|---|
| 7217 | 7576 | return 0; |
|---|
| 7218 | 7577 | } |
|---|
| 7219 | 7578 | |
|---|
| 7220 | | -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) |
|---|
| 7221 | | -{ |
|---|
| 7222 | | - struct buffer_head *bh; |
|---|
| 7223 | | - struct btrfs_super_block *disk_super; |
|---|
| 7224 | | - int copy_num; |
|---|
| 7225 | | - |
|---|
| 7226 | | - if (!bdev) |
|---|
| 7227 | | - return; |
|---|
| 7228 | | - |
|---|
| 7229 | | - for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; |
|---|
| 7230 | | - copy_num++) { |
|---|
| 7231 | | - |
|---|
| 7232 | | - if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) |
|---|
| 7233 | | - continue; |
|---|
| 7234 | | - |
|---|
| 7235 | | - disk_super = (struct btrfs_super_block *)bh->b_data; |
|---|
| 7236 | | - |
|---|
| 7237 | | - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); |
|---|
| 7238 | | - set_buffer_dirty(bh); |
|---|
| 7239 | | - sync_dirty_buffer(bh); |
|---|
| 7240 | | - brelse(bh); |
|---|
| 7241 | | - } |
|---|
| 7242 | | - |
|---|
| 7243 | | - /* Notify udev that device has changed */ |
|---|
| 7244 | | - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); |
|---|
| 7245 | | - |
|---|
| 7246 | | - /* Update ctime/mtime for device path for libblkid */ |
|---|
| 7247 | | - update_dev_time(device_path); |
|---|
| 7248 | | -} |
|---|
| 7249 | | - |
|---|
| 7250 | 7579 | /* |
|---|
| 7251 | | - * Update the size of all devices, which is used for writing out the |
|---|
| 7252 | | - * super blocks. |
|---|
| 7580 | + * Update the size and bytes used for each device where it changed. This is |
|---|
| 7581 | + * delayed since we would otherwise get errors while writing out the |
|---|
| 7582 | + * superblocks. |
|---|
| 7583 | + * |
|---|
| 7584 | + * Must be invoked during transaction commit. |
|---|
| 7253 | 7585 | */ |
|---|
| 7254 | | -void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) |
|---|
| 7586 | +void btrfs_commit_device_sizes(struct btrfs_transaction *trans) |
|---|
| 7255 | 7587 | { |
|---|
| 7256 | | - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
|---|
| 7257 | 7588 | struct btrfs_device *curr, *next; |
|---|
| 7258 | 7589 | |
|---|
| 7259 | | - if (list_empty(&fs_devices->resized_devices)) |
|---|
| 7590 | + ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); |
|---|
| 7591 | + |
|---|
| 7592 | + if (list_empty(&trans->dev_update_list)) |
|---|
| 7260 | 7593 | return; |
|---|
| 7261 | 7594 | |
|---|
| 7262 | | - mutex_lock(&fs_devices->device_list_mutex); |
|---|
| 7263 | | - mutex_lock(&fs_info->chunk_mutex); |
|---|
| 7264 | | - list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, |
|---|
| 7265 | | - resized_list) { |
|---|
| 7266 | | - list_del_init(&curr->resized_list); |
|---|
| 7595 | + /* |
|---|
| 7596 | + * We don't need the device_list_mutex here. This list is owned by the |
|---|
| 7597 | + * transaction and the transaction must complete before the device is |
|---|
| 7598 | + * released. |
|---|
| 7599 | + */ |
|---|
| 7600 | + mutex_lock(&trans->fs_info->chunk_mutex); |
|---|
| 7601 | + list_for_each_entry_safe(curr, next, &trans->dev_update_list, |
|---|
| 7602 | + post_commit_list) { |
|---|
| 7603 | + list_del_init(&curr->post_commit_list); |
|---|
| 7267 | 7604 | curr->commit_total_bytes = curr->disk_total_bytes; |
|---|
| 7605 | + curr->commit_bytes_used = curr->bytes_used; |
|---|
| 7268 | 7606 | } |
|---|
| 7269 | | - mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 7270 | | - mutex_unlock(&fs_devices->device_list_mutex); |
|---|
| 7271 | | -} |
|---|
| 7272 | | - |
|---|
| 7273 | | -/* Must be invoked during the transaction commit */ |
|---|
| 7274 | | -void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) |
|---|
| 7275 | | -{ |
|---|
| 7276 | | - struct btrfs_fs_info *fs_info = trans->fs_info; |
|---|
| 7277 | | - struct extent_map *em; |
|---|
| 7278 | | - struct map_lookup *map; |
|---|
| 7279 | | - struct btrfs_device *dev; |
|---|
| 7280 | | - int i; |
|---|
| 7281 | | - |
|---|
| 7282 | | - if (list_empty(&trans->pending_chunks)) |
|---|
| 7283 | | - return; |
|---|
| 7284 | | - |
|---|
| 7285 | | - /* In order to kick the device replace finish process */ |
|---|
| 7286 | | - mutex_lock(&fs_info->chunk_mutex); |
|---|
| 7287 | | - list_for_each_entry(em, &trans->pending_chunks, list) { |
|---|
| 7288 | | - map = em->map_lookup; |
|---|
| 7289 | | - |
|---|
| 7290 | | - for (i = 0; i < map->num_stripes; i++) { |
|---|
| 7291 | | - dev = map->stripes[i].dev; |
|---|
| 7292 | | - dev->commit_bytes_used = dev->bytes_used; |
|---|
| 7293 | | - dev->has_pending_chunks = false; |
|---|
| 7294 | | - } |
|---|
| 7295 | | - } |
|---|
| 7296 | | - mutex_unlock(&fs_info->chunk_mutex); |
|---|
| 7297 | | -} |
|---|
| 7298 | | - |
|---|
| 7299 | | -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) |
|---|
| 7300 | | -{ |
|---|
| 7301 | | - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
|---|
| 7302 | | - while (fs_devices) { |
|---|
| 7303 | | - fs_devices->fs_info = fs_info; |
|---|
| 7304 | | - fs_devices = fs_devices->seed; |
|---|
| 7305 | | - } |
|---|
| 7306 | | -} |
|---|
| 7307 | | - |
|---|
| 7308 | | -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) |
|---|
| 7309 | | -{ |
|---|
| 7310 | | - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
|---|
| 7311 | | - while (fs_devices) { |
|---|
| 7312 | | - fs_devices->fs_info = NULL; |
|---|
| 7313 | | - fs_devices = fs_devices->seed; |
|---|
| 7314 | | - } |
|---|
| 7607 | + mutex_unlock(&trans->fs_info->chunk_mutex); |
|---|
| 7315 | 7608 | } |
|---|
| 7316 | 7609 | |
|---|
| 7317 | 7610 | /* |
|---|
| .. | .. |
|---|
| 7319 | 7612 | */ |
|---|
| 7320 | 7613 | int btrfs_bg_type_to_factor(u64 flags) |
|---|
| 7321 | 7614 | { |
|---|
| 7322 | | - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | |
|---|
| 7323 | | - BTRFS_BLOCK_GROUP_RAID10)) |
|---|
| 7324 | | - return 2; |
|---|
| 7325 | | - return 1; |
|---|
| 7615 | + const int index = btrfs_bg_flags_to_raid_index(flags); |
|---|
| 7616 | + |
|---|
| 7617 | + return btrfs_raid_array[index].ncopies; |
|---|
| 7326 | 7618 | } |
|---|
| 7327 | 7619 | |
|---|
| 7328 | 7620 | |
|---|
| 7329 | | -static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) |
|---|
| 7330 | | -{ |
|---|
| 7331 | | - int index = btrfs_bg_flags_to_raid_index(type); |
|---|
| 7332 | | - int ncopies = btrfs_raid_array[index].ncopies; |
|---|
| 7333 | | - int data_stripes; |
|---|
| 7334 | | - |
|---|
| 7335 | | - switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
|---|
| 7336 | | - case BTRFS_BLOCK_GROUP_RAID5: |
|---|
| 7337 | | - data_stripes = num_stripes - 1; |
|---|
| 7338 | | - break; |
|---|
| 7339 | | - case BTRFS_BLOCK_GROUP_RAID6: |
|---|
| 7340 | | - data_stripes = num_stripes - 2; |
|---|
| 7341 | | - break; |
|---|
| 7342 | | - default: |
|---|
| 7343 | | - data_stripes = num_stripes / ncopies; |
|---|
| 7344 | | - break; |
|---|
| 7345 | | - } |
|---|
| 7346 | | - return div_u64(chunk_len, data_stripes); |
|---|
| 7347 | | -} |
|---|
| 7348 | 7621 | |
|---|
| 7349 | 7622 | static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, |
|---|
| 7350 | 7623 | u64 chunk_offset, u64 devid, |
|---|
| 7351 | 7624 | u64 physical_offset, u64 physical_len) |
|---|
| 7352 | 7625 | { |
|---|
| 7353 | | - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; |
|---|
| 7626 | + struct extent_map_tree *em_tree = &fs_info->mapping_tree; |
|---|
| 7354 | 7627 | struct extent_map *em; |
|---|
| 7355 | 7628 | struct map_lookup *map; |
|---|
| 7356 | 7629 | struct btrfs_device *dev; |
|---|
| .. | .. |
|---|
| 7414 | 7687 | |
|---|
| 7415 | 7688 | /* It's possible this device is a dummy for seed device */ |
|---|
| 7416 | 7689 | if (dev->disk_total_bytes == 0) { |
|---|
| 7417 | | - dev = btrfs_find_device(fs_info->fs_devices->seed, devid, |
|---|
| 7418 | | - NULL, NULL, false); |
|---|
| 7690 | + struct btrfs_fs_devices *devs; |
|---|
| 7691 | + |
|---|
| 7692 | + devs = list_first_entry(&fs_info->fs_devices->seed_list, |
|---|
| 7693 | + struct btrfs_fs_devices, seed_list); |
|---|
| 7694 | + dev = btrfs_find_device(devs, devid, NULL, NULL, false); |
|---|
| 7419 | 7695 | if (!dev) { |
|---|
| 7420 | 7696 | btrfs_err(fs_info, "failed to find seed devid %llu", |
|---|
| 7421 | 7697 | devid); |
|---|
| .. | .. |
|---|
| 7439 | 7715 | |
|---|
| 7440 | 7716 | static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) |
|---|
| 7441 | 7717 | { |
|---|
| 7442 | | - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; |
|---|
| 7718 | + struct extent_map_tree *em_tree = &fs_info->mapping_tree; |
|---|
| 7443 | 7719 | struct extent_map *em; |
|---|
| 7444 | 7720 | struct rb_node *node; |
|---|
| 7445 | 7721 | int ret = 0; |
|---|
| 7446 | 7722 | |
|---|
| 7447 | 7723 | read_lock(&em_tree->lock); |
|---|
| 7448 | | - for (node = rb_first(&em_tree->map); node; node = rb_next(node)) { |
|---|
| 7724 | + for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { |
|---|
| 7449 | 7725 | em = rb_entry(node, struct extent_map, rb_node); |
|---|
| 7450 | 7726 | if (em->map_lookup->num_stripes != |
|---|
| 7451 | 7727 | em->map_lookup->verified_stripes) { |
|---|
| .. | .. |
|---|
| 7551 | 7827 | btrfs_free_path(path); |
|---|
| 7552 | 7828 | return ret; |
|---|
| 7553 | 7829 | } |
|---|
| 7830 | + |
|---|
| 7831 | +/* |
|---|
| 7832 | + * Check whether the given block group or device is pinned by any inode being |
|---|
| 7833 | + * used as a swapfile. |
|---|
| 7834 | + */ |
|---|
| 7835 | +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) |
|---|
| 7836 | +{ |
|---|
| 7837 | + struct btrfs_swapfile_pin *sp; |
|---|
| 7838 | + struct rb_node *node; |
|---|
| 7839 | + |
|---|
| 7840 | + spin_lock(&fs_info->swapfile_pins_lock); |
|---|
| 7841 | + node = fs_info->swapfile_pins.rb_node; |
|---|
| 7842 | + while (node) { |
|---|
| 7843 | + sp = rb_entry(node, struct btrfs_swapfile_pin, node); |
|---|
| 7844 | + if (ptr < sp->ptr) |
|---|
| 7845 | + node = node->rb_left; |
|---|
| 7846 | + else if (ptr > sp->ptr) |
|---|
| 7847 | + node = node->rb_right; |
|---|
| 7848 | + else |
|---|
| 7849 | + break; |
|---|
| 7850 | + } |
|---|
| 7851 | + spin_unlock(&fs_info->swapfile_pins_lock); |
|---|
| 7852 | + return node != NULL; |
|---|
| 7853 | +} |
|---|