hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/fs/btrfs/volumes.c
@@ -7,7 +7,6 @@
 #include <linux/sched/mm.h>
 #include <linux/bio.h>
 #include <linux/slab.h>
-#include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
@@ -15,6 +14,8 @@
 #include <linux/semaphore.h>
 #include <linux/uuid.h>
 #include <linux/list_sort.h>
+#include <linux/namei.h>
+#include "misc.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -25,10 +26,12 @@
 #include "async-thread.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
-#include "math.h"
 #include "dev-replace.h"
 #include "sysfs.h"
 #include "tree-checker.h"
+#include "space-info.h"
+#include "block-group.h"
+#include "discard.h"

 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	[BTRFS_RAID_RAID10] = {
@@ -39,6 +42,7 @@
 		.tolerated_failures = 1,
 		.devs_increment = 2,
 		.ncopies = 2,
+		.nparity = 0,
 		.raid_name = "raid10",
 		.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
 		.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
@@ -51,9 +55,36 @@
 		.tolerated_failures = 1,
 		.devs_increment = 2,
 		.ncopies = 2,
+		.nparity = 0,
 		.raid_name = "raid1",
 		.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
 		.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+	},
+	[BTRFS_RAID_RAID1C3] = {
+		.sub_stripes = 1,
+		.dev_stripes = 1,
+		.devs_max = 3,
+		.devs_min = 3,
+		.tolerated_failures = 2,
+		.devs_increment = 3,
+		.ncopies = 3,
+		.nparity = 0,
+		.raid_name = "raid1c3",
+		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
+		.mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
+	},
+	[BTRFS_RAID_RAID1C4] = {
+		.sub_stripes = 1,
+		.dev_stripes = 1,
+		.devs_max = 4,
+		.devs_min = 4,
+		.tolerated_failures = 3,
+		.devs_increment = 4,
+		.ncopies = 4,
+		.nparity = 0,
+		.raid_name = "raid1c4",
+		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
+		.mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
 	},
 	[BTRFS_RAID_DUP] = {
 		.sub_stripes = 1,
@@ -63,6 +94,7 @@
 		.tolerated_failures = 0,
 		.devs_increment = 1,
 		.ncopies = 2,
+		.nparity = 0,
 		.raid_name = "dup",
 		.bg_flag = BTRFS_BLOCK_GROUP_DUP,
 		.mindev_error = 0,
@@ -75,6 +107,7 @@
 		.tolerated_failures = 0,
 		.devs_increment = 1,
 		.ncopies = 1,
+		.nparity = 0,
 		.raid_name = "raid0",
 		.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
 		.mindev_error = 0,
@@ -87,6 +120,7 @@
 		.tolerated_failures = 0,
 		.devs_increment = 1,
 		.ncopies = 1,
+		.nparity = 0,
 		.raid_name = "single",
 		.bg_flag = 0,
 		.mindev_error = 0,
@@ -99,6 +133,7 @@
 		.tolerated_failures = 1,
 		.devs_increment = 1,
 		.ncopies = 1,
+		.nparity = 1,
 		.raid_name = "raid5",
 		.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
 		.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
@@ -111,24 +146,79 @@
 		.tolerated_failures = 2,
 		.devs_increment = 1,
 		.ncopies = 1,
+		.nparity = 2,
 		.raid_name = "raid6",
 		.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
 		.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 	},
 };

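The nparity field added throughout this table records how many stripes of a chunk hold parity rather than data (1 for raid5, 2 for raid6, 0 everywhere else). As a rough illustration of why the allocator wants nparity next to ncopies, here is a minimal standalone C sketch (not part of the patch; the helper and values are illustrative) of how usable chunk capacity falls out of the two fields:

#include <stdio.h>
#include <stdint.h>

/* Illustrative mirror of the fields used by btrfs_raid_array. */
struct raid_attr {
	const char *name;
	int ncopies;	/* how many copies of the data are stored */
	int nparity;	/* stripes holding parity, not data (raid5/6) */
};

/* Hypothetical helper: usable bytes of a chunk striped over num_stripes
 * devices contributing stripe_len bytes each. Parity stripes store no
 * file data; the remaining stripes are divided among the copies. */
static uint64_t usable_bytes(const struct raid_attr *a, int num_stripes,
			     uint64_t stripe_len)
{
	return (uint64_t)(num_stripes - a->nparity) * stripe_len / a->ncopies;
}

int main(void)
{
	struct raid_attr raid6 = { "raid6", 1, 2 };
	struct raid_attr raid1c3 = { "raid1c3", 3, 0 };

	printf("%s: %llu\n", raid6.name,
	       (unsigned long long)usable_bytes(&raid6, 6, 1 << 20));
	printf("%s: %llu\n", raid1c3.name,
	       (unsigned long long)usable_bytes(&raid1c3, 3, 1 << 20));
	return 0;
}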
-const char *get_raid_name(enum btrfs_raid_types type)
+const char *btrfs_bg_type_to_raid_name(u64 flags)
 {
-	if (type >= BTRFS_NR_RAID_TYPES)
+	const int index = btrfs_bg_flags_to_raid_index(flags);
+
+	if (index >= BTRFS_NR_RAID_TYPES)
 		return NULL;

-	return btrfs_raid_array[type].raid_name;
+	return btrfs_raid_array[index].raid_name;
 }

-static int init_first_rw_device(struct btrfs_trans_handle *trans,
-				struct btrfs_fs_info *fs_info);
+/*
+ * Fill @buf with textual description of @bg_flags, no more than @size_buf
+ * bytes including the terminating null byte.
+ */
+void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
+{
+	int i;
+	int ret;
+	char *bp = buf;
+	u64 flags = bg_flags;
+	u32 size_bp = size_buf;
+
+	if (!flags) {
+		strcpy(bp, "NONE");
+		return;
+	}
+
+#define DESCRIBE_FLAG(flag, desc)					\
+	do {								\
+		if (flags & (flag)) {					\
+			ret = snprintf(bp, size_bp, "%s|", (desc));	\
+			if (ret < 0 || ret >= size_bp)			\
+				goto out_overflow;			\
+			size_bp -= ret;					\
+			bp += ret;					\
+			flags &= ~(flag);				\
+		}							\
+	} while (0)
+
+	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
+	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
+	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+
+	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
+			      btrfs_raid_array[i].raid_name);
+#undef DESCRIBE_FLAG
+
+	if (flags) {
+		ret = snprintf(bp, size_bp, "0x%llx|", flags);
+		size_bp -= ret;
+	}
+
+	if (size_bp < size_buf)
+		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
+
+	/*
+	 * The text is trimmed, it's up to the caller to provide a
+	 * sufficiently large buffer.
+	 */
+out_overflow:;
+}
+
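The DESCRIBE_FLAG macro consumes each known bit and appends "name|"; leftover bits are then printed as hex and the trailing '|' is stripped. A self-contained userspace model of the same pattern (flag values and names are assumptions for the demo, not the kernel's):

#include <stdio.h>
#include <stdint.h>

/* Userspace model of DESCRIBE_FLAG: consume known bits, emit "name|"
 * tokens, dump any leftover bits as hex, then drop the trailing '|'. */
static void describe(uint64_t flags, char *buf, unsigned size_buf)
{
	static const struct { uint64_t bit; const char *name; } table[] = {
		{ 1ULL << 0, "data" }, { 1ULL << 1, "system" },
		{ 1ULL << 2, "metadata" },
	};
	char *bp = buf;
	unsigned size_bp = size_buf;
	int ret;

	for (unsigned i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		if (!(flags & table[i].bit))
			continue;
		ret = snprintf(bp, size_bp, "%s|", table[i].name);
		if (ret < 0 || (unsigned)ret >= size_bp)
			return;		/* trimmed, like out_overflow */
		bp += ret;
		size_bp -= ret;
		flags &= ~table[i].bit;
	}
	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", (unsigned long long)flags);
		if (ret < 0 || (unsigned)ret >= size_bp)
			return;
		size_bp -= ret;
	}
	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0';	/* remove last '|' */
}

int main(void)
{
	char buf[64];

	describe((1ULL << 0) | (1ULL << 2) | (1ULL << 40), buf, sizeof(buf));
	puts(buf);	/* data|metadata|0x10000000000 */
	return 0;
}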
+static int init_first_rw_device(struct btrfs_trans_handle *trans);
 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
@@ -153,7 +243,7 @@
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
- * seeding, structure cloning, openning/closing devices at mount/umount time
+ * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
@@ -183,7 +273,9 @@
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
- * device is added/removed
+ * device is added/removed. Additionally it also protects post_commit_list of
+ * individual devices, since they can be added to the transaction's
+ * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
@@ -195,14 +287,13 @@
 * ============
 *
 * uuid_mutex
- *   volume_mutex
- *     device_list_mutex
- *       chunk_mutex
- *     balance_mutex
+ *   device_list_mutex
+ *     chunk_mutex
+ *   balance_mutex
 *
 *
- * Exclusive operations, BTRFS_FS_EXCL_OP
- * ======================================
+ * Exclusive operations
+ * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
@@ -228,30 +319,32 @@
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
- * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
- * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * The status of exclusive operation is set and cleared atomically.
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
- * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */
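The nesting list above (uuid_mutex outermost, device_list_mutex inside it, chunk_mutex innermost) only avoids deadlock if every path acquires in the same order. A tiny pthread-based model of the rule, purely illustrative and not kernel code:

#include <pthread.h>

/* Userspace model of the documented lock order: always take the outer
 * mutex before the inner one, never the reverse. */
static pthread_mutex_t uuid_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t device_list_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t chunk_mutex = PTHREAD_MUTEX_INITIALIZER;

static void device_op(void)
{
	pthread_mutex_lock(&uuid_mutex);		/* outermost */
	pthread_mutex_lock(&device_list_mutex);
	pthread_mutex_lock(&chunk_mutex);		/* innermost */

	/* ... modify devices and chunks ... */

	pthread_mutex_unlock(&chunk_mutex);
	pthread_mutex_unlock(&device_list_mutex);
	pthread_mutex_unlock(&uuid_mutex);
}

int main(void)
{
	device_op();
	return 0;
}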

 DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
-struct list_head *btrfs_get_fs_uuids(void)
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
 {
 	return &fs_uuids;
 }

 /*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid: if not NULL, copy the uuid to fs_devices::fsid
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid
+ * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
+						 const u8 *metadata_fsid)
 {
 	struct btrfs_fs_devices *fs_devs;

@@ -262,18 +355,25 @@
 	mutex_init(&fs_devs->device_list_mutex);

 	INIT_LIST_HEAD(&fs_devs->devices);
-	INIT_LIST_HEAD(&fs_devs->resized_devices);
 	INIT_LIST_HEAD(&fs_devs->alloc_list);
 	INIT_LIST_HEAD(&fs_devs->fs_list);
+	INIT_LIST_HEAD(&fs_devs->seed_list);
 	if (fsid)
 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+
+	if (metadata_fsid)
+		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
+	else if (fsid)
+		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

 	return fs_devs;
 }

 void btrfs_free_device(struct btrfs_device *device)
 {
+	WARN_ON(!list_empty(&device->post_commit_list));
 	rcu_string_free(device->name);
+	extent_io_tree_release(&device->alloc_state);
 	bio_put(device->flush_bio);
 	kfree(device);
 }
@@ -281,6 +381,7 @@
 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct btrfs_device *device;
+
 	WARN_ON(fs_devices->opened);
 	while (!list_empty(&fs_devices->devices)) {
 		device = list_entry(fs_devices->devices.next,
@@ -289,19 +390,6 @@
 		btrfs_free_device(device);
 	}
 	kfree(fs_devices);
-}
-
-static void btrfs_kobject_uevent(struct block_device *bdev,
-				 enum kobject_action action)
-{
-	int ret;
-
-	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
-	if (ret)
-		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
-			action,
-			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
-			&disk_to_dev(bdev->bd_disk)->kobj);
 }

 void __exit btrfs_cleanup_fs_uuids(void)
@@ -321,7 +409,7 @@
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
-static struct btrfs_device *__alloc_device(void)
+static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_device *dev;

@@ -341,34 +429,86 @@

 	INIT_LIST_HEAD(&dev->dev_list);
 	INIT_LIST_HEAD(&dev->dev_alloc_list);
-	INIT_LIST_HEAD(&dev->resized_list);
-
-	spin_lock_init(&dev->io_lock);
+	INIT_LIST_HEAD(&dev->post_commit_list);

 	atomic_set(&dev->reada_in_flight, 0);
 	atomic_set(&dev->dev_stats_ccnt, 0);
 	btrfs_device_data_ordered_init(dev);
 	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+	extent_io_tree_init(fs_info, &dev->alloc_state,
+			    IO_TREE_DEVICE_ALLOC_STATE, NULL);

 	return dev;
 }

-static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
+static noinline struct btrfs_fs_devices *find_fsid(
+		const u8 *fsid, const u8 *metadata_fsid)
 {
 	struct btrfs_fs_devices *fs_devices;

+	ASSERT(fsid);
+
+	/* Handle non-split brain cases */
 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
-		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
-			return fs_devices;
+		if (metadata_fsid) {
+			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
+			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
+				      BTRFS_FSID_SIZE) == 0)
+				return fs_devices;
+		} else {
+			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+				return fs_devices;
+		}
 	}
 	return NULL;
 }

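find_fsid() now takes an optional metadata_fsid: when non-NULL both UUIDs must match, otherwise only fsid is compared. A compact userspace model of that predicate (FSID_SIZE and the sample values are assumptions for the demo):

#include <string.h>
#include <stdio.h>

#define FSID_SIZE 16

struct fs_devs {
	unsigned char fsid[FSID_SIZE];
	unsigned char metadata_uuid[FSID_SIZE];
};

/* Model of find_fsid(): with a metadata_fsid both UUIDs must match,
 * otherwise only fsid is compared. */
static int matches(const struct fs_devs *fd, const unsigned char *fsid,
		   const unsigned char *metadata_fsid)
{
	if (memcmp(fsid, fd->fsid, FSID_SIZE) != 0)
		return 0;
	if (metadata_fsid &&
	    memcmp(metadata_fsid, fd->metadata_uuid, FSID_SIZE) != 0)
		return 0;
	return 1;
}

int main(void)
{
	struct fs_devs fd = { "aaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbb" };

	printf("%d\n", matches(&fd, (unsigned char *)"aaaaaaaaaaaaaaa", NULL));
	return 0;
}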
+static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
+				struct btrfs_super_block *disk_super)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * Handle scanned device having completed its fsid change but
+	 * belonging to a fs_devices that was created by first scanning
+	 * a device which didn't have its fsid/metadata_uuid changed
+	 * at all and the CHANGING_FSID_V2 flag set.
+	 */
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		if (fs_devices->fsid_change &&
+		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
+			   BTRFS_FSID_SIZE) == 0 &&
+		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+			   BTRFS_FSID_SIZE) == 0) {
+			return fs_devices;
+		}
+	}
+	/*
+	 * Handle scanned device having completed its fsid change but
+	 * belonging to a fs_devices that was created by a device that
+	 * has an outdated pair of fsid/metadata_uuid and
+	 * CHANGING_FSID_V2 flag set.
+	 */
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		if (fs_devices->fsid_change &&
+		    memcmp(fs_devices->metadata_uuid,
+			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
+		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
+			   BTRFS_FSID_SIZE) == 0) {
+			return fs_devices;
+		}
+	}
+
+	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
+}
+
 static int
 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 		      int flush, struct block_device **bdev,
-		      struct buffer_head **bh)
+		      struct btrfs_super_block **disk_super)
 {
 	int ret;

@@ -387,9 +527,9 @@
 		goto error;
 	}
 	invalidate_bdev(*bdev);
-	*bh = btrfs_read_dev_super(*bdev);
-	if (IS_ERR(*bh)) {
-		ret = PTR_ERR(*bh);
+	*disk_super = btrfs_read_dev_super(*bdev);
+	if (IS_ERR(*disk_super)) {
+		ret = PTR_ERR(*disk_super);
 		blkdev_put(*bdev, flags);
 		goto error;
 	}
@@ -398,214 +538,50 @@

 error:
 	*bdev = NULL;
-	*bh = NULL;
 	return ret;
 }

-static void requeue_list(struct btrfs_pending_bios *pending_bios,
-			 struct bio *head, struct bio *tail)
-{
-
-	struct bio *old_head;
-
-	old_head = pending_bios->head;
-	pending_bios->head = head;
-	if (pending_bios->tail)
-		tail->bi_next = old_head;
-	else
-		pending_bios->tail = tail;
-}
-
 /*
- * we try to collect pending bios for a device so we don't get a large
- * number of procs sending bios down to the same device. This greatly
- * improves the schedulers ability to collect and merge the bios.
+ * Check if the device in the path matches the device in the given struct device.
 *
- * But, it also turns into a long list of bios to process and that is sure
- * to eventually make the worker thread block. The solution here is to
- * make some progress and then put this work struct back at the end of
- * the list if the block device is congested. This way, multiple devices
- * can make progress from a single worker thread.
+ * Returns:
+ *   true  If it is the same device.
+ *   false If it is not the same device or on error.
 */
-static noinline void run_scheduled_bios(struct btrfs_device *device)
+static bool device_matched(const struct btrfs_device *device, const char *path)
 {
-	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct bio *pending;
-	struct backing_dev_info *bdi;
-	struct btrfs_pending_bios *pending_bios;
-	struct bio *tail;
-	struct bio *cur;
-	int again = 0;
-	unsigned long num_run;
-	unsigned long batch_run = 0;
-	unsigned long last_waited = 0;
-	int force_reg = 0;
-	int sync_pending = 0;
-	struct blk_plug plug;
+	char *device_name;
+	struct block_device *bdev_old;
+	struct block_device *bdev_new;

 	/*
-	 * this function runs all the bios we've collected for
-	 * a particular device. We don't want to wander off to
-	 * another device without first sending all of these down.
-	 * So, setup a plug here and finish it off before we return
+	 * If we are looking for a device with the matching dev_t, then skip
+	 * device without a name (a missing device).
 	 */
-	blk_start_plug(&plug);
+	if (!device->name)
+		return false;

-	bdi = device->bdev->bd_bdi;
+	device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
+	if (!device_name)
+		return false;

-loop:
-	spin_lock(&device->io_lock);
+	rcu_read_lock();
+	scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
+	rcu_read_unlock();

-loop_lock:
-	num_run = 0;
+	bdev_old = lookup_bdev(device_name);
+	kfree(device_name);
+	if (IS_ERR(bdev_old))
+		return false;

-	/* take all the bios off the list at once and process them
-	 * later on (without the lock held). But, remember the
-	 * tail and other pointers so the bios can be properly reinserted
-	 * into the list if we hit congestion
-	 */
-	if (!force_reg && device->pending_sync_bios.head) {
-		pending_bios = &device->pending_sync_bios;
-		force_reg = 1;
-	} else {
-		pending_bios = &device->pending_bios;
-		force_reg = 0;
-	}
+	bdev_new = lookup_bdev(path);
+	if (IS_ERR(bdev_new))
+		return false;

-	pending = pending_bios->head;
-	tail = pending_bios->tail;
-	WARN_ON(pending && !tail);
+	if (bdev_old == bdev_new)
+		return true;

-	/*
-	 * if pending was null this time around, no bios need processing
-	 * at all and we can stop. Otherwise it'll loop back up again
-	 * and do an additional check so no bios are missed.
-	 *
-	 * device->running_pending is used to synchronize with the
-	 * schedule_bio code.
-	 */
-	if (device->pending_sync_bios.head == NULL &&
-	    device->pending_bios.head == NULL) {
-		again = 0;
-		device->running_pending = 0;
-	} else {
-		again = 1;
-		device->running_pending = 1;
-	}
-
-	pending_bios->head = NULL;
-	pending_bios->tail = NULL;
-
-	spin_unlock(&device->io_lock);
-
-	while (pending) {
-
-		rmb();
-		/* we want to work on both lists, but do more bios on the
-		 * sync list than the regular list
-		 */
-		if ((num_run > 32 &&
-		     pending_bios != &device->pending_sync_bios &&
-		     device->pending_sync_bios.head) ||
-		    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
-		     device->pending_bios.head)) {
-			spin_lock(&device->io_lock);
-			requeue_list(pending_bios, pending, tail);
-			goto loop_lock;
-		}
-
-		cur = pending;
-		pending = pending->bi_next;
-		cur->bi_next = NULL;
-
-		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
-
-		/*
-		 * if we're doing the sync list, record that our
-		 * plug has some sync requests on it
-		 *
-		 * If we're doing the regular list and there are
-		 * sync requests sitting around, unplug before
-		 * we add more
-		 */
-		if (pending_bios == &device->pending_sync_bios) {
-			sync_pending = 1;
-		} else if (sync_pending) {
-			blk_finish_plug(&plug);
-			blk_start_plug(&plug);
-			sync_pending = 0;
-		}
-
-		btrfsic_submit_bio(cur);
-		num_run++;
-		batch_run++;
-
-		cond_resched();
-
-		/*
-		 * we made progress, there is more work to do and the bdi
-		 * is now congested. Back off and let other work structs
-		 * run instead
-		 */
-		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
-		    fs_info->fs_devices->open_devices > 1) {
-			struct io_context *ioc;
-
-			ioc = current->io_context;
-
-			/*
-			 * the main goal here is that we don't want to
-			 * block if we're going to be able to submit
-			 * more requests without blocking.
-			 *
-			 * This code does two great things, it pokes into
-			 * the elevator code from a filesystem _and_
-			 * it makes assumptions about how batching works.
-			 */
-			if (ioc && ioc->nr_batch_requests > 0 &&
-			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
-			    (last_waited == 0 ||
-			     ioc->last_waited == last_waited)) {
-				/*
-				 * we want to go through our batch of
-				 * requests and stop. So, we copy out
-				 * the ioc->last_waited time and test
-				 * against it before looping
-				 */
-				last_waited = ioc->last_waited;
-				cond_resched();
-				continue;
-			}
-			spin_lock(&device->io_lock);
-			requeue_list(pending_bios, pending, tail);
-			device->running_pending = 1;
-
-			spin_unlock(&device->io_lock);
-			btrfs_queue_work(fs_info->submit_workers,
-					 &device->work);
-			goto done;
-		}
-	}
-
-	cond_resched();
-	if (again)
-		goto loop;
-
-	spin_lock(&device->io_lock);
-	if (device->pending_bios.head || device->pending_sync_bios.head)
-		goto loop_lock;
-	spin_unlock(&device->io_lock);
-
-done:
-	blk_finish_plug(&plug);
-}
-
-static void pending_bios_fn(struct btrfs_work *work)
-{
-	struct btrfs_device *device;
-
-	device = container_of(work, struct btrfs_device, work);
-	run_scheduled_bios(device);
+	return false;
 }

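device_matched() resolves both the stored name and the candidate path to a block device before comparing, so two different spellings of the same node (say, a symlink and its target) still match. A rough userspace analogue using stat(2), where comparing st_rdev stands in for the lookup_bdev() comparison (an assumption for the demo):

#include <sys/stat.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of device_matched(): two paths refer to the same block device
 * if they resolve to the same device number. */
static bool paths_match(const char *a, const char *b)
{
	struct stat sa, sb;

	if (stat(a, &sa) || stat(b, &sb))
		return false;	/* errors count as "no match" */
	return S_ISBLK(sa.st_mode) && S_ISBLK(sb.st_mode) &&
	       sa.st_rdev == sb.st_rdev;
}

int main(int argc, char **argv)
{
	if (argc == 3)
		printf("%s\n", paths_match(argv[1], argv[2]) ? "same" : "different");
	return 0;
}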
 /*
@@ -615,52 +591,55 @@
 * matching this path only.
 * skip_dev: Optional. Will skip this device when searching for the stale
 *           devices.
+ * Return:	0 for success or if @path is NULL.
+ *		-EBUSY if @path is a mounted device.
+ *		-ENOENT if @path does not match any device in the list.
 */
-static void btrfs_free_stale_devices(const char *path,
+static int btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
 {
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
+	int ret = 0;
+
+	lockdep_assert_held(&uuid_mutex);
+
+	if (path)
+		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
-		mutex_lock(&fs_devices->device_list_mutex);
-		if (fs_devices->opened) {
-			mutex_unlock(&fs_devices->device_list_mutex);
-			continue;
-		}

+		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
-			int not_found = 0;
-
			if (skip_device && skip_device == device)
				continue;
-			if (path && !device->name)
+			if (path && !device_matched(device, path))
				continue;
-
-			rcu_read_lock();
-			if (path)
-				not_found = strcmp(rcu_str_deref(device->name),
						   path);
-			rcu_read_unlock();
-			if (not_found)
-				continue;
+			if (fs_devices->opened) {
+				/* for an already deleted device return 0 */
+				if (path && ret != 0)
+					ret = -EBUSY;
+				break;
+			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

-			if (fs_devices->num_devices == 0)
-				break;
+			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);
+
		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}
+
+	return ret;
 }

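With the new return contract (0 on success or a NULL @path, -EBUSY for a mounted device, -ENOENT when nothing matches), callers can report precise errors. A small standalone model of that contract, with made-up predicates standing in for the real list walk:

#include <errno.h>
#include <stdio.h>

/* Userspace model of the new btrfs_free_stale_devices() contract:
 * 0 on success or NULL path, -EBUSY for a mounted (opened) device,
 * -ENOENT when no registered device matches. */
static int free_stale(const char *path, int registered, int opened)
{
	if (!path)
		return 0;
	if (!registered)
		return -ENOENT;
	if (opened)
		return -EBUSY;
	return 0;	/* stale entry dropped */
}

int main(void)
{
	printf("%d %d %d\n",
	       free_stale("/dev/sdx", 0, 0),	/* -ENOENT */
	       free_stale("/dev/sdy", 1, 1),	/* -EBUSY */
	       free_stale("/dev/sdz", 1, 0));	/* 0 */
	return 0;
}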
 /*
@@ -674,7 +653,6 @@
 {
	struct request_queue *q;
	struct block_device *bdev;
-	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;
@@ -685,23 +663,29 @@
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
-				    &bdev, &bh);
+				    &bdev, &disk_super);
	if (ret)
		return ret;

-	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
-		goto error_brelse;
+		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
-		goto error_brelse;
+		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+		if (btrfs_super_incompat_flags(disk_super) &
+		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
+			pr_err(
+		"BTRFS: Invalid seeding and uuid-changed device detected\n");
+			goto error_free_page;
+		}
+
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
-		fs_devices->seeding = 1;
+		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
@@ -711,7 +695,7 @@

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
-		fs_devices->rotating = 1;
+		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -723,17 +707,109 @@
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
-	brelse(bh);
+	btrfs_release_disk_super(disk_super);

	return 0;

-error_brelse:
-	brelse(bh);
+error_free_page:
+	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
 }

+u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
+{
+	bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
+				  BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+
+	return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
+}
+
+/*
+ * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
+ * being created with a disk that has already completed its fsid change. Such
+ * disk can belong to an fs which has its FSID changed or to one which doesn't.
+ * Handle both cases here.
+ */
+static struct btrfs_fs_devices *find_fsid_inprogress(
+					struct btrfs_super_block *disk_super)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+			   BTRFS_FSID_SIZE) != 0 &&
+		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
+			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
+			return fs_devices;
+		}
+	}
+
+	return find_fsid(disk_super->fsid, NULL);
+}
+
+static struct btrfs_fs_devices *find_fsid_changed(
+					struct btrfs_super_block *disk_super)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * Handles the case where the scanned device is part of an fs that had
+	 * multiple successful changes of FSID but currently the device didn't
+	 * observe it. Meaning our fsid will be different than theirs. We need
+	 * to handle two subcases:
+	 * 1 - The fs still continues to have different METADATA/FSID uuids.
+	 * 2 - The fs is switched back to its original FSID (METADATA/FSID
+	 *     are equal).
+	 */
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		/* Changed UUIDs */
+		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+			   BTRFS_FSID_SIZE) != 0 &&
+		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
+			   BTRFS_FSID_SIZE) == 0 &&
+		    memcmp(fs_devices->fsid, disk_super->fsid,
+			   BTRFS_FSID_SIZE) != 0)
+			return fs_devices;
+
+		/* Unchanged UUIDs */
+		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+			   BTRFS_FSID_SIZE) == 0 &&
+		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
+			   BTRFS_FSID_SIZE) == 0)
+			return fs_devices;
+	}
+
+	return NULL;
+}
+
+static struct btrfs_fs_devices *find_fsid_reverted_metadata(
+					struct btrfs_super_block *disk_super)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * Handle the case where the scanned device is part of an fs whose last
+	 * metadata UUID change reverted it to the original FSID. At the same
+	 * time fs_devices was first created by another constituent device
+	 * which didn't fully observe the operation. This results in a
+	 * btrfs_fs_devices created with metadata/fsid different AND
+	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
+	 * fs_devices equal to the FSID of the disk.
+	 */
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+			   BTRFS_FSID_SIZE) != 0 &&
+		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
+			   BTRFS_FSID_SIZE) == 0 &&
+		    fs_devices->fsid_change)
+			return fs_devices;
+	}
+
+	return NULL;
+}
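Together with device_list_add() below, the three finders partition the fsid-change state space by two bits of the scanned super block: the CHANGING_FSID_V2 flag and the METADATA_UUID incompat flag. This sketch mirrors that dispatch (strings are illustrative, the branching follows the code below):

#include <stdbool.h>
#include <stdio.h>

/* Model of the finder dispatch in device_list_add(): which lookup
 * handles a scanned super block, based on its two flags. */
static const char *pick_finder(bool changing_fsid_v2, bool has_metadata_uuid)
{
	if (changing_fsid_v2)
		return has_metadata_uuid ? "find_fsid_changed"
					 : "find_fsid_inprogress";
	if (has_metadata_uuid)
		return "find_fsid_with_metadata_uuid";
	return "find_fsid_reverted_metadata, then plain find_fsid";
}

int main(void)
{
	printf("%s\n", pick_finder(true, false));
	printf("%s\n", pick_finder(false, true));
	return 0;
}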
 /*
 * Add new device to list of registered devices
 *
@@ -746,16 +822,40 @@
		bool *new_device_added)
 {
	struct btrfs_device *device;
-	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
+					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

-	fs_devices = find_fsid(disk_super->fsid);
+	if (fsid_change_in_progress) {
+		if (!has_metadata_uuid)
+			fs_devices = find_fsid_inprogress(disk_super);
+		else
+			fs_devices = find_fsid_changed(disk_super);
+	} else if (has_metadata_uuid) {
+		fs_devices = find_fsid_with_metadata_uuid(disk_super);
+	} else {
+		fs_devices = find_fsid_reverted_metadata(disk_super);
+		if (!fs_devices)
+			fs_devices = find_fsid(disk_super->fsid, NULL);
+	}
+
	if (!fs_devices) {
-		fs_devices = alloc_fs_devices(disk_super->fsid);
+		if (has_metadata_uuid)
+			fs_devices = alloc_fs_devices(disk_super->fsid,
+						      disk_super->metadata_uuid);
+		else
+			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
+
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);
+
+		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);
@@ -765,6 +865,27 @@
		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, devid,
				disk_super->dev_item.uuid, NULL, false);
+
+		/*
+		 * If this disk has been pulled into an fs devices created by
+		 * a device which had the CHANGING_FSID_V2 flag then replace the
+		 * metadata_uuid/fsid values of the fs_devices.
+		 */
+		if (fs_devices->fsid_change &&
+		    found_transid > fs_devices->latest_generation) {
+			memcpy(fs_devices->fsid, disk_super->fsid,
+					BTRFS_FSID_SIZE);
+
+			if (has_metadata_uuid)
+				memcpy(fs_devices->metadata_uuid,
+				       disk_super->metadata_uuid,
+				       BTRFS_FSID_SIZE);
+			else
+				memcpy(fs_devices->metadata_uuid,
+				       disk_super->fsid, BTRFS_FSID_SIZE);
+
+			fs_devices->fsid_change = false;
+		}
	}

	if (!device) {
@@ -796,11 +917,15 @@
		*new_device_added = true;

		if (disk_super->label[0])
-			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
-				disk_super->label, devid, found_transid, path);
+			pr_info(
+	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
+				disk_super->label, devid, found_transid, path,
+				current->comm, task_pid_nr(current));
		else
-			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
-				disk_super->fsid, devid, found_transid, path);
+			pr_info(
+	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
+				disk_super->fsid, devid, found_transid, path,
+				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
@@ -897,8 +1022,11 @@
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
-	if (!fs_devices->opened)
+	if (!fs_devices->opened) {
		device->generation = found_transid;
+		fs_devices->latest_generation = max_t(u64, found_transid,
+						fs_devices->latest_generation);
+	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

@@ -911,22 +1039,25 @@
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
+	int ret = 0;

-	fs_devices = alloc_fs_devices(orig->fsid);
+	lockdep_assert_held(&uuid_mutex);
+
+	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

-	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

-	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
-		if (IS_ERR(device))
+		if (IS_ERR(device)) {
+			ret = PTR_ERR(device);
			goto error;
+		}

		/*
		 * This is ok to do without rcu read locked because we hold the
@@ -937,6 +1068,7 @@
				GFP_KERNEL);
		if (!name) {
			btrfs_free_device(device);
+			ret = -ENOMEM;
			goto error;
		}
		rcu_assign_pointer(device->name, name);
@@ -946,36 +1078,27 @@
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
-	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
 error:
-	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
-	return ERR_PTR(-ENOMEM);
+	return ERR_PTR(ret);
 }

-/*
- * After we have read the system tree and know devids belonging to
- * this filesystem, remove the device which does not belong there.
- */
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
+				      int step, struct btrfs_device **latest_dev)
 {
	struct btrfs_device *device, *next;
-	struct btrfs_device *latest_dev = NULL;

-	mutex_lock(&uuid_mutex);
-again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
-		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
-			     &device->dev_state)) {
+		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
-				      &device->dev_state) &&
+				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
-			    (!latest_dev ||
-			     device->generation > latest_dev->generation)) {
-				latest_dev = device;
+			    (!*latest_dev ||
+			     device->generation > (*latest_dev)->generation)) {
+				*latest_dev = device;
			}
			continue;
		}
@@ -1002,22 +1125,26 @@
		btrfs_free_device(device);
	}

-	if (fs_devices->seed) {
-		fs_devices = fs_devices->seed;
-		goto again;
-	}
+}
+
+/*
+ * After we have read the system tree and know devids belonging to this
+ * filesystem, remove the device which does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+{
+	struct btrfs_device *latest_dev = NULL;
+	struct btrfs_fs_devices *seed_dev;
+
+	mutex_lock(&uuid_mutex);
+	__btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+
+	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
+		__btrfs_free_extra_devids(seed_dev, step, &latest_dev);

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
-}
-
-static void free_device_rcu(struct rcu_head *head)
-{
-	struct btrfs_device *device;
-
-	device = container_of(head, struct btrfs_device, rcu);
-	btrfs_free_device(device);
 }

 static void btrfs_close_bdev(struct btrfs_device *device)
@@ -1036,11 +1163,6 @@
 static void btrfs_close_one_device(struct btrfs_device *device)
 {
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
-	struct btrfs_device *new_device;
-	struct rcu_string *name;
-
-	if (device->bdev)
-		fs_devices->open_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -1057,65 +1179,85 @@
	}

	btrfs_close_bdev(device);
-
-	new_device = btrfs_alloc_device(NULL, &device->devid,
-					device->uuid);
-	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
-	/* Safe because we are under uuid_mutex */
-	if (device->name) {
-		name = rcu_string_strdup(device->name->str, GFP_NOFS);
-		BUG_ON(!name); /* -ENOMEM */
-		rcu_assign_pointer(new_device->name, name);
+	if (device->bdev) {
+		fs_devices->open_devices--;
+		device->bdev = NULL;
	}
+	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);

-	list_replace_rcu(&device->dev_list, &new_device->dev_list);
-	new_device->fs_devices = device->fs_devices;
+	device->fs_info = NULL;
+	atomic_set(&device->dev_stats_ccnt, 0);
+	extent_io_tree_release(&device->alloc_state);

-	call_rcu(&device->rcu, free_device_rcu);
+	/*
+	 * Reset the flush error record. We might have a transient flush error
+	 * in this mount, and if so we aborted the current transaction and set
+	 * the fs to an error state, guaranteeing no super blocks can be further
+	 * committed. However that error might be transient and if we unmount the
+	 * filesystem and mount it again, we should allow the mount to succeed
+	 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
+	 * filesystem again we still get flush errors, then we will again abort
+	 * any transaction and set the error state, guaranteeing no commits of
+	 * unsafe super blocks.
+	 */
+	device->last_flush_error = 0;
+
+	/* Verify the device is back in a pristine state */
+	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
+	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+	ASSERT(list_empty(&device->dev_alloc_list));
+	ASSERT(list_empty(&device->post_commit_list));
+	ASSERT(atomic_read(&device->reada_in_flight) == 0);
 }

-static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
 {
	struct btrfs_device *device, *tmp;

-	if (--fs_devices->opened > 0)
-		return 0;
+	lockdep_assert_held(&uuid_mutex);

-	mutex_lock(&fs_devices->device_list_mutex);
-	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
+	if (--fs_devices->opened > 0)
+		return;
+
+	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);
-	}
-	mutex_unlock(&fs_devices->device_list_mutex);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
-	fs_devices->seeding = 0;
-
-	return 0;
+	fs_devices->seeding = false;
+	fs_devices->fs_info = NULL;
 }

-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct btrfs_fs_devices *seed_devices = NULL;
-	int ret;
+	LIST_HEAD(list);
+	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
-	ret = close_fs_devices(fs_devices);
+	close_fs_devices(fs_devices);
	if (!fs_devices->opened) {
-		seed_devices = fs_devices->seed;
-		fs_devices->seed = NULL;
-	}
-	mutex_unlock(&uuid_mutex);
+		list_splice_init(&fs_devices->seed_list, &list);

-	while (seed_devices) {
-		fs_devices = seed_devices;
-		seed_devices = fs_devices->seed;
+		/*
+		 * If the struct btrfs_fs_devices is not assembled with any
+		 * other device, it can be re-initialized during the next mount
+		 * without needing the device-scan step. Therefore, it can be
+		 * fully freed.
+		 */
+		if (fs_devices->num_devices == 1) {
+			list_del(&fs_devices->fs_list);
+			free_fs_devices(fs_devices);
+		}
+	}
+
+	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
+		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
-	return ret;
+	mutex_unlock(&uuid_mutex);
 }

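btrfs_close_devices() now detaches the whole seed list onto a local list first and only then closes and frees each entry, so the primary fs_devices is never walked while being torn down. A minimal userspace model of that splice-then-free pattern (the singly linked list is a stand-in for the kernel's list_head):

#include <stdio.h>
#include <stdlib.h>

struct seed {
	struct seed *next;
	int id;
};

/* Model of the splice-then-free pattern: detach the whole seed chain
 * first, then close and free each entry independently. */
static void close_all(struct seed **seeds)
{
	struct seed *list = *seeds;	/* splice onto a local list */
	*seeds = NULL;

	while (list) {
		struct seed *tmp = list->next;

		printf("closing seed %d\n", list->id);
		free(list);
		list = tmp;
	}
}

int main(void)
{
	struct seed *b = malloc(sizeof(*b));
	struct seed *a = malloc(sizeof(*a));

	b->next = NULL; b->id = 2;
	a->next = b; a->id = 1;
	close_all(&a);
	return 0;
}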
 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
@@ -1123,28 +1265,33 @@
 {
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
-	int ret = 0;
+	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

-	list_for_each_entry(device, &fs_devices->devices, dev_list) {
-		/* Just open everything we can; ignore failures here */
-		if (btrfs_open_one_device(fs_devices, device, flags, holder))
-			continue;
+	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
+		int ret;

-		if (!latest_dev ||
-		    device->generation > latest_dev->generation)
+		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
+		if (ret == 0 &&
+		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
+		} else if (ret == -ENODATA) {
+			fs_devices->num_devices--;
+			list_del(&device->dev_list);
+			btrfs_free_device(device);
+		}
	}
-	if (fs_devices->open_devices == 0) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (fs_devices->open_devices == 0)
+		return -EINVAL;
+
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
-out:
-	return ret;
+	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+
+	return 0;
 }

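The reworked open loop treats -ENODATA (no btrfs super block found) specially: such a device is dropped from the list rather than kept as a failed open, and only if nothing at all opened does the function fail with -EINVAL. A toy model of that policy (devid 2 standing in for a wiped device):

#include <errno.h>
#include <stdio.h>

/* Model of the new open loop: a device whose probe returns -ENODATA
 * is dropped from the list instead of being kept around. */
static int open_one(int devid)
{
	return devid == 2 ? -ENODATA : 0;	/* pretend devid 2 was wiped */
}

int main(void)
{
	int open_devices = 0;

	for (int devid = 1; devid <= 3; devid++) {
		int ret = open_one(devid);

		if (ret == 0)
			open_devices++;
		else if (ret == -ENODATA)
			printf("dropping devid %d\n", devid);
	}
	printf("%d devices opened%s\n", open_devices,
	       open_devices == 0 ? " -> would return -EINVAL" : "");
	return 0;
}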
 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1186,55 +1333,66 @@
	return ret;
 }

-static void btrfs_release_disk_super(struct page *page)
+void btrfs_release_disk_super(struct btrfs_super_block *super)
 {
-	kunmap(page);
+	struct page *page = virt_to_page(super);
+
	put_page(page);
 }

-static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
-				 struct page **page,
-				 struct btrfs_super_block **disk_super)
+static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr)
 {
+	struct btrfs_super_block *disk_super;
+	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
-		return 1;
+		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
-	if (sizeof(**disk_super) > PAGE_SIZE)
-		return 1;
+	if (sizeof(*disk_super) > PAGE_SIZE)
+		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
-	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
-		return 1;
+	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
+		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
-	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
-				    index, GFP_KERNEL);
+	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

-	if (IS_ERR_OR_NULL(*page))
-		return 1;
+	if (IS_ERR(page))
+		return ERR_CAST(page);

-	p = kmap(*page);
+	p = page_address(page);

	/* align our pointer to the offset of the super block */
-	*disk_super = p + (bytenr & ~PAGE_MASK);
+	disk_super = p + offset_in_page(bytenr);

-	if (btrfs_super_bytenr(*disk_super) != bytenr ||
-	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
-		btrfs_release_disk_super(*page);
-		return 1;
+	if (btrfs_super_bytenr(disk_super) != bytenr ||
+	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+		btrfs_release_disk_super(p);
+		return ERR_PTR(-EINVAL);
	}

-	if ((*disk_super)->label[0] &&
-	    (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
-		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
+	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
+		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

-	return 0;
+	return disk_super;
+}
+
+int btrfs_forget_devices(const char *path)
+{
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+	mutex_unlock(&uuid_mutex);
+
+	return ret;
 }

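The three checks at the top of btrfs_read_disk_super() are pure arithmetic: the super block (4096 bytes, at bytenr 65536 for the primary copy) must fit in the device, fit in one page, and not straddle a page boundary. Worked out for 4 KiB pages (values assumed for the example):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12			/* assume 4 KiB pages */
#define PAGE_SIZE (1UL << PAGE_SHIFT)

int main(void)
{
	uint64_t bytenr = 65536;	/* primary super block offset */
	uint64_t sb_size = 4096;	/* sizeof(struct btrfs_super_block) */

	uint64_t index = bytenr >> PAGE_SHIFT;		/* page 16 */
	uint64_t last = (bytenr + sb_size - 1) >> PAGE_SHIFT;
	uint64_t off = bytenr & (PAGE_SIZE - 1);	/* offset_in_page */

	printf("page index %llu, straddles: %s, offset in page %llu\n",
	       (unsigned long long)index, index == last ? "no" : "yes",
	       (unsigned long long)off);
	return 0;
}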
 /*
@@ -1249,7 +1407,6 @@
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
-	struct page *page;
	u64 bytenr;

	lockdep_assert_held(&uuid_mutex);
@@ -1261,14 +1418,24 @@
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);

-	flags |= FMODE_EXCL;
+	/*
+	 * Avoid an exclusive open here (flags |= FMODE_EXCL), as systemd-udev
+	 * may initiate a device scan that races with the user's mount or mkfs
+	 * command, resulting in failure.
+	 * Since the device scan is solely for reading purposes, there is no
+	 * need for FMODE_EXCL. Additionally, the devices are read again
+	 * during the mount process. It is ok to get some inconsistent
+	 * values temporarily, as the device paths of the fsid are the only
+	 * required information for assembling the volume.
+	 */
	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

-	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
-		device = ERR_PTR(-EINVAL);
+	disk_super = btrfs_read_disk_super(bdev, bytenr);
+	if (IS_ERR(disk_super)) {
+		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}
@@ -1278,7 +1445,7 @@
		btrfs_free_stale_devices(path, device);
	}

-	btrfs_release_disk_super(page);
+	btrfs_release_disk_super(disk_super);

 error_bdev_put:
	blkdev_put(bdev, flags);
@@ -1286,60 +1453,84 @@
	return device;
 }

-static int contains_pending_extent(struct btrfs_transaction *transaction,
-				   struct btrfs_device *device,
-				   u64 *start, u64 len)
+/*
+ * Try to find a chunk that intersects [start, start + len] range and when one
+ * such is found, record the end of it in *start
+ */
+static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
+				    u64 len)
 {
-	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct extent_map *em;
-	struct list_head *search_list = &fs_info->pinned_chunks;
-	int ret = 0;
-	u64 physical_start = *start;
+	u64 physical_start, physical_end;

-	if (transaction)
-		search_list = &transaction->pending_chunks;
-again:
-	list_for_each_entry(em, search_list, list) {
-		struct map_lookup *map;
-		int i;
+	lockdep_assert_held(&device->fs_info->chunk_mutex);

-		map = em->map_lookup;
-		for (i = 0; i < map->num_stripes; i++) {
-			u64 end;
+	if (!find_first_extent_bit(&device->alloc_state, *start,
+				   &physical_start, &physical_end,
+				   CHUNK_ALLOCATED, NULL)) {

-			if (map->stripes[i].dev != device)
-				continue;
-			if (map->stripes[i].physical >= physical_start + len ||
-			    map->stripes[i].physical + em->orig_block_len <=
-			    physical_start)
-				continue;
-			/*
-			 * Make sure that while processing the pinned list we do
-			 * not override our *start with a lower value, because
-			 * we can have pinned chunks that fall within this
-			 * device hole and that have lower physical addresses
-			 * than the pending chunks we processed before. If we
-			 * do not take this special care we can end up getting
-			 * 2 pending chunks that start at the same physical
-			 * device offsets because the end offset of a pinned
-			 * chunk can be equal to the start offset of some
-			 * pending chunk.
-			 */
-			end = map->stripes[i].physical + em->orig_block_len;
-			if (end > *start) {
-				*start = end;
-				ret = 1;
-			}
+		if (in_range(physical_start, *start, len) ||
+		    in_range(*start, physical_start,
+			     physical_end - physical_start)) {
+			*start = physical_end + 1;
+			return true;
		}
	}
-	if (search_list != &fs_info->pinned_chunks) {
-		search_list = &fs_info->pinned_chunks;
-		goto again;
-	}
-
-	return ret;
+	return false;
 }

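The overlap test above relies on the kernel's in_range(x, start, len) helper: two ranges intersect exactly when either one's start lies inside the other. A standalone sketch of that check with sample numbers:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

/* in_range(x, start, len): is x within [start, start + len)? */
static bool in_range(uint64_t x, uint64_t start, uint64_t len)
{
	return x >= start && x < start + len;
}

/* Two ranges overlap iff either one's start lies inside the other. */
static bool overlaps(uint64_t a, uint64_t alen, uint64_t b, uint64_t blen)
{
	return in_range(b, a, alen) || in_range(a, b, blen);
}

int main(void)
{
	/* hole at [100, 200), allocated chunk at [150, 300) -> overlap */
	printf("%d\n", overlaps(100, 100, 150, 150));
	return 0;
}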
+static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
+{
+	switch (device->fs_devices->chunk_alloc_policy) {
+	case BTRFS_CHUNK_ALLOC_REGULAR:
+		/*
+		 * We don't want to overwrite the superblock on the drive nor
+		 * any area used by the boot loader (grub for example), so we
+		 * make sure to start at an offset of at least 1MB.
+		 */
+		return max_t(u64, start, SZ_1M);
+	default:
+		BUG();
+	}
+}
+
+/**
+ * dev_extent_hole_check - check if specified hole is suitable for allocation
+ * @device:	the device which we have the hole
+ * @hole_start:	starting position of the hole
+ * @hole_size:	the size of the hole
+ * @num_bytes:	the size of the free space that we need
+ *
+ * This function may modify @hole_start and @hole_size to reflect the suitable
+ * position for allocation. Returns true if the hole position is updated,
+ * false otherwise.
+ */
+static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
+				  u64 *hole_size, u64 num_bytes)
+{
+	bool changed = false;
+	u64 hole_end = *hole_start + *hole_size;
+
+	/*
+	 * Check before we set max_hole_start, otherwise we could end up
+	 * sending back this offset anyway.
+	 */
+	if (contains_pending_extent(device, hole_start, *hole_size)) {
+		if (hole_end >= *hole_start)
+			*hole_size = hole_end - *hole_start;
+		else
+			*hole_size = 0;
+		changed = true;
+	}
+
+	switch (device->fs_devices->chunk_alloc_policy) {
+	case BTRFS_CHUNK_ALLOC_REGULAR:
+		/* No extra check */
+		break;
+	default:
+		BUG();
+	}
+
+	return changed;
+}

 /*
 * find_free_dev_extent_start - find free space in the specified device
@@ -1361,10 +1552,16 @@
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
+ *
+ * NOTE: This function will search the *commit* root of the device tree, and
+ * does an extra check to ensure dev extents are not double allocated.
+ * This makes the function safe to allocate dev extents but may not report
+ * correct usable device space, as device extents freed in the current
+ * transaction are not reported as available.
 */
-int find_free_dev_extent_start(struct btrfs_transaction *transaction,
-			       struct btrfs_device *device, u64 num_bytes,
-			       u64 search_start, u64 *start, u64 *len)
+static int find_free_dev_extent_start(struct btrfs_device *device,
+				      u64 num_bytes, u64 search_start,
+				      u64 *start, u64 *len)
 {
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
@@ -1380,12 +1577,7 @@
	int slot;
	struct extent_buffer *l;

-	/*
-	 * We don't want to overwrite the superblock on the drive nor any area
-	 * used by the boot loader (grub for example), so we make sure to start
-	 * at an offset of at least 1MB.
-	 */
-	search_start = max_t(u64, search_start, SZ_1M);
+	search_start = dev_extent_search_start(device, search_start);

	path = btrfs_alloc_path();
	if (!path)
@@ -1418,7 +1610,7 @@
		goto out;
	}

-	while (1) {
+	while (search_start < search_end) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
@@ -1441,23 +1633,13 @@
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

+		if (key.offset > search_end)
+			break;
+
		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
-
-			/*
-			 * Have to check before we set max_hole_start, otherwise
-			 * we could end up sending back this offset anyway.
-			 */
-			if (contains_pending_extent(transaction, device,
-						    &search_start,
-						    hole_size)) {
-				if (key.offset >= search_start) {
-					hole_size = key.offset - search_start;
-				} else {
-					WARN_ON_ONCE(1);
-					hole_size = 0;
-				}
-			}
+			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
@@ -1496,9 +1678,8 @@
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
-
-		if (contains_pending_extent(transaction, device, &search_start,
-					    hole_size)) {
+		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}
@@ -1515,6 +1696,7 @@
	else
		ret = 0;

+	ASSERT(max_hole_start + max_hole_size <= search_end);
 out:
	btrfs_free_path(path);
	*start = max_hole_start;
@@ -1523,13 +1705,11 @@
	return ret;
 }

-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
 {
	/* FIXME use last free of some kind */
-	return find_free_dev_extent_start(trans->transaction, device,
-					  num_bytes, 0, start, len);
+	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
 }

 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
....@@ -1640,9 +1820,9 @@
16401820 struct rb_node *n;
16411821 u64 ret = 0;
16421822
1643
- em_tree = &fs_info->mapping_tree.map_tree;
1823
+ em_tree = &fs_info->mapping_tree;
16441824 read_lock(&em_tree->lock);
1645
- n = rb_last(&em_tree->map);
1825
+ n = rb_last(&em_tree->map.rb_root);
16461826 if (n) {
16471827 em = rb_entry(n, struct extent_map, rb_node);
16481828 ret = em->start + em->len;
....@@ -1672,7 +1852,12 @@
16721852 if (ret < 0)
16731853 goto error;
16741854
1675
- BUG_ON(ret == 0); /* Corruption */
1855
+ if (ret == 0) {
1856
+ /* Corruption */
1857
+ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1858
+ ret = -EUCLEAN;
1859
+ goto error;
1860
+ }
16761861
16771862 ret = btrfs_previous_item(fs_info->chunk_root, path,
16781863 BTRFS_DEV_ITEMS_OBJECTID,
....@@ -1738,7 +1923,8 @@
17381923 ptr = btrfs_device_uuid(dev_item);
17391924 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
17401925 ptr = btrfs_device_fsid(dev_item);
1741
- write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1926
+ write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1927
+ ptr, BTRFS_FSID_SIZE);
17421928 btrfs_mark_buffer_dirty(leaf);
17431929
17441930 ret = 0;
....@@ -1750,22 +1936,27 @@
17501936 /*
17511937 * Function to update ctime/mtime for a given device path.
17521938  * Mainly used for ctime/mtime based probes like libblkid.
1939
+ *
1940
+ * We don't care about errors here, this is just to be kind to userspace.
17531941 */
1754
-static void update_dev_time(const char *path_name)
1942
+static void update_dev_time(const char *device_path)
17551943 {
1756
- struct file *filp;
1944
+ struct path path;
1945
+ struct timespec64 now;
1946
+ int ret;
17571947
1758
- filp = filp_open(path_name, O_RDWR, 0);
1759
- if (IS_ERR(filp))
1948
+ ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
1949
+ if (ret)
17601950 return;
1761
- file_update_time(filp);
1762
- filp_close(filp, NULL);
1951
+
1952
+ now = current_time(d_inode(path.dentry));
1953
+ inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
1954
+ path_put(&path);
17631955 }
17641956
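The reworked helper above updates the timestamps in-kernel through kern_path() and inode_update_time() instead of round-tripping through filp_open(). As a rough userspace analogue (a sketch only; the device path below is hypothetical), bumping mtime with utimensat() is enough for timestamp-based probes, since the kernel refreshes ctime as a side effect:

/* Illustrative userspace analogue of update_dev_time(); not part of the patch. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

static int touch_dev(const char *device_path)
{
	/* times == NULL sets atime/mtime to now; ctime is updated implicitly */
	if (utimensat(AT_FDCWD, device_path, NULL, 0) != 0) {
		perror("utimensat");
		return -1;
	}
	return 0;
}

int main(void)
{
	return touch_dev("/dev/loop0") ? 1 : 0;	/* hypothetical device node */
}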
1765
-static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1766
- struct btrfs_device *device)
1957
+static int btrfs_rm_dev_item(struct btrfs_device *device)
17671958 {
1768
- struct btrfs_root *root = fs_info->chunk_root;
1959
+ struct btrfs_root *root = device->fs_info->chunk_root;
17691960 int ret;
17701961 struct btrfs_path *path;
17711962 struct btrfs_key key;
....@@ -1862,17 +2053,14 @@
18622053  * where this function is called, there should always be another device (or
18632054  * this_dev) which is active.
18642055 */
1865
-void btrfs_assign_next_active_device(struct btrfs_device *device,
1866
- struct btrfs_device *this_dev)
2056
+void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
2057
+ struct btrfs_device *next_device)
18672058 {
18682059 struct btrfs_fs_info *fs_info = device->fs_info;
1869
- struct btrfs_device *next_device;
18702060
1871
- if (this_dev)
1872
- next_device = this_dev;
1873
- else
2061
+ if (!next_device)
18742062 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1875
- device);
2063
+ device);
18762064 ASSERT(next_device);
18772065
18782066 if (fs_info->sb->s_bdev &&
....@@ -1883,8 +2071,66 @@
18832071 fs_info->fs_devices->latest_bdev = next_device->bdev;
18842072 }
18852073
2074
+/*
2075
+ * Return btrfs_fs_devices::num_devices excluding the device that's being
2076
+ * currently replaced.
2077
+ */
2078
+static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2079
+{
2080
+ u64 num_devices = fs_info->fs_devices->num_devices;
2081
+
2082
+ down_read(&fs_info->dev_replace.rwsem);
2083
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2084
+ ASSERT(num_devices > 1);
2085
+ num_devices--;
2086
+ }
2087
+ up_read(&fs_info->dev_replace.rwsem);
2088
+
2089
+ return num_devices;
2090
+}
2091
+
2092
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2093
+ struct block_device *bdev,
2094
+ const char *device_path)
2095
+{
2096
+ struct btrfs_super_block *disk_super;
2097
+ int copy_num;
2098
+
2099
+ if (!bdev)
2100
+ return;
2101
+
2102
+ for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2103
+ struct page *page;
2104
+ int ret;
2105
+
2106
+ disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2107
+ if (IS_ERR(disk_super))
2108
+ continue;
2109
+
2110
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2111
+
2112
+ page = virt_to_page(disk_super);
2113
+ set_page_dirty(page);
2114
+ lock_page(page);
2115
+		/* write_one_page() unlocks the page */
2116
+ ret = write_one_page(page);
2117
+ if (ret)
2118
+ btrfs_warn(fs_info,
2119
+ "error clearing superblock number %d (%d)",
2120
+ copy_num, ret);
2121
+ btrfs_release_disk_super(disk_super);
2122
+
2123
+ }
2124
+
2125
+ /* Notify udev that device has changed */
2126
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2127
+
2128
+ /* Update ctime/mtime for device path for libblkid */
2129
+ update_dev_time(device_path);
2130
+}
2131
+
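For orientation, the copies this loop wipes sit at fixed, exponentially spaced offsets on the device. A standalone sketch of that derivation, modeled on the kernel's btrfs_sb_offset() (the constants are quoted from the btrfs on-disk format and shown here for illustration):

#include <stdio.h>

#define SZ_16K				(16 * 1024ULL)
#define BTRFS_SUPER_INFO_OFFSET		(64 * 1024ULL)
#define BTRFS_SUPER_MIRROR_MAX		3
#define BTRFS_SUPER_MIRROR_SHIFT	12

static unsigned long long sb_offset(int mirror)
{
	if (mirror)
		return SZ_16K << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
	return BTRFS_SUPER_INFO_OFFSET;	/* primary copy at 64KiB */
}

int main(void)
{
	/* prints 65536 (64KiB), 67108864 (64MiB), 274877906944 (256GiB) */
	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++)
		printf("copy %d at %llu bytes\n", i, sb_offset(i));
	return 0;
}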
18862132 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1887
- u64 devid)
2133
+ u64 devid)
18882134 {
18892135 struct btrfs_device *device;
18902136 struct btrfs_fs_devices *cur_devices;
....@@ -1892,24 +2138,35 @@
18922138 u64 num_devices;
18932139 int ret = 0;
18942140
1895
- mutex_lock(&uuid_mutex);
1896
-
1897
- num_devices = fs_devices->num_devices;
1898
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1899
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1900
- WARN_ON(num_devices < 1);
1901
- num_devices--;
1902
- }
1903
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
2141
+ /*
2142
+ * The device list in fs_devices is accessed without locks (neither
2143
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
2144
+ * filesystem and another device rm cannot run.
2145
+ */
2146
+ num_devices = btrfs_num_devices(fs_info);
19042147
19052148 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
19062149 if (ret)
19072150 goto out;
19082151
1909
- ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1910
- &device);
1911
- if (ret)
2152
+ device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2153
+
2154
+ if (IS_ERR(device)) {
2155
+ if (PTR_ERR(device) == -ENOENT &&
2156
+ device_path && strcmp(device_path, "missing") == 0)
2157
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2158
+ else
2159
+ ret = PTR_ERR(device);
19122160 goto out;
2161
+ }
2162
+
2163
+ if (btrfs_pinned_by_swapfile(fs_info, device)) {
2164
+ btrfs_warn_in_rcu(fs_info,
2165
+ "cannot remove device %s (devid %llu) due to active swapfile",
2166
+ rcu_str_deref(device->name), device->devid);
2167
+ ret = -ETXTBSY;
2168
+ goto out;
2169
+ }
19132170
19142171 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
19152172 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
....@@ -1929,9 +2186,9 @@
19292186 mutex_unlock(&fs_info->chunk_mutex);
19302187 }
19312188
1932
- mutex_unlock(&uuid_mutex);
19332189 ret = btrfs_shrink_device(device, 0);
1934
- mutex_lock(&uuid_mutex);
2190
+ if (!ret)
2191
+ btrfs_reada_remove_dev(device);
19352192 if (ret)
19362193 goto error_undo;
19372194
....@@ -1940,12 +2197,12 @@
19402197 * counter although write_all_supers() is not locked out. This
19412198 * could give a filesystem state which requires a degraded mount.
19422199 */
1943
- ret = btrfs_rm_dev_item(fs_info, device);
2200
+ ret = btrfs_rm_dev_item(device);
19442201 if (ret)
19452202 goto error_undo;
19462203
19472204 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
1948
- btrfs_scrub_cancel_dev(fs_info, device);
2205
+ btrfs_scrub_cancel_dev(device);
19492206
19502207 /*
19512208 * the device list mutex makes sure that we don't change
....@@ -1980,7 +2237,7 @@
19802237 if (device->bdev) {
19812238 cur_devices->open_devices--;
19822239 /* remove sysfs entry */
1983
- btrfs_sysfs_rm_device_link(fs_devices, device);
2240
+ btrfs_sysfs_remove_device(device);
19842241 }
19852242
19862243 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
....@@ -1993,29 +2250,24 @@
19932250 * supers and free the device.
19942251 */
19952252 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
1996
- btrfs_scratch_superblocks(device->bdev, device->name->str);
2253
+ btrfs_scratch_superblocks(fs_info, device->bdev,
2254
+ device->name->str);
19972255
19982256 btrfs_close_bdev(device);
1999
- call_rcu(&device->rcu, free_device_rcu);
2257
+ synchronize_rcu();
2258
+ btrfs_free_device(device);
20002259
20012260 if (cur_devices->open_devices == 0) {
2002
- while (fs_devices) {
2003
- if (fs_devices->seed == cur_devices) {
2004
- fs_devices->seed = cur_devices->seed;
2005
- break;
2006
- }
2007
- fs_devices = fs_devices->seed;
2008
- }
2009
- cur_devices->seed = NULL;
2261
+ list_del_init(&cur_devices->seed_list);
20102262 close_fs_devices(cur_devices);
20112263 free_fs_devices(cur_devices);
20122264 }
20132265
20142266 out:
2015
- mutex_unlock(&uuid_mutex);
20162267 return ret;
20172268
20182269 error_undo:
2270
+ btrfs_reada_undo_remove_dev(device);
20192271 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
20202272 mutex_lock(&fs_info->chunk_mutex);
20212273 list_add(&device->dev_alloc_list,
....@@ -2053,23 +2305,18 @@
20532305 fs_devices->open_devices--;
20542306 }
20552307
2056
-void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2057
- struct btrfs_device *srcdev)
2308
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
20582309 {
20592310 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
20602311
2061
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2062
- /* zero out the old super if it is writable */
2063
- btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2064
- }
2312
+ mutex_lock(&uuid_mutex);
20652313
20662314 btrfs_close_bdev(srcdev);
2067
- call_rcu(&srcdev->rcu, free_device_rcu);
2315
+ synchronize_rcu();
2316
+ btrfs_free_device(srcdev);
20682317
20692318 	/* if there are no devs we'd rather delete the fs_devices */
20702319 if (!fs_devices->num_devices) {
2071
- struct btrfs_fs_devices *tmp_fs_devices;
2072
-
20732320 /*
20742321 * On a mounted FS, num_devices can't be zero unless it's a
20752322 * seed. In case of a seed device being replaced, the replace
....@@ -2078,28 +2325,20 @@
20782325 */
20792326 ASSERT(fs_devices->seeding);
20802327
2081
- tmp_fs_devices = fs_info->fs_devices;
2082
- while (tmp_fs_devices) {
2083
- if (tmp_fs_devices->seed == fs_devices) {
2084
- tmp_fs_devices->seed = fs_devices->seed;
2085
- break;
2086
- }
2087
- tmp_fs_devices = tmp_fs_devices->seed;
2088
- }
2089
- fs_devices->seed = NULL;
2328
+ list_del_init(&fs_devices->seed_list);
20902329 close_fs_devices(fs_devices);
20912330 free_fs_devices(fs_devices);
20922331 }
2332
+ mutex_unlock(&uuid_mutex);
20932333 }
20942334
20952335 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
20962336 {
20972337 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
20982338
2099
- WARN_ON(!tgtdev);
21002339 mutex_lock(&fs_devices->device_list_mutex);
21012340
2102
- btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
2341
+ btrfs_sysfs_remove_device(tgtdev);
21032342
21042343 if (tgtdev->bdev)
21052344 fs_devices->open_devices--;
....@@ -2119,90 +2358,77 @@
21192358 * is already out of device list, so we don't have to hold
21202359 * the device_list_mutex lock.
21212360 */
2122
- btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2361
+ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2362
+ tgtdev->name->str);
21232363
21242364 btrfs_close_bdev(tgtdev);
2125
- call_rcu(&tgtdev->rcu, free_device_rcu);
2365
+ synchronize_rcu();
2366
+ btrfs_free_device(tgtdev);
21262367 }
21272368
2128
-static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2129
- const char *device_path,
2130
- struct btrfs_device **device)
2369
+static struct btrfs_device *btrfs_find_device_by_path(
2370
+ struct btrfs_fs_info *fs_info, const char *device_path)
21312371 {
21322372 int ret = 0;
21332373 struct btrfs_super_block *disk_super;
21342374 u64 devid;
21352375 u8 *dev_uuid;
21362376 struct block_device *bdev;
2137
- struct buffer_head *bh;
2377
+ struct btrfs_device *device;
21382378
2139
- *device = NULL;
21402379 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2141
- fs_info->bdev_holder, 0, &bdev, &bh);
2380
+ fs_info->bdev_holder, 0, &bdev, &disk_super);
21422381 if (ret)
2143
- return ret;
2144
- disk_super = (struct btrfs_super_block *)bh->b_data;
2382
+ return ERR_PTR(ret);
2383
+
21452384 devid = btrfs_stack_device_id(&disk_super->dev_item);
21462385 dev_uuid = disk_super->dev_item.uuid;
2147
- *device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2148
- disk_super->fsid, true);
2149
- brelse(bh);
2150
- if (!*device)
2151
- ret = -ENOENT;
2386
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2387
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2388
+ disk_super->metadata_uuid, true);
2389
+ else
2390
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2391
+ disk_super->fsid, true);
2392
+
2393
+ btrfs_release_disk_super(disk_super);
2394
+ if (!device)
2395
+ device = ERR_PTR(-ENOENT);
21522396 blkdev_put(bdev, FMODE_READ);
2153
- return ret;
2154
-}
2155
-
2156
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2157
- const char *device_path,
2158
- struct btrfs_device **device)
2159
-{
2160
- *device = NULL;
2161
- if (strcmp(device_path, "missing") == 0) {
2162
- struct list_head *devices;
2163
- struct btrfs_device *tmp;
2164
-
2165
- devices = &fs_info->fs_devices->devices;
2166
- list_for_each_entry(tmp, devices, dev_list) {
2167
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2168
- &tmp->dev_state) && !tmp->bdev) {
2169
- *device = tmp;
2170
- break;
2171
- }
2172
- }
2173
-
2174
- if (!*device)
2175
- return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2176
-
2177
- return 0;
2178
- } else {
2179
- return btrfs_find_device_by_path(fs_info, device_path, device);
2180
- }
2397
+ return device;
21812398 }
21822399
21832400 /*
21842401 * Lookup a device given by device id, or the path if the id is 0.
21852402 */
2186
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2187
- const char *devpath,
2188
- struct btrfs_device **device)
2403
+struct btrfs_device *btrfs_find_device_by_devspec(
2404
+ struct btrfs_fs_info *fs_info, u64 devid,
2405
+ const char *device_path)
21892406 {
2190
- int ret;
2407
+ struct btrfs_device *device;
21912408
21922409 if (devid) {
2193
- ret = 0;
2194
- *device = btrfs_find_device(fs_info->fs_devices, devid,
2195
- NULL, NULL, true);
2196
- if (!*device)
2197
- ret = -ENOENT;
2198
- } else {
2199
- if (!devpath || !devpath[0])
2200
- return -EINVAL;
2201
-
2202
- ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2203
- device);
2410
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2411
+ NULL, true);
2412
+ if (!device)
2413
+ return ERR_PTR(-ENOENT);
2414
+ return device;
22042415 }
2205
- return ret;
2416
+
2417
+ if (!device_path || !device_path[0])
2418
+ return ERR_PTR(-EINVAL);
2419
+
2420
+ if (strcmp(device_path, "missing") == 0) {
2421
+ /* Find first missing device */
2422
+ list_for_each_entry(device, &fs_info->fs_devices->devices,
2423
+ dev_list) {
2424
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2425
+ &device->dev_state) && !device->bdev)
2426
+ return device;
2427
+ }
2428
+ return ERR_PTR(-ENOENT);
2429
+ }
2430
+
2431
+ return btrfs_find_device_by_path(fs_info, device_path);
22062432 }
22072433
22082434 /*
....@@ -2221,10 +2447,20 @@
22212447 if (!fs_devices->seeding)
22222448 return -EINVAL;
22232449
2224
- seed_devices = alloc_fs_devices(NULL);
2450
+ /*
2451
+ * Private copy of the seed devices, anchored at
2452
+ * fs_info->fs_devices->seed_list
2453
+ */
2454
+ seed_devices = alloc_fs_devices(NULL, NULL);
22252455 if (IS_ERR(seed_devices))
22262456 return PTR_ERR(seed_devices);
22272457
2458
+ /*
2459
+ * It's necessary to retain a copy of the original seed fs_devices in
2460
+ * fs_uuids so that filesystems which have been seeded can successfully
2461
+ * reference the seed device from open_seed_devices. This also supports
2462
+	 * multiple seed filesystems.
2463
+ */
22282464 old_devices = clone_fs_devices(fs_devices);
22292465 if (IS_ERR(old_devices)) {
22302466 kfree(seed_devices);
....@@ -2245,19 +2481,15 @@
22452481 list_for_each_entry(device, &seed_devices->devices, dev_list)
22462482 device->fs_devices = seed_devices;
22472483
2248
- mutex_lock(&fs_info->chunk_mutex);
2249
- list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2250
- mutex_unlock(&fs_info->chunk_mutex);
2251
-
2252
- fs_devices->seeding = 0;
2484
+ fs_devices->seeding = false;
22532485 fs_devices->num_devices = 0;
22542486 fs_devices->open_devices = 0;
22552487 fs_devices->missing_devices = 0;
2256
- fs_devices->rotating = 0;
2257
- fs_devices->seed = seed_devices;
2488
+ fs_devices->rotating = false;
2489
+ list_add(&seed_devices->seed_list, &fs_devices->seed_list);
22582490
22592491 generate_random_uuid(fs_devices->fsid);
2260
- memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2492
+ memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
22612493 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
22622494 mutex_unlock(&fs_devices->device_list_mutex);
22632495
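A loose userspace model of this handoff (illustrative only; the struct stands in for btrfs_fs_devices and models just the device-count bookkeeping, not the locking, list splicing, or UUID handling):

#include <stdio.h>

struct fs_devices {
	int num_devices;
	int seeding;
	struct fs_devices *seed;	/* stands in for the seed_list link */
};

int main(void)
{
	static struct fs_devices seed_copy;
	struct fs_devices fsd = { .num_devices = 1, .seeding = 1 };

	seed_copy = fsd;		/* private copy takes over the devices */
	fsd.num_devices = 0;		/* the sprout starts with no devices */
	fsd.seeding = 0;		/* and is itself writable */
	fsd.seed = &seed_copy;		/* the seed stays reachable for reads */

	printf("sprout=%d devs, seed=%d devs\n",
	       fsd.num_devices, fsd.seed->num_devices);
	return 0;
}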
....@@ -2271,9 +2503,9 @@
22712503 /*
22722504 * Store the expected generation for seed devices in device items.
22732505 */
2274
-static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2275
- struct btrfs_fs_info *fs_info)
2506
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
22762507 {
2508
+ struct btrfs_fs_info *fs_info = trans->fs_info;
22772509 struct btrfs_root *root = fs_info->chunk_root;
22782510 struct btrfs_path *path;
22792511 struct extent_buffer *leaf;
....@@ -2357,7 +2589,7 @@
23572589 u64 orig_super_num_devices;
23582590 int seeding_dev = 0;
23592591 int ret = 0;
2360
- bool unlocked = false;
2592
+ bool locked = false;
23612593
23622594 if (sb_rdonly(sb) && !fs_devices->seeding)
23632595 return -EROFS;
....@@ -2371,20 +2603,20 @@
23712603 seeding_dev = 1;
23722604 down_write(&sb->s_umount);
23732605 mutex_lock(&uuid_mutex);
2606
+ locked = true;
23742607 }
23752608
2376
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
2609
+ sync_blockdev(bdev);
23772610
2378
- mutex_lock(&fs_devices->device_list_mutex);
2379
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
2611
+ rcu_read_lock();
2612
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
23802613 if (device->bdev == bdev) {
23812614 ret = -EEXIST;
2382
- mutex_unlock(
2383
- &fs_devices->device_list_mutex);
2615
+ rcu_read_unlock();
23842616 goto error;
23852617 }
23862618 }
2387
- mutex_unlock(&fs_devices->device_list_mutex);
2619
+ rcu_read_unlock();
23882620
23892621 device = btrfs_alloc_device(fs_info, NULL, NULL);
23902622 if (IS_ERR(device)) {
....@@ -2448,7 +2680,7 @@
24482680 atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
24492681
24502682 if (!blk_queue_nonrot(q))
2451
- fs_devices->rotating = 1;
2683
+ fs_devices->rotating = true;
24522684
24532685 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
24542686 btrfs_set_super_total_bytes(fs_info->super_copy,
....@@ -2468,13 +2700,13 @@
24682700 mutex_unlock(&fs_info->chunk_mutex);
24692701
24702702 /* Add sysfs device entry */
2471
- btrfs_sysfs_add_device_link(fs_devices, device);
2703
+ btrfs_sysfs_add_device(device);
24722704
24732705 mutex_unlock(&fs_devices->device_list_mutex);
24742706
24752707 if (seeding_dev) {
24762708 mutex_lock(&fs_info->chunk_mutex);
2477
- ret = init_first_rw_device(trans, fs_info);
2709
+ ret = init_first_rw_device(trans);
24782710 mutex_unlock(&fs_info->chunk_mutex);
24792711 if (ret) {
24802712 btrfs_abort_transaction(trans, ret);
....@@ -2489,22 +2721,17 @@
24892721 }
24902722
24912723 if (seeding_dev) {
2492
- char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2493
-
2494
- ret = btrfs_finish_sprout(trans, fs_info);
2724
+ ret = btrfs_finish_sprout(trans);
24952725 if (ret) {
24962726 btrfs_abort_transaction(trans, ret);
24972727 goto error_sysfs;
24982728 }
24992729
2500
- /* Sprouting would change fsid of the mounted root,
2501
- * so rename the fsid on the sysfs
2730
+ /*
2731
+ * fs_devices now represents the newly sprouted filesystem and
2732
+ * its fsid has been changed by btrfs_prepare_sprout
25022733 */
2503
- snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2504
- fs_info->fsid);
2505
- if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
2506
- btrfs_warn(fs_info,
2507
- "sysfs: failed to create fsid for sprout");
2734
+ btrfs_sysfs_update_sprout_fsid(fs_devices);
25082735 }
25092736
25102737 ret = btrfs_commit_transaction(trans);
....@@ -2512,7 +2739,7 @@
25122739 if (seeding_dev) {
25132740 mutex_unlock(&uuid_mutex);
25142741 up_write(&sb->s_umount);
2515
- unlocked = true;
2742
+ locked = false;
25162743
25172744 if (ret) /* transaction commit */
25182745 return ret;
....@@ -2532,12 +2759,22 @@
25322759 ret = btrfs_commit_transaction(trans);
25332760 }
25342761
2535
- /* Update ctime/mtime for libblkid */
2762
+ /*
2763
+ * Now that we have written a new super block to this device, check all
2764
+	 * other fs_devices lists to see whether device_path alienates any other
2765
+	 * scanned device.
2766
+ * We can ignore the return value as it typically returns -EINVAL and
2767
+ * only succeeds if the device was an alien.
2768
+ */
2769
+ btrfs_forget_devices(device_path);
2770
+
2771
+ /* Update ctime/mtime for blkid or udev */
25362772 update_dev_time(device_path);
2773
+
25372774 return ret;
25382775
25392776 error_sysfs:
2540
- btrfs_sysfs_rm_device_link(fs_devices, device);
2777
+ btrfs_sysfs_remove_device(device);
25412778 mutex_lock(&fs_info->fs_devices->device_list_mutex);
25422779 mutex_lock(&fs_info->chunk_mutex);
25432780 list_del_rcu(&device->dev_list);
....@@ -2563,7 +2800,7 @@
25632800 btrfs_free_device(device);
25642801 error:
25652802 blkdev_put(bdev, FMODE_EXCL);
2566
- if (seeding_dev && !unlocked) {
2803
+ if (locked) {
25672804 mutex_unlock(&uuid_mutex);
25682805 up_write(&sb->s_umount);
25692806 }
....@@ -2621,7 +2858,6 @@
26212858 {
26222859 struct btrfs_fs_info *fs_info = device->fs_info;
26232860 struct btrfs_super_block *super_copy = fs_info->super_copy;
2624
- struct btrfs_fs_devices *fs_devices;
26252861 u64 old_total;
26262862 u64 diff;
26272863
....@@ -2640,8 +2876,6 @@
26402876 return -EINVAL;
26412877 }
26422878
2643
- fs_devices = fs_info->fs_devices;
2644
-
26452879 btrfs_set_super_total_bytes(super_copy,
26462880 round_down(old_total + diff, fs_info->sectorsize));
26472881 device->fs_devices->total_rw_bytes += diff;
....@@ -2649,9 +2883,9 @@
26492883 btrfs_device_set_total_bytes(device, new_size);
26502884 btrfs_device_set_disk_total_bytes(device, new_size);
26512885 btrfs_clear_space_info_full(device->fs_info);
2652
- if (list_empty(&device->resized_list))
2653
- list_add_tail(&device->resized_list,
2654
- &fs_devices->resized_devices);
2886
+ if (list_empty(&device->post_commit_list))
2887
+ list_add_tail(&device->post_commit_list,
2888
+ &trans->transaction->dev_update_list);
26552889 mutex_unlock(&fs_info->chunk_mutex);
26562890
26572891 return btrfs_update_device(trans, device);
....@@ -2739,13 +2973,20 @@
27392973 return ret;
27402974 }
27412975
2742
-static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2743
- u64 logical, u64 length)
2976
+/*
2977
+ * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2978
+ * @logical: Logical block offset in bytes.
2979
+ * @length: Length of extent in bytes.
2980
+ *
2981
+ * Return: Chunk mapping or ERR_PTR.
2982
+ */
2983
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2984
+ u64 logical, u64 length)
27442985 {
27452986 struct extent_map_tree *em_tree;
27462987 struct extent_map *em;
27472988
2748
- em_tree = &fs_info->mapping_tree.map_tree;
2989
+ em_tree = &fs_info->mapping_tree;
27492990 read_lock(&em_tree->lock);
27502991 em = lookup_extent_mapping(em_tree, logical, length);
27512992 read_unlock(&em_tree->lock);
....@@ -2777,7 +3018,7 @@
27773018 int i, ret = 0;
27783019 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
27793020
2780
- em = get_chunk_map(fs_info, chunk_offset, 1);
3021
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
27813022 if (IS_ERR(em)) {
27823023 /*
27833024 * This is a logic error, but we don't want to just rely on the
....@@ -2818,13 +3059,11 @@
28183059 mutex_unlock(&fs_info->chunk_mutex);
28193060 }
28203061
2821
- if (map->stripes[i].dev) {
2822
- ret = btrfs_update_device(trans, map->stripes[i].dev);
2823
- if (ret) {
2824
- mutex_unlock(&fs_devices->device_list_mutex);
2825
- btrfs_abort_transaction(trans, ret);
2826
- goto out;
2827
- }
3062
+ ret = btrfs_update_device(trans, device);
3063
+ if (ret) {
3064
+ mutex_unlock(&fs_devices->device_list_mutex);
3065
+ btrfs_abort_transaction(trans, ret);
3066
+ goto out;
28283067 }
28293068 }
28303069 mutex_unlock(&fs_devices->device_list_mutex);
....@@ -2861,6 +3100,7 @@
28613100 {
28623101 struct btrfs_root *root = fs_info->chunk_root;
28633102 struct btrfs_trans_handle *trans;
3103
+ struct btrfs_block_group *block_group;
28643104 int ret;
28653105
28663106 /*
....@@ -2877,10 +3117,6 @@
28773117 */
28783118 lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
28793119
2880
- ret = btrfs_can_relocate(fs_info, chunk_offset);
2881
- if (ret)
2882
- return -ENOSPC;
2883
-
28843120 /* step one, relocate all the extents inside this chunk */
28853121 btrfs_scrub_pause(fs_info);
28863122 ret = btrfs_relocate_block_group(fs_info, chunk_offset);
....@@ -2888,15 +3124,11 @@
28883124 if (ret)
28893125 return ret;
28903126
2891
- /*
2892
- * We add the kobjects here (and after forcing data chunk creation)
2893
- * since relocation is the only place we'll create chunks of a new
2894
- * type at runtime. The only place where we'll remove the last
2895
- * chunk of a type is the call immediately below this one. Even
2896
- * so, we're protected against races with the cleaner thread since
2897
- * we're covered by the delete_unused_bgs_mutex.
2898
- */
2899
- btrfs_add_raid_kobjects(fs_info);
3127
+ block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3128
+ if (!block_group)
3129
+ return -ENOENT;
3130
+ btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3131
+ btrfs_put_block_group(block_group);
29003132
29013133 trans = btrfs_start_trans_remove_block_group(root->fs_info,
29023134 chunk_offset);
....@@ -2997,7 +3229,7 @@
29973229 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
29983230 u64 chunk_offset)
29993231 {
3000
- struct btrfs_block_group_cache *cache;
3232
+ struct btrfs_block_group *cache;
30013233 u64 bytes_used;
30023234 u64 chunk_type;
30033235
....@@ -3006,30 +3238,28 @@
30063238 chunk_type = cache->flags;
30073239 btrfs_put_block_group(cache);
30083240
3009
- if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
3010
- spin_lock(&fs_info->data_sinfo->lock);
3011
- bytes_used = fs_info->data_sinfo->bytes_used;
3012
- spin_unlock(&fs_info->data_sinfo->lock);
3241
+ if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3242
+ return 0;
30133243
3014
- if (!bytes_used) {
3015
- struct btrfs_trans_handle *trans;
3016
- int ret;
3244
+ spin_lock(&fs_info->data_sinfo->lock);
3245
+ bytes_used = fs_info->data_sinfo->bytes_used;
3246
+ spin_unlock(&fs_info->data_sinfo->lock);
30173247
3018
- trans = btrfs_join_transaction(fs_info->tree_root);
3019
- if (IS_ERR(trans))
3020
- return PTR_ERR(trans);
3248
+ if (!bytes_used) {
3249
+ struct btrfs_trans_handle *trans;
3250
+ int ret;
30213251
3022
- ret = btrfs_force_chunk_alloc(trans,
3023
- BTRFS_BLOCK_GROUP_DATA);
3024
- btrfs_end_transaction(trans);
3025
- if (ret < 0)
3026
- return ret;
3252
+ trans = btrfs_join_transaction(fs_info->tree_root);
3253
+ if (IS_ERR(trans))
3254
+ return PTR_ERR(trans);
30273255
3028
- btrfs_add_raid_kobjects(fs_info);
3029
-
3030
- return 1;
3031
- }
3256
+ ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3257
+ btrfs_end_transaction(trans);
3258
+ if (ret < 0)
3259
+ return ret;
3260
+ return 1;
30323261 }
3262
+
30333263 return 0;
30343264 }
30353265
....@@ -3099,7 +3329,7 @@
30993329 if (!path)
31003330 return -ENOMEM;
31013331
3102
- trans = btrfs_start_transaction(root, 0);
3332
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
31033333 if (IS_ERR(trans)) {
31043334 btrfs_free_path(path);
31053335 return PTR_ERR(trans);
....@@ -3208,28 +3438,28 @@
32083438 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
32093439 struct btrfs_balance_args *bargs)
32103440 {
3211
- struct btrfs_block_group_cache *cache;
3441
+ struct btrfs_block_group *cache;
32123442 u64 chunk_used;
32133443 u64 user_thresh_min;
32143444 u64 user_thresh_max;
32153445 int ret = 1;
32163446
32173447 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3218
- chunk_used = btrfs_block_group_used(&cache->item);
3448
+ chunk_used = cache->used;
32193449
32203450 if (bargs->usage_min == 0)
32213451 user_thresh_min = 0;
32223452 else
3223
- user_thresh_min = div_factor_fine(cache->key.offset,
3224
- bargs->usage_min);
3453
+ user_thresh_min = div_factor_fine(cache->length,
3454
+ bargs->usage_min);
32253455
32263456 if (bargs->usage_max == 0)
32273457 user_thresh_max = 1;
32283458 else if (bargs->usage_max > 100)
3229
- user_thresh_max = cache->key.offset;
3459
+ user_thresh_max = cache->length;
32303460 else
3231
- user_thresh_max = div_factor_fine(cache->key.offset,
3232
- bargs->usage_max);
3461
+ user_thresh_max = div_factor_fine(cache->length,
3462
+ bargs->usage_max);
32333463
32343464 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
32353465 ret = 0;
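Both usage filters reduce to the same arithmetic: div_factor_fine(num, factor) computes num * factor / 100. A worked standalone example of the range check, with made-up numbers and a local reimplementation of the helper:

#include <stdio.h>

typedef unsigned long long u64;

static u64 div_factor_fine(u64 num, int factor)
{
	if (factor == 100)
		return num;
	return num * factor / 100;
}

int main(void)
{
	u64 length = 1ULL << 30;		/* 1 GiB block group */
	u64 used = 200ULL << 20;		/* 200 MiB used */
	u64 lo = div_factor_fine(length, 10);	/* usage_min=10 -> ~102 MiB */
	u64 hi = div_factor_fine(length, 50);	/* usage_max=50 -> 512 MiB */

	/* prints "balance": 102 MiB <= 200 MiB < 512 MiB */
	printf("min=%llu max=%llu used=%llu -> %s\n", lo, hi, used,
	       (lo <= used && used < hi) ? "balance" : "skip");
	return 0;
}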
....@@ -3241,20 +3471,19 @@
32413471 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
32423472 u64 chunk_offset, struct btrfs_balance_args *bargs)
32433473 {
3244
- struct btrfs_block_group_cache *cache;
3474
+ struct btrfs_block_group *cache;
32453475 u64 chunk_used, user_thresh;
32463476 int ret = 1;
32473477
32483478 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3249
- chunk_used = btrfs_block_group_used(&cache->item);
3479
+ chunk_used = cache->used;
32503480
32513481 if (bargs->usage_min == 0)
32523482 user_thresh = 1;
32533483 else if (bargs->usage > 100)
3254
- user_thresh = cache->key.offset;
3484
+ user_thresh = cache->length;
32553485 else
3256
- user_thresh = div_factor_fine(cache->key.offset,
3257
- bargs->usage);
3486
+ user_thresh = div_factor_fine(cache->length, bargs->usage);
32583487
32593488 if (chunk_used < user_thresh)
32603489 ret = 0;
....@@ -3280,6 +3509,18 @@
32803509 return 1;
32813510 }
32823511
3512
+static u64 calc_data_stripes(u64 type, int num_stripes)
3513
+{
3514
+ const int index = btrfs_bg_flags_to_raid_index(type);
3515
+ const int ncopies = btrfs_raid_array[index].ncopies;
3516
+ const int nparity = btrfs_raid_array[index].nparity;
3517
+
3518
+ if (nparity)
3519
+ return num_stripes - nparity;
3520
+ else
3521
+ return num_stripes / ncopies;
3522
+}
3523
+
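A quick standalone check of this helper, with ncopies/nparity values matching the raid table at the top of this file:

#include <stdio.h>

typedef unsigned long long u64;

static u64 calc_data_stripes(int ncopies, int nparity, int num_stripes)
{
	if (nparity)
		return num_stripes - nparity;
	return num_stripes / ncopies;
}

int main(void)
{
	/* raid1 (ncopies=2): 2 stripes carry 1 stripe of data */
	printf("raid1:  %llu\n", calc_data_stripes(2, 0, 2));
	/* raid6 (nparity=2): 6 stripes carry 4 stripes of data */
	printf("raid6:  %llu\n", calc_data_stripes(1, 2, 6));
	/* raid10 (ncopies=2): 8 stripes carry 4 stripes of data */
	printf("raid10: %llu\n", calc_data_stripes(2, 0, 8));
	return 0;
}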
32833524 /* [pstart, pend) */
32843525 static int chunk_drange_filter(struct extent_buffer *leaf,
32853526 struct btrfs_chunk *chunk,
....@@ -3289,22 +3530,15 @@
32893530 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
32903531 u64 stripe_offset;
32913532 u64 stripe_length;
3533
+ u64 type;
32923534 int factor;
32933535 int i;
32943536
32953537 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
32963538 return 0;
32973539
3298
- if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
3299
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3300
- factor = num_stripes / 2;
3301
- } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3302
- factor = num_stripes - 1;
3303
- } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3304
- factor = num_stripes - 2;
3305
- } else {
3306
- factor = num_stripes;
3307
- }
3540
+ type = btrfs_chunk_type(leaf, chunk);
3541
+ factor = calc_data_stripes(type, num_stripes);
33083542
33093543 for (i = 0; i < num_stripes; i++) {
33103544 stripe = btrfs_stripe_nr(chunk, i);
....@@ -3365,10 +3599,10 @@
33653599 return 0;
33663600 }
33673601
3368
-static int should_balance_chunk(struct btrfs_fs_info *fs_info,
3369
- struct extent_buffer *leaf,
3602
+static int should_balance_chunk(struct extent_buffer *leaf,
33703603 struct btrfs_chunk *chunk, u64 chunk_offset)
33713604 {
3605
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
33723606 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
33733607 struct btrfs_balance_args *bargs = NULL;
33743608 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
....@@ -3458,17 +3692,11 @@
34583692 {
34593693 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
34603694 struct btrfs_root *chunk_root = fs_info->chunk_root;
3461
- struct btrfs_root *dev_root = fs_info->dev_root;
3462
- struct list_head *devices;
3463
- struct btrfs_device *device;
3464
- u64 old_size;
3465
- u64 size_to_free;
34663695 u64 chunk_type;
34673696 struct btrfs_chunk *chunk;
34683697 struct btrfs_path *path = NULL;
34693698 struct btrfs_key key;
34703699 struct btrfs_key found_key;
3471
- struct btrfs_trans_handle *trans;
34723700 struct extent_buffer *leaf;
34733701 int slot;
34743702 int ret;
....@@ -3483,53 +3711,6 @@
34833711 u32 count_sys = 0;
34843712 int chunk_reserved = 0;
34853713
3486
- /* step one make some room on all the devices */
3487
- devices = &fs_info->fs_devices->devices;
3488
- list_for_each_entry(device, devices, dev_list) {
3489
- old_size = btrfs_device_get_total_bytes(device);
3490
- size_to_free = div_factor(old_size, 1);
3491
- size_to_free = min_t(u64, size_to_free, SZ_1M);
3492
- if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
3493
- btrfs_device_get_total_bytes(device) -
3494
- btrfs_device_get_bytes_used(device) > size_to_free ||
3495
- test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
3496
- continue;
3497
-
3498
- ret = btrfs_shrink_device(device, old_size - size_to_free);
3499
- if (ret == -ENOSPC)
3500
- break;
3501
- if (ret) {
3502
- /* btrfs_shrink_device never returns ret > 0 */
3503
- WARN_ON(ret > 0);
3504
- goto error;
3505
- }
3506
-
3507
- trans = btrfs_start_transaction(dev_root, 0);
3508
- if (IS_ERR(trans)) {
3509
- ret = PTR_ERR(trans);
3510
- btrfs_info_in_rcu(fs_info,
3511
- "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
3512
- rcu_str_deref(device->name), ret,
3513
- old_size, old_size - size_to_free);
3514
- goto error;
3515
- }
3516
-
3517
- ret = btrfs_grow_device(trans, device, old_size);
3518
- if (ret) {
3519
- btrfs_end_transaction(trans);
3520
- /* btrfs_grow_device never returns ret > 0 */
3521
- WARN_ON(ret > 0);
3522
- btrfs_info_in_rcu(fs_info,
3523
- "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
3524
- rcu_str_deref(device->name), ret,
3525
- old_size, old_size - size_to_free);
3526
- goto error;
3527
- }
3528
-
3529
- btrfs_end_transaction(trans);
3530
- }
3531
-
3532
- /* step two, relocate all the chunks */
35333714 path = btrfs_alloc_path();
35343715 if (!path) {
35353716 ret = -ENOMEM;
....@@ -3601,8 +3782,7 @@
36013782 spin_unlock(&fs_info->balance_lock);
36023783 }
36033784
3604
- ret = should_balance_chunk(fs_info, leaf, chunk,
3605
- found_key.offset);
3785
+ ret = should_balance_chunk(leaf, chunk, found_key.offset);
36063786
36073787 btrfs_release_path(path);
36083788 if (!ret) {
....@@ -3659,10 +3839,15 @@
36593839
36603840 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
36613841 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3662
- if (ret && ret != -ENOSPC)
3663
- goto error;
36643842 if (ret == -ENOSPC) {
36653843 enospc_errors++;
3844
+ } else if (ret == -ETXTBSY) {
3845
+ btrfs_info(fs_info,
3846
+ "skipping relocation of block group %llu due to active swapfile",
3847
+ found_key.offset);
3848
+ ret = 0;
3849
+ } else if (ret) {
3850
+ goto error;
36663851 } else {
36673852 spin_lock(&fs_info->balance_lock);
36683853 bctl->stat.completed++;
....@@ -3711,8 +3896,7 @@
37113896 if (flags == 0)
37123897 return !extended; /* "0" is valid for usual profiles */
37133898
3714
- /* true if exactly one bit set */
3715
- return (flags & (flags - 1)) == 0;
3899
+ return has_single_bit_set(flags);
37163900 }
37173901
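has_single_bit_set() wraps the same classic power-of-two test that the removed line spelled out: clearing the lowest set bit with n & (n - 1) leaves zero iff exactly one bit was set. A minimal demonstration (flags == 0 is filtered out by the caller above, so the helper here rejects it explicitly):

#include <assert.h>

static int single_bit_set(unsigned long long flags)
{
	return flags != 0 && (flags & (flags - 1)) == 0;
}

int main(void)
{
	assert(single_bit_set(0x8));	/* one profile bit: valid */
	assert(!single_bit_set(0xA));	/* two bits set: invalid mix */
	assert(!single_bit_set(0));	/* zero is handled separately */
	return 0;
}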
37183902 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
....@@ -3723,13 +3907,179 @@
37233907 atomic_read(&fs_info->balance_cancel_req) == 0);
37243908 }
37253909
3726
-/* Non-zero return value signifies invalidity */
3727
-static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
3728
- u64 allowed)
3910
+/*
3911
+ * Validate target profile against allowed profiles and return true if it's OK.
3912
+ * Otherwise print the error message and return false.
3913
+ */
3914
+static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3915
+ const struct btrfs_balance_args *bargs,
3916
+ u64 allowed, const char *type)
37293917 {
3730
- return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3731
- (!alloc_profile_is_valid(bctl_arg->target, 1) ||
3732
- (bctl_arg->target & ~allowed)));
3918
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3919
+ return true;
3920
+
3921
+ /* Profile is valid and does not have bits outside of the allowed set */
3922
+ if (alloc_profile_is_valid(bargs->target, 1) &&
3923
+ (bargs->target & ~allowed) == 0)
3924
+ return true;
3925
+
3926
+ btrfs_err(fs_info, "balance: invalid convert %s profile %s",
3927
+ type, btrfs_bg_type_to_raid_name(bargs->target));
3928
+ return false;
3929
+}
3930
+
3931
+/*
3932
+ * Fill @buf with textual description of balance filter flags @bargs, up to
3933
+ * @size_buf including the terminating null. The output may be trimmed if it
3934
+ * does not fit into the provided buffer.
3935
+ */
3936
+static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3937
+ u32 size_buf)
3938
+{
3939
+ int ret;
3940
+ u32 size_bp = size_buf;
3941
+ char *bp = buf;
3942
+ u64 flags = bargs->flags;
3943
+ char tmp_buf[128] = {'\0'};
3944
+
3945
+ if (!flags)
3946
+ return;
3947
+
3948
+#define CHECK_APPEND_NOARG(a) \
3949
+ do { \
3950
+ ret = snprintf(bp, size_bp, (a)); \
3951
+ if (ret < 0 || ret >= size_bp) \
3952
+ goto out_overflow; \
3953
+ size_bp -= ret; \
3954
+ bp += ret; \
3955
+ } while (0)
3956
+
3957
+#define CHECK_APPEND_1ARG(a, v1) \
3958
+ do { \
3959
+ ret = snprintf(bp, size_bp, (a), (v1)); \
3960
+ if (ret < 0 || ret >= size_bp) \
3961
+ goto out_overflow; \
3962
+ size_bp -= ret; \
3963
+ bp += ret; \
3964
+ } while (0)
3965
+
3966
+#define CHECK_APPEND_2ARG(a, v1, v2) \
3967
+ do { \
3968
+ ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
3969
+ if (ret < 0 || ret >= size_bp) \
3970
+ goto out_overflow; \
3971
+ size_bp -= ret; \
3972
+ bp += ret; \
3973
+ } while (0)
3974
+
3975
+ if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3976
+ CHECK_APPEND_1ARG("convert=%s,",
3977
+ btrfs_bg_type_to_raid_name(bargs->target));
3978
+
3979
+ if (flags & BTRFS_BALANCE_ARGS_SOFT)
3980
+ CHECK_APPEND_NOARG("soft,");
3981
+
3982
+ if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3983
+ btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3984
+ sizeof(tmp_buf));
3985
+ CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3986
+ }
3987
+
3988
+ if (flags & BTRFS_BALANCE_ARGS_USAGE)
3989
+ CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
3990
+
3991
+ if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
3992
+ CHECK_APPEND_2ARG("usage=%u..%u,",
3993
+ bargs->usage_min, bargs->usage_max);
3994
+
3995
+ if (flags & BTRFS_BALANCE_ARGS_DEVID)
3996
+ CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
3997
+
3998
+ if (flags & BTRFS_BALANCE_ARGS_DRANGE)
3999
+ CHECK_APPEND_2ARG("drange=%llu..%llu,",
4000
+ bargs->pstart, bargs->pend);
4001
+
4002
+ if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4003
+ CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4004
+ bargs->vstart, bargs->vend);
4005
+
4006
+ if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4007
+ CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4008
+
4009
+ if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4010
+ CHECK_APPEND_2ARG("limit=%u..%u,",
4011
+ bargs->limit_min, bargs->limit_max);
4012
+
4013
+ if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4014
+ CHECK_APPEND_2ARG("stripes=%u..%u,",
4015
+ bargs->stripes_min, bargs->stripes_max);
4016
+
4017
+#undef CHECK_APPEND_2ARG
4018
+#undef CHECK_APPEND_1ARG
4019
+#undef CHECK_APPEND_NOARG
4020
+
4021
+out_overflow:
4022
+
4023
+ if (size_bp < size_buf)
4024
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
4025
+ else
4026
+ buf[0] = '\0';
4027
+}
4028
+
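A minimal userspace sketch of the append pattern behind these macros, assuming nothing beyond snprintf(): a return value at or past the remaining space signals truncation, so the writer jumps to the overflow label and the trailing separator is trimmed:

#include <stdio.h>

static void describe(char *buf, unsigned int size_buf)
{
	char *bp = buf;
	unsigned int size_bp = size_buf;
	int ret;

	ret = snprintf(bp, size_bp, "convert=%s,", "raid1");
	if (ret < 0 || (unsigned int)ret >= size_bp)
		goto out_overflow;
	size_bp -= ret;
	bp += ret;

	ret = snprintf(bp, size_bp, "usage=%u..%u,", 0, 50);
	if (ret < 0 || (unsigned int)ret >= size_bp)
		goto out_overflow;
	size_bp -= ret;
	bp += ret;

out_overflow:
	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0';	/* remove last ',' */
	else
		buf[0] = '\0';
}

int main(void)
{
	char buf[64];

	describe(buf, sizeof(buf));
	printf("%s\n", buf);	/* prints: convert=raid1,usage=0..50 */
	return 0;
}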
4029
+static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4030
+{
4031
+ u32 size_buf = 1024;
4032
+ char tmp_buf[192] = {'\0'};
4033
+ char *buf;
4034
+ char *bp;
4035
+ u32 size_bp = size_buf;
4036
+ int ret;
4037
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4038
+
4039
+ buf = kzalloc(size_buf, GFP_KERNEL);
4040
+ if (!buf)
4041
+ return;
4042
+
4043
+ bp = buf;
4044
+
4045
+#define CHECK_APPEND_1ARG(a, v1) \
4046
+ do { \
4047
+ ret = snprintf(bp, size_bp, (a), (v1)); \
4048
+ if (ret < 0 || ret >= size_bp) \
4049
+ goto out_overflow; \
4050
+ size_bp -= ret; \
4051
+ bp += ret; \
4052
+ } while (0)
4053
+
4054
+ if (bctl->flags & BTRFS_BALANCE_FORCE)
4055
+ CHECK_APPEND_1ARG("%s", "-f ");
4056
+
4057
+ if (bctl->flags & BTRFS_BALANCE_DATA) {
4058
+ describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4059
+ CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4060
+ }
4061
+
4062
+ if (bctl->flags & BTRFS_BALANCE_METADATA) {
4063
+ describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4064
+ CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4065
+ }
4066
+
4067
+ if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4068
+ describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4069
+ CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4070
+ }
4071
+
4072
+#undef CHECK_APPEND_1ARG
4073
+
4074
+out_overflow:
4075
+
4076
+ if (size_bp < size_buf)
4077
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4078
+ btrfs_info(fs_info, "balance: %s %s",
4079
+ (bctl->flags & BTRFS_BALANCE_RESUME) ?
4080
+ "resume" : "start", buf);
4081
+
4082
+ kfree(buf);
37334083 }
37344084
37354085 /*
....@@ -3745,11 +4095,12 @@
37454095 int ret;
37464096 u64 num_devices;
37474097 unsigned seq;
3748
- bool reducing_integrity;
4098
+ bool reducing_redundancy;
4099
+ int i;
37494100
37504101 if (btrfs_fs_closing(fs_info) ||
37514102 atomic_read(&fs_info->balance_pause_req) ||
3752
- atomic_read(&fs_info->balance_cancel_req)) {
4103
+ btrfs_should_cancel_balance(fs_info)) {
37534104 ret = -EINVAL;
37544105 goto out;
37554106 }
....@@ -3774,54 +4125,39 @@
37744125 }
37754126 }
37764127
3777
- num_devices = fs_info->fs_devices->num_devices;
3778
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
3779
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3780
- BUG_ON(num_devices < 1);
3781
- num_devices--;
3782
- }
3783
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
3784
- allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
3785
- if (num_devices > 1)
3786
- allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3787
- if (num_devices > 2)
3788
- allowed |= BTRFS_BLOCK_GROUP_RAID5;
3789
- if (num_devices > 3)
3790
- allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3791
- BTRFS_BLOCK_GROUP_RAID6);
3792
- if (validate_convert_profile(&bctl->data, allowed)) {
3793
- int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
4128
+ /*
4129
+ * rw_devices will not change at the moment, device add/delete/replace
4130
+ * are exclusive
4131
+ */
4132
+ num_devices = fs_info->fs_devices->rw_devices;
37944133
3795
- btrfs_err(fs_info,
3796
- "balance: invalid convert data profile %s",
3797
- get_raid_name(index));
3798
- ret = -EINVAL;
3799
- goto out;
3800
- }
3801
- if (validate_convert_profile(&bctl->meta, allowed)) {
3802
- int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
4134
+ /*
4135
+ * SINGLE profile on-disk has no profile bit, but in-memory we have a
4136
+ * special bit for it, to make it easier to distinguish. Thus we need
4137
+ * to set it manually, or balance would refuse the profile.
4138
+ */
4139
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4140
+ for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4141
+ if (num_devices >= btrfs_raid_array[i].devs_min)
4142
+ allowed |= btrfs_raid_array[i].bg_flag;
38034143
3804
- btrfs_err(fs_info,
3805
- "balance: invalid convert metadata profile %s",
3806
- get_raid_name(index));
3807
- ret = -EINVAL;
3808
- goto out;
3809
- }
3810
- if (validate_convert_profile(&bctl->sys, allowed)) {
3811
- int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
3812
-
3813
- btrfs_err(fs_info,
3814
- "balance: invalid convert system profile %s",
3815
- get_raid_name(index));
4144
+ if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4145
+ !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4146
+ !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
38164147 ret = -EINVAL;
38174148 goto out;
38184149 }
38194150
3820
- /* allow to reduce meta or sys integrity only if force set */
3821
- allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3822
- BTRFS_BLOCK_GROUP_RAID10 |
3823
- BTRFS_BLOCK_GROUP_RAID5 |
3824
- BTRFS_BLOCK_GROUP_RAID6;
4151
+ /*
4152
+ * Allow to reduce metadata or system integrity only if force set for
4153
+ * profiles with redundancy (copies, parity)
4154
+ */
4155
+ allowed = 0;
4156
+ for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4157
+ if (btrfs_raid_array[i].ncopies >= 2 ||
4158
+ btrfs_raid_array[i].tolerated_failures >= 1)
4159
+ allowed |= btrfs_raid_array[i].bg_flag;
4160
+ }
38254161 do {
38264162 seq = read_seqbegin(&fs_info->profiles_lock);
38274163
....@@ -3831,9 +4167,9 @@
38314167 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
38324168 (fs_info->avail_metadata_alloc_bits & allowed) &&
38334169 !(bctl->meta.target & allowed)))
3834
- reducing_integrity = true;
4170
+ reducing_redundancy = true;
38354171 else
3836
- reducing_integrity = false;
4172
+ reducing_redundancy = false;
38374173
38384174 /* if we're not converting, the target field is uninitialized */
38394175 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
....@@ -3842,13 +4178,13 @@
38424178 bctl->data.target : fs_info->avail_data_alloc_bits;
38434179 } while (read_seqretry(&fs_info->profiles_lock, seq));
38444180
3845
- if (reducing_integrity) {
4181
+ if (reducing_redundancy) {
38464182 if (bctl->flags & BTRFS_BALANCE_FORCE) {
38474183 btrfs_info(fs_info,
3848
- "balance: force reducing metadata integrity");
4184
+ "balance: force reducing metadata redundancy");
38494185 } else {
38504186 btrfs_err(fs_info,
3851
- "balance: reduces metadata integrity, use --force if you want this");
4187
+ "balance: reduces metadata redundancy, use --force if you want this");
38524188 ret = -EINVAL;
38534189 goto out;
38544190 }
....@@ -3856,12 +4192,18 @@
38564192
38574193 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
38584194 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
3859
- int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
3860
- int data_index = btrfs_bg_flags_to_raid_index(data_target);
3861
-
38624195 btrfs_warn(fs_info,
38634196 "balance: metadata profile %s has lower redundancy than data profile %s",
3864
- get_raid_name(meta_index), get_raid_name(data_index));
4197
+ btrfs_bg_type_to_raid_name(meta_target),
4198
+ btrfs_bg_type_to_raid_name(data_target));
4199
+ }
4200
+
4201
+ if (fs_info->send_in_progress) {
4202
+ btrfs_warn_rl(fs_info,
4203
+"cannot run balance while send operations are in progress (%d in progress)",
4204
+ fs_info->send_in_progress);
4205
+ ret = -EAGAIN;
4206
+ goto out;
38654207 }
38664208
38674209 ret = insert_balance_item(fs_info, bctl);
....@@ -3883,11 +4225,34 @@
38834225
38844226 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
38854227 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4228
+ describe_balance_start_or_resume(fs_info);
38864229 mutex_unlock(&fs_info->balance_mutex);
38874230
38884231 ret = __btrfs_balance(fs_info);
38894232
38904233 mutex_lock(&fs_info->balance_mutex);
4234
+ if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4235
+ btrfs_info(fs_info, "balance: paused");
4236
+ /*
4237
+ * Balance can be canceled by:
4238
+ *
4239
+ * - Regular cancel request
4240
+ * Then ret == -ECANCELED and balance_cancel_req > 0
4241
+ *
4242
+ * - Fatal signal to "btrfs" process
4243
+	 *   Either the signal is caught by wait_reserve_ticket() and callers
4244
+	 *   get -EINTR, or it is caught by btrfs_should_cancel_balance() and
4245
+	 *   they get -ECANCELED.
4246
+ * Either way, in this case balance_cancel_req = 0, and
4247
+ * ret == -EINTR or ret == -ECANCELED.
4248
+ *
4249
+ * So here we only check the return value to catch canceled balance.
4250
+ */
4251
+ else if (ret == -ECANCELED || ret == -EINTR)
4252
+ btrfs_info(fs_info, "balance: canceled");
4253
+ else
4254
+ btrfs_info(fs_info, "balance: ended with status: %d", ret);
4255
+
38914256 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
38924257
38934258 if (bargs) {
....@@ -3898,7 +4263,7 @@
38984263 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
38994264 balance_need_close(fs_info)) {
39004265 reset_balance_state(fs_info);
3901
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4266
+ btrfs_exclop_finish(fs_info);
39024267 }
39034268
39044269 wake_up(&fs_info->balance_wait_q);
....@@ -3909,7 +4274,7 @@
39094274 reset_balance_state(fs_info);
39104275 else
39114276 kfree(bctl);
3912
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4277
+ btrfs_exclop_finish(fs_info);
39134278
39144279 return ret;
39154280 }
....@@ -3919,12 +4284,12 @@
39194284 struct btrfs_fs_info *fs_info = data;
39204285 int ret = 0;
39214286
4287
+ sb_start_write(fs_info->sb);
39224288 mutex_lock(&fs_info->balance_mutex);
3923
- if (fs_info->balance_ctl) {
3924
- btrfs_info(fs_info, "balance: resuming");
4289
+ if (fs_info->balance_ctl)
39254290 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
3926
- }
39274291 mutex_unlock(&fs_info->balance_mutex);
4292
+ sb_end_write(fs_info->sb);
39284293
39294294 return ret;
39304295 }
....@@ -4013,7 +4378,7 @@
40134378 * is in a paused state and must have fs_info::balance_ctl properly
40144379 * set up.
40154380 */
4016
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
4381
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
40174382 btrfs_warn(fs_info,
40184383 "balance: cannot set exclusive op status, resume manually");
40194384
....@@ -4097,19 +4462,18 @@
40974462
40984463 if (fs_info->balance_ctl) {
40994464 reset_balance_state(fs_info);
4100
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4465
+ btrfs_exclop_finish(fs_info);
41014466 btrfs_info(fs_info, "balance: canceled");
41024467 }
41034468 }
41044469
4105
- BUG_ON(fs_info->balance_ctl ||
4106
- test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4470
+ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
41074471 atomic_dec(&fs_info->balance_cancel_req);
41084472 mutex_unlock(&fs_info->balance_mutex);
41094473 return 0;
41104474 }
41114475
4112
-static int btrfs_uuid_scan_kthread(void *data)
4476
+int btrfs_uuid_scan_kthread(void *data)
41134477 {
41144478 struct btrfs_fs_info *fs_info = data;
41154479 struct btrfs_root *root = fs_info->tree_root;
....@@ -4121,6 +4485,7 @@
41214485 struct btrfs_root_item root_item;
41224486 u32 item_size;
41234487 struct btrfs_trans_handle *trans = NULL;
4488
+ bool closing = false;
41244489
41254490 path = btrfs_alloc_path();
41264491 if (!path) {
....@@ -4133,6 +4498,10 @@
41334498 key.offset = 0;
41344499
41354500 while (1) {
4501
+ if (btrfs_fs_closing(fs_info)) {
4502
+ closing = true;
4503
+ break;
4504
+ }
41364505 ret = btrfs_search_forward(root, &key, path,
41374506 BTRFS_OLDEST_GENERATION);
41384507 if (ret) {
....@@ -4233,74 +4602,10 @@
42334602 btrfs_end_transaction(trans);
42344603 if (ret)
42354604 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4236
- else
4605
+ else if (!closing)
42374606 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
42384607 up(&fs_info->uuid_tree_rescan_sem);
42394608 return 0;
4240
-}
4241
-
4242
-/*
4243
- * Callback for btrfs_uuid_tree_iterate().
4244
- * returns:
4245
- * 0 check succeeded, the entry is not outdated.
4246
- * < 0 if an error occurred.
4247
- * > 0 if the check failed, which means the caller shall remove the entry.
4248
- */
4249
-static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
4250
- u8 *uuid, u8 type, u64 subid)
4251
-{
4252
- struct btrfs_key key;
4253
- int ret = 0;
4254
- struct btrfs_root *subvol_root;
4255
-
4256
- if (type != BTRFS_UUID_KEY_SUBVOL &&
4257
- type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
4258
- goto out;
4259
-
4260
- key.objectid = subid;
4261
- key.type = BTRFS_ROOT_ITEM_KEY;
4262
- key.offset = (u64)-1;
4263
- subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
4264
- if (IS_ERR(subvol_root)) {
4265
- ret = PTR_ERR(subvol_root);
4266
- if (ret == -ENOENT)
4267
- ret = 1;
4268
- goto out;
4269
- }
4270
-
4271
- switch (type) {
4272
- case BTRFS_UUID_KEY_SUBVOL:
4273
- if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
4274
- ret = 1;
4275
- break;
4276
- case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
4277
- if (memcmp(uuid, subvol_root->root_item.received_uuid,
4278
- BTRFS_UUID_SIZE))
4279
- ret = 1;
4280
- break;
4281
- }
4282
-
4283
-out:
4284
- return ret;
4285
-}
4286
-
4287
-static int btrfs_uuid_rescan_kthread(void *data)
4288
-{
4289
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
4290
- int ret;
4291
-
4292
- /*
4293
- * 1st step is to iterate through the existing UUID tree and
4294
- * to delete all entries that contain outdated data.
4295
- * 2nd step is to add all missing entries to the UUID tree.
4296
- */
4297
- ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
4298
- if (ret < 0) {
4299
- btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
4300
- up(&fs_info->uuid_tree_rescan_sem);
4301
- return ret;
4302
- }
4303
- return btrfs_uuid_scan_kthread(data);
43044609 }
43054610
43064611 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
....@@ -4319,8 +4624,7 @@
43194624 if (IS_ERR(trans))
43204625 return PTR_ERR(trans);
43214626
4322
- uuid_root = btrfs_create_tree(trans, fs_info,
4323
- BTRFS_UUID_TREE_OBJECTID);
4627
+ uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
43244628 if (IS_ERR(uuid_root)) {
43254629 ret = PTR_ERR(uuid_root);
43264630 btrfs_abort_transaction(trans, ret);
....@@ -4346,22 +4650,6 @@
43464650 return 0;
43474651 }
43484652
4349
-int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
4350
-{
4351
- struct task_struct *task;
4352
-
4353
- down(&fs_info->uuid_tree_rescan_sem);
4354
- task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
4355
- if (IS_ERR(task)) {
4356
- /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4357
- btrfs_warn(fs_info, "failed to start uuid_rescan task");
4358
- up(&fs_info->uuid_tree_rescan_sem);
4359
- return PTR_ERR(task);
4360
- }
4361
-
4362
- return 0;
4363
-}
4364
-
43654653 /*
43664654 * shrinking a device means finding all of the device extents past
43674655 * the new size, and then following the back refs to the chunks.
....@@ -4380,15 +4668,16 @@
43804668 int slot;
43814669 int failed = 0;
43824670 bool retried = false;
4383
- bool checked_pending_chunks = false;
43844671 struct extent_buffer *l;
43854672 struct btrfs_key key;
43864673 struct btrfs_super_block *super_copy = fs_info->super_copy;
43874674 u64 old_total = btrfs_super_total_bytes(super_copy);
43884675 u64 old_size = btrfs_device_get_total_bytes(device);
43894676 u64 diff;
4677
+ u64 start;
43904678
43914679 new_size = round_down(new_size, fs_info->sectorsize);
4680
+ start = new_size;
43924681 diff = round_down(old_size - new_size, fs_info->sectorsize);
43934682
43944683 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
....@@ -4400,6 +4689,12 @@
44004689
44014690 path->reada = READA_BACK;
44024691
4692
+ trans = btrfs_start_transaction(root, 0);
4693
+ if (IS_ERR(trans)) {
4694
+ btrfs_free_path(path);
4695
+ return PTR_ERR(trans);
4696
+ }
4697
+
44034698 mutex_lock(&fs_info->chunk_mutex);
44044699
44054700 btrfs_device_set_total_bytes(device, new_size);
....@@ -4407,7 +4702,21 @@
44074702 device->fs_devices->total_rw_bytes -= diff;
44084703 atomic64_sub(diff, &fs_info->free_chunk_space);
44094704 }
4410
- mutex_unlock(&fs_info->chunk_mutex);
4705
+
4706
+ /*
4707
+ * Once the device's size has been set to the new size, ensure all
4708
+ * in-memory chunks are synced to disk so that the loop below sees them
4709
+ * and relocates them accordingly.
4710
+ */
4711
+ if (contains_pending_extent(device, &start, diff)) {
4712
+ mutex_unlock(&fs_info->chunk_mutex);
4713
+ ret = btrfs_commit_transaction(trans);
4714
+ if (ret)
4715
+ goto done;
4716
+ } else {
4717
+ mutex_unlock(&fs_info->chunk_mutex);
4718
+ btrfs_end_transaction(trans);
4719
+ }
44114720
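The branch above commits the running transaction only when contains_pending_extent() reports an allocation inside the range being trimmed off the device; otherwise ending the transaction is enough. As a rough user-space analogy of that overlap test (a plain array in place of the device's alloc_state extent-state tree, and without the start-cursor advance the real helper performs), the check amounts to:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct extent { uint64_t start; uint64_t len; };

/*
 * Return true if any allocated extent overlaps [start, start + len).
 * Hypothetical stand-in for contains_pending_extent(); the kernel walks
 * an extent-state tree rather than an array.
 */
static bool overlaps_pending(const struct extent *ext, size_t n,
			     uint64_t start, uint64_t len)
{
	for (size_t i = 0; i < n; i++) {
		if (ext[i].start < start + len &&
		    start < ext[i].start + ext[i].len)
			return true;
	}
	return false;
}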
44124721 again:
44134722 key.objectid = device->devid;
....@@ -4469,10 +4778,16 @@
44694778
44704779 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
44714780 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4472
- if (ret && ret != -ENOSPC)
4473
- goto done;
4474
- if (ret == -ENOSPC)
4781
+ if (ret == -ENOSPC) {
44754782 failed++;
4783
+ } else if (ret) {
4784
+ if (ret == -ETXTBSY) {
4785
+ btrfs_warn(fs_info,
4786
+ "could not shrink block group %llu due to active swapfile",
4787
+ chunk_offset);
4788
+ }
4789
+ goto done;
4790
+ }
44764791 } while (key.offset-- > 0);
44774792
44784793 if (failed && !retried) {
....@@ -4492,40 +4807,14 @@
44924807 }
44934808
44944809 mutex_lock(&fs_info->chunk_mutex);
4495
-
4496
- /*
4497
- * We checked in the above loop all device extents that were already in
4498
- * the device tree. However before we have updated the device's
4499
- * total_bytes to the new size, we might have had chunk allocations that
4500
- * have not complete yet (new block groups attached to transaction
4501
- * handles), and therefore their device extents were not yet in the
4502
- * device tree and we missed them in the loop above. So if we have any
4503
- * pending chunk using a device extent that overlaps the device range
4504
- * that we can not use anymore, commit the current transaction and
4505
- * repeat the search on the device tree - this way we guarantee we will
4506
- * not have chunks using device extents that end beyond 'new_size'.
4507
- */
4508
- if (!checked_pending_chunks) {
4509
- u64 start = new_size;
4510
- u64 len = old_size - new_size;
4511
-
4512
- if (contains_pending_extent(trans->transaction, device,
4513
- &start, len)) {
4514
- mutex_unlock(&fs_info->chunk_mutex);
4515
- checked_pending_chunks = true;
4516
- failed = 0;
4517
- retried = false;
4518
- ret = btrfs_commit_transaction(trans);
4519
- if (ret)
4520
- goto done;
4521
- goto again;
4522
- }
4523
- }
4810
+ /* Clear all state bits beyond the shrunk device size */
4811
+ clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4812
+ CHUNK_STATE_MASK);
45244813
45254814 btrfs_device_set_disk_total_bytes(device, new_size);
4526
- if (list_empty(&device->resized_list))
4527
- list_add_tail(&device->resized_list,
4528
- &fs_info->fs_devices->resized_devices);
4815
+ if (list_empty(&device->post_commit_list))
4816
+ list_add_tail(&device->post_commit_list,
4817
+ &trans->transaction->dev_update_list);
45294818
45304819 WARN_ON(diff > old_total);
45314820 btrfs_set_super_total_bytes(super_copy,
....@@ -4609,96 +4898,119 @@
46094898 btrfs_set_fs_incompat(info, RAID56);
46104899 }
46114900
4612
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4613
- u64 start, u64 type)
4901
+static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
46144902 {
4615
- struct btrfs_fs_info *info = trans->fs_info;
4616
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
4617
- struct btrfs_device *device;
4618
- struct map_lookup *map = NULL;
4619
- struct extent_map_tree *em_tree;
4620
- struct extent_map *em;
4621
- struct btrfs_device_info *devices_info = NULL;
4622
- u64 total_avail;
4623
- int num_stripes; /* total number of stripes to allocate */
4624
- int data_stripes; /* number of stripes that count for
4625
- block group size */
4626
- int sub_stripes; /* sub_stripes info for map */
4627
- int dev_stripes; /* stripes per dev */
4628
- int devs_max; /* max devs to use */
4629
- int devs_min; /* min devs needed */
4630
- int devs_increment; /* ndevs has to be a multiple of this */
4631
- int ncopies; /* how many copies to data has */
4632
- int ret;
4903
+ if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4904
+ return;
4905
+
4906
+ btrfs_set_fs_incompat(info, RAID1C34);
4907
+}
4908
+
4909
+/*
4910
+ * Structure used internally by btrfs_alloc_chunk() and its helpers.
4911
+ * Wraps the needed parameters.
4912
+ */
4913
+struct alloc_chunk_ctl {
4914
+ u64 start;
4915
+ u64 type;
4916
+ /* Total number of stripes to allocate */
4917
+ int num_stripes;
4918
+ /* sub_stripes info for map */
4919
+ int sub_stripes;
4920
+ /* Stripes per device */
4921
+ int dev_stripes;
4922
+ /* Maximum number of devices to use */
4923
+ int devs_max;
4924
+ /* Minimum number of devices to use */
4925
+ int devs_min;
4926
+ /* ndevs has to be a multiple of this */
4927
+ int devs_increment;
4928
+ /* Number of copies */
4929
+ int ncopies;
4930
+ /* Number of stripes worth of bytes to store parity information */
4931
+ int nparity;
46334932 u64 max_stripe_size;
46344933 u64 max_chunk_size;
4934
+ u64 dev_extent_min;
46354935 u64 stripe_size;
4636
- u64 num_bytes;
4936
+ u64 chunk_size;
46374937 int ndevs;
4638
- int i;
4639
- int j;
4640
- int index;
4938
+};
46414939
4642
- BUG_ON(!alloc_profile_is_valid(type, 0));
4643
-
4644
- if (list_empty(&fs_devices->alloc_list)) {
4645
- if (btrfs_test_opt(info, ENOSPC_DEBUG))
4646
- btrfs_debug(info, "%s: no writable device", __func__);
4647
- return -ENOSPC;
4648
- }
4649
-
4650
- index = btrfs_bg_flags_to_raid_index(type);
4651
-
4652
- sub_stripes = btrfs_raid_array[index].sub_stripes;
4653
- dev_stripes = btrfs_raid_array[index].dev_stripes;
4654
- devs_max = btrfs_raid_array[index].devs_max;
4655
- devs_min = btrfs_raid_array[index].devs_min;
4656
- devs_increment = btrfs_raid_array[index].devs_increment;
4657
- ncopies = btrfs_raid_array[index].ncopies;
4940
+static void init_alloc_chunk_ctl_policy_regular(
4941
+ struct btrfs_fs_devices *fs_devices,
4942
+ struct alloc_chunk_ctl *ctl)
4943
+{
4944
+ u64 type = ctl->type;
46584945
46594946 if (type & BTRFS_BLOCK_GROUP_DATA) {
4660
- max_stripe_size = SZ_1G;
4661
- max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4662
- if (!devs_max)
4663
- devs_max = BTRFS_MAX_DEVS(info);
4947
+ ctl->max_stripe_size = SZ_1G;
4948
+ ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
46644949 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4665
- /* for larger filesystems, use larger metadata chunks */
4950
+ /* For larger filesystems, use larger metadata chunks */
46664951 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4667
- max_stripe_size = SZ_1G;
4952
+ ctl->max_stripe_size = SZ_1G;
46684953 else
4669
- max_stripe_size = SZ_256M;
4670
- max_chunk_size = max_stripe_size;
4671
- if (!devs_max)
4672
- devs_max = BTRFS_MAX_DEVS(info);
4954
+ ctl->max_stripe_size = SZ_256M;
4955
+ ctl->max_chunk_size = ctl->max_stripe_size;
46734956 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4674
- max_stripe_size = SZ_32M;
4675
- max_chunk_size = 2 * max_stripe_size;
4676
- if (!devs_max)
4677
- devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4957
+ ctl->max_stripe_size = SZ_32M;
4958
+ ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4959
+ ctl->devs_max = min_t(int, ctl->devs_max,
4960
+ BTRFS_MAX_DEVS_SYS_CHUNK);
46784961 } else {
4679
- btrfs_err(info, "invalid chunk type 0x%llx requested",
4680
- type);
4681
- BUG_ON(1);
4962
+ BUG();
46824963 }
46834964
4684
- /* we don't want a chunk larger than 10% of writeable space */
4685
- max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4686
- max_chunk_size);
4965
+ /* We don't want a chunk larger than 10% of writable space */
4966
+ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4967
+ ctl->max_chunk_size);
4968
+ ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
4969
+}
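The 10% cap above works because div_factor(num, 1) evaluates to num / 10: the helper multiplies by the factor and divides by ten. A stand-alone sketch of the capping arithmetic, taking the 10 GiB value of BTRFS_MAX_DATA_CHUNK_SIZE as given:

#include <stdint.h>
#include <stdio.h>

/* Mirrors div_factor(): num * factor / 10 */
static uint64_t div_factor(uint64_t num, int factor)
{
	if (factor == 10)
		return num;
	return num * factor / 10;
}

int main(void)
{
	uint64_t total_rw_bytes = 4ULL << 40;	/* 4 TiB of writable space */
	uint64_t max_chunk_size = 10ULL << 30;	/* BTRFS_MAX_DATA_CHUNK_SIZE */
	uint64_t cap = div_factor(total_rw_bytes, 1);	/* ~409.6 GiB */

	if (cap < max_chunk_size)
		max_chunk_size = cap;
	/* Here the profile ceiling wins: 10 GiB */
	printf("max chunk: %llu bytes\n", (unsigned long long)max_chunk_size);
	return 0;
}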
46874970
4688
- devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
4689
- GFP_NOFS);
4690
- if (!devices_info)
4691
- return -ENOMEM;
4971
+static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
4972
+ struct alloc_chunk_ctl *ctl)
4973
+{
4974
+ int index = btrfs_bg_flags_to_raid_index(ctl->type);
4975
+
4976
+ ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
4977
+ ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
4978
+ ctl->devs_max = btrfs_raid_array[index].devs_max;
4979
+ if (!ctl->devs_max)
4980
+ ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
4981
+ ctl->devs_min = btrfs_raid_array[index].devs_min;
4982
+ ctl->devs_increment = btrfs_raid_array[index].devs_increment;
4983
+ ctl->ncopies = btrfs_raid_array[index].ncopies;
4984
+ ctl->nparity = btrfs_raid_array[index].nparity;
4985
+ ctl->ndevs = 0;
4986
+
4987
+ switch (fs_devices->chunk_alloc_policy) {
4988
+ case BTRFS_CHUNK_ALLOC_REGULAR:
4989
+ init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
4990
+ break;
4991
+ default:
4992
+ BUG();
4993
+ }
4994
+}
4995
+
4996
+static int gather_device_info(struct btrfs_fs_devices *fs_devices,
4997
+ struct alloc_chunk_ctl *ctl,
4998
+ struct btrfs_device_info *devices_info)
4999
+{
5000
+ struct btrfs_fs_info *info = fs_devices->fs_info;
5001
+ struct btrfs_device *device;
5002
+ u64 total_avail;
5003
+ u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5004
+ int ret;
5005
+ int ndevs = 0;
5006
+ u64 max_avail;
5007
+ u64 dev_offset;
46925008
46935009 /*
46945010 * in the first pass through the devices list, we gather information
46955011 * about the available holes on each device.
46965012 */
4697
- ndevs = 0;
46985013 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4699
- u64 max_avail;
4700
- u64 dev_offset;
4701
-
47025014 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
47035015 WARN(1, KERN_ERR
47045016 "BTRFS: read-only device in alloc_list\n");
....@@ -4716,24 +5028,23 @@
47165028 total_avail = 0;
47175029
47185030 /* If there is not enough space on this device, skip it. */
4719
- if (total_avail == 0)
5031
+ if (total_avail < ctl->dev_extent_min)
47205032 continue;
47215033
4722
- ret = find_free_dev_extent(trans, device,
4723
- max_stripe_size * dev_stripes,
4724
- &dev_offset, &max_avail);
5034
+ ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5035
+ &max_avail);
47255036 if (ret && ret != -ENOSPC)
4726
- goto error;
5037
+ return ret;
47275038
47285039 if (ret == 0)
4729
- max_avail = max_stripe_size * dev_stripes;
5040
+ max_avail = dev_extent_want;
47305041
4731
- if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
5042
+ if (max_avail < ctl->dev_extent_min) {
47325043 if (btrfs_test_opt(info, ENOSPC_DEBUG))
47335044 btrfs_debug(info,
4734
- "%s: devid %llu has no free space, have=%llu want=%u",
5045
+ "%s: devid %llu has no free space, have=%llu want=%llu",
47355046 __func__, device->devid, max_avail,
4736
- BTRFS_STRIPE_LEN * dev_stripes);
5047
+ ctl->dev_extent_min);
47375048 continue;
47385049 }
47395050
....@@ -4748,6 +5059,7 @@
47485059 devices_info[ndevs].dev = device;
47495060 ++ndevs;
47505061 }
5062
+ ctl->ndevs = ndevs;
47515063
47525064 /*
47535065 * now sort the devices by hole size / available space
....@@ -4755,20 +5067,14 @@
47555067 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
47565068 btrfs_cmp_device_info, NULL);
47575069
4758
- /* round down to number of usable stripes */
4759
- ndevs = round_down(ndevs, devs_increment);
5070
+ return 0;
5071
+}
47605072
4761
- if (ndevs < devs_min) {
4762
- ret = -ENOSPC;
4763
- if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
4764
- btrfs_debug(info,
4765
- "%s: not enough devices with free space: have=%d minimum required=%d",
4766
- __func__, ndevs, devs_min);
4767
- }
4768
- goto error;
4769
- }
4770
-
4771
- ndevs = min(ndevs, devs_max);
5073
+static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5074
+ struct btrfs_device_info *devices_info)
5075
+{
5076
+ /* Number of stripes that count for block group size */
5077
+ int data_stripes;
47725078
47735079 /*
47745080 * The primary goal is to maximize the number of stripes, so use as
....@@ -4777,109 +5083,148 @@
47775083 * The DUP profile stores more than one stripe per device, the
47785084 * max_avail is the total size so we have to adjust.
47795085 */
4780
- stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
4781
- num_stripes = ndevs * dev_stripes;
5086
+ ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5087
+ ctl->dev_stripes);
5088
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5089
+
5090
+ /* This will have to be fixed for RAID1 and RAID10 over more drives */
5091
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
47825092
47835093 /*
4784
- * this will have to be fixed for RAID1 and RAID10 over
4785
- * more drives
5094
+ * Use the number of data stripes to figure out how big this chunk is
5095
+ * really going to be in terms of logical address space, and compare
5096
+ * that answer with the max chunk size. If it's higher, we try to
5097
+ * reduce stripe_size.
47865098 */
4787
- data_stripes = num_stripes / ncopies;
4788
-
4789
- if (type & BTRFS_BLOCK_GROUP_RAID5)
4790
- data_stripes = num_stripes - 1;
4791
-
4792
- if (type & BTRFS_BLOCK_GROUP_RAID6)
4793
- data_stripes = num_stripes - 2;
4794
-
4795
- /*
4796
- * Use the number of data stripes to figure out how big this chunk
4797
- * is really going to be in terms of logical address space,
4798
- * and compare that answer with the max chunk size. If it's higher,
4799
- * we try to reduce stripe_size.
4800
- */
4801
- if (stripe_size * data_stripes > max_chunk_size) {
5099
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
48025100 /*
48035101 * Reduce stripe_size, round it up to a 16MB boundary again and
48045102 * then use it, unless it ends up being even bigger than the
48055103 * previous value we had already.
48065104 */
4807
- stripe_size = min(round_up(div_u64(max_chunk_size,
4808
- data_stripes), SZ_16M),
4809
- stripe_size);
5105
+ ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5106
+ data_stripes), SZ_16M),
5107
+ ctl->stripe_size);
48105108 }
48115109
4812
- /* align to BTRFS_STRIPE_LEN */
4813
- stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
5110
+ /* Align to BTRFS_STRIPE_LEN */
5111
+ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5112
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
48145113
4815
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4816
- if (!map) {
4817
- ret = -ENOMEM;
4818
- goto error;
5114
+ return 0;
5115
+}
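Worked numbers make the sizing easier to follow. A user-space sketch of the same arithmetic, assuming a hypothetical RAID6 data chunk over six devices whose smallest hole is 300 GiB (dev_stripes = 1, nparity = 2, ncopies = 1):

#include <stdint.h>
#include <stdio.h>

#define BTRFS_STRIPE_LEN	(64ULL * 1024)
#define SZ_16M			(16ULL * 1024 * 1024)

static uint64_t round_up_u64(uint64_t v, uint64_t a)
{
	return (v + a - 1) / a * a;
}

int main(void)
{
	int dev_stripes = 1, nparity = 2, ncopies = 1;
	int num_stripes = 6 * dev_stripes;			/* 6 */
	int data_stripes = (num_stripes - nparity) / ncopies;	/* 4 */
	uint64_t stripe_size = (300ULL << 30) / dev_stripes;	/* smallest hole */
	uint64_t max_chunk_size = 10ULL << 30;			/* data cap */

	if (stripe_size * data_stripes > max_chunk_size) {
		uint64_t reduced = round_up_u64(max_chunk_size / data_stripes,
						SZ_16M);	/* 2.5 GiB */
		if (reduced < stripe_size)
			stripe_size = reduced;
	}
	stripe_size = stripe_size / BTRFS_STRIPE_LEN * BTRFS_STRIPE_LEN;

	printf("stripe_size=%llu chunk_size=%llu\n",
	       (unsigned long long)stripe_size,
	       (unsigned long long)(stripe_size * data_stripes));
	return 0;
}

The printed chunk_size comes out to 10 GiB: four data stripes of 2.5 GiB each, with the two parity stripes consuming extra raw space on top.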
5116
+
5117
+static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5118
+ struct alloc_chunk_ctl *ctl,
5119
+ struct btrfs_device_info *devices_info)
5120
+{
5121
+ struct btrfs_fs_info *info = fs_devices->fs_info;
5122
+
5123
+ /*
5124
+	 * Round down to the number of usable stripes. devs_increment can be
5125
+	 * any number, so we can't use round_down(), which requires a power of
5126
+	 * 2; rounddown() is safe for any value.
5127
+ */
5128
+ ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5129
+
5130
+ if (ctl->ndevs < ctl->devs_min) {
5131
+ if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5132
+ btrfs_debug(info,
5133
+ "%s: not enough devices with free space: have=%d minimum required=%d",
5134
+ __func__, ctl->ndevs, ctl->devs_min);
5135
+ }
5136
+ return -ENOSPC;
48195137 }
4820
- map->num_stripes = num_stripes;
48215138
4822
- for (i = 0; i < ndevs; ++i) {
4823
- for (j = 0; j < dev_stripes; ++j) {
4824
- int s = i * dev_stripes + j;
5139
+ ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5140
+
5141
+ switch (fs_devices->chunk_alloc_policy) {
5142
+ case BTRFS_CHUNK_ALLOC_REGULAR:
5143
+ return decide_stripe_size_regular(ctl, devices_info);
5144
+ default:
5145
+ BUG();
5146
+ }
5147
+}
5148
+
5149
+static int create_chunk(struct btrfs_trans_handle *trans,
5150
+ struct alloc_chunk_ctl *ctl,
5151
+ struct btrfs_device_info *devices_info)
5152
+{
5153
+ struct btrfs_fs_info *info = trans->fs_info;
5154
+ struct map_lookup *map = NULL;
5155
+ struct extent_map_tree *em_tree;
5156
+ struct extent_map *em;
5157
+ u64 start = ctl->start;
5158
+ u64 type = ctl->type;
5159
+ int ret;
5160
+ int i;
5161
+ int j;
5162
+
5163
+ map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5164
+ if (!map)
5165
+ return -ENOMEM;
5166
+ map->num_stripes = ctl->num_stripes;
5167
+
5168
+ for (i = 0; i < ctl->ndevs; ++i) {
5169
+ for (j = 0; j < ctl->dev_stripes; ++j) {
5170
+ int s = i * ctl->dev_stripes + j;
48255171 map->stripes[s].dev = devices_info[i].dev;
48265172 map->stripes[s].physical = devices_info[i].dev_offset +
4827
- j * stripe_size;
5173
+ j * ctl->stripe_size;
48285174 }
48295175 }
48305176 map->stripe_len = BTRFS_STRIPE_LEN;
48315177 map->io_align = BTRFS_STRIPE_LEN;
48325178 map->io_width = BTRFS_STRIPE_LEN;
48335179 map->type = type;
4834
- map->sub_stripes = sub_stripes;
5180
+ map->sub_stripes = ctl->sub_stripes;
48355181
4836
- num_bytes = stripe_size * data_stripes;
4837
-
4838
- trace_btrfs_chunk_alloc(info, map, start, num_bytes);
5182
+ trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
48395183
48405184 em = alloc_extent_map();
48415185 if (!em) {
48425186 kfree(map);
4843
- ret = -ENOMEM;
4844
- goto error;
5187
+ return -ENOMEM;
48455188 }
48465189 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
48475190 em->map_lookup = map;
48485191 em->start = start;
4849
- em->len = num_bytes;
5192
+ em->len = ctl->chunk_size;
48505193 em->block_start = 0;
48515194 em->block_len = em->len;
4852
- em->orig_block_len = stripe_size;
5195
+ em->orig_block_len = ctl->stripe_size;
48535196
4854
- em_tree = &info->mapping_tree.map_tree;
5197
+ em_tree = &info->mapping_tree;
48555198 write_lock(&em_tree->lock);
48565199 ret = add_extent_mapping(em_tree, em, 0);
48575200 if (ret) {
48585201 write_unlock(&em_tree->lock);
48595202 free_extent_map(em);
4860
- goto error;
5203
+ return ret;
48615204 }
4862
-
4863
- list_add_tail(&em->list, &trans->transaction->pending_chunks);
4864
- refcount_inc(&em->refs);
48655205 write_unlock(&em_tree->lock);
48665206
4867
- ret = btrfs_make_block_group(trans, 0, type, start, num_bytes);
5207
+ ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
48685208 if (ret)
48695209 goto error_del_extent;
48705210
48715211 for (i = 0; i < map->num_stripes; i++) {
4872
- num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4873
- btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4874
- map->stripes[i].dev->has_pending_chunks = true;
5212
+ struct btrfs_device *dev = map->stripes[i].dev;
5213
+
5214
+ btrfs_device_set_bytes_used(dev,
5215
+ dev->bytes_used + ctl->stripe_size);
5216
+ if (list_empty(&dev->post_commit_list))
5217
+ list_add_tail(&dev->post_commit_list,
5218
+ &trans->transaction->dev_update_list);
48755219 }
48765220
4877
- atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
5221
+ atomic64_sub(ctl->stripe_size * map->num_stripes,
5222
+ &info->free_chunk_space);
48785223
48795224 free_extent_map(em);
48805225 check_raid56_incompat_flag(info, type);
5226
+ check_raid1c34_incompat_flag(info, type);
48815227
4882
- kfree(devices_info);
48835228 return 0;
48845229
48855230 error_del_extent:
....@@ -4891,13 +5236,68 @@
48915236 free_extent_map(em);
48925237 /* One for the tree reference */
48935238 free_extent_map(em);
4894
- /* One for the pending_chunks list reference */
4895
- free_extent_map(em);
4896
-error:
5239
+
5240
+ return ret;
5241
+}
5242
+
5243
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5244
+{
5245
+ struct btrfs_fs_info *info = trans->fs_info;
5246
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
5247
+ struct btrfs_device_info *devices_info = NULL;
5248
+ struct alloc_chunk_ctl ctl;
5249
+ int ret;
5250
+
5251
+ lockdep_assert_held(&info->chunk_mutex);
5252
+
5253
+ if (!alloc_profile_is_valid(type, 0)) {
5254
+ ASSERT(0);
5255
+ return -EINVAL;
5256
+ }
5257
+
5258
+ if (list_empty(&fs_devices->alloc_list)) {
5259
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
5260
+ btrfs_debug(info, "%s: no writable device", __func__);
5261
+ return -ENOSPC;
5262
+ }
5263
+
5264
+ if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5265
+ btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5266
+ ASSERT(0);
5267
+ return -EINVAL;
5268
+ }
5269
+
5270
+ ctl.start = find_next_chunk(info);
5271
+ ctl.type = type;
5272
+ init_alloc_chunk_ctl(fs_devices, &ctl);
5273
+
5274
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5275
+ GFP_NOFS);
5276
+ if (!devices_info)
5277
+ return -ENOMEM;
5278
+
5279
+ ret = gather_device_info(fs_devices, &ctl, devices_info);
5280
+ if (ret < 0)
5281
+ goto out;
5282
+
5283
+ ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5284
+ if (ret < 0)
5285
+ goto out;
5286
+
5287
+ ret = create_chunk(trans, &ctl, devices_info);
5288
+
5289
+out:
48975290 kfree(devices_info);
48985291 return ret;
48995292 }
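The two ASSERT gates in btrfs_alloc_chunk() only confirm that the requested flags form a plausible chunk type. A minimal sketch of the type-mask half of that validation, using the standard block group type bits and omitting the profile bits that alloc_profile_is_valid() additionally inspects:

#include <stdbool.h>
#include <stdint.h>

#define BTRFS_BLOCK_GROUP_DATA     (1ULL << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM   (1ULL << 1)
#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
#define BTRFS_BLOCK_GROUP_TYPE_MASK \
	(BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | \
	 BTRFS_BLOCK_GROUP_METADATA)

/* A chunk type must carry at least one of the three type bits */
static bool chunk_type_is_plausible(uint64_t type)
{
	return (type & BTRFS_BLOCK_GROUP_TYPE_MASK) != 0;
}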
49005293
5294
+/*
5295
+ * Chunk allocation falls into two parts. The first part does work
5296
+ * that makes the newly allocated chunk usable, but does not do any operation
5297
+ * that modifies the chunk tree. The second part does the work that
5298
+ * requires modifying the chunk tree. This division is important for the
5299
+ * bootstrap process of adding storage to a seed btrfs.
5300
+ */
49015301 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
49025302 u64 chunk_offset, u64 chunk_size)
49035303 {
....@@ -4916,7 +5316,7 @@
49165316 int i = 0;
49175317 int ret = 0;
49185318
4919
- em = get_chunk_map(fs_info, chunk_offset, chunk_size);
5319
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
49205320 if (IS_ERR(em))
49215321 return PTR_ERR(em);
49225322
....@@ -4996,57 +5396,27 @@
49965396 return ret;
49975397 }
49985398
4999
-/*
5000
- * Chunk allocation falls into two parts. The first part does works
5001
- * that make the new allocated chunk useable, but not do any operation
5002
- * that modifies the chunk tree. The second part does the works that
5003
- * require modifying the chunk tree. This division is important for the
5004
- * bootstrap process of adding storage to a seed btrfs.
5005
- */
5006
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5399
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
50075400 {
5008
- u64 chunk_offset;
5009
-
5010
- lockdep_assert_held(&trans->fs_info->chunk_mutex);
5011
- chunk_offset = find_next_chunk(trans->fs_info);
5012
- return __btrfs_alloc_chunk(trans, chunk_offset, type);
5013
-}
5014
-
5015
-static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
5016
- struct btrfs_fs_info *fs_info)
5017
-{
5018
- u64 chunk_offset;
5019
- u64 sys_chunk_offset;
5401
+ struct btrfs_fs_info *fs_info = trans->fs_info;
50205402 u64 alloc_profile;
50215403 int ret;
50225404
5023
- chunk_offset = find_next_chunk(fs_info);
50245405 alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5025
- ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
5406
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
50265407 if (ret)
50275408 return ret;
50285409
5029
- sys_chunk_offset = find_next_chunk(fs_info);
50305410 alloc_profile = btrfs_system_alloc_profile(fs_info);
5031
- ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
5411
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
50325412 return ret;
50335413 }
50345414
50355415 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
50365416 {
5037
- int max_errors;
5417
+ const int index = btrfs_bg_flags_to_raid_index(map->type);
50385418
5039
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5040
- BTRFS_BLOCK_GROUP_RAID10 |
5041
- BTRFS_BLOCK_GROUP_RAID5)) {
5042
- max_errors = 1;
5043
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5044
- max_errors = 2;
5045
- } else {
5046
- max_errors = 0;
5047
- }
5048
-
5049
- return max_errors;
5419
+ return btrfs_raid_array[index].tolerated_failures;
50505420 }
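The per-profile table replaces the old if/else ladder outright. A trimmed stand-in shows the shape of the lookup; the values follow the usual RAID failure tolerances, and the real btrfs_raid_array carries many more fields per entry:

#include <stdint.h>

enum raid_index { IDX_RAID10, IDX_RAID1, IDX_RAID5, IDX_RAID6, IDX_SINGLE,
		  NR_IDX };

/* Hypothetical cut-down table; only the tolerated-failures column */
static const int tolerated_failures[NR_IDX] = {
	[IDX_RAID10] = 1,
	[IDX_RAID1]  = 1,
	[IDX_RAID5]  = 1,	/* single parity */
	[IDX_RAID6]  = 2,	/* double parity */
	[IDX_SINGLE] = 0,
};

static int chunk_max_errors(enum raid_index index)
{
	return tolerated_failures[index];
}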
50515421
50525422 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
....@@ -5057,7 +5427,7 @@
50575427 int miss_ndevs = 0;
50585428 int i;
50595429
5060
- em = get_chunk_map(fs_info, chunk_offset, 1);
5430
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
50615431 if (IS_ERR(em))
50625432 return 1;
50635433
....@@ -5087,21 +5457,16 @@
50875457 return readonly;
50885458 }
50895459
5090
-void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
5091
-{
5092
- extent_map_tree_init(&tree->map_tree);
5093
-}
5094
-
5095
-void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
5460
+void btrfs_mapping_tree_free(struct extent_map_tree *tree)
50965461 {
50975462 struct extent_map *em;
50985463
50995464 while (1) {
5100
- write_lock(&tree->map_tree.lock);
5101
- em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
5465
+ write_lock(&tree->lock);
5466
+ em = lookup_extent_mapping(tree, 0, (u64)-1);
51025467 if (em)
5103
- remove_extent_mapping(&tree->map_tree, em);
5104
- write_unlock(&tree->map_tree.lock);
5468
+ remove_extent_mapping(tree, em);
5469
+ write_unlock(&tree->lock);
51055470 if (!em)
51065471 break;
51075472 /* once for us */
....@@ -5117,7 +5482,7 @@
51175482 struct map_lookup *map;
51185483 int ret;
51195484
5120
- em = get_chunk_map(fs_info, logical, len);
5485
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51215486 if (IS_ERR(em))
51225487 /*
51235488 * We could return errors for these cases, but that could get
....@@ -5128,7 +5493,7 @@
51285493 return 1;
51295494
51305495 map = em->map_lookup;
5131
- if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
5496
+ if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
51325497 ret = map->num_stripes;
51335498 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
51345499 ret = map->sub_stripes;
....@@ -5147,11 +5512,11 @@
51475512 ret = 1;
51485513 free_extent_map(em);
51495514
5150
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
5515
+ down_read(&fs_info->dev_replace.rwsem);
51515516 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
51525517 fs_info->dev_replace.tgtdev)
51535518 ret++;
5154
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
5519
+ up_read(&fs_info->dev_replace.rwsem);
51555520
51565521 return ret;
51575522 }
....@@ -5163,7 +5528,7 @@
51635528 struct map_lookup *map;
51645529 unsigned long len = fs_info->sectorsize;
51655530
5166
- em = get_chunk_map(fs_info, logical, len);
5531
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51675532
51685533 if (!WARN_ON(IS_ERR(em))) {
51695534 map = em->map_lookup;
....@@ -5180,7 +5545,7 @@
51805545 struct map_lookup *map;
51815546 int ret = 0;
51825547
5183
- em = get_chunk_map(fs_info, logical, len);
5548
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51845549
51855550 if(!WARN_ON(IS_ERR(em))) {
51865551 map = em->map_lookup;
....@@ -5202,7 +5567,7 @@
52025567 struct btrfs_device *srcdev;
52035568
52045569 ASSERT((map->type &
5205
- (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
5570
+ (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
52065571
52075572 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
52085573 num_stripes = map->sub_stripes;
....@@ -5240,31 +5605,19 @@
52405605 return preferred_mirror;
52415606 }
52425607
5243
-static inline int parity_smaller(u64 a, u64 b)
5244
-{
5245
- return a > b;
5246
-}
5247
-
52485608 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
52495609 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
52505610 {
5251
- struct btrfs_bio_stripe s;
52525611 int i;
5253
- u64 l;
52545612 int again = 1;
52555613
52565614 while (again) {
52575615 again = 0;
52585616 for (i = 0; i < num_stripes - 1; i++) {
5259
- if (parity_smaller(bbio->raid_map[i],
5260
- bbio->raid_map[i+1])) {
5261
- s = bbio->stripes[i];
5262
- l = bbio->raid_map[i];
5263
- bbio->stripes[i] = bbio->stripes[i+1];
5264
- bbio->raid_map[i] = bbio->raid_map[i+1];
5265
- bbio->stripes[i+1] = s;
5266
- bbio->raid_map[i+1] = l;
5267
-
5617
+ /* Swap if parity is on a smaller index */
5618
+ if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5619
+ swap(bbio->stripes[i], bbio->stripes[i + 1]);
5620
+ swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
52685621 again = 1;
52695622 }
52705623 }
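Parity slots in raid_map hold sentinel values (RAID5_P_STRIPE and RAID6_Q_STRIPE) that compare greater than any real logical address, which is why an ascending bubble sort pushes them to the back. A user-space rendering of the simplified loop, with a GCC-style SWAP macro standing in for the kernel's swap():

#include <stdint.h>

#define SWAP(a, b) \
	do { __typeof__(a) _t = (a); (a) = (b); (b) = _t; } while (0)

/* Bubble-sort stripes[] in lockstep with raid_map[]; parity ends up last */
static void sort_parity(uint64_t *raid_map, int *stripes, int n)
{
	int again = 1;

	while (again) {
		again = 0;
		for (int i = 0; i < n - 1; i++) {
			if (raid_map[i] > raid_map[i + 1]) {
				SWAP(stripes[i], stripes[i + 1]);
				SWAP(raid_map[i], raid_map[i + 1]);
				again = 1;
			}
		}
	}
}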
....@@ -5290,6 +5643,9 @@
52905643 atomic_set(&bbio->error, 0);
52915644 refcount_set(&bbio->refs, 1);
52925645
5646
+ bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5647
+ bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5648
+
52935649 return bbio;
52945650 }
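With tgtdev_map and raid_map now initialized unconditionally, all three variable-length arrays live behind the struct in a single allocation. A reduced user-space sketch of that tail layout; the types are pared down, and the casts mirror the kernel's, so 8-byte alignment of raid_map implicitly depends on the stripe counts:

#include <stdint.h>
#include <stdlib.h>

struct stripe { uint64_t physical; void *dev; };

struct bbio {
	int num_stripes;
	int *tgtdev_map;		/* points into the tail allocation */
	uint64_t *raid_map;		/* likewise */
	struct stripe stripes[];	/* flexible array, allocated inline */
};

static struct bbio *alloc_bbio(int total_stripes, int real_stripes)
{
	struct bbio *b = calloc(1, sizeof(*b) +
				sizeof(struct stripe) * total_stripes +
				sizeof(int) * real_stripes +
				sizeof(uint64_t) * total_stripes);

	if (!b)
		return NULL;
	b->num_stripes = total_stripes;
	b->tgtdev_map = (int *)(b->stripes + total_stripes);
	b->raid_map = (uint64_t *)(b->tgtdev_map + real_stripes);
	return b;
}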
52955651
....@@ -5313,12 +5669,13 @@
53135669 * replace.
53145670 */
53155671 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5316
- u64 logical, u64 length,
5672
+ u64 logical, u64 *length_ret,
53175673 struct btrfs_bio **bbio_ret)
53185674 {
53195675 struct extent_map *em;
53205676 struct map_lookup *map;
53215677 struct btrfs_bio *bbio;
5678
+ u64 length = *length_ret;
53225679 u64 offset;
53235680 u64 stripe_nr;
53245681 u64 stripe_nr_end;
....@@ -5339,7 +5696,7 @@
53395696 /* discard always returns a bbio */
53405697 ASSERT(bbio_ret);
53415698
5342
- em = get_chunk_map(fs_info, logical, length);
5699
+ em = btrfs_get_chunk_map(fs_info, logical, length);
53435700 if (IS_ERR(em))
53445701 return PTR_ERR(em);
53455702
....@@ -5351,7 +5708,8 @@
53515708 }
53525709
53535710 offset = logical - em->start;
5354
- length = min_t(u64, em->len - offset, length);
5711
+ length = min_t(u64, em->start + em->len - logical, length);
5712
+ *length_ret = length;
53555713
53565714 stripe_len = map->stripe_len;
53575715 /*
....@@ -5391,7 +5749,7 @@
53915749 &remaining_stripes);
53925750 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
53935751 last_stripe *= sub_stripes;
5394
- } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5752
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
53955753 BTRFS_BLOCK_GROUP_DUP)) {
53965754 num_stripes = map->num_stripes;
53975755 } else {
....@@ -5635,6 +5993,106 @@
56355993 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
56365994 }
56375995
5996
+/*
5997
+ * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
5998
+ * tuple. This information is used to calculate how big a
5999
+ * particular bio can get before it straddles a stripe.
6000
+ *
6001
+ * @fs_info - the filesystem
6002
+ * @logical - address that we want to figure out the geometry of
6003
+ * @len - the length of IO we are going to perform, starting at @logical
6004
+ * @op - type of operation - write or read
6005
+ * @io_geom - pointer used to return values
6006
+ *
6007
+ * Returns < 0 in case a chunk for the given logical address cannot be found,
6008
+ * which usually shouldn't happen unless @logical is corrupted; 0 otherwise.
6009
+ */
6010
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6011
+ u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
6012
+{
6013
+ struct extent_map *em;
6014
+ struct map_lookup *map;
6015
+ u64 offset;
6016
+ u64 stripe_offset;
6017
+ u64 stripe_nr;
6018
+ u64 stripe_len;
6019
+ u64 raid56_full_stripe_start = (u64)-1;
6020
+ int data_stripes;
6021
+ int ret = 0;
6022
+
6023
+ ASSERT(op != BTRFS_MAP_DISCARD);
6024
+
6025
+ em = btrfs_get_chunk_map(fs_info, logical, len);
6026
+ if (IS_ERR(em))
6027
+ return PTR_ERR(em);
6028
+
6029
+ map = em->map_lookup;
6030
+ /* Offset of this logical address in the chunk */
6031
+ offset = logical - em->start;
6032
+ /* Len of a stripe in a chunk */
6033
+ stripe_len = map->stripe_len;
6034
+	/* Stripe where this block falls in */
6035
+ stripe_nr = div64_u64(offset, stripe_len);
6036
+ /* Offset of stripe in the chunk */
6037
+ stripe_offset = stripe_nr * stripe_len;
6038
+ if (offset < stripe_offset) {
6039
+ btrfs_crit(fs_info,
6040
+"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
6041
+ stripe_offset, offset, em->start, logical, stripe_len);
6042
+ ret = -EINVAL;
6043
+ goto out;
6044
+ }
6045
+
6046
+ /* stripe_offset is the offset of this block in its stripe */
6047
+ stripe_offset = offset - stripe_offset;
6048
+ data_stripes = nr_data_stripes(map);
6049
+
6050
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6051
+ u64 max_len = stripe_len - stripe_offset;
6052
+
6053
+ /*
6054
+		 * In case of RAID56, we need to know the stripe-aligned start
6055
+ */
6056
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6057
+ unsigned long full_stripe_len = stripe_len * data_stripes;
6058
+ raid56_full_stripe_start = offset;
6059
+
6060
+ /*
6061
+ * Allow a write of a full stripe, but make sure we
6062
+ * don't allow straddling of stripes
6063
+ */
6064
+ raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6065
+ full_stripe_len);
6066
+ raid56_full_stripe_start *= full_stripe_len;
6067
+
6068
+ /*
6069
+ * For writes to RAID[56], allow a full stripeset across
6070
+ * all disks. For other RAID types and for RAID[56]
6071
+ * reads, just allow a single stripe (on a single disk).
6072
+ */
6073
+ if (op == BTRFS_MAP_WRITE) {
6074
+ max_len = stripe_len * data_stripes -
6075
+ (offset - raid56_full_stripe_start);
6076
+ }
6077
+ }
6078
+ len = min_t(u64, em->len - offset, max_len);
6079
+ } else {
6080
+ len = em->len - offset;
6081
+ }
6082
+
6083
+ io_geom->len = len;
6084
+ io_geom->offset = offset;
6085
+ io_geom->stripe_len = stripe_len;
6086
+ io_geom->stripe_nr = stripe_nr;
6087
+ io_geom->stripe_offset = stripe_offset;
6088
+ io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6089
+
6090
+out:
6091
+ /* once for us */
6092
+ free_extent_map(em);
6093
+ return ret;
6094
+}
6095
+
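A worked example of the geometry math, assuming a plain striped (non-RAID56) chunk with 64 KiB stripes and an I/O that starts 300 KiB into the chunk:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stripe_len = 64 * 1024;	/* BTRFS_STRIPE_LEN */
	uint64_t offset = 300 * 1024;		/* logical - em->start */

	uint64_t stripe_nr = offset / stripe_len;		/* 4 */
	uint64_t stripe_start = stripe_nr * stripe_len;		/* 256 KiB */
	uint64_t stripe_offset = offset - stripe_start;		/* 44 KiB */
	uint64_t max_len = stripe_len - stripe_offset;		/* 20 KiB */

	/* A bio starting here may span at most 20 KiB without straddling */
	printf("stripe_nr=%llu stripe_offset=%llu max_len=%llu\n",
	       (unsigned long long)stripe_nr,
	       (unsigned long long)stripe_offset,
	       (unsigned long long)max_len);
	return 0;
}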
56386096 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
56396097 enum btrfs_map_op op,
56406098 u64 logical, u64 *length,
....@@ -5643,11 +6101,11 @@
56436101 {
56446102 struct extent_map *em;
56456103 struct map_lookup *map;
5646
- u64 offset;
56476104 u64 stripe_offset;
56486105 u64 stripe_nr;
56496106 u64 stripe_len;
56506107 u32 stripe_index;
6108
+ int data_stripes;
56516109 int i;
56526110 int ret = 0;
56536111 int num_stripes;
....@@ -5660,81 +6118,34 @@
56606118 int patch_the_first_stripe_for_dev_replace = 0;
56616119 u64 physical_to_patch_in_first_stripe = 0;
56626120 u64 raid56_full_stripe_start = (u64)-1;
6121
+ struct btrfs_io_geometry geom;
56636122
5664
- if (op == BTRFS_MAP_DISCARD)
5665
- return __btrfs_map_block_for_discard(fs_info, logical,
5666
- *length, bbio_ret);
6123
+ ASSERT(bbio_ret);
6124
+ ASSERT(op != BTRFS_MAP_DISCARD);
56676125
5668
- em = get_chunk_map(fs_info, logical, *length);
5669
- if (IS_ERR(em))
5670
- return PTR_ERR(em);
6126
+ ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
6127
+ if (ret < 0)
6128
+ return ret;
56716129
6130
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
6131
+ ASSERT(!IS_ERR(em));
56726132 map = em->map_lookup;
5673
- offset = logical - em->start;
56746133
5675
- stripe_len = map->stripe_len;
5676
- stripe_nr = offset;
5677
- /*
5678
- * stripe_nr counts the total number of stripes we have to stride
5679
- * to get to this block
5680
- */
5681
- stripe_nr = div64_u64(stripe_nr, stripe_len);
6134
+ *length = geom.len;
6135
+ stripe_len = geom.stripe_len;
6136
+ stripe_nr = geom.stripe_nr;
6137
+ stripe_offset = geom.stripe_offset;
6138
+ raid56_full_stripe_start = geom.raid56_stripe_offset;
6139
+ data_stripes = nr_data_stripes(map);
56826140
5683
- stripe_offset = stripe_nr * stripe_len;
5684
- if (offset < stripe_offset) {
5685
- btrfs_crit(fs_info,
5686
- "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
5687
- stripe_offset, offset, em->start, logical,
5688
- stripe_len);
5689
- free_extent_map(em);
5690
- return -EINVAL;
5691
- }
5692
-
5693
- /* stripe_offset is the offset of this block in its stripe*/
5694
- stripe_offset = offset - stripe_offset;
5695
-
5696
- /* if we're here for raid56, we need to know the stripe aligned start */
5697
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5698
- unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
5699
- raid56_full_stripe_start = offset;
5700
-
5701
- /* allow a write of a full stripe, but make sure we don't
5702
- * allow straddling of stripes
5703
- */
5704
- raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5705
- full_stripe_len);
5706
- raid56_full_stripe_start *= full_stripe_len;
5707
- }
5708
-
5709
- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5710
- u64 max_len;
5711
- /* For writes to RAID[56], allow a full stripeset across all disks.
5712
- For other RAID types and for RAID[56] reads, just allow a single
5713
- stripe (on a single disk). */
5714
- if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
5715
- (op == BTRFS_MAP_WRITE)) {
5716
- max_len = stripe_len * nr_data_stripes(map) -
5717
- (offset - raid56_full_stripe_start);
5718
- } else {
5719
- /* we limit the length of each bio to what fits in a stripe */
5720
- max_len = stripe_len - stripe_offset;
5721
- }
5722
- *length = min_t(u64, em->len - offset, max_len);
5723
- } else {
5724
- *length = em->len - offset;
5725
- }
5726
-
5727
- /* This is for when we're called from btrfs_merge_bio_hook() and all
5728
- it cares about is the length */
5729
- if (!bbio_ret)
5730
- goto out;
5731
-
5732
- btrfs_dev_replace_read_lock(dev_replace);
6141
+ down_read(&dev_replace->rwsem);
57336142 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6143
+ /*
6144
+ * Hold the semaphore for read during the whole operation, write is
6145
+ * requested at commit time but must wait.
6146
+ */
57346147 if (!dev_replace_is_ongoing)
5735
- btrfs_dev_replace_read_unlock(dev_replace);
5736
- else
5737
- btrfs_dev_replace_set_lock_blocking(dev_replace);
6148
+ up_read(&dev_replace->rwsem);
57386149
57396150 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
57406151 !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
....@@ -5757,7 +6168,7 @@
57576168 &stripe_index);
57586169 if (!need_full_stripe(op))
57596170 mirror_num = 1;
5760
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
6171
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
57616172 if (need_full_stripe(op))
57626173 num_stripes = map->num_stripes;
57636174 else if (mirror_num)
....@@ -5799,7 +6210,7 @@
57996210 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
58006211 /* push stripe_nr back to the start of the full stripe */
58016212 stripe_nr = div64_u64(raid56_full_stripe_start,
5802
- stripe_len * nr_data_stripes(map));
6213
+ stripe_len * data_stripes);
58036214
58046215 /* RAID[56] write or recovery. Return all stripes */
58056216 num_stripes = map->num_stripes;
....@@ -5815,10 +6226,9 @@
58156226 * Mirror #3 is RAID6 Q block.
58166227 */
58176228 stripe_nr = div_u64_rem(stripe_nr,
5818
- nr_data_stripes(map), &stripe_index);
6229
+ data_stripes, &stripe_index);
58196230 if (mirror_num > 1)
5820
- stripe_index = nr_data_stripes(map) +
5821
- mirror_num - 2;
6231
+ stripe_index = data_stripes + mirror_num - 2;
58226232
58236233 /* We distribute the parity blocks across stripes */
58246234 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
....@@ -5858,8 +6268,13 @@
58586268 ret = -ENOMEM;
58596269 goto out;
58606270 }
5861
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
5862
- bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
6271
+
6272
+ for (i = 0; i < num_stripes; i++) {
6273
+ bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6274
+ stripe_offset + stripe_nr * map->stripe_len;
6275
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6276
+ stripe_index++;
6277
+ }
58636278
58646279 /* build raid_map */
58656280 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
....@@ -5867,17 +6282,12 @@
58676282 u64 tmp;
58686283 unsigned rot;
58696284
5870
- bbio->raid_map = (u64 *)((void *)bbio->stripes +
5871
- sizeof(struct btrfs_bio_stripe) *
5872
- num_alloc_stripes +
5873
- sizeof(int) * tgtdev_indexes);
5874
-
58756285 /* Work out the disk rotation on this stripe-set */
58766286 div_u64_rem(stripe_nr, num_stripes, &rot);
58776287
58786288 /* Fill in the logical address of each stripe */
5879
- tmp = stripe_nr * nr_data_stripes(map);
5880
- for (i = 0; i < nr_data_stripes(map); i++)
6289
+ tmp = stripe_nr * data_stripes;
6290
+ for (i = 0; i < data_stripes; i++)
58816291 bbio->raid_map[(i+rot) % num_stripes] =
58826292 em->start + (tmp + i) * map->stripe_len;
58836293
....@@ -5885,24 +6295,12 @@
58856295 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
58866296 bbio->raid_map[(i+rot+1) % num_stripes] =
58876297 RAID6_Q_STRIPE;
5888
- }
58896298
5890
-
5891
- for (i = 0; i < num_stripes; i++) {
5892
- bbio->stripes[i].physical =
5893
- map->stripes[stripe_index].physical +
5894
- stripe_offset +
5895
- stripe_nr * map->stripe_len;
5896
- bbio->stripes[i].dev =
5897
- map->stripes[stripe_index].dev;
5898
- stripe_index++;
6299
+ sort_parity_stripes(bbio, num_stripes);
58996300 }
59006301
59016302 if (need_full_stripe(op))
59026303 max_errors = btrfs_chunk_max_errors(map);
5903
-
5904
- if (bbio->raid_map)
5905
- sort_parity_stripes(bbio, num_stripes);
59066304
59076305 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
59086306 need_full_stripe(op)) {
....@@ -5929,8 +6327,9 @@
59296327 }
59306328 out:
59316329 if (dev_replace_is_ongoing) {
5932
- btrfs_dev_replace_clear_lock_blocking(dev_replace);
5933
- btrfs_dev_replace_read_unlock(dev_replace);
6330
+ lockdep_assert_held(&dev_replace->rwsem);
6331
+ /* Unlock and let waiting writers proceed */
6332
+ up_read(&dev_replace->rwsem);
59346333 }
59356334 free_extent_map(em);
59366335 return ret;
....@@ -5940,6 +6339,10 @@
59406339 u64 logical, u64 *length,
59416340 struct btrfs_bio **bbio_ret, int mirror_num)
59426341 {
6342
+ if (op == BTRFS_MAP_DISCARD)
6343
+ return __btrfs_map_block_for_discard(fs_info, logical,
6344
+ length, bbio_ret);
6345
+
59436346 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
59446347 mirror_num, 0);
59456348 }
....@@ -5950,75 +6353,6 @@
59506353 struct btrfs_bio **bbio_ret)
59516354 {
59526355 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
5953
-}
5954
-
5955
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
5956
- u64 physical, u64 **logical, int *naddrs, int *stripe_len)
5957
-{
5958
- struct extent_map *em;
5959
- struct map_lookup *map;
5960
- u64 *buf;
5961
- u64 bytenr;
5962
- u64 length;
5963
- u64 stripe_nr;
5964
- u64 rmap_len;
5965
- int i, j, nr = 0;
5966
-
5967
- em = get_chunk_map(fs_info, chunk_start, 1);
5968
- if (IS_ERR(em))
5969
- return -EIO;
5970
-
5971
- map = em->map_lookup;
5972
- length = em->len;
5973
- rmap_len = map->stripe_len;
5974
-
5975
- if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5976
- length = div_u64(length, map->num_stripes / map->sub_stripes);
5977
- else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5978
- length = div_u64(length, map->num_stripes);
5979
- else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5980
- length = div_u64(length, nr_data_stripes(map));
5981
- rmap_len = map->stripe_len * nr_data_stripes(map);
5982
- }
5983
-
5984
- buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
5985
- BUG_ON(!buf); /* -ENOMEM */
5986
-
5987
- for (i = 0; i < map->num_stripes; i++) {
5988
- if (map->stripes[i].physical > physical ||
5989
- map->stripes[i].physical + length <= physical)
5990
- continue;
5991
-
5992
- stripe_nr = physical - map->stripes[i].physical;
5993
- stripe_nr = div64_u64(stripe_nr, map->stripe_len);
5994
-
5995
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5996
- stripe_nr = stripe_nr * map->num_stripes + i;
5997
- stripe_nr = div_u64(stripe_nr, map->sub_stripes);
5998
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5999
- stripe_nr = stripe_nr * map->num_stripes + i;
6000
- } /* else if RAID[56], multiply by nr_data_stripes().
6001
- * Alternatively, just use rmap_len below instead of
6002
- * map->stripe_len */
6003
-
6004
- bytenr = chunk_start + stripe_nr * rmap_len;
6005
- WARN_ON(nr >= map->num_stripes);
6006
- for (j = 0; j < nr; j++) {
6007
- if (buf[j] == bytenr)
6008
- break;
6009
- }
6010
- if (j == nr) {
6011
- WARN_ON(nr >= map->num_stripes);
6012
- buf[nr++] = bytenr;
6013
- }
6014
- }
6015
-
6016
- *logical = buf;
6017
- *naddrs = nr;
6018
- *stripe_len = rmap_len;
6019
-
6020
- free_extent_map(em);
6021
- return 0;
60226356 }
60236357
60246358 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
....@@ -6039,23 +6373,18 @@
60396373 atomic_inc(&bbio->error);
60406374 if (bio->bi_status == BLK_STS_IOERR ||
60416375 bio->bi_status == BLK_STS_TARGET) {
6042
- unsigned int stripe_index =
6043
- btrfs_io_bio(bio)->stripe_index;
6044
- struct btrfs_device *dev;
6376
+ struct btrfs_device *dev = btrfs_io_bio(bio)->device;
60456377
6046
- BUG_ON(stripe_index >= bbio->num_stripes);
6047
- dev = bbio->stripes[stripe_index].dev;
6048
- if (dev->bdev) {
6049
- if (bio_op(bio) == REQ_OP_WRITE)
6050
- btrfs_dev_stat_inc_and_print(dev,
6378
+ ASSERT(dev->bdev);
6379
+ if (bio_op(bio) == REQ_OP_WRITE)
6380
+ btrfs_dev_stat_inc_and_print(dev,
60516381 BTRFS_DEV_STAT_WRITE_ERRS);
6052
- else if (!(bio->bi_opf & REQ_RAHEAD))
6053
- btrfs_dev_stat_inc_and_print(dev,
6382
+ else if (!(bio->bi_opf & REQ_RAHEAD))
6383
+ btrfs_dev_stat_inc_and_print(dev,
60546384 BTRFS_DEV_STAT_READ_ERRS);
6055
- if (bio->bi_opf & REQ_PREFLUSH)
6056
- btrfs_dev_stat_inc_and_print(dev,
6385
+ if (bio->bi_opf & REQ_PREFLUSH)
6386
+ btrfs_dev_stat_inc_and_print(dev,
60576387 BTRFS_DEV_STAT_FLUSH_ERRS);
6058
- }
60596388 }
60606389 }
60616390
....@@ -6090,73 +6419,25 @@
60906419 }
60916420 }
60926421
6093
-/*
6094
- * see run_scheduled_bios for a description of why bios are collected for
6095
- * async submit.
6096
- *
6097
- * This will add one bio to the pending list for a device and make sure
6098
- * the work struct is scheduled.
6099
- */
6100
-static noinline void btrfs_schedule_bio(struct btrfs_device *device,
6101
- struct bio *bio)
6102
-{
6103
- struct btrfs_fs_info *fs_info = device->fs_info;
6104
- int should_queue = 1;
6105
- struct btrfs_pending_bios *pending_bios;
6106
-
6107
- /* don't bother with additional async steps for reads, right now */
6108
- if (bio_op(bio) == REQ_OP_READ) {
6109
- btrfsic_submit_bio(bio);
6110
- return;
6111
- }
6112
-
6113
- WARN_ON(bio->bi_next);
6114
- bio->bi_next = NULL;
6115
-
6116
- spin_lock(&device->io_lock);
6117
- if (op_is_sync(bio->bi_opf))
6118
- pending_bios = &device->pending_sync_bios;
6119
- else
6120
- pending_bios = &device->pending_bios;
6121
-
6122
- if (pending_bios->tail)
6123
- pending_bios->tail->bi_next = bio;
6124
-
6125
- pending_bios->tail = bio;
6126
- if (!pending_bios->head)
6127
- pending_bios->head = bio;
6128
- if (device->running_pending)
6129
- should_queue = 0;
6130
-
6131
- spin_unlock(&device->io_lock);
6132
-
6133
- if (should_queue)
6134
- btrfs_queue_work(fs_info->submit_workers, &device->work);
6135
-}
6136
-
61376422 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6138
- u64 physical, int dev_nr, int async)
6423
+ u64 physical, struct btrfs_device *dev)
61396424 {
6140
- struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
61416425 struct btrfs_fs_info *fs_info = bbio->fs_info;
61426426
61436427 bio->bi_private = bbio;
6144
- btrfs_io_bio(bio)->stripe_index = dev_nr;
6428
+ btrfs_io_bio(bio)->device = dev;
61456429 bio->bi_end_io = btrfs_end_bio;
61466430 bio->bi_iter.bi_sector = physical >> 9;
61476431 btrfs_debug_in_rcu(fs_info,
61486432 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
61496433 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
6150
- (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
6151
- bio->bi_iter.bi_size);
6434
+ (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6435
+ dev->devid, bio->bi_iter.bi_size);
61526436 bio_set_dev(bio, dev->bdev);
61536437
61546438 btrfs_bio_counter_inc_noblocked(fs_info);
61556439
6156
- if (async)
6157
- btrfs_schedule_bio(dev, bio);
6158
- else
6159
- btrfsic_submit_bio(bio);
6440
+ btrfsic_submit_bio(bio);
61606441 }
61616442
61626443 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
....@@ -6177,7 +6458,7 @@
61776458 }
61786459
61796460 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6180
- int mirror_num, int async_submit)
6461
+ int mirror_num)
61816462 {
61826463 struct btrfs_device *dev;
61836464 struct bio *first_bio = bio;
....@@ -6245,8 +6526,7 @@
62456526 else
62466527 bio = first_bio;
62476528
6248
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
6249
- dev_nr, async_submit);
6529
+ submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
62506530 }
62516531 btrfs_bio_counter_dec(fs_info);
62526532 return BLK_STS_OK;
....@@ -6262,15 +6542,25 @@
62626542 * If @seed is true, traverse through the seed devices.
62636543 */
62646544 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6265
- u64 devid, u8 *uuid, u8 *fsid,
6266
- bool seed)
6545
+ u64 devid, u8 *uuid, u8 *fsid,
6546
+ bool seed)
62676547 {
62686548 struct btrfs_device *device;
6549
+ struct btrfs_fs_devices *seed_devs;
62696550
6270
- while (fs_devices) {
6551
+ if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6552
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
6553
+ if (device->devid == devid &&
6554
+ (!uuid || memcmp(device->uuid, uuid,
6555
+ BTRFS_UUID_SIZE) == 0))
6556
+ return device;
6557
+ }
6558
+ }
6559
+
6560
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
62716561 if (!fsid ||
6272
- !memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
6273
- list_for_each_entry(device, &fs_devices->devices,
6562
+ !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6563
+ list_for_each_entry(device, &seed_devs->devices,
62746564 dev_list) {
62756565 if (device->devid == devid &&
62766566 (!uuid || memcmp(device->uuid, uuid,
....@@ -6278,11 +6568,8 @@
62786568 return device;
62796569 }
62806570 }
6281
- if (seed)
6282
- fs_devices = fs_devices->seed;
6283
- else
6284
- return NULL;
62856571 }
6572
+
62866573 return NULL;
62876574 }
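The flat seed_list iteration replaces the old recursive hop through fs_devices->seed. A reduced sketch of the two-level search, matching on devid only where the real helper also compares the device uuid and the fsid:

#include <stddef.h>
#include <stdint.h>

struct device { uint64_t devid; struct device *next; };
struct devset { struct device *devices; struct devset *next_seed; };

/* Search the main set first, then every seed set on the flat list */
static struct device *find_device(struct devset *main_set, uint64_t devid)
{
	for (struct device *d = main_set->devices; d; d = d->next)
		if (d->devid == devid)
			return d;
	for (struct devset *s = main_set->next_seed; s; s = s->next_seed)
		for (struct device *d = s->devices; d; d = d->next)
			if (d->devid == devid)
				return d;
	return NULL;
}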
62886575
....@@ -6337,7 +6624,7 @@
63376624 if (WARN_ON(!devid && !fs_info))
63386625 return ERR_PTR(-EINVAL);
63396626
6340
- dev = __alloc_device();
6627
+ dev = __alloc_device(fs_info);
63416628 if (IS_ERR(dev))
63426629 return dev;
63436630
....@@ -6359,9 +6646,6 @@
63596646 else
63606647 generate_random_uuid(dev->uuid);
63616648
6362
- btrfs_init_work(&dev->work, btrfs_submit_helper,
6363
- pending_bios_fn, NULL, NULL);
6364
-
63656649 return dev;
63666650 }
63676651
....@@ -6376,11 +6660,26 @@
63766660 devid, uuid);
63776661 }
63786662
6379
-static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
6380
- struct extent_buffer *leaf,
6663
+static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6664
+{
6665
+ int index = btrfs_bg_flags_to_raid_index(type);
6666
+ int ncopies = btrfs_raid_array[index].ncopies;
6667
+ const int nparity = btrfs_raid_array[index].nparity;
6668
+ int data_stripes;
6669
+
6670
+ if (nparity)
6671
+ data_stripes = num_stripes - nparity;
6672
+ else
6673
+ data_stripes = num_stripes / ncopies;
6674
+
6675
+ return div_u64(chunk_len, data_stripes);
6676
+}
6677
+
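Two worked cases for the stripe-length recovery, restating the nparity-first, ncopies-otherwise arithmetic in stand-alone form:

#include <stdint.h>
#include <stdio.h>

static uint64_t calc_stripe_length(uint64_t chunk_len, int num_stripes,
				   int ncopies, int nparity)
{
	int data_stripes = nparity ? num_stripes - nparity
				   : num_stripes / ncopies;
	return chunk_len / data_stripes;
}

int main(void)
{
	/* RAID10: 4 stripes, 2 copies -> 2 data stripes, 512 MiB each */
	printf("%llu\n", (unsigned long long)
	       calc_stripe_length(1ULL << 30, 4, 2, 0));
	/* RAID6: 6 stripes, 2 parity -> 4 data stripes, 256 MiB each */
	printf("%llu\n", (unsigned long long)
	       calc_stripe_length(1ULL << 30, 6, 1, 2));
	return 0;
}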
6678
+static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
63816679 struct btrfs_chunk *chunk)
63826680 {
6383
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
6681
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
6682
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
63846683 struct map_lookup *map;
63856684 struct extent_map *em;
63866685 u64 logical;
....@@ -6400,14 +6699,14 @@
64006699 * as chunk item in tree block is already verified by tree-checker.
64016700 */
64026701 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6403
- ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
6702
+ ret = btrfs_check_chunk_valid(leaf, chunk, logical);
64046703 if (ret)
64056704 return ret;
64066705 }
64076706
6408
- read_lock(&map_tree->map_tree.lock);
6409
- em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
6410
- read_unlock(&map_tree->map_tree.lock);
6707
+ read_lock(&map_tree->lock);
6708
+ em = lookup_extent_mapping(map_tree, logical, 1);
6709
+ read_unlock(&map_tree->lock);
64116710
64126711 /* already mapped? */
64136712 if (em && em->start <= logical && em->start + em->len > logical) {
....@@ -6441,6 +6740,8 @@
64416740 map->type = btrfs_chunk_type(leaf, chunk);
64426741 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
64436742 map->verified_stripes = 0;
6743
+ em->orig_block_len = calc_stripe_length(map->type, em->len,
6744
+ map->num_stripes);
64446745 for (i = 0; i < num_stripes; i++) {
64456746 map->stripes[i].physical =
64466747 btrfs_stripe_offset_nr(leaf, chunk, i);
....@@ -6449,7 +6750,7 @@
64496750 btrfs_stripe_dev_uuid_nr(chunk, i),
64506751 BTRFS_UUID_SIZE);
64516752 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6452
- devid, uuid, NULL, true);
6753
+ devid, uuid, NULL, true);
64536754 if (!map->stripes[i].dev &&
64546755 !btrfs_test_opt(fs_info, DEGRADED)) {
64556756 free_extent_map(em);
....@@ -6474,9 +6775,9 @@
64746775
64756776 }
64766777
6477
- write_lock(&map_tree->map_tree.lock);
6478
- ret = add_extent_mapping(&map_tree->map_tree, em, 0);
6479
- write_unlock(&map_tree->map_tree.lock);
6778
+ write_lock(&map_tree->lock);
6779
+ ret = add_extent_mapping(map_tree, em, 0);
6780
+ write_unlock(&map_tree->lock);
64806781 if (ret < 0) {
64816782 btrfs_err(fs_info,
64826783 "failed to add chunk map, start=%llu len=%llu: %d",
....@@ -6519,28 +6820,30 @@
65196820 lockdep_assert_held(&uuid_mutex);
65206821 ASSERT(fsid);
65216822
6522
- fs_devices = fs_info->fs_devices->seed;
6523
- while (fs_devices) {
6823
+ /* This will match only for multi-device seed fs */
6824
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
65246825 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
65256826 return fs_devices;
65266827
6527
- fs_devices = fs_devices->seed;
6528
- }
65296828
6530
- fs_devices = find_fsid(fsid);
6829
+ fs_devices = find_fsid(fsid, NULL);
65316830 if (!fs_devices) {
65326831 if (!btrfs_test_opt(fs_info, DEGRADED))
65336832 return ERR_PTR(-ENOENT);
65346833
6535
- fs_devices = alloc_fs_devices(fsid);
6834
+ fs_devices = alloc_fs_devices(fsid, NULL);
65366835 if (IS_ERR(fs_devices))
65376836 return fs_devices;
65386837
6539
- fs_devices->seeding = 1;
6838
+ fs_devices->seeding = true;
65406839 fs_devices->opened = 1;
65416840 return fs_devices;
65426841 }
65436842
6843
+ /*
6844
+ * Upon first call for a seed fs fsid, just create a private copy of the
6845
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6846
+ */
65446847 fs_devices = clone_fs_devices(fs_devices);
65456848 if (IS_ERR(fs_devices))
65466849 return fs_devices;
....@@ -6548,27 +6851,24 @@
65486851 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
65496852 if (ret) {
65506853 free_fs_devices(fs_devices);
6551
- fs_devices = ERR_PTR(ret);
6552
- goto out;
6854
+ return ERR_PTR(ret);
65536855 }
65546856
65556857 if (!fs_devices->seeding) {
65566858 close_fs_devices(fs_devices);
65576859 free_fs_devices(fs_devices);
6558
- fs_devices = ERR_PTR(-EINVAL);
6559
- goto out;
6860
+ return ERR_PTR(-EINVAL);
65606861 }
65616862
6562
- fs_devices->seed = fs_info->fs_devices->seed;
6563
- fs_info->fs_devices->seed = fs_devices;
6564
-out:
6863
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
6864
+
65656865 return fs_devices;
65666866 }
65676867
6568
-static int read_one_dev(struct btrfs_fs_info *fs_info,
6569
- struct extent_buffer *leaf,
6868
+static int read_one_dev(struct extent_buffer *leaf,
65706869 struct btrfs_dev_item *dev_item)
65716870 {
6871
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
65726872 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
65736873 struct btrfs_device *device;
65746874 u64 devid;
....@@ -6582,7 +6882,7 @@
65826882 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
65836883 BTRFS_FSID_SIZE);
65846884
6585
- if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
6885
+ if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
65866886 fs_devices = open_seed_devices(fs_info, fs_uuid);
65876887 if (IS_ERR(fs_devices))
65886888 return PTR_ERR(fs_devices);
....@@ -6725,48 +7025,49 @@
67257025 sb_array_offset += len;
67267026 cur_offset += len;
67277027
6728
- if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6729
- chunk = (struct btrfs_chunk *)sb_array_offset;
6730
- /*
6731
- * At least one btrfs_chunk with one stripe must be
6732
- * present, exact stripe count check comes afterwards
6733
- */
6734
- len = btrfs_chunk_item_size(1);
6735
- if (cur_offset + len > array_size)
6736
- goto out_short_read;
6737
-
6738
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6739
- if (!num_stripes) {
6740
- btrfs_err(fs_info,
6741
- "invalid number of stripes %u in sys_array at offset %u",
6742
- num_stripes, cur_offset);
6743
- ret = -EIO;
6744
- break;
6745
- }
6746
-
6747
- type = btrfs_chunk_type(sb, chunk);
6748
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
6749
- btrfs_err(fs_info,
6750
- "invalid chunk type %llu in sys_array at offset %u",
6751
- type, cur_offset);
6752
- ret = -EIO;
6753
- break;
6754
- }
6755
-
6756
- len = btrfs_chunk_item_size(num_stripes);
6757
- if (cur_offset + len > array_size)
6758
- goto out_short_read;
6759
-
6760
- ret = read_one_chunk(fs_info, &key, sb, chunk);
6761
- if (ret)
6762
- break;
6763
- } else {
7028
+ if (key.type != BTRFS_CHUNK_ITEM_KEY) {
67647029 btrfs_err(fs_info,
67657030 "unexpected item type %u in sys_array at offset %u",
67667031 (u32)key.type, cur_offset);
67677032 ret = -EIO;
67687033 break;
67697034 }
7035
+
7036
+ chunk = (struct btrfs_chunk *)sb_array_offset;
7037
+ /*
7038
+	 * At least one btrfs_chunk with one stripe must be present;
7039
+	 * the exact stripe count check comes afterwards
7040
+ */
7041
+ len = btrfs_chunk_item_size(1);
7042
+ if (cur_offset + len > array_size)
7043
+ goto out_short_read;
7044
+
7045
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7046
+ if (!num_stripes) {
7047
+ btrfs_err(fs_info,
7048
+ "invalid number of stripes %u in sys_array at offset %u",
7049
+ num_stripes, cur_offset);
7050
+ ret = -EIO;
7051
+ break;
7052
+ }
7053
+
7054
+ type = btrfs_chunk_type(sb, chunk);
7055
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7056
+ btrfs_err(fs_info,
7057
+ "invalid chunk type %llu in sys_array at offset %u",
7058
+ type, cur_offset);
7059
+ ret = -EIO;
7060
+ break;
7061
+ }
7062
+
7063
+ len = btrfs_chunk_item_size(num_stripes);
7064
+ if (cur_offset + len > array_size)
7065
+ goto out_short_read;
7066
+
7067
+ ret = read_one_chunk(&key, sb, chunk);
7068
+ if (ret)
7069
+ break;
7070
+
67707071 array_ptr += len;
67717072 sb_array_offset += len;
67727073 cur_offset += len;
....@@ -6794,14 +7095,14 @@
67947095 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
67957096 struct btrfs_device *failing_dev)
67967097 {
6797
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
7098
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
67987099 struct extent_map *em;
67997100 u64 next_start = 0;
68007101 bool ret = true;
68017102
6802
- read_lock(&map_tree->map_tree.lock);
6803
- em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
6804
- read_unlock(&map_tree->map_tree.lock);
7103
+ read_lock(&map_tree->lock);
7104
+ em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7105
+ read_unlock(&map_tree->lock);
68057106 /* No chunk at all? Return false anyway */
68067107 if (!em) {
68077108 ret = false;
....@@ -6830,7 +7131,7 @@
68307131 if (missing > max_tolerated) {
68317132 if (!failing_dev)
68327133 btrfs_warn(fs_info,
6833
- "chunk %llu missing %d devices, max tolerance is %d for writeable mount",
7134
+ "chunk %llu missing %d devices, max tolerance is %d for writable mount",
68347135 em->start, missing, max_tolerated);
68357136 free_extent_map(em);
68367137 ret = false;
....@@ -6839,13 +7140,26 @@
68397140 next_start = extent_map_end(em);
68407141 free_extent_map(em);
68417142
6842
- read_lock(&map_tree->map_tree.lock);
6843
- em = lookup_extent_mapping(&map_tree->map_tree, next_start,
7143
+ read_lock(&map_tree->lock);
7144
+ em = lookup_extent_mapping(map_tree, next_start,
68447145 (u64)(-1) - next_start);
6845
- read_unlock(&map_tree->map_tree.lock);
7146
+ read_unlock(&map_tree->lock);
68467147 }
68477148 out:
68487149 return ret;
7150
+}
7151
+
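+/*
+ * Trigger readahead on every child block of @node so that the chunk tree
+ * scan in btrfs_read_chunk_tree() below finds the leaves already cached.
+ */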
7152
+static void readahead_tree_node_children(struct extent_buffer *node)
7153
+{
7154
+ int i;
7155
+ const int nr_items = btrfs_header_nritems(node);
7156
+
7157
+ for (i = 0; i < nr_items; i++) {
7158
+ u64 start;
7159
+
7160
+ start = btrfs_node_blockptr(node, i);
7161
+ readahead_tree_block(node->fs_info, start);
7162
+ }
68497163 }
68507164
68517165 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
....@@ -6858,6 +7172,7 @@
68587172 int ret;
68597173 int slot;
68607174 u64 total_dev = 0;
7175
+ u64 last_ra_node = 0;
68617176
68627177 path = btrfs_alloc_path();
68637178 if (!path)
....@@ -6868,7 +7183,6 @@
68687183 * otherwise we don't need it.
68697184 */
68707185 mutex_lock(&uuid_mutex);
6871
- mutex_lock(&fs_info->chunk_mutex);
68727186
68737187 /*
68747188 * It is possible for mount and umount to race in such a way that
....@@ -6891,6 +7205,8 @@
68917205 if (ret < 0)
68927206 goto error;
68937207 while (1) {
7208
+ struct extent_buffer *node;
7209
+
68947210 leaf = path->nodes[0];
68957211 slot = path->slots[0];
68967212 if (slot >= btrfs_header_nritems(leaf)) {
....@@ -6901,19 +7217,32 @@
69017217 goto error;
69027218 break;
69037219 }
7220
+ /*
7221
+	 * The nodes on level 1 are not locked, but we don't need locking
7222
+	 * during mount time as nothing else can access the tree
7223
+ */
7224
+ node = path->nodes[1];
7225
+ if (node) {
7226
+ if (last_ra_node != node->start) {
7227
+ readahead_tree_node_children(node);
7228
+ last_ra_node = node->start;
7229
+ }
7230
+ }
69047231 btrfs_item_key_to_cpu(leaf, &found_key, slot);
69057232 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
69067233 struct btrfs_dev_item *dev_item;
69077234 dev_item = btrfs_item_ptr(leaf, slot,
69087235 struct btrfs_dev_item);
6909
- ret = read_one_dev(fs_info, leaf, dev_item);
7236
+ ret = read_one_dev(leaf, dev_item);
69107237 if (ret)
69117238 goto error;
69127239 total_dev++;
69137240 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
69147241 struct btrfs_chunk *chunk;
69157242 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6916
- ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
7243
+ mutex_lock(&fs_info->chunk_mutex);
7244
+ ret = read_one_chunk(&found_key, leaf, chunk);
7245
+ mutex_unlock(&fs_info->chunk_mutex);
69177246 if (ret)
69187247 goto error;
69197248 }
....@@ -6925,12 +7254,12 @@
69257254 * do another round of validation checks.
69267255 */
69277256 if (total_dev != fs_info->fs_devices->total_devices) {
6928
- btrfs_err(fs_info,
6929
- "super_num_devices %llu mismatch with num_devices %llu found here",
7257
+ btrfs_warn(fs_info,
7258
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
69307259 btrfs_super_num_devices(fs_info->super_copy),
69317260 total_dev);
6932
- ret = -EINVAL;
6933
- goto error;
7261
+ fs_info->fs_devices->total_devices = total_dev;
7262
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
69347263 }
69357264 if (btrfs_super_total_bytes(fs_info->super_copy) <
69367265 fs_info->fs_devices->total_rw_bytes) {
....@@ -6943,7 +7272,6 @@
69437272 }
69447273 ret = 0;
69457274 error:
6946
- mutex_unlock(&fs_info->chunk_mutex);
69477275 mutex_unlock(&uuid_mutex);
69487276
69497277 btrfs_free_path(path);
....@@ -6952,86 +7280,117 @@
69527280
69537281 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
69547282 {
6955
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7283
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
69567284 struct btrfs_device *device;
69577285
6958
- while (fs_devices) {
6959
- mutex_lock(&fs_devices->device_list_mutex);
6960
- list_for_each_entry(device, &fs_devices->devices, dev_list)
6961
- device->fs_info = fs_info;
6962
- mutex_unlock(&fs_devices->device_list_mutex);
7286
+ fs_devices->fs_info = fs_info;
69637287
6964
- fs_devices = fs_devices->seed;
7288
+ mutex_lock(&fs_devices->device_list_mutex);
7289
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
7290
+ device->fs_info = fs_info;
7291
+
7292
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7293
+ list_for_each_entry(device, &seed_devs->devices, dev_list)
7294
+ device->fs_info = fs_info;
7295
+
7296
+ seed_devs->fs_info = fs_info;
69657297 }
7298
+ mutex_unlock(&fs_devices->device_list_mutex);
69667299 }
69677300
6968
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
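+/*
+ * Accessors for a single u64 counter of a btrfs_dev_stats_item, which
+ * stores BTRFS_DEV_STAT_VALUES_MAX little-endian values back to back.
+ */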
7301
+static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7302
+ const struct btrfs_dev_stats_item *ptr,
7303
+ int index)
69697304 {
6970
- int i;
7305
+ u64 val;
69717306
6972
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6973
- btrfs_dev_stat_reset(dev, i);
7307
+ read_extent_buffer(eb, &val,
7308
+ offsetof(struct btrfs_dev_stats_item, values) +
7309
+ ((unsigned long)ptr) + (index * sizeof(u64)),
7310
+ sizeof(val));
7311
+ return val;
7312
+}
7313
+
7314
+static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7315
+ struct btrfs_dev_stats_item *ptr,
7316
+ int index, u64 val)
7317
+{
7318
+ write_extent_buffer(eb, &val,
7319
+ offsetof(struct btrfs_dev_stats_item, values) +
7320
+ ((unsigned long)ptr) + (index * sizeof(u64)),
7321
+ sizeof(val));
7322
+}
7323
+
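+/*
+ * Load the persisted stats item of @device, or zero all counters when no
+ * item is found; the in-memory stats are marked valid in both cases.
+ */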
7324
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7325
+ struct btrfs_path *path)
7326
+{
7327
+ struct btrfs_dev_stats_item *ptr;
7328
+ struct extent_buffer *eb;
7329
+ struct btrfs_key key;
7330
+ int item_size;
7331
+ int i, ret, slot;
7332
+
7333
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
7334
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
7335
+ key.offset = device->devid;
7336
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7337
+ if (ret) {
7338
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7339
+ btrfs_dev_stat_set(device, i, 0);
7340
+ device->dev_stats_valid = 1;
7341
+ btrfs_release_path(path);
7342
+ return ret < 0 ? ret : 0;
7343
+ }
7344
+ slot = path->slots[0];
7345
+ eb = path->nodes[0];
7346
+ item_size = btrfs_item_size_nr(eb, slot);
7347
+
7348
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7349
+
7350
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7351
+ if (item_size >= (1 + i) * sizeof(__le64))
7352
+ btrfs_dev_stat_set(device, i,
7353
+ btrfs_dev_stats_value(eb, ptr, i));
7354
+ else
7355
+ btrfs_dev_stat_set(device, i, 0);
7356
+ }
7357
+
7358
+ device->dev_stats_valid = 1;
7359
+ btrfs_dev_stat_print_on_load(device);
7360
+ btrfs_release_path(path);
7361
+
7362
+ return 0;
69747363 }
69757364
69767365 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
69777366 {
6978
- struct btrfs_key key;
6979
- struct btrfs_key found_key;
6980
- struct btrfs_root *dev_root = fs_info->dev_root;
6981
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6982
- struct extent_buffer *eb;
6983
- int slot;
6984
- int ret = 0;
7367
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
69857368 struct btrfs_device *device;
69867369 struct btrfs_path *path = NULL;
6987
- int i;
7370
+ int ret = 0;
69887371
69897372 path = btrfs_alloc_path();
6990
- if (!path) {
6991
- ret = -ENOMEM;
6992
- goto out;
6993
- }
7373
+ if (!path)
7374
+ return -ENOMEM;
69947375
69957376 mutex_lock(&fs_devices->device_list_mutex);
69967377 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6997
- int item_size;
6998
- struct btrfs_dev_stats_item *ptr;
6999
-
7000
- key.objectid = BTRFS_DEV_STATS_OBJECTID;
7001
- key.type = BTRFS_PERSISTENT_ITEM_KEY;
7002
- key.offset = device->devid;
7003
- ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
7004
- if (ret) {
7005
- __btrfs_reset_dev_stats(device);
7006
- device->dev_stats_valid = 1;
7007
- btrfs_release_path(path);
7008
- continue;
7009
- }
7010
- slot = path->slots[0];
7011
- eb = path->nodes[0];
7012
- btrfs_item_key_to_cpu(eb, &found_key, slot);
7013
- item_size = btrfs_item_size_nr(eb, slot);
7014
-
7015
- ptr = btrfs_item_ptr(eb, slot,
7016
- struct btrfs_dev_stats_item);
7017
-
7018
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7019
- if (item_size >= (1 + i) * sizeof(__le64))
7020
- btrfs_dev_stat_set(device, i,
7021
- btrfs_dev_stats_value(eb, ptr, i));
7022
- else
7023
- btrfs_dev_stat_reset(device, i);
7024
- }
7025
-
7026
- device->dev_stats_valid = 1;
7027
- btrfs_dev_stat_print_on_load(device);
7028
- btrfs_release_path(path);
7378
+ ret = btrfs_device_init_dev_stats(device, path);
7379
+ if (ret)
7380
+ goto out;
70297381 }
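+	/* Repeat for the devices of every seed filesystem */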
7382
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7383
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
7384
+ ret = btrfs_device_init_dev_stats(device, path);
7385
+ if (ret)
7386
+ goto out;
7387
+ }
7388
+ }
7389
+out:
70307390 mutex_unlock(&fs_devices->device_list_mutex);
70317391
7032
-out:
70337392 btrfs_free_path(path);
7034
- return ret < 0 ? ret : 0;
7393
+ return ret;
70357394 }
70367395
70377396 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
....@@ -7102,9 +7461,9 @@
71027461 /*
71037462  * Called from commit_transaction. Writes all changed device stats to disk.
71047463 */
7105
-int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
7106
- struct btrfs_fs_info *fs_info)
7464
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
71077465 {
7466
+ struct btrfs_fs_info *fs_info = trans->fs_info;
71087467 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
71097468 struct btrfs_device *device;
71107469 int stats_cnt;
....@@ -7187,8 +7546,8 @@
71877546 int i;
71887547
71897548 mutex_lock(&fs_devices->device_list_mutex);
7190
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid,
7191
- NULL, NULL, true);
7549
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
7550
+ true);
71927551 mutex_unlock(&fs_devices->device_list_mutex);
71937552
71947553 if (!dev) {
....@@ -7203,7 +7562,7 @@
72037562 stats->values[i] =
72047563 btrfs_dev_stat_read_and_reset(dev, i);
72057564 else
7206
- btrfs_dev_stat_reset(dev, i);
7565
+ btrfs_dev_stat_set(dev, i, 0);
72077566 }
72087567 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
72097568 current->comm, task_pid_nr(current));
....@@ -7217,101 +7576,35 @@
72177576 return 0;
72187577 }
72197578
7220
-void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
7221
-{
7222
- struct buffer_head *bh;
7223
- struct btrfs_super_block *disk_super;
7224
- int copy_num;
7225
-
7226
- if (!bdev)
7227
- return;
7228
-
7229
- for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
7230
- copy_num++) {
7231
-
7232
- if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
7233
- continue;
7234
-
7235
- disk_super = (struct btrfs_super_block *)bh->b_data;
7236
-
7237
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
7238
- set_buffer_dirty(bh);
7239
- sync_dirty_buffer(bh);
7240
- brelse(bh);
7241
- }
7242
-
7243
- /* Notify udev that device has changed */
7244
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
7245
-
7246
- /* Update ctime/mtime for device path for libblkid */
7247
- update_dev_time(device_path);
7248
-}
7249
-
72507579 /*
7251
- * Update the size of all devices, which is used for writing out the
7252
- * super blocks.
7580
+ * Update the size and bytes used for each device where it changed. This is
7581
+ * delayed since we would otherwise get errors while writing out the
7582
+ * superblocks.
7583
+ *
7584
+ * Must be invoked during transaction commit.
72537585 */
7254
-void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
7586
+void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
72557587 {
7256
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
72577588 struct btrfs_device *curr, *next;
72587589
7259
- if (list_empty(&fs_devices->resized_devices))
7590
+ ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7591
+
7592
+ if (list_empty(&trans->dev_update_list))
72607593 return;
72617594
7262
- mutex_lock(&fs_devices->device_list_mutex);
7263
- mutex_lock(&fs_info->chunk_mutex);
7264
- list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
7265
- resized_list) {
7266
- list_del_init(&curr->resized_list);
7595
+ /*
7596
+ * We don't need the device_list_mutex here. This list is owned by the
7597
+ * transaction and the transaction must complete before the device is
7598
+ * released.
7599
+ */
7600
+ mutex_lock(&trans->fs_info->chunk_mutex);
7601
+ list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7602
+ post_commit_list) {
7603
+ list_del_init(&curr->post_commit_list);
72677604 curr->commit_total_bytes = curr->disk_total_bytes;
7605
+ curr->commit_bytes_used = curr->bytes_used;
72687606 }
7269
- mutex_unlock(&fs_info->chunk_mutex);
7270
- mutex_unlock(&fs_devices->device_list_mutex);
7271
-}
7272
-
7273
-/* Must be invoked during the transaction commit */
7274
-void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
7275
-{
7276
- struct btrfs_fs_info *fs_info = trans->fs_info;
7277
- struct extent_map *em;
7278
- struct map_lookup *map;
7279
- struct btrfs_device *dev;
7280
- int i;
7281
-
7282
- if (list_empty(&trans->pending_chunks))
7283
- return;
7284
-
7285
- /* In order to kick the device replace finish process */
7286
- mutex_lock(&fs_info->chunk_mutex);
7287
- list_for_each_entry(em, &trans->pending_chunks, list) {
7288
- map = em->map_lookup;
7289
-
7290
- for (i = 0; i < map->num_stripes; i++) {
7291
- dev = map->stripes[i].dev;
7292
- dev->commit_bytes_used = dev->bytes_used;
7293
- dev->has_pending_chunks = false;
7294
- }
7295
- }
7296
- mutex_unlock(&fs_info->chunk_mutex);
7297
-}
7298
-
7299
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
7300
-{
7301
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7302
- while (fs_devices) {
7303
- fs_devices->fs_info = fs_info;
7304
- fs_devices = fs_devices->seed;
7305
- }
7306
-}
7307
-
7308
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
7309
-{
7310
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7311
- while (fs_devices) {
7312
- fs_devices->fs_info = NULL;
7313
- fs_devices = fs_devices->seed;
7314
- }
7607
+ mutex_unlock(&trans->fs_info->chunk_mutex);
73157608 }
73167609
73177610 /*
....@@ -7319,38 +7612,18 @@
73197612 */
73207613 int btrfs_bg_type_to_factor(u64 flags)
73217614 {
7322
- if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
7323
- BTRFS_BLOCK_GROUP_RAID10))
7324
- return 2;
7325
- return 1;
7615
+ const int index = btrfs_bg_flags_to_raid_index(flags);
7616
+
7617
+ return btrfs_raid_array[index].ncopies;
73267618 }
73277619
73287620
7329
-static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
7330
-{
7331
- int index = btrfs_bg_flags_to_raid_index(type);
7332
- int ncopies = btrfs_raid_array[index].ncopies;
7333
- int data_stripes;
7334
-
7335
- switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
7336
- case BTRFS_BLOCK_GROUP_RAID5:
7337
- data_stripes = num_stripes - 1;
7338
- break;
7339
- case BTRFS_BLOCK_GROUP_RAID6:
7340
- data_stripes = num_stripes - 2;
7341
- break;
7342
- default:
7343
- data_stripes = num_stripes / ncopies;
7344
- break;
7345
- }
7346
- return div_u64(chunk_len, data_stripes);
7347
-}
73487621
73497622 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
73507623 u64 chunk_offset, u64 devid,
73517624 u64 physical_offset, u64 physical_len)
73527625 {
7353
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
7626
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
73547627 struct extent_map *em;
73557628 struct map_lookup *map;
73567629 struct btrfs_device *dev;
....@@ -7414,8 +7687,11 @@
74147687
74157688 /* It's possible this device is a dummy for seed device */
74167689 if (dev->disk_total_bytes == 0) {
7417
- dev = btrfs_find_device(fs_info->fs_devices->seed, devid,
7418
- NULL, NULL, false);
7690
+ struct btrfs_fs_devices *devs;
7691
+
7692
+ devs = list_first_entry(&fs_info->fs_devices->seed_list,
7693
+ struct btrfs_fs_devices, seed_list);
7694
+ dev = btrfs_find_device(devs, devid, NULL, NULL, false);
74197695 if (!dev) {
74207696 btrfs_err(fs_info, "failed to find seed devid %llu",
74217697 devid);
....@@ -7439,13 +7715,13 @@
74397715
74407716 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
74417717 {
7442
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
7718
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
74437719 struct extent_map *em;
74447720 struct rb_node *node;
74457721 int ret = 0;
74467722
74477723 read_lock(&em_tree->lock);
7448
- for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
7724
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
74497725 em = rb_entry(node, struct extent_map, rb_node);
74507726 if (em->map_lookup->num_stripes !=
74517727 em->map_lookup->verified_stripes) {
....@@ -7551,3 +7827,27 @@
75517827 btrfs_free_path(path);
75527828 return ret;
75537829 }
7830
+
7831
+/*
7832
+ * Check whether the given block group or device is pinned by any inode being
7833
+ * used as a swapfile.
7834
+ */
7835
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7836
+{
7837
+ struct btrfs_swapfile_pin *sp;
7838
+ struct rb_node *node;
7839
+
7840
+ spin_lock(&fs_info->swapfile_pins_lock);
7841
+ node = fs_info->swapfile_pins.rb_node;
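+	/* Plain rb-tree search keyed on the pointer value */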
7842
+ while (node) {
7843
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7844
+ if (ptr < sp->ptr)
7845
+ node = node->rb_left;
7846
+ else if (ptr > sp->ptr)
7847
+ node = node->rb_right;
7848
+ else
7849
+ break;
7850
+ }
7851
+ spin_unlock(&fs_info->swapfile_pins_lock);
7852
+ return node != NULL;
7853
+}