forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/fs/btrfs/volumes.c
....@@ -7,7 +7,6 @@
77 #include <linux/sched/mm.h>
88 #include <linux/bio.h>
99 #include <linux/slab.h>
10
-#include <linux/buffer_head.h>
1110 #include <linux/blkdev.h>
1211 #include <linux/ratelimit.h>
1312 #include <linux/kthread.h>
....@@ -15,6 +14,8 @@
1514 #include <linux/semaphore.h>
1615 #include <linux/uuid.h>
1716 #include <linux/list_sort.h>
17
+#include <linux/namei.h>
18
+#include "misc.h"
1819 #include "ctree.h"
1920 #include "extent_map.h"
2021 #include "disk-io.h"
....@@ -25,10 +26,12 @@
2526 #include "async-thread.h"
2627 #include "check-integrity.h"
2728 #include "rcu-string.h"
28
-#include "math.h"
2929 #include "dev-replace.h"
3030 #include "sysfs.h"
3131 #include "tree-checker.h"
32
+#include "space-info.h"
33
+#include "block-group.h"
34
+#include "discard.h"
3235
3336 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3437 [BTRFS_RAID_RAID10] = {
....@@ -39,6 +42,7 @@
3942 .tolerated_failures = 1,
4043 .devs_increment = 2,
4144 .ncopies = 2,
45
+ .nparity = 0,
4246 .raid_name = "raid10",
4347 .bg_flag = BTRFS_BLOCK_GROUP_RAID10,
4448 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
....@@ -51,9 +55,36 @@
5155 .tolerated_failures = 1,
5256 .devs_increment = 2,
5357 .ncopies = 2,
58
+ .nparity = 0,
5459 .raid_name = "raid1",
5560 .bg_flag = BTRFS_BLOCK_GROUP_RAID1,
5661 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
62
+ },
63
+ [BTRFS_RAID_RAID1C3] = {
64
+ .sub_stripes = 1,
65
+ .dev_stripes = 1,
66
+ .devs_max = 3,
67
+ .devs_min = 3,
68
+ .tolerated_failures = 2,
69
+ .devs_increment = 3,
70
+ .ncopies = 3,
71
+ .nparity = 0,
72
+ .raid_name = "raid1c3",
73
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
74
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
75
+ },
76
+ [BTRFS_RAID_RAID1C4] = {
77
+ .sub_stripes = 1,
78
+ .dev_stripes = 1,
79
+ .devs_max = 4,
80
+ .devs_min = 4,
81
+ .tolerated_failures = 3,
82
+ .devs_increment = 4,
83
+ .ncopies = 4,
84
+ .nparity = 0,
85
+ .raid_name = "raid1c4",
86
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
87
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
5788 },
5889 [BTRFS_RAID_DUP] = {
5990 .sub_stripes = 1,
....@@ -63,6 +94,7 @@
6394 .tolerated_failures = 0,
6495 .devs_increment = 1,
6596 .ncopies = 2,
97
+ .nparity = 0,
6698 .raid_name = "dup",
6799 .bg_flag = BTRFS_BLOCK_GROUP_DUP,
68100 .mindev_error = 0,
....@@ -75,6 +107,7 @@
75107 .tolerated_failures = 0,
76108 .devs_increment = 1,
77109 .ncopies = 1,
110
+ .nparity = 0,
78111 .raid_name = "raid0",
79112 .bg_flag = BTRFS_BLOCK_GROUP_RAID0,
80113 .mindev_error = 0,
....@@ -87,6 +120,7 @@
87120 .tolerated_failures = 0,
88121 .devs_increment = 1,
89122 .ncopies = 1,
123
+ .nparity = 0,
90124 .raid_name = "single",
91125 .bg_flag = 0,
92126 .mindev_error = 0,
....@@ -99,6 +133,7 @@
99133 .tolerated_failures = 1,
100134 .devs_increment = 1,
101135 .ncopies = 1,
136
+ .nparity = 1,
102137 .raid_name = "raid5",
103138 .bg_flag = BTRFS_BLOCK_GROUP_RAID5,
104139 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
....@@ -111,24 +146,79 @@
111146 .tolerated_failures = 2,
112147 .devs_increment = 1,
113148 .ncopies = 1,
149
+ .nparity = 2,
114150 .raid_name = "raid6",
115151 .bg_flag = BTRFS_BLOCK_GROUP_RAID6,
116152 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
117153 },
118154 };
119155
120
-const char *get_raid_name(enum btrfs_raid_types type)
156
+const char *btrfs_bg_type_to_raid_name(u64 flags)
121157 {
122
- if (type >= BTRFS_NR_RAID_TYPES)
158
+ const int index = btrfs_bg_flags_to_raid_index(flags);
159
+
160
+ if (index >= BTRFS_NR_RAID_TYPES)
123161 return NULL;
124162
125
- return btrfs_raid_array[type].raid_name;
163
+ return btrfs_raid_array[index].raid_name;
126164 }
127165
128
-static int init_first_rw_device(struct btrfs_trans_handle *trans,
129
- struct btrfs_fs_info *fs_info);
166
+/*
167
+ * Fill @buf with textual description of @bg_flags, no more than @size_buf
168
+ * bytes including terminating null byte.
169
+ */
170
+void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
171
+{
172
+ int i;
173
+ int ret;
174
+ char *bp = buf;
175
+ u64 flags = bg_flags;
176
+ u32 size_bp = size_buf;
177
+
178
+ if (!flags) {
179
+ strcpy(bp, "NONE");
180
+ return;
181
+ }
182
+
183
+#define DESCRIBE_FLAG(flag, desc) \
184
+ do { \
185
+ if (flags & (flag)) { \
186
+ ret = snprintf(bp, size_bp, "%s|", (desc)); \
187
+ if (ret < 0 || ret >= size_bp) \
188
+ goto out_overflow; \
189
+ size_bp -= ret; \
190
+ bp += ret; \
191
+ flags &= ~(flag); \
192
+ } \
193
+ } while (0)
194
+
195
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
196
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
197
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
198
+
199
+ DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
200
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
201
+ DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
202
+ btrfs_raid_array[i].raid_name);
203
+#undef DESCRIBE_FLAG
204
+
205
+ if (flags) {
206
+ ret = snprintf(bp, size_bp, "0x%llx|", flags);
207
+ size_bp -= ret;
208
+ }
209
+
210
+ if (size_bp < size_buf)
211
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
212
+
213
+ /*
214
+ * The text is trimmed, it's up to the caller to provide sufficiently
215
+ * large buffer
216
+ */
217
+out_overflow:;
218
+}
219
+
220
+static int init_first_rw_device(struct btrfs_trans_handle *trans);
130221 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
131
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
132222 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
133223 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
134224 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
....@@ -153,7 +243,7 @@
153243 * the mutex can be very coarse and can cover long-running operations
154244 *
155245 * protects: updates to fs_devices counters like missing devices, rw devices,
156
- * seeding, structure cloning, openning/closing devices at mount/umount time
246
+ * seeding, structure cloning, opening/closing devices at mount/umount time
157247 *
158248 * global::fs_devs - add, remove, updates to the global list
159249 *
....@@ -183,7 +273,9 @@
183273 * chunk_mutex
184274 * -----------
185275 * protects chunks, adding or removing during allocation, trim or when a new
186
- * device is added/removed
276
+ * device is added/removed. Additionally it also protects post_commit_list of
277
+ * individual devices, since they can be added to the transaction's
278
+ * post_commit_list only with chunk_mutex held.
187279 *
188280 * cleaner_mutex
189281 * -------------
....@@ -195,14 +287,13 @@
195287 * ============
196288 *
197289 * uuid_mutex
198
- * volume_mutex
199
- * device_list_mutex
200
- * chunk_mutex
201
- * balance_mutex
290
+ * device_list_mutex
291
+ * chunk_mutex
292
+ * balance_mutex
202293 *
203294 *
204
- * Exclusive operations, BTRFS_FS_EXCL_OP
205
- * ======================================
295
+ * Exclusive operations
296
+ * ====================
206297 *
207298 * Maintains the exclusivity of the following operations that apply to the
208299 * whole filesystem and cannot run in parallel.
....@@ -228,30 +319,32 @@
228319 * - system power-cycle and filesystem mounted as read-only
229320 * - filesystem or device errors leading to forced read-only
230321 *
231
- * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
232
- * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
322
+ * The status of exclusive operation is set and cleared atomically.
323
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
233324 * A device operation in Paused or Running state can be canceled or resumed
234325 * either by ioctl (Balance only) or when remounted as read-write.
235
- * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
326
+ * The exclusive status is cleared when the device operation is canceled or
236327 * completed.
237328 */
238329
239330 DEFINE_MUTEX(uuid_mutex);
240331 static LIST_HEAD(fs_uuids);
241
-struct list_head *btrfs_get_fs_uuids(void)
332
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
242333 {
243334 return &fs_uuids;
244335 }
245336
246337 /*
247338 * alloc_fs_devices - allocate struct btrfs_fs_devices
248
- * @fsid: if not NULL, copy the uuid to fs_devices::fsid
339
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid
340
+ * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
249341 *
250342 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
251343 * The returned struct is not linked onto any lists and can be destroyed with
252344 * kfree() right away.
253345 */
254
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
346
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
347
+ const u8 *metadata_fsid)
255348 {
256349 struct btrfs_fs_devices *fs_devs;
257350
....@@ -262,18 +355,25 @@
262355 mutex_init(&fs_devs->device_list_mutex);
263356
264357 INIT_LIST_HEAD(&fs_devs->devices);
265
- INIT_LIST_HEAD(&fs_devs->resized_devices);
266358 INIT_LIST_HEAD(&fs_devs->alloc_list);
267359 INIT_LIST_HEAD(&fs_devs->fs_list);
360
+ INIT_LIST_HEAD(&fs_devs->seed_list);
268361 if (fsid)
269362 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
363
+
364
+ if (metadata_fsid)
365
+ memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
366
+ else if (fsid)
367
+ memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
270368
271369 return fs_devs;
272370 }
273371
274372 void btrfs_free_device(struct btrfs_device *device)
275373 {
374
+ WARN_ON(!list_empty(&device->post_commit_list));
276375 rcu_string_free(device->name);
376
+ extent_io_tree_release(&device->alloc_state);
277377 bio_put(device->flush_bio);
278378 kfree(device);
279379 }
....@@ -289,19 +389,6 @@
289389 btrfs_free_device(device);
290390 }
291391 kfree(fs_devices);
292
-}
293
-
294
-static void btrfs_kobject_uevent(struct block_device *bdev,
295
- enum kobject_action action)
296
-{
297
- int ret;
298
-
299
- ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
300
- if (ret)
301
- pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
302
- action,
303
- kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
304
- &disk_to_dev(bdev->bd_disk)->kobj);
305392 }
306393
307394 void __exit btrfs_cleanup_fs_uuids(void)
....@@ -321,7 +408,7 @@
321408 * Returned struct is not linked onto any lists and must be destroyed using
322409 * btrfs_free_device.
323410 */
324
-static struct btrfs_device *__alloc_device(void)
411
+static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
325412 {
326413 struct btrfs_device *dev;
327414
....@@ -341,34 +428,86 @@
341428
342429 INIT_LIST_HEAD(&dev->dev_list);
343430 INIT_LIST_HEAD(&dev->dev_alloc_list);
344
- INIT_LIST_HEAD(&dev->resized_list);
345
-
346
- spin_lock_init(&dev->io_lock);
431
+ INIT_LIST_HEAD(&dev->post_commit_list);
347432
348433 atomic_set(&dev->reada_in_flight, 0);
349434 atomic_set(&dev->dev_stats_ccnt, 0);
350435 btrfs_device_data_ordered_init(dev);
351436 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
352437 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
438
+ extent_io_tree_init(fs_info, &dev->alloc_state,
439
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
353440
354441 return dev;
355442 }
356443
357
-static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
444
+static noinline struct btrfs_fs_devices *find_fsid(
445
+ const u8 *fsid, const u8 *metadata_fsid)
358446 {
359447 struct btrfs_fs_devices *fs_devices;
360448
449
+ ASSERT(fsid);
450
+
451
+ /* Handle non-split brain cases */
361452 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
362
- if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
363
- return fs_devices;
453
+ if (metadata_fsid) {
454
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
455
+ && memcmp(metadata_fsid, fs_devices->metadata_uuid,
456
+ BTRFS_FSID_SIZE) == 0)
457
+ return fs_devices;
458
+ } else {
459
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
460
+ return fs_devices;
461
+ }
364462 }
365463 return NULL;
366464 }
367465
466
+static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
467
+ struct btrfs_super_block *disk_super)
468
+{
469
+
470
+ struct btrfs_fs_devices *fs_devices;
471
+
472
+ /*
473
+ * Handle scanned device having completed its fsid change but
474
+ * belonging to a fs_devices that was created by first scanning
475
+ * a device which didn't have its fsid/metadata_uuid changed
476
+ * at all and the CHANGING_FSID_V2 flag set.
477
+ */
478
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
479
+ if (fs_devices->fsid_change &&
480
+ memcmp(disk_super->metadata_uuid, fs_devices->fsid,
481
+ BTRFS_FSID_SIZE) == 0 &&
482
+ memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
483
+ BTRFS_FSID_SIZE) == 0) {
484
+ return fs_devices;
485
+ }
486
+ }
487
+ /*
488
+ * Handle scanned device having completed its fsid change but
489
+ * belonging to a fs_devices that was created by a device that
490
+ * has an outdated pair of fsid/metadata_uuid and
491
+ * CHANGING_FSID_V2 flag set.
492
+ */
493
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
494
+ if (fs_devices->fsid_change &&
495
+ memcmp(fs_devices->metadata_uuid,
496
+ fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
497
+ memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
498
+ BTRFS_FSID_SIZE) == 0) {
499
+ return fs_devices;
500
+ }
501
+ }
502
+
503
+ return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
504
+}
505
+
506
+
368507 static int
369508 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
370509 int flush, struct block_device **bdev,
371
- struct buffer_head **bh)
510
+ struct btrfs_super_block **disk_super)
372511 {
373512 int ret;
374513
....@@ -387,9 +526,9 @@
387526 goto error;
388527 }
389528 invalidate_bdev(*bdev);
390
- *bh = btrfs_read_dev_super(*bdev);
391
- if (IS_ERR(*bh)) {
392
- ret = PTR_ERR(*bh);
529
+ *disk_super = btrfs_read_dev_super(*bdev);
530
+ if (IS_ERR(*disk_super)) {
531
+ ret = PTR_ERR(*disk_super);
393532 blkdev_put(*bdev, flags);
394533 goto error;
395534 }
....@@ -398,214 +537,50 @@
398537
399538 error:
400539 *bdev = NULL;
401
- *bh = NULL;
402540 return ret;
403541 }
404542
405
-static void requeue_list(struct btrfs_pending_bios *pending_bios,
406
- struct bio *head, struct bio *tail)
407
-{
408
-
409
- struct bio *old_head;
410
-
411
- old_head = pending_bios->head;
412
- pending_bios->head = head;
413
- if (pending_bios->tail)
414
- tail->bi_next = old_head;
415
- else
416
- pending_bios->tail = tail;
417
-}
418
-
419543 /*
420
- * we try to collect pending bios for a device so we don't get a large
421
- * number of procs sending bios down to the same device. This greatly
422
- * improves the schedulers ability to collect and merge the bios.
544
+ * Check if the device in the path matches the device in the given struct device.
423545 *
424
- * But, it also turns into a long list of bios to process and that is sure
425
- * to eventually make the worker thread block. The solution here is to
426
- * make some progress and then put this work struct back at the end of
427
- * the list if the block device is congested. This way, multiple devices
428
- * can make progress from a single worker thread.
546
+ * Returns:
547
+ * true If it is the same device.
548
+ * false If it is not the same device or on error.
429549 */
430
-static noinline void run_scheduled_bios(struct btrfs_device *device)
550
+static bool device_matched(const struct btrfs_device *device, const char *path)
431551 {
432
- struct btrfs_fs_info *fs_info = device->fs_info;
433
- struct bio *pending;
434
- struct backing_dev_info *bdi;
435
- struct btrfs_pending_bios *pending_bios;
436
- struct bio *tail;
437
- struct bio *cur;
438
- int again = 0;
439
- unsigned long num_run;
440
- unsigned long batch_run = 0;
441
- unsigned long last_waited = 0;
442
- int force_reg = 0;
443
- int sync_pending = 0;
444
- struct blk_plug plug;
552
+ char *device_name;
553
+ struct block_device *bdev_old;
554
+ struct block_device *bdev_new;
445555
446556 /*
447
- * this function runs all the bios we've collected for
448
- * a particular device. We don't want to wander off to
449
- * another device without first sending all of these down.
450
- * So, setup a plug here and finish it off before we return
557
+ * If we are looking for a device with the matching dev_t, then skip
558
+ * device without a name (a missing device).
451559 */
452
- blk_start_plug(&plug);
560
+ if (!device->name)
561
+ return false;
453562
454
- bdi = device->bdev->bd_bdi;
563
+ device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
564
+ if (!device_name)
565
+ return false;
455566
456
-loop:
457
- spin_lock(&device->io_lock);
567
+ rcu_read_lock();
568
+ scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
569
+ rcu_read_unlock();
458570
459
-loop_lock:
460
- num_run = 0;
571
+ bdev_old = lookup_bdev(device_name);
572
+ kfree(device_name);
573
+ if (IS_ERR(bdev_old))
574
+ return false;
461575
462
- /* take all the bios off the list at once and process them
463
- * later on (without the lock held). But, remember the
464
- * tail and other pointers so the bios can be properly reinserted
465
- * into the list if we hit congestion
466
- */
467
- if (!force_reg && device->pending_sync_bios.head) {
468
- pending_bios = &device->pending_sync_bios;
469
- force_reg = 1;
470
- } else {
471
- pending_bios = &device->pending_bios;
472
- force_reg = 0;
473
- }
576
+ bdev_new = lookup_bdev(path);
577
+ if (IS_ERR(bdev_new))
578
+ return false;
474579
475
- pending = pending_bios->head;
476
- tail = pending_bios->tail;
477
- WARN_ON(pending && !tail);
580
+ if (bdev_old == bdev_new)
581
+ return true;
478582
479
- /*
480
- * if pending was null this time around, no bios need processing
481
- * at all and we can stop. Otherwise it'll loop back up again
482
- * and do an additional check so no bios are missed.
483
- *
484
- * device->running_pending is used to synchronize with the
485
- * schedule_bio code.
486
- */
487
- if (device->pending_sync_bios.head == NULL &&
488
- device->pending_bios.head == NULL) {
489
- again = 0;
490
- device->running_pending = 0;
491
- } else {
492
- again = 1;
493
- device->running_pending = 1;
494
- }
495
-
496
- pending_bios->head = NULL;
497
- pending_bios->tail = NULL;
498
-
499
- spin_unlock(&device->io_lock);
500
-
501
- while (pending) {
502
-
503
- rmb();
504
- /* we want to work on both lists, but do more bios on the
505
- * sync list than the regular list
506
- */
507
- if ((num_run > 32 &&
508
- pending_bios != &device->pending_sync_bios &&
509
- device->pending_sync_bios.head) ||
510
- (num_run > 64 && pending_bios == &device->pending_sync_bios &&
511
- device->pending_bios.head)) {
512
- spin_lock(&device->io_lock);
513
- requeue_list(pending_bios, pending, tail);
514
- goto loop_lock;
515
- }
516
-
517
- cur = pending;
518
- pending = pending->bi_next;
519
- cur->bi_next = NULL;
520
-
521
- BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
522
-
523
- /*
524
- * if we're doing the sync list, record that our
525
- * plug has some sync requests on it
526
- *
527
- * If we're doing the regular list and there are
528
- * sync requests sitting around, unplug before
529
- * we add more
530
- */
531
- if (pending_bios == &device->pending_sync_bios) {
532
- sync_pending = 1;
533
- } else if (sync_pending) {
534
- blk_finish_plug(&plug);
535
- blk_start_plug(&plug);
536
- sync_pending = 0;
537
- }
538
-
539
- btrfsic_submit_bio(cur);
540
- num_run++;
541
- batch_run++;
542
-
543
- cond_resched();
544
-
545
- /*
546
- * we made progress, there is more work to do and the bdi
547
- * is now congested. Back off and let other work structs
548
- * run instead
549
- */
550
- if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
551
- fs_info->fs_devices->open_devices > 1) {
552
- struct io_context *ioc;
553
-
554
- ioc = current->io_context;
555
-
556
- /*
557
- * the main goal here is that we don't want to
558
- * block if we're going to be able to submit
559
- * more requests without blocking.
560
- *
561
- * This code does two great things, it pokes into
562
- * the elevator code from a filesystem _and_
563
- * it makes assumptions about how batching works.
564
- */
565
- if (ioc && ioc->nr_batch_requests > 0 &&
566
- time_before(jiffies, ioc->last_waited + HZ/50UL) &&
567
- (last_waited == 0 ||
568
- ioc->last_waited == last_waited)) {
569
- /*
570
- * we want to go through our batch of
571
- * requests and stop. So, we copy out
572
- * the ioc->last_waited time and test
573
- * against it before looping
574
- */
575
- last_waited = ioc->last_waited;
576
- cond_resched();
577
- continue;
578
- }
579
- spin_lock(&device->io_lock);
580
- requeue_list(pending_bios, pending, tail);
581
- device->running_pending = 1;
582
-
583
- spin_unlock(&device->io_lock);
584
- btrfs_queue_work(fs_info->submit_workers,
585
- &device->work);
586
- goto done;
587
- }
588
- }
589
-
590
- cond_resched();
591
- if (again)
592
- goto loop;
593
-
594
- spin_lock(&device->io_lock);
595
- if (device->pending_bios.head || device->pending_sync_bios.head)
596
- goto loop_lock;
597
- spin_unlock(&device->io_lock);
598
-
599
-done:
600
- blk_finish_plug(&plug);
601
-}
602
-
603
-static void pending_bios_fn(struct btrfs_work *work)
604
-{
605
- struct btrfs_device *device;
606
-
607
- device = container_of(work, struct btrfs_device, work);
608
- run_scheduled_bios(device);
583
+ return false;
609584 }
610585
611586 /*
....@@ -615,52 +590,55 @@
615590 * matching this path only.
616591 * skip_dev: Optional. Will skip this device when searching for the stale
617592 * devices.
593
+ * Return: 0 for success or if @path is NULL.
594
+ * -EBUSY if @path is a mounted device.
595
+ * -ENOENT if @path does not match any device in the list.
618596 */
619
-static void btrfs_free_stale_devices(const char *path,
597
+static int btrfs_free_stale_devices(const char *path,
620598 struct btrfs_device *skip_device)
621599 {
622600 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
623601 struct btrfs_device *device, *tmp_device;
602
+ int ret = 0;
603
+
604
+ lockdep_assert_held(&uuid_mutex);
605
+
606
+ if (path)
607
+ ret = -ENOENT;
624608
625609 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
626
- mutex_lock(&fs_devices->device_list_mutex);
627
- if (fs_devices->opened) {
628
- mutex_unlock(&fs_devices->device_list_mutex);
629
- continue;
630
- }
631610
611
+ mutex_lock(&fs_devices->device_list_mutex);
632612 list_for_each_entry_safe(device, tmp_device,
633613 &fs_devices->devices, dev_list) {
634
- int not_found = 0;
635
-
636614 if (skip_device && skip_device == device)
637615 continue;
638
- if (path && !device->name)
616
+ if (path && !device_matched(device, path))
639617 continue;
640
-
641
- rcu_read_lock();
642
- if (path)
643
- not_found = strcmp(rcu_str_deref(device->name),
644
- path);
645
- rcu_read_unlock();
646
- if (not_found)
647
- continue;
618
+ if (fs_devices->opened) {
619
+ /* for an already deleted device return 0 */
620
+ if (path && ret != 0)
621
+ ret = -EBUSY;
622
+ break;
623
+ }
648624
649625 /* delete the stale device */
650626 fs_devices->num_devices--;
651627 list_del(&device->dev_list);
652628 btrfs_free_device(device);
653629
654
- if (fs_devices->num_devices == 0)
655
- break;
630
+ ret = 0;
656631 }
657632 mutex_unlock(&fs_devices->device_list_mutex);
633
+
658634 if (fs_devices->num_devices == 0) {
659635 btrfs_sysfs_remove_fsid(fs_devices);
660636 list_del(&fs_devices->fs_list);
661637 free_fs_devices(fs_devices);
662638 }
663639 }
640
+
641
+ return ret;
664642 }
665643
666644 /*
....@@ -674,7 +652,6 @@
674652 {
675653 struct request_queue *q;
676654 struct block_device *bdev;
677
- struct buffer_head *bh;
678655 struct btrfs_super_block *disk_super;
679656 u64 devid;
680657 int ret;
....@@ -685,23 +662,29 @@
685662 return -EINVAL;
686663
687664 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
688
- &bdev, &bh);
665
+ &bdev, &disk_super);
689666 if (ret)
690667 return ret;
691668
692
- disk_super = (struct btrfs_super_block *)bh->b_data;
693669 devid = btrfs_stack_device_id(&disk_super->dev_item);
694670 if (devid != device->devid)
695
- goto error_brelse;
671
+ goto error_free_page;
696672
697673 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
698
- goto error_brelse;
674
+ goto error_free_page;
699675
700676 device->generation = btrfs_super_generation(disk_super);
701677
702678 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
679
+ if (btrfs_super_incompat_flags(disk_super) &
680
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
681
+ pr_err(
682
+ "BTRFS: Invalid seeding and uuid-changed device detected\n");
683
+ goto error_free_page;
684
+ }
685
+
703686 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
704
- fs_devices->seeding = 1;
687
+ fs_devices->seeding = true;
705688 } else {
706689 if (bdev_read_only(bdev))
707690 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
....@@ -711,7 +694,7 @@
711694
712695 q = bdev_get_queue(bdev);
713696 if (!blk_queue_nonrot(q))
714
- fs_devices->rotating = 1;
697
+ fs_devices->rotating = true;
715698
716699 device->bdev = bdev;
717700 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
....@@ -723,17 +706,101 @@
723706 fs_devices->rw_devices++;
724707 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
725708 }
726
- brelse(bh);
709
+ btrfs_release_disk_super(disk_super);
727710
728711 return 0;
729712
730
-error_brelse:
731
- brelse(bh);
713
+error_free_page:
714
+ btrfs_release_disk_super(disk_super);
732715 blkdev_put(bdev, flags);
733716
734717 return -EINVAL;
735718 }
736719
720
+/*
721
+ * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
722
+ * being created with a disk that has already completed its fsid change. Such
723
+ * disk can belong to an fs which has its FSID changed or to one which doesn't.
724
+ * Handle both cases here.
725
+ */
726
+static struct btrfs_fs_devices *find_fsid_inprogress(
727
+ struct btrfs_super_block *disk_super)
728
+{
729
+ struct btrfs_fs_devices *fs_devices;
730
+
731
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
732
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
733
+ BTRFS_FSID_SIZE) != 0 &&
734
+ memcmp(fs_devices->metadata_uuid, disk_super->fsid,
735
+ BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
736
+ return fs_devices;
737
+ }
738
+ }
739
+
740
+ return find_fsid(disk_super->fsid, NULL);
741
+}
742
+
743
+
744
+static struct btrfs_fs_devices *find_fsid_changed(
745
+ struct btrfs_super_block *disk_super)
746
+{
747
+ struct btrfs_fs_devices *fs_devices;
748
+
749
+ /*
750
+ * Handles the case where scanned device is part of an fs that had
751
+ * multiple successful changes of FSID but curently device didn't
752
+ * observe it. Meaning our fsid will be different than theirs. We need
753
+ * to handle two subcases :
754
+ * 1 - The fs still continues to have different METADATA/FSID uuids.
755
+ * 2 - The fs is switched back to its original FSID (METADATA/FSID
756
+ * are equal).
757
+ */
758
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
759
+ /* Changed UUIDs */
760
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
761
+ BTRFS_FSID_SIZE) != 0 &&
762
+ memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
763
+ BTRFS_FSID_SIZE) == 0 &&
764
+ memcmp(fs_devices->fsid, disk_super->fsid,
765
+ BTRFS_FSID_SIZE) != 0)
766
+ return fs_devices;
767
+
768
+ /* Unchanged UUIDs */
769
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
770
+ BTRFS_FSID_SIZE) == 0 &&
771
+ memcmp(fs_devices->fsid, disk_super->metadata_uuid,
772
+ BTRFS_FSID_SIZE) == 0)
773
+ return fs_devices;
774
+ }
775
+
776
+ return NULL;
777
+}
778
+
779
+static struct btrfs_fs_devices *find_fsid_reverted_metadata(
780
+ struct btrfs_super_block *disk_super)
781
+{
782
+ struct btrfs_fs_devices *fs_devices;
783
+
784
+ /*
785
+ * Handle the case where the scanned device is part of an fs whose last
786
+ * metadata UUID change reverted it to the original FSID. At the same
787
+ * time * fs_devices was first created by another constitutent device
788
+ * which didn't fully observe the operation. This results in an
789
+ * btrfs_fs_devices created with metadata/fsid different AND
790
+ * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
791
+ * fs_devices equal to the FSID of the disk.
792
+ */
793
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
794
+ if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
795
+ BTRFS_FSID_SIZE) != 0 &&
796
+ memcmp(fs_devices->metadata_uuid, disk_super->fsid,
797
+ BTRFS_FSID_SIZE) == 0 &&
798
+ fs_devices->fsid_change)
799
+ return fs_devices;
800
+ }
801
+
802
+ return NULL;
803
+}
737804 /*
738805 * Add new device to list of registered devices
739806 *
....@@ -746,16 +813,40 @@
746813 bool *new_device_added)
747814 {
748815 struct btrfs_device *device;
749
- struct btrfs_fs_devices *fs_devices;
816
+ struct btrfs_fs_devices *fs_devices = NULL;
750817 struct rcu_string *name;
751818 u64 found_transid = btrfs_super_generation(disk_super);
752819 u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
820
+ bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
821
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
822
+ bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
823
+ BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
753824
754
- fs_devices = find_fsid(disk_super->fsid);
825
+ if (fsid_change_in_progress) {
826
+ if (!has_metadata_uuid)
827
+ fs_devices = find_fsid_inprogress(disk_super);
828
+ else
829
+ fs_devices = find_fsid_changed(disk_super);
830
+ } else if (has_metadata_uuid) {
831
+ fs_devices = find_fsid_with_metadata_uuid(disk_super);
832
+ } else {
833
+ fs_devices = find_fsid_reverted_metadata(disk_super);
834
+ if (!fs_devices)
835
+ fs_devices = find_fsid(disk_super->fsid, NULL);
836
+ }
837
+
838
+
755839 if (!fs_devices) {
756
- fs_devices = alloc_fs_devices(disk_super->fsid);
840
+ if (has_metadata_uuid)
841
+ fs_devices = alloc_fs_devices(disk_super->fsid,
842
+ disk_super->metadata_uuid);
843
+ else
844
+ fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
845
+
757846 if (IS_ERR(fs_devices))
758847 return ERR_CAST(fs_devices);
848
+
849
+ fs_devices->fsid_change = fsid_change_in_progress;
759850
760851 mutex_lock(&fs_devices->device_list_mutex);
761852 list_add(&fs_devices->fs_list, &fs_uuids);
....@@ -765,6 +856,27 @@
765856 mutex_lock(&fs_devices->device_list_mutex);
766857 device = btrfs_find_device(fs_devices, devid,
767858 disk_super->dev_item.uuid, NULL, false);
859
+
860
+ /*
861
+ * If this disk has been pulled into an fs devices created by
862
+ * a device which had the CHANGING_FSID_V2 flag then replace the
863
+ * metadata_uuid/fsid values of the fs_devices.
864
+ */
865
+ if (fs_devices->fsid_change &&
866
+ found_transid > fs_devices->latest_generation) {
867
+ memcpy(fs_devices->fsid, disk_super->fsid,
868
+ BTRFS_FSID_SIZE);
869
+
870
+ if (has_metadata_uuid)
871
+ memcpy(fs_devices->metadata_uuid,
872
+ disk_super->metadata_uuid,
873
+ BTRFS_FSID_SIZE);
874
+ else
875
+ memcpy(fs_devices->metadata_uuid,
876
+ disk_super->fsid, BTRFS_FSID_SIZE);
877
+
878
+ fs_devices->fsid_change = false;
879
+ }
768880 }
769881
770882 if (!device) {
....@@ -796,11 +908,15 @@
796908 *new_device_added = true;
797909
798910 if (disk_super->label[0])
799
- pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
800
- disk_super->label, devid, found_transid, path);
911
+ pr_info(
912
+ "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
913
+ disk_super->label, devid, found_transid, path,
914
+ current->comm, task_pid_nr(current));
801915 else
802
- pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
803
- disk_super->fsid, devid, found_transid, path);
916
+ pr_info(
917
+ "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
918
+ disk_super->fsid, devid, found_transid, path,
919
+ current->comm, task_pid_nr(current));
804920
805921 } else if (!device->name || strcmp(device->name->str, path)) {
806922 /*
....@@ -897,8 +1013,11 @@
8971013 * it back. We need it to pick the disk with largest generation
8981014 * (as above).
8991015 */
900
- if (!fs_devices->opened)
1016
+ if (!fs_devices->opened) {
9011017 device->generation = found_transid;
1018
+ fs_devices->latest_generation = max_t(u64, found_transid,
1019
+ fs_devices->latest_generation);
1020
+ }
9021021
9031022 fs_devices->total_devices = btrfs_super_num_devices(disk_super);
9041023
....@@ -911,22 +1030,25 @@
9111030 struct btrfs_fs_devices *fs_devices;
9121031 struct btrfs_device *device;
9131032 struct btrfs_device *orig_dev;
1033
+ int ret = 0;
9141034
915
- fs_devices = alloc_fs_devices(orig->fsid);
1035
+ lockdep_assert_held(&uuid_mutex);
1036
+
1037
+ fs_devices = alloc_fs_devices(orig->fsid, NULL);
9161038 if (IS_ERR(fs_devices))
9171039 return fs_devices;
9181040
919
- mutex_lock(&orig->device_list_mutex);
9201041 fs_devices->total_devices = orig->total_devices;
9211042
922
- /* We have held the volume lock, it is safe to get the devices. */
9231043 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
9241044 struct rcu_string *name;
9251045
9261046 device = btrfs_alloc_device(NULL, &orig_dev->devid,
9271047 orig_dev->uuid);
928
- if (IS_ERR(device))
1048
+ if (IS_ERR(device)) {
1049
+ ret = PTR_ERR(device);
9291050 goto error;
1051
+ }
9301052
9311053 /*
9321054 * This is ok to do without rcu read locked because we hold the
....@@ -937,6 +1059,7 @@
9371059 GFP_KERNEL);
9381060 if (!name) {
9391061 btrfs_free_device(device);
1062
+ ret = -ENOMEM;
9401063 goto error;
9411064 }
9421065 rcu_assign_pointer(device->name, name);
....@@ -946,36 +1069,27 @@
9461069 device->fs_devices = fs_devices;
9471070 fs_devices->num_devices++;
9481071 }
949
- mutex_unlock(&orig->device_list_mutex);
9501072 return fs_devices;
9511073 error:
952
- mutex_unlock(&orig->device_list_mutex);
9531074 free_fs_devices(fs_devices);
954
- return ERR_PTR(-ENOMEM);
1075
+ return ERR_PTR(ret);
9551076 }
9561077
957
-/*
958
- * After we have read the system tree and know devids belonging to
959
- * this filesystem, remove the device which does not belong there.
960
- */
961
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
1078
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1079
+ int step, struct btrfs_device **latest_dev)
9621080 {
9631081 struct btrfs_device *device, *next;
964
- struct btrfs_device *latest_dev = NULL;
9651082
966
- mutex_lock(&uuid_mutex);
967
-again:
9681083 /* This is the initialized path, it is safe to release the devices. */
9691084 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
970
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
971
- &device->dev_state)) {
1085
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
9721086 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
973
- &device->dev_state) &&
1087
+ &device->dev_state) &&
9741088 !test_bit(BTRFS_DEV_STATE_MISSING,
9751089 &device->dev_state) &&
976
- (!latest_dev ||
977
- device->generation > latest_dev->generation)) {
978
- latest_dev = device;
1090
+ (!*latest_dev ||
1091
+ device->generation > (*latest_dev)->generation)) {
1092
+ *latest_dev = device;
9791093 }
9801094 continue;
9811095 }
....@@ -1002,22 +1116,26 @@
10021116 btrfs_free_device(device);
10031117 }
10041118
1005
- if (fs_devices->seed) {
1006
- fs_devices = fs_devices->seed;
1007
- goto again;
1008
- }
1119
+}
1120
+
1121
+/*
1122
+ * After we have read the system tree and know devids belonging to this
1123
+ * filesystem, remove the device which does not belong there.
1124
+ */
1125
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
1126
+{
1127
+ struct btrfs_device *latest_dev = NULL;
1128
+ struct btrfs_fs_devices *seed_dev;
1129
+
1130
+ mutex_lock(&uuid_mutex);
1131
+ __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
1132
+
1133
+ list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1134
+ __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
10091135
10101136 fs_devices->latest_bdev = latest_dev->bdev;
10111137
10121138 mutex_unlock(&uuid_mutex);
1013
-}
1014
-
1015
-static void free_device_rcu(struct rcu_head *head)
1016
-{
1017
- struct btrfs_device *device;
1018
-
1019
- device = container_of(head, struct btrfs_device, rcu);
1020
- btrfs_free_device(device);
10211139 }
10221140
10231141 static void btrfs_close_bdev(struct btrfs_device *device)
....@@ -1036,11 +1154,6 @@
10361154 static void btrfs_close_one_device(struct btrfs_device *device)
10371155 {
10381156 struct btrfs_fs_devices *fs_devices = device->fs_devices;
1039
- struct btrfs_device *new_device;
1040
- struct rcu_string *name;
1041
-
1042
- if (device->bdev)
1043
- fs_devices->open_devices--;
10441157
10451158 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
10461159 device->devid != BTRFS_DEV_REPLACE_DEVID) {
....@@ -1057,65 +1170,72 @@
10571170 }
10581171
10591172 btrfs_close_bdev(device);
1060
-
1061
- new_device = btrfs_alloc_device(NULL, &device->devid,
1062
- device->uuid);
1063
- BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
1064
-
1065
- /* Safe because we are under uuid_mutex */
1066
- if (device->name) {
1067
- name = rcu_string_strdup(device->name->str, GFP_NOFS);
1068
- BUG_ON(!name); /* -ENOMEM */
1069
- rcu_assign_pointer(new_device->name, name);
1173
+ if (device->bdev) {
1174
+ fs_devices->open_devices--;
1175
+ device->bdev = NULL;
10701176 }
1177
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
10711178
1072
- list_replace_rcu(&device->dev_list, &new_device->dev_list);
1073
- new_device->fs_devices = device->fs_devices;
1179
+ device->fs_info = NULL;
1180
+ atomic_set(&device->dev_stats_ccnt, 0);
1181
+ extent_io_tree_release(&device->alloc_state);
10741182
1075
- call_rcu(&device->rcu, free_device_rcu);
1183
+ /*
1184
+ * Reset the flush error record. We might have a transient flush error
1185
+ * in this mount, and if so we aborted the current transaction and set
1186
+ * the fs to an error state, guaranteeing no super blocks can be further
1187
+ * committed. However that error might be transient and if we unmount the
1188
+ * filesystem and mount it again, we should allow the mount to succeed
1189
+ * (btrfs_check_rw_degradable() should not fail) - if after mounting the
1190
+ * filesystem again we still get flush errors, then we will again abort
1191
+ * any transaction and set the error state, guaranteeing no commits of
1192
+ * unsafe super blocks.
1193
+ */
1194
+ device->last_flush_error = 0;
1195
+
1196
+ /* Verify the device is back in a pristine state */
1197
+ ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1198
+ ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1199
+ ASSERT(list_empty(&device->dev_alloc_list));
1200
+ ASSERT(list_empty(&device->post_commit_list));
1201
+ ASSERT(atomic_read(&device->reada_in_flight) == 0);
10761202 }
10771203
1078
-static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
1204
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
10791205 {
10801206 struct btrfs_device *device, *tmp;
10811207
1082
- if (--fs_devices->opened > 0)
1083
- return 0;
1208
+ lockdep_assert_held(&uuid_mutex);
10841209
1085
- mutex_lock(&fs_devices->device_list_mutex);
1086
- list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
1210
+ if (--fs_devices->opened > 0)
1211
+ return;
1212
+
1213
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
10871214 btrfs_close_one_device(device);
1088
- }
1089
- mutex_unlock(&fs_devices->device_list_mutex);
10901215
10911216 WARN_ON(fs_devices->open_devices);
10921217 WARN_ON(fs_devices->rw_devices);
10931218 fs_devices->opened = 0;
1094
- fs_devices->seeding = 0;
1095
-
1096
- return 0;
1219
+ fs_devices->seeding = false;
1220
+ fs_devices->fs_info = NULL;
10971221 }
10981222
1099
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1223
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
11001224 {
1101
- struct btrfs_fs_devices *seed_devices = NULL;
1102
- int ret;
1225
+ LIST_HEAD(list);
1226
+ struct btrfs_fs_devices *tmp;
11031227
11041228 mutex_lock(&uuid_mutex);
1105
- ret = close_fs_devices(fs_devices);
1106
- if (!fs_devices->opened) {
1107
- seed_devices = fs_devices->seed;
1108
- fs_devices->seed = NULL;
1109
- }
1110
- mutex_unlock(&uuid_mutex);
1229
+ close_fs_devices(fs_devices);
1230
+ if (!fs_devices->opened)
1231
+ list_splice_init(&fs_devices->seed_list, &list);
11111232
1112
- while (seed_devices) {
1113
- fs_devices = seed_devices;
1114
- seed_devices = fs_devices->seed;
1233
+ list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
11151234 close_fs_devices(fs_devices);
1235
+ list_del(&fs_devices->seed_list);
11161236 free_fs_devices(fs_devices);
11171237 }
1118
- return ret;
1238
+ mutex_unlock(&uuid_mutex);
11191239 }
11201240
11211241 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
....@@ -1123,28 +1243,33 @@
11231243 {
11241244 struct btrfs_device *device;
11251245 struct btrfs_device *latest_dev = NULL;
1126
- int ret = 0;
1246
+ struct btrfs_device *tmp_device;
11271247
11281248 flags |= FMODE_EXCL;
11291249
1130
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
1131
- /* Just open everything we can; ignore failures here */
1132
- if (btrfs_open_one_device(fs_devices, device, flags, holder))
1133
- continue;
1250
+ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1251
+ dev_list) {
1252
+ int ret;
11341253
1135
- if (!latest_dev ||
1136
- device->generation > latest_dev->generation)
1254
+ ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1255
+ if (ret == 0 &&
1256
+ (!latest_dev || device->generation > latest_dev->generation)) {
11371257 latest_dev = device;
1258
+ } else if (ret == -ENODATA) {
1259
+ fs_devices->num_devices--;
1260
+ list_del(&device->dev_list);
1261
+ btrfs_free_device(device);
1262
+ }
11381263 }
1139
- if (fs_devices->open_devices == 0) {
1140
- ret = -EINVAL;
1141
- goto out;
1142
- }
1264
+ if (fs_devices->open_devices == 0)
1265
+ return -EINVAL;
1266
+
11431267 fs_devices->opened = 1;
11441268 fs_devices->latest_bdev = latest_dev->bdev;
11451269 fs_devices->total_rw_bytes = 0;
1146
-out:
1147
- return ret;
1270
+ fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1271
+
1272
+ return 0;
11481273 }
11491274
11501275 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
....@@ -1186,55 +1311,66 @@
11861311 return ret;
11871312 }
11881313
1189
-static void btrfs_release_disk_super(struct page *page)
1314
+void btrfs_release_disk_super(struct btrfs_super_block *super)
11901315 {
1191
- kunmap(page);
1316
+ struct page *page = virt_to_page(super);
1317
+
11921318 put_page(page);
11931319 }
11941320
1195
-static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
1196
- struct page **page,
1197
- struct btrfs_super_block **disk_super)
1321
+static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1322
+ u64 bytenr)
11981323 {
1324
+ struct btrfs_super_block *disk_super;
1325
+ struct page *page;
11991326 void *p;
12001327 pgoff_t index;
12011328
12021329 /* make sure our super fits in the device */
12031330 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1204
- return 1;
1331
+ return ERR_PTR(-EINVAL);
12051332
12061333 /* make sure our super fits in the page */
1207
- if (sizeof(**disk_super) > PAGE_SIZE)
1208
- return 1;
1334
+ if (sizeof(*disk_super) > PAGE_SIZE)
1335
+ return ERR_PTR(-EINVAL);
12091336
12101337 /* make sure our super doesn't straddle pages on disk */
12111338 index = bytenr >> PAGE_SHIFT;
1212
- if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
1213
- return 1;
1339
+ if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1340
+ return ERR_PTR(-EINVAL);
12141341
12151342 /* pull in the page with our super */
1216
- *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
1217
- index, GFP_KERNEL);
1343
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
12181344
1219
- if (IS_ERR_OR_NULL(*page))
1220
- return 1;
1345
+ if (IS_ERR(page))
1346
+ return ERR_CAST(page);
12211347
1222
- p = kmap(*page);
1348
+ p = page_address(page);
12231349
12241350 /* align our pointer to the offset of the super block */
1225
- *disk_super = p + (bytenr & ~PAGE_MASK);
1351
+ disk_super = p + offset_in_page(bytenr);
12261352
1227
- if (btrfs_super_bytenr(*disk_super) != bytenr ||
1228
- btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
1229
- btrfs_release_disk_super(*page);
1230
- return 1;
1353
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
1354
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1355
+ btrfs_release_disk_super(p);
1356
+ return ERR_PTR(-EINVAL);
12311357 }
12321358
1233
- if ((*disk_super)->label[0] &&
1234
- (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
1235
- (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
1359
+ if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1360
+ disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
12361361
1237
- return 0;
1362
+ return disk_super;
1363
+}
1364
+
1365
+int btrfs_forget_devices(const char *path)
1366
+{
1367
+ int ret;
1368
+
1369
+ mutex_lock(&uuid_mutex);
1370
+ ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1371
+ mutex_unlock(&uuid_mutex);
1372
+
1373
+ return ret;
12381374 }
12391375
12401376 /*
....@@ -1249,7 +1385,6 @@
12491385 bool new_device_added = false;
12501386 struct btrfs_device *device = NULL;
12511387 struct block_device *bdev;
1252
- struct page *page;
12531388 u64 bytenr;
12541389
12551390 lockdep_assert_held(&uuid_mutex);
....@@ -1267,8 +1402,9 @@
12671402 if (IS_ERR(bdev))
12681403 return ERR_CAST(bdev);
12691404
1270
- if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
1271
- device = ERR_PTR(-EINVAL);
1405
+ disk_super = btrfs_read_disk_super(bdev, bytenr);
1406
+ if (IS_ERR(disk_super)) {
1407
+ device = ERR_CAST(disk_super);
12721408 goto error_bdev_put;
12731409 }
12741410
....@@ -1278,7 +1414,7 @@
12781414 btrfs_free_stale_devices(path, device);
12791415 }
12801416
1281
- btrfs_release_disk_super(page);
1417
+ btrfs_release_disk_super(disk_super);
12821418
12831419 error_bdev_put:
12841420 blkdev_put(bdev, flags);
....@@ -1286,60 +1422,84 @@
12861422 return device;
12871423 }
12881424
1289
-static int contains_pending_extent(struct btrfs_transaction *transaction,
1290
- struct btrfs_device *device,
1291
- u64 *start, u64 len)
1425
+/*
1426
+ * Try to find a chunk that intersects [start, start + len] range and when one
1427
+ * such is found, record the end of it in *start
1428
+ */
1429
+static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1430
+ u64 len)
12921431 {
1293
- struct btrfs_fs_info *fs_info = device->fs_info;
1294
- struct extent_map *em;
1295
- struct list_head *search_list = &fs_info->pinned_chunks;
1296
- int ret = 0;
1297
- u64 physical_start = *start;
1432
+ u64 physical_start, physical_end;
12981433
1299
- if (transaction)
1300
- search_list = &transaction->pending_chunks;
1301
-again:
1302
- list_for_each_entry(em, search_list, list) {
1303
- struct map_lookup *map;
1304
- int i;
1434
+ lockdep_assert_held(&device->fs_info->chunk_mutex);
13051435
1306
- map = em->map_lookup;
1307
- for (i = 0; i < map->num_stripes; i++) {
1308
- u64 end;
1436
+ if (!find_first_extent_bit(&device->alloc_state, *start,
1437
+ &physical_start, &physical_end,
1438
+ CHUNK_ALLOCATED, NULL)) {
13091439
1310
- if (map->stripes[i].dev != device)
1311
- continue;
1312
- if (map->stripes[i].physical >= physical_start + len ||
1313
- map->stripes[i].physical + em->orig_block_len <=
1314
- physical_start)
1315
- continue;
1316
- /*
1317
- * Make sure that while processing the pinned list we do
1318
- * not override our *start with a lower value, because
1319
- * we can have pinned chunks that fall within this
1320
- * device hole and that have lower physical addresses
1321
- * than the pending chunks we processed before. If we
1322
- * do not take this special care we can end up getting
1323
- * 2 pending chunks that start at the same physical
1324
- * device offsets because the end offset of a pinned
1325
- * chunk can be equal to the start offset of some
1326
- * pending chunk.
1327
- */
1328
- end = map->stripes[i].physical + em->orig_block_len;
1329
- if (end > *start) {
1330
- *start = end;
1331
- ret = 1;
1332
- }
1440
+ if (in_range(physical_start, *start, len) ||
1441
+ in_range(*start, physical_start,
1442
+ physical_end - physical_start)) {
1443
+ *start = physical_end + 1;
1444
+ return true;
13331445 }
13341446 }
1335
- if (search_list != &fs_info->pinned_chunks) {
1336
- search_list = &fs_info->pinned_chunks;
1337
- goto again;
1338
- }
1339
-
1340
- return ret;
1447
+ return false;
13411448 }
13421449
1450
+static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1451
+{
1452
+ switch (device->fs_devices->chunk_alloc_policy) {
1453
+ case BTRFS_CHUNK_ALLOC_REGULAR:
1454
+ /*
1455
+ * We don't want to overwrite the superblock on the drive nor
1456
+ * any area used by the boot loader (grub for example), so we
1457
+ * make sure to start at an offset of at least 1MB.
1458
+ */
1459
+ return max_t(u64, start, SZ_1M);
1460
+ default:
1461
+ BUG();
1462
+ }
1463
+}
1464
+
1465
+/**
1466
+ * dev_extent_hole_check - check if specified hole is suitable for allocation
1467
+ * @device: the device which we have the hole
1468
+ * @hole_start: starting position of the hole
1469
+ * @hole_size: the size of the hole
1470
+ * @num_bytes: the size of the free space that we need
1471
+ *
1472
+ * This function may modify @hole_start and @hole_end to reflect the suitable
1473
+ * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
1474
+ */
1475
+static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1476
+ u64 *hole_size, u64 num_bytes)
1477
+{
1478
+ bool changed = false;
1479
+ u64 hole_end = *hole_start + *hole_size;
1480
+
1481
+ /*
1482
+ * Check before we set max_hole_start, otherwise we could end up
1483
+ * sending back this offset anyway.
1484
+ */
1485
+ if (contains_pending_extent(device, hole_start, *hole_size)) {
1486
+ if (hole_end >= *hole_start)
1487
+ *hole_size = hole_end - *hole_start;
1488
+ else
1489
+ *hole_size = 0;
1490
+ changed = true;
1491
+ }
1492
+
1493
+ switch (device->fs_devices->chunk_alloc_policy) {
1494
+ case BTRFS_CHUNK_ALLOC_REGULAR:
1495
+ /* No extra check */
1496
+ break;
1497
+ default:
1498
+ BUG();
1499
+ }
1500
+
1501
+ return changed;
1502
+}
13431503
13441504 /*
13451505 * find_free_dev_extent_start - find free space in the specified device
....@@ -1361,10 +1521,16 @@
13611521 * @len is used to store the size of the free space that we find.
13621522 * But if we don't find suitable free space, it is used to store the size of
13631523 * the max free space.
1524
+ *
1525
+ * NOTE: This function will search *commit* root of device tree, and does extra
1526
+ * check to ensure dev extents are not double allocated.
1527
+ * This makes the function safe to allocate dev extents but may not report
1528
+ * correct usable device space, as device extent freed in current transaction
1529
+ * is not reported as avaiable.
13641530 */
1365
-int find_free_dev_extent_start(struct btrfs_transaction *transaction,
1366
- struct btrfs_device *device, u64 num_bytes,
1367
- u64 search_start, u64 *start, u64 *len)
1531
+static int find_free_dev_extent_start(struct btrfs_device *device,
1532
+ u64 num_bytes, u64 search_start, u64 *start,
1533
+ u64 *len)
13681534 {
13691535 struct btrfs_fs_info *fs_info = device->fs_info;
13701536 struct btrfs_root *root = fs_info->dev_root;
....@@ -1380,12 +1546,7 @@
13801546 int slot;
13811547 struct extent_buffer *l;
13821548
1383
- /*
1384
- * We don't want to overwrite the superblock on the drive nor any area
1385
- * used by the boot loader (grub for example), so we make sure to start
1386
- * at an offset of at least 1MB.
1387
- */
1388
- search_start = max_t(u64, search_start, SZ_1M);
1549
+ search_start = dev_extent_search_start(device, search_start);
13891550
13901551 path = btrfs_alloc_path();
13911552 if (!path)
....@@ -1443,21 +1604,8 @@
14431604
14441605 if (key.offset > search_start) {
14451606 hole_size = key.offset - search_start;
1446
-
1447
- /*
1448
- * Have to check before we set max_hole_start, otherwise
1449
- * we could end up sending back this offset anyway.
1450
- */
1451
- if (contains_pending_extent(transaction, device,
1452
- &search_start,
1453
- hole_size)) {
1454
- if (key.offset >= search_start) {
1455
- hole_size = key.offset - search_start;
1456
- } else {
1457
- WARN_ON_ONCE(1);
1458
- hole_size = 0;
1459
- }
1460
- }
1607
+ dev_extent_hole_check(device, &search_start, &hole_size,
1608
+ num_bytes);
14611609
14621610 if (hole_size > max_hole_size) {
14631611 max_hole_start = search_start;
....@@ -1496,9 +1644,8 @@
14961644 */
14971645 if (search_end > search_start) {
14981646 hole_size = search_end - search_start;
1499
-
1500
- if (contains_pending_extent(transaction, device, &search_start,
1501
- hole_size)) {
1647
+ if (dev_extent_hole_check(device, &search_start, &hole_size,
1648
+ num_bytes)) {
15021649 btrfs_release_path(path);
15031650 goto again;
15041651 }
....@@ -1523,13 +1670,11 @@
15231670 return ret;
15241671 }
15251672
1526
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
1527
- struct btrfs_device *device, u64 num_bytes,
1673
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
15281674 u64 *start, u64 *len)
15291675 {
15301676 /* FIXME use last free of some kind */
1531
- return find_free_dev_extent_start(trans->transaction, device,
1532
- num_bytes, 0, start, len);
1677
+ return find_free_dev_extent_start(device, num_bytes, 0, start, len);
15331678 }
15341679
15351680 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
@@ -1640,9 +1785,9 @@
 	struct rb_node *n;
 	u64 ret = 0;

-	em_tree = &fs_info->mapping_tree.map_tree;
+	em_tree = &fs_info->mapping_tree;
 	read_lock(&em_tree->lock);
-	n = rb_last(&em_tree->map);
+	n = rb_last(&em_tree->map.rb_root);
 	if (n) {
 		em = rb_entry(n, struct extent_map, rb_node);
 		ret = em->start + em->len;
@@ -1672,7 +1817,12 @@
 	if (ret < 0)
 		goto error;

-	BUG_ON(ret == 0); /* Corruption */
+	if (ret == 0) {
+		/* Corruption */
+		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
+		ret = -EUCLEAN;
+		goto error;
+	}

 	ret = btrfs_previous_item(fs_info->chunk_root, path,
 				  BTRFS_DEV_ITEMS_OBJECTID,
@@ -1738,7 +1888,8 @@
 	ptr = btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 	ptr = btrfs_device_fsid(dev_item);
-	write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
+	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
+			    ptr, BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);

 	ret = 0;
@@ -1750,22 +1901,27 @@
 /*
  * Function to update ctime/mtime for a given device path.
  * Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here, this is just to be kind to userspace.
  */
-static void update_dev_time(const char *path_name)
+static void update_dev_time(const char *device_path)
 {
-	struct file *filp;
+	struct path path;
+	struct timespec64 now;
+	int ret;

-	filp = filp_open(path_name, O_RDWR, 0);
-	if (IS_ERR(filp))
+	ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+	if (ret)
 		return;
-	file_update_time(filp);
-	filp_close(filp, NULL);
+
+	now = current_time(d_inode(path.dentry));
+	inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+	path_put(&path);
 }

-static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
-			     struct btrfs_device *device)
+static int btrfs_rm_dev_item(struct btrfs_device *device)
 {
-	struct btrfs_root *root = fs_info->chunk_root;
+	struct btrfs_root *root = device->fs_info->chunk_root;
 	int ret;
 	struct btrfs_path *path;
 	struct btrfs_key key;
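
The new update_dev_time() resolves the path and bumps the inode timestamps directly instead of opening the device node. A userspace analogue of the same courtesy-to-libblkid idea is sketched below; it is not the kernel helper, and like it, errors are deliberately ignored (utimensat() with NULL times sets atime/mtime to now, and ctime changes as a side effect):

/* Userspace analogue: touch a device node so timestamp-based probes rescan. */
#include <fcntl.h>
#include <sys/stat.h>

static void touch_dev_time(const char *device_path)
{
	/* NULL times = set atime/mtime to the current time */
	(void)utimensat(AT_FDCWD, device_path, NULL, 0);
}
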
@@ -1862,17 +2018,14 @@
  * where this function called, there should be always be another device (or
  * this_dev) which is active.
  */
-void btrfs_assign_next_active_device(struct btrfs_device *device,
-				     struct btrfs_device *this_dev)
+void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
+					    struct btrfs_device *next_device)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct btrfs_device *next_device;

-	if (this_dev)
-		next_device = this_dev;
-	else
+	if (!next_device)
 		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
-							    device);
+							    device);
 	ASSERT(next_device);

 	if (fs_info->sb->s_bdev &&
@@ -1883,8 +2036,66 @@
 	fs_info->fs_devices->latest_bdev = next_device->bdev;
 }

+/*
+ * Return btrfs_fs_devices::num_devices excluding the device that's being
+ * currently replaced.
+ */
+static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
+{
+	u64 num_devices = fs_info->fs_devices->num_devices;
+
+	down_read(&fs_info->dev_replace.rwsem);
+	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+		ASSERT(num_devices > 1);
+		num_devices--;
+	}
+	up_read(&fs_info->dev_replace.rwsem);
+
+	return num_devices;
+}
+
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+			       struct block_device *bdev,
+			       const char *device_path)
+{
+	struct btrfs_super_block *disk_super;
+	int copy_num;
+
+	if (!bdev)
+		return;
+
+	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
+		struct page *page;
+		int ret;
+
+		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
+		if (IS_ERR(disk_super))
+			continue;
+
+		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+
+		page = virt_to_page(disk_super);
+		set_page_dirty(page);
+		lock_page(page);
+		/* write_one_page() unlocks the page */
+		ret = write_one_page(page);
+		if (ret)
+			btrfs_warn(fs_info,
+				"error clearing superblock number %d (%d)",
+				copy_num, ret);
+		btrfs_release_disk_super(disk_super);
+
+	}
+
+	/* Notify udev that device has changed */
+	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+	/* Update ctime/mtime for device path for libblkid */
+	update_dev_time(device_path);
+}
+
 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
-		u64 devid)
+		    u64 devid)
 {
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *cur_devices;
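
btrfs_scratch_superblocks() above wipes the magic from every on-disk superblock copy so scanners no longer recognize the device. The mirror offsets it visits are fixed by the format: the primary copy at 64 KiB and mirrors spaced by a shift of 12 per copy (64 MiB, 256 GiB); mirrors past the end of a small device simply do not exist. A standalone sketch that mirrors the logic of the kernel's btrfs_sb_offset() (constant names here are local stand-ins):

#include <stdint.h>
#include <stdio.h>

#define SUPER_INFO_OFFSET	65536ULL	/* primary copy at 64 KiB */
#define SUPER_MIRROR_MAX	3
#define SUPER_MIRROR_SHIFT	12

static uint64_t sb_offset(int mirror)
{
	if (mirror)
		return 16384ULL << (SUPER_MIRROR_SHIFT * mirror);
	return SUPER_INFO_OFFSET;
}

int main(void)
{
	/* prints 65536, 67108864, 274877906944 */
	for (int i = 0; i < SUPER_MIRROR_MAX; i++)
		printf("copy %d at %llu\n", i, (unsigned long long)sb_offset(i));
	return 0;
}
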
@@ -1892,24 +2103,35 @@
 	u64 num_devices;
 	int ret = 0;

-	mutex_lock(&uuid_mutex);
-
-	num_devices = fs_devices->num_devices;
-	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
-	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
-		WARN_ON(num_devices < 1);
-		num_devices--;
-	}
-	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+	/*
+	 * The device list in fs_devices is accessed without locks (neither
+	 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+	 * filesystem and another device rm cannot run.
+	 */
+	num_devices = btrfs_num_devices(fs_info);

 	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
 	if (ret)
 		goto out;

-	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
-					   &device);
-	if (ret)
+	device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
+
+	if (IS_ERR(device)) {
+		if (PTR_ERR(device) == -ENOENT &&
+		    device_path && strcmp(device_path, "missing") == 0)
+			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
+		else
+			ret = PTR_ERR(device);
 		goto out;
+	}
+
+	if (btrfs_pinned_by_swapfile(fs_info, device)) {
+		btrfs_warn_in_rcu(fs_info,
+	  "cannot remove device %s (devid %llu) due to active swapfile",
+				  rcu_str_deref(device->name), device->devid);
+		ret = -ETXTBSY;
+		goto out;
+	}

 	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
 		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
@@ -1929,9 +2151,9 @@
 		mutex_unlock(&fs_info->chunk_mutex);
 	}

-	mutex_unlock(&uuid_mutex);
 	ret = btrfs_shrink_device(device, 0);
-	mutex_lock(&uuid_mutex);
+	if (!ret)
+		btrfs_reada_remove_dev(device);
 	if (ret)
 		goto error_undo;

@@ -1940,12 +2162,12 @@
 	 * counter although write_all_supers() is not locked out. This
 	 * could give a filesystem state which requires a degraded mount.
 	 */
-	ret = btrfs_rm_dev_item(fs_info, device);
+	ret = btrfs_rm_dev_item(device);
 	if (ret)
 		goto error_undo;

 	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
-	btrfs_scrub_cancel_dev(fs_info, device);
+	btrfs_scrub_cancel_dev(device);

 	/*
 	 * the device list mutex makes sure that we don't change
@@ -1980,7 +2202,7 @@
 	if (device->bdev) {
 		cur_devices->open_devices--;
 		/* remove sysfs entry */
-		btrfs_sysfs_rm_device_link(fs_devices, device);
+		btrfs_sysfs_remove_device(device);
 	}

 	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
@@ -1993,29 +2215,24 @@
 	 * supers and free the device.
 	 */
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
-		btrfs_scratch_superblocks(device->bdev, device->name->str);
+		btrfs_scratch_superblocks(fs_info, device->bdev,
+					  device->name->str);

 	btrfs_close_bdev(device);
-	call_rcu(&device->rcu, free_device_rcu);
+	synchronize_rcu();
+	btrfs_free_device(device);

 	if (cur_devices->open_devices == 0) {
-		while (fs_devices) {
-			if (fs_devices->seed == cur_devices) {
-				fs_devices->seed = cur_devices->seed;
-				break;
-			}
-			fs_devices = fs_devices->seed;
-		}
-		cur_devices->seed = NULL;
+		list_del_init(&cur_devices->seed_list);
 		close_fs_devices(cur_devices);
 		free_fs_devices(cur_devices);
 	}

 out:
-	mutex_unlock(&uuid_mutex);
 	return ret;

 error_undo:
+	btrfs_reada_undo_remove_dev(device);
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		mutex_lock(&fs_info->chunk_mutex);
 		list_add(&device->dev_alloc_list,
@@ -2053,23 +2270,18 @@
 	fs_devices->open_devices--;
 }

-void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
-				      struct btrfs_device *srcdev)
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
 {
 	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

-	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
-		/* zero out the old super if it is writable */
-		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
-	}
+	mutex_lock(&uuid_mutex);

 	btrfs_close_bdev(srcdev);
-	call_rcu(&srcdev->rcu, free_device_rcu);
+	synchronize_rcu();
+	btrfs_free_device(srcdev);

 	/* if this is no devs we rather delete the fs_devices */
 	if (!fs_devices->num_devices) {
-		struct btrfs_fs_devices *tmp_fs_devices;
-
 		/*
 		 * On a mounted FS, num_devices can't be zero unless it's a
 		 * seed. In case of a seed device being replaced, the replace
@@ -2078,28 +2290,20 @@
 		 */
 		ASSERT(fs_devices->seeding);

-		tmp_fs_devices = fs_info->fs_devices;
-		while (tmp_fs_devices) {
-			if (tmp_fs_devices->seed == fs_devices) {
-				tmp_fs_devices->seed = fs_devices->seed;
-				break;
-			}
-			tmp_fs_devices = tmp_fs_devices->seed;
-		}
-		fs_devices->seed = NULL;
+		list_del_init(&fs_devices->seed_list);
 		close_fs_devices(fs_devices);
 		free_fs_devices(fs_devices);
 	}
+	mutex_unlock(&uuid_mutex);
 }

 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
 {
 	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;

-	WARN_ON(!tgtdev);
 	mutex_lock(&fs_devices->device_list_mutex);

-	btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
+	btrfs_sysfs_remove_device(tgtdev);

 	if (tgtdev->bdev)
 		fs_devices->open_devices--;
@@ -2119,90 +2323,77 @@
 	 * is already out of device list, so we don't have to hold
 	 * the device_list_mutex lock.
 	 */
-	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
+				  tgtdev->name->str);

 	btrfs_close_bdev(tgtdev);
-	call_rcu(&tgtdev->rcu, free_device_rcu);
+	synchronize_rcu();
+	btrfs_free_device(tgtdev);
 }

-static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
-				     const char *device_path,
-				     struct btrfs_device **device)
+static struct btrfs_device *btrfs_find_device_by_path(
+		struct btrfs_fs_info *fs_info, const char *device_path)
 {
 	int ret = 0;
 	struct btrfs_super_block *disk_super;
 	u64 devid;
 	u8 *dev_uuid;
 	struct block_device *bdev;
-	struct buffer_head *bh;
+	struct btrfs_device *device;

-	*device = NULL;
 	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
-				    fs_info->bdev_holder, 0, &bdev, &bh);
+				    fs_info->bdev_holder, 0, &bdev, &disk_super);
 	if (ret)
-		return ret;
-	disk_super = (struct btrfs_super_block *)bh->b_data;
+		return ERR_PTR(ret);
+
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	dev_uuid = disk_super->dev_item.uuid;
-	*device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
-				    disk_super->fsid, true);
-	brelse(bh);
-	if (!*device)
-		ret = -ENOENT;
+	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+					   disk_super->metadata_uuid, true);
+	else
+		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
					   disk_super->fsid, true);
+
+	btrfs_release_disk_super(disk_super);
+	if (!device)
+		device = ERR_PTR(-ENOENT);
 	blkdev_put(bdev, FMODE_READ);
-	return ret;
-}
-
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
-					 const char *device_path,
-					 struct btrfs_device **device)
-{
-	*device = NULL;
-	if (strcmp(device_path, "missing") == 0) {
-		struct list_head *devices;
-		struct btrfs_device *tmp;
-
-		devices = &fs_info->fs_devices->devices;
-		list_for_each_entry(tmp, devices, dev_list) {
-			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
-					&tmp->dev_state) && !tmp->bdev) {
-				*device = tmp;
-				break;
-			}
-		}
-
-		if (!*device)
-			return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
-
-		return 0;
-	} else {
-		return btrfs_find_device_by_path(fs_info, device_path, device);
-	}
+	return device;
 }

 /*
  * Lookup a device given by device id, or the path if the id is 0.
  */
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
-				 const char *devpath,
-				 struct btrfs_device **device)
+struct btrfs_device *btrfs_find_device_by_devspec(
+		struct btrfs_fs_info *fs_info, u64 devid,
+		const char *device_path)
 {
-	int ret;
+	struct btrfs_device *device;

 	if (devid) {
-		ret = 0;
-		*device = btrfs_find_device(fs_info->fs_devices, devid,
-					    NULL, NULL, true);
-		if (!*device)
-			ret = -ENOENT;
-	} else {
-		if (!devpath || !devpath[0])
-			return -EINVAL;
-
-		ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
-							   device);
+		device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
+					   NULL, true);
+		if (!device)
+			return ERR_PTR(-ENOENT);
+		return device;
 	}
-	return ret;
+
+	if (!device_path || !device_path[0])
+		return ERR_PTR(-EINVAL);
+
+	if (strcmp(device_path, "missing") == 0) {
+		/* Find first missing device */
+		list_for_each_entry(device, &fs_info->fs_devices->devices,
+				    dev_list) {
+			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+				     &device->dev_state) && !device->bdev)
+				return device;
		}
+		return ERR_PTR(-ENOENT);
+	}
+
+	return btrfs_find_device_by_path(fs_info, device_path);
 }

 /*
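
The lookup helpers above now return a struct btrfs_device pointer with errors encoded via ERR_PTR() instead of an int return plus out-parameter. The ERR_PTR/IS_ERR/PTR_ERR macros are real kernel interfaces (include/linux/err.h); the standalone sketch below mimics them in userspace, and the device table is hypothetical:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(intptr_t)(err))
#define PTR_ERR(ptr)	((long)(intptr_t)(ptr))
#define IS_ERR(ptr)	((uintptr_t)(ptr) >= (uintptr_t)-MAX_ERRNO)

struct device { const char *name; };

static struct device devices[] = { { "sda" }, { "sdb" } };

static struct device *find_device(const char *name)
{
	if (!name || !name[0])
		return ERR_PTR(-EINVAL);	/* error rides in the pointer */
	for (size_t i = 0; i < sizeof(devices) / sizeof(devices[0]); i++)
		if (strcmp(devices[i].name, name) == 0)
			return &devices[i];
	return ERR_PTR(-ENOENT);	/* no match */
}

int main(void)
{
	struct device *dev = find_device("sdc");

	if (IS_ERR(dev))
		printf("lookup failed: %ld\n", PTR_ERR(dev));
	else
		printf("found %s\n", dev->name);
	return 0;
}
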
@@ -2221,10 +2412,20 @@
 	if (!fs_devices->seeding)
 		return -EINVAL;

-	seed_devices = alloc_fs_devices(NULL);
+	/*
+	 * Private copy of the seed devices, anchored at
+	 * fs_info->fs_devices->seed_list
+	 */
+	seed_devices = alloc_fs_devices(NULL, NULL);
 	if (IS_ERR(seed_devices))
 		return PTR_ERR(seed_devices);

+	/*
+	 * It's necessary to retain a copy of the original seed fs_devices in
+	 * fs_uuids so that filesystems which have been seeded can successfully
+	 * reference the seed device from open_seed_devices. This also supports
+	 * multiple fs seed.
+	 */
 	old_devices = clone_fs_devices(fs_devices);
 	if (IS_ERR(old_devices)) {
 		kfree(seed_devices);
@@ -2245,19 +2446,15 @@
 	list_for_each_entry(device, &seed_devices->devices, dev_list)
 		device->fs_devices = seed_devices;

-	mutex_lock(&fs_info->chunk_mutex);
-	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
-	mutex_unlock(&fs_info->chunk_mutex);
-
-	fs_devices->seeding = 0;
+	fs_devices->seeding = false;
 	fs_devices->num_devices = 0;
 	fs_devices->open_devices = 0;
 	fs_devices->missing_devices = 0;
-	fs_devices->rotating = 0;
-	fs_devices->seed = seed_devices;
+	fs_devices->rotating = false;
+	list_add(&seed_devices->seed_list, &fs_devices->seed_list);

 	generate_random_uuid(fs_devices->fsid);
-	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	mutex_unlock(&fs_devices->device_list_mutex);

@@ -2271,9 +2468,9 @@
 /*
  * Store the expected generation for seed devices in device items.
  */
-static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
-			       struct btrfs_fs_info *fs_info)
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *root = fs_info->chunk_root;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
@@ -2357,7 +2554,7 @@
 	u64 orig_super_num_devices;
 	int seeding_dev = 0;
 	int ret = 0;
-	bool unlocked = false;
+	bool locked = false;

 	if (sb_rdonly(sb) && !fs_devices->seeding)
 		return -EROFS;
@@ -2371,20 +2568,20 @@
 		seeding_dev = 1;
 		down_write(&sb->s_umount);
 		mutex_lock(&uuid_mutex);
+		locked = true;
 	}

-	filemap_write_and_wait(bdev->bd_inode->i_mapping);
+	sync_blockdev(bdev);

-	mutex_lock(&fs_devices->device_list_mutex);
-	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
 		if (device->bdev == bdev) {
 			ret = -EEXIST;
-			mutex_unlock(
-				&fs_devices->device_list_mutex);
+			rcu_read_unlock();
 			goto error;
 		}
 	}
-	mutex_unlock(&fs_devices->device_list_mutex);
+	rcu_read_unlock();

 	device = btrfs_alloc_device(fs_info, NULL, NULL);
 	if (IS_ERR(device)) {
@@ -2448,7 +2645,7 @@
 	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

 	if (!blk_queue_nonrot(q))
-		fs_devices->rotating = 1;
+		fs_devices->rotating = true;

 	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
 	btrfs_set_super_total_bytes(fs_info->super_copy,
@@ -2468,13 +2665,13 @@
 	mutex_unlock(&fs_info->chunk_mutex);

 	/* Add sysfs device entry */
-	btrfs_sysfs_add_device_link(fs_devices, device);
+	btrfs_sysfs_add_device(device);

 	mutex_unlock(&fs_devices->device_list_mutex);

 	if (seeding_dev) {
 		mutex_lock(&fs_info->chunk_mutex);
-		ret = init_first_rw_device(trans, fs_info);
+		ret = init_first_rw_device(trans);
 		mutex_unlock(&fs_info->chunk_mutex);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
@@ -2489,22 +2686,17 @@
 	}

 	if (seeding_dev) {
-		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
-
-		ret = btrfs_finish_sprout(trans, fs_info);
+		ret = btrfs_finish_sprout(trans);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			goto error_sysfs;
 		}

-		/* Sprouting would change fsid of the mounted root,
-		 * so rename the fsid on the sysfs
+		/*
+		 * fs_devices now represents the newly sprouted filesystem and
+		 * its fsid has been changed by btrfs_prepare_sprout
 		 */
-		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
-			 fs_info->fsid);
-		if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
-			btrfs_warn(fs_info,
-				   "sysfs: failed to create fsid for sprout");
+		btrfs_sysfs_update_sprout_fsid(fs_devices);
 	}

 	ret = btrfs_commit_transaction(trans);
@@ -2512,7 +2704,7 @@
 	if (seeding_dev) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
-		unlocked = true;
+		locked = false;

 		if (ret) /* transaction commit */
 			return ret;
@@ -2532,12 +2724,22 @@
 		ret = btrfs_commit_transaction(trans);
 	}

-	/* Update ctime/mtime for libblkid */
+	/*
+	 * Now that we have written a new super block to this device, check all
+	 * other fs_devices list if device_path alienates any other scanned
+	 * device.
+	 * We can ignore the return value as it typically returns -EINVAL and
+	 * only succeeds if the device was an alien.
+	 */
+	btrfs_forget_devices(device_path);
+
+	/* Update ctime/mtime for blkid or udev */
 	update_dev_time(device_path);
+
 	return ret;

 error_sysfs:
-	btrfs_sysfs_rm_device_link(fs_devices, device);
+	btrfs_sysfs_remove_device(device);
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	mutex_lock(&fs_info->chunk_mutex);
 	list_del_rcu(&device->dev_list);
@@ -2563,7 +2765,7 @@
 	btrfs_free_device(device);
 error:
 	blkdev_put(bdev, FMODE_EXCL);
-	if (seeding_dev && !unlocked) {
+	if (locked) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
 	}
@@ -2621,7 +2823,6 @@
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
 	struct btrfs_super_block *super_copy = fs_info->super_copy;
-	struct btrfs_fs_devices *fs_devices;
 	u64 old_total;
 	u64 diff;

@@ -2640,8 +2841,6 @@
 		return -EINVAL;
 	}

-	fs_devices = fs_info->fs_devices;
-
 	btrfs_set_super_total_bytes(super_copy,
 			round_down(old_total + diff, fs_info->sectorsize));
 	device->fs_devices->total_rw_bytes += diff;
@@ -2649,9 +2848,9 @@
 	btrfs_device_set_total_bytes(device, new_size);
 	btrfs_device_set_disk_total_bytes(device, new_size);
 	btrfs_clear_space_info_full(device->fs_info);
-	if (list_empty(&device->resized_list))
-		list_add_tail(&device->resized_list,
-			      &fs_devices->resized_devices);
+	if (list_empty(&device->post_commit_list))
+		list_add_tail(&device->post_commit_list,
+			      &trans->transaction->dev_update_list);
 	mutex_unlock(&fs_info->chunk_mutex);

 	return btrfs_update_device(trans, device);
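
The grow path above rounds the new total down to the filesystem sector size. For power-of-two alignments round_down() is just a bitmask; a self-contained illustration (macro definitions are local stand-ins for the kernel's, valid only for power-of-two alignment):

#include <stdint.h>
#include <stdio.h>

#define round_down(x, a)	((x) & ~((uint64_t)(a) - 1))
#define round_up(x, a)		(((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t new_size = 10737418240ULL + 123;	/* 10 GiB + 123 bytes */

	/* 4096 must be a power of two for the mask trick to be valid */
	printf("aligned: %llu\n",
	       (unsigned long long)round_down(new_size, 4096));
	return 0;
}
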
@@ -2739,13 +2938,20 @@
 	return ret;
 }

-static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
-					u64 logical, u64 length)
+/*
+ * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * @logical: Logical block offset in bytes.
+ * @length: Length of extent in bytes.
+ *
+ * Return: Chunk mapping or ERR_PTR.
+ */
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+				       u64 logical, u64 length)
 {
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;

-	em_tree = &fs_info->mapping_tree.map_tree;
+	em_tree = &fs_info->mapping_tree;
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, length);
 	read_unlock(&em_tree->lock);
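
btrfs_get_chunk_map() is a range lookup: find the mapping whose [start, start + len) interval covers the logical address. The kernel keeps the mappings in an rbtree under a rwlock; the userspace sketch below shows the same idea with a sorted array and binary search (all names are hypothetical):

#include <stdint.h>
#include <stdio.h>

struct chunk_map { uint64_t start, len; };

/* maps must be sorted by start and non-overlapping */
static const struct chunk_map *lookup_map(const struct chunk_map *maps,
					  size_t n, uint64_t logical)
{
	size_t lo = 0, hi = n;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (logical < maps[mid].start)
			hi = mid;
		else if (logical >= maps[mid].start + maps[mid].len)
			lo = mid + 1;
		else
			return &maps[mid];	/* interval contains logical */
	}
	return NULL;
}

int main(void)
{
	static const struct chunk_map maps[] = {
		{ 0, 1 << 20 }, { 1 << 20, 1 << 30 },
	};
	const struct chunk_map *m = lookup_map(maps, 2, (1 << 20) + 4096);

	if (m)
		printf("chunk starts at %llu\n", (unsigned long long)m->start);
	return 0;
}
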
@@ -2777,7 +2983,7 @@
 	int i, ret = 0;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

-	em = get_chunk_map(fs_info, chunk_offset, 1);
+	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 	if (IS_ERR(em)) {
 		/*
 		 * This is a logic error, but we don't want to just rely on the
@@ -2818,13 +3024,11 @@
 			mutex_unlock(&fs_info->chunk_mutex);
 		}

-		if (map->stripes[i].dev) {
-			ret = btrfs_update_device(trans, map->stripes[i].dev);
-			if (ret) {
-				mutex_unlock(&fs_devices->device_list_mutex);
-				btrfs_abort_transaction(trans, ret);
-				goto out;
-			}
+		ret = btrfs_update_device(trans, device);
+		if (ret) {
+			mutex_unlock(&fs_devices->device_list_mutex);
+			btrfs_abort_transaction(trans, ret);
+			goto out;
 		}
 	}
 	mutex_unlock(&fs_devices->device_list_mutex);
@@ -2861,6 +3065,7 @@
 {
 	struct btrfs_root *root = fs_info->chunk_root;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_block_group *block_group;
 	int ret;

 	/*
@@ -2877,10 +3082,6 @@
 	 */
 	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);

-	ret = btrfs_can_relocate(fs_info, chunk_offset);
-	if (ret)
-		return -ENOSPC;
-
 	/* step one, relocate all the extents inside this chunk */
 	btrfs_scrub_pause(fs_info);
 	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
@@ -2888,15 +3089,11 @@
 	if (ret)
 		return ret;

-	/*
-	 * We add the kobjects here (and after forcing data chunk creation)
-	 * since relocation is the only place we'll create chunks of a new
-	 * type at runtime. The only place where we'll remove the last
-	 * chunk of a type is the call immediately below this one. Even
-	 * so, we're protected against races with the cleaner thread since
-	 * we're covered by the delete_unused_bgs_mutex.
-	 */
-	btrfs_add_raid_kobjects(fs_info);
+	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
+	if (!block_group)
+		return -ENOENT;
+	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+	btrfs_put_block_group(block_group);

 	trans = btrfs_start_trans_remove_block_group(root->fs_info,
 						     chunk_offset);
@@ -2997,7 +3194,7 @@
 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
 				      u64 chunk_offset)
 {
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_block_group *cache;
 	u64 bytes_used;
 	u64 chunk_type;

@@ -3006,30 +3203,28 @@
 	chunk_type = cache->flags;
 	btrfs_put_block_group(cache);

-	if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
-		spin_lock(&fs_info->data_sinfo->lock);
-		bytes_used = fs_info->data_sinfo->bytes_used;
-		spin_unlock(&fs_info->data_sinfo->lock);
+	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
+		return 0;

-		if (!bytes_used) {
-			struct btrfs_trans_handle *trans;
-			int ret;
+	spin_lock(&fs_info->data_sinfo->lock);
+	bytes_used = fs_info->data_sinfo->bytes_used;
+	spin_unlock(&fs_info->data_sinfo->lock);

-			trans = btrfs_join_transaction(fs_info->tree_root);
-			if (IS_ERR(trans))
-				return PTR_ERR(trans);
+	if (!bytes_used) {
+		struct btrfs_trans_handle *trans;
+		int ret;

-			ret = btrfs_force_chunk_alloc(trans,
-						      BTRFS_BLOCK_GROUP_DATA);
-			btrfs_end_transaction(trans);
-			if (ret < 0)
-				return ret;
+		trans = btrfs_join_transaction(fs_info->tree_root);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);

-			btrfs_add_raid_kobjects(fs_info);
-
-			return 1;
-		}
+		ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
+		btrfs_end_transaction(trans);
+		if (ret < 0)
+			return ret;
+		return 1;
 	}
+
 	return 0;
 }

@@ -3099,7 +3294,7 @@
 	if (!path)
 		return -ENOMEM;

-	trans = btrfs_start_transaction(root, 0);
+	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
 	if (IS_ERR(trans)) {
 		btrfs_free_path(path);
 		return PTR_ERR(trans);
@@ -3208,28 +3403,28 @@
 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 				    struct btrfs_balance_args *bargs)
 {
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_block_group *cache;
 	u64 chunk_used;
 	u64 user_thresh_min;
 	u64 user_thresh_max;
 	int ret = 1;

 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
-	chunk_used = btrfs_block_group_used(&cache->item);
+	chunk_used = cache->used;

 	if (bargs->usage_min == 0)
 		user_thresh_min = 0;
 	else
-		user_thresh_min = div_factor_fine(cache->key.offset,
-						  bargs->usage_min);
+		user_thresh_min = div_factor_fine(cache->length,
+						  bargs->usage_min);

 	if (bargs->usage_max == 0)
 		user_thresh_max = 1;
 	else if (bargs->usage_max > 100)
-		user_thresh_max = cache->key.offset;
+		user_thresh_max = cache->length;
 	else
-		user_thresh_max = div_factor_fine(cache->key.offset,
-						  bargs->usage_max);
+		user_thresh_max = div_factor_fine(cache->length,
						  bargs->usage_max);

 	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
 		ret = 0;
@@ -3241,20 +3436,19 @@
 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
 		u64 chunk_offset, struct btrfs_balance_args *bargs)
 {
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_block_group *cache;
 	u64 chunk_used, user_thresh;
 	int ret = 1;

 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
-	chunk_used = btrfs_block_group_used(&cache->item);
+	chunk_used = cache->used;

 	if (bargs->usage_min == 0)
 		user_thresh = 1;
 	else if (bargs->usage > 100)
-		user_thresh = cache->key.offset;
+		user_thresh = cache->length;
 	else
-		user_thresh = div_factor_fine(cache->key.offset,
-					      bargs->usage);
+		user_thresh = div_factor_fine(cache->length, bargs->usage);

 	if (chunk_used < user_thresh)
 		ret = 0;
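
Both usage filters turn a percentage into a byte threshold against the block group length via div_factor_fine(num, factor), which computes num * factor / 100. A minimal standalone version (mind that num * factor can overflow near U64_MAX, the same caveat as the kernel helper):

#include <stdint.h>
#include <stdio.h>

static uint64_t div_factor_fine(uint64_t num, int factor)
{
	if (factor == 100)
		return num;
	return num * factor / 100;
}

int main(void)
{
	uint64_t length = 1073741824ULL;	/* 1 GiB block group */

	/* usage=75 keeps chunks that are less than 75% full */
	printf("threshold: %llu bytes\n",
	       (unsigned long long)div_factor_fine(length, 75));
	return 0;
}
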
@@ -3280,6 +3474,18 @@
 	return 1;
 }

+static u64 calc_data_stripes(u64 type, int num_stripes)
+{
+	const int index = btrfs_bg_flags_to_raid_index(type);
+	const int ncopies = btrfs_raid_array[index].ncopies;
+	const int nparity = btrfs_raid_array[index].nparity;
+
+	if (nparity)
+		return num_stripes - nparity;
+	else
+		return num_stripes / ncopies;
+}
+
 /* [pstart, pend) */
 static int chunk_drange_filter(struct extent_buffer *leaf,
 			       struct btrfs_chunk *chunk,
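
calc_data_stripes() reduces the old per-profile special cases to table arithmetic: parity profiles lose nparity stripes, mirrored profiles divide by ncopies. A standalone check with a trimmed-down, hypothetical copy of the raid table:

#include <stdio.h>

struct raid_attr { int ncopies, nparity; };

enum { RAID0, RAID1, RAID5, RAID6, RAID10 };

static const struct raid_attr raid_array[] = {
	[RAID0]  = { 1, 0 },
	[RAID1]  = { 2, 0 },
	[RAID5]  = { 1, 1 },
	[RAID6]  = { 1, 2 },
	[RAID10] = { 2, 0 },
};

static int calc_data_stripes(int index, int num_stripes)
{
	const int ncopies = raid_array[index].ncopies;
	const int nparity = raid_array[index].nparity;

	return nparity ? num_stripes - nparity : num_stripes / ncopies;
}

int main(void)
{
	/* 6-device raid6 chunk: 4 stripes carry data, 2 carry parity */
	printf("raid6/6:  %d data stripes\n", calc_data_stripes(RAID6, 6));
	/* 4-device raid10 chunk: 2 stripes worth of data */
	printf("raid10/4: %d data stripes\n", calc_data_stripes(RAID10, 4));
	return 0;
}
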
@@ -3289,22 +3495,15 @@
 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 	u64 stripe_offset;
 	u64 stripe_length;
+	u64 type;
 	int factor;
 	int i;

 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
 		return 0;

-	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
-	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
-		factor = num_stripes / 2;
-	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
-		factor = num_stripes - 1;
-	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
-		factor = num_stripes - 2;
-	} else {
-		factor = num_stripes;
-	}
+	type = btrfs_chunk_type(leaf, chunk);
+	factor = calc_data_stripes(type, num_stripes);

 	for (i = 0; i < num_stripes; i++) {
 		stripe = btrfs_stripe_nr(chunk, i);
@@ -3365,10 +3564,10 @@
 	return 0;
 }

-static int should_balance_chunk(struct btrfs_fs_info *fs_info,
-				struct extent_buffer *leaf,
+static int should_balance_chunk(struct extent_buffer *leaf,
 				struct btrfs_chunk *chunk, u64 chunk_offset)
 {
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 	struct btrfs_balance_args *bargs = NULL;
 	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
@@ -3458,17 +3657,11 @@
 {
 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 	struct btrfs_root *chunk_root = fs_info->chunk_root;
-	struct btrfs_root *dev_root = fs_info->dev_root;
-	struct list_head *devices;
-	struct btrfs_device *device;
-	u64 old_size;
-	u64 size_to_free;
 	u64 chunk_type;
 	struct btrfs_chunk *chunk;
 	struct btrfs_path *path = NULL;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	struct btrfs_trans_handle *trans;
 	struct extent_buffer *leaf;
 	int slot;
 	int ret;
3486
- /* step one make some room on all the devices */
3487
- devices = &fs_info->fs_devices->devices;
3488
- list_for_each_entry(device, devices, dev_list) {
3489
- old_size = btrfs_device_get_total_bytes(device);
3490
- size_to_free = div_factor(old_size, 1);
3491
- size_to_free = min_t(u64, size_to_free, SZ_1M);
3492
- if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
3493
- btrfs_device_get_total_bytes(device) -
3494
- btrfs_device_get_bytes_used(device) > size_to_free ||
3495
- test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
3496
- continue;
3497
-
3498
- ret = btrfs_shrink_device(device, old_size - size_to_free);
3499
- if (ret == -ENOSPC)
3500
- break;
3501
- if (ret) {
3502
- /* btrfs_shrink_device never returns ret > 0 */
3503
- WARN_ON(ret > 0);
3504
- goto error;
3505
- }
3506
-
3507
- trans = btrfs_start_transaction(dev_root, 0);
3508
- if (IS_ERR(trans)) {
3509
- ret = PTR_ERR(trans);
3510
- btrfs_info_in_rcu(fs_info,
3511
- "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
3512
- rcu_str_deref(device->name), ret,
3513
- old_size, old_size - size_to_free);
3514
- goto error;
3515
- }
3516
-
3517
- ret = btrfs_grow_device(trans, device, old_size);
3518
- if (ret) {
3519
- btrfs_end_transaction(trans);
3520
- /* btrfs_grow_device never returns ret > 0 */
3521
- WARN_ON(ret > 0);
3522
- btrfs_info_in_rcu(fs_info,
3523
- "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
3524
- rcu_str_deref(device->name), ret,
3525
- old_size, old_size - size_to_free);
3526
- goto error;
3527
- }
3528
-
3529
- btrfs_end_transaction(trans);
3530
- }
3531
-
3532
- /* step two, relocate all the chunks */
35333679 path = btrfs_alloc_path();
35343680 if (!path) {
35353681 ret = -ENOMEM;
....@@ -3601,8 +3747,7 @@
36013747 spin_unlock(&fs_info->balance_lock);
36023748 }
36033749
3604
- ret = should_balance_chunk(fs_info, leaf, chunk,
3605
- found_key.offset);
3750
+ ret = should_balance_chunk(leaf, chunk, found_key.offset);
36063751
36073752 btrfs_release_path(path);
36083753 if (!ret) {
....@@ -3659,10 +3804,15 @@
36593804
36603805 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
36613806 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3662
- if (ret && ret != -ENOSPC)
3663
- goto error;
36643807 if (ret == -ENOSPC) {
36653808 enospc_errors++;
3809
+ } else if (ret == -ETXTBSY) {
3810
+ btrfs_info(fs_info,
3811
+ "skipping relocation of block group %llu due to active swapfile",
3812
+ found_key.offset);
3813
+ ret = 0;
3814
+ } else if (ret) {
3815
+ goto error;
36663816 } else {
36673817 spin_lock(&fs_info->balance_lock);
36683818 bctl->stat.completed++;
....@@ -3711,8 +3861,7 @@
37113861 if (flags == 0)
37123862 return !extended; /* "0" is valid for usual profiles */
37133863
3714
- /* true if exactly one bit set */
3715
- return (flags & (flags - 1)) == 0;
3864
+ return has_single_bit_set(flags);
37163865 }
37173866
37183867 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
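
The open-coded test above is the classic single-bit trick: a value with exactly one bit set, ANDed with itself minus one, is zero. An equivalent standalone form (the kernel helper additionally rejects zero, which the caller here already handled):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool has_single_bit_set(uint64_t n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       has_single_bit_set(0),		/* 0: no bits set */
	       has_single_bit_set(1 << 5),	/* 1: exactly one bit */
	       has_single_bit_set(0x30));	/* 0: two bits set */
	return 0;
}
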
@@ -3723,13 +3872,179 @@
 	       atomic_read(&fs_info->balance_cancel_req) == 0);
 }

-/* Non-zero return value signifies invalidity */
-static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
-		u64 allowed)
+/*
+ * Validate target profile against allowed profiles and return true if it's OK.
+ * Otherwise print the error message and return false.
+ */
+static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
+		const struct btrfs_balance_args *bargs,
+		u64 allowed, const char *type)
 {
-	return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-		(!alloc_profile_is_valid(bctl_arg->target, 1) ||
-		 (bctl_arg->target & ~allowed)));
+	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+		return true;
+
+	/* Profile is valid and does not have bits outside of the allowed set */
+	if (alloc_profile_is_valid(bargs->target, 1) &&
+	    (bargs->target & ~allowed) == 0)
+		return true;
+
+	btrfs_err(fs_info, "balance: invalid convert %s profile %s",
+		  type, btrfs_bg_type_to_raid_name(bargs->target));
+	return false;
+}
+
+/*
+ * Fill @buf with textual description of balance filter flags @bargs, up to
+ * @size_buf including the terminating null. The output may be trimmed if it
+ * does not fit into the provided buffer.
+ */
+static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
+				 u32 size_buf)
+{
+	int ret;
+	u32 size_bp = size_buf;
+	char *bp = buf;
+	u64 flags = bargs->flags;
+	char tmp_buf[128] = {'\0'};
+
+	if (!flags)
+		return;
+
+#define CHECK_APPEND_NOARG(a)						\
+	do {								\
+		ret = snprintf(bp, size_bp, (a));			\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+#define CHECK_APPEND_1ARG(a, v1)					\
+	do {								\
+		ret = snprintf(bp, size_bp, (a), (v1));			\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+#define CHECK_APPEND_2ARG(a, v1, v2)					\
+	do {								\
+		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
+		CHECK_APPEND_1ARG("convert=%s,",
+				  btrfs_bg_type_to_raid_name(bargs->target));
+
+	if (flags & BTRFS_BALANCE_ARGS_SOFT)
+		CHECK_APPEND_NOARG("soft,");
+
+	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
+		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
+					    sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
+	}
+
+	if (flags & BTRFS_BALANCE_ARGS_USAGE)
+		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
+
+	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
+		CHECK_APPEND_2ARG("usage=%u..%u,",
+				  bargs->usage_min, bargs->usage_max);
+
+	if (flags & BTRFS_BALANCE_ARGS_DEVID)
+		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
+
+	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
+		CHECK_APPEND_2ARG("drange=%llu..%llu,",
+				  bargs->pstart, bargs->pend);
+
+	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
+		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
+				  bargs->vstart, bargs->vend);
+
+	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
+		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
+
+	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
+		CHECK_APPEND_2ARG("limit=%u..%u,",
+				  bargs->limit_min, bargs->limit_max);
+
+	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
+		CHECK_APPEND_2ARG("stripes=%u..%u,",
+				  bargs->stripes_min, bargs->stripes_max);
+
+#undef CHECK_APPEND_2ARG
+#undef CHECK_APPEND_1ARG
+#undef CHECK_APPEND_NOARG
+
+out_overflow:
+
+	if (size_bp < size_buf)
+		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
+	else
+		buf[0] = '\0';
+}
+
+static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
+{
+	u32 size_buf = 1024;
+	char tmp_buf[192] = {'\0'};
+	char *buf;
+	char *bp;
+	u32 size_bp = size_buf;
+	int ret;
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+	buf = kzalloc(size_buf, GFP_KERNEL);
+	if (!buf)
+		return;
+
+	bp = buf;
+
+#define CHECK_APPEND_1ARG(a, v1)					\
+	do {								\
+		ret = snprintf(bp, size_bp, (a), (v1));			\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+	if (bctl->flags & BTRFS_BALANCE_FORCE)
+		CHECK_APPEND_1ARG("%s", "-f ");
+
+	if (bctl->flags & BTRFS_BALANCE_DATA) {
+		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
+	}
+
+	if (bctl->flags & BTRFS_BALANCE_METADATA) {
+		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
+	}
+
+	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
+		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
+	}
+
+#undef CHECK_APPEND_1ARG
+
+out_overflow:
+
+	if (size_bp < size_buf)
+		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
+	btrfs_info(fs_info, "balance: %s %s",
+		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
+		   "resume" : "start", buf);
+
+	kfree(buf);
 }

 /*
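
The CHECK_APPEND_* macros in the hunk above implement a bounded append: snprintf() returns the would-be length, so `ret >= size_bp` detects truncation and bails out of the loop. The same pattern as a plain userspace function (function and variable names are illustrative, not the kernel's):

#include <stdarg.h>
#include <stdio.h>

/* Returns 0 on success, -1 if the buffer would overflow. */
static int buf_append(char **bp, unsigned int *size_bp, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = vsnprintf(*bp, *size_bp, fmt, args);
	va_end(args);
	if (ret < 0 || (unsigned int)ret >= *size_bp)
		return -1;	/* output truncated, stop appending */
	*size_bp -= ret;
	*bp += ret;
	return 0;
}

int main(void)
{
	char buf[32];
	char *bp = buf;
	unsigned int left = sizeof(buf);

	if (buf_append(&bp, &left, "convert=%s,", "raid1") == 0 &&
	    buf_append(&bp, &left, "usage=%u,", 50) == 0)
		bp[-1] = '\0';	/* strip the trailing comma, as the hunk does */
	printf("%s\n", buf);	/* convert=raid1,usage=50 */
	return 0;
}
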
@@ -3745,11 +4060,12 @@
 	int ret;
 	u64 num_devices;
 	unsigned seq;
-	bool reducing_integrity;
+	bool reducing_redundancy;
+	int i;

 	if (btrfs_fs_closing(fs_info) ||
 	    atomic_read(&fs_info->balance_pause_req) ||
-	    atomic_read(&fs_info->balance_cancel_req)) {
+	    btrfs_should_cancel_balance(fs_info)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -3774,54 +4090,39 @@
 		}
 	}

-	num_devices = fs_info->fs_devices->num_devices;
-	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
-	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
-		BUG_ON(num_devices < 1);
-		num_devices--;
-	}
-	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
-	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
-	if (num_devices > 1)
-		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
-	if (num_devices > 2)
-		allowed |= BTRFS_BLOCK_GROUP_RAID5;
-	if (num_devices > 3)
-		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
-			    BTRFS_BLOCK_GROUP_RAID6);
-	if (validate_convert_profile(&bctl->data, allowed)) {
-		int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
+	/*
+	 * rw_devices will not change at the moment, device add/delete/replace
+	 * are exclusive
+	 */
+	num_devices = fs_info->fs_devices->rw_devices;

-		btrfs_err(fs_info,
-			  "balance: invalid convert data profile %s",
-			  get_raid_name(index));
-		ret = -EINVAL;
-		goto out;
-	}
-	if (validate_convert_profile(&bctl->meta, allowed)) {
-		int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
+	/*
+	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
+	 * special bit for it, to make it easier to distinguish. Thus we need
+	 * to set it manually, or balance would refuse the profile.
+	 */
+	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
+		if (num_devices >= btrfs_raid_array[i].devs_min)
+			allowed |= btrfs_raid_array[i].bg_flag;

-		btrfs_err(fs_info,
-			  "balance: invalid convert metadata profile %s",
-			  get_raid_name(index));
-		ret = -EINVAL;
-		goto out;
-	}
-	if (validate_convert_profile(&bctl->sys, allowed)) {
-		int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
-
-		btrfs_err(fs_info,
-			  "balance: invalid convert system profile %s",
-			  get_raid_name(index));
+	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
+	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
+	    !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
 		ret = -EINVAL;
 		goto out;
 	}

-	/* allow to reduce meta or sys integrity only if force set */
-	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
-		  BTRFS_BLOCK_GROUP_RAID10 |
-		  BTRFS_BLOCK_GROUP_RAID5 |
-		  BTRFS_BLOCK_GROUP_RAID6;
+	/*
+	 * Allow to reduce metadata or system integrity only if force set for
+	 * profiles with redundancy (copies, parity)
+	 */
+	allowed = 0;
+	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
+		if (btrfs_raid_array[i].ncopies >= 2 ||
+		    btrfs_raid_array[i].tolerated_failures >= 1)
+			allowed |= btrfs_raid_array[i].bg_flag;
+	}
 	do {
 		seq = read_seqbegin(&fs_info->profiles_lock);

@@ -3831,9 +4132,9 @@
 		     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 		      (fs_info->avail_metadata_alloc_bits & allowed) &&
 		      !(bctl->meta.target & allowed)))
-			reducing_integrity = true;
+			reducing_redundancy = true;
 		else
-			reducing_integrity = false;
+			reducing_redundancy = false;

 		/* if we're not converting, the target field is uninitialized */
 		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
@@ -3842,13 +4143,13 @@
 			bctl->data.target : fs_info->avail_data_alloc_bits;
 	} while (read_seqretry(&fs_info->profiles_lock, seq));

-	if (reducing_integrity) {
+	if (reducing_redundancy) {
 		if (bctl->flags & BTRFS_BALANCE_FORCE) {
 			btrfs_info(fs_info,
-				   "balance: force reducing metadata integrity");
+				   "balance: force reducing metadata redundancy");
 		} else {
 			btrfs_err(fs_info,
-	  "balance: reduces metadata integrity, use --force if you want this");
+	  "balance: reduces metadata redundancy, use --force if you want this");
 			ret = -EINVAL;
 			goto out;
 		}
@@ -3856,12 +4157,18 @@

 	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
 		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
-		int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
-		int data_index = btrfs_bg_flags_to_raid_index(data_target);
-
 		btrfs_warn(fs_info,
 	"balance: metadata profile %s has lower redundancy than data profile %s",
-			   get_raid_name(meta_index), get_raid_name(data_index));
+			   btrfs_bg_type_to_raid_name(meta_target),
+			   btrfs_bg_type_to_raid_name(data_target));
+	}
+
+	if (fs_info->send_in_progress) {
+		btrfs_warn_rl(fs_info,
+"cannot run balance while send operations are in progress (%d in progress)",
+			      fs_info->send_in_progress);
+		ret = -EAGAIN;
+		goto out;
 	}

 	ret = insert_balance_item(fs_info, bctl);
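
The validation above now derives the allowed convert targets from the raid table instead of hard-coded device-count checks: any profile whose devs_min fits the current rw device count is permitted. A standalone sketch with a trimmed, hypothetical copy of that table (flag values and minimums are illustrative):

#include <stdint.h>
#include <stdio.h>

struct raid_attr { const char *name; int devs_min; uint64_t bg_flag; };

static const struct raid_attr raid_array[] = {
	{ "single",  1, 1ULL << 0 },
	{ "dup",     1, 1ULL << 1 },
	{ "raid0",   2, 1ULL << 2 },
	{ "raid1",   2, 1ULL << 3 },
	{ "raid1c3", 3, 1ULL << 4 },
	{ "raid1c4", 4, 1ULL << 5 },
	{ "raid10",  4, 1ULL << 6 },
};

int main(void)
{
	uint64_t num_devices = 3, allowed = 0;
	size_t n = sizeof(raid_array) / sizeof(raid_array[0]);

	/* build the mask of profiles the device count can support */
	for (size_t i = 0; i < n; i++)
		if (num_devices >= (uint64_t)raid_array[i].devs_min)
			allowed |= raid_array[i].bg_flag;

	for (size_t i = 0; i < n; i++)
		if (allowed & raid_array[i].bg_flag)
			printf("%s allowed with %llu devices\n",
			       raid_array[i].name,
			       (unsigned long long)num_devices);
	return 0;
}
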
@@ -3883,11 +4190,34 @@

 	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
 	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
+	describe_balance_start_or_resume(fs_info);
 	mutex_unlock(&fs_info->balance_mutex);

 	ret = __btrfs_balance(fs_info);

 	mutex_lock(&fs_info->balance_mutex);
+	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
+		btrfs_info(fs_info, "balance: paused");
+	/*
+	 * Balance can be canceled by:
+	 *
+	 * - Regular cancel request
+	 *   Then ret == -ECANCELED and balance_cancel_req > 0
+	 *
+	 * - Fatal signal to "btrfs" process
+	 *   Either the signal caught by wait_reserve_ticket() and callers
+	 *   got -EINTR, or caught by btrfs_should_cancel_balance() and
+	 *   got -ECANCELED.
+	 *   Either way, in this case balance_cancel_req = 0, and
+	 *   ret == -EINTR or ret == -ECANCELED.
+	 *
+	 * So here we only check the return value to catch canceled balance.
+	 */
+	else if (ret == -ECANCELED || ret == -EINTR)
+		btrfs_info(fs_info, "balance: canceled");
+	else
+		btrfs_info(fs_info, "balance: ended with status: %d", ret);
+
 	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);

 	if (bargs) {
@@ -3898,7 +4228,7 @@
 	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
 	    balance_need_close(fs_info)) {
 		reset_balance_state(fs_info);
-		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+		btrfs_exclop_finish(fs_info);
 	}

 	wake_up(&fs_info->balance_wait_q);
@@ -3909,7 +4239,7 @@
 		reset_balance_state(fs_info);
 	else
 		kfree(bctl);
-	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+	btrfs_exclop_finish(fs_info);

 	return ret;
 }
@@ -3919,12 +4249,12 @@
 	struct btrfs_fs_info *fs_info = data;
 	int ret = 0;

+	sb_start_write(fs_info->sb);
 	mutex_lock(&fs_info->balance_mutex);
-	if (fs_info->balance_ctl) {
-		btrfs_info(fs_info, "balance: resuming");
+	if (fs_info->balance_ctl)
 		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
-	}
 	mutex_unlock(&fs_info->balance_mutex);
+	sb_end_write(fs_info->sb);

 	return ret;
 }
@@ -4013,7 +4343,7 @@
 	 * is in a paused state and must have fs_info::balance_ctl properly
 	 * set up.
 	 */
-	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
 		btrfs_warn(fs_info,
 	"balance: cannot set exclusive op status, resume manually");

@@ -4097,7 +4427,7 @@

 	if (fs_info->balance_ctl) {
 		reset_balance_state(fs_info);
-		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+		btrfs_exclop_finish(fs_info);
 		btrfs_info(fs_info, "balance: canceled");
 	}
 }
@@ -4109,7 +4439,7 @@
 	return 0;
 }

-static int btrfs_uuid_scan_kthread(void *data)
+int btrfs_uuid_scan_kthread(void *data)
 {
 	struct btrfs_fs_info *fs_info = data;
 	struct btrfs_root *root = fs_info->tree_root;
@@ -4121,6 +4451,7 @@
 	struct btrfs_root_item root_item;
 	u32 item_size;
 	struct btrfs_trans_handle *trans = NULL;
+	bool closing = false;

 	path = btrfs_alloc_path();
 	if (!path) {
@@ -4133,6 +4464,10 @@
 	key.offset = 0;

 	while (1) {
+		if (btrfs_fs_closing(fs_info)) {
+			closing = true;
+			break;
+		}
 		ret = btrfs_search_forward(root, &key, path,
 					   BTRFS_OLDEST_GENERATION);
 		if (ret) {
@@ -4233,74 +4568,10 @@
 	btrfs_end_transaction(trans);
 	if (ret)
 		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
-	else
+	else if (!closing)
 		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
 	up(&fs_info->uuid_tree_rescan_sem);
 	return 0;
-}
-
-/*
- * Callback for btrfs_uuid_tree_iterate().
- * returns:
- * 0	check succeeded, the entry is not outdated.
- * < 0	if an error occurred.
- * > 0	if the check failed, which means the caller shall remove the entry.
- */
-static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
-				       u8 *uuid, u8 type, u64 subid)
-{
-	struct btrfs_key key;
-	int ret = 0;
-	struct btrfs_root *subvol_root;
-
-	if (type != BTRFS_UUID_KEY_SUBVOL &&
-	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
-		goto out;
-
-	key.objectid = subid;
-	key.type = BTRFS_ROOT_ITEM_KEY;
-	key.offset = (u64)-1;
-	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
-	if (IS_ERR(subvol_root)) {
-		ret = PTR_ERR(subvol_root);
-		if (ret == -ENOENT)
-			ret = 1;
-		goto out;
-	}
-
-	switch (type) {
-	case BTRFS_UUID_KEY_SUBVOL:
-		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
-			ret = 1;
-		break;
-	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
-		if (memcmp(uuid, subvol_root->root_item.received_uuid,
-			   BTRFS_UUID_SIZE))
-			ret = 1;
-		break;
-	}
-
-out:
-	return ret;
-}
-
-static int btrfs_uuid_rescan_kthread(void *data)
-{
-	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
-	int ret;
-
-	/*
-	 * 1st step is to iterate through the existing UUID tree and
-	 * to delete all entries that contain outdated data.
-	 * 2nd step is to add all missing entries to the UUID tree.
-	 */
-	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
-	if (ret < 0) {
-		btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
-		up(&fs_info->uuid_tree_rescan_sem);
-		return ret;
-	}
-	return btrfs_uuid_scan_kthread(data);
 }

 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
@@ -4319,8 +4590,7 @@
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);

-	uuid_root = btrfs_create_tree(trans, fs_info,
-				      BTRFS_UUID_TREE_OBJECTID);
+	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
 	if (IS_ERR(uuid_root)) {
 		ret = PTR_ERR(uuid_root);
 		btrfs_abort_transaction(trans, ret);
@@ -4346,22 +4616,6 @@
 	return 0;
 }

-int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
-{
-	struct task_struct *task;
-
-	down(&fs_info->uuid_tree_rescan_sem);
-	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
-	if (IS_ERR(task)) {
-		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
-		btrfs_warn(fs_info, "failed to start uuid_rescan task");
-		up(&fs_info->uuid_tree_rescan_sem);
-		return PTR_ERR(task);
-	}
-
-	return 0;
-}
-
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
@@ -4380,15 +4634,16 @@
 	int slot;
 	int failed = 0;
 	bool retried = false;
-	bool checked_pending_chunks = false;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	struct btrfs_super_block *super_copy = fs_info->super_copy;
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 old_size = btrfs_device_get_total_bytes(device);
 	u64 diff;
+	u64 start;

 	new_size = round_down(new_size, fs_info->sectorsize);
+	start = new_size;
 	diff = round_down(old_size - new_size, fs_info->sectorsize);

 	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
43944649 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
....@@ -4400,6 +4655,12 @@
44004655
44014656 path->reada = READA_BACK;
44024657
4658
+ trans = btrfs_start_transaction(root, 0);
4659
+ if (IS_ERR(trans)) {
4660
+ btrfs_free_path(path);
4661
+ return PTR_ERR(trans);
4662
+ }
4663
+
44034664 mutex_lock(&fs_info->chunk_mutex);
44044665
44054666 btrfs_device_set_total_bytes(device, new_size);
....@@ -4407,7 +4668,21 @@
44074668 device->fs_devices->total_rw_bytes -= diff;
44084669 atomic64_sub(diff, &fs_info->free_chunk_space);
44094670 }
4410
- mutex_unlock(&fs_info->chunk_mutex);
4671
+
4672
+ /*
4673
+ * Once the device's size has been set to the new size, ensure all
4674
+ * in-memory chunks are synced to disk so that the loop below sees them
4675
+ * and relocates them accordingly.
4676
+ */
4677
+ if (contains_pending_extent(device, &start, diff)) {
4678
+ mutex_unlock(&fs_info->chunk_mutex);
4679
+ ret = btrfs_commit_transaction(trans);
4680
+ if (ret)
4681
+ goto done;
4682
+ } else {
4683
+ mutex_unlock(&fs_info->chunk_mutex);
4684
+ btrfs_end_transaction(trans);
4685
+ }
44114686
44124687 again:
44134688 key.objectid = device->devid;
....@@ -4469,10 +4744,16 @@
44694744
44704745 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
44714746 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4472
- if (ret && ret != -ENOSPC)
4473
- goto done;
4474
- if (ret == -ENOSPC)
4747
+ if (ret == -ENOSPC) {
44754748 failed++;
4749
+ } else if (ret) {
4750
+ if (ret == -ETXTBSY) {
4751
+ btrfs_warn(fs_info,
4752
+ "could not shrink block group %llu due to active swapfile",
4753
+ chunk_offset);
4754
+ }
4755
+ goto done;
4756
+ }
44764757 } while (key.offset-- > 0);
44774758
44784759 if (failed && !retried) {
....@@ -4492,40 +4773,14 @@
44924773 }
44934774
44944775 mutex_lock(&fs_info->chunk_mutex);
4495
-
4496
- /*
4497
- * We checked in the above loop all device extents that were already in
4498
- * the device tree. However before we have updated the device's
4499
- * total_bytes to the new size, we might have had chunk allocations that
4500
- * have not complete yet (new block groups attached to transaction
4501
- * handles), and therefore their device extents were not yet in the
4502
- * device tree and we missed them in the loop above. So if we have any
4503
- * pending chunk using a device extent that overlaps the device range
4504
- * that we can not use anymore, commit the current transaction and
4505
- * repeat the search on the device tree - this way we guarantee we will
4506
- * not have chunks using device extents that end beyond 'new_size'.
4507
- */
4508
- if (!checked_pending_chunks) {
4509
- u64 start = new_size;
4510
- u64 len = old_size - new_size;
4511
-
4512
- if (contains_pending_extent(trans->transaction, device,
4513
- &start, len)) {
4514
- mutex_unlock(&fs_info->chunk_mutex);
4515
- checked_pending_chunks = true;
4516
- failed = 0;
4517
- retried = false;
4518
- ret = btrfs_commit_transaction(trans);
4519
- if (ret)
4520
- goto done;
4521
- goto again;
4522
- }
4523
- }
4776
+ /* Clear all state bits beyond the shrunk device size */
4777
+ clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4778
+ CHUNK_STATE_MASK);
45244779
45254780 btrfs_device_set_disk_total_bytes(device, new_size);
4526
- if (list_empty(&device->resized_list))
4527
- list_add_tail(&device->resized_list,
4528
- &fs_info->fs_devices->resized_devices);
4781
+ if (list_empty(&device->post_commit_list))
4782
+ list_add_tail(&device->post_commit_list,
4783
+ &trans->transaction->dev_update_list);
45294784
45304785 WARN_ON(diff > old_total);
45314786 btrfs_set_super_total_bytes(super_copy,
....@@ -4609,96 +4864,119 @@
46094864 btrfs_set_fs_incompat(info, RAID56);
46104865 }
46114866
4612
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4613
- u64 start, u64 type)
4867
+static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
46144868 {
4615
- struct btrfs_fs_info *info = trans->fs_info;
4616
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
4617
- struct btrfs_device *device;
4618
- struct map_lookup *map = NULL;
4619
- struct extent_map_tree *em_tree;
4620
- struct extent_map *em;
4621
- struct btrfs_device_info *devices_info = NULL;
4622
- u64 total_avail;
4623
- int num_stripes; /* total number of stripes to allocate */
4624
- int data_stripes; /* number of stripes that count for
4625
- block group size */
4626
- int sub_stripes; /* sub_stripes info for map */
4627
- int dev_stripes; /* stripes per dev */
4628
- int devs_max; /* max devs to use */
4629
- int devs_min; /* min devs needed */
4630
- int devs_increment; /* ndevs has to be a multiple of this */
4631
- int ncopies; /* how many copies to data has */
4632
- int ret;
4869
+ if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4870
+ return;
4871
+
4872
+ btrfs_set_fs_incompat(info, RAID1C34);
4873
+}
4874
+
4875
+/*
4876
+ * Structure used internally by btrfs_alloc_chunk() and its helpers.
4877
+ * Wraps the needed parameters.
4878
+ */
4879
+struct alloc_chunk_ctl {
4880
+ u64 start;
4881
+ u64 type;
4882
+ /* Total number of stripes to allocate */
4883
+ int num_stripes;
4884
+ /* sub_stripes info for map */
4885
+ int sub_stripes;
4886
+ /* Stripes per device */
4887
+ int dev_stripes;
4888
+ /* Maximum number of devices to use */
4889
+ int devs_max;
4890
+ /* Minimum number of devices to use */
4891
+ int devs_min;
4892
+ /* ndevs has to be a multiple of this */
4893
+ int devs_increment;
4894
+ /* Number of copies */
4895
+ int ncopies;
4896
+ /* Number of stripes worth of bytes to store parity information */
4897
+ int nparity;
46334898 u64 max_stripe_size;
46344899 u64 max_chunk_size;
4900
+ u64 dev_extent_min;
46354901 u64 stripe_size;
4636
- u64 num_bytes;
4902
+ u64 chunk_size;
46374903 int ndevs;
4638
- int i;
4639
- int j;
4640
- int index;
4904
+};
46414905
4642
- BUG_ON(!alloc_profile_is_valid(type, 0));
4643
-
4644
- if (list_empty(&fs_devices->alloc_list)) {
4645
- if (btrfs_test_opt(info, ENOSPC_DEBUG))
4646
- btrfs_debug(info, "%s: no writable device", __func__);
4647
- return -ENOSPC;
4648
- }
4649
-
4650
- index = btrfs_bg_flags_to_raid_index(type);
4651
-
4652
- sub_stripes = btrfs_raid_array[index].sub_stripes;
4653
- dev_stripes = btrfs_raid_array[index].dev_stripes;
4654
- devs_max = btrfs_raid_array[index].devs_max;
4655
- devs_min = btrfs_raid_array[index].devs_min;
4656
- devs_increment = btrfs_raid_array[index].devs_increment;
4657
- ncopies = btrfs_raid_array[index].ncopies;
4906
+static void init_alloc_chunk_ctl_policy_regular(
4907
+ struct btrfs_fs_devices *fs_devices,
4908
+ struct alloc_chunk_ctl *ctl)
4909
+{
4910
+ u64 type = ctl->type;
46584911
46594912 if (type & BTRFS_BLOCK_GROUP_DATA) {
4660
- max_stripe_size = SZ_1G;
4661
- max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4662
- if (!devs_max)
4663
- devs_max = BTRFS_MAX_DEVS(info);
4913
+ ctl->max_stripe_size = SZ_1G;
4914
+ ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
46644915 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4665
- /* for larger filesystems, use larger metadata chunks */
4916
+ /* For larger filesystems, use larger metadata chunks */
46664917 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4667
- max_stripe_size = SZ_1G;
4918
+ ctl->max_stripe_size = SZ_1G;
46684919 else
4669
- max_stripe_size = SZ_256M;
4670
- max_chunk_size = max_stripe_size;
4671
- if (!devs_max)
4672
- devs_max = BTRFS_MAX_DEVS(info);
4920
+ ctl->max_stripe_size = SZ_256M;
4921
+ ctl->max_chunk_size = ctl->max_stripe_size;
46734922 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4674
- max_stripe_size = SZ_32M;
4675
- max_chunk_size = 2 * max_stripe_size;
4676
- if (!devs_max)
4677
- devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4923
+ ctl->max_stripe_size = SZ_32M;
4924
+ ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4925
+ ctl->devs_max = min_t(int, ctl->devs_max,
4926
+ BTRFS_MAX_DEVS_SYS_CHUNK);
46784927 } else {
4679
- btrfs_err(info, "invalid chunk type 0x%llx requested",
4680
- type);
4681
- BUG_ON(1);
4928
+ BUG();
46824929 }
46834930
4684
- /* we don't want a chunk larger than 10% of writeable space */
4685
- max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4686
- max_chunk_size);
4931
+ /* We don't want a chunk larger than 10% of writable space */
4932
+ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4933
+ ctl->max_chunk_size);
4934
+ ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
4935
+}
46874936
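For context on the cap applied above: btrfs's div_factor(num, factor) computes num * factor / 10, so div_factor(fs_devices->total_rw_bytes, 1) is 10% of the writable space. A small userspace sketch of the clamp, assuming 100 GiB of writable space and the 10 GiB BTRFS_MAX_DATA_CHUNK_SIZE ceiling:

/* Illustrative restatement of the 10% cap; not kernel code. */
#include <stdio.h>
#include <stdint.h>

static uint64_t div_factor(uint64_t num, int factor)
{
	if (factor == 10)
		return num;
	return num * factor / 10;
}

int main(void)
{
	uint64_t total_rw_bytes = 100ULL << 30;		/* 100 GiB writable */
	uint64_t max_chunk_size = 10ULL << 30;		/* data chunk ceiling */
	uint64_t cap = div_factor(total_rw_bytes, 1);	/* 10% of the space */

	if (cap < max_chunk_size)
		max_chunk_size = cap;
	printf("chunk capped at %llu bytes\n", (unsigned long long)max_chunk_size);
	return 0;
}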
4688
- devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
4689
- GFP_NOFS);
4690
- if (!devices_info)
4691
- return -ENOMEM;
4937
+static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
4938
+ struct alloc_chunk_ctl *ctl)
4939
+{
4940
+ int index = btrfs_bg_flags_to_raid_index(ctl->type);
4941
+
4942
+ ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
4943
+ ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
4944
+ ctl->devs_max = btrfs_raid_array[index].devs_max;
4945
+ if (!ctl->devs_max)
4946
+ ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
4947
+ ctl->devs_min = btrfs_raid_array[index].devs_min;
4948
+ ctl->devs_increment = btrfs_raid_array[index].devs_increment;
4949
+ ctl->ncopies = btrfs_raid_array[index].ncopies;
4950
+ ctl->nparity = btrfs_raid_array[index].nparity;
4951
+ ctl->ndevs = 0;
4952
+
4953
+ switch (fs_devices->chunk_alloc_policy) {
4954
+ case BTRFS_CHUNK_ALLOC_REGULAR:
4955
+ init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
4956
+ break;
4957
+ default:
4958
+ BUG();
4959
+ }
4960
+}
4961
+
4962
+static int gather_device_info(struct btrfs_fs_devices *fs_devices,
4963
+ struct alloc_chunk_ctl *ctl,
4964
+ struct btrfs_device_info *devices_info)
4965
+{
4966
+ struct btrfs_fs_info *info = fs_devices->fs_info;
4967
+ struct btrfs_device *device;
4968
+ u64 total_avail;
4969
+ u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
4970
+ int ret;
4971
+ int ndevs = 0;
4972
+ u64 max_avail;
4973
+ u64 dev_offset;
46924974
46934975 /*
46944976 * in the first pass through the devices list, we gather information
46954977 * about the available holes on each device.
46964978 */
4697
- ndevs = 0;
46984979 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4699
- u64 max_avail;
4700
- u64 dev_offset;
4701
-
47024980 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
47034981 WARN(1, KERN_ERR
47044982 "BTRFS: read-only device in alloc_list\n");
....@@ -4716,24 +4994,23 @@
47164994 total_avail = 0;
47174995
47184996 /* If there is no space on this device, skip it. */
4719
- if (total_avail == 0)
4997
+ if (total_avail < ctl->dev_extent_min)
47204998 continue;
47214999
4722
- ret = find_free_dev_extent(trans, device,
4723
- max_stripe_size * dev_stripes,
4724
- &dev_offset, &max_avail);
5000
+ ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5001
+ &max_avail);
47255002 if (ret && ret != -ENOSPC)
4726
- goto error;
5003
+ return ret;
47275004
47285005 if (ret == 0)
4729
- max_avail = max_stripe_size * dev_stripes;
5006
+ max_avail = dev_extent_want;
47305007
4731
- if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
5008
+ if (max_avail < ctl->dev_extent_min) {
47325009 if (btrfs_test_opt(info, ENOSPC_DEBUG))
47335010 btrfs_debug(info,
4734
- "%s: devid %llu has no free space, have=%llu want=%u",
5011
+ "%s: devid %llu has no free space, have=%llu want=%llu",
47355012 __func__, device->devid, max_avail,
4736
- BTRFS_STRIPE_LEN * dev_stripes);
5013
+ ctl->dev_extent_min);
47375014 continue;
47385015 }
47395016
....@@ -4748,6 +5025,7 @@
47485025 devices_info[ndevs].dev = device;
47495026 ++ndevs;
47505027 }
5028
+ ctl->ndevs = ndevs;
47515029
47525030 /*
47535031 * now sort the devices by hole size / available space
....@@ -4755,20 +5033,14 @@
47555033 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
47565034 btrfs_cmp_device_info, NULL);
47575035
4758
- /* round down to number of usable stripes */
4759
- ndevs = round_down(ndevs, devs_increment);
5036
+ return 0;
5037
+}
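The sort just above orders the candidate devices by hole size, largest first, so devices_info[ndevs - 1] is the chosen device with the smallest usable hole; that hole bounds the stripe size picked later. A userspace model of the ordering, with qsort() standing in for the kernel's sort() and made-up device data:

/* Sketch of the "sort by hole size" step; data and types are invented. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct dev_info { int devid; uint64_t max_avail; };

static int cmp_dev_info(const void *a, const void *b)
{
	const struct dev_info *da = a, *db = b;

	if (da->max_avail < db->max_avail)
		return 1;	/* biggest hole first */
	if (da->max_avail > db->max_avail)
		return -1;
	return 0;
}

int main(void)
{
	struct dev_info devs[] = { { 1, 10 }, { 2, 40 }, { 3, 25 } };

	qsort(devs, 3, sizeof(devs[0]), cmp_dev_info);
	/* The last chosen entry has the smallest hole and therefore limits
	 * the stripe size shared by all devices. */
	printf("limiting device: devid %d\n", devs[2].devid);
	return 0;
}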
47605038
4761
- if (ndevs < devs_min) {
4762
- ret = -ENOSPC;
4763
- if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
4764
- btrfs_debug(info,
4765
- "%s: not enough devices with free space: have=%d minimum required=%d",
4766
- __func__, ndevs, devs_min);
4767
- }
4768
- goto error;
4769
- }
4770
-
4771
- ndevs = min(ndevs, devs_max);
5039
+static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5040
+ struct btrfs_device_info *devices_info)
5041
+{
5042
+ /* Number of stripes that count for block group size */
5043
+ int data_stripes;
47725044
47735045 /*
47745046 * The primary goal is to maximize the number of stripes, so use as
....@@ -4777,109 +5049,148 @@
47775049 * The DUP profile stores more than one stripe per device, the
47785050 * max_avail is the total size so we have to adjust.
47795051 */
4780
- stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
4781
- num_stripes = ndevs * dev_stripes;
5052
+ ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5053
+ ctl->dev_stripes);
5054
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5055
+
5056
+ /* This will have to be fixed for RAID1 and RAID10 over more drives */
5057
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
47825058
47835059 /*
4784
- * this will have to be fixed for RAID1 and RAID10 over
4785
- * more drives
5060
+ * Use the number of data stripes to figure out how big this chunk is
5061
+ * really going to be in terms of logical address space, and compare
5062
+ * that answer with the max chunk size. If it's higher, we try to
5063
+ * reduce stripe_size.
47865064 */
4787
- data_stripes = num_stripes / ncopies;
4788
-
4789
- if (type & BTRFS_BLOCK_GROUP_RAID5)
4790
- data_stripes = num_stripes - 1;
4791
-
4792
- if (type & BTRFS_BLOCK_GROUP_RAID6)
4793
- data_stripes = num_stripes - 2;
4794
-
4795
- /*
4796
- * Use the number of data stripes to figure out how big this chunk
4797
- * is really going to be in terms of logical address space,
4798
- * and compare that answer with the max chunk size. If it's higher,
4799
- * we try to reduce stripe_size.
4800
- */
4801
- if (stripe_size * data_stripes > max_chunk_size) {
5065
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
48025066 /*
48035067 * Reduce stripe_size, round it up to a 16MB boundary again and
48045068 * then use it, unless it ends up being even bigger than the
48055069 * previous value we had already.
48065070 */
4807
- stripe_size = min(round_up(div_u64(max_chunk_size,
4808
- data_stripes), SZ_16M),
4809
- stripe_size);
5071
+ ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5072
+ data_stripes), SZ_16M),
5073
+ ctl->stripe_size);
48105074 }
48115075
4812
- /* align to BTRFS_STRIPE_LEN */
4813
- stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
5076
+ /* Align to BTRFS_STRIPE_LEN */
5077
+ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5078
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
48145079
4815
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4816
- if (!map) {
4817
- ret = -ENOMEM;
4818
- goto error;
5080
+ return 0;
5081
+}
5082
+
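Worked numbers for the data_stripes formula above, with nparity and ncopies as in the raid table and hypothetical device counts; note how the single expression (num_stripes - nparity) / ncopies covers what used to be separate RAID5/RAID6 special cases:

/* Worked example only; the real parameters come from btrfs_raid_array. */
#include <stdio.h>

struct profile { const char *name; int num_stripes, nparity, ncopies; };

int main(void)
{
	struct profile p[] = {
		{ "raid0 x4", 4, 0, 1 },	/* 4 data stripes */
		{ "raid1 x2", 2, 0, 2 },	/* 1 data stripe  */
		{ "raid5 x5", 5, 1, 1 },	/* 4 data stripes */
		{ "raid6 x6", 6, 2, 1 },	/* 4 data stripes */
	};

	for (int i = 0; i < 4; i++) {
		int data_stripes = (p[i].num_stripes - p[i].nparity) /
				   p[i].ncopies;
		printf("%-8s -> %d data stripes\n", p[i].name, data_stripes);
	}
	return 0;
}

The chunk's logical size is then stripe_size * data_stripes, which is what gets compared against max_chunk_size.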
5083
+static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5084
+ struct alloc_chunk_ctl *ctl,
5085
+ struct btrfs_device_info *devices_info)
5086
+{
5087
+ struct btrfs_fs_info *info = fs_devices->fs_info;
5088
+
5089
+ /*
5090
+ * Round down to the number of usable stripes. devs_increment can be any
5091
+ * number, so we can't use round_down(), which requires a power of 2;
5092
+ * rounddown() is safe for any value.
5093
+ */
5094
+ ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5095
+
5096
+ if (ctl->ndevs < ctl->devs_min) {
5097
+ if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5098
+ btrfs_debug(info,
5099
+ "%s: not enough devices with free space: have=%d minimum required=%d",
5100
+ __func__, ctl->ndevs, ctl->devs_min);
5101
+ }
5102
+ return -ENOSPC;
48195103 }
4820
- map->num_stripes = num_stripes;
48215104
4822
- for (i = 0; i < ndevs; ++i) {
4823
- for (j = 0; j < dev_stripes; ++j) {
4824
- int s = i * dev_stripes + j;
5105
+ ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5106
+
5107
+ switch (fs_devices->chunk_alloc_policy) {
5108
+ case BTRFS_CHUNK_ALLOC_REGULAR:
5109
+ return decide_stripe_size_regular(ctl, devices_info);
5110
+ default:
5111
+ BUG();
5112
+ }
5113
+}
5114
+
5115
+static int create_chunk(struct btrfs_trans_handle *trans,
5116
+ struct alloc_chunk_ctl *ctl,
5117
+ struct btrfs_device_info *devices_info)
5118
+{
5119
+ struct btrfs_fs_info *info = trans->fs_info;
5120
+ struct map_lookup *map = NULL;
5121
+ struct extent_map_tree *em_tree;
5122
+ struct extent_map *em;
5123
+ u64 start = ctl->start;
5124
+ u64 type = ctl->type;
5125
+ int ret;
5126
+ int i;
5127
+ int j;
5128
+
5129
+ map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5130
+ if (!map)
5131
+ return -ENOMEM;
5132
+ map->num_stripes = ctl->num_stripes;
5133
+
5134
+ for (i = 0; i < ctl->ndevs; ++i) {
5135
+ for (j = 0; j < ctl->dev_stripes; ++j) {
5136
+ int s = i * ctl->dev_stripes + j;
48255137 map->stripes[s].dev = devices_info[i].dev;
48265138 map->stripes[s].physical = devices_info[i].dev_offset +
4827
- j * stripe_size;
5139
+ j * ctl->stripe_size;
48285140 }
48295141 }
48305142 map->stripe_len = BTRFS_STRIPE_LEN;
48315143 map->io_align = BTRFS_STRIPE_LEN;
48325144 map->io_width = BTRFS_STRIPE_LEN;
48335145 map->type = type;
4834
- map->sub_stripes = sub_stripes;
5146
+ map->sub_stripes = ctl->sub_stripes;
48355147
4836
- num_bytes = stripe_size * data_stripes;
4837
-
4838
- trace_btrfs_chunk_alloc(info, map, start, num_bytes);
5148
+ trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
48395149
48405150 em = alloc_extent_map();
48415151 if (!em) {
48425152 kfree(map);
4843
- ret = -ENOMEM;
4844
- goto error;
5153
+ return -ENOMEM;
48455154 }
48465155 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
48475156 em->map_lookup = map;
48485157 em->start = start;
4849
- em->len = num_bytes;
5158
+ em->len = ctl->chunk_size;
48505159 em->block_start = 0;
48515160 em->block_len = em->len;
4852
- em->orig_block_len = stripe_size;
5161
+ em->orig_block_len = ctl->stripe_size;
48535162
4854
- em_tree = &info->mapping_tree.map_tree;
5163
+ em_tree = &info->mapping_tree;
48555164 write_lock(&em_tree->lock);
48565165 ret = add_extent_mapping(em_tree, em, 0);
48575166 if (ret) {
48585167 write_unlock(&em_tree->lock);
48595168 free_extent_map(em);
4860
- goto error;
5169
+ return ret;
48615170 }
4862
-
4863
- list_add_tail(&em->list, &trans->transaction->pending_chunks);
4864
- refcount_inc(&em->refs);
48655171 write_unlock(&em_tree->lock);
48665172
4867
- ret = btrfs_make_block_group(trans, 0, type, start, num_bytes);
5173
+ ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
48685174 if (ret)
48695175 goto error_del_extent;
48705176
48715177 for (i = 0; i < map->num_stripes; i++) {
4872
- num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4873
- btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4874
- map->stripes[i].dev->has_pending_chunks = true;
5178
+ struct btrfs_device *dev = map->stripes[i].dev;
5179
+
5180
+ btrfs_device_set_bytes_used(dev,
5181
+ dev->bytes_used + ctl->stripe_size);
5182
+ if (list_empty(&dev->post_commit_list))
5183
+ list_add_tail(&dev->post_commit_list,
5184
+ &trans->transaction->dev_update_list);
48755185 }
48765186
4877
- atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
5187
+ atomic64_sub(ctl->stripe_size * map->num_stripes,
5188
+ &info->free_chunk_space);
48785189
48795190 free_extent_map(em);
48805191 check_raid56_incompat_flag(info, type);
5192
+ check_raid1c34_incompat_flag(info, type);
48815193
4882
- kfree(devices_info);
48835194 return 0;
48845195
48855196 error_del_extent:
....@@ -4891,13 +5202,68 @@
48915202 free_extent_map(em);
48925203 /* One for the tree reference */
48935204 free_extent_map(em);
4894
- /* One for the pending_chunks list reference */
4895
- free_extent_map(em);
4896
-error:
5205
+
5206
+ return ret;
5207
+}
5208
+
5209
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5210
+{
5211
+ struct btrfs_fs_info *info = trans->fs_info;
5212
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
5213
+ struct btrfs_device_info *devices_info = NULL;
5214
+ struct alloc_chunk_ctl ctl;
5215
+ int ret;
5216
+
5217
+ lockdep_assert_held(&info->chunk_mutex);
5218
+
5219
+ if (!alloc_profile_is_valid(type, 0)) {
5220
+ ASSERT(0);
5221
+ return -EINVAL;
5222
+ }
5223
+
5224
+ if (list_empty(&fs_devices->alloc_list)) {
5225
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
5226
+ btrfs_debug(info, "%s: no writable device", __func__);
5227
+ return -ENOSPC;
5228
+ }
5229
+
5230
+ if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5231
+ btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5232
+ ASSERT(0);
5233
+ return -EINVAL;
5234
+ }
5235
+
5236
+ ctl.start = find_next_chunk(info);
5237
+ ctl.type = type;
5238
+ init_alloc_chunk_ctl(fs_devices, &ctl);
5239
+
5240
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5241
+ GFP_NOFS);
5242
+ if (!devices_info)
5243
+ return -ENOMEM;
5244
+
5245
+ ret = gather_device_info(fs_devices, &ctl, devices_info);
5246
+ if (ret < 0)
5247
+ goto out;
5248
+
5249
+ ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5250
+ if (ret < 0)
5251
+ goto out;
5252
+
5253
+ ret = create_chunk(trans, &ctl, devices_info);
5254
+
5255
+out:
48975256 kfree(devices_info);
48985257 return ret;
48995258 }
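btrfs_alloc_chunk() above is a fixed pipeline: initialize the control structure, gather per-device holes, decide the stripe geometry, then create the chunk, with the scratch devices_info array freed at out: on success and failure alike. A compressed sketch of that staging, with stub bodies standing in for the real helpers:

/* Stage stubs only; they mimic the flow, not the real work. */
#include <stdio.h>
#include <stdlib.h>

struct ctl { int ndevs; long long chunk_size; };

static int gather(struct ctl *c) { c->ndevs = 4; return 0; }
static int decide(struct ctl *c)
{
	if (c->ndevs < 1)
		return -1;	/* plays the -ENOSPC role */
	c->chunk_size = c->ndevs * (1LL << 30);
	return 0;
}
static int create(struct ctl *c) { (void)c; return 0; }

int main(void)
{
	struct ctl c = { 0 };
	int *scratch = calloc(16, sizeof(*scratch));	/* devices_info role */
	int ret;

	if (!scratch)
		return 1;
	ret = gather(&c);
	if (ret == 0)
		ret = decide(&c);
	if (ret == 0)
		ret = create(&c);
	free(scratch);	/* single exit point, like the out: label */
	printf("ret=%d chunk=%lld\n", ret, c.chunk_size);
	return 0;
}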
49005259
5260
+/*
5261
+ * Chunk allocation falls into two parts. The first part does work
5262
+ * that makes the newly allocated chunk usable, but does not do any operation
5263
+ * that modifies the chunk tree. The second part does the work that
5264
+ * requires modifying the chunk tree. This division is important for the
5265
+ * bootstrap process of adding storage to a seed btrfs.
5266
+ */
49015267 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
49025268 u64 chunk_offset, u64 chunk_size)
49035269 {
....@@ -4916,7 +5282,7 @@
49165282 int i = 0;
49175283 int ret = 0;
49185284
4919
- em = get_chunk_map(fs_info, chunk_offset, chunk_size);
5285
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
49205286 if (IS_ERR(em))
49215287 return PTR_ERR(em);
49225288
....@@ -4996,57 +5362,27 @@
49965362 return ret;
49975363 }
49985364
4999
-/*
5000
- * Chunk allocation falls into two parts. The first part does works
5001
- * that make the new allocated chunk useable, but not do any operation
5002
- * that modifies the chunk tree. The second part does the works that
5003
- * require modifying the chunk tree. This division is important for the
5004
- * bootstrap process of adding storage to a seed btrfs.
5005
- */
5006
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5365
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
50075366 {
5008
- u64 chunk_offset;
5009
-
5010
- lockdep_assert_held(&trans->fs_info->chunk_mutex);
5011
- chunk_offset = find_next_chunk(trans->fs_info);
5012
- return __btrfs_alloc_chunk(trans, chunk_offset, type);
5013
-}
5014
-
5015
-static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
5016
- struct btrfs_fs_info *fs_info)
5017
-{
5018
- u64 chunk_offset;
5019
- u64 sys_chunk_offset;
5367
+ struct btrfs_fs_info *fs_info = trans->fs_info;
50205368 u64 alloc_profile;
50215369 int ret;
50225370
5023
- chunk_offset = find_next_chunk(fs_info);
50245371 alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5025
- ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
5372
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
50265373 if (ret)
50275374 return ret;
50285375
5029
- sys_chunk_offset = find_next_chunk(fs_info);
50305376 alloc_profile = btrfs_system_alloc_profile(fs_info);
5031
- ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
5377
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
50325378 return ret;
50335379 }
50345380
50355381 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
50365382 {
5037
- int max_errors;
5383
+ const int index = btrfs_bg_flags_to_raid_index(map->type);
50385384
5039
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5040
- BTRFS_BLOCK_GROUP_RAID10 |
5041
- BTRFS_BLOCK_GROUP_RAID5)) {
5042
- max_errors = 1;
5043
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5044
- max_errors = 2;
5045
- } else {
5046
- max_errors = 0;
5047
- }
5048
-
5049
- return max_errors;
5385
+ return btrfs_raid_array[index].tolerated_failures;
50505386 }
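The one-line body above replaces hardcoded per-profile branches with a table lookup, so newly added profiles (raid1c3 and raid1c4 tolerate two and three lost devices) get correct answers without touching this function. A reduced model of the lookup; the enum and values restate the profiles for illustration only:

/* Table-lookup pattern in miniature; the real table is btrfs_raid_array. */
#include <stdio.h>

enum raid_idx { R10, R1, R1C3, R1C4, R5, R6, NR_RAID };

static const int tolerated_failures[NR_RAID] = {
	[R10] = 1, [R1] = 1, [R1C3] = 2, [R1C4] = 3, [R5] = 1, [R6] = 2,
};

int main(void)
{
	/* One lookup instead of a chain of conditionals. */
	printf("raid1c4 tolerates %d failures\n", tolerated_failures[R1C4]);
	return 0;
}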
50515387
50525388 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
....@@ -5057,7 +5393,7 @@
50575393 int miss_ndevs = 0;
50585394 int i;
50595395
5060
- em = get_chunk_map(fs_info, chunk_offset, 1);
5396
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
50615397 if (IS_ERR(em))
50625398 return 1;
50635399
....@@ -5087,21 +5423,16 @@
50875423 return readonly;
50885424 }
50895425
5090
-void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
5091
-{
5092
- extent_map_tree_init(&tree->map_tree);
5093
-}
5094
-
5095
-void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
5426
+void btrfs_mapping_tree_free(struct extent_map_tree *tree)
50965427 {
50975428 struct extent_map *em;
50985429
50995430 while (1) {
5100
- write_lock(&tree->map_tree.lock);
5101
- em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
5431
+ write_lock(&tree->lock);
5432
+ em = lookup_extent_mapping(tree, 0, (u64)-1);
51025433 if (em)
5103
- remove_extent_mapping(&tree->map_tree, em);
5104
- write_unlock(&tree->map_tree.lock);
5434
+ remove_extent_mapping(tree, em);
5435
+ write_unlock(&tree->lock);
51055436 if (!em)
51065437 break;
51075438 /* once for us */
....@@ -5117,7 +5448,7 @@
51175448 struct map_lookup *map;
51185449 int ret;
51195450
5120
- em = get_chunk_map(fs_info, logical, len);
5451
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51215452 if (IS_ERR(em))
51225453 /*
51235454 * We could return errors for these cases, but that could get
....@@ -5128,7 +5459,7 @@
51285459 return 1;
51295460
51305461 map = em->map_lookup;
5131
- if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
5462
+ if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
51325463 ret = map->num_stripes;
51335464 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
51345465 ret = map->sub_stripes;
....@@ -5147,11 +5478,11 @@
51475478 ret = 1;
51485479 free_extent_map(em);
51495480
5150
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
5481
+ down_read(&fs_info->dev_replace.rwsem);
51515482 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
51525483 fs_info->dev_replace.tgtdev)
51535484 ret++;
5154
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
5485
+ up_read(&fs_info->dev_replace.rwsem);
51555486
51565487 return ret;
51575488 }
....@@ -5163,7 +5494,7 @@
51635494 struct map_lookup *map;
51645495 unsigned long len = fs_info->sectorsize;
51655496
5166
- em = get_chunk_map(fs_info, logical, len);
5497
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51675498
51685499 if (!WARN_ON(IS_ERR(em))) {
51695500 map = em->map_lookup;
....@@ -5180,7 +5511,7 @@
51805511 struct map_lookup *map;
51815512 int ret = 0;
51825513
5183
- em = get_chunk_map(fs_info, logical, len);
5514
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51845515
51855516 if(!WARN_ON(IS_ERR(em))) {
51865517 map = em->map_lookup;
....@@ -5202,7 +5533,7 @@
52025533 struct btrfs_device *srcdev;
52035534
52045535 ASSERT((map->type &
5205
- (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
5536
+ (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
52065537
52075538 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
52085539 num_stripes = map->sub_stripes;
....@@ -5240,31 +5571,19 @@
52405571 return preferred_mirror;
52415572 }
52425573
5243
-static inline int parity_smaller(u64 a, u64 b)
5244
-{
5245
- return a > b;
5246
-}
5247
-
52485574 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
52495575 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
52505576 {
5251
- struct btrfs_bio_stripe s;
52525577 int i;
5253
- u64 l;
52545578 int again = 1;
52555579
52565580 while (again) {
52575581 again = 0;
52585582 for (i = 0; i < num_stripes - 1; i++) {
5259
- if (parity_smaller(bbio->raid_map[i],
5260
- bbio->raid_map[i+1])) {
5261
- s = bbio->stripes[i];
5262
- l = bbio->raid_map[i];
5263
- bbio->stripes[i] = bbio->stripes[i+1];
5264
- bbio->raid_map[i] = bbio->raid_map[i+1];
5265
- bbio->stripes[i+1] = s;
5266
- bbio->raid_map[i+1] = l;
5267
-
5583
+ /* Swap if parity is on a smaller index */
5584
+ if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5585
+ swap(bbio->stripes[i], bbio->stripes[i + 1]);
5586
+ swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
52685587 again = 1;
52695588 }
52705589 }
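Why ascending order does the job: the raid_map entries for parity use the sentinels RAID5_P_STRIPE and RAID6_Q_STRIPE, defined in the kernel as the two largest u64 values, so sorting by raid_map pushes them past every real logical address. A toy run of the same bubble pass:

/* Toy model; the sentinel definitions mirror the kernel's. */
#include <stdio.h>
#include <stdint.h>

#define RAID5_P_STRIPE ((uint64_t)-2)
#define RAID6_Q_STRIPE ((uint64_t)-1)

int main(void)
{
	uint64_t raid_map[4] = { RAID6_Q_STRIPE, 65536, RAID5_P_STRIPE, 0 };
	int n = 4, again = 1;

	while (again) {
		again = 0;
		for (int i = 0; i < n - 1; i++) {
			if (raid_map[i] > raid_map[i + 1]) {
				uint64_t tmp = raid_map[i];

				raid_map[i] = raid_map[i + 1];
				raid_map[i + 1] = tmp;
				again = 1;
			}
		}
	}
	for (int i = 0; i < n; i++)	/* prints 0, 65536, then the sentinels */
		printf("%llu\n", (unsigned long long)raid_map[i]);
	return 0;
}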
....@@ -5290,6 +5609,9 @@
52905609 atomic_set(&bbio->error, 0);
52915610 refcount_set(&bbio->refs, 1);
52925611
5612
+ bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5613
+ bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5614
+
52935615 return bbio;
52945616 }
52955617
....@@ -5313,12 +5635,13 @@
53135635 * replace.
53145636 */
53155637 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5316
- u64 logical, u64 length,
5638
+ u64 logical, u64 *length_ret,
53175639 struct btrfs_bio **bbio_ret)
53185640 {
53195641 struct extent_map *em;
53205642 struct map_lookup *map;
53215643 struct btrfs_bio *bbio;
5644
+ u64 length = *length_ret;
53225645 u64 offset;
53235646 u64 stripe_nr;
53245647 u64 stripe_nr_end;
....@@ -5339,7 +5662,7 @@
53395662 /* discard always return a bbio */
53405663 ASSERT(bbio_ret);
53415664
5342
- em = get_chunk_map(fs_info, logical, length);
5665
+ em = btrfs_get_chunk_map(fs_info, logical, length);
53435666 if (IS_ERR(em))
53445667 return PTR_ERR(em);
53455668
....@@ -5351,7 +5674,8 @@
53515674 }
53525675
53535676 offset = logical - em->start;
5354
- length = min_t(u64, em->len - offset, length);
5677
+ length = min_t(u64, em->start + em->len - logical, length);
5678
+ *length_ret = length;
53555679
53565680 stripe_len = map->stripe_len;
53575681 /*
....@@ -5391,7 +5715,7 @@
53915715 &remaining_stripes);
53925716 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
53935717 last_stripe *= sub_stripes;
5394
- } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5718
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
53955719 BTRFS_BLOCK_GROUP_DUP)) {
53965720 num_stripes = map->num_stripes;
53975721 } else {
....@@ -5635,6 +5959,106 @@
56355959 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
56365960 }
56375961
5962
+/*
5963
+ * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
5964
+ * tuple. This information is used to calculate how big a
5965
+ * particular bio can get before it straddles a stripe.
5966
+ *
5967
+ * @fs_info - the filesystem
5968
+ * @logical - address that we want to figure out the geometry of
5969
+ * @len - the length of IO we are going to perform, starting at @logical
5970
+ * @op - type of operation - write or read
5971
+ * @io_geom - pointer used to return values
5972
+ *
5973
+ * Returns < 0 if a chunk for the given logical address cannot be found
5974
+ * (which usually shouldn't happen unless @logical is corrupted), 0 otherwise.
5975
+ */
5976
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5977
+ u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
5978
+{
5979
+ struct extent_map *em;
5980
+ struct map_lookup *map;
5981
+ u64 offset;
5982
+ u64 stripe_offset;
5983
+ u64 stripe_nr;
5984
+ u64 stripe_len;
5985
+ u64 raid56_full_stripe_start = (u64)-1;
5986
+ int data_stripes;
5987
+ int ret = 0;
5988
+
5989
+ ASSERT(op != BTRFS_MAP_DISCARD);
5990
+
5991
+ em = btrfs_get_chunk_map(fs_info, logical, len);
5992
+ if (IS_ERR(em))
5993
+ return PTR_ERR(em);
5994
+
5995
+ map = em->map_lookup;
5996
+ /* Offset of this logical address in the chunk */
5997
+ offset = logical - em->start;
5998
+ /* Len of a stripe in a chunk */
5999
+ stripe_len = map->stripe_len;
6000
+ /* Stripe where this block falls in */
6001
+ stripe_nr = div64_u64(offset, stripe_len);
6002
+ /* Offset of stripe in the chunk */
6003
+ stripe_offset = stripe_nr * stripe_len;
6004
+ if (offset < stripe_offset) {
6005
+ btrfs_crit(fs_info,
6006
+"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
6007
+ stripe_offset, offset, em->start, logical, stripe_len);
6008
+ ret = -EINVAL;
6009
+ goto out;
6010
+ }
6011
+
6012
+ /* stripe_offset is the offset of this block in its stripe */
6013
+ stripe_offset = offset - stripe_offset;
6014
+ data_stripes = nr_data_stripes(map);
6015
+
6016
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6017
+ u64 max_len = stripe_len - stripe_offset;
6018
+
6019
+ /*
6020
+ * In case of raid56, we need to know the stripe-aligned start
6021
+ */
6022
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6023
+ unsigned long full_stripe_len = stripe_len * data_stripes;
6024
+ raid56_full_stripe_start = offset;
6025
+
6026
+ /*
6027
+ * Allow a write of a full stripe, but make sure we
6028
+ * don't allow straddling of stripes
6029
+ */
6030
+ raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6031
+ full_stripe_len);
6032
+ raid56_full_stripe_start *= full_stripe_len;
6033
+
6034
+ /*
6035
+ * For writes to RAID[56], allow a full stripeset across
6036
+ * all disks. For other RAID types and for RAID[56]
6037
+ * reads, just allow a single stripe (on a single disk).
6038
+ */
6039
+ if (op == BTRFS_MAP_WRITE) {
6040
+ max_len = stripe_len * data_stripes -
6041
+ (offset - raid56_full_stripe_start);
6042
+ }
6043
+ }
6044
+ len = min_t(u64, em->len - offset, max_len);
6045
+ } else {
6046
+ len = em->len - offset;
6047
+ }
6048
+
6049
+ io_geom->len = len;
6050
+ io_geom->offset = offset;
6051
+ io_geom->stripe_len = stripe_len;
6052
+ io_geom->stripe_nr = stripe_nr;
6053
+ io_geom->stripe_offset = stripe_offset;
6054
+ io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6055
+
6056
+out:
6057
+ /* once for us */
6058
+ free_extent_map(em);
6059
+ return ret;
6060
+}
6061
+
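Concrete numbers for the math above: with the 64 KiB BTRFS_STRIPE_LEN and an IO starting 200 KiB into a chunk, the IO lands 8 KiB into stripe number 3 and can cover at most 56 KiB before straddling into the next stripe. A standalone check of that arithmetic (chunk start chosen arbitrarily):

/* Worked example of the geometry computation; all offsets hypothetical. */
#include <stdio.h>
#include <stdint.h>

#define BTRFS_STRIPE_LEN (64 * 1024ULL)

int main(void)
{
	uint64_t chunk_start = 1048576;			/* em->start */
	uint64_t logical = chunk_start + 204800;	/* 200 KiB in */

	uint64_t offset = logical - chunk_start;
	uint64_t stripe_nr = offset / BTRFS_STRIPE_LEN;			/* 3 */
	uint64_t stripe_offset = offset - stripe_nr * BTRFS_STRIPE_LEN;	/* 8 KiB */
	uint64_t max_len = BTRFS_STRIPE_LEN - stripe_offset;		/* 56 KiB */

	printf("stripe_nr=%llu stripe_offset=%llu max_len=%llu\n",
	       (unsigned long long)stripe_nr,
	       (unsigned long long)stripe_offset,
	       (unsigned long long)max_len);
	return 0;
}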
56386062 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
56396063 enum btrfs_map_op op,
56406064 u64 logical, u64 *length,
....@@ -5643,11 +6067,11 @@
56436067 {
56446068 struct extent_map *em;
56456069 struct map_lookup *map;
5646
- u64 offset;
56476070 u64 stripe_offset;
56486071 u64 stripe_nr;
56496072 u64 stripe_len;
56506073 u32 stripe_index;
6074
+ int data_stripes;
56516075 int i;
56526076 int ret = 0;
56536077 int num_stripes;
....@@ -5660,81 +6084,34 @@
56606084 int patch_the_first_stripe_for_dev_replace = 0;
56616085 u64 physical_to_patch_in_first_stripe = 0;
56626086 u64 raid56_full_stripe_start = (u64)-1;
6087
+ struct btrfs_io_geometry geom;
56636088
5664
- if (op == BTRFS_MAP_DISCARD)
5665
- return __btrfs_map_block_for_discard(fs_info, logical,
5666
- *length, bbio_ret);
6089
+ ASSERT(bbio_ret);
6090
+ ASSERT(op != BTRFS_MAP_DISCARD);
56676091
5668
- em = get_chunk_map(fs_info, logical, *length);
5669
- if (IS_ERR(em))
5670
- return PTR_ERR(em);
6092
+ ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
6093
+ if (ret < 0)
6094
+ return ret;
56716095
6096
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
6097
+ ASSERT(!IS_ERR(em));
56726098 map = em->map_lookup;
5673
- offset = logical - em->start;
56746099
5675
- stripe_len = map->stripe_len;
5676
- stripe_nr = offset;
5677
- /*
5678
- * stripe_nr counts the total number of stripes we have to stride
5679
- * to get to this block
5680
- */
5681
- stripe_nr = div64_u64(stripe_nr, stripe_len);
6100
+ *length = geom.len;
6101
+ stripe_len = geom.stripe_len;
6102
+ stripe_nr = geom.stripe_nr;
6103
+ stripe_offset = geom.stripe_offset;
6104
+ raid56_full_stripe_start = geom.raid56_stripe_offset;
6105
+ data_stripes = nr_data_stripes(map);
56826106
5683
- stripe_offset = stripe_nr * stripe_len;
5684
- if (offset < stripe_offset) {
5685
- btrfs_crit(fs_info,
5686
- "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
5687
- stripe_offset, offset, em->start, logical,
5688
- stripe_len);
5689
- free_extent_map(em);
5690
- return -EINVAL;
5691
- }
5692
-
5693
- /* stripe_offset is the offset of this block in its stripe*/
5694
- stripe_offset = offset - stripe_offset;
5695
-
5696
- /* if we're here for raid56, we need to know the stripe aligned start */
5697
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5698
- unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
5699
- raid56_full_stripe_start = offset;
5700
-
5701
- /* allow a write of a full stripe, but make sure we don't
5702
- * allow straddling of stripes
5703
- */
5704
- raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5705
- full_stripe_len);
5706
- raid56_full_stripe_start *= full_stripe_len;
5707
- }
5708
-
5709
- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5710
- u64 max_len;
5711
- /* For writes to RAID[56], allow a full stripeset across all disks.
5712
- For other RAID types and for RAID[56] reads, just allow a single
5713
- stripe (on a single disk). */
5714
- if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
5715
- (op == BTRFS_MAP_WRITE)) {
5716
- max_len = stripe_len * nr_data_stripes(map) -
5717
- (offset - raid56_full_stripe_start);
5718
- } else {
5719
- /* we limit the length of each bio to what fits in a stripe */
5720
- max_len = stripe_len - stripe_offset;
5721
- }
5722
- *length = min_t(u64, em->len - offset, max_len);
5723
- } else {
5724
- *length = em->len - offset;
5725
- }
5726
-
5727
- /* This is for when we're called from btrfs_merge_bio_hook() and all
5728
- it cares about is the length */
5729
- if (!bbio_ret)
5730
- goto out;
5731
-
5732
- btrfs_dev_replace_read_lock(dev_replace);
6107
+ down_read(&dev_replace->rwsem);
57336108 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6109
+ /*
6110
+ * Hold the semaphore for read during the whole operation; write is
6111
+ * requested at commit time but must wait.
6112
+ */
57346113 if (!dev_replace_is_ongoing)
5735
- btrfs_dev_replace_read_unlock(dev_replace);
5736
- else
5737
- btrfs_dev_replace_set_lock_blocking(dev_replace);
6114
+ up_read(&dev_replace->rwsem);
57386115
57396116 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
57406117 !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
....@@ -5757,7 +6134,7 @@
57576134 &stripe_index);
57586135 if (!need_full_stripe(op))
57596136 mirror_num = 1;
5760
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
6137
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
57616138 if (need_full_stripe(op))
57626139 num_stripes = map->num_stripes;
57636140 else if (mirror_num)
....@@ -5799,7 +6176,7 @@
57996176 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
58006177 /* push stripe_nr back to the start of the full stripe */
58016178 stripe_nr = div64_u64(raid56_full_stripe_start,
5802
- stripe_len * nr_data_stripes(map));
6179
+ stripe_len * data_stripes);
58036180
58046181 /* RAID[56] write or recovery. Return all stripes */
58056182 num_stripes = map->num_stripes;
....@@ -5815,10 +6192,9 @@
58156192 * Mirror #3 is RAID6 Q block.
58166193 */
58176194 stripe_nr = div_u64_rem(stripe_nr,
5818
- nr_data_stripes(map), &stripe_index);
6195
+ data_stripes, &stripe_index);
58196196 if (mirror_num > 1)
5820
- stripe_index = nr_data_stripes(map) +
5821
- mirror_num - 2;
6197
+ stripe_index = data_stripes + mirror_num - 2;
58226198
58236199 /* We distribute the parity blocks across stripes */
58246200 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
....@@ -5858,8 +6234,13 @@
58586234 ret = -ENOMEM;
58596235 goto out;
58606236 }
5861
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
5862
- bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
6237
+
6238
+ for (i = 0; i < num_stripes; i++) {
6239
+ bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6240
+ stripe_offset + stripe_nr * map->stripe_len;
6241
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6242
+ stripe_index++;
6243
+ }
58636244
58646245 /* build raid_map */
58656246 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
....@@ -5867,17 +6248,12 @@
58676248 u64 tmp;
58686249 unsigned rot;
58696250
5870
- bbio->raid_map = (u64 *)((void *)bbio->stripes +
5871
- sizeof(struct btrfs_bio_stripe) *
5872
- num_alloc_stripes +
5873
- sizeof(int) * tgtdev_indexes);
5874
-
58756251 /* Work out the disk rotation on this stripe-set */
58766252 div_u64_rem(stripe_nr, num_stripes, &rot);
58776253
58786254 /* Fill in the logical address of each stripe */
5879
- tmp = stripe_nr * nr_data_stripes(map);
5880
- for (i = 0; i < nr_data_stripes(map); i++)
6255
+ tmp = stripe_nr * data_stripes;
6256
+ for (i = 0; i < data_stripes; i++)
58816257 bbio->raid_map[(i+rot) % num_stripes] =
58826258 em->start + (tmp + i) * map->stripe_len;
58836259
....@@ -5885,24 +6261,12 @@
58856261 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
58866262 bbio->raid_map[(i+rot+1) % num_stripes] =
58876263 RAID6_Q_STRIPE;
5888
- }
58896264
5890
-
5891
- for (i = 0; i < num_stripes; i++) {
5892
- bbio->stripes[i].physical =
5893
- map->stripes[stripe_index].physical +
5894
- stripe_offset +
5895
- stripe_nr * map->stripe_len;
5896
- bbio->stripes[i].dev =
5897
- map->stripes[stripe_index].dev;
5898
- stripe_index++;
6265
+ sort_parity_stripes(bbio, num_stripes);
58996266 }
59006267
59016268 if (need_full_stripe(op))
59026269 max_errors = btrfs_chunk_max_errors(map);
5903
-
5904
- if (bbio->raid_map)
5905
- sort_parity_stripes(bbio, num_stripes);
59066270
59076271 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
59086272 need_full_stripe(op)) {
....@@ -5929,8 +6293,9 @@
59296293 }
59306294 out:
59316295 if (dev_replace_is_ongoing) {
5932
- btrfs_dev_replace_clear_lock_blocking(dev_replace);
5933
- btrfs_dev_replace_read_unlock(dev_replace);
6296
+ lockdep_assert_held(&dev_replace->rwsem);
6297
+ /* Unlock and let waiting writers proceed */
6298
+ up_read(&dev_replace->rwsem);
59346299 }
59356300 free_extent_map(em);
59366301 return ret;
....@@ -5940,6 +6305,10 @@
59406305 u64 logical, u64 *length,
59416306 struct btrfs_bio **bbio_ret, int mirror_num)
59426307 {
6308
+ if (op == BTRFS_MAP_DISCARD)
6309
+ return __btrfs_map_block_for_discard(fs_info, logical,
6310
+ length, bbio_ret);
6311
+
59436312 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
59446313 mirror_num, 0);
59456314 }
....@@ -5950,75 +6319,6 @@
59506319 struct btrfs_bio **bbio_ret)
59516320 {
59526321 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
5953
-}
5954
-
5955
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
5956
- u64 physical, u64 **logical, int *naddrs, int *stripe_len)
5957
-{
5958
- struct extent_map *em;
5959
- struct map_lookup *map;
5960
- u64 *buf;
5961
- u64 bytenr;
5962
- u64 length;
5963
- u64 stripe_nr;
5964
- u64 rmap_len;
5965
- int i, j, nr = 0;
5966
-
5967
- em = get_chunk_map(fs_info, chunk_start, 1);
5968
- if (IS_ERR(em))
5969
- return -EIO;
5970
-
5971
- map = em->map_lookup;
5972
- length = em->len;
5973
- rmap_len = map->stripe_len;
5974
-
5975
- if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5976
- length = div_u64(length, map->num_stripes / map->sub_stripes);
5977
- else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5978
- length = div_u64(length, map->num_stripes);
5979
- else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5980
- length = div_u64(length, nr_data_stripes(map));
5981
- rmap_len = map->stripe_len * nr_data_stripes(map);
5982
- }
5983
-
5984
- buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
5985
- BUG_ON(!buf); /* -ENOMEM */
5986
-
5987
- for (i = 0; i < map->num_stripes; i++) {
5988
- if (map->stripes[i].physical > physical ||
5989
- map->stripes[i].physical + length <= physical)
5990
- continue;
5991
-
5992
- stripe_nr = physical - map->stripes[i].physical;
5993
- stripe_nr = div64_u64(stripe_nr, map->stripe_len);
5994
-
5995
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5996
- stripe_nr = stripe_nr * map->num_stripes + i;
5997
- stripe_nr = div_u64(stripe_nr, map->sub_stripes);
5998
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5999
- stripe_nr = stripe_nr * map->num_stripes + i;
6000
- } /* else if RAID[56], multiply by nr_data_stripes().
6001
- * Alternatively, just use rmap_len below instead of
6002
- * map->stripe_len */
6003
-
6004
- bytenr = chunk_start + stripe_nr * rmap_len;
6005
- WARN_ON(nr >= map->num_stripes);
6006
- for (j = 0; j < nr; j++) {
6007
- if (buf[j] == bytenr)
6008
- break;
6009
- }
6010
- if (j == nr) {
6011
- WARN_ON(nr >= map->num_stripes);
6012
- buf[nr++] = bytenr;
6013
- }
6014
- }
6015
-
6016
- *logical = buf;
6017
- *naddrs = nr;
6018
- *stripe_len = rmap_len;
6019
-
6020
- free_extent_map(em);
6021
- return 0;
60226322 }
60236323
60246324 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
....@@ -6039,23 +6339,18 @@
60396339 atomic_inc(&bbio->error);
60406340 if (bio->bi_status == BLK_STS_IOERR ||
60416341 bio->bi_status == BLK_STS_TARGET) {
6042
- unsigned int stripe_index =
6043
- btrfs_io_bio(bio)->stripe_index;
6044
- struct btrfs_device *dev;
6342
+ struct btrfs_device *dev = btrfs_io_bio(bio)->device;
60456343
6046
- BUG_ON(stripe_index >= bbio->num_stripes);
6047
- dev = bbio->stripes[stripe_index].dev;
6048
- if (dev->bdev) {
6049
- if (bio_op(bio) == REQ_OP_WRITE)
6050
- btrfs_dev_stat_inc_and_print(dev,
6344
+ ASSERT(dev->bdev);
6345
+ if (bio_op(bio) == REQ_OP_WRITE)
6346
+ btrfs_dev_stat_inc_and_print(dev,
60516347 BTRFS_DEV_STAT_WRITE_ERRS);
6052
- else if (!(bio->bi_opf & REQ_RAHEAD))
6053
- btrfs_dev_stat_inc_and_print(dev,
6348
+ else if (!(bio->bi_opf & REQ_RAHEAD))
6349
+ btrfs_dev_stat_inc_and_print(dev,
60546350 BTRFS_DEV_STAT_READ_ERRS);
6055
- if (bio->bi_opf & REQ_PREFLUSH)
6056
- btrfs_dev_stat_inc_and_print(dev,
6351
+ if (bio->bi_opf & REQ_PREFLUSH)
6352
+ btrfs_dev_stat_inc_and_print(dev,
60576353 BTRFS_DEV_STAT_FLUSH_ERRS);
6058
- }
60596354 }
60606355 }
60616356
....@@ -6090,73 +6385,25 @@
60906385 }
60916386 }
60926387
6093
-/*
6094
- * see run_scheduled_bios for a description of why bios are collected for
6095
- * async submit.
6096
- *
6097
- * This will add one bio to the pending list for a device and make sure
6098
- * the work struct is scheduled.
6099
- */
6100
-static noinline void btrfs_schedule_bio(struct btrfs_device *device,
6101
- struct bio *bio)
6102
-{
6103
- struct btrfs_fs_info *fs_info = device->fs_info;
6104
- int should_queue = 1;
6105
- struct btrfs_pending_bios *pending_bios;
6106
-
6107
- /* don't bother with additional async steps for reads, right now */
6108
- if (bio_op(bio) == REQ_OP_READ) {
6109
- btrfsic_submit_bio(bio);
6110
- return;
6111
- }
6112
-
6113
- WARN_ON(bio->bi_next);
6114
- bio->bi_next = NULL;
6115
-
6116
- spin_lock(&device->io_lock);
6117
- if (op_is_sync(bio->bi_opf))
6118
- pending_bios = &device->pending_sync_bios;
6119
- else
6120
- pending_bios = &device->pending_bios;
6121
-
6122
- if (pending_bios->tail)
6123
- pending_bios->tail->bi_next = bio;
6124
-
6125
- pending_bios->tail = bio;
6126
- if (!pending_bios->head)
6127
- pending_bios->head = bio;
6128
- if (device->running_pending)
6129
- should_queue = 0;
6130
-
6131
- spin_unlock(&device->io_lock);
6132
-
6133
- if (should_queue)
6134
- btrfs_queue_work(fs_info->submit_workers, &device->work);
6135
-}
6136
-
61376388 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6138
- u64 physical, int dev_nr, int async)
6389
+ u64 physical, struct btrfs_device *dev)
61396390 {
6140
- struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
61416391 struct btrfs_fs_info *fs_info = bbio->fs_info;
61426392
61436393 bio->bi_private = bbio;
6144
- btrfs_io_bio(bio)->stripe_index = dev_nr;
6394
+ btrfs_io_bio(bio)->device = dev;
61456395 bio->bi_end_io = btrfs_end_bio;
61466396 bio->bi_iter.bi_sector = physical >> 9;
61476397 btrfs_debug_in_rcu(fs_info,
61486398 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
61496399 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
6150
- (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
6151
- bio->bi_iter.bi_size);
6400
+ (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6401
+ dev->devid, bio->bi_iter.bi_size);
61526402 bio_set_dev(bio, dev->bdev);
61536403
61546404 btrfs_bio_counter_inc_noblocked(fs_info);
61556405
6156
- if (async)
6157
- btrfs_schedule_bio(dev, bio);
6158
- else
6159
- btrfsic_submit_bio(bio);
6406
+ btrfsic_submit_bio(bio);
61606407 }
61616408
61626409 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
....@@ -6177,7 +6424,7 @@
61776424 }
61786425
61796426 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6180
- int mirror_num, int async_submit)
6427
+ int mirror_num)
61816428 {
61826429 struct btrfs_device *dev;
61836430 struct bio *first_bio = bio;
....@@ -6245,8 +6492,7 @@
62456492 else
62466493 bio = first_bio;
62476494
6248
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
6249
- dev_nr, async_submit);
6495
+ submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
62506496 }
62516497 btrfs_bio_counter_dec(fs_info);
62526498 return BLK_STS_OK;
....@@ -6262,15 +6508,25 @@
62626508 * If @seed is true, traverse through the seed devices.
62636509 */
62646510 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6265
- u64 devid, u8 *uuid, u8 *fsid,
6266
- bool seed)
6511
+ u64 devid, u8 *uuid, u8 *fsid,
6512
+ bool seed)
62676513 {
62686514 struct btrfs_device *device;
6515
+ struct btrfs_fs_devices *seed_devs;
62696516
6270
- while (fs_devices) {
6517
+ if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6518
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
6519
+ if (device->devid == devid &&
6520
+ (!uuid || memcmp(device->uuid, uuid,
6521
+ BTRFS_UUID_SIZE) == 0))
6522
+ return device;
6523
+ }
6524
+ }
6525
+
6526
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
62716527 if (!fsid ||
6272
- !memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
6273
- list_for_each_entry(device, &fs_devices->devices,
6528
+ !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6529
+ list_for_each_entry(device, &seed_devs->devices,
62746530 dev_list) {
62756531 if (device->devid == devid &&
62766532 (!uuid || memcmp(device->uuid, uuid,
....@@ -6278,11 +6534,8 @@
62786534 return device;
62796535 }
62806536 }
6281
- if (seed)
6282
- fs_devices = fs_devices->seed;
6283
- else
6284
- return NULL;
62856537 }
6538
+
62866539 return NULL;
62876540 }
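The lookup above now runs in two phases: the filesystem's own device list first, then the device list of each seed filesystem anchored on seed_list. A compact userspace model of that shape, with invented types and fixed arrays standing in for the kernel's linked lists:

/* Two-phase lookup sketch; all types and data are made up. */
#include <stdio.h>
#include <stddef.h>

struct device { unsigned long long devid; };

struct fs_devices {
	struct device devs[2];
	int ndevs;
	struct fs_devices *seeds[2];	/* stands in for seed_list */
	int nseeds;
};

static struct device *find_device(struct fs_devices *fsd,
				  unsigned long long devid)
{
	for (int i = 0; i < fsd->ndevs; i++)
		if (fsd->devs[i].devid == devid)
			return &fsd->devs[i];
	for (int s = 0; s < fsd->nseeds; s++)
		for (int i = 0; i < fsd->seeds[s]->ndevs; i++)
			if (fsd->seeds[s]->devs[i].devid == devid)
				return &fsd->seeds[s]->devs[i];
	return NULL;
}

int main(void)
{
	struct fs_devices seed = { .devs = { { 1 } }, .ndevs = 1 };
	struct fs_devices fs = { .devs = { { 2 }, { 3 } }, .ndevs = 2,
				 .seeds = { &seed }, .nseeds = 1 };

	printf("devid 1 %s\n", find_device(&fs, 1) ? "found" : "missing");
	return 0;
}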
62886541
....@@ -6337,7 +6590,7 @@
63376590 if (WARN_ON(!devid && !fs_info))
63386591 return ERR_PTR(-EINVAL);
63396592
6340
- dev = __alloc_device();
6593
+ dev = __alloc_device(fs_info);
63416594 if (IS_ERR(dev))
63426595 return dev;
63436596
....@@ -6359,9 +6612,6 @@
63596612 else
63606613 generate_random_uuid(dev->uuid);
63616614
6362
- btrfs_init_work(&dev->work, btrfs_submit_helper,
6363
- pending_bios_fn, NULL, NULL);
6364
-
63656615 return dev;
63666616 }
63676617
....@@ -6376,11 +6626,26 @@
63766626 devid, uuid);
63776627 }
63786628
6379
-static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
6380
- struct extent_buffer *leaf,
6629
+static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6630
+{
6631
+ int index = btrfs_bg_flags_to_raid_index(type);
6632
+ int ncopies = btrfs_raid_array[index].ncopies;
6633
+ const int nparity = btrfs_raid_array[index].nparity;
6634
+ int data_stripes;
6635
+
6636
+ if (nparity)
6637
+ data_stripes = num_stripes - nparity;
6638
+ else
6639
+ data_stripes = num_stripes / ncopies;
6640
+
6641
+ return div_u64(chunk_len, data_stripes);
6642
+}
6643
+
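Worked values for calc_stripe_length() above: a 2 GiB RAID10 chunk with four stripes and two copies puts 1 GiB on each device, while a 2 GiB RAID6 chunk with six stripes and two parity stripes puts 512 MiB on each. A standalone version of the computation, with the profile parameters passed in directly instead of read from btrfs_raid_array:

/* Worked example; parameters mirror the raid table entries. */
#include <stdio.h>
#include <stdint.h>

static uint64_t calc_stripe_length(uint64_t chunk_len, int num_stripes,
				   int ncopies, int nparity)
{
	int data_stripes = nparity ? num_stripes - nparity
				   : num_stripes / ncopies;

	return chunk_len / data_stripes;
}

int main(void)
{
	uint64_t chunk_len = 2ULL << 30;	/* a 2 GiB chunk */

	/* RAID10: 4 stripes, 2 copies -> 1 GiB per device */
	printf("raid10: %llu\n",
	       (unsigned long long)calc_stripe_length(chunk_len, 4, 2, 0));
	/* RAID6: 6 stripes, 2 parity -> 512 MiB per device */
	printf("raid6:  %llu\n",
	       (unsigned long long)calc_stripe_length(chunk_len, 6, 1, 2));
	return 0;
}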
6644
+static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
63816645 struct btrfs_chunk *chunk)
63826646 {
6383
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
6647
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
6648
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
63846649 struct map_lookup *map;
63856650 struct extent_map *em;
63866651 u64 logical;
....@@ -6400,14 +6665,14 @@
64006665 * as chunk item in tree block is already verified by tree-checker.
64016666 */
64026667 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6403
- ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
6668
+ ret = btrfs_check_chunk_valid(leaf, chunk, logical);
64046669 if (ret)
64056670 return ret;
64066671 }
64076672
6408
- read_lock(&map_tree->map_tree.lock);
6409
- em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
6410
- read_unlock(&map_tree->map_tree.lock);
6673
+ read_lock(&map_tree->lock);
6674
+ em = lookup_extent_mapping(map_tree, logical, 1);
6675
+ read_unlock(&map_tree->lock);
64116676
64126677 /* already mapped? */
64136678 if (em && em->start <= logical && em->start + em->len > logical) {
....@@ -6441,6 +6706,8 @@
64416706 map->type = btrfs_chunk_type(leaf, chunk);
64426707 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
64436708 map->verified_stripes = 0;
6709
+ em->orig_block_len = calc_stripe_length(map->type, em->len,
6710
+ map->num_stripes);
64446711 for (i = 0; i < num_stripes; i++) {
64456712 map->stripes[i].physical =
64466713 btrfs_stripe_offset_nr(leaf, chunk, i);
....@@ -6449,7 +6716,7 @@
64496716 btrfs_stripe_dev_uuid_nr(chunk, i),
64506717 BTRFS_UUID_SIZE);
64516718 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6452
- devid, uuid, NULL, true);
6719
+ devid, uuid, NULL, true);
64536720 if (!map->stripes[i].dev &&
64546721 !btrfs_test_opt(fs_info, DEGRADED)) {
64556722 free_extent_map(em);
....@@ -6474,9 +6741,9 @@
64746741
64756742 }
64766743
6477
- write_lock(&map_tree->map_tree.lock);
6478
- ret = add_extent_mapping(&map_tree->map_tree, em, 0);
6479
- write_unlock(&map_tree->map_tree.lock);
6744
+ write_lock(&map_tree->lock);
6745
+ ret = add_extent_mapping(map_tree, em, 0);
6746
+ write_unlock(&map_tree->lock);
64806747 if (ret < 0) {
64816748 btrfs_err(fs_info,
64826749 "failed to add chunk map, start=%llu len=%llu: %d",
....@@ -6519,28 +6786,30 @@
65196786 lockdep_assert_held(&uuid_mutex);
65206787 ASSERT(fsid);
65216788
6522
- fs_devices = fs_info->fs_devices->seed;
6523
- while (fs_devices) {
6789
+ /* This will match only for multi-device seed fs */
6790
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
65246791 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
65256792 return fs_devices;
65266793
6527
- fs_devices = fs_devices->seed;
6528
- }
65296794
6530
- fs_devices = find_fsid(fsid);
6795
+ fs_devices = find_fsid(fsid, NULL);
65316796 if (!fs_devices) {
65326797 if (!btrfs_test_opt(fs_info, DEGRADED))
65336798 return ERR_PTR(-ENOENT);
65346799
6535
- fs_devices = alloc_fs_devices(fsid);
6800
+ fs_devices = alloc_fs_devices(fsid, NULL);
65366801 if (IS_ERR(fs_devices))
65376802 return fs_devices;
65386803
6539
- fs_devices->seeding = 1;
6804
+ fs_devices->seeding = true;
65406805 fs_devices->opened = 1;
65416806 return fs_devices;
65426807 }
65436808
6809
+ /*
6810
+ * Upon first call for a seed fs fsid, just create a private copy of the
6811
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6812
+ */
65446813 fs_devices = clone_fs_devices(fs_devices);
65456814 if (IS_ERR(fs_devices))
65466815 return fs_devices;
....@@ -6548,27 +6817,24 @@
65486817 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
65496818 if (ret) {
65506819 free_fs_devices(fs_devices);
6551
- fs_devices = ERR_PTR(ret);
6552
- goto out;
6820
+ return ERR_PTR(ret);
65536821 }
65546822
65556823 if (!fs_devices->seeding) {
65566824 close_fs_devices(fs_devices);
65576825 free_fs_devices(fs_devices);
6558
- fs_devices = ERR_PTR(-EINVAL);
6559
- goto out;
6826
+ return ERR_PTR(-EINVAL);
65606827 }
65616828
6562
- fs_devices->seed = fs_info->fs_devices->seed;
6563
- fs_info->fs_devices->seed = fs_devices;
6564
-out:
6829
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
6830
+
65656831 return fs_devices;
65666832 }
65676833
6568
-static int read_one_dev(struct btrfs_fs_info *fs_info,
6569
- struct extent_buffer *leaf,
6834
+static int read_one_dev(struct extent_buffer *leaf,
65706835 struct btrfs_dev_item *dev_item)
65716836 {
6837
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
65726838 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
65736839 struct btrfs_device *device;
65746840 u64 devid;
....@@ -6582,7 +6848,7 @@
65826848 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
65836849 BTRFS_FSID_SIZE);
65846850
6585
- if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
6851
+ if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
65866852 fs_devices = open_seed_devices(fs_info, fs_uuid);
65876853 if (IS_ERR(fs_devices))
65886854 return PTR_ERR(fs_devices);
....@@ -6725,48 +6991,49 @@
67256991 sb_array_offset += len;
67266992 cur_offset += len;
67276993
6728
- if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6729
- chunk = (struct btrfs_chunk *)sb_array_offset;
6730
- /*
6731
- * At least one btrfs_chunk with one stripe must be
6732
- * present, exact stripe count check comes afterwards
6733
- */
6734
- len = btrfs_chunk_item_size(1);
6735
- if (cur_offset + len > array_size)
6736
- goto out_short_read;
6737
-
6738
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6739
- if (!num_stripes) {
6740
- btrfs_err(fs_info,
6741
- "invalid number of stripes %u in sys_array at offset %u",
6742
- num_stripes, cur_offset);
6743
- ret = -EIO;
6744
- break;
6745
- }
6746
-
6747
- type = btrfs_chunk_type(sb, chunk);
6748
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
6749
- btrfs_err(fs_info,
6750
- "invalid chunk type %llu in sys_array at offset %u",
6751
- type, cur_offset);
6752
- ret = -EIO;
6753
- break;
6754
- }
6755
-
6756
- len = btrfs_chunk_item_size(num_stripes);
6757
- if (cur_offset + len > array_size)
6758
- goto out_short_read;
6759
-
6760
- ret = read_one_chunk(fs_info, &key, sb, chunk);
6761
- if (ret)
6762
- break;
6763
- } else {
6994
+ if (key.type != BTRFS_CHUNK_ITEM_KEY) {
67646995 btrfs_err(fs_info,
67656996 "unexpected item type %u in sys_array at offset %u",
67666997 (u32)key.type, cur_offset);
67676998 ret = -EIO;
67686999 break;
67697000 }
7001
+
7002
+ chunk = (struct btrfs_chunk *)sb_array_offset;
7003
+ /*
7004
+ * At least one btrfs_chunk with one stripe must be present;
7005
+ * the exact stripe count check comes afterwards
7006
+ */
7007
+ len = btrfs_chunk_item_size(1);
7008
+ if (cur_offset + len > array_size)
7009
+ goto out_short_read;
7010
+
7011
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7012
+ if (!num_stripes) {
7013
+ btrfs_err(fs_info,
7014
+ "invalid number of stripes %u in sys_array at offset %u",
7015
+ num_stripes, cur_offset);
7016
+ ret = -EIO;
7017
+ break;
7018
+ }
7019
+
7020
+ type = btrfs_chunk_type(sb, chunk);
7021
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7022
+ btrfs_err(fs_info,
7023
+ "invalid chunk type %llu in sys_array at offset %u",
7024
+ type, cur_offset);
7025
+ ret = -EIO;
7026
+ break;
7027
+ }
7028
+
7029
+ len = btrfs_chunk_item_size(num_stripes);
7030
+ if (cur_offset + len > array_size)
7031
+ goto out_short_read;
7032
+
7033
+ ret = read_one_chunk(&key, sb, chunk);
7034
+ if (ret)
7035
+ break;
7036
+
67707037 array_ptr += len;
67717038 sb_array_offset += len;
67727039 cur_offset += len;
....@@ -6794,14 +7061,14 @@
67947061 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
67957062 struct btrfs_device *failing_dev)
67967063 {
6797
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
7064
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
67987065 struct extent_map *em;
67997066 u64 next_start = 0;
68007067 bool ret = true;
68017068
6802
- read_lock(&map_tree->map_tree.lock);
6803
- em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
6804
- read_unlock(&map_tree->map_tree.lock);
7069
+ read_lock(&map_tree->lock);
7070
+ em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7071
+ read_unlock(&map_tree->lock);
68057072 /* No chunk at all? Return false anyway */
68067073 if (!em) {
68077074 ret = false;
....@@ -6830,7 +7097,7 @@
68307097 if (missing > max_tolerated) {
68317098 if (!failing_dev)
68327099 btrfs_warn(fs_info,
6833
- "chunk %llu missing %d devices, max tolerance is %d for writeable mount",
7100
+ "chunk %llu missing %d devices, max tolerance is %d for writable mount",
68347101 em->start, missing, max_tolerated);
68357102 free_extent_map(em);
68367103 ret = false;
....@@ -6839,13 +7106,26 @@
68397106 next_start = extent_map_end(em);
68407107 free_extent_map(em);
68417108
6842
- read_lock(&map_tree->map_tree.lock);
6843
- em = lookup_extent_mapping(&map_tree->map_tree, next_start,
7109
+ read_lock(&map_tree->lock);
7110
+ em = lookup_extent_mapping(map_tree, next_start,
68447111 (u64)(-1) - next_start);
6845
- read_unlock(&map_tree->map_tree.lock);
7112
+ read_unlock(&map_tree->lock);
68467113 }
68477114 out:
68487115 return ret;
7116
+}
7117
+
7118
+static void readahead_tree_node_children(struct extent_buffer *node)
7119
+{
7120
+ int i;
7121
+ const int nr_items = btrfs_header_nritems(node);
7122
+
7123
+ for (i = 0; i < nr_items; i++) {
7124
+ u64 start;
7125
+
7126
+ start = btrfs_node_blockptr(node, i);
7127
+ readahead_tree_block(node->fs_info, start);
7128
+ }
68497129 }
68507130
68517131 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
....@@ -6858,6 +7138,7 @@
68587138 int ret;
68597139 int slot;
68607140 u64 total_dev = 0;
7141
+ u64 last_ra_node = 0;
68617142
68627143 path = btrfs_alloc_path();
68637144 if (!path)
....@@ -6868,7 +7149,6 @@
68687149 * otherwise we don't need it.
68697150 */
68707151 mutex_lock(&uuid_mutex);
6871
- mutex_lock(&fs_info->chunk_mutex);
68727152
68737153 /*
68747154 * It is possible for mount and umount to race in such a way that
....@@ -6891,6 +7171,8 @@
68917171 if (ret < 0)
68927172 goto error;
68937173 while (1) {
7174
+ struct extent_buffer *node;
7175
+
68947176 leaf = path->nodes[0];
68957177 slot = path->slots[0];
68967178 if (slot >= btrfs_header_nritems(leaf)) {
....@@ -6901,19 +7183,32 @@
69017183 goto error;
69027184 break;
69037185 }
7186
+ /*
7187
+ * The nodes on level 1 are not locked, but we don't need to lock
7188
+ * them at mount time because nothing else can access the tree yet.
7189
+ */
7190
+ node = path->nodes[1];
7191
+ if (node) {
7192
+ if (last_ra_node != node->start) {
7193
+ readahead_tree_node_children(node);
7194
+ last_ra_node = node->start;
7195
+ }
7196
+ }
69047197 btrfs_item_key_to_cpu(leaf, &found_key, slot);
69057198 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
69067199 struct btrfs_dev_item *dev_item;
69077200 dev_item = btrfs_item_ptr(leaf, slot,
69087201 struct btrfs_dev_item);
6909
- ret = read_one_dev(fs_info, leaf, dev_item);
7202
+ ret = read_one_dev(leaf, dev_item);
69107203 if (ret)
69117204 goto error;
69127205 total_dev++;
69137206 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
69147207 struct btrfs_chunk *chunk;
69157208 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6916
- ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
7209
+ mutex_lock(&fs_info->chunk_mutex);
7210
+ ret = read_one_chunk(&found_key, leaf, chunk);
7211
+ mutex_unlock(&fs_info->chunk_mutex);
69177212 if (ret)
69187213 goto error;
69197214 }
....@@ -6925,12 +7220,12 @@
69257220 * do another round of validation checks.
69267221 */
69277222 if (total_dev != fs_info->fs_devices->total_devices) {
6928
- btrfs_err(fs_info,
6929
- "super_num_devices %llu mismatch with num_devices %llu found here",
7223
+ btrfs_warn(fs_info,
7224
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
69307225 btrfs_super_num_devices(fs_info->super_copy),
69317226 total_dev);
6932
- ret = -EINVAL;
6933
- goto error;
7227
+ fs_info->fs_devices->total_devices = total_dev;
7228
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
69347229 }
69357230 if (btrfs_super_total_bytes(fs_info->super_copy) <
69367231 fs_info->fs_devices->total_rw_bytes) {
....@@ -6943,7 +7238,6 @@
69437238 }
69447239 ret = 0;
69457240 error:
6946
- mutex_unlock(&fs_info->chunk_mutex);
69477241 mutex_unlock(&uuid_mutex);
69487242
69497243 btrfs_free_path(path);
....@@ -6952,86 +7246,117 @@
69527246
69537247 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
69547248 {
6955
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7249
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
69567250 struct btrfs_device *device;
69577251
6958
- while (fs_devices) {
6959
- mutex_lock(&fs_devices->device_list_mutex);
6960
- list_for_each_entry(device, &fs_devices->devices, dev_list)
6961
- device->fs_info = fs_info;
6962
- mutex_unlock(&fs_devices->device_list_mutex);
7252
+ fs_devices->fs_info = fs_info;
69637253
6964
- fs_devices = fs_devices->seed;
7254
+ mutex_lock(&fs_devices->device_list_mutex);
7255
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
7256
+ device->fs_info = fs_info;
7257
+
7258
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7259
+ list_for_each_entry(device, &seed_devs->devices, dev_list)
7260
+ device->fs_info = fs_info;
7261
+
7262
+ seed_devs->fs_info = fs_info;
69657263 }
7264
+ mutex_unlock(&fs_devices->device_list_mutex);
69667265 }
69677266
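btrfs_init_devices_late() now reaches seed device sets through a regular list_head (seed_list) instead of chasing the old fs_devices->seed pointer chain, so every walk becomes a plain list_for_each_entry(). A kernel-style sketch of the new shape, with illustrative struct names and assuming <linux/list.h>:

#include <linux/list.h>

/* Illustrative mirror of the relevant part of struct btrfs_fs_devices:
 * in the sprouted filesystem, seed_list anchors the list; in each seed
 * device set, the same member is the link on that list. */
struct fs_devices_sketch {
	struct list_head seed_list;
};

static void for_each_seed_sketch(struct fs_devices_sketch *fs_devices)
{
	struct fs_devices_sketch *seed;

	list_for_each_entry(seed, &fs_devices->seed_list, seed_list)
		;	/* visit each seed device set, in list order */
}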
6968
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
7267
+static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7268
+ const struct btrfs_dev_stats_item *ptr,
7269
+ int index)
69697270 {
6970
- int i;
7271
+ u64 val;
69717272
6972
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6973
- btrfs_dev_stat_reset(dev, i);
7273
+ read_extent_buffer(eb, &val,
7274
+ offsetof(struct btrfs_dev_stats_item, values) +
7275
+ ((unsigned long)ptr) + (index * sizeof(u64)),
7276
+ sizeof(val));
7277
+ return val;
7278
+}
7279
+
7280
+static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7281
+ struct btrfs_dev_stats_item *ptr,
7282
+ int index, u64 val)
7283
+{
7284
+ write_extent_buffer(eb, &val,
7285
+ offsetof(struct btrfs_dev_stats_item, values) +
7286
+ ((unsigned long)ptr) + (index * sizeof(u64)),
7287
+ sizeof(val));
7288
+}
7289
+
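Both helpers above locate values[index] inside the on-disk item by plain offset arithmetic before handing the position to the extent-buffer accessors. The same computation in isolation, with an illustrative struct rather than the real on-disk layout:

#include <stdint.h>
#include <stddef.h>

struct dev_stats_item_sketch {
	uint64_t values[5];	/* BTRFS_DEV_STAT_VALUES_MAX */
};

/* Byte offset of values[index] within the leaf, given the item's own
 * start offset. */
static size_t stat_value_offset(size_t item_start, int index)
{
	return item_start +
	       offsetof(struct dev_stats_item_sketch, values) +
	       (size_t)index * sizeof(uint64_t);
}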
7290
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7291
+ struct btrfs_path *path)
7292
+{
7293
+ struct btrfs_dev_stats_item *ptr;
7294
+ struct extent_buffer *eb;
7295
+ struct btrfs_key key;
7296
+ int item_size;
7297
+ int i, ret, slot;
7298
+
7299
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
7300
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
7301
+ key.offset = device->devid;
7302
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7303
+ if (ret) {
7304
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7305
+ btrfs_dev_stat_set(device, i, 0);
7306
+ device->dev_stats_valid = 1;
7307
+ btrfs_release_path(path);
7308
+ return ret < 0 ? ret : 0;
7309
+ }
7310
+ slot = path->slots[0];
7311
+ eb = path->nodes[0];
7312
+ item_size = btrfs_item_size_nr(eb, slot);
7313
+
7314
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7315
+
7316
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7317
+ if (item_size >= (1 + i) * sizeof(__le64))
7318
+ btrfs_dev_stat_set(device, i,
7319
+ btrfs_dev_stats_value(eb, ptr, i));
7320
+ else
7321
+ btrfs_dev_stat_set(device, i, 0);
7322
+ }
7323
+
7324
+ device->dev_stats_valid = 1;
7325
+ btrfs_dev_stat_print_on_load(device);
7326
+ btrfs_release_path(path);
7327
+
7328
+ return 0;
69747329 }
69757330
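The `item_size >= (1 + i) * sizeof(__le64)` test above makes the read compatible across versions: an item written by an older filesystem may carry fewer counters, and the missing slots are simply zeroed. A freestanding sketch, assuming the on-disk values have already been byte-swapped:

#include <stdint.h>
#include <stddef.h>

#define MAX_VALUES 5	/* BTRFS_DEV_STAT_VALUES_MAX */

static void load_dev_stats(const uint64_t *disk, size_t item_size,
			   uint64_t stats[MAX_VALUES])
{
	int i;

	for (i = 0; i < MAX_VALUES; i++) {
		if (item_size >= (size_t)(1 + i) * sizeof(uint64_t))
			stats[i] = disk[i];	/* counter present on disk */
		else
			stats[i] = 0;		/* older item: treat as zero */
	}
}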
69767331 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
69777332 {
6978
- struct btrfs_key key;
6979
- struct btrfs_key found_key;
6980
- struct btrfs_root *dev_root = fs_info->dev_root;
6981
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6982
- struct extent_buffer *eb;
6983
- int slot;
6984
- int ret = 0;
7333
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
69857334 struct btrfs_device *device;
69867335 struct btrfs_path *path = NULL;
6987
- int i;
7336
+ int ret = 0;
69887337
69897338 path = btrfs_alloc_path();
6990
- if (!path) {
6991
- ret = -ENOMEM;
6992
- goto out;
6993
- }
7339
+ if (!path)
7340
+ return -ENOMEM;
69947341
69957342 mutex_lock(&fs_devices->device_list_mutex);
69967343 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6997
- int item_size;
6998
- struct btrfs_dev_stats_item *ptr;
6999
-
7000
- key.objectid = BTRFS_DEV_STATS_OBJECTID;
7001
- key.type = BTRFS_PERSISTENT_ITEM_KEY;
7002
- key.offset = device->devid;
7003
- ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
7004
- if (ret) {
7005
- __btrfs_reset_dev_stats(device);
7006
- device->dev_stats_valid = 1;
7007
- btrfs_release_path(path);
7008
- continue;
7009
- }
7010
- slot = path->slots[0];
7011
- eb = path->nodes[0];
7012
- btrfs_item_key_to_cpu(eb, &found_key, slot);
7013
- item_size = btrfs_item_size_nr(eb, slot);
7014
-
7015
- ptr = btrfs_item_ptr(eb, slot,
7016
- struct btrfs_dev_stats_item);
7017
-
7018
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7019
- if (item_size >= (1 + i) * sizeof(__le64))
7020
- btrfs_dev_stat_set(device, i,
7021
- btrfs_dev_stats_value(eb, ptr, i));
7022
- else
7023
- btrfs_dev_stat_reset(device, i);
7024
- }
7025
-
7026
- device->dev_stats_valid = 1;
7027
- btrfs_dev_stat_print_on_load(device);
7028
- btrfs_release_path(path);
7344
+ ret = btrfs_device_init_dev_stats(device, path);
7345
+ if (ret)
7346
+ goto out;
70297347 }
7348
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7349
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
7350
+ ret = btrfs_device_init_dev_stats(device, path);
7351
+ if (ret)
7352
+ goto out;
7353
+ }
7354
+ }
7355
+out:
70307356 mutex_unlock(&fs_devices->device_list_mutex);
70317357
7032
-out:
70337358 btrfs_free_path(path);
7034
- return ret < 0 ? ret : 0;
7359
+ return ret;
70357360 }
70367361
70377362 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
....@@ -7102,9 +7427,9 @@
71027427 /*
71037428 * Called from commit_transaction(). Writes all changed device stats to disk.
71047429 */
7105
-int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
7106
- struct btrfs_fs_info *fs_info)
7430
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
71077431 {
7432
+ struct btrfs_fs_info *fs_info = trans->fs_info;
71087433 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
71097434 struct btrfs_device *device;
71107435 int stats_cnt;
....@@ -7187,8 +7512,8 @@
71877512 int i;
71887513
71897514 mutex_lock(&fs_devices->device_list_mutex);
7190
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid,
7191
- NULL, NULL, true);
7515
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
7516
+ true);
71927517 mutex_unlock(&fs_devices->device_list_mutex);
71937518
71947519 if (!dev) {
....@@ -7203,7 +7528,7 @@
72037528 stats->values[i] =
72047529 btrfs_dev_stat_read_and_reset(dev, i);
72057530 else
7206
- btrfs_dev_stat_reset(dev, i);
7531
+ btrfs_dev_stat_set(dev, i, 0);
72077532 }
72087533 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
72097534 current->comm, task_pid_nr(current));
....@@ -7217,101 +7542,35 @@
72177542 return 0;
72187543 }
72197544
7220
-void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
7221
-{
7222
- struct buffer_head *bh;
7223
- struct btrfs_super_block *disk_super;
7224
- int copy_num;
7225
-
7226
- if (!bdev)
7227
- return;
7228
-
7229
- for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
7230
- copy_num++) {
7231
-
7232
- if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
7233
- continue;
7234
-
7235
- disk_super = (struct btrfs_super_block *)bh->b_data;
7236
-
7237
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
7238
- set_buffer_dirty(bh);
7239
- sync_dirty_buffer(bh);
7240
- brelse(bh);
7241
- }
7242
-
7243
- /* Notify udev that device has changed */
7244
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
7245
-
7246
- /* Update ctime/mtime for device path for libblkid */
7247
- update_dev_time(device_path);
7248
-}
7249
-
72507545 /*
7251
- * Update the size of all devices, which is used for writing out the
7252
- * super blocks.
7546
+ * Update the size and bytes used for each device where it changed. This is
7547
+ * delayed since we would otherwise get errors while writing out the
7548
+ * superblocks.
7549
+ *
7550
+ * Must be invoked during transaction commit.
72537551 */
7254
-void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
7552
+void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
72557553 {
7256
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
72577554 struct btrfs_device *curr, *next;
72587555
7259
- if (list_empty(&fs_devices->resized_devices))
7556
+ ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7557
+
7558
+ if (list_empty(&trans->dev_update_list))
72607559 return;
72617560
7262
- mutex_lock(&fs_devices->device_list_mutex);
7263
- mutex_lock(&fs_info->chunk_mutex);
7264
- list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
7265
- resized_list) {
7266
- list_del_init(&curr->resized_list);
7561
+ /*
7562
+ * We don't need the device_list_mutex here. This list is owned by the
7563
+ * transaction and the transaction must complete before the device is
7564
+ * released.
7565
+ */
7566
+ mutex_lock(&trans->fs_info->chunk_mutex);
7567
+ list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7568
+ post_commit_list) {
7569
+ list_del_init(&curr->post_commit_list);
72677570 curr->commit_total_bytes = curr->disk_total_bytes;
7571
+ curr->commit_bytes_used = curr->bytes_used;
72687572 }
7269
- mutex_unlock(&fs_info->chunk_mutex);
7270
- mutex_unlock(&fs_devices->device_list_mutex);
7271
-}
7272
-
7273
-/* Must be invoked during the transaction commit */
7274
-void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
7275
-{
7276
- struct btrfs_fs_info *fs_info = trans->fs_info;
7277
- struct extent_map *em;
7278
- struct map_lookup *map;
7279
- struct btrfs_device *dev;
7280
- int i;
7281
-
7282
- if (list_empty(&trans->pending_chunks))
7283
- return;
7284
-
7285
- /* In order to kick the device replace finish process */
7286
- mutex_lock(&fs_info->chunk_mutex);
7287
- list_for_each_entry(em, &trans->pending_chunks, list) {
7288
- map = em->map_lookup;
7289
-
7290
- for (i = 0; i < map->num_stripes; i++) {
7291
- dev = map->stripes[i].dev;
7292
- dev->commit_bytes_used = dev->bytes_used;
7293
- dev->has_pending_chunks = false;
7294
- }
7295
- }
7296
- mutex_unlock(&fs_info->chunk_mutex);
7297
-}
7298
-
7299
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
7300
-{
7301
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7302
- while (fs_devices) {
7303
- fs_devices->fs_info = fs_info;
7304
- fs_devices = fs_devices->seed;
7305
- }
7306
-}
7307
-
7308
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
7309
-{
7310
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7311
- while (fs_devices) {
7312
- fs_devices->fs_info = NULL;
7313
- fs_devices = fs_devices->seed;
7314
- }
7573
+ mutex_unlock(&trans->fs_info->chunk_mutex);
73157574 }
73167575
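btrfs_commit_device_sizes() consumes trans->dev_update_list with the usual unlink-while-walking idiom; the _safe iterator is what allows list_del_init() inside the loop. The pattern in isolation (kernel style, assuming <linux/list.h>; names are illustrative):

#include <linux/list.h>

/* 'link' stands in for post_commit_list. */
struct dev_update_sketch {
	struct list_head link;
};

static void consume_updates(struct list_head *pending)
{
	struct dev_update_sketch *curr, *next;

	/* _safe: 'next' is fetched before 'curr' is unlinked. */
	list_for_each_entry_safe(curr, next, pending, link) {
		list_del_init(&curr->link);
		/* ... publish curr's committed sizes here ... */
	}
}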
73177576 /*
....@@ -7319,38 +7578,18 @@
73197578 */
73207579 int btrfs_bg_type_to_factor(u64 flags)
73217580 {
7322
- if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
7323
- BTRFS_BLOCK_GROUP_RAID10))
7324
- return 2;
7325
- return 1;
7581
+ const int index = btrfs_bg_flags_to_raid_index(flags);
7582
+
7583
+ return btrfs_raid_array[index].ncopies;
73267584 }
73277585
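The factor is simply ncopies for the profile, so the table-driven version also covers the raid1c3/raid1c4 profiles (3 and 4) added in this release, while RAID5/6 still report 1 because parity stripes are not copies. A worked example of what the factor means:

#include <stdint.h>

/* e.g. a 1 GiB RAID1 chunk (factor = ncopies = 2) occupies 2 GiB of
 * raw disk space. */
static uint64_t chunk_physical_bytes(uint64_t logical_len, int factor)
{
	return logical_len * (uint64_t)factor;
}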
73287586
7329
-static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
7330
-{
7331
- int index = btrfs_bg_flags_to_raid_index(type);
7332
- int ncopies = btrfs_raid_array[index].ncopies;
7333
- int data_stripes;
7334
-
7335
- switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
7336
- case BTRFS_BLOCK_GROUP_RAID5:
7337
- data_stripes = num_stripes - 1;
7338
- break;
7339
- case BTRFS_BLOCK_GROUP_RAID6:
7340
- data_stripes = num_stripes - 2;
7341
- break;
7342
- default:
7343
- data_stripes = num_stripes / ncopies;
7344
- break;
7345
- }
7346
- return div_u64(chunk_len, data_stripes);
7347
-}
73487587
73497588 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
73507589 u64 chunk_offset, u64 devid,
73517590 u64 physical_offset, u64 physical_len)
73527591 {
7353
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
7592
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
73547593 struct extent_map *em;
73557594 struct map_lookup *map;
73567595 struct btrfs_device *dev;
....@@ -7414,8 +7653,11 @@
74147653
74157654 /* It's possible this device is a dummy for seed device */
74167655 if (dev->disk_total_bytes == 0) {
7417
- dev = btrfs_find_device(fs_info->fs_devices->seed, devid,
7418
- NULL, NULL, false);
7656
+ struct btrfs_fs_devices *devs;
7657
+
7658
+ devs = list_first_entry(&fs_info->fs_devices->seed_list,
7659
+ struct btrfs_fs_devices, seed_list);
7660
+ dev = btrfs_find_device(devs, devid, NULL, NULL, false);
74197661 if (!dev) {
74207662 btrfs_err(fs_info, "failed to find seed devid %llu",
74217663 devid);
....@@ -7439,13 +7681,13 @@
74397681
74407682 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
74417683 {
7442
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
7684
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
74437685 struct extent_map *em;
74447686 struct rb_node *node;
74457687 int ret = 0;
74467688
74477689 read_lock(&em_tree->lock);
7448
- for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
7690
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
74497691 em = rb_entry(node, struct extent_map, rb_node);
74507692 if (em->map_lookup->num_stripes !=
74517693 em->map_lookup->verified_stripes) {
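The mapping tree is now an rb_root_cached, which keeps a pointer to the leftmost node so the first step of an in-order walk is O(1). A sketch of the iteration pattern (kernel style, assuming <linux/rbtree.h>):

#include <linux/rbtree.h>

static void walk_in_order(struct rb_root_cached *root)
{
	struct rb_node *node;

	/* rb_first_cached() just reads the cached leftmost pointer. */
	for (node = rb_first_cached(root); node; node = rb_next(node))
		;	/* visit rb_entry(node, ...) here */
}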
....@@ -7551,3 +7793,27 @@
75517793 btrfs_free_path(path);
75527794 return ret;
75537795 }
7796
+
7797
+/*
7798
+ * Check whether the given block group or device is pinned by any inode being
7799
+ * used as a swapfile.
7800
+ */
7801
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7802
+{
7803
+ struct btrfs_swapfile_pin *sp;
7804
+ struct rb_node *node;
7805
+
7806
+ spin_lock(&fs_info->swapfile_pins_lock);
7807
+ node = fs_info->swapfile_pins.rb_node;
7808
+ while (node) {
7809
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7810
+ if (ptr < sp->ptr)
7811
+ node = node->rb_left;
7812
+ else if (ptr > sp->ptr)
7813
+ node = node->rb_right;
7814
+ else
7815
+ break;
7816
+ }
7817
+ spin_unlock(&fs_info->swapfile_pins_lock);
7818
+ return node != NULL;
7819
+}
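Both block groups and devices are looked up by raw pointer value in the swapfile_pins tree, so the resize and remove paths can refuse to touch anything an active swapfile has pinned. A hypothetical caller sketch (assuming the btrfs internal headers; -ETXTBSY mirrors how btrfs reports an active swapfile elsewhere):

#include <linux/errno.h>

static int shrink_device_sketch(struct btrfs_fs_info *fs_info,
				struct btrfs_device *device)
{
	if (btrfs_pinned_by_swapfile(fs_info, device))
		return -ETXTBSY;	/* device busy with active swapfile */
	/* ... safe to proceed with the shrink ... */
	return 0;
}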