.. | .. |
---|
7 | 7 | #include <linux/sched/mm.h> |
---|
8 | 8 | #include <linux/bio.h> |
---|
9 | 9 | #include <linux/slab.h> |
---|
10 | | -#include <linux/buffer_head.h> |
---|
11 | 10 | #include <linux/blkdev.h> |
---|
12 | 11 | #include <linux/ratelimit.h> |
---|
13 | 12 | #include <linux/kthread.h> |
---|
.. | .. |
---|
15 | 14 | #include <linux/semaphore.h> |
---|
16 | 15 | #include <linux/uuid.h> |
---|
17 | 16 | #include <linux/list_sort.h> |
---|
| 17 | +#include <linux/namei.h> |
---|
| 18 | +#include "misc.h" |
---|
18 | 19 | #include "ctree.h" |
---|
19 | 20 | #include "extent_map.h" |
---|
20 | 21 | #include "disk-io.h" |
---|
.. | .. |
---|
25 | 26 | #include "async-thread.h" |
---|
26 | 27 | #include "check-integrity.h" |
---|
27 | 28 | #include "rcu-string.h" |
---|
28 | | -#include "math.h" |
---|
29 | 29 | #include "dev-replace.h" |
---|
30 | 30 | #include "sysfs.h" |
---|
31 | 31 | #include "tree-checker.h" |
---|
| 32 | +#include "space-info.h" |
---|
| 33 | +#include "block-group.h" |
---|
| 34 | +#include "discard.h" |
---|
32 | 35 | |
---|
33 | 36 | const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { |
---|
34 | 37 | [BTRFS_RAID_RAID10] = { |
---|
.. | .. |
---|
39 | 42 | .tolerated_failures = 1, |
---|
40 | 43 | .devs_increment = 2, |
---|
41 | 44 | .ncopies = 2, |
---|
| 45 | + .nparity = 0, |
---|
42 | 46 | .raid_name = "raid10", |
---|
43 | 47 | .bg_flag = BTRFS_BLOCK_GROUP_RAID10, |
---|
44 | 48 | .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, |
---|
.. | .. |
---|
51 | 55 | .tolerated_failures = 1, |
---|
52 | 56 | .devs_increment = 2, |
---|
53 | 57 | .ncopies = 2, |
---|
| 58 | + .nparity = 0, |
---|
54 | 59 | .raid_name = "raid1", |
---|
55 | 60 | .bg_flag = BTRFS_BLOCK_GROUP_RAID1, |
---|
56 | 61 | .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, |
---|
| 62 | + }, |
---|
| 63 | + [BTRFS_RAID_RAID1C3] = { |
---|
| 64 | + .sub_stripes = 1, |
---|
| 65 | + .dev_stripes = 1, |
---|
| 66 | + .devs_max = 3, |
---|
| 67 | + .devs_min = 3, |
---|
| 68 | + .tolerated_failures = 2, |
---|
| 69 | + .devs_increment = 3, |
---|
| 70 | + .ncopies = 3, |
---|
| 71 | + .nparity = 0, |
---|
| 72 | + .raid_name = "raid1c3", |
---|
| 73 | + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, |
---|
| 74 | + .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, |
---|
| 75 | + }, |
---|
| 76 | + [BTRFS_RAID_RAID1C4] = { |
---|
| 77 | + .sub_stripes = 1, |
---|
| 78 | + .dev_stripes = 1, |
---|
| 79 | + .devs_max = 4, |
---|
| 80 | + .devs_min = 4, |
---|
| 81 | + .tolerated_failures = 3, |
---|
| 82 | + .devs_increment = 4, |
---|
| 83 | + .ncopies = 4, |
---|
| 84 | + .nparity = 0, |
---|
| 85 | + .raid_name = "raid1c4", |
---|
| 86 | + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, |
---|
| 87 | + .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, |
---|
57 | 88 | }, |
---|
58 | 89 | [BTRFS_RAID_DUP] = { |
---|
59 | 90 | .sub_stripes = 1, |
---|
.. | .. |
---|
63 | 94 | .tolerated_failures = 0, |
---|
64 | 95 | .devs_increment = 1, |
---|
65 | 96 | .ncopies = 2, |
---|
| 97 | + .nparity = 0, |
---|
66 | 98 | .raid_name = "dup", |
---|
67 | 99 | .bg_flag = BTRFS_BLOCK_GROUP_DUP, |
---|
68 | 100 | .mindev_error = 0, |
---|
.. | .. |
---|
75 | 107 | .tolerated_failures = 0, |
---|
76 | 108 | .devs_increment = 1, |
---|
77 | 109 | .ncopies = 1, |
---|
| 110 | + .nparity = 0, |
---|
78 | 111 | .raid_name = "raid0", |
---|
79 | 112 | .bg_flag = BTRFS_BLOCK_GROUP_RAID0, |
---|
80 | 113 | .mindev_error = 0, |
---|
.. | .. |
---|
87 | 120 | .tolerated_failures = 0, |
---|
88 | 121 | .devs_increment = 1, |
---|
89 | 122 | .ncopies = 1, |
---|
| 123 | + .nparity = 0, |
---|
90 | 124 | .raid_name = "single", |
---|
91 | 125 | .bg_flag = 0, |
---|
92 | 126 | .mindev_error = 0, |
---|
.. | .. |
---|
99 | 133 | .tolerated_failures = 1, |
---|
100 | 134 | .devs_increment = 1, |
---|
101 | 135 | .ncopies = 1, |
---|
| 136 | + .nparity = 1, |
---|
102 | 137 | .raid_name = "raid5", |
---|
103 | 138 | .bg_flag = BTRFS_BLOCK_GROUP_RAID5, |
---|
104 | 139 | .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, |
---|
.. | .. |
---|
111 | 146 | .tolerated_failures = 2, |
---|
112 | 147 | .devs_increment = 1, |
---|
113 | 148 | .ncopies = 1, |
---|
| 149 | + .nparity = 2, |
---|
114 | 150 | .raid_name = "raid6", |
---|
115 | 151 | .bg_flag = BTRFS_BLOCK_GROUP_RAID6, |
---|
116 | 152 | .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, |
---|
117 | 153 | }, |
---|
118 | 154 | }; |
---|
119 | 155 | |
---|
120 | | -const char *get_raid_name(enum btrfs_raid_types type) |
---|
| 156 | +const char *btrfs_bg_type_to_raid_name(u64 flags) |
---|
121 | 157 | { |
---|
122 | | - if (type >= BTRFS_NR_RAID_TYPES) |
---|
| 158 | + const int index = btrfs_bg_flags_to_raid_index(flags); |
---|
| 159 | + |
---|
| 160 | + if (index >= BTRFS_NR_RAID_TYPES) |
---|
123 | 161 | return NULL; |
---|
124 | 162 | |
---|
125 | | - return btrfs_raid_array[type].raid_name; |
---|
| 163 | + return btrfs_raid_array[index].raid_name; |
---|
126 | 164 | } |
---|
127 | 165 | |
---|
128 | | -static int init_first_rw_device(struct btrfs_trans_handle *trans, |
---|
129 | | - struct btrfs_fs_info *fs_info); |
---|
| 166 | +/* |
---|
| 167 | + * Fill @buf with textual description of @bg_flags, no more than @size_buf |
---|
| 168 | + * bytes including terminating null byte. |
---|
| 169 | + */ |
---|
| 170 | +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) |
---|
| 171 | +{ |
---|
| 172 | + int i; |
---|
| 173 | + int ret; |
---|
| 174 | + char *bp = buf; |
---|
| 175 | + u64 flags = bg_flags; |
---|
| 176 | + u32 size_bp = size_buf; |
---|
| 177 | + |
---|
| 178 | + if (!flags) { |
---|
| 179 | + strcpy(bp, "NONE"); |
---|
| 180 | + return; |
---|
| 181 | + } |
---|
| 182 | + |
---|
| 183 | +#define DESCRIBE_FLAG(flag, desc) \ |
---|
| 184 | + do { \ |
---|
| 185 | + if (flags & (flag)) { \ |
---|
| 186 | + ret = snprintf(bp, size_bp, "%s|", (desc)); \ |
---|
| 187 | + if (ret < 0 || ret >= size_bp) \ |
---|
| 188 | + goto out_overflow; \ |
---|
| 189 | + size_bp -= ret; \ |
---|
| 190 | + bp += ret; \ |
---|
| 191 | + flags &= ~(flag); \ |
---|
| 192 | + } \ |
---|
| 193 | + } while (0) |
---|
| 194 | + |
---|
| 195 | + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); |
---|
| 196 | + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); |
---|
| 197 | + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); |
---|
| 198 | + |
---|
| 199 | + DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); |
---|
| 200 | + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) |
---|
| 201 | + DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, |
---|
| 202 | + btrfs_raid_array[i].raid_name); |
---|
| 203 | +#undef DESCRIBE_FLAG |
---|
| 204 | + |
---|
| 205 | + if (flags) { |
---|
| 206 | + ret = snprintf(bp, size_bp, "0x%llx|", flags); |
---|
| 207 | + size_bp -= ret; |
---|
| 208 | + } |
---|
| 209 | + |
---|
| 210 | + if (size_bp < size_buf) |
---|
| 211 | + buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ |
---|
| 212 | + |
---|
| 213 | + /* |
---|
| 214 | + * The text is trimmed, it's up to the caller to provide sufficiently |
---|
| 215 | + * large buffer |
---|
| 216 | + */ |
---|
| 217 | +out_overflow:; |
---|
| 218 | +} |
---|
| 219 | + |
---|
| 220 | +static int init_first_rw_device(struct btrfs_trans_handle *trans); |
---|
130 | 221 | static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); |
---|
131 | | -static void __btrfs_reset_dev_stats(struct btrfs_device *dev); |
---|
132 | 222 | static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); |
---|
133 | 223 | static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); |
---|
134 | 224 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, |
---|
.. | .. |
---|
153 | 243 | * the mutex can be very coarse and can cover long-running operations |
---|
154 | 244 | * |
---|
155 | 245 | * protects: updates to fs_devices counters like missing devices, rw devices, |
---|
156 | | - * seeding, structure cloning, openning/closing devices at mount/umount time |
---|
| 246 | + * seeding, structure cloning, opening/closing devices at mount/umount time |
---|
157 | 247 | * |
---|
158 | 248 | * global::fs_devs - add, remove, updates to the global list |
---|
159 | 249 | * |
---|
.. | .. |
---|
183 | 273 | * chunk_mutex |
---|
184 | 274 | * ----------- |
---|
185 | 275 | * protects chunks, adding or removing during allocation, trim or when a new |
---|
186 | | - * device is added/removed |
---|
| 276 | + * device is added/removed. Additionally it also protects post_commit_list of |
---|
| 277 | + * individual devices, since they can be added to the transaction's |
---|
| 278 | + * post_commit_list only with chunk_mutex held. |
---|
187 | 279 | * |
---|
188 | 280 | * cleaner_mutex |
---|
189 | 281 | * ------------- |
---|
.. | .. |
---|
195 | 287 | * ============ |
---|
196 | 288 | * |
---|
197 | 289 | * uuid_mutex |
---|
198 | | - * volume_mutex |
---|
199 | | - * device_list_mutex |
---|
200 | | - * chunk_mutex |
---|
201 | | - * balance_mutex |
---|
| 290 | + * device_list_mutex |
---|
| 291 | + * chunk_mutex |
---|
| 292 | + * balance_mutex |
---|
202 | 293 | * |
---|
203 | 294 | * |
---|
204 | | - * Exclusive operations, BTRFS_FS_EXCL_OP |
---|
205 | | - * ====================================== |
---|
| 295 | + * Exclusive operations |
---|
| 296 | + * ==================== |
---|
206 | 297 | * |
---|
207 | 298 | * Maintains the exclusivity of the following operations that apply to the |
---|
208 | 299 | * whole filesystem and cannot run in parallel. |
---|
.. | .. |
---|
228 | 319 | * - system power-cycle and filesystem mounted as read-only |
---|
229 | 320 | * - filesystem or device errors leading to forced read-only |
---|
230 | 321 | * |
---|
231 | | - * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. |
---|
232 | | - * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. |
---|
| 322 | + * The status of exclusive operation is set and cleared atomically. |
---|
| 323 | + * During the course of Paused state, fs_info::exclusive_operation remains set. |
---|
233 | 324 | * A device operation in Paused or Running state can be canceled or resumed |
---|
234 | 325 | * either by ioctl (Balance only) or when remounted as read-write. |
---|
235 | | - * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or |
---|
| 326 | + * The exclusive status is cleared when the device operation is canceled or |
---|
236 | 327 | * completed. |
---|
237 | 328 | */ |
---|
238 | 329 | |
---|
239 | 330 | DEFINE_MUTEX(uuid_mutex); |
---|
240 | 331 | static LIST_HEAD(fs_uuids); |
---|
241 | | -struct list_head *btrfs_get_fs_uuids(void) |
---|
| 332 | +struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) |
---|
242 | 333 | { |
---|
243 | 334 | return &fs_uuids; |
---|
244 | 335 | } |
---|
245 | 336 | |
---|
246 | 337 | /* |
---|
247 | 338 | * alloc_fs_devices - allocate struct btrfs_fs_devices |
---|
248 | | - * @fsid: if not NULL, copy the uuid to fs_devices::fsid |
---|
| 339 | + * @fsid: if not NULL, copy the UUID to fs_devices::fsid |
---|
| 340 | + * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid |
---|
249 | 341 | * |
---|
250 | 342 | * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). |
---|
251 | 343 | * The returned struct is not linked onto any lists and can be destroyed with |
---|
252 | 344 | * kfree() right away. |
---|
253 | 345 | */ |
---|
254 | | -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) |
---|
| 346 | +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, |
---|
| 347 | + const u8 *metadata_fsid) |
---|
255 | 348 | { |
---|
256 | 349 | struct btrfs_fs_devices *fs_devs; |
---|
257 | 350 | |
---|
.. | .. |
---|
262 | 355 | mutex_init(&fs_devs->device_list_mutex); |
---|
263 | 356 | |
---|
264 | 357 | INIT_LIST_HEAD(&fs_devs->devices); |
---|
265 | | - INIT_LIST_HEAD(&fs_devs->resized_devices); |
---|
266 | 358 | INIT_LIST_HEAD(&fs_devs->alloc_list); |
---|
267 | 359 | INIT_LIST_HEAD(&fs_devs->fs_list); |
---|
| 360 | + INIT_LIST_HEAD(&fs_devs->seed_list); |
---|
268 | 361 | if (fsid) |
---|
269 | 362 | memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); |
---|
| 363 | + |
---|
| 364 | + if (metadata_fsid) |
---|
| 365 | + memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); |
---|
| 366 | + else if (fsid) |
---|
| 367 | + memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); |
---|
270 | 368 | |
---|
271 | 369 | return fs_devs; |
---|
272 | 370 | } |
---|
273 | 371 | |
---|
274 | 372 | void btrfs_free_device(struct btrfs_device *device) |
---|
275 | 373 | { |
---|
| 374 | + WARN_ON(!list_empty(&device->post_commit_list)); |
---|
276 | 375 | rcu_string_free(device->name); |
---|
| 376 | + extent_io_tree_release(&device->alloc_state); |
---|
277 | 377 | bio_put(device->flush_bio); |
---|
278 | 378 | kfree(device); |
---|
279 | 379 | } |
---|
.. | .. |
---|
281 | 381 | static void free_fs_devices(struct btrfs_fs_devices *fs_devices) |
---|
282 | 382 | { |
---|
283 | 383 | struct btrfs_device *device; |
---|
| 384 | + |
---|
284 | 385 | WARN_ON(fs_devices->opened); |
---|
285 | 386 | while (!list_empty(&fs_devices->devices)) { |
---|
286 | 387 | device = list_entry(fs_devices->devices.next, |
---|
.. | .. |
---|
289 | 390 | btrfs_free_device(device); |
---|
290 | 391 | } |
---|
291 | 392 | kfree(fs_devices); |
---|
292 | | -} |
---|
293 | | - |
---|
294 | | -static void btrfs_kobject_uevent(struct block_device *bdev, |
---|
295 | | - enum kobject_action action) |
---|
296 | | -{ |
---|
297 | | - int ret; |
---|
298 | | - |
---|
299 | | - ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); |
---|
300 | | - if (ret) |
---|
301 | | - pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n", |
---|
302 | | - action, |
---|
303 | | - kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), |
---|
304 | | - &disk_to_dev(bdev->bd_disk)->kobj); |
---|
305 | 393 | } |
---|
306 | 394 | |
---|
307 | 395 | void __exit btrfs_cleanup_fs_uuids(void) |
---|
.. | .. |
---|
321 | 409 | * Returned struct is not linked onto any lists and must be destroyed using |
---|
322 | 410 | * btrfs_free_device. |
---|
323 | 411 | */ |
---|
324 | | -static struct btrfs_device *__alloc_device(void) |
---|
| 412 | +static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) |
---|
325 | 413 | { |
---|
326 | 414 | struct btrfs_device *dev; |
---|
327 | 415 | |
---|
.. | .. |
---|
341 | 429 | |
---|
342 | 430 | INIT_LIST_HEAD(&dev->dev_list); |
---|
343 | 431 | INIT_LIST_HEAD(&dev->dev_alloc_list); |
---|
344 | | - INIT_LIST_HEAD(&dev->resized_list); |
---|
345 | | - |
---|
346 | | - spin_lock_init(&dev->io_lock); |
---|
| 432 | + INIT_LIST_HEAD(&dev->post_commit_list); |
---|
347 | 433 | |
---|
348 | 434 | atomic_set(&dev->reada_in_flight, 0); |
---|
349 | 435 | atomic_set(&dev->dev_stats_ccnt, 0); |
---|
350 | 436 | btrfs_device_data_ordered_init(dev); |
---|
351 | 437 | INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); |
---|
352 | 438 | INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); |
---|
| 439 | + extent_io_tree_init(fs_info, &dev->alloc_state, |
---|
| 440 | + IO_TREE_DEVICE_ALLOC_STATE, NULL); |
---|
353 | 441 | |
---|
354 | 442 | return dev; |
---|
355 | 443 | } |
---|
356 | 444 | |
---|
357 | | -static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) |
---|
| 445 | +static noinline struct btrfs_fs_devices *find_fsid( |
---|
| 446 | + const u8 *fsid, const u8 *metadata_fsid) |
---|
358 | 447 | { |
---|
359 | 448 | struct btrfs_fs_devices *fs_devices; |
---|
360 | 449 | |
---|
| 450 | + ASSERT(fsid); |
---|
| 451 | + |
---|
| 452 | + /* Handle non-split brain cases */ |
---|
361 | 453 | list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
---|
362 | | - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) |
---|
363 | | - return fs_devices; |
---|
| 454 | + if (metadata_fsid) { |
---|
| 455 | + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 |
---|
| 456 | + && memcmp(metadata_fsid, fs_devices->metadata_uuid, |
---|
| 457 | + BTRFS_FSID_SIZE) == 0) |
---|
| 458 | + return fs_devices; |
---|
| 459 | + } else { |
---|
| 460 | + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) |
---|
| 461 | + return fs_devices; |
---|
| 462 | + } |
---|
364 | 463 | } |
---|
365 | 464 | return NULL; |
---|
366 | 465 | } |
---|
367 | 466 | |
---|
| 467 | +static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( |
---|
| 468 | + struct btrfs_super_block *disk_super) |
---|
| 469 | +{ |
---|
| 470 | + |
---|
| 471 | + struct btrfs_fs_devices *fs_devices; |
---|
| 472 | + |
---|
| 473 | + /* |
---|
| 474 | + * Handle scanned device having completed its fsid change but |
---|
| 475 | + * belonging to a fs_devices that was created by first scanning |
---|
| 476 | + * a device which didn't have its fsid/metadata_uuid changed |
---|
| 477 | + * at all and the CHANGING_FSID_V2 flag set. |
---|
| 478 | + */ |
---|
| 479 | + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
---|
| 480 | + if (fs_devices->fsid_change && |
---|
| 481 | + memcmp(disk_super->metadata_uuid, fs_devices->fsid, |
---|
| 482 | + BTRFS_FSID_SIZE) == 0 && |
---|
| 483 | + memcmp(fs_devices->fsid, fs_devices->metadata_uuid, |
---|
| 484 | + BTRFS_FSID_SIZE) == 0) { |
---|
| 485 | + return fs_devices; |
---|
| 486 | + } |
---|
| 487 | + } |
---|
| 488 | + /* |
---|
| 489 | + * Handle scanned device having completed its fsid change but |
---|
| 490 | + * belonging to a fs_devices that was created by a device that |
---|
| 491 | + * has an outdated pair of fsid/metadata_uuid and |
---|
| 492 | + * CHANGING_FSID_V2 flag set. |
---|
| 493 | + */ |
---|
| 494 | + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
---|
| 495 | + if (fs_devices->fsid_change && |
---|
| 496 | + memcmp(fs_devices->metadata_uuid, |
---|
| 497 | + fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && |
---|
| 498 | + memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, |
---|
| 499 | + BTRFS_FSID_SIZE) == 0) { |
---|
| 500 | + return fs_devices; |
---|
| 501 | + } |
---|
| 502 | + } |
---|
| 503 | + |
---|
| 504 | + return find_fsid(disk_super->fsid, disk_super->metadata_uuid); |
---|
| 505 | +} |
---|
| 506 | + |
---|
| 507 | + |
---|
368 | 508 | static int |
---|
369 | 509 | btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, |
---|
370 | 510 | int flush, struct block_device **bdev, |
---|
371 | | - struct buffer_head **bh) |
---|
| 511 | + struct btrfs_super_block **disk_super) |
---|
372 | 512 | { |
---|
373 | 513 | int ret; |
---|
374 | 514 | |
---|
.. | .. |
---|
387 | 527 | goto error; |
---|
388 | 528 | } |
---|
389 | 529 | invalidate_bdev(*bdev); |
---|
390 | | - *bh = btrfs_read_dev_super(*bdev); |
---|
391 | | - if (IS_ERR(*bh)) { |
---|
392 | | - ret = PTR_ERR(*bh); |
---|
| 530 | + *disk_super = btrfs_read_dev_super(*bdev); |
---|
| 531 | + if (IS_ERR(*disk_super)) { |
---|
| 532 | + ret = PTR_ERR(*disk_super); |
---|
393 | 533 | blkdev_put(*bdev, flags); |
---|
394 | 534 | goto error; |
---|
395 | 535 | } |
---|
.. | .. |
---|
398 | 538 | |
---|
399 | 539 | error: |
---|
400 | 540 | *bdev = NULL; |
---|
401 | | - *bh = NULL; |
---|
402 | 541 | return ret; |
---|
403 | 542 | } |
---|
404 | 543 | |
---|
405 | | -static void requeue_list(struct btrfs_pending_bios *pending_bios, |
---|
406 | | - struct bio *head, struct bio *tail) |
---|
407 | | -{ |
---|
408 | | - |
---|
409 | | - struct bio *old_head; |
---|
410 | | - |
---|
411 | | - old_head = pending_bios->head; |
---|
412 | | - pending_bios->head = head; |
---|
413 | | - if (pending_bios->tail) |
---|
414 | | - tail->bi_next = old_head; |
---|
415 | | - else |
---|
416 | | - pending_bios->tail = tail; |
---|
417 | | -} |
---|
418 | | - |
---|
419 | 544 | /* |
---|
420 | | - * we try to collect pending bios for a device so we don't get a large |
---|
421 | | - * number of procs sending bios down to the same device. This greatly |
---|
422 | | - * improves the schedulers ability to collect and merge the bios. |
---|
| 545 | + * Check if the device in the path matches the device in the given struct device. |
---|
423 | 546 | * |
---|
424 | | - * But, it also turns into a long list of bios to process and that is sure |
---|
425 | | - * to eventually make the worker thread block. The solution here is to |
---|
426 | | - * make some progress and then put this work struct back at the end of |
---|
427 | | - * the list if the block device is congested. This way, multiple devices |
---|
428 | | - * can make progress from a single worker thread. |
---|
| 547 | + * Returns: |
---|
| 548 | + * true If it is the same device. |
---|
| 549 | + * false If it is not the same device or on error. |
---|
429 | 550 | */ |
---|
430 | | -static noinline void run_scheduled_bios(struct btrfs_device *device) |
---|
| 551 | +static bool device_matched(const struct btrfs_device *device, const char *path) |
---|
431 | 552 | { |
---|
432 | | - struct btrfs_fs_info *fs_info = device->fs_info; |
---|
433 | | - struct bio *pending; |
---|
434 | | - struct backing_dev_info *bdi; |
---|
435 | | - struct btrfs_pending_bios *pending_bios; |
---|
436 | | - struct bio *tail; |
---|
437 | | - struct bio *cur; |
---|
438 | | - int again = 0; |
---|
439 | | - unsigned long num_run; |
---|
440 | | - unsigned long batch_run = 0; |
---|
441 | | - unsigned long last_waited = 0; |
---|
442 | | - int force_reg = 0; |
---|
443 | | - int sync_pending = 0; |
---|
444 | | - struct blk_plug plug; |
---|
| 553 | + char *device_name; |
---|
| 554 | + struct block_device *bdev_old; |
---|
| 555 | + struct block_device *bdev_new; |
---|
445 | 556 | |
---|
446 | 557 | /* |
---|
447 | | - * this function runs all the bios we've collected for |
---|
448 | | - * a particular device. We don't want to wander off to |
---|
449 | | - * another device without first sending all of these down. |
---|
450 | | - * So, setup a plug here and finish it off before we return |
---|
| 558 | + * If we are looking for a device with the matching dev_t, then skip |
---|
| 559 | + * device without a name (a missing device). |
---|
451 | 560 | */ |
---|
452 | | - blk_start_plug(&plug); |
---|
| 561 | + if (!device->name) |
---|
| 562 | + return false; |
---|
453 | 563 | |
---|
454 | | - bdi = device->bdev->bd_bdi; |
---|
| 564 | + device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); |
---|
| 565 | + if (!device_name) |
---|
| 566 | + return false; |
---|
455 | 567 | |
---|
456 | | -loop: |
---|
457 | | - spin_lock(&device->io_lock); |
---|
| 568 | + rcu_read_lock(); |
---|
| 569 | + scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name)); |
---|
| 570 | + rcu_read_unlock(); |
---|
458 | 571 | |
---|
459 | | -loop_lock: |
---|
460 | | - num_run = 0; |
---|
| 572 | + bdev_old = lookup_bdev(device_name); |
---|
| 573 | + kfree(device_name); |
---|
| 574 | + if (IS_ERR(bdev_old)) |
---|
| 575 | + return false; |
---|
461 | 576 | |
---|
462 | | - /* take all the bios off the list at once and process them |
---|
463 | | - * later on (without the lock held). But, remember the |
---|
464 | | - * tail and other pointers so the bios can be properly reinserted |
---|
465 | | - * into the list if we hit congestion |
---|
466 | | - */ |
---|
467 | | - if (!force_reg && device->pending_sync_bios.head) { |
---|
468 | | - pending_bios = &device->pending_sync_bios; |
---|
469 | | - force_reg = 1; |
---|
470 | | - } else { |
---|
471 | | - pending_bios = &device->pending_bios; |
---|
472 | | - force_reg = 0; |
---|
473 | | - } |
---|
| 577 | + bdev_new = lookup_bdev(path); |
---|
| 578 | + if (IS_ERR(bdev_new)) |
---|
| 579 | + return false; |
---|
474 | 580 | |
---|
475 | | - pending = pending_bios->head; |
---|
476 | | - tail = pending_bios->tail; |
---|
477 | | - WARN_ON(pending && !tail); |
---|
| 581 | + if (bdev_old == bdev_new) |
---|
| 582 | + return true; |
---|
478 | 583 | |
---|
479 | | - /* |
---|
480 | | - * if pending was null this time around, no bios need processing |
---|
481 | | - * at all and we can stop. Otherwise it'll loop back up again |
---|
482 | | - * and do an additional check so no bios are missed. |
---|
483 | | - * |
---|
484 | | - * device->running_pending is used to synchronize with the |
---|
485 | | - * schedule_bio code. |
---|
486 | | - */ |
---|
487 | | - if (device->pending_sync_bios.head == NULL && |
---|
488 | | - device->pending_bios.head == NULL) { |
---|
489 | | - again = 0; |
---|
490 | | - device->running_pending = 0; |
---|
491 | | - } else { |
---|
492 | | - again = 1; |
---|
493 | | - device->running_pending = 1; |
---|
494 | | - } |
---|
495 | | - |
---|
496 | | - pending_bios->head = NULL; |
---|
497 | | - pending_bios->tail = NULL; |
---|
498 | | - |
---|
499 | | - spin_unlock(&device->io_lock); |
---|
500 | | - |
---|
501 | | - while (pending) { |
---|
502 | | - |
---|
503 | | - rmb(); |
---|
504 | | - /* we want to work on both lists, but do more bios on the |
---|
505 | | - * sync list than the regular list |
---|
506 | | - */ |
---|
507 | | - if ((num_run > 32 && |
---|
508 | | - pending_bios != &device->pending_sync_bios && |
---|
509 | | - device->pending_sync_bios.head) || |
---|
510 | | - (num_run > 64 && pending_bios == &device->pending_sync_bios && |
---|
511 | | - device->pending_bios.head)) { |
---|
512 | | - spin_lock(&device->io_lock); |
---|
513 | | - requeue_list(pending_bios, pending, tail); |
---|
514 | | - goto loop_lock; |
---|
515 | | - } |
---|
516 | | - |
---|
517 | | - cur = pending; |
---|
518 | | - pending = pending->bi_next; |
---|
519 | | - cur->bi_next = NULL; |
---|
520 | | - |
---|
521 | | - BUG_ON(atomic_read(&cur->__bi_cnt) == 0); |
---|
522 | | - |
---|
523 | | - /* |
---|
524 | | - * if we're doing the sync list, record that our |
---|
525 | | - * plug has some sync requests on it |
---|
526 | | - * |
---|
527 | | - * If we're doing the regular list and there are |
---|
528 | | - * sync requests sitting around, unplug before |
---|
529 | | - * we add more |
---|
530 | | - */ |
---|
531 | | - if (pending_bios == &device->pending_sync_bios) { |
---|
532 | | - sync_pending = 1; |
---|
533 | | - } else if (sync_pending) { |
---|
534 | | - blk_finish_plug(&plug); |
---|
535 | | - blk_start_plug(&plug); |
---|
536 | | - sync_pending = 0; |
---|
537 | | - } |
---|
538 | | - |
---|
539 | | - btrfsic_submit_bio(cur); |
---|
540 | | - num_run++; |
---|
541 | | - batch_run++; |
---|
542 | | - |
---|
543 | | - cond_resched(); |
---|
544 | | - |
---|
545 | | - /* |
---|
546 | | - * we made progress, there is more work to do and the bdi |
---|
547 | | - * is now congested. Back off and let other work structs |
---|
548 | | - * run instead |
---|
549 | | - */ |
---|
550 | | - if (pending && bdi_write_congested(bdi) && batch_run > 8 && |
---|
551 | | - fs_info->fs_devices->open_devices > 1) { |
---|
552 | | - struct io_context *ioc; |
---|
553 | | - |
---|
554 | | - ioc = current->io_context; |
---|
555 | | - |
---|
556 | | - /* |
---|
557 | | - * the main goal here is that we don't want to |
---|
558 | | - * block if we're going to be able to submit |
---|
559 | | - * more requests without blocking. |
---|
560 | | - * |
---|
561 | | - * This code does two great things, it pokes into |
---|
562 | | - * the elevator code from a filesystem _and_ |
---|
563 | | - * it makes assumptions about how batching works. |
---|
564 | | - */ |
---|
565 | | - if (ioc && ioc->nr_batch_requests > 0 && |
---|
566 | | - time_before(jiffies, ioc->last_waited + HZ/50UL) && |
---|
567 | | - (last_waited == 0 || |
---|
568 | | - ioc->last_waited == last_waited)) { |
---|
569 | | - /* |
---|
570 | | - * we want to go through our batch of |
---|
571 | | - * requests and stop. So, we copy out |
---|
572 | | - * the ioc->last_waited time and test |
---|
573 | | - * against it before looping |
---|
574 | | - */ |
---|
575 | | - last_waited = ioc->last_waited; |
---|
576 | | - cond_resched(); |
---|
577 | | - continue; |
---|
578 | | - } |
---|
579 | | - spin_lock(&device->io_lock); |
---|
580 | | - requeue_list(pending_bios, pending, tail); |
---|
581 | | - device->running_pending = 1; |
---|
582 | | - |
---|
583 | | - spin_unlock(&device->io_lock); |
---|
584 | | - btrfs_queue_work(fs_info->submit_workers, |
---|
585 | | - &device->work); |
---|
586 | | - goto done; |
---|
587 | | - } |
---|
588 | | - } |
---|
589 | | - |
---|
590 | | - cond_resched(); |
---|
591 | | - if (again) |
---|
592 | | - goto loop; |
---|
593 | | - |
---|
594 | | - spin_lock(&device->io_lock); |
---|
595 | | - if (device->pending_bios.head || device->pending_sync_bios.head) |
---|
596 | | - goto loop_lock; |
---|
597 | | - spin_unlock(&device->io_lock); |
---|
598 | | - |
---|
599 | | -done: |
---|
600 | | - blk_finish_plug(&plug); |
---|
601 | | -} |
---|
602 | | - |
---|
603 | | -static void pending_bios_fn(struct btrfs_work *work) |
---|
604 | | -{ |
---|
605 | | - struct btrfs_device *device; |
---|
606 | | - |
---|
607 | | - device = container_of(work, struct btrfs_device, work); |
---|
608 | | - run_scheduled_bios(device); |
---|
| 584 | + return false; |
---|
609 | 585 | } |
---|
610 | 586 | |
---|
611 | 587 | /* |
---|
.. | .. |
---|
615 | 591 | * matching this path only. |
---|
616 | 592 | * skip_dev: Optional. Will skip this device when searching for the stale |
---|
617 | 593 | * devices. |
---|
| 594 | + * Return: 0 for success or if @path is NULL. |
---|
| 595 | + * -EBUSY if @path is a mounted device. |
---|
| 596 | + * -ENOENT if @path does not match any device in the list. |
---|
618 | 597 | */ |
---|
619 | | -static void btrfs_free_stale_devices(const char *path, |
---|
| 598 | +static int btrfs_free_stale_devices(const char *path, |
---|
620 | 599 | struct btrfs_device *skip_device) |
---|
621 | 600 | { |
---|
622 | 601 | struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; |
---|
623 | 602 | struct btrfs_device *device, *tmp_device; |
---|
| 603 | + int ret = 0; |
---|
| 604 | + |
---|
| 605 | + lockdep_assert_held(&uuid_mutex); |
---|
| 606 | + |
---|
| 607 | + if (path) |
---|
| 608 | + ret = -ENOENT; |
---|
624 | 609 | |
---|
625 | 610 | list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { |
---|
626 | | - mutex_lock(&fs_devices->device_list_mutex); |
---|
627 | | - if (fs_devices->opened) { |
---|
628 | | - mutex_unlock(&fs_devices->device_list_mutex); |
---|
629 | | - continue; |
---|
630 | | - } |
---|
631 | 611 | |
---|
| 612 | + mutex_lock(&fs_devices->device_list_mutex); |
---|
632 | 613 | list_for_each_entry_safe(device, tmp_device, |
---|
633 | 614 | &fs_devices->devices, dev_list) { |
---|
634 | | - int not_found = 0; |
---|
635 | | - |
---|
636 | 615 | if (skip_device && skip_device == device) |
---|
637 | 616 | continue; |
---|
638 | | - if (path && !device->name) |
---|
| 617 | + if (path && !device_matched(device, path)) |
---|
639 | 618 | continue; |
---|
640 | | - |
---|
641 | | - rcu_read_lock(); |
---|
642 | | - if (path) |
---|
643 | | - not_found = strcmp(rcu_str_deref(device->name), |
---|
644 | | - path); |
---|
645 | | - rcu_read_unlock(); |
---|
646 | | - if (not_found) |
---|
647 | | - continue; |
---|
| 619 | + if (fs_devices->opened) { |
---|
| 620 | + /* for an already deleted device return 0 */ |
---|
| 621 | + if (path && ret != 0) |
---|
| 622 | + ret = -EBUSY; |
---|
| 623 | + break; |
---|
| 624 | + } |
---|
648 | 625 | |
---|
649 | 626 | /* delete the stale device */ |
---|
650 | 627 | fs_devices->num_devices--; |
---|
651 | 628 | list_del(&device->dev_list); |
---|
652 | 629 | btrfs_free_device(device); |
---|
653 | 630 | |
---|
654 | | - if (fs_devices->num_devices == 0) |
---|
655 | | - break; |
---|
| 631 | + ret = 0; |
---|
656 | 632 | } |
---|
657 | 633 | mutex_unlock(&fs_devices->device_list_mutex); |
---|
| 634 | + |
---|
658 | 635 | if (fs_devices->num_devices == 0) { |
---|
659 | 636 | btrfs_sysfs_remove_fsid(fs_devices); |
---|
660 | 637 | list_del(&fs_devices->fs_list); |
---|
661 | 638 | free_fs_devices(fs_devices); |
---|
662 | 639 | } |
---|
663 | 640 | } |
---|
| 641 | + |
---|
| 642 | + return ret; |
---|
664 | 643 | } |
---|
665 | 644 | |
---|
666 | 645 | /* |
---|
.. | .. |
---|
674 | 653 | { |
---|
675 | 654 | struct request_queue *q; |
---|
676 | 655 | struct block_device *bdev; |
---|
677 | | - struct buffer_head *bh; |
---|
678 | 656 | struct btrfs_super_block *disk_super; |
---|
679 | 657 | u64 devid; |
---|
680 | 658 | int ret; |
---|
.. | .. |
---|
685 | 663 | return -EINVAL; |
---|
686 | 664 | |
---|
687 | 665 | ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, |
---|
688 | | - &bdev, &bh); |
---|
| 666 | + &bdev, &disk_super); |
---|
689 | 667 | if (ret) |
---|
690 | 668 | return ret; |
---|
691 | 669 | |
---|
692 | | - disk_super = (struct btrfs_super_block *)bh->b_data; |
---|
693 | 670 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
---|
694 | 671 | if (devid != device->devid) |
---|
695 | | - goto error_brelse; |
---|
| 672 | + goto error_free_page; |
---|
696 | 673 | |
---|
697 | 674 | if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) |
---|
698 | | - goto error_brelse; |
---|
| 675 | + goto error_free_page; |
---|
699 | 676 | |
---|
700 | 677 | device->generation = btrfs_super_generation(disk_super); |
---|
701 | 678 | |
---|
702 | 679 | if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { |
---|
| 680 | + if (btrfs_super_incompat_flags(disk_super) & |
---|
| 681 | + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { |
---|
| 682 | + pr_err( |
---|
| 683 | + "BTRFS: Invalid seeding and uuid-changed device detected\n"); |
---|
| 684 | + goto error_free_page; |
---|
| 685 | + } |
---|
| 686 | + |
---|
703 | 687 | clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); |
---|
704 | | - fs_devices->seeding = 1; |
---|
| 688 | + fs_devices->seeding = true; |
---|
705 | 689 | } else { |
---|
706 | 690 | if (bdev_read_only(bdev)) |
---|
707 | 691 | clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); |
---|
.. | .. |
---|
711 | 695 | |
---|
712 | 696 | q = bdev_get_queue(bdev); |
---|
713 | 697 | if (!blk_queue_nonrot(q)) |
---|
714 | | - fs_devices->rotating = 1; |
---|
| 698 | + fs_devices->rotating = true; |
---|
715 | 699 | |
---|
716 | 700 | device->bdev = bdev; |
---|
717 | 701 | clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); |
---|
.. | .. |
---|
723 | 707 | fs_devices->rw_devices++; |
---|
724 | 708 | list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); |
---|
725 | 709 | } |
---|
726 | | - brelse(bh); |
---|
| 710 | + btrfs_release_disk_super(disk_super); |
---|
727 | 711 | |
---|
728 | 712 | return 0; |
---|
729 | 713 | |
---|
730 | | -error_brelse: |
---|
731 | | - brelse(bh); |
---|
| 714 | +error_free_page: |
---|
| 715 | + btrfs_release_disk_super(disk_super); |
---|
732 | 716 | blkdev_put(bdev, flags); |
---|
733 | 717 | |
---|
734 | 718 | return -EINVAL; |
---|
735 | 719 | } |
---|
736 | 720 | |
---|
| 721 | +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) |
---|
| 722 | +{ |
---|
| 723 | + bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & |
---|
| 724 | + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); |
---|
| 725 | + |
---|
| 726 | + return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; |
---|
| 727 | +} |
---|
| 728 | + |
---|
| 729 | +/* |
---|
| 730 | + * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices |
---|
| 731 | + * being created with a disk that has already completed its fsid change. Such |
---|
| 732 | + * disk can belong to an fs which has its FSID changed or to one which doesn't. |
---|
| 733 | + * Handle both cases here. |
---|
| 734 | + */ |
---|
| 735 | +static struct btrfs_fs_devices *find_fsid_inprogress( |
---|
| 736 | + struct btrfs_super_block *disk_super) |
---|
| 737 | +{ |
---|
| 738 | + struct btrfs_fs_devices *fs_devices; |
---|
| 739 | + |
---|
| 740 | + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
---|
| 741 | + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, |
---|
| 742 | + BTRFS_FSID_SIZE) != 0 && |
---|
| 743 | + memcmp(fs_devices->metadata_uuid, disk_super->fsid, |
---|
| 744 | + BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { |
---|
| 745 | + return fs_devices; |
---|
| 746 | + } |
---|
| 747 | + } |
---|
| 748 | + |
---|
| 749 | + return find_fsid(disk_super->fsid, NULL); |
---|
| 750 | +} |
---|
| 751 | + |
---|
| 752 | + |
---|
| 753 | +static struct btrfs_fs_devices *find_fsid_changed( |
---|
| 754 | + struct btrfs_super_block *disk_super) |
---|
| 755 | +{ |
---|
| 756 | + struct btrfs_fs_devices *fs_devices; |
---|
| 757 | + |
---|
| 758 | + /* |
---|
| 759 | + * Handles the case where scanned device is part of an fs that had |
---|
| 760 | + * multiple successful changes of FSID but curently device didn't |
---|
| 761 | + * observe it. Meaning our fsid will be different than theirs. We need |
---|
| 762 | + * to handle two subcases : |
---|
| 763 | + * 1 - The fs still continues to have different METADATA/FSID uuids. |
---|
| 764 | + * 2 - The fs is switched back to its original FSID (METADATA/FSID |
---|
| 765 | + * are equal). |
---|
| 766 | + */ |
---|
| 767 | + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
---|
| 768 | + /* Changed UUIDs */ |
---|
| 769 | + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, |
---|
| 770 | + BTRFS_FSID_SIZE) != 0 && |
---|
| 771 | + memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, |
---|
| 772 | + BTRFS_FSID_SIZE) == 0 && |
---|
| 773 | + memcmp(fs_devices->fsid, disk_super->fsid, |
---|
| 774 | + BTRFS_FSID_SIZE) != 0) |
---|
| 775 | + return fs_devices; |
---|
| 776 | + |
---|
| 777 | + /* Unchanged UUIDs */ |
---|
| 778 | + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, |
---|
| 779 | + BTRFS_FSID_SIZE) == 0 && |
---|
| 780 | + memcmp(fs_devices->fsid, disk_super->metadata_uuid, |
---|
| 781 | + BTRFS_FSID_SIZE) == 0) |
---|
| 782 | + return fs_devices; |
---|
| 783 | + } |
---|
| 784 | + |
---|
| 785 | + return NULL; |
---|
| 786 | +} |
---|
| 787 | + |
---|
| 788 | +static struct btrfs_fs_devices *find_fsid_reverted_metadata( |
---|
| 789 | + struct btrfs_super_block *disk_super) |
---|
| 790 | +{ |
---|
| 791 | + struct btrfs_fs_devices *fs_devices; |
---|
| 792 | + |
---|
| 793 | + /* |
---|
| 794 | + * Handle the case where the scanned device is part of an fs whose last |
---|
| 795 | + * metadata UUID change reverted it to the original FSID. At the same |
---|
| 796 | + * time * fs_devices was first created by another constitutent device |
---|
| 797 | + * which didn't fully observe the operation. This results in an |
---|
| 798 | + * btrfs_fs_devices created with metadata/fsid different AND |
---|
| 799 | + * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the |
---|
| 800 | + * fs_devices equal to the FSID of the disk. |
---|
| 801 | + */ |
---|
| 802 | + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { |
---|
| 803 | + if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, |
---|
| 804 | + BTRFS_FSID_SIZE) != 0 && |
---|
| 805 | + memcmp(fs_devices->metadata_uuid, disk_super->fsid, |
---|
| 806 | + BTRFS_FSID_SIZE) == 0 && |
---|
| 807 | + fs_devices->fsid_change) |
---|
| 808 | + return fs_devices; |
---|
| 809 | + } |
---|
| 810 | + |
---|
| 811 | + return NULL; |
---|
| 812 | +} |
---|
737 | 813 | /* |
---|
738 | 814 | * Add new device to list of registered devices |
---|
739 | 815 | * |
---|
.. | .. |
---|
746 | 822 | bool *new_device_added) |
---|
747 | 823 | { |
---|
748 | 824 | struct btrfs_device *device; |
---|
749 | | - struct btrfs_fs_devices *fs_devices; |
---|
| 825 | + struct btrfs_fs_devices *fs_devices = NULL; |
---|
750 | 826 | struct rcu_string *name; |
---|
751 | 827 | u64 found_transid = btrfs_super_generation(disk_super); |
---|
752 | 828 | u64 devid = btrfs_stack_device_id(&disk_super->dev_item); |
---|
| 829 | + bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & |
---|
| 830 | + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); |
---|
| 831 | + bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & |
---|
| 832 | + BTRFS_SUPER_FLAG_CHANGING_FSID_V2); |
---|
753 | 833 | |
---|
754 | | - fs_devices = find_fsid(disk_super->fsid); |
---|
| 834 | + if (fsid_change_in_progress) { |
---|
| 835 | + if (!has_metadata_uuid) |
---|
| 836 | + fs_devices = find_fsid_inprogress(disk_super); |
---|
| 837 | + else |
---|
| 838 | + fs_devices = find_fsid_changed(disk_super); |
---|
| 839 | + } else if (has_metadata_uuid) { |
---|
| 840 | + fs_devices = find_fsid_with_metadata_uuid(disk_super); |
---|
| 841 | + } else { |
---|
| 842 | + fs_devices = find_fsid_reverted_metadata(disk_super); |
---|
| 843 | + if (!fs_devices) |
---|
| 844 | + fs_devices = find_fsid(disk_super->fsid, NULL); |
---|
| 845 | + } |
---|
| 846 | + |
---|
| 847 | + |
---|
755 | 848 | if (!fs_devices) { |
---|
756 | | - fs_devices = alloc_fs_devices(disk_super->fsid); |
---|
| 849 | + if (has_metadata_uuid) |
---|
| 850 | + fs_devices = alloc_fs_devices(disk_super->fsid, |
---|
| 851 | + disk_super->metadata_uuid); |
---|
| 852 | + else |
---|
| 853 | + fs_devices = alloc_fs_devices(disk_super->fsid, NULL); |
---|
| 854 | + |
---|
757 | 855 | if (IS_ERR(fs_devices)) |
---|
758 | 856 | return ERR_CAST(fs_devices); |
---|
| 857 | + |
---|
| 858 | + fs_devices->fsid_change = fsid_change_in_progress; |
---|
759 | 859 | |
---|
760 | 860 | mutex_lock(&fs_devices->device_list_mutex); |
---|
761 | 861 | list_add(&fs_devices->fs_list, &fs_uuids); |
---|
.. | .. |
---|
765 | 865 | mutex_lock(&fs_devices->device_list_mutex); |
---|
766 | 866 | device = btrfs_find_device(fs_devices, devid, |
---|
767 | 867 | disk_super->dev_item.uuid, NULL, false); |
---|
| 868 | + |
---|
| 869 | + /* |
---|
| 870 | + * If this disk has been pulled into an fs devices created by |
---|
| 871 | + * a device which had the CHANGING_FSID_V2 flag then replace the |
---|
| 872 | + * metadata_uuid/fsid values of the fs_devices. |
---|
| 873 | + */ |
---|
| 874 | + if (fs_devices->fsid_change && |
---|
| 875 | + found_transid > fs_devices->latest_generation) { |
---|
| 876 | + memcpy(fs_devices->fsid, disk_super->fsid, |
---|
| 877 | + BTRFS_FSID_SIZE); |
---|
| 878 | + |
---|
| 879 | + if (has_metadata_uuid) |
---|
| 880 | + memcpy(fs_devices->metadata_uuid, |
---|
| 881 | + disk_super->metadata_uuid, |
---|
| 882 | + BTRFS_FSID_SIZE); |
---|
| 883 | + else |
---|
| 884 | + memcpy(fs_devices->metadata_uuid, |
---|
| 885 | + disk_super->fsid, BTRFS_FSID_SIZE); |
---|
| 886 | + |
---|
| 887 | + fs_devices->fsid_change = false; |
---|
| 888 | + } |
---|
768 | 889 | } |
---|
769 | 890 | |
---|
770 | 891 | if (!device) { |
---|
.. | .. |
---|
796 | 917 | *new_device_added = true; |
---|
797 | 918 | |
---|
798 | 919 | if (disk_super->label[0]) |
---|
799 | | - pr_info("BTRFS: device label %s devid %llu transid %llu %s\n", |
---|
800 | | - disk_super->label, devid, found_transid, path); |
---|
| 920 | + pr_info( |
---|
| 921 | + "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", |
---|
| 922 | + disk_super->label, devid, found_transid, path, |
---|
| 923 | + current->comm, task_pid_nr(current)); |
---|
801 | 924 | else |
---|
802 | | - pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n", |
---|
803 | | - disk_super->fsid, devid, found_transid, path); |
---|
| 925 | + pr_info( |
---|
| 926 | + "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", |
---|
| 927 | + disk_super->fsid, devid, found_transid, path, |
---|
| 928 | + current->comm, task_pid_nr(current)); |
---|
804 | 929 | |
---|
805 | 930 | } else if (!device->name || strcmp(device->name->str, path)) { |
---|
806 | 931 | /* |
---|
.. | .. |
---|
897 | 1022 | * it back. We need it to pick the disk with largest generation |
---|
898 | 1023 | * (as above). |
---|
899 | 1024 | */ |
---|
900 | | - if (!fs_devices->opened) |
---|
| 1025 | + if (!fs_devices->opened) { |
---|
901 | 1026 | device->generation = found_transid; |
---|
| 1027 | + fs_devices->latest_generation = max_t(u64, found_transid, |
---|
| 1028 | + fs_devices->latest_generation); |
---|
| 1029 | + } |
---|
902 | 1030 | |
---|
903 | 1031 | fs_devices->total_devices = btrfs_super_num_devices(disk_super); |
---|
904 | 1032 | |
---|
.. | .. |
---|
911 | 1039 | struct btrfs_fs_devices *fs_devices; |
---|
912 | 1040 | struct btrfs_device *device; |
---|
913 | 1041 | struct btrfs_device *orig_dev; |
---|
| 1042 | + int ret = 0; |
---|
914 | 1043 | |
---|
915 | | - fs_devices = alloc_fs_devices(orig->fsid); |
---|
| 1044 | + lockdep_assert_held(&uuid_mutex); |
---|
| 1045 | + |
---|
| 1046 | + fs_devices = alloc_fs_devices(orig->fsid, NULL); |
---|
916 | 1047 | if (IS_ERR(fs_devices)) |
---|
917 | 1048 | return fs_devices; |
---|
918 | 1049 | |
---|
919 | | - mutex_lock(&orig->device_list_mutex); |
---|
920 | 1050 | fs_devices->total_devices = orig->total_devices; |
---|
921 | 1051 | |
---|
922 | | - /* We have held the volume lock, it is safe to get the devices. */ |
---|
923 | 1052 | list_for_each_entry(orig_dev, &orig->devices, dev_list) { |
---|
924 | 1053 | struct rcu_string *name; |
---|
925 | 1054 | |
---|
926 | 1055 | device = btrfs_alloc_device(NULL, &orig_dev->devid, |
---|
927 | 1056 | orig_dev->uuid); |
---|
928 | | - if (IS_ERR(device)) |
---|
| 1057 | + if (IS_ERR(device)) { |
---|
| 1058 | + ret = PTR_ERR(device); |
---|
929 | 1059 | goto error; |
---|
| 1060 | + } |
---|
930 | 1061 | |
---|
931 | 1062 | /* |
---|
932 | 1063 | * This is ok to do without rcu read locked because we hold the |
---|
.. | .. |
---|
937 | 1068 | GFP_KERNEL); |
---|
938 | 1069 | if (!name) { |
---|
939 | 1070 | btrfs_free_device(device); |
---|
| 1071 | + ret = -ENOMEM; |
---|
940 | 1072 | goto error; |
---|
941 | 1073 | } |
---|
942 | 1074 | rcu_assign_pointer(device->name, name); |
---|
.. | .. |
---|
946 | 1078 | device->fs_devices = fs_devices; |
---|
947 | 1079 | fs_devices->num_devices++; |
---|
948 | 1080 | } |
---|
949 | | - mutex_unlock(&orig->device_list_mutex); |
---|
950 | 1081 | return fs_devices; |
---|
951 | 1082 | error: |
---|
952 | | - mutex_unlock(&orig->device_list_mutex); |
---|
953 | 1083 | free_fs_devices(fs_devices); |
---|
954 | | - return ERR_PTR(-ENOMEM); |
---|
| 1084 | + return ERR_PTR(ret); |
---|
955 | 1085 | } |
---|
956 | 1086 | |
---|
957 | | -/* |
---|
958 | | - * After we have read the system tree and know devids belonging to |
---|
959 | | - * this filesystem, remove the device which does not belong there. |
---|
960 | | - */ |
---|
961 | | -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) |
---|
| 1087 | +static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, |
---|
| 1088 | + int step, struct btrfs_device **latest_dev) |
---|
962 | 1089 | { |
---|
963 | 1090 | struct btrfs_device *device, *next; |
---|
964 | | - struct btrfs_device *latest_dev = NULL; |
---|
965 | 1091 | |
---|
966 | | - mutex_lock(&uuid_mutex); |
---|
967 | | -again: |
---|
968 | 1092 | /* This is the initialized path, it is safe to release the devices. */ |
---|
969 | 1093 | list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { |
---|
970 | | - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, |
---|
971 | | - &device->dev_state)) { |
---|
| 1094 | + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { |
---|
972 | 1095 | if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, |
---|
973 | | - &device->dev_state) && |
---|
| 1096 | + &device->dev_state) && |
---|
974 | 1097 | !test_bit(BTRFS_DEV_STATE_MISSING, |
---|
975 | 1098 | &device->dev_state) && |
---|
976 | | - (!latest_dev || |
---|
977 | | - device->generation > latest_dev->generation)) { |
---|
978 | | - latest_dev = device; |
---|
| 1099 | + (!*latest_dev || |
---|
| 1100 | + device->generation > (*latest_dev)->generation)) { |
---|
| 1101 | + *latest_dev = device; |
---|
979 | 1102 | } |
---|
980 | 1103 | continue; |
---|
981 | 1104 | } |
---|
.. | .. |
---|
1002 | 1125 | btrfs_free_device(device); |
---|
1003 | 1126 | } |
---|
1004 | 1127 | |
---|
1005 | | - if (fs_devices->seed) { |
---|
1006 | | - fs_devices = fs_devices->seed; |
---|
1007 | | - goto again; |
---|
1008 | | - } |
---|
| 1128 | +} |
---|
| 1129 | + |
---|
| 1130 | +/* |
---|
| 1131 | + * After we have read the system tree and know devids belonging to this |
---|
| 1132 | + * filesystem, remove the device which does not belong there. |
---|
| 1133 | + */ |
---|
| 1134 | +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) |
---|
| 1135 | +{ |
---|
| 1136 | + struct btrfs_device *latest_dev = NULL; |
---|
| 1137 | + struct btrfs_fs_devices *seed_dev; |
---|
| 1138 | + |
---|
| 1139 | + mutex_lock(&uuid_mutex); |
---|
| 1140 | + __btrfs_free_extra_devids(fs_devices, step, &latest_dev); |
---|
| 1141 | + |
---|
| 1142 | + list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) |
---|
| 1143 | + __btrfs_free_extra_devids(seed_dev, step, &latest_dev); |
---|
1009 | 1144 | |
---|
1010 | 1145 | fs_devices->latest_bdev = latest_dev->bdev; |
---|
1011 | 1146 | |
---|
1012 | 1147 | mutex_unlock(&uuid_mutex); |
---|
1013 | | -} |
---|
1014 | | - |
---|
1015 | | -static void free_device_rcu(struct rcu_head *head) |
---|
1016 | | -{ |
---|
1017 | | - struct btrfs_device *device; |
---|
1018 | | - |
---|
1019 | | - device = container_of(head, struct btrfs_device, rcu); |
---|
1020 | | - btrfs_free_device(device); |
---|
1021 | 1148 | } |
---|
1022 | 1149 | |
---|
1023 | 1150 | static void btrfs_close_bdev(struct btrfs_device *device) |
---|
.. | .. |
---|
1036 | 1163 | static void btrfs_close_one_device(struct btrfs_device *device) |
---|
1037 | 1164 | { |
---|
1038 | 1165 | struct btrfs_fs_devices *fs_devices = device->fs_devices; |
---|
1039 | | - struct btrfs_device *new_device; |
---|
1040 | | - struct rcu_string *name; |
---|
1041 | | - |
---|
1042 | | - if (device->bdev) |
---|
1043 | | - fs_devices->open_devices--; |
---|
1044 | 1166 | |
---|
1045 | 1167 | if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && |
---|
1046 | 1168 | device->devid != BTRFS_DEV_REPLACE_DEVID) { |
---|
.. | .. |
---|
1057 | 1179 | } |
---|
1058 | 1180 | |
---|
1059 | 1181 | btrfs_close_bdev(device); |
---|
1060 | | - |
---|
1061 | | - new_device = btrfs_alloc_device(NULL, &device->devid, |
---|
1062 | | - device->uuid); |
---|
1063 | | - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ |
---|
1064 | | - |
---|
1065 | | - /* Safe because we are under uuid_mutex */ |
---|
1066 | | - if (device->name) { |
---|
1067 | | - name = rcu_string_strdup(device->name->str, GFP_NOFS); |
---|
1068 | | - BUG_ON(!name); /* -ENOMEM */ |
---|
1069 | | - rcu_assign_pointer(new_device->name, name); |
---|
| 1182 | + if (device->bdev) { |
---|
| 1183 | + fs_devices->open_devices--; |
---|
| 1184 | + device->bdev = NULL; |
---|
1070 | 1185 | } |
---|
| 1186 | + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); |
---|
1071 | 1187 | |
---|
1072 | | - list_replace_rcu(&device->dev_list, &new_device->dev_list); |
---|
1073 | | - new_device->fs_devices = device->fs_devices; |
---|
| 1188 | + device->fs_info = NULL; |
---|
| 1189 | + atomic_set(&device->dev_stats_ccnt, 0); |
---|
| 1190 | + extent_io_tree_release(&device->alloc_state); |
---|
1074 | 1191 | |
---|
1075 | | - call_rcu(&device->rcu, free_device_rcu); |
---|
| 1192 | + /* |
---|
| 1193 | + * Reset the flush error record. We might have a transient flush error |
---|
| 1194 | + * in this mount, and if so we aborted the current transaction and set |
---|
| 1195 | + * the fs to an error state, guaranteeing no super blocks can be further |
---|
| 1196 | + * committed. However that error might be transient and if we unmount the |
---|
| 1197 | + * filesystem and mount it again, we should allow the mount to succeed |
---|
| 1198 | + * (btrfs_check_rw_degradable() should not fail) - if after mounting the |
---|
| 1199 | + * filesystem again we still get flush errors, then we will again abort |
---|
| 1200 | + * any transaction and set the error state, guaranteeing no commits of |
---|
| 1201 | + * unsafe super blocks. |
---|
| 1202 | + */ |
---|
| 1203 | + device->last_flush_error = 0; |
---|
| 1204 | + |
---|
| 1205 | + /* Verify the device is back in a pristine state */ |
---|
| 1206 | + ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); |
---|
| 1207 | + ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); |
---|
| 1208 | + ASSERT(list_empty(&device->dev_alloc_list)); |
---|
| 1209 | + ASSERT(list_empty(&device->post_commit_list)); |
---|
| 1210 | + ASSERT(atomic_read(&device->reada_in_flight) == 0); |
---|
1076 | 1211 | } |
---|
1077 | 1212 | |
---|
1078 | | -static int close_fs_devices(struct btrfs_fs_devices *fs_devices) |
---|
| 1213 | +static void close_fs_devices(struct btrfs_fs_devices *fs_devices) |
---|
1079 | 1214 | { |
---|
1080 | 1215 | struct btrfs_device *device, *tmp; |
---|
1081 | 1216 | |
---|
1082 | | - if (--fs_devices->opened > 0) |
---|
1083 | | - return 0; |
---|
| 1217 | + lockdep_assert_held(&uuid_mutex); |
---|
1084 | 1218 | |
---|
1085 | | - mutex_lock(&fs_devices->device_list_mutex); |
---|
1086 | | - list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { |
---|
| 1219 | + if (--fs_devices->opened > 0) |
---|
| 1220 | + return; |
---|
| 1221 | + |
---|
| 1222 | + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) |
---|
1087 | 1223 | btrfs_close_one_device(device); |
---|
1088 | | - } |
---|
1089 | | - mutex_unlock(&fs_devices->device_list_mutex); |
---|
1090 | 1224 | |
---|
1091 | 1225 | WARN_ON(fs_devices->open_devices); |
---|
1092 | 1226 | WARN_ON(fs_devices->rw_devices); |
---|
1093 | 1227 | fs_devices->opened = 0; |
---|
1094 | | - fs_devices->seeding = 0; |
---|
1095 | | - |
---|
1096 | | - return 0; |
---|
| 1228 | + fs_devices->seeding = false; |
---|
| 1229 | + fs_devices->fs_info = NULL; |
---|
1097 | 1230 | } |
---|
1098 | 1231 | |
---|
1099 | | -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) |
---|
| 1232 | +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) |
---|
1100 | 1233 | { |
---|
1101 | | - struct btrfs_fs_devices *seed_devices = NULL; |
---|
1102 | | - int ret; |
---|
| 1234 | + LIST_HEAD(list); |
---|
| 1235 | + struct btrfs_fs_devices *tmp; |
---|
1103 | 1236 | |
---|
1104 | 1237 | mutex_lock(&uuid_mutex); |
---|
1105 | | - ret = close_fs_devices(fs_devices); |
---|
| 1238 | + close_fs_devices(fs_devices); |
---|
1106 | 1239 | if (!fs_devices->opened) { |
---|
1107 | | - seed_devices = fs_devices->seed; |
---|
1108 | | - fs_devices->seed = NULL; |
---|
1109 | | - } |
---|
1110 | | - mutex_unlock(&uuid_mutex); |
---|
| 1240 | + list_splice_init(&fs_devices->seed_list, &list); |
---|
1111 | 1241 | |
---|
1112 | | - while (seed_devices) { |
---|
1113 | | - fs_devices = seed_devices; |
---|
1114 | | - seed_devices = fs_devices->seed; |
---|
| 1242 | + /* |
---|
| 1243 | + * If the struct btrfs_fs_devices is not assembled with any |
---|
| 1244 | + * other device, it can be re-initialized during the next mount |
---|
| 1245 | + * without the needing device-scan step. Therefore, it can be |
---|
| 1246 | + * fully freed. |
---|
| 1247 | + */ |
---|
| 1248 | + if (fs_devices->num_devices == 1) { |
---|
| 1249 | + list_del(&fs_devices->fs_list); |
---|
| 1250 | + free_fs_devices(fs_devices); |
---|
| 1251 | + } |
---|
| 1252 | + } |
---|
| 1253 | + |
---|
| 1254 | + |
---|
| 1255 | + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { |
---|
1115 | 1256 | close_fs_devices(fs_devices); |
---|
| 1257 | + list_del(&fs_devices->seed_list); |
---|
1116 | 1258 | free_fs_devices(fs_devices); |
---|
1117 | 1259 | } |
---|
1118 | | - return ret; |
---|
| 1260 | + mutex_unlock(&uuid_mutex); |
---|
1119 | 1261 | } |
---|
1120 | 1262 | |
---|
1121 | 1263 | static int open_fs_devices(struct btrfs_fs_devices *fs_devices, |
---|
.. | .. |
---|
1123 | 1265 | { |
---|
1124 | 1266 | struct btrfs_device *device; |
---|
1125 | 1267 | struct btrfs_device *latest_dev = NULL; |
---|
1126 | | - int ret = 0; |
---|
| 1268 | + struct btrfs_device *tmp_device; |
---|
1127 | 1269 | |
---|
1128 | 1270 | flags |= FMODE_EXCL; |
---|
1129 | 1271 | |
---|
1130 | | - list_for_each_entry(device, &fs_devices->devices, dev_list) { |
---|
1131 | | - /* Just open everything we can; ignore failures here */ |
---|
1132 | | - if (btrfs_open_one_device(fs_devices, device, flags, holder)) |
---|
1133 | | - continue; |
---|
| 1272 | + list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, |
---|
| 1273 | + dev_list) { |
---|
| 1274 | + int ret; |
---|
1134 | 1275 | |
---|
1135 | | - if (!latest_dev || |
---|
1136 | | - device->generation > latest_dev->generation) |
---|
| 1276 | + ret = btrfs_open_one_device(fs_devices, device, flags, holder); |
---|
| 1277 | + if (ret == 0 && |
---|
| 1278 | + (!latest_dev || device->generation > latest_dev->generation)) { |
---|
1137 | 1279 | latest_dev = device; |
---|
| 1280 | + } else if (ret == -ENODATA) { |
---|
| 1281 | + fs_devices->num_devices--; |
---|
| 1282 | + list_del(&device->dev_list); |
---|
| 1283 | + btrfs_free_device(device); |
---|
| 1284 | + } |
---|
1138 | 1285 | } |
---|
1139 | | - if (fs_devices->open_devices == 0) { |
---|
1140 | | - ret = -EINVAL; |
---|
1141 | | - goto out; |
---|
1142 | | - } |
---|
| 1286 | + if (fs_devices->open_devices == 0) |
---|
| 1287 | + return -EINVAL; |
---|
| 1288 | + |
---|
1143 | 1289 | fs_devices->opened = 1; |
---|
1144 | 1290 | fs_devices->latest_bdev = latest_dev->bdev; |
---|
1145 | 1291 | fs_devices->total_rw_bytes = 0; |
---|
1146 | | -out: |
---|
1147 | | - return ret; |
---|
| 1292 | + fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; |
---|
| 1293 | + |
---|
| 1294 | + return 0; |
---|
1148 | 1295 | } |
---|
1149 | 1296 | |
---|
1150 | 1297 | static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) |
---|
.. | .. |
---|
1186 | 1333 | return ret; |
---|
1187 | 1334 | } |
---|
1188 | 1335 | |
---|
1189 | | -static void btrfs_release_disk_super(struct page *page) |
---|
| 1336 | +void btrfs_release_disk_super(struct btrfs_super_block *super) |
---|
1190 | 1337 | { |
---|
1191 | | - kunmap(page); |
---|
| 1338 | + struct page *page = virt_to_page(super); |
---|
| 1339 | + |
---|
1192 | 1340 | put_page(page); |
---|
1193 | 1341 | } |
---|
1194 | 1342 | |
---|
1195 | | -static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, |
---|
1196 | | - struct page **page, |
---|
1197 | | - struct btrfs_super_block **disk_super) |
---|
| 1343 | +static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, |
---|
| 1344 | + u64 bytenr) |
---|
1198 | 1345 | { |
---|
| 1346 | + struct btrfs_super_block *disk_super; |
---|
| 1347 | + struct page *page; |
---|
1199 | 1348 | void *p; |
---|
1200 | 1349 | pgoff_t index; |
---|
1201 | 1350 | |
---|
1202 | 1351 | /* make sure our super fits in the device */ |
---|
1203 | 1352 | if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) |
---|
1204 | | - return 1; |
---|
| 1353 | + return ERR_PTR(-EINVAL); |
---|
1205 | 1354 | |
---|
1206 | 1355 | /* make sure our super fits in the page */ |
---|
1207 | | - if (sizeof(**disk_super) > PAGE_SIZE) |
---|
1208 | | - return 1; |
---|
| 1356 | + if (sizeof(*disk_super) > PAGE_SIZE) |
---|
| 1357 | + return ERR_PTR(-EINVAL); |
---|
1209 | 1358 | |
---|
1210 | 1359 | /* make sure our super doesn't straddle pages on disk */ |
---|
1211 | 1360 | index = bytenr >> PAGE_SHIFT; |
---|
1212 | | - if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) |
---|
1213 | | - return 1; |
---|
| 1361 | + if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) |
---|
| 1362 | + return ERR_PTR(-EINVAL); |
---|
1214 | 1363 | |
---|
1215 | 1364 | /* pull in the page with our super */ |
---|
1216 | | - *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, |
---|
1217 | | - index, GFP_KERNEL); |
---|
| 1365 | + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); |
---|
1218 | 1366 | |
---|
1219 | | - if (IS_ERR_OR_NULL(*page)) |
---|
1220 | | - return 1; |
---|
| 1367 | + if (IS_ERR(page)) |
---|
| 1368 | + return ERR_CAST(page); |
---|
1221 | 1369 | |
---|
1222 | | - p = kmap(*page); |
---|
| 1370 | + p = page_address(page); |
---|
1223 | 1371 | |
---|
1224 | 1372 | /* align our pointer to the offset of the super block */ |
---|
1225 | | - *disk_super = p + (bytenr & ~PAGE_MASK); |
---|
| 1373 | + disk_super = p + offset_in_page(bytenr); |
---|
1226 | 1374 | |
---|
1227 | | - if (btrfs_super_bytenr(*disk_super) != bytenr || |
---|
1228 | | - btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { |
---|
1229 | | - btrfs_release_disk_super(*page); |
---|
1230 | | - return 1; |
---|
| 1375 | + if (btrfs_super_bytenr(disk_super) != bytenr || |
---|
| 1376 | + btrfs_super_magic(disk_super) != BTRFS_MAGIC) { |
---|
| 1377 | + btrfs_release_disk_super(p); |
---|
| 1378 | + return ERR_PTR(-EINVAL); |
---|
1231 | 1379 | } |
---|
1232 | 1380 | |
---|
1233 | | - if ((*disk_super)->label[0] && |
---|
1234 | | - (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) |
---|
1235 | | - (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; |
---|
| 1381 | + if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) |
---|
| 1382 | + disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; |
---|
1236 | 1383 | |
---|
1237 | | - return 0; |
---|
| 1384 | + return disk_super; |
---|
| 1385 | +} |
---|
| 1386 | + |
---|
| 1387 | +int btrfs_forget_devices(const char *path) |
---|
| 1388 | +{ |
---|
| 1389 | + int ret; |
---|
| 1390 | + |
---|
| 1391 | + mutex_lock(&uuid_mutex); |
---|
| 1392 | + ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); |
---|
| 1393 | + mutex_unlock(&uuid_mutex); |
---|
| 1394 | + |
---|
| 1395 | + return ret; |
---|
1238 | 1396 | } |
---|
1239 | 1397 | |
---|
1240 | 1398 | /* |
---|
.. | .. |
---|
1249 | 1407 | bool new_device_added = false; |
---|
1250 | 1408 | struct btrfs_device *device = NULL; |
---|
1251 | 1409 | struct block_device *bdev; |
---|
1252 | | - struct page *page; |
---|
1253 | 1410 | u64 bytenr; |
---|
1254 | 1411 | |
---|
1255 | 1412 | lockdep_assert_held(&uuid_mutex); |
---|
.. | .. |
---|
1261 | 1418 | * later supers, using BTRFS_SUPER_MIRROR_MAX instead |
---|
1262 | 1419 | */ |
---|
1263 | 1420 | bytenr = btrfs_sb_offset(0); |
---|
1264 | | - flags |= FMODE_EXCL; |
---|
1265 | 1421 | |
---|
| 1422 | + /* |
---|
| 1423 | + * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may |
---|
| 1424 | + * initiate the device scan which may race with the user's mount |
---|
| 1425 | + * or mkfs command, resulting in failure. |
---|
| 1426 | + * Since the device scan is solely for reading purposes, there is |
---|
| 1427 | + * no need for FMODE_EXCL. Additionally, the devices are read again |
---|
| 1428 | + * during the mount process. It is ok to get some inconsistent |
---|
| 1429 | + * values temporarily, as the device paths of the fsid are the only |
---|
| 1430 | + * required information for assembling the volume. |
---|
| 1431 | + */ |
---|
1266 | 1432 | bdev = blkdev_get_by_path(path, flags, holder); |
---|
1267 | 1433 | if (IS_ERR(bdev)) |
---|
1268 | 1434 | return ERR_CAST(bdev); |
---|
1269 | 1435 | |
---|
1270 | | - if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { |
---|
1271 | | - device = ERR_PTR(-EINVAL); |
---|
| 1436 | + disk_super = btrfs_read_disk_super(bdev, bytenr); |
---|
| 1437 | + if (IS_ERR(disk_super)) { |
---|
| 1438 | + device = ERR_CAST(disk_super); |
---|
1272 | 1439 | goto error_bdev_put; |
---|
1273 | 1440 | } |
---|
1274 | 1441 | |
---|
.. | .. |
---|
1278 | 1445 | btrfs_free_stale_devices(path, device); |
---|
1279 | 1446 | } |
---|
1280 | 1447 | |
---|
1281 | | - btrfs_release_disk_super(page); |
---|
| 1448 | + btrfs_release_disk_super(disk_super); |
---|
1282 | 1449 | |
---|
1283 | 1450 | error_bdev_put: |
---|
1284 | 1451 | blkdev_put(bdev, flags); |
---|
.. | .. |
---|
1286 | 1453 | return device; |
---|
1287 | 1454 | } |
---|
1288 | 1455 | |
---|
1289 | | -static int contains_pending_extent(struct btrfs_transaction *transaction, |
---|
1290 | | - struct btrfs_device *device, |
---|
1291 | | - u64 *start, u64 len) |
---|
| 1456 | +/* |
---|
| 1457 | + * Try to find a chunk that intersects [start, start + len] range and when one |
---|
| 1458 | + * such is found, record the end of it in *start |
---|
| 1459 | + */ |
---|
| 1460 | +static bool contains_pending_extent(struct btrfs_device *device, u64 *start, |
---|
| 1461 | + u64 len) |
---|
1292 | 1462 | { |
---|
1293 | | - struct btrfs_fs_info *fs_info = device->fs_info; |
---|
1294 | | - struct extent_map *em; |
---|
1295 | | - struct list_head *search_list = &fs_info->pinned_chunks; |
---|
1296 | | - int ret = 0; |
---|
1297 | | - u64 physical_start = *start; |
---|
| 1463 | + u64 physical_start, physical_end; |
---|
1298 | 1464 | |
---|
1299 | | - if (transaction) |
---|
1300 | | - search_list = &transaction->pending_chunks; |
---|
1301 | | -again: |
---|
1302 | | - list_for_each_entry(em, search_list, list) { |
---|
1303 | | - struct map_lookup *map; |
---|
1304 | | - int i; |
---|
| 1465 | + lockdep_assert_held(&device->fs_info->chunk_mutex); |
---|
1305 | 1466 | |
---|
1306 | | - map = em->map_lookup; |
---|
1307 | | - for (i = 0; i < map->num_stripes; i++) { |
---|
1308 | | - u64 end; |
---|
| 1467 | + if (!find_first_extent_bit(&device->alloc_state, *start, |
---|
| 1468 | + &physical_start, &physical_end, |
---|
| 1469 | + CHUNK_ALLOCATED, NULL)) { |
---|
1309 | 1470 | |
---|
1310 | | - if (map->stripes[i].dev != device) |
---|
1311 | | - continue; |
---|
1312 | | - if (map->stripes[i].physical >= physical_start + len || |
---|
1313 | | - map->stripes[i].physical + em->orig_block_len <= |
---|
1314 | | - physical_start) |
---|
1315 | | - continue; |
---|
1316 | | - /* |
---|
1317 | | - * Make sure that while processing the pinned list we do |
---|
1318 | | - * not override our *start with a lower value, because |
---|
1319 | | - * we can have pinned chunks that fall within this |
---|
1320 | | - * device hole and that have lower physical addresses |
---|
1321 | | - * than the pending chunks we processed before. If we |
---|
1322 | | - * do not take this special care we can end up getting |
---|
1323 | | - * 2 pending chunks that start at the same physical |
---|
1324 | | - * device offsets because the end offset of a pinned |
---|
1325 | | - * chunk can be equal to the start offset of some |
---|
1326 | | - * pending chunk. |
---|
1327 | | - */ |
---|
1328 | | - end = map->stripes[i].physical + em->orig_block_len; |
---|
1329 | | - if (end > *start) { |
---|
1330 | | - *start = end; |
---|
1331 | | - ret = 1; |
---|
1332 | | - } |
---|
| 1471 | + if (in_range(physical_start, *start, len) || |
---|
| 1472 | + in_range(*start, physical_start, |
---|
| 1473 | + physical_end - physical_start)) { |
---|
| 1474 | + *start = physical_end + 1; |
---|
| 1475 | + return true; |
---|
1333 | 1476 | } |
---|
1334 | 1477 | } |
---|
1335 | | - if (search_list != &fs_info->pinned_chunks) { |
---|
1336 | | - search_list = &fs_info->pinned_chunks; |
---|
1337 | | - goto again; |
---|
1338 | | - } |
---|
1339 | | - |
---|
1340 | | - return ret; |
---|
| 1478 | + return false; |
---|
1341 | 1479 | } |
---|
1342 | 1480 | |
---|
| 1481 | +static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) |
---|
| 1482 | +{ |
---|
| 1483 | + switch (device->fs_devices->chunk_alloc_policy) { |
---|
| 1484 | + case BTRFS_CHUNK_ALLOC_REGULAR: |
---|
| 1485 | + /* |
---|
| 1486 | + * We don't want to overwrite the superblock on the drive nor |
---|
| 1487 | + * any area used by the boot loader (grub for example), so we |
---|
| 1488 | + * make sure to start at an offset of at least 1MB. |
---|
| 1489 | + */ |
---|
| 1490 | + return max_t(u64, start, SZ_1M); |
---|
| 1491 | + default: |
---|
| 1492 | + BUG(); |
---|
| 1493 | + } |
---|
| 1494 | +} |
---|
| 1495 | + |
---|
| 1496 | +/** |
---|
| 1497 | + * dev_extent_hole_check - check if specified hole is suitable for allocation |
---|
| 1498 | + * @device: the device which we have the hole |
---|
| 1499 | + * @hole_start: starting position of the hole |
---|
| 1500 | + * @hole_size: the size of the hole |
---|
| 1501 | + * @num_bytes: the size of the free space that we need |
---|
| 1502 | + * |
---|
| 1503 | + * This function may modify @hole_start and @hole_end to reflect the suitable |
---|
| 1504 | + * position for allocation. Returns 1 if hole position is updated, 0 otherwise. |
---|
| 1505 | + */ |
---|
| 1506 | +static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, |
---|
| 1507 | + u64 *hole_size, u64 num_bytes) |
---|
| 1508 | +{ |
---|
| 1509 | + bool changed = false; |
---|
| 1510 | + u64 hole_end = *hole_start + *hole_size; |
---|
| 1511 | + |
---|
| 1512 | + /* |
---|
| 1513 | + * Check before we set max_hole_start, otherwise we could end up |
---|
| 1514 | + * sending back this offset anyway. |
---|
| 1515 | + */ |
---|
| 1516 | + if (contains_pending_extent(device, hole_start, *hole_size)) { |
---|
| 1517 | + if (hole_end >= *hole_start) |
---|
| 1518 | + *hole_size = hole_end - *hole_start; |
---|
| 1519 | + else |
---|
| 1520 | + *hole_size = 0; |
---|
| 1521 | + changed = true; |
---|
| 1522 | + } |
---|
| 1523 | + |
---|
| 1524 | + switch (device->fs_devices->chunk_alloc_policy) { |
---|
| 1525 | + case BTRFS_CHUNK_ALLOC_REGULAR: |
---|
| 1526 | + /* No extra check */ |
---|
| 1527 | + break; |
---|
| 1528 | + default: |
---|
| 1529 | + BUG(); |
---|
| 1530 | + } |
---|
| 1531 | + |
---|
| 1532 | + return changed; |
---|
| 1533 | +} |
---|
1343 | 1534 | |
---|
1344 | 1535 | /* |
---|
1345 | 1536 | * find_free_dev_extent_start - find free space in the specified device |
---|
.. | .. |
---|
1361 | 1552 | * @len is used to store the size of the free space that we find. |
---|
1362 | 1553 | * But if we don't find suitable free space, it is used to store the size of |
---|
1363 | 1554 | * the max free space. |
---|
| 1555 | + * |
---|
| 1556 | + * NOTE: This function will search *commit* root of device tree, and does extra |
---|
| 1557 | + * check to ensure dev extents are not double allocated. |
---|
| 1558 | + * This makes the function safe to allocate dev extents but may not report |
---|
| 1559 | + * correct usable device space, as device extent freed in current transaction |
---|
| 1560 | + * is not reported as avaiable. |
---|
1364 | 1561 | */ |
---|
1365 | | -int find_free_dev_extent_start(struct btrfs_transaction *transaction, |
---|
1366 | | - struct btrfs_device *device, u64 num_bytes, |
---|
1367 | | - u64 search_start, u64 *start, u64 *len) |
---|
| 1562 | +static int find_free_dev_extent_start(struct btrfs_device *device, |
---|
| 1563 | + u64 num_bytes, u64 search_start, u64 *start, |
---|
| 1564 | + u64 *len) |
---|
1368 | 1565 | { |
---|
1369 | 1566 | struct btrfs_fs_info *fs_info = device->fs_info; |
---|
1370 | 1567 | struct btrfs_root *root = fs_info->dev_root; |
---|
.. | .. |
---|
1380 | 1577 | int slot; |
---|
1381 | 1578 | struct extent_buffer *l; |
---|
1382 | 1579 | |
---|
1383 | | - /* |
---|
1384 | | - * We don't want to overwrite the superblock on the drive nor any area |
---|
1385 | | - * used by the boot loader (grub for example), so we make sure to start |
---|
1386 | | - * at an offset of at least 1MB. |
---|
1387 | | - */ |
---|
1388 | | - search_start = max_t(u64, search_start, SZ_1M); |
---|
| 1580 | + search_start = dev_extent_search_start(device, search_start); |
---|
1389 | 1581 | |
---|
1390 | 1582 | path = btrfs_alloc_path(); |
---|
1391 | 1583 | if (!path) |
---|
.. | .. |
---|
1418 | 1610 | goto out; |
---|
1419 | 1611 | } |
---|
1420 | 1612 | |
---|
1421 | | - while (1) { |
---|
| 1613 | + while (search_start < search_end) { |
---|
1422 | 1614 | l = path->nodes[0]; |
---|
1423 | 1615 | slot = path->slots[0]; |
---|
1424 | 1616 | if (slot >= btrfs_header_nritems(l)) { |
---|
.. | .. |
---|
1441 | 1633 | if (key.type != BTRFS_DEV_EXTENT_KEY) |
---|
1442 | 1634 | goto next; |
---|
1443 | 1635 | |
---|
| 1636 | + if (key.offset > search_end) |
---|
| 1637 | + break; |
---|
| 1638 | + |
---|
1444 | 1639 | if (key.offset > search_start) { |
---|
1445 | 1640 | hole_size = key.offset - search_start; |
---|
1446 | | - |
---|
1447 | | - /* |
---|
1448 | | - * Have to check before we set max_hole_start, otherwise |
---|
1449 | | - * we could end up sending back this offset anyway. |
---|
1450 | | - */ |
---|
1451 | | - if (contains_pending_extent(transaction, device, |
---|
1452 | | - &search_start, |
---|
1453 | | - hole_size)) { |
---|
1454 | | - if (key.offset >= search_start) { |
---|
1455 | | - hole_size = key.offset - search_start; |
---|
1456 | | - } else { |
---|
1457 | | - WARN_ON_ONCE(1); |
---|
1458 | | - hole_size = 0; |
---|
1459 | | - } |
---|
1460 | | - } |
---|
| 1641 | + dev_extent_hole_check(device, &search_start, &hole_size, |
---|
| 1642 | + num_bytes); |
---|
1461 | 1643 | |
---|
1462 | 1644 | if (hole_size > max_hole_size) { |
---|
1463 | 1645 | max_hole_start = search_start; |
---|
.. | .. |
---|
1496 | 1678 | */ |
---|
1497 | 1679 | if (search_end > search_start) { |
---|
1498 | 1680 | hole_size = search_end - search_start; |
---|
1499 | | - |
---|
1500 | | - if (contains_pending_extent(transaction, device, &search_start, |
---|
1501 | | - hole_size)) { |
---|
| 1681 | + if (dev_extent_hole_check(device, &search_start, &hole_size, |
---|
| 1682 | + num_bytes)) { |
---|
1502 | 1683 | btrfs_release_path(path); |
---|
1503 | 1684 | goto again; |
---|
1504 | 1685 | } |
---|
.. | .. |
---|
1515 | 1696 | else |
---|
1516 | 1697 | ret = 0; |
---|
1517 | 1698 | |
---|
| 1699 | + ASSERT(max_hole_start + max_hole_size <= search_end); |
---|
1518 | 1700 | out: |
---|
1519 | 1701 | btrfs_free_path(path); |
---|
1520 | 1702 | *start = max_hole_start; |
---|
.. | .. |
---|
1523 | 1705 | return ret; |
---|
1524 | 1706 | } |
---|
1525 | 1707 | |
---|
1526 | | -int find_free_dev_extent(struct btrfs_trans_handle *trans, |
---|
1527 | | - struct btrfs_device *device, u64 num_bytes, |
---|
| 1708 | +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, |
---|
1528 | 1709 | u64 *start, u64 *len) |
---|
1529 | 1710 | { |
---|
1530 | 1711 | /* FIXME use last free of some kind */ |
---|
1531 | | - return find_free_dev_extent_start(trans->transaction, device, |
---|
1532 | | - num_bytes, 0, start, len); |
---|
| 1712 | + return find_free_dev_extent_start(device, num_bytes, 0, start, len); |
---|
1533 | 1713 | } |
---|
1534 | 1714 | |
---|
1535 | 1715 | static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, |
---|
.. | .. |
---|
1640 | 1820 | struct rb_node *n; |
---|
1641 | 1821 | u64 ret = 0; |
---|
1642 | 1822 | |
---|
1643 | | - em_tree = &fs_info->mapping_tree.map_tree; |
---|
| 1823 | + em_tree = &fs_info->mapping_tree; |
---|
1644 | 1824 | read_lock(&em_tree->lock); |
---|
1645 | | - n = rb_last(&em_tree->map); |
---|
| 1825 | + n = rb_last(&em_tree->map.rb_root); |
---|
1646 | 1826 | if (n) { |
---|
1647 | 1827 | em = rb_entry(n, struct extent_map, rb_node); |
---|
1648 | 1828 | ret = em->start + em->len; |
---|
.. | .. |
---|
1672 | 1852 | if (ret < 0) |
---|
1673 | 1853 | goto error; |
---|
1674 | 1854 | |
---|
1675 | | - BUG_ON(ret == 0); /* Corruption */ |
---|
| 1855 | + if (ret == 0) { |
---|
| 1856 | + /* Corruption */ |
---|
| 1857 | + btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); |
---|
| 1858 | + ret = -EUCLEAN; |
---|
| 1859 | + goto error; |
---|
| 1860 | + } |
---|
1676 | 1861 | |
---|
1677 | 1862 | ret = btrfs_previous_item(fs_info->chunk_root, path, |
---|
1678 | 1863 | BTRFS_DEV_ITEMS_OBJECTID, |
---|
.. | .. |
---|
1738 | 1923 | ptr = btrfs_device_uuid(dev_item); |
---|
1739 | 1924 | write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); |
---|
1740 | 1925 | ptr = btrfs_device_fsid(dev_item); |
---|
1741 | | - write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE); |
---|
| 1926 | + write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, |
---|
| 1927 | + ptr, BTRFS_FSID_SIZE); |
---|
1742 | 1928 | btrfs_mark_buffer_dirty(leaf); |
---|
1743 | 1929 | |
---|
1744 | 1930 | ret = 0; |
---|
.. | .. |
---|
1750 | 1936 | /* |
---|
1751 | 1937 | * Function to update ctime/mtime for a given device path. |
---|
1752 | 1938 | * Mainly used for ctime/mtime based probe like libblkid. |
---|
| 1939 | + * |
---|
| 1940 | + * We don't care about errors here, this is just to be kind to userspace. |
---|
1753 | 1941 | */ |
---|
1754 | | -static void update_dev_time(const char *path_name) |
---|
| 1942 | +static void update_dev_time(const char *device_path) |
---|
1755 | 1943 | { |
---|
1756 | | - struct file *filp; |
---|
| 1944 | + struct path path; |
---|
| 1945 | + struct timespec64 now; |
---|
| 1946 | + int ret; |
---|
1757 | 1947 | |
---|
1758 | | - filp = filp_open(path_name, O_RDWR, 0); |
---|
1759 | | - if (IS_ERR(filp)) |
---|
| 1948 | + ret = kern_path(device_path, LOOKUP_FOLLOW, &path); |
---|
| 1949 | + if (ret) |
---|
1760 | 1950 | return; |
---|
1761 | | - file_update_time(filp); |
---|
1762 | | - filp_close(filp, NULL); |
---|
| 1951 | + |
---|
| 1952 | + now = current_time(d_inode(path.dentry)); |
---|
| 1953 | + inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); |
---|
| 1954 | + path_put(&path); |
---|
1763 | 1955 | } |
---|
1764 | 1956 | |
---|
1765 | | -static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, |
---|
1766 | | - struct btrfs_device *device) |
---|
| 1957 | +static int btrfs_rm_dev_item(struct btrfs_device *device) |
---|
1767 | 1958 | { |
---|
1768 | | - struct btrfs_root *root = fs_info->chunk_root; |
---|
| 1959 | + struct btrfs_root *root = device->fs_info->chunk_root; |
---|
1769 | 1960 | int ret; |
---|
1770 | 1961 | struct btrfs_path *path; |
---|
1771 | 1962 | struct btrfs_key key; |
---|
.. | .. |
---|
1862 | 2053 | * where this function called, there should be always be another device (or |
---|
1863 | 2054 | * this_dev) which is active. |
---|
1864 | 2055 | */ |
---|
1865 | | -void btrfs_assign_next_active_device(struct btrfs_device *device, |
---|
1866 | | - struct btrfs_device *this_dev) |
---|
| 2056 | +void __cold btrfs_assign_next_active_device(struct btrfs_device *device, |
---|
| 2057 | + struct btrfs_device *next_device) |
---|
1867 | 2058 | { |
---|
1868 | 2059 | struct btrfs_fs_info *fs_info = device->fs_info; |
---|
1869 | | - struct btrfs_device *next_device; |
---|
1870 | 2060 | |
---|
1871 | | - if (this_dev) |
---|
1872 | | - next_device = this_dev; |
---|
1873 | | - else |
---|
| 2061 | + if (!next_device) |
---|
1874 | 2062 | next_device = btrfs_find_next_active_device(fs_info->fs_devices, |
---|
1875 | | - device); |
---|
| 2063 | + device); |
---|
1876 | 2064 | ASSERT(next_device); |
---|
1877 | 2065 | |
---|
1878 | 2066 | if (fs_info->sb->s_bdev && |
---|
.. | .. |
---|
1883 | 2071 | fs_info->fs_devices->latest_bdev = next_device->bdev; |
---|
1884 | 2072 | } |
---|
1885 | 2073 | |
---|
| 2074 | +/* |
---|
| 2075 | + * Return btrfs_fs_devices::num_devices excluding the device that's being |
---|
| 2076 | + * currently replaced. |
---|
| 2077 | + */ |
---|
| 2078 | +static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) |
---|
| 2079 | +{ |
---|
| 2080 | + u64 num_devices = fs_info->fs_devices->num_devices; |
---|
| 2081 | + |
---|
| 2082 | + down_read(&fs_info->dev_replace.rwsem); |
---|
| 2083 | + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { |
---|
| 2084 | + ASSERT(num_devices > 1); |
---|
| 2085 | + num_devices--; |
---|
| 2086 | + } |
---|
| 2087 | + up_read(&fs_info->dev_replace.rwsem); |
---|
| 2088 | + |
---|
| 2089 | + return num_devices; |
---|
| 2090 | +} |
---|
| 2091 | + |
---|
| 2092 | +void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, |
---|
| 2093 | + struct block_device *bdev, |
---|
| 2094 | + const char *device_path) |
---|
| 2095 | +{ |
---|
| 2096 | + struct btrfs_super_block *disk_super; |
---|
| 2097 | + int copy_num; |
---|
| 2098 | + |
---|
| 2099 | + if (!bdev) |
---|
| 2100 | + return; |
---|
| 2101 | + |
---|
| 2102 | + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { |
---|
| 2103 | + struct page *page; |
---|
| 2104 | + int ret; |
---|
| 2105 | + |
---|
| 2106 | + disk_super = btrfs_read_dev_one_super(bdev, copy_num); |
---|
| 2107 | + if (IS_ERR(disk_super)) |
---|
| 2108 | + continue; |
---|
| 2109 | + |
---|
| 2110 | + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); |
---|
| 2111 | + |
---|
| 2112 | + page = virt_to_page(disk_super); |
---|
| 2113 | + set_page_dirty(page); |
---|
| 2114 | + lock_page(page); |
---|
| 2115 | + /* write_on_page() unlocks the page */ |
---|
| 2116 | + ret = write_one_page(page); |
---|
| 2117 | + if (ret) |
---|
| 2118 | + btrfs_warn(fs_info, |
---|
| 2119 | + "error clearing superblock number %d (%d)", |
---|
| 2120 | + copy_num, ret); |
---|
| 2121 | + btrfs_release_disk_super(disk_super); |
---|
| 2122 | + |
---|
| 2123 | + } |
---|
| 2124 | + |
---|
| 2125 | + /* Notify udev that device has changed */ |
---|
| 2126 | + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); |
---|
| 2127 | + |
---|
| 2128 | + /* Update ctime/mtime for device path for libblkid */ |
---|
| 2129 | + update_dev_time(device_path); |
---|
| 2130 | +} |
---|
| 2131 | + |
---|
1886 | 2132 | int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, |
---|
1887 | | - u64 devid) |
---|
| 2133 | + u64 devid) |
---|
1888 | 2134 | { |
---|
1889 | 2135 | struct btrfs_device *device; |
---|
1890 | 2136 | struct btrfs_fs_devices *cur_devices; |
---|
.. | .. |
---|
1892 | 2138 | u64 num_devices; |
---|
1893 | 2139 | int ret = 0; |
---|
1894 | 2140 | |
---|
1895 | | - mutex_lock(&uuid_mutex); |
---|
1896 | | - |
---|
1897 | | - num_devices = fs_devices->num_devices; |
---|
1898 | | - btrfs_dev_replace_read_lock(&fs_info->dev_replace); |
---|
1899 | | - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { |
---|
1900 | | - WARN_ON(num_devices < 1); |
---|
1901 | | - num_devices--; |
---|
1902 | | - } |
---|
1903 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
---|
| 2141 | + /* |
---|
| 2142 | + * The device list in fs_devices is accessed without locks (neither |
---|
| 2143 | + * uuid_mutex nor device_list_mutex) as it won't change on a mounted |
---|
| 2144 | + * filesystem and another device rm cannot run. |
---|
| 2145 | + */ |
---|
| 2146 | + num_devices = btrfs_num_devices(fs_info); |
---|
1904 | 2147 | |
---|
1905 | 2148 | ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); |
---|
1906 | 2149 | if (ret) |
---|
1907 | 2150 | goto out; |
---|
1908 | 2151 | |
---|
1909 | | - ret = btrfs_find_device_by_devspec(fs_info, devid, device_path, |
---|
1910 | | - &device); |
---|
1911 | | - if (ret) |
---|
| 2152 | + device = btrfs_find_device_by_devspec(fs_info, devid, device_path); |
---|
| 2153 | + |
---|
| 2154 | + if (IS_ERR(device)) { |
---|
| 2155 | + if (PTR_ERR(device) == -ENOENT && |
---|
| 2156 | + device_path && strcmp(device_path, "missing") == 0) |
---|
| 2157 | + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; |
---|
| 2158 | + else |
---|
| 2159 | + ret = PTR_ERR(device); |
---|
1912 | 2160 | goto out; |
---|
| 2161 | + } |
---|
| 2162 | + |
---|
| 2163 | + if (btrfs_pinned_by_swapfile(fs_info, device)) { |
---|
| 2164 | + btrfs_warn_in_rcu(fs_info, |
---|
| 2165 | + "cannot remove device %s (devid %llu) due to active swapfile", |
---|
| 2166 | + rcu_str_deref(device->name), device->devid); |
---|
| 2167 | + ret = -ETXTBSY; |
---|
| 2168 | + goto out; |
---|
| 2169 | + } |
---|
1913 | 2170 | |
---|
1914 | 2171 | if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { |
---|
1915 | 2172 | ret = BTRFS_ERROR_DEV_TGT_REPLACE; |
---|
.. | .. |
---|
1929 | 2186 | mutex_unlock(&fs_info->chunk_mutex); |
---|
1930 | 2187 | } |
---|
1931 | 2188 | |
---|
1932 | | - mutex_unlock(&uuid_mutex); |
---|
1933 | 2189 | ret = btrfs_shrink_device(device, 0); |
---|
1934 | | - mutex_lock(&uuid_mutex); |
---|
| 2190 | + if (!ret) |
---|
| 2191 | + btrfs_reada_remove_dev(device); |
---|
1935 | 2192 | if (ret) |
---|
1936 | 2193 | goto error_undo; |
---|
1937 | 2194 | |
---|
.. | .. |
---|
1940 | 2197 | * counter although write_all_supers() is not locked out. This |
---|
1941 | 2198 | * could give a filesystem state which requires a degraded mount. |
---|
1942 | 2199 | */ |
---|
1943 | | - ret = btrfs_rm_dev_item(fs_info, device); |
---|
| 2200 | + ret = btrfs_rm_dev_item(device); |
---|
1944 | 2201 | if (ret) |
---|
1945 | 2202 | goto error_undo; |
---|
1946 | 2203 | |
---|
1947 | 2204 | clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); |
---|
1948 | | - btrfs_scrub_cancel_dev(fs_info, device); |
---|
| 2205 | + btrfs_scrub_cancel_dev(device); |
---|
1949 | 2206 | |
---|
1950 | 2207 | /* |
---|
1951 | 2208 | * the device list mutex makes sure that we don't change |
---|
.. | .. |
---|
1980 | 2237 | if (device->bdev) { |
---|
1981 | 2238 | cur_devices->open_devices--; |
---|
1982 | 2239 | /* remove sysfs entry */ |
---|
1983 | | - btrfs_sysfs_rm_device_link(fs_devices, device); |
---|
| 2240 | + btrfs_sysfs_remove_device(device); |
---|
1984 | 2241 | } |
---|
1985 | 2242 | |
---|
1986 | 2243 | num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; |
---|
.. | .. |
---|
1993 | 2250 | * supers and free the device. |
---|
1994 | 2251 | */ |
---|
1995 | 2252 | if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) |
---|
1996 | | - btrfs_scratch_superblocks(device->bdev, device->name->str); |
---|
| 2253 | + btrfs_scratch_superblocks(fs_info, device->bdev, |
---|
| 2254 | + device->name->str); |
---|
1997 | 2255 | |
---|
1998 | 2256 | btrfs_close_bdev(device); |
---|
1999 | | - call_rcu(&device->rcu, free_device_rcu); |
---|
| 2257 | + synchronize_rcu(); |
---|
| 2258 | + btrfs_free_device(device); |
---|
2000 | 2259 | |
---|
2001 | 2260 | if (cur_devices->open_devices == 0) { |
---|
2002 | | - while (fs_devices) { |
---|
2003 | | - if (fs_devices->seed == cur_devices) { |
---|
2004 | | - fs_devices->seed = cur_devices->seed; |
---|
2005 | | - break; |
---|
2006 | | - } |
---|
2007 | | - fs_devices = fs_devices->seed; |
---|
2008 | | - } |
---|
2009 | | - cur_devices->seed = NULL; |
---|
| 2261 | + list_del_init(&cur_devices->seed_list); |
---|
2010 | 2262 | close_fs_devices(cur_devices); |
---|
2011 | 2263 | free_fs_devices(cur_devices); |
---|
2012 | 2264 | } |
---|
2013 | 2265 | |
---|
2014 | 2266 | out: |
---|
2015 | | - mutex_unlock(&uuid_mutex); |
---|
2016 | 2267 | return ret; |
---|
2017 | 2268 | |
---|
2018 | 2269 | error_undo: |
---|
| 2270 | + btrfs_reada_undo_remove_dev(device); |
---|
2019 | 2271 | if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { |
---|
2020 | 2272 | mutex_lock(&fs_info->chunk_mutex); |
---|
2021 | 2273 | list_add(&device->dev_alloc_list, |
---|
.. | .. |
---|
2053 | 2305 | fs_devices->open_devices--; |
---|
2054 | 2306 | } |
---|
2055 | 2307 | |
---|
2056 | | -void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, |
---|
2057 | | - struct btrfs_device *srcdev) |
---|
| 2308 | +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) |
---|
2058 | 2309 | { |
---|
2059 | 2310 | struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; |
---|
2060 | 2311 | |
---|
2061 | | - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) { |
---|
2062 | | - /* zero out the old super if it is writable */ |
---|
2063 | | - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); |
---|
2064 | | - } |
---|
| 2312 | + mutex_lock(&uuid_mutex); |
---|
2065 | 2313 | |
---|
2066 | 2314 | btrfs_close_bdev(srcdev); |
---|
2067 | | - call_rcu(&srcdev->rcu, free_device_rcu); |
---|
| 2315 | + synchronize_rcu(); |
---|
| 2316 | + btrfs_free_device(srcdev); |
---|
2068 | 2317 | |
---|
2069 | 2318 | /* if this is no devs we rather delete the fs_devices */ |
---|
2070 | 2319 | if (!fs_devices->num_devices) { |
---|
2071 | | - struct btrfs_fs_devices *tmp_fs_devices; |
---|
2072 | | - |
---|
2073 | 2320 | /* |
---|
2074 | 2321 | * On a mounted FS, num_devices can't be zero unless it's a |
---|
2075 | 2322 | * seed. In case of a seed device being replaced, the replace |
---|
.. | .. |
---|
2078 | 2325 | */ |
---|
2079 | 2326 | ASSERT(fs_devices->seeding); |
---|
2080 | 2327 | |
---|
2081 | | - tmp_fs_devices = fs_info->fs_devices; |
---|
2082 | | - while (tmp_fs_devices) { |
---|
2083 | | - if (tmp_fs_devices->seed == fs_devices) { |
---|
2084 | | - tmp_fs_devices->seed = fs_devices->seed; |
---|
2085 | | - break; |
---|
2086 | | - } |
---|
2087 | | - tmp_fs_devices = tmp_fs_devices->seed; |
---|
2088 | | - } |
---|
2089 | | - fs_devices->seed = NULL; |
---|
| 2328 | + list_del_init(&fs_devices->seed_list); |
---|
2090 | 2329 | close_fs_devices(fs_devices); |
---|
2091 | 2330 | free_fs_devices(fs_devices); |
---|
2092 | 2331 | } |
---|
| 2332 | + mutex_unlock(&uuid_mutex); |
---|
2093 | 2333 | } |
---|
2094 | 2334 | |
---|
2095 | 2335 | void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) |
---|
2096 | 2336 | { |
---|
2097 | 2337 | struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; |
---|
2098 | 2338 | |
---|
2099 | | - WARN_ON(!tgtdev); |
---|
2100 | 2339 | mutex_lock(&fs_devices->device_list_mutex); |
---|
2101 | 2340 | |
---|
2102 | | - btrfs_sysfs_rm_device_link(fs_devices, tgtdev); |
---|
| 2341 | + btrfs_sysfs_remove_device(tgtdev); |
---|
2103 | 2342 | |
---|
2104 | 2343 | if (tgtdev->bdev) |
---|
2105 | 2344 | fs_devices->open_devices--; |
---|
.. | .. |
---|
2119 | 2358 | * is already out of device list, so we don't have to hold |
---|
2120 | 2359 | * the device_list_mutex lock. |
---|
2121 | 2360 | */ |
---|
2122 | | - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); |
---|
| 2361 | + btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, |
---|
| 2362 | + tgtdev->name->str); |
---|
2123 | 2363 | |
---|
2124 | 2364 | btrfs_close_bdev(tgtdev); |
---|
2125 | | - call_rcu(&tgtdev->rcu, free_device_rcu); |
---|
| 2365 | + synchronize_rcu(); |
---|
| 2366 | + btrfs_free_device(tgtdev); |
---|
2126 | 2367 | } |
---|
2127 | 2368 | |
---|
2128 | | -static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, |
---|
2129 | | - const char *device_path, |
---|
2130 | | - struct btrfs_device **device) |
---|
| 2369 | +static struct btrfs_device *btrfs_find_device_by_path( |
---|
| 2370 | + struct btrfs_fs_info *fs_info, const char *device_path) |
---|
2131 | 2371 | { |
---|
2132 | 2372 | int ret = 0; |
---|
2133 | 2373 | struct btrfs_super_block *disk_super; |
---|
2134 | 2374 | u64 devid; |
---|
2135 | 2375 | u8 *dev_uuid; |
---|
2136 | 2376 | struct block_device *bdev; |
---|
2137 | | - struct buffer_head *bh; |
---|
| 2377 | + struct btrfs_device *device; |
---|
2138 | 2378 | |
---|
2139 | | - *device = NULL; |
---|
2140 | 2379 | ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, |
---|
2141 | | - fs_info->bdev_holder, 0, &bdev, &bh); |
---|
| 2380 | + fs_info->bdev_holder, 0, &bdev, &disk_super); |
---|
2142 | 2381 | if (ret) |
---|
2143 | | - return ret; |
---|
2144 | | - disk_super = (struct btrfs_super_block *)bh->b_data; |
---|
| 2382 | + return ERR_PTR(ret); |
---|
| 2383 | + |
---|
2145 | 2384 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
---|
2146 | 2385 | dev_uuid = disk_super->dev_item.uuid; |
---|
2147 | | - *device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, |
---|
2148 | | - disk_super->fsid, true); |
---|
2149 | | - brelse(bh); |
---|
2150 | | - if (!*device) |
---|
2151 | | - ret = -ENOENT; |
---|
| 2386 | + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) |
---|
| 2387 | + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, |
---|
| 2388 | + disk_super->metadata_uuid, true); |
---|
| 2389 | + else |
---|
| 2390 | + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, |
---|
| 2391 | + disk_super->fsid, true); |
---|
| 2392 | + |
---|
| 2393 | + btrfs_release_disk_super(disk_super); |
---|
| 2394 | + if (!device) |
---|
| 2395 | + device = ERR_PTR(-ENOENT); |
---|
2152 | 2396 | blkdev_put(bdev, FMODE_READ); |
---|
2153 | | - return ret; |
---|
2154 | | -} |
---|
2155 | | - |
---|
2156 | | -int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, |
---|
2157 | | - const char *device_path, |
---|
2158 | | - struct btrfs_device **device) |
---|
2159 | | -{ |
---|
2160 | | - *device = NULL; |
---|
2161 | | - if (strcmp(device_path, "missing") == 0) { |
---|
2162 | | - struct list_head *devices; |
---|
2163 | | - struct btrfs_device *tmp; |
---|
2164 | | - |
---|
2165 | | - devices = &fs_info->fs_devices->devices; |
---|
2166 | | - list_for_each_entry(tmp, devices, dev_list) { |
---|
2167 | | - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, |
---|
2168 | | - &tmp->dev_state) && !tmp->bdev) { |
---|
2169 | | - *device = tmp; |
---|
2170 | | - break; |
---|
2171 | | - } |
---|
2172 | | - } |
---|
2173 | | - |
---|
2174 | | - if (!*device) |
---|
2175 | | - return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; |
---|
2176 | | - |
---|
2177 | | - return 0; |
---|
2178 | | - } else { |
---|
2179 | | - return btrfs_find_device_by_path(fs_info, device_path, device); |
---|
2180 | | - } |
---|
| 2397 | + return device; |
---|
2181 | 2398 | } |
---|
2182 | 2399 | |
---|
2183 | 2400 | /* |
---|
2184 | 2401 | * Lookup a device given by device id, or the path if the id is 0. |
---|
2185 | 2402 | */ |
---|
2186 | | -int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, |
---|
2187 | | - const char *devpath, |
---|
2188 | | - struct btrfs_device **device) |
---|
| 2403 | +struct btrfs_device *btrfs_find_device_by_devspec( |
---|
| 2404 | + struct btrfs_fs_info *fs_info, u64 devid, |
---|
| 2405 | + const char *device_path) |
---|
2189 | 2406 | { |
---|
2190 | | - int ret; |
---|
| 2407 | + struct btrfs_device *device; |
---|
2191 | 2408 | |
---|
2192 | 2409 | if (devid) { |
---|
2193 | | - ret = 0; |
---|
2194 | | - *device = btrfs_find_device(fs_info->fs_devices, devid, |
---|
2195 | | - NULL, NULL, true); |
---|
2196 | | - if (!*device) |
---|
2197 | | - ret = -ENOENT; |
---|
2198 | | - } else { |
---|
2199 | | - if (!devpath || !devpath[0]) |
---|
2200 | | - return -EINVAL; |
---|
2201 | | - |
---|
2202 | | - ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, |
---|
2203 | | - device); |
---|
| 2410 | + device = btrfs_find_device(fs_info->fs_devices, devid, NULL, |
---|
| 2411 | + NULL, true); |
---|
| 2412 | + if (!device) |
---|
| 2413 | + return ERR_PTR(-ENOENT); |
---|
| 2414 | + return device; |
---|
2204 | 2415 | } |
---|
2205 | | - return ret; |
---|
| 2416 | + |
---|
| 2417 | + if (!device_path || !device_path[0]) |
---|
| 2418 | + return ERR_PTR(-EINVAL); |
---|
| 2419 | + |
---|
| 2420 | + if (strcmp(device_path, "missing") == 0) { |
---|
| 2421 | + /* Find first missing device */ |
---|
| 2422 | + list_for_each_entry(device, &fs_info->fs_devices->devices, |
---|
| 2423 | + dev_list) { |
---|
| 2424 | + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, |
---|
| 2425 | + &device->dev_state) && !device->bdev) |
---|
| 2426 | + return device; |
---|
| 2427 | + } |
---|
| 2428 | + return ERR_PTR(-ENOENT); |
---|
| 2429 | + } |
---|
| 2430 | + |
---|
| 2431 | + return btrfs_find_device_by_path(fs_info, device_path); |
---|
2206 | 2432 | } |
---|
2207 | 2433 | |
---|
2208 | 2434 | /* |
---|
.. | .. |
---|
2221 | 2447 | if (!fs_devices->seeding) |
---|
2222 | 2448 | return -EINVAL; |
---|
2223 | 2449 | |
---|
2224 | | - seed_devices = alloc_fs_devices(NULL); |
---|
| 2450 | + /* |
---|
| 2451 | + * Private copy of the seed devices, anchored at |
---|
| 2452 | + * fs_info->fs_devices->seed_list |
---|
| 2453 | + */ |
---|
| 2454 | + seed_devices = alloc_fs_devices(NULL, NULL); |
---|
2225 | 2455 | if (IS_ERR(seed_devices)) |
---|
2226 | 2456 | return PTR_ERR(seed_devices); |
---|
2227 | 2457 | |
---|
| 2458 | + /* |
---|
| 2459 | + * It's necessary to retain a copy of the original seed fs_devices in |
---|
| 2460 | + * fs_uuids so that filesystems which have been seeded can successfully |
---|
| 2461 | + * reference the seed device from open_seed_devices. This also supports |
---|
| 2462 | + * multiple fs seed. |
---|
| 2463 | + */ |
---|
2228 | 2464 | old_devices = clone_fs_devices(fs_devices); |
---|
2229 | 2465 | if (IS_ERR(old_devices)) { |
---|
2230 | 2466 | kfree(seed_devices); |
---|
.. | .. |
---|
2245 | 2481 | list_for_each_entry(device, &seed_devices->devices, dev_list) |
---|
2246 | 2482 | device->fs_devices = seed_devices; |
---|
2247 | 2483 | |
---|
2248 | | - mutex_lock(&fs_info->chunk_mutex); |
---|
2249 | | - list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); |
---|
2250 | | - mutex_unlock(&fs_info->chunk_mutex); |
---|
2251 | | - |
---|
2252 | | - fs_devices->seeding = 0; |
---|
| 2484 | + fs_devices->seeding = false; |
---|
2253 | 2485 | fs_devices->num_devices = 0; |
---|
2254 | 2486 | fs_devices->open_devices = 0; |
---|
2255 | 2487 | fs_devices->missing_devices = 0; |
---|
2256 | | - fs_devices->rotating = 0; |
---|
2257 | | - fs_devices->seed = seed_devices; |
---|
| 2488 | + fs_devices->rotating = false; |
---|
| 2489 | + list_add(&seed_devices->seed_list, &fs_devices->seed_list); |
---|
2258 | 2490 | |
---|
2259 | 2491 | generate_random_uuid(fs_devices->fsid); |
---|
2260 | | - memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); |
---|
| 2492 | + memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); |
---|
2261 | 2493 | memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); |
---|
2262 | 2494 | mutex_unlock(&fs_devices->device_list_mutex); |
---|
2263 | 2495 | |
---|
.. | .. |
---|
2271 | 2503 | /* |
---|
2272 | 2504 | * Store the expected generation for seed devices in device items. |
---|
2273 | 2505 | */ |
---|
2274 | | -static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, |
---|
2275 | | - struct btrfs_fs_info *fs_info) |
---|
| 2506 | +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) |
---|
2276 | 2507 | { |
---|
| 2508 | + struct btrfs_fs_info *fs_info = trans->fs_info; |
---|
2277 | 2509 | struct btrfs_root *root = fs_info->chunk_root; |
---|
2278 | 2510 | struct btrfs_path *path; |
---|
2279 | 2511 | struct extent_buffer *leaf; |
---|
.. | .. |
---|
2357 | 2589 | u64 orig_super_num_devices; |
---|
2358 | 2590 | int seeding_dev = 0; |
---|
2359 | 2591 | int ret = 0; |
---|
2360 | | - bool unlocked = false; |
---|
| 2592 | + bool locked = false; |
---|
2361 | 2593 | |
---|
2362 | 2594 | if (sb_rdonly(sb) && !fs_devices->seeding) |
---|
2363 | 2595 | return -EROFS; |
---|
.. | .. |
---|
2371 | 2603 | seeding_dev = 1; |
---|
2372 | 2604 | down_write(&sb->s_umount); |
---|
2373 | 2605 | mutex_lock(&uuid_mutex); |
---|
| 2606 | + locked = true; |
---|
2374 | 2607 | } |
---|
2375 | 2608 | |
---|
2376 | | - filemap_write_and_wait(bdev->bd_inode->i_mapping); |
---|
| 2609 | + sync_blockdev(bdev); |
---|
2377 | 2610 | |
---|
2378 | | - mutex_lock(&fs_devices->device_list_mutex); |
---|
2379 | | - list_for_each_entry(device, &fs_devices->devices, dev_list) { |
---|
| 2611 | + rcu_read_lock(); |
---|
| 2612 | + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { |
---|
2380 | 2613 | if (device->bdev == bdev) { |
---|
2381 | 2614 | ret = -EEXIST; |
---|
2382 | | - mutex_unlock( |
---|
2383 | | - &fs_devices->device_list_mutex); |
---|
| 2615 | + rcu_read_unlock(); |
---|
2384 | 2616 | goto error; |
---|
2385 | 2617 | } |
---|
2386 | 2618 | } |
---|
2387 | | - mutex_unlock(&fs_devices->device_list_mutex); |
---|
| 2619 | + rcu_read_unlock(); |
---|
2388 | 2620 | |
---|
2389 | 2621 | device = btrfs_alloc_device(fs_info, NULL, NULL); |
---|
2390 | 2622 | if (IS_ERR(device)) { |
---|
.. | .. |
---|
2448 | 2680 | atomic64_add(device->total_bytes, &fs_info->free_chunk_space); |
---|
2449 | 2681 | |
---|
2450 | 2682 | if (!blk_queue_nonrot(q)) |
---|
2451 | | - fs_devices->rotating = 1; |
---|
| 2683 | + fs_devices->rotating = true; |
---|
2452 | 2684 | |
---|
2453 | 2685 | orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); |
---|
2454 | 2686 | btrfs_set_super_total_bytes(fs_info->super_copy, |
---|
.. | .. |
---|
2468 | 2700 | mutex_unlock(&fs_info->chunk_mutex); |
---|
2469 | 2701 | |
---|
2470 | 2702 | /* Add sysfs device entry */ |
---|
2471 | | - btrfs_sysfs_add_device_link(fs_devices, device); |
---|
| 2703 | + btrfs_sysfs_add_device(device); |
---|
2472 | 2704 | |
---|
2473 | 2705 | mutex_unlock(&fs_devices->device_list_mutex); |
---|
2474 | 2706 | |
---|
2475 | 2707 | if (seeding_dev) { |
---|
2476 | 2708 | mutex_lock(&fs_info->chunk_mutex); |
---|
2477 | | - ret = init_first_rw_device(trans, fs_info); |
---|
| 2709 | + ret = init_first_rw_device(trans); |
---|
2478 | 2710 | mutex_unlock(&fs_info->chunk_mutex); |
---|
2479 | 2711 | if (ret) { |
---|
2480 | 2712 | btrfs_abort_transaction(trans, ret); |
---|
.. | .. |
---|
2489 | 2721 | } |
---|
2490 | 2722 | |
---|
2491 | 2723 | if (seeding_dev) { |
---|
2492 | | - char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; |
---|
2493 | | - |
---|
2494 | | - ret = btrfs_finish_sprout(trans, fs_info); |
---|
| 2724 | + ret = btrfs_finish_sprout(trans); |
---|
2495 | 2725 | if (ret) { |
---|
2496 | 2726 | btrfs_abort_transaction(trans, ret); |
---|
2497 | 2727 | goto error_sysfs; |
---|
2498 | 2728 | } |
---|
2499 | 2729 | |
---|
2500 | | - /* Sprouting would change fsid of the mounted root, |
---|
2501 | | - * so rename the fsid on the sysfs |
---|
| 2730 | + /* |
---|
| 2731 | + * fs_devices now represents the newly sprouted filesystem and |
---|
| 2732 | + * its fsid has been changed by btrfs_prepare_sprout |
---|
2502 | 2733 | */ |
---|
2503 | | - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", |
---|
2504 | | - fs_info->fsid); |
---|
2505 | | - if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) |
---|
2506 | | - btrfs_warn(fs_info, |
---|
2507 | | - "sysfs: failed to create fsid for sprout"); |
---|
| 2734 | + btrfs_sysfs_update_sprout_fsid(fs_devices); |
---|
2508 | 2735 | } |
---|
2509 | 2736 | |
---|
2510 | 2737 | ret = btrfs_commit_transaction(trans); |
---|
.. | .. |
---|
2512 | 2739 | if (seeding_dev) { |
---|
2513 | 2740 | mutex_unlock(&uuid_mutex); |
---|
2514 | 2741 | up_write(&sb->s_umount); |
---|
2515 | | - unlocked = true; |
---|
| 2742 | + locked = false; |
---|
2516 | 2743 | |
---|
2517 | 2744 | if (ret) /* transaction commit */ |
---|
2518 | 2745 | return ret; |
---|
.. | .. |
---|
2532 | 2759 | ret = btrfs_commit_transaction(trans); |
---|
2533 | 2760 | } |
---|
2534 | 2761 | |
---|
2535 | | - /* Update ctime/mtime for libblkid */ |
---|
| 2762 | + /* |
---|
| 2763 | + * Now that we have written a new super block to this device, check all |
---|
| 2764 | + * other fs_devices list if device_path alienates any other scanned |
---|
| 2765 | + * device. |
---|
| 2766 | + * We can ignore the return value as it typically returns -EINVAL and |
---|
| 2767 | + * only succeeds if the device was an alien. |
---|
| 2768 | + */ |
---|
| 2769 | + btrfs_forget_devices(device_path); |
---|
| 2770 | + |
---|
| 2771 | + /* Update ctime/mtime for blkid or udev */ |
---|
2536 | 2772 | update_dev_time(device_path); |
---|
| 2773 | + |
---|
2537 | 2774 | return ret; |
---|
2538 | 2775 | |
---|
2539 | 2776 | error_sysfs: |
---|
2540 | | - btrfs_sysfs_rm_device_link(fs_devices, device); |
---|
| 2777 | + btrfs_sysfs_remove_device(device); |
---|
2541 | 2778 | mutex_lock(&fs_info->fs_devices->device_list_mutex); |
---|
2542 | 2779 | mutex_lock(&fs_info->chunk_mutex); |
---|
2543 | 2780 | list_del_rcu(&device->dev_list); |
---|
.. | .. |
---|
2563 | 2800 | btrfs_free_device(device); |
---|
2564 | 2801 | error: |
---|
2565 | 2802 | blkdev_put(bdev, FMODE_EXCL); |
---|
2566 | | - if (seeding_dev && !unlocked) { |
---|
| 2803 | + if (locked) { |
---|
2567 | 2804 | mutex_unlock(&uuid_mutex); |
---|
2568 | 2805 | up_write(&sb->s_umount); |
---|
2569 | 2806 | } |
---|
.. | .. |
---|
2621 | 2858 | { |
---|
2622 | 2859 | struct btrfs_fs_info *fs_info = device->fs_info; |
---|
2623 | 2860 | struct btrfs_super_block *super_copy = fs_info->super_copy; |
---|
2624 | | - struct btrfs_fs_devices *fs_devices; |
---|
2625 | 2861 | u64 old_total; |
---|
2626 | 2862 | u64 diff; |
---|
2627 | 2863 | |
---|
.. | .. |
---|
2640 | 2876 | return -EINVAL; |
---|
2641 | 2877 | } |
---|
2642 | 2878 | |
---|
2643 | | - fs_devices = fs_info->fs_devices; |
---|
2644 | | - |
---|
2645 | 2879 | btrfs_set_super_total_bytes(super_copy, |
---|
2646 | 2880 | round_down(old_total + diff, fs_info->sectorsize)); |
---|
2647 | 2881 | device->fs_devices->total_rw_bytes += diff; |
---|
.. | .. |
---|
2649 | 2883 | btrfs_device_set_total_bytes(device, new_size); |
---|
2650 | 2884 | btrfs_device_set_disk_total_bytes(device, new_size); |
---|
2651 | 2885 | btrfs_clear_space_info_full(device->fs_info); |
---|
2652 | | - if (list_empty(&device->resized_list)) |
---|
2653 | | - list_add_tail(&device->resized_list, |
---|
2654 | | - &fs_devices->resized_devices); |
---|
| 2886 | + if (list_empty(&device->post_commit_list)) |
---|
| 2887 | + list_add_tail(&device->post_commit_list, |
---|
| 2888 | + &trans->transaction->dev_update_list); |
---|
2655 | 2889 | mutex_unlock(&fs_info->chunk_mutex); |
---|
2656 | 2890 | |
---|
2657 | 2891 | return btrfs_update_device(trans, device); |
---|
.. | .. |
---|
2739 | 2973 | return ret; |
---|
2740 | 2974 | } |
---|
2741 | 2975 | |
---|
2742 | | -static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, |
---|
2743 | | - u64 logical, u64 length) |
---|
| 2976 | +/* |
---|
| 2977 | + * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. |
---|
| 2978 | + * @logical: Logical block offset in bytes. |
---|
| 2979 | + * @length: Length of extent in bytes. |
---|
| 2980 | + * |
---|
| 2981 | + * Return: Chunk mapping or ERR_PTR. |
---|
| 2982 | + */ |
---|
| 2983 | +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, |
---|
| 2984 | + u64 logical, u64 length) |
---|
2744 | 2985 | { |
---|
2745 | 2986 | struct extent_map_tree *em_tree; |
---|
2746 | 2987 | struct extent_map *em; |
---|
2747 | 2988 | |
---|
2748 | | - em_tree = &fs_info->mapping_tree.map_tree; |
---|
| 2989 | + em_tree = &fs_info->mapping_tree; |
---|
2749 | 2990 | read_lock(&em_tree->lock); |
---|
2750 | 2991 | em = lookup_extent_mapping(em_tree, logical, length); |
---|
2751 | 2992 | read_unlock(&em_tree->lock); |
---|
.. | .. |
---|
2777 | 3018 | int i, ret = 0; |
---|
2778 | 3019 | struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
---|
2779 | 3020 | |
---|
2780 | | - em = get_chunk_map(fs_info, chunk_offset, 1); |
---|
| 3021 | + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); |
---|
2781 | 3022 | if (IS_ERR(em)) { |
---|
2782 | 3023 | /* |
---|
2783 | 3024 | * This is a logic error, but we don't want to just rely on the |
---|
.. | .. |
---|
2818 | 3059 | mutex_unlock(&fs_info->chunk_mutex); |
---|
2819 | 3060 | } |
---|
2820 | 3061 | |
---|
2821 | | - if (map->stripes[i].dev) { |
---|
2822 | | - ret = btrfs_update_device(trans, map->stripes[i].dev); |
---|
2823 | | - if (ret) { |
---|
2824 | | - mutex_unlock(&fs_devices->device_list_mutex); |
---|
2825 | | - btrfs_abort_transaction(trans, ret); |
---|
2826 | | - goto out; |
---|
2827 | | - } |
---|
| 3062 | + ret = btrfs_update_device(trans, device); |
---|
| 3063 | + if (ret) { |
---|
| 3064 | + mutex_unlock(&fs_devices->device_list_mutex); |
---|
| 3065 | + btrfs_abort_transaction(trans, ret); |
---|
| 3066 | + goto out; |
---|
2828 | 3067 | } |
---|
2829 | 3068 | } |
---|
2830 | 3069 | mutex_unlock(&fs_devices->device_list_mutex); |
---|
.. | .. |
---|
2861 | 3100 | { |
---|
2862 | 3101 | struct btrfs_root *root = fs_info->chunk_root; |
---|
2863 | 3102 | struct btrfs_trans_handle *trans; |
---|
| 3103 | + struct btrfs_block_group *block_group; |
---|
2864 | 3104 | int ret; |
---|
2865 | 3105 | |
---|
2866 | 3106 | /* |
---|
.. | .. |
---|
2877 | 3117 | */ |
---|
2878 | 3118 | lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); |
---|
2879 | 3119 | |
---|
2880 | | - ret = btrfs_can_relocate(fs_info, chunk_offset); |
---|
2881 | | - if (ret) |
---|
2882 | | - return -ENOSPC; |
---|
2883 | | - |
---|
2884 | 3120 | /* step one, relocate all the extents inside this chunk */ |
---|
2885 | 3121 | btrfs_scrub_pause(fs_info); |
---|
2886 | 3122 | ret = btrfs_relocate_block_group(fs_info, chunk_offset); |
---|
.. | .. |
---|
2888 | 3124 | if (ret) |
---|
2889 | 3125 | return ret; |
---|
2890 | 3126 | |
---|
2891 | | - /* |
---|
2892 | | - * We add the kobjects here (and after forcing data chunk creation) |
---|
2893 | | - * since relocation is the only place we'll create chunks of a new |
---|
2894 | | - * type at runtime. The only place where we'll remove the last |
---|
2895 | | - * chunk of a type is the call immediately below this one. Even |
---|
2896 | | - * so, we're protected against races with the cleaner thread since |
---|
2897 | | - * we're covered by the delete_unused_bgs_mutex. |
---|
2898 | | - */ |
---|
2899 | | - btrfs_add_raid_kobjects(fs_info); |
---|
| 3127 | + block_group = btrfs_lookup_block_group(fs_info, chunk_offset); |
---|
| 3128 | + if (!block_group) |
---|
| 3129 | + return -ENOENT; |
---|
| 3130 | + btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); |
---|
| 3131 | + btrfs_put_block_group(block_group); |
---|
2900 | 3132 | |
---|
2901 | 3133 | trans = btrfs_start_trans_remove_block_group(root->fs_info, |
---|
2902 | 3134 | chunk_offset); |
---|
.. | .. |
---|
2997 | 3229 | static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, |
---|
2998 | 3230 | u64 chunk_offset) |
---|
2999 | 3231 | { |
---|
3000 | | - struct btrfs_block_group_cache *cache; |
---|
| 3232 | + struct btrfs_block_group *cache; |
---|
3001 | 3233 | u64 bytes_used; |
---|
3002 | 3234 | u64 chunk_type; |
---|
3003 | 3235 | |
---|
.. | .. |
---|
3006 | 3238 | chunk_type = cache->flags; |
---|
3007 | 3239 | btrfs_put_block_group(cache); |
---|
3008 | 3240 | |
---|
3009 | | - if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { |
---|
3010 | | - spin_lock(&fs_info->data_sinfo->lock); |
---|
3011 | | - bytes_used = fs_info->data_sinfo->bytes_used; |
---|
3012 | | - spin_unlock(&fs_info->data_sinfo->lock); |
---|
| 3241 | + if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) |
---|
| 3242 | + return 0; |
---|
3013 | 3243 | |
---|
3014 | | - if (!bytes_used) { |
---|
3015 | | - struct btrfs_trans_handle *trans; |
---|
3016 | | - int ret; |
---|
| 3244 | + spin_lock(&fs_info->data_sinfo->lock); |
---|
| 3245 | + bytes_used = fs_info->data_sinfo->bytes_used; |
---|
| 3246 | + spin_unlock(&fs_info->data_sinfo->lock); |
---|
3017 | 3247 | |
---|
3018 | | - trans = btrfs_join_transaction(fs_info->tree_root); |
---|
3019 | | - if (IS_ERR(trans)) |
---|
3020 | | - return PTR_ERR(trans); |
---|
| 3248 | + if (!bytes_used) { |
---|
| 3249 | + struct btrfs_trans_handle *trans; |
---|
| 3250 | + int ret; |
---|
3021 | 3251 | |
---|
3022 | | - ret = btrfs_force_chunk_alloc(trans, |
---|
3023 | | - BTRFS_BLOCK_GROUP_DATA); |
---|
3024 | | - btrfs_end_transaction(trans); |
---|
3025 | | - if (ret < 0) |
---|
3026 | | - return ret; |
---|
| 3252 | + trans = btrfs_join_transaction(fs_info->tree_root); |
---|
| 3253 | + if (IS_ERR(trans)) |
---|
| 3254 | + return PTR_ERR(trans); |
---|
3027 | 3255 | |
---|
3028 | | - btrfs_add_raid_kobjects(fs_info); |
---|
3029 | | - |
---|
3030 | | - return 1; |
---|
3031 | | - } |
---|
| 3256 | + ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); |
---|
| 3257 | + btrfs_end_transaction(trans); |
---|
| 3258 | + if (ret < 0) |
---|
| 3259 | + return ret; |
---|
| 3260 | + return 1; |
---|
3032 | 3261 | } |
---|
| 3262 | + |
---|
3033 | 3263 | return 0; |
---|
3034 | 3264 | } |
---|
3035 | 3265 | |
---|
.. | .. |
---|
3099 | 3329 | if (!path) |
---|
3100 | 3330 | return -ENOMEM; |
---|
3101 | 3331 | |
---|
3102 | | - trans = btrfs_start_transaction(root, 0); |
---|
| 3332 | + trans = btrfs_start_transaction_fallback_global_rsv(root, 0); |
---|
3103 | 3333 | if (IS_ERR(trans)) { |
---|
3104 | 3334 | btrfs_free_path(path); |
---|
3105 | 3335 | return PTR_ERR(trans); |
---|
.. | .. |
---|
3208 | 3438 | static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, |
---|
3209 | 3439 | struct btrfs_balance_args *bargs) |
---|
3210 | 3440 | { |
---|
3211 | | - struct btrfs_block_group_cache *cache; |
---|
| 3441 | + struct btrfs_block_group *cache; |
---|
3212 | 3442 | u64 chunk_used; |
---|
3213 | 3443 | u64 user_thresh_min; |
---|
3214 | 3444 | u64 user_thresh_max; |
---|
3215 | 3445 | int ret = 1; |
---|
3216 | 3446 | |
---|
3217 | 3447 | cache = btrfs_lookup_block_group(fs_info, chunk_offset); |
---|
3218 | | - chunk_used = btrfs_block_group_used(&cache->item); |
---|
| 3448 | + chunk_used = cache->used; |
---|
3219 | 3449 | |
---|
3220 | 3450 | if (bargs->usage_min == 0) |
---|
3221 | 3451 | user_thresh_min = 0; |
---|
3222 | 3452 | else |
---|
3223 | | - user_thresh_min = div_factor_fine(cache->key.offset, |
---|
3224 | | - bargs->usage_min); |
---|
| 3453 | + user_thresh_min = div_factor_fine(cache->length, |
---|
| 3454 | + bargs->usage_min); |
---|
3225 | 3455 | |
---|
3226 | 3456 | if (bargs->usage_max == 0) |
---|
3227 | 3457 | user_thresh_max = 1; |
---|
3228 | 3458 | else if (bargs->usage_max > 100) |
---|
3229 | | - user_thresh_max = cache->key.offset; |
---|
| 3459 | + user_thresh_max = cache->length; |
---|
3230 | 3460 | else |
---|
3231 | | - user_thresh_max = div_factor_fine(cache->key.offset, |
---|
3232 | | - bargs->usage_max); |
---|
| 3461 | + user_thresh_max = div_factor_fine(cache->length, |
---|
| 3462 | + bargs->usage_max); |
---|
3233 | 3463 | |
---|
3234 | 3464 | if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) |
---|
3235 | 3465 | ret = 0; |
---|
.. | .. |
---|
3241 | 3471 | static int chunk_usage_filter(struct btrfs_fs_info *fs_info, |
---|
3242 | 3472 | u64 chunk_offset, struct btrfs_balance_args *bargs) |
---|
3243 | 3473 | { |
---|
3244 | | - struct btrfs_block_group_cache *cache; |
---|
| 3474 | + struct btrfs_block_group *cache; |
---|
3245 | 3475 | u64 chunk_used, user_thresh; |
---|
3246 | 3476 | int ret = 1; |
---|
3247 | 3477 | |
---|
3248 | 3478 | cache = btrfs_lookup_block_group(fs_info, chunk_offset); |
---|
3249 | | - chunk_used = btrfs_block_group_used(&cache->item); |
---|
| 3479 | + chunk_used = cache->used; |
---|
3250 | 3480 | |
---|
3251 | 3481 | if (bargs->usage_min == 0) |
---|
3252 | 3482 | user_thresh = 1; |
---|
3253 | 3483 | else if (bargs->usage > 100) |
---|
3254 | | - user_thresh = cache->key.offset; |
---|
| 3484 | + user_thresh = cache->length; |
---|
3255 | 3485 | else |
---|
3256 | | - user_thresh = div_factor_fine(cache->key.offset, |
---|
3257 | | - bargs->usage); |
---|
| 3486 | + user_thresh = div_factor_fine(cache->length, bargs->usage); |
---|
3258 | 3487 | |
---|
3259 | 3488 | if (chunk_used < user_thresh) |
---|
3260 | 3489 | ret = 0; |
---|
.. | .. |
---|
3280 | 3509 | return 1; |
---|
3281 | 3510 | } |
---|
3282 | 3511 | |
---|
| 3512 | +static u64 calc_data_stripes(u64 type, int num_stripes) |
---|
| 3513 | +{ |
---|
| 3514 | + const int index = btrfs_bg_flags_to_raid_index(type); |
---|
| 3515 | + const int ncopies = btrfs_raid_array[index].ncopies; |
---|
| 3516 | + const int nparity = btrfs_raid_array[index].nparity; |
---|
| 3517 | + |
---|
| 3518 | + if (nparity) |
---|
| 3519 | + return num_stripes - nparity; |
---|
| 3520 | + else |
---|
| 3521 | + return num_stripes / ncopies; |
---|
| 3522 | +} |
---|
| 3523 | + |
---|
3283 | 3524 | /* [pstart, pend) */ |
---|
3284 | 3525 | static int chunk_drange_filter(struct extent_buffer *leaf, |
---|
3285 | 3526 | struct btrfs_chunk *chunk, |
---|
.. | .. |
---|
3289 | 3530 | int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); |
---|
3290 | 3531 | u64 stripe_offset; |
---|
3291 | 3532 | u64 stripe_length; |
---|
| 3533 | + u64 type; |
---|
3292 | 3534 | int factor; |
---|
3293 | 3535 | int i; |
---|
3294 | 3536 | |
---|
3295 | 3537 | if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) |
---|
3296 | 3538 | return 0; |
---|
3297 | 3539 | |
---|
3298 | | - if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | |
---|
3299 | | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { |
---|
3300 | | - factor = num_stripes / 2; |
---|
3301 | | - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { |
---|
3302 | | - factor = num_stripes - 1; |
---|
3303 | | - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { |
---|
3304 | | - factor = num_stripes - 2; |
---|
3305 | | - } else { |
---|
3306 | | - factor = num_stripes; |
---|
3307 | | - } |
---|
| 3540 | + type = btrfs_chunk_type(leaf, chunk); |
---|
| 3541 | + factor = calc_data_stripes(type, num_stripes); |
---|
3308 | 3542 | |
---|
3309 | 3543 | for (i = 0; i < num_stripes; i++) { |
---|
3310 | 3544 | stripe = btrfs_stripe_nr(chunk, i); |
---|
.. | .. |
---|
3365 | 3599 | return 0; |
---|
3366 | 3600 | } |
---|
3367 | 3601 | |
---|
3368 | | -static int should_balance_chunk(struct btrfs_fs_info *fs_info, |
---|
3369 | | - struct extent_buffer *leaf, |
---|
| 3602 | +static int should_balance_chunk(struct extent_buffer *leaf, |
---|
3370 | 3603 | struct btrfs_chunk *chunk, u64 chunk_offset) |
---|
3371 | 3604 | { |
---|
| 3605 | + struct btrfs_fs_info *fs_info = leaf->fs_info; |
---|
3372 | 3606 | struct btrfs_balance_control *bctl = fs_info->balance_ctl; |
---|
3373 | 3607 | struct btrfs_balance_args *bargs = NULL; |
---|
3374 | 3608 | u64 chunk_type = btrfs_chunk_type(leaf, chunk); |
---|
.. | .. |
---|
3458 | 3692 | { |
---|
3459 | 3693 | struct btrfs_balance_control *bctl = fs_info->balance_ctl; |
---|
3460 | 3694 | struct btrfs_root *chunk_root = fs_info->chunk_root; |
---|
3461 | | - struct btrfs_root *dev_root = fs_info->dev_root; |
---|
3462 | | - struct list_head *devices; |
---|
3463 | | - struct btrfs_device *device; |
---|
3464 | | - u64 old_size; |
---|
3465 | | - u64 size_to_free; |
---|
3466 | 3695 | u64 chunk_type; |
---|
3467 | 3696 | struct btrfs_chunk *chunk; |
---|
3468 | 3697 | struct btrfs_path *path = NULL; |
---|
3469 | 3698 | struct btrfs_key key; |
---|
3470 | 3699 | struct btrfs_key found_key; |
---|
3471 | | - struct btrfs_trans_handle *trans; |
---|
3472 | 3700 | struct extent_buffer *leaf; |
---|
3473 | 3701 | int slot; |
---|
3474 | 3702 | int ret; |
---|
.. | .. |
---|
3483 | 3711 | u32 count_sys = 0; |
---|
3484 | 3712 | int chunk_reserved = 0; |
---|
3485 | 3713 | |
---|
3486 | | - /* step one make some room on all the devices */ |
---|
3487 | | - devices = &fs_info->fs_devices->devices; |
---|
3488 | | - list_for_each_entry(device, devices, dev_list) { |
---|
3489 | | - old_size = btrfs_device_get_total_bytes(device); |
---|
3490 | | - size_to_free = div_factor(old_size, 1); |
---|
3491 | | - size_to_free = min_t(u64, size_to_free, SZ_1M); |
---|
3492 | | - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || |
---|
3493 | | - btrfs_device_get_total_bytes(device) - |
---|
3494 | | - btrfs_device_get_bytes_used(device) > size_to_free || |
---|
3495 | | - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) |
---|
3496 | | - continue; |
---|
3497 | | - |
---|
3498 | | - ret = btrfs_shrink_device(device, old_size - size_to_free); |
---|
3499 | | - if (ret == -ENOSPC) |
---|
3500 | | - break; |
---|
3501 | | - if (ret) { |
---|
3502 | | - /* btrfs_shrink_device never returns ret > 0 */ |
---|
3503 | | - WARN_ON(ret > 0); |
---|
3504 | | - goto error; |
---|
3505 | | - } |
---|
3506 | | - |
---|
3507 | | - trans = btrfs_start_transaction(dev_root, 0); |
---|
3508 | | - if (IS_ERR(trans)) { |
---|
3509 | | - ret = PTR_ERR(trans); |
---|
3510 | | - btrfs_info_in_rcu(fs_info, |
---|
3511 | | - "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", |
---|
3512 | | - rcu_str_deref(device->name), ret, |
---|
3513 | | - old_size, old_size - size_to_free); |
---|
3514 | | - goto error; |
---|
3515 | | - } |
---|
3516 | | - |
---|
3517 | | - ret = btrfs_grow_device(trans, device, old_size); |
---|
3518 | | - if (ret) { |
---|
3519 | | - btrfs_end_transaction(trans); |
---|
3520 | | - /* btrfs_grow_device never returns ret > 0 */ |
---|
3521 | | - WARN_ON(ret > 0); |
---|
3522 | | - btrfs_info_in_rcu(fs_info, |
---|
3523 | | - "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", |
---|
3524 | | - rcu_str_deref(device->name), ret, |
---|
3525 | | - old_size, old_size - size_to_free); |
---|
3526 | | - goto error; |
---|
3527 | | - } |
---|
3528 | | - |
---|
3529 | | - btrfs_end_transaction(trans); |
---|
3530 | | - } |
---|
3531 | | - |
---|
3532 | | - /* step two, relocate all the chunks */ |
---|
3533 | 3714 | path = btrfs_alloc_path(); |
---|
3534 | 3715 | if (!path) { |
---|
3535 | 3716 | ret = -ENOMEM; |
---|
.. | .. |
---|
3601 | 3782 | spin_unlock(&fs_info->balance_lock); |
---|
3602 | 3783 | } |
---|
3603 | 3784 | |
---|
3604 | | - ret = should_balance_chunk(fs_info, leaf, chunk, |
---|
3605 | | - found_key.offset); |
---|
| 3785 | + ret = should_balance_chunk(leaf, chunk, found_key.offset); |
---|
3606 | 3786 | |
---|
3607 | 3787 | btrfs_release_path(path); |
---|
3608 | 3788 | if (!ret) { |
---|
.. | .. |
---|
3659 | 3839 | |
---|
3660 | 3840 | ret = btrfs_relocate_chunk(fs_info, found_key.offset); |
---|
3661 | 3841 | mutex_unlock(&fs_info->delete_unused_bgs_mutex); |
---|
3662 | | - if (ret && ret != -ENOSPC) |
---|
3663 | | - goto error; |
---|
3664 | 3842 | if (ret == -ENOSPC) { |
---|
3665 | 3843 | enospc_errors++; |
---|
| 3844 | + } else if (ret == -ETXTBSY) { |
---|
| 3845 | + btrfs_info(fs_info, |
---|
| 3846 | + "skipping relocation of block group %llu due to active swapfile", |
---|
| 3847 | + found_key.offset); |
---|
| 3848 | + ret = 0; |
---|
| 3849 | + } else if (ret) { |
---|
| 3850 | + goto error; |
---|
3666 | 3851 | } else { |
---|
3667 | 3852 | spin_lock(&fs_info->balance_lock); |
---|
3668 | 3853 | bctl->stat.completed++; |
---|
.. | .. |
---|
3711 | 3896 | if (flags == 0) |
---|
3712 | 3897 | return !extended; /* "0" is valid for usual profiles */ |
---|
3713 | 3898 | |
---|
3714 | | - /* true if exactly one bit set */ |
---|
3715 | | - return (flags & (flags - 1)) == 0; |
---|
| 3899 | + return has_single_bit_set(flags); |
---|
3716 | 3900 | } |
---|
3717 | 3901 | |
---|
3718 | 3902 | static inline int balance_need_close(struct btrfs_fs_info *fs_info) |
---|
.. | .. |
---|
3723 | 3907 | atomic_read(&fs_info->balance_cancel_req) == 0); |
---|
3724 | 3908 | } |
---|
3725 | 3909 | |
---|
3726 | | -/* Non-zero return value signifies invalidity */ |
---|
3727 | | -static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, |
---|
3728 | | - u64 allowed) |
---|
| 3910 | +/* |
---|
| 3911 | + * Validate target profile against allowed profiles and return true if it's OK. |
---|
| 3912 | + * Otherwise print the error message and return false. |
---|
| 3913 | + */ |
---|
| 3914 | +static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, |
---|
| 3915 | + const struct btrfs_balance_args *bargs, |
---|
| 3916 | + u64 allowed, const char *type) |
---|
3729 | 3917 | { |
---|
3730 | | - return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && |
---|
3731 | | - (!alloc_profile_is_valid(bctl_arg->target, 1) || |
---|
3732 | | - (bctl_arg->target & ~allowed))); |
---|
| 3918 | + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) |
---|
| 3919 | + return true; |
---|
| 3920 | + |
---|
| 3921 | + /* Profile is valid and does not have bits outside of the allowed set */ |
---|
| 3922 | + if (alloc_profile_is_valid(bargs->target, 1) && |
---|
| 3923 | + (bargs->target & ~allowed) == 0) |
---|
| 3924 | + return true; |
---|
| 3925 | + |
---|
| 3926 | + btrfs_err(fs_info, "balance: invalid convert %s profile %s", |
---|
| 3927 | + type, btrfs_bg_type_to_raid_name(bargs->target)); |
---|
| 3928 | + return false; |
---|
| 3929 | +} |
---|
| 3930 | + |
---|
/*
 * Fill @buf with textual description of balance filter flags @bargs, up to
 * @size_buf including the terminating null. The output may be trimmed if it
 * does not fit into the provided buffer.
 */
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
				  u32 size_buf)
{
	int ret;
	u32 size_bp = size_buf;		/* bytes still free in @buf, incl. NUL */
	char *bp = buf;			/* current write position */
	u64 flags = bargs->flags;
	char tmp_buf[128] = {'\0'};	/* scratch for the profiles= value */

	/* No filter flags set: leave @buf untouched (caller provided it zeroed) */
	if (!flags)
		return;

/*
 * The CHECK_APPEND_* helpers snprintf() one "name=value," token at the
 * current position and advance bp/size_bp.  snprintf returning >= size_bp
 * means the token was truncated, in which case we bail out to
 * out_overflow which trims the output cleanly.
 */
#define CHECK_APPEND_NOARG(a)						\
	do {								\
		ret = snprintf(bp, size_bp, (a));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_2ARG(a, v1, v2)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
		CHECK_APPEND_1ARG("convert=%s,",
				  btrfs_bg_type_to_raid_name(bargs->target));

	if (flags & BTRFS_BALANCE_ARGS_SOFT)
		CHECK_APPEND_NOARG("soft,");

	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
					    sizeof(tmp_buf));
		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
	}

	if (flags & BTRFS_BALANCE_ARGS_USAGE)
		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);

	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
		CHECK_APPEND_2ARG("usage=%u..%u,",
				  bargs->usage_min, bargs->usage_max);

	if (flags & BTRFS_BALANCE_ARGS_DEVID)
		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);

	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
		CHECK_APPEND_2ARG("drange=%llu..%llu,",
				  bargs->pstart, bargs->pend);

	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
				  bargs->vstart, bargs->vend);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
		CHECK_APPEND_2ARG("limit=%u..%u,",
				  bargs->limit_min, bargs->limit_max);

	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
		CHECK_APPEND_2ARG("stripes=%u..%u,",
				  bargs->stripes_min, bargs->stripes_max);

#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG

out_overflow:

	/*
	 * Reached either by falling through (every token fit) or by a
	 * truncated snprintf above.  If anything was written, size_bp <
	 * size_buf and the last written character is the trailing comma
	 * of the final complete token -- overwrite it with the NUL.
	 * Otherwise nothing fit at all, so return an empty string.
	 */
	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
	else
		buf[0] = '\0';
}
---|
| 4028 | + |
---|
| 4029 | +static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) |
---|
| 4030 | +{ |
---|
| 4031 | + u32 size_buf = 1024; |
---|
| 4032 | + char tmp_buf[192] = {'\0'}; |
---|
| 4033 | + char *buf; |
---|
| 4034 | + char *bp; |
---|
| 4035 | + u32 size_bp = size_buf; |
---|
| 4036 | + int ret; |
---|
| 4037 | + struct btrfs_balance_control *bctl = fs_info->balance_ctl; |
---|
| 4038 | + |
---|
| 4039 | + buf = kzalloc(size_buf, GFP_KERNEL); |
---|
| 4040 | + if (!buf) |
---|
| 4041 | + return; |
---|
| 4042 | + |
---|
| 4043 | + bp = buf; |
---|
| 4044 | + |
---|
| 4045 | +#define CHECK_APPEND_1ARG(a, v1) \ |
---|
| 4046 | + do { \ |
---|
| 4047 | + ret = snprintf(bp, size_bp, (a), (v1)); \ |
---|
| 4048 | + if (ret < 0 || ret >= size_bp) \ |
---|
| 4049 | + goto out_overflow; \ |
---|
| 4050 | + size_bp -= ret; \ |
---|
| 4051 | + bp += ret; \ |
---|
| 4052 | + } while (0) |
---|
| 4053 | + |
---|
| 4054 | + if (bctl->flags & BTRFS_BALANCE_FORCE) |
---|
| 4055 | + CHECK_APPEND_1ARG("%s", "-f "); |
---|
| 4056 | + |
---|
| 4057 | + if (bctl->flags & BTRFS_BALANCE_DATA) { |
---|
| 4058 | + describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); |
---|
| 4059 | + CHECK_APPEND_1ARG("-d%s ", tmp_buf); |
---|
| 4060 | + } |
---|
| 4061 | + |
---|
| 4062 | + if (bctl->flags & BTRFS_BALANCE_METADATA) { |
---|
| 4063 | + describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); |
---|
| 4064 | + CHECK_APPEND_1ARG("-m%s ", tmp_buf); |
---|
| 4065 | + } |
---|
| 4066 | + |
---|
| 4067 | + if (bctl->flags & BTRFS_BALANCE_SYSTEM) { |
---|
| 4068 | + describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); |
---|
| 4069 | + CHECK_APPEND_1ARG("-s%s ", tmp_buf); |
---|
| 4070 | + } |
---|
| 4071 | + |
---|
| 4072 | +#undef CHECK_APPEND_1ARG |
---|
| 4073 | + |
---|
| 4074 | +out_overflow: |
---|
| 4075 | + |
---|
| 4076 | + if (size_bp < size_buf) |
---|
| 4077 | + buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ |
---|
| 4078 | + btrfs_info(fs_info, "balance: %s %s", |
---|
| 4079 | + (bctl->flags & BTRFS_BALANCE_RESUME) ? |
---|
| 4080 | + "resume" : "start", buf); |
---|
| 4081 | + |
---|
| 4082 | + kfree(buf); |
---|
3733 | 4083 | } |
---|
3734 | 4084 | |
---|
3735 | 4085 | /* |
---|
.. | .. |
---|
3745 | 4095 | int ret; |
---|
3746 | 4096 | u64 num_devices; |
---|
3747 | 4097 | unsigned seq; |
---|
3748 | | - bool reducing_integrity; |
---|
| 4098 | + bool reducing_redundancy; |
---|
| 4099 | + int i; |
---|
3749 | 4100 | |
---|
3750 | 4101 | if (btrfs_fs_closing(fs_info) || |
---|
3751 | 4102 | atomic_read(&fs_info->balance_pause_req) || |
---|
3752 | | - atomic_read(&fs_info->balance_cancel_req)) { |
---|
| 4103 | + btrfs_should_cancel_balance(fs_info)) { |
---|
3753 | 4104 | ret = -EINVAL; |
---|
3754 | 4105 | goto out; |
---|
3755 | 4106 | } |
---|
.. | .. |
---|
3774 | 4125 | } |
---|
3775 | 4126 | } |
---|
3776 | 4127 | |
---|
3777 | | - num_devices = fs_info->fs_devices->num_devices; |
---|
3778 | | - btrfs_dev_replace_read_lock(&fs_info->dev_replace); |
---|
3779 | | - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { |
---|
3780 | | - BUG_ON(num_devices < 1); |
---|
3781 | | - num_devices--; |
---|
3782 | | - } |
---|
3783 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
---|
3784 | | - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; |
---|
3785 | | - if (num_devices > 1) |
---|
3786 | | - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); |
---|
3787 | | - if (num_devices > 2) |
---|
3788 | | - allowed |= BTRFS_BLOCK_GROUP_RAID5; |
---|
3789 | | - if (num_devices > 3) |
---|
3790 | | - allowed |= (BTRFS_BLOCK_GROUP_RAID10 | |
---|
3791 | | - BTRFS_BLOCK_GROUP_RAID6); |
---|
3792 | | - if (validate_convert_profile(&bctl->data, allowed)) { |
---|
3793 | | - int index = btrfs_bg_flags_to_raid_index(bctl->data.target); |
---|
| 4128 | + /* |
---|
| 4129 | + * rw_devices will not change at the moment, device add/delete/replace |
---|
| 4130 | + * are exclusive |
---|
| 4131 | + */ |
---|
| 4132 | + num_devices = fs_info->fs_devices->rw_devices; |
---|
3794 | 4133 | |
---|
3795 | | - btrfs_err(fs_info, |
---|
3796 | | - "balance: invalid convert data profile %s", |
---|
3797 | | - get_raid_name(index)); |
---|
3798 | | - ret = -EINVAL; |
---|
3799 | | - goto out; |
---|
3800 | | - } |
---|
3801 | | - if (validate_convert_profile(&bctl->meta, allowed)) { |
---|
3802 | | - int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); |
---|
| 4134 | + /* |
---|
| 4135 | + * SINGLE profile on-disk has no profile bit, but in-memory we have a |
---|
| 4136 | + * special bit for it, to make it easier to distinguish. Thus we need |
---|
| 4137 | + * to set it manually, or balance would refuse the profile. |
---|
| 4138 | + */ |
---|
| 4139 | + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; |
---|
| 4140 | + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) |
---|
| 4141 | + if (num_devices >= btrfs_raid_array[i].devs_min) |
---|
| 4142 | + allowed |= btrfs_raid_array[i].bg_flag; |
---|
3803 | 4143 | |
---|
3804 | | - btrfs_err(fs_info, |
---|
3805 | | - "balance: invalid convert metadata profile %s", |
---|
3806 | | - get_raid_name(index)); |
---|
3807 | | - ret = -EINVAL; |
---|
3808 | | - goto out; |
---|
3809 | | - } |
---|
3810 | | - if (validate_convert_profile(&bctl->sys, allowed)) { |
---|
3811 | | - int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); |
---|
3812 | | - |
---|
3813 | | - btrfs_err(fs_info, |
---|
3814 | | - "balance: invalid convert system profile %s", |
---|
3815 | | - get_raid_name(index)); |
---|
| 4144 | + if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || |
---|
| 4145 | + !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || |
---|
| 4146 | + !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { |
---|
3816 | 4147 | ret = -EINVAL; |
---|
3817 | 4148 | goto out; |
---|
3818 | 4149 | } |
---|
3819 | 4150 | |
---|
3820 | | - /* allow to reduce meta or sys integrity only if force set */ |
---|
3821 | | - allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | |
---|
3822 | | - BTRFS_BLOCK_GROUP_RAID10 | |
---|
3823 | | - BTRFS_BLOCK_GROUP_RAID5 | |
---|
3824 | | - BTRFS_BLOCK_GROUP_RAID6; |
---|
| 4151 | + /* |
---|
| 4152 | + * Allow to reduce metadata or system integrity only if force set for |
---|
| 4153 | + * profiles with redundancy (copies, parity) |
---|
| 4154 | + */ |
---|
| 4155 | + allowed = 0; |
---|
| 4156 | + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { |
---|
| 4157 | + if (btrfs_raid_array[i].ncopies >= 2 || |
---|
| 4158 | + btrfs_raid_array[i].tolerated_failures >= 1) |
---|
| 4159 | + allowed |= btrfs_raid_array[i].bg_flag; |
---|
| 4160 | + } |
---|
3825 | 4161 | do { |
---|
3826 | 4162 | seq = read_seqbegin(&fs_info->profiles_lock); |
---|
3827 | 4163 | |
---|
.. | .. |
---|
3831 | 4167 | ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
---|
3832 | 4168 | (fs_info->avail_metadata_alloc_bits & allowed) && |
---|
3833 | 4169 | !(bctl->meta.target & allowed))) |
---|
3834 | | - reducing_integrity = true; |
---|
| 4170 | + reducing_redundancy = true; |
---|
3835 | 4171 | else |
---|
3836 | | - reducing_integrity = false; |
---|
| 4172 | + reducing_redundancy = false; |
---|
3837 | 4173 | |
---|
3838 | 4174 | /* if we're not converting, the target field is uninitialized */ |
---|
3839 | 4175 | meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? |
---|
.. | .. |
---|
3842 | 4178 | bctl->data.target : fs_info->avail_data_alloc_bits; |
---|
3843 | 4179 | } while (read_seqretry(&fs_info->profiles_lock, seq)); |
---|
3844 | 4180 | |
---|
3845 | | - if (reducing_integrity) { |
---|
| 4181 | + if (reducing_redundancy) { |
---|
3846 | 4182 | if (bctl->flags & BTRFS_BALANCE_FORCE) { |
---|
3847 | 4183 | btrfs_info(fs_info, |
---|
3848 | | - "balance: force reducing metadata integrity"); |
---|
| 4184 | + "balance: force reducing metadata redundancy"); |
---|
3849 | 4185 | } else { |
---|
3850 | 4186 | btrfs_err(fs_info, |
---|
3851 | | - "balance: reduces metadata integrity, use --force if you want this"); |
---|
| 4187 | + "balance: reduces metadata redundancy, use --force if you want this"); |
---|
3852 | 4188 | ret = -EINVAL; |
---|
3853 | 4189 | goto out; |
---|
3854 | 4190 | } |
---|
.. | .. |
---|
3856 | 4192 | |
---|
3857 | 4193 | if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < |
---|
3858 | 4194 | btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { |
---|
3859 | | - int meta_index = btrfs_bg_flags_to_raid_index(meta_target); |
---|
3860 | | - int data_index = btrfs_bg_flags_to_raid_index(data_target); |
---|
3861 | | - |
---|
3862 | 4195 | btrfs_warn(fs_info, |
---|
3863 | 4196 | "balance: metadata profile %s has lower redundancy than data profile %s", |
---|
3864 | | - get_raid_name(meta_index), get_raid_name(data_index)); |
---|
| 4197 | + btrfs_bg_type_to_raid_name(meta_target), |
---|
| 4198 | + btrfs_bg_type_to_raid_name(data_target)); |
---|
| 4199 | + } |
---|
| 4200 | + |
---|
| 4201 | + if (fs_info->send_in_progress) { |
---|
| 4202 | + btrfs_warn_rl(fs_info, |
---|
| 4203 | +"cannot run balance while send operations are in progress (%d in progress)", |
---|
| 4204 | + fs_info->send_in_progress); |
---|
| 4205 | + ret = -EAGAIN; |
---|
| 4206 | + goto out; |
---|
3865 | 4207 | } |
---|
3866 | 4208 | |
---|
3867 | 4209 | ret = insert_balance_item(fs_info, bctl); |
---|
.. | .. |
---|
3883 | 4225 | |
---|
3884 | 4226 | ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); |
---|
3885 | 4227 | set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); |
---|
| 4228 | + describe_balance_start_or_resume(fs_info); |
---|
3886 | 4229 | mutex_unlock(&fs_info->balance_mutex); |
---|
3887 | 4230 | |
---|
3888 | 4231 | ret = __btrfs_balance(fs_info); |
---|
3889 | 4232 | |
---|
3890 | 4233 | mutex_lock(&fs_info->balance_mutex); |
---|
| 4234 | + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) |
---|
| 4235 | + btrfs_info(fs_info, "balance: paused"); |
---|
| 4236 | + /* |
---|
| 4237 | + * Balance can be canceled by: |
---|
| 4238 | + * |
---|
| 4239 | + * - Regular cancel request |
---|
| 4240 | + * Then ret == -ECANCELED and balance_cancel_req > 0 |
---|
| 4241 | + * |
---|
| 4242 | + * - Fatal signal to "btrfs" process |
---|
| 4243 | + * Either the signal caught by wait_reserve_ticket() and callers |
---|
| 4244 | + * got -EINTR, or caught by btrfs_should_cancel_balance() and |
---|
| 4245 | + * got -ECANCELED. |
---|
| 4246 | + * Either way, in this case balance_cancel_req = 0, and |
---|
| 4247 | + * ret == -EINTR or ret == -ECANCELED. |
---|
| 4248 | + * |
---|
| 4249 | + * So here we only check the return value to catch canceled balance. |
---|
| 4250 | + */ |
---|
| 4251 | + else if (ret == -ECANCELED || ret == -EINTR) |
---|
| 4252 | + btrfs_info(fs_info, "balance: canceled"); |
---|
| 4253 | + else |
---|
| 4254 | + btrfs_info(fs_info, "balance: ended with status: %d", ret); |
---|
| 4255 | + |
---|
3891 | 4256 | clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); |
---|
3892 | 4257 | |
---|
3893 | 4258 | if (bargs) { |
---|
.. | .. |
---|
3898 | 4263 | if ((ret && ret != -ECANCELED && ret != -ENOSPC) || |
---|
3899 | 4264 | balance_need_close(fs_info)) { |
---|
3900 | 4265 | reset_balance_state(fs_info); |
---|
3901 | | - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); |
---|
| 4266 | + btrfs_exclop_finish(fs_info); |
---|
3902 | 4267 | } |
---|
3903 | 4268 | |
---|
3904 | 4269 | wake_up(&fs_info->balance_wait_q); |
---|
.. | .. |
---|
3909 | 4274 | reset_balance_state(fs_info); |
---|
3910 | 4275 | else |
---|
3911 | 4276 | kfree(bctl); |
---|
3912 | | - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); |
---|
| 4277 | + btrfs_exclop_finish(fs_info); |
---|
3913 | 4278 | |
---|
3914 | 4279 | return ret; |
---|
3915 | 4280 | } |
---|
.. | .. |
---|
3919 | 4284 | struct btrfs_fs_info *fs_info = data; |
---|
3920 | 4285 | int ret = 0; |
---|
3921 | 4286 | |
---|
| 4287 | + sb_start_write(fs_info->sb); |
---|
3922 | 4288 | mutex_lock(&fs_info->balance_mutex); |
---|
3923 | | - if (fs_info->balance_ctl) { |
---|
3924 | | - btrfs_info(fs_info, "balance: resuming"); |
---|
| 4289 | + if (fs_info->balance_ctl) |
---|
3925 | 4290 | ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); |
---|
3926 | | - } |
---|
3927 | 4291 | mutex_unlock(&fs_info->balance_mutex); |
---|
| 4292 | + sb_end_write(fs_info->sb); |
---|
3928 | 4293 | |
---|
3929 | 4294 | return ret; |
---|
3930 | 4295 | } |
---|
.. | .. |
---|
4013 | 4378 | * is in a paused state and must have fs_info::balance_ctl properly |
---|
4014 | 4379 | * set up. |
---|
4015 | 4380 | */ |
---|
4016 | | - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) |
---|
| 4381 | + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) |
---|
4017 | 4382 | btrfs_warn(fs_info, |
---|
4018 | 4383 | "balance: cannot set exclusive op status, resume manually"); |
---|
4019 | 4384 | |
---|
.. | .. |
---|
4097 | 4462 | |
---|
4098 | 4463 | if (fs_info->balance_ctl) { |
---|
4099 | 4464 | reset_balance_state(fs_info); |
---|
4100 | | - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); |
---|
| 4465 | + btrfs_exclop_finish(fs_info); |
---|
4101 | 4466 | btrfs_info(fs_info, "balance: canceled"); |
---|
4102 | 4467 | } |
---|
4103 | 4468 | } |
---|
4104 | 4469 | |
---|
4105 | | - BUG_ON(fs_info->balance_ctl || |
---|
4106 | | - test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); |
---|
| 4470 | + ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); |
---|
4107 | 4471 | atomic_dec(&fs_info->balance_cancel_req); |
---|
4108 | 4472 | mutex_unlock(&fs_info->balance_mutex); |
---|
4109 | 4473 | return 0; |
---|
4110 | 4474 | } |
---|
4111 | 4475 | |
---|
4112 | | -static int btrfs_uuid_scan_kthread(void *data) |
---|
| 4476 | +int btrfs_uuid_scan_kthread(void *data) |
---|
4113 | 4477 | { |
---|
4114 | 4478 | struct btrfs_fs_info *fs_info = data; |
---|
4115 | 4479 | struct btrfs_root *root = fs_info->tree_root; |
---|
.. | .. |
---|
4121 | 4485 | struct btrfs_root_item root_item; |
---|
4122 | 4486 | u32 item_size; |
---|
4123 | 4487 | struct btrfs_trans_handle *trans = NULL; |
---|
| 4488 | + bool closing = false; |
---|
4124 | 4489 | |
---|
4125 | 4490 | path = btrfs_alloc_path(); |
---|
4126 | 4491 | if (!path) { |
---|
.. | .. |
---|
4133 | 4498 | key.offset = 0; |
---|
4134 | 4499 | |
---|
4135 | 4500 | while (1) { |
---|
| 4501 | + if (btrfs_fs_closing(fs_info)) { |
---|
| 4502 | + closing = true; |
---|
| 4503 | + break; |
---|
| 4504 | + } |
---|
4136 | 4505 | ret = btrfs_search_forward(root, &key, path, |
---|
4137 | 4506 | BTRFS_OLDEST_GENERATION); |
---|
4138 | 4507 | if (ret) { |
---|
.. | .. |
---|
4233 | 4602 | btrfs_end_transaction(trans); |
---|
4234 | 4603 | if (ret) |
---|
4235 | 4604 | btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); |
---|
4236 | | - else |
---|
| 4605 | + else if (!closing) |
---|
4237 | 4606 | set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); |
---|
4238 | 4607 | up(&fs_info->uuid_tree_rescan_sem); |
---|
4239 | 4608 | return 0; |
---|
4240 | | -} |
---|
4241 | | - |
---|
4242 | | -/* |
---|
4243 | | - * Callback for btrfs_uuid_tree_iterate(). |
---|
4244 | | - * returns: |
---|
4245 | | - * 0 check succeeded, the entry is not outdated. |
---|
4246 | | - * < 0 if an error occurred. |
---|
4247 | | - * > 0 if the check failed, which means the caller shall remove the entry. |
---|
4248 | | - */ |
---|
4249 | | -static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, |
---|
4250 | | - u8 *uuid, u8 type, u64 subid) |
---|
4251 | | -{ |
---|
4252 | | - struct btrfs_key key; |
---|
4253 | | - int ret = 0; |
---|
4254 | | - struct btrfs_root *subvol_root; |
---|
4255 | | - |
---|
4256 | | - if (type != BTRFS_UUID_KEY_SUBVOL && |
---|
4257 | | - type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) |
---|
4258 | | - goto out; |
---|
4259 | | - |
---|
4260 | | - key.objectid = subid; |
---|
4261 | | - key.type = BTRFS_ROOT_ITEM_KEY; |
---|
4262 | | - key.offset = (u64)-1; |
---|
4263 | | - subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); |
---|
4264 | | - if (IS_ERR(subvol_root)) { |
---|
4265 | | - ret = PTR_ERR(subvol_root); |
---|
4266 | | - if (ret == -ENOENT) |
---|
4267 | | - ret = 1; |
---|
4268 | | - goto out; |
---|
4269 | | - } |
---|
4270 | | - |
---|
4271 | | - switch (type) { |
---|
4272 | | - case BTRFS_UUID_KEY_SUBVOL: |
---|
4273 | | - if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) |
---|
4274 | | - ret = 1; |
---|
4275 | | - break; |
---|
4276 | | - case BTRFS_UUID_KEY_RECEIVED_SUBVOL: |
---|
4277 | | - if (memcmp(uuid, subvol_root->root_item.received_uuid, |
---|
4278 | | - BTRFS_UUID_SIZE)) |
---|
4279 | | - ret = 1; |
---|
4280 | | - break; |
---|
4281 | | - } |
---|
4282 | | - |
---|
4283 | | -out: |
---|
4284 | | - return ret; |
---|
4285 | | -} |
---|
4286 | | - |
---|
4287 | | -static int btrfs_uuid_rescan_kthread(void *data) |
---|
4288 | | -{ |
---|
4289 | | - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; |
---|
4290 | | - int ret; |
---|
4291 | | - |
---|
4292 | | - /* |
---|
4293 | | - * 1st step is to iterate through the existing UUID tree and |
---|
4294 | | - * to delete all entries that contain outdated data. |
---|
4295 | | - * 2nd step is to add all missing entries to the UUID tree. |
---|
4296 | | - */ |
---|
4297 | | - ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); |
---|
4298 | | - if (ret < 0) { |
---|
4299 | | - btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret); |
---|
4300 | | - up(&fs_info->uuid_tree_rescan_sem); |
---|
4301 | | - return ret; |
---|
4302 | | - } |
---|
4303 | | - return btrfs_uuid_scan_kthread(data); |
---|
4304 | 4609 | } |
---|
4305 | 4610 | |
---|
4306 | 4611 | int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) |
---|
.. | .. |
---|
4319 | 4624 | if (IS_ERR(trans)) |
---|
4320 | 4625 | return PTR_ERR(trans); |
---|
4321 | 4626 | |
---|
4322 | | - uuid_root = btrfs_create_tree(trans, fs_info, |
---|
4323 | | - BTRFS_UUID_TREE_OBJECTID); |
---|
| 4627 | + uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); |
---|
4324 | 4628 | if (IS_ERR(uuid_root)) { |
---|
4325 | 4629 | ret = PTR_ERR(uuid_root); |
---|
4326 | 4630 | btrfs_abort_transaction(trans, ret); |
---|
.. | .. |
---|
4346 | 4650 | return 0; |
---|
4347 | 4651 | } |
---|
4348 | 4652 | |
---|
4349 | | -int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) |
---|
4350 | | -{ |
---|
4351 | | - struct task_struct *task; |
---|
4352 | | - |
---|
4353 | | - down(&fs_info->uuid_tree_rescan_sem); |
---|
4354 | | - task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); |
---|
4355 | | - if (IS_ERR(task)) { |
---|
4356 | | - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ |
---|
4357 | | - btrfs_warn(fs_info, "failed to start uuid_rescan task"); |
---|
4358 | | - up(&fs_info->uuid_tree_rescan_sem); |
---|
4359 | | - return PTR_ERR(task); |
---|
4360 | | - } |
---|
4361 | | - |
---|
4362 | | - return 0; |
---|
4363 | | -} |
---|
4364 | | - |
---|
4365 | 4653 | /* |
---|
4366 | 4654 | * shrinking a device means finding all of the device extents past |
---|
4367 | 4655 | * the new size, and then following the back refs to the chunks. |
---|
.. | .. |
---|
4380 | 4668 | int slot; |
---|
4381 | 4669 | int failed = 0; |
---|
4382 | 4670 | bool retried = false; |
---|
4383 | | - bool checked_pending_chunks = false; |
---|
4384 | 4671 | struct extent_buffer *l; |
---|
4385 | 4672 | struct btrfs_key key; |
---|
4386 | 4673 | struct btrfs_super_block *super_copy = fs_info->super_copy; |
---|
4387 | 4674 | u64 old_total = btrfs_super_total_bytes(super_copy); |
---|
4388 | 4675 | u64 old_size = btrfs_device_get_total_bytes(device); |
---|
4389 | 4676 | u64 diff; |
---|
| 4677 | + u64 start; |
---|
4390 | 4678 | |
---|
4391 | 4679 | new_size = round_down(new_size, fs_info->sectorsize); |
---|
| 4680 | + start = new_size; |
---|
4392 | 4681 | diff = round_down(old_size - new_size, fs_info->sectorsize); |
---|
4393 | 4682 | |
---|
4394 | 4683 | if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) |
---|
.. | .. |
---|
4400 | 4689 | |
---|
4401 | 4690 | path->reada = READA_BACK; |
---|
4402 | 4691 | |
---|
| 4692 | + trans = btrfs_start_transaction(root, 0); |
---|
| 4693 | + if (IS_ERR(trans)) { |
---|
| 4694 | + btrfs_free_path(path); |
---|
| 4695 | + return PTR_ERR(trans); |
---|
| 4696 | + } |
---|
| 4697 | + |
---|
4403 | 4698 | mutex_lock(&fs_info->chunk_mutex); |
---|
4404 | 4699 | |
---|
4405 | 4700 | btrfs_device_set_total_bytes(device, new_size); |
---|
.. | .. |
---|
4407 | 4702 | device->fs_devices->total_rw_bytes -= diff; |
---|
4408 | 4703 | atomic64_sub(diff, &fs_info->free_chunk_space); |
---|
4409 | 4704 | } |
---|
4410 | | - mutex_unlock(&fs_info->chunk_mutex); |
---|
| 4705 | + |
---|
| 4706 | + /* |
---|
| 4707 | + * Once the device's size has been set to the new size, ensure all |
---|
| 4708 | + * in-memory chunks are synced to disk so that the loop below sees them |
---|
| 4709 | + * and relocates them accordingly. |
---|
| 4710 | + */ |
---|
| 4711 | + if (contains_pending_extent(device, &start, diff)) { |
---|
| 4712 | + mutex_unlock(&fs_info->chunk_mutex); |
---|
| 4713 | + ret = btrfs_commit_transaction(trans); |
---|
| 4714 | + if (ret) |
---|
| 4715 | + goto done; |
---|
| 4716 | + } else { |
---|
| 4717 | + mutex_unlock(&fs_info->chunk_mutex); |
---|
| 4718 | + btrfs_end_transaction(trans); |
---|
| 4719 | + } |
---|
4411 | 4720 | |
---|
4412 | 4721 | again: |
---|
4413 | 4722 | key.objectid = device->devid; |
---|
.. | .. |
---|
4469 | 4778 | |
---|
4470 | 4779 | ret = btrfs_relocate_chunk(fs_info, chunk_offset); |
---|
4471 | 4780 | mutex_unlock(&fs_info->delete_unused_bgs_mutex); |
---|
4472 | | - if (ret && ret != -ENOSPC) |
---|
4473 | | - goto done; |
---|
4474 | | - if (ret == -ENOSPC) |
---|
| 4781 | + if (ret == -ENOSPC) { |
---|
4475 | 4782 | failed++; |
---|
| 4783 | + } else if (ret) { |
---|
| 4784 | + if (ret == -ETXTBSY) { |
---|
| 4785 | + btrfs_warn(fs_info, |
---|
| 4786 | + "could not shrink block group %llu due to active swapfile", |
---|
| 4787 | + chunk_offset); |
---|
| 4788 | + } |
---|
| 4789 | + goto done; |
---|
| 4790 | + } |
---|
4476 | 4791 | } while (key.offset-- > 0); |
---|
4477 | 4792 | |
---|
4478 | 4793 | if (failed && !retried) { |
---|
.. | .. |
---|
4492 | 4807 | } |
---|
4493 | 4808 | |
---|
4494 | 4809 | mutex_lock(&fs_info->chunk_mutex); |
---|
4495 | | - |
---|
4496 | | - /* |
---|
4497 | | - * We checked in the above loop all device extents that were already in |
---|
4498 | | - * the device tree. However before we have updated the device's |
---|
4499 | | - * total_bytes to the new size, we might have had chunk allocations that |
---|
4500 | | - * have not complete yet (new block groups attached to transaction |
---|
4501 | | - * handles), and therefore their device extents were not yet in the |
---|
4502 | | - * device tree and we missed them in the loop above. So if we have any |
---|
4503 | | - * pending chunk using a device extent that overlaps the device range |
---|
4504 | | - * that we can not use anymore, commit the current transaction and |
---|
4505 | | - * repeat the search on the device tree - this way we guarantee we will |
---|
4506 | | - * not have chunks using device extents that end beyond 'new_size'. |
---|
4507 | | - */ |
---|
4508 | | - if (!checked_pending_chunks) { |
---|
4509 | | - u64 start = new_size; |
---|
4510 | | - u64 len = old_size - new_size; |
---|
4511 | | - |
---|
4512 | | - if (contains_pending_extent(trans->transaction, device, |
---|
4513 | | - &start, len)) { |
---|
4514 | | - mutex_unlock(&fs_info->chunk_mutex); |
---|
4515 | | - checked_pending_chunks = true; |
---|
4516 | | - failed = 0; |
---|
4517 | | - retried = false; |
---|
4518 | | - ret = btrfs_commit_transaction(trans); |
---|
4519 | | - if (ret) |
---|
4520 | | - goto done; |
---|
4521 | | - goto again; |
---|
4522 | | - } |
---|
4523 | | - } |
---|
| 4810 | + /* Clear all state bits beyond the shrunk device size */ |
---|
| 4811 | + clear_extent_bits(&device->alloc_state, new_size, (u64)-1, |
---|
| 4812 | + CHUNK_STATE_MASK); |
---|
4524 | 4813 | |
---|
4525 | 4814 | btrfs_device_set_disk_total_bytes(device, new_size); |
---|
4526 | | - if (list_empty(&device->resized_list)) |
---|
4527 | | - list_add_tail(&device->resized_list, |
---|
4528 | | - &fs_info->fs_devices->resized_devices); |
---|
| 4815 | + if (list_empty(&device->post_commit_list)) |
---|
| 4816 | + list_add_tail(&device->post_commit_list, |
---|
| 4817 | + &trans->transaction->dev_update_list); |
---|
4529 | 4818 | |
---|
4530 | 4819 | WARN_ON(diff > old_total); |
---|
4531 | 4820 | btrfs_set_super_total_bytes(super_copy, |
---|
.. | .. |
---|
4609 | 4898 | btrfs_set_fs_incompat(info, RAID56); |
---|
4610 | 4899 | } |
---|
4611 | 4900 | |
---|
4612 | | -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, |
---|
4613 | | - u64 start, u64 type) |
---|
| 4901 | +static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) |
---|
4614 | 4902 | { |
---|
4615 | | - struct btrfs_fs_info *info = trans->fs_info; |
---|
4616 | | - struct btrfs_fs_devices *fs_devices = info->fs_devices; |
---|
4617 | | - struct btrfs_device *device; |
---|
4618 | | - struct map_lookup *map = NULL; |
---|
4619 | | - struct extent_map_tree *em_tree; |
---|
4620 | | - struct extent_map *em; |
---|
4621 | | - struct btrfs_device_info *devices_info = NULL; |
---|
4622 | | - u64 total_avail; |
---|
4623 | | - int num_stripes; /* total number of stripes to allocate */ |
---|
4624 | | - int data_stripes; /* number of stripes that count for |
---|
4625 | | - block group size */ |
---|
4626 | | - int sub_stripes; /* sub_stripes info for map */ |
---|
4627 | | - int dev_stripes; /* stripes per dev */ |
---|
4628 | | - int devs_max; /* max devs to use */ |
---|
4629 | | - int devs_min; /* min devs needed */ |
---|
4630 | | - int devs_increment; /* ndevs has to be a multiple of this */ |
---|
4631 | | - int ncopies; /* how many copies to data has */ |
---|
4632 | | - int ret; |
---|
| 4903 | + if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) |
---|
| 4904 | + return; |
---|
| 4905 | + |
---|
| 4906 | + btrfs_set_fs_incompat(info, RAID1C34); |
---|
| 4907 | +} |
---|
| 4908 | + |
---|
| 4909 | +/* |
---|
| 4910 | + * Structure used internally for __btrfs_alloc_chunk() function. |
---|
| 4911 | + * Wraps needed parameters. |
---|
| 4912 | + */ |
---|
| 4913 | +struct alloc_chunk_ctl { |
---|
| 4914 | + u64 start; |
---|
| 4915 | + u64 type; |
---|
| 4916 | + /* Total number of stripes to allocate */ |
---|
| 4917 | + int num_stripes; |
---|
| 4918 | + /* sub_stripes info for map */ |
---|
| 4919 | + int sub_stripes; |
---|
| 4920 | + /* Stripes per device */ |
---|
| 4921 | + int dev_stripes; |
---|
| 4922 | + /* Maximum number of devices to use */ |
---|
| 4923 | + int devs_max; |
---|
| 4924 | + /* Minimum number of devices to use */ |
---|
| 4925 | + int devs_min; |
---|
| 4926 | + /* ndevs has to be a multiple of this */ |
---|
| 4927 | + int devs_increment; |
---|
| 4928 | + /* Number of copies */ |
---|
| 4929 | + int ncopies; |
---|
| 4930 | + /* Number of stripes worth of bytes to store parity information */ |
---|
| 4931 | + int nparity; |
---|
4633 | 4932 | u64 max_stripe_size; |
---|
4634 | 4933 | u64 max_chunk_size; |
---|
| 4934 | + u64 dev_extent_min; |
---|
4635 | 4935 | u64 stripe_size; |
---|
4636 | | - u64 num_bytes; |
---|
| 4936 | + u64 chunk_size; |
---|
4637 | 4937 | int ndevs; |
---|
4638 | | - int i; |
---|
4639 | | - int j; |
---|
4640 | | - int index; |
---|
| 4938 | +}; |
---|
4641 | 4939 | |
---|
4642 | | - BUG_ON(!alloc_profile_is_valid(type, 0)); |
---|
4643 | | - |
---|
4644 | | - if (list_empty(&fs_devices->alloc_list)) { |
---|
4645 | | - if (btrfs_test_opt(info, ENOSPC_DEBUG)) |
---|
4646 | | - btrfs_debug(info, "%s: no writable device", __func__); |
---|
4647 | | - return -ENOSPC; |
---|
4648 | | - } |
---|
4649 | | - |
---|
4650 | | - index = btrfs_bg_flags_to_raid_index(type); |
---|
4651 | | - |
---|
4652 | | - sub_stripes = btrfs_raid_array[index].sub_stripes; |
---|
4653 | | - dev_stripes = btrfs_raid_array[index].dev_stripes; |
---|
4654 | | - devs_max = btrfs_raid_array[index].devs_max; |
---|
4655 | | - devs_min = btrfs_raid_array[index].devs_min; |
---|
4656 | | - devs_increment = btrfs_raid_array[index].devs_increment; |
---|
4657 | | - ncopies = btrfs_raid_array[index].ncopies; |
---|
| 4940 | +static void init_alloc_chunk_ctl_policy_regular( |
---|
| 4941 | + struct btrfs_fs_devices *fs_devices, |
---|
| 4942 | + struct alloc_chunk_ctl *ctl) |
---|
| 4943 | +{ |
---|
| 4944 | + u64 type = ctl->type; |
---|
4658 | 4945 | |
---|
4659 | 4946 | if (type & BTRFS_BLOCK_GROUP_DATA) { |
---|
4660 | | - max_stripe_size = SZ_1G; |
---|
4661 | | - max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; |
---|
4662 | | - if (!devs_max) |
---|
4663 | | - devs_max = BTRFS_MAX_DEVS(info); |
---|
| 4947 | + ctl->max_stripe_size = SZ_1G; |
---|
| 4948 | + ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; |
---|
4664 | 4949 | } else if (type & BTRFS_BLOCK_GROUP_METADATA) { |
---|
4665 | | - /* for larger filesystems, use larger metadata chunks */ |
---|
| 4950 | + /* For larger filesystems, use larger metadata chunks */ |
---|
4666 | 4951 | if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) |
---|
4667 | | - max_stripe_size = SZ_1G; |
---|
| 4952 | + ctl->max_stripe_size = SZ_1G; |
---|
4668 | 4953 | else |
---|
4669 | | - max_stripe_size = SZ_256M; |
---|
4670 | | - max_chunk_size = max_stripe_size; |
---|
4671 | | - if (!devs_max) |
---|
4672 | | - devs_max = BTRFS_MAX_DEVS(info); |
---|
| 4954 | + ctl->max_stripe_size = SZ_256M; |
---|
| 4955 | + ctl->max_chunk_size = ctl->max_stripe_size; |
---|
4673 | 4956 | } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { |
---|
4674 | | - max_stripe_size = SZ_32M; |
---|
4675 | | - max_chunk_size = 2 * max_stripe_size; |
---|
4676 | | - if (!devs_max) |
---|
4677 | | - devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; |
---|
| 4957 | + ctl->max_stripe_size = SZ_32M; |
---|
| 4958 | + ctl->max_chunk_size = 2 * ctl->max_stripe_size; |
---|
| 4959 | + ctl->devs_max = min_t(int, ctl->devs_max, |
---|
| 4960 | + BTRFS_MAX_DEVS_SYS_CHUNK); |
---|
4678 | 4961 | } else { |
---|
4679 | | - btrfs_err(info, "invalid chunk type 0x%llx requested", |
---|
4680 | | - type); |
---|
4681 | | - BUG_ON(1); |
---|
| 4962 | + BUG(); |
---|
4682 | 4963 | } |
---|
4683 | 4964 | |
---|
4684 | | - /* we don't want a chunk larger than 10% of writeable space */ |
---|
4685 | | - max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), |
---|
4686 | | - max_chunk_size); |
---|
| 4965 | + /* We don't want a chunk larger than 10% of writable space */ |
---|
| 4966 | + ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), |
---|
| 4967 | + ctl->max_chunk_size); |
---|
| 4968 | + ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; |
---|
| 4969 | +} |
---|
4687 | 4970 | |
---|
4688 | | - devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), |
---|
4689 | | - GFP_NOFS); |
---|
4690 | | - if (!devices_info) |
---|
4691 | | - return -ENOMEM; |
---|
| 4971 | +static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, |
---|
| 4972 | + struct alloc_chunk_ctl *ctl) |
---|
| 4973 | +{ |
---|
| 4974 | + int index = btrfs_bg_flags_to_raid_index(ctl->type); |
---|
| 4975 | + |
---|
| 4976 | + ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; |
---|
| 4977 | + ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; |
---|
| 4978 | + ctl->devs_max = btrfs_raid_array[index].devs_max; |
---|
| 4979 | + if (!ctl->devs_max) |
---|
| 4980 | + ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); |
---|
| 4981 | + ctl->devs_min = btrfs_raid_array[index].devs_min; |
---|
| 4982 | + ctl->devs_increment = btrfs_raid_array[index].devs_increment; |
---|
| 4983 | + ctl->ncopies = btrfs_raid_array[index].ncopies; |
---|
| 4984 | + ctl->nparity = btrfs_raid_array[index].nparity; |
---|
| 4985 | + ctl->ndevs = 0; |
---|
| 4986 | + |
---|
| 4987 | + switch (fs_devices->chunk_alloc_policy) { |
---|
| 4988 | + case BTRFS_CHUNK_ALLOC_REGULAR: |
---|
| 4989 | + init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); |
---|
| 4990 | + break; |
---|
| 4991 | + default: |
---|
| 4992 | + BUG(); |
---|
| 4993 | + } |
---|
| 4994 | +} |
---|
| 4995 | + |
---|
| 4996 | +static int gather_device_info(struct btrfs_fs_devices *fs_devices, |
---|
| 4997 | + struct alloc_chunk_ctl *ctl, |
---|
| 4998 | + struct btrfs_device_info *devices_info) |
---|
| 4999 | +{ |
---|
| 5000 | + struct btrfs_fs_info *info = fs_devices->fs_info; |
---|
| 5001 | + struct btrfs_device *device; |
---|
| 5002 | + u64 total_avail; |
---|
| 5003 | + u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; |
---|
| 5004 | + int ret; |
---|
| 5005 | + int ndevs = 0; |
---|
| 5006 | + u64 max_avail; |
---|
| 5007 | + u64 dev_offset; |
---|
4692 | 5008 | |
---|
4693 | 5009 | /* |
---|
4694 | 5010 | * in the first pass through the devices list, we gather information |
---|
4695 | 5011 | * about the available holes on each device. |
---|
4696 | 5012 | */ |
---|
4697 | | - ndevs = 0; |
---|
4698 | 5013 | list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { |
---|
4699 | | - u64 max_avail; |
---|
4700 | | - u64 dev_offset; |
---|
4701 | | - |
---|
4702 | 5014 | if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { |
---|
4703 | 5015 | WARN(1, KERN_ERR |
---|
4704 | 5016 | "BTRFS: read-only device in alloc_list\n"); |
---|
.. | .. |
---|
4716 | 5028 | total_avail = 0; |
---|
4717 | 5029 | |
---|
4718 | 5030 | /* If there is no space on this device, skip it. */ |
---|
4719 | | - if (total_avail == 0) |
---|
| 5031 | + if (total_avail < ctl->dev_extent_min) |
---|
4720 | 5032 | continue; |
---|
4721 | 5033 | |
---|
4722 | | - ret = find_free_dev_extent(trans, device, |
---|
4723 | | - max_stripe_size * dev_stripes, |
---|
4724 | | - &dev_offset, &max_avail); |
---|
| 5034 | + ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, |
---|
| 5035 | + &max_avail); |
---|
4725 | 5036 | if (ret && ret != -ENOSPC) |
---|
4726 | | - goto error; |
---|
| 5037 | + return ret; |
---|
4727 | 5038 | |
---|
4728 | 5039 | if (ret == 0) |
---|
4729 | | - max_avail = max_stripe_size * dev_stripes; |
---|
| 5040 | + max_avail = dev_extent_want; |
---|
4730 | 5041 | |
---|
4731 | | - if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { |
---|
| 5042 | + if (max_avail < ctl->dev_extent_min) { |
---|
4732 | 5043 | if (btrfs_test_opt(info, ENOSPC_DEBUG)) |
---|
4733 | 5044 | btrfs_debug(info, |
---|
4734 | | - "%s: devid %llu has no free space, have=%llu want=%u", |
---|
| 5045 | + "%s: devid %llu has no free space, have=%llu want=%llu", |
---|
4735 | 5046 | __func__, device->devid, max_avail, |
---|
4736 | | - BTRFS_STRIPE_LEN * dev_stripes); |
---|
| 5047 | + ctl->dev_extent_min); |
---|
4737 | 5048 | continue; |
---|
4738 | 5049 | } |
---|
4739 | 5050 | |
---|
.. | .. |
---|
4748 | 5059 | devices_info[ndevs].dev = device; |
---|
4749 | 5060 | ++ndevs; |
---|
4750 | 5061 | } |
---|
| 5062 | + ctl->ndevs = ndevs; |
---|
4751 | 5063 | |
---|
4752 | 5064 | /* |
---|
4753 | 5065 | * now sort the devices by hole size / available space |
---|
.. | .. |
---|
4755 | 5067 | sort(devices_info, ndevs, sizeof(struct btrfs_device_info), |
---|
4756 | 5068 | btrfs_cmp_device_info, NULL); |
---|
4757 | 5069 | |
---|
4758 | | - /* round down to number of usable stripes */ |
---|
4759 | | - ndevs = round_down(ndevs, devs_increment); |
---|
| 5070 | + return 0; |
---|
| 5071 | +} |
---|
4760 | 5072 | |
---|
4761 | | - if (ndevs < devs_min) { |
---|
4762 | | - ret = -ENOSPC; |
---|
4763 | | - if (btrfs_test_opt(info, ENOSPC_DEBUG)) { |
---|
4764 | | - btrfs_debug(info, |
---|
4765 | | - "%s: not enough devices with free space: have=%d minimum required=%d", |
---|
4766 | | - __func__, ndevs, devs_min); |
---|
4767 | | - } |
---|
4768 | | - goto error; |
---|
4769 | | - } |
---|
4770 | | - |
---|
4771 | | - ndevs = min(ndevs, devs_max); |
---|
| 5073 | +static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, |
---|
| 5074 | + struct btrfs_device_info *devices_info) |
---|
| 5075 | +{ |
---|
| 5076 | + /* Number of stripes that count for block group size */ |
---|
| 5077 | + int data_stripes; |
---|
4772 | 5078 | |
---|
4773 | 5079 | /* |
---|
4774 | 5080 | * The primary goal is to maximize the number of stripes, so use as |
---|
.. | .. |
---|
4777 | 5083 | * The DUP profile stores more than one stripe per device, the |
---|
4778 | 5084 | * max_avail is the total size so we have to adjust. |
---|
4779 | 5085 | */ |
---|
4780 | | - stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); |
---|
4781 | | - num_stripes = ndevs * dev_stripes; |
---|
| 5086 | + ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, |
---|
| 5087 | + ctl->dev_stripes); |
---|
| 5088 | + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; |
---|
| 5089 | + |
---|
| 5090 | + /* This will have to be fixed for RAID1 and RAID10 over more drives */ |
---|
| 5091 | + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; |
---|
4782 | 5092 | |
---|
4783 | 5093 | /* |
---|
4784 | | - * this will have to be fixed for RAID1 and RAID10 over |
---|
4785 | | - * more drives |
---|
| 5094 | + * Use the number of data stripes to figure out how big this chunk is |
---|
| 5095 | + * really going to be in terms of logical address space, and compare |
---|
| 5096 | + * that answer with the max chunk size. If it's higher, we try to |
---|
| 5097 | + * reduce stripe_size. |
---|
4786 | 5098 | */ |
---|
4787 | | - data_stripes = num_stripes / ncopies; |
---|
4788 | | - |
---|
4789 | | - if (type & BTRFS_BLOCK_GROUP_RAID5) |
---|
4790 | | - data_stripes = num_stripes - 1; |
---|
4791 | | - |
---|
4792 | | - if (type & BTRFS_BLOCK_GROUP_RAID6) |
---|
4793 | | - data_stripes = num_stripes - 2; |
---|
4794 | | - |
---|
4795 | | - /* |
---|
4796 | | - * Use the number of data stripes to figure out how big this chunk |
---|
4797 | | - * is really going to be in terms of logical address space, |
---|
4798 | | - * and compare that answer with the max chunk size. If it's higher, |
---|
4799 | | - * we try to reduce stripe_size. |
---|
4800 | | - */ |
---|
4801 | | - if (stripe_size * data_stripes > max_chunk_size) { |
---|
| 5099 | + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { |
---|
4802 | 5100 | /* |
---|
4803 | 5101 | * Reduce stripe_size, round it up to a 16MB boundary again and |
---|
4804 | 5102 | * then use it, unless it ends up being even bigger than the |
---|
4805 | 5103 | * previous value we had already. |
---|
4806 | 5104 | */ |
---|
4807 | | - stripe_size = min(round_up(div_u64(max_chunk_size, |
---|
4808 | | - data_stripes), SZ_16M), |
---|
4809 | | - stripe_size); |
---|
| 5105 | + ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, |
---|
| 5106 | + data_stripes), SZ_16M), |
---|
| 5107 | + ctl->stripe_size); |
---|
4810 | 5108 | } |
---|
4811 | 5109 | |
---|
4812 | | - /* align to BTRFS_STRIPE_LEN */ |
---|
4813 | | - stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); |
---|
| 5110 | + /* Align to BTRFS_STRIPE_LEN */ |
---|
| 5111 | + ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); |
---|
| 5112 | + ctl->chunk_size = ctl->stripe_size * data_stripes; |
---|
4814 | 5113 | |
---|
4815 | | - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); |
---|
4816 | | - if (!map) { |
---|
4817 | | - ret = -ENOMEM; |
---|
4818 | | - goto error; |
---|
| 5114 | + return 0; |
---|
| 5115 | +} |
---|
| 5116 | + |
---|
| 5117 | +static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, |
---|
| 5118 | + struct alloc_chunk_ctl *ctl, |
---|
| 5119 | + struct btrfs_device_info *devices_info) |
---|
| 5120 | +{ |
---|
| 5121 | + struct btrfs_fs_info *info = fs_devices->fs_info; |
---|
| 5122 | + |
---|
| 5123 | + /* |
---|
| 5124 | + * Round down to number of usable stripes, devs_increment can be any |
---|
| 5125 | + * number so we can't use round_down() that requires power of 2, while |
---|
| 5126 | + * rounddown is safe. |
---|
| 5127 | + */ |
---|
| 5128 | + ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); |
---|
| 5129 | + |
---|
| 5130 | + if (ctl->ndevs < ctl->devs_min) { |
---|
| 5131 | + if (btrfs_test_opt(info, ENOSPC_DEBUG)) { |
---|
| 5132 | + btrfs_debug(info, |
---|
| 5133 | + "%s: not enough devices with free space: have=%d minimum required=%d", |
---|
| 5134 | + __func__, ctl->ndevs, ctl->devs_min); |
---|
| 5135 | + } |
---|
| 5136 | + return -ENOSPC; |
---|
4819 | 5137 | } |
---|
4820 | | - map->num_stripes = num_stripes; |
---|
4821 | 5138 | |
---|
4822 | | - for (i = 0; i < ndevs; ++i) { |
---|
4823 | | - for (j = 0; j < dev_stripes; ++j) { |
---|
4824 | | - int s = i * dev_stripes + j; |
---|
| 5139 | + ctl->ndevs = min(ctl->ndevs, ctl->devs_max); |
---|
| 5140 | + |
---|
| 5141 | + switch (fs_devices->chunk_alloc_policy) { |
---|
| 5142 | + case BTRFS_CHUNK_ALLOC_REGULAR: |
---|
| 5143 | + return decide_stripe_size_regular(ctl, devices_info); |
---|
| 5144 | + default: |
---|
| 5145 | + BUG(); |
---|
| 5146 | + } |
---|
| 5147 | +} |
---|
| 5148 | + |
---|
| 5149 | +static int create_chunk(struct btrfs_trans_handle *trans, |
---|
| 5150 | + struct alloc_chunk_ctl *ctl, |
---|
| 5151 | + struct btrfs_device_info *devices_info) |
---|
| 5152 | +{ |
---|
| 5153 | + struct btrfs_fs_info *info = trans->fs_info; |
---|
| 5154 | + struct map_lookup *map = NULL; |
---|
| 5155 | + struct extent_map_tree *em_tree; |
---|
| 5156 | + struct extent_map *em; |
---|
| 5157 | + u64 start = ctl->start; |
---|
| 5158 | + u64 type = ctl->type; |
---|
| 5159 | + int ret; |
---|
| 5160 | + int i; |
---|
| 5161 | + int j; |
---|
| 5162 | + |
---|
| 5163 | + map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); |
---|
| 5164 | + if (!map) |
---|
| 5165 | + return -ENOMEM; |
---|
| 5166 | + map->num_stripes = ctl->num_stripes; |
---|
| 5167 | + |
---|
| 5168 | + for (i = 0; i < ctl->ndevs; ++i) { |
---|
| 5169 | + for (j = 0; j < ctl->dev_stripes; ++j) { |
---|
| 5170 | + int s = i * ctl->dev_stripes + j; |
---|
4825 | 5171 | map->stripes[s].dev = devices_info[i].dev; |
---|
4826 | 5172 | map->stripes[s].physical = devices_info[i].dev_offset + |
---|
4827 | | - j * stripe_size; |
---|
| 5173 | + j * ctl->stripe_size; |
---|
4828 | 5174 | } |
---|
4829 | 5175 | } |
---|
4830 | 5176 | map->stripe_len = BTRFS_STRIPE_LEN; |
---|
4831 | 5177 | map->io_align = BTRFS_STRIPE_LEN; |
---|
4832 | 5178 | map->io_width = BTRFS_STRIPE_LEN; |
---|
4833 | 5179 | map->type = type; |
---|
4834 | | - map->sub_stripes = sub_stripes; |
---|
| 5180 | + map->sub_stripes = ctl->sub_stripes; |
---|
4835 | 5181 | |
---|
4836 | | - num_bytes = stripe_size * data_stripes; |
---|
4837 | | - |
---|
4838 | | - trace_btrfs_chunk_alloc(info, map, start, num_bytes); |
---|
| 5182 | + trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); |
---|
4839 | 5183 | |
---|
4840 | 5184 | em = alloc_extent_map(); |
---|
4841 | 5185 | if (!em) { |
---|
4842 | 5186 | kfree(map); |
---|
4843 | | - ret = -ENOMEM; |
---|
4844 | | - goto error; |
---|
| 5187 | + return -ENOMEM; |
---|
4845 | 5188 | } |
---|
4846 | 5189 | set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); |
---|
4847 | 5190 | em->map_lookup = map; |
---|
4848 | 5191 | em->start = start; |
---|
4849 | | - em->len = num_bytes; |
---|
| 5192 | + em->len = ctl->chunk_size; |
---|
4850 | 5193 | em->block_start = 0; |
---|
4851 | 5194 | em->block_len = em->len; |
---|
4852 | | - em->orig_block_len = stripe_size; |
---|
| 5195 | + em->orig_block_len = ctl->stripe_size; |
---|
4853 | 5196 | |
---|
4854 | | - em_tree = &info->mapping_tree.map_tree; |
---|
| 5197 | + em_tree = &info->mapping_tree; |
---|
4855 | 5198 | write_lock(&em_tree->lock); |
---|
4856 | 5199 | ret = add_extent_mapping(em_tree, em, 0); |
---|
4857 | 5200 | if (ret) { |
---|
4858 | 5201 | write_unlock(&em_tree->lock); |
---|
4859 | 5202 | free_extent_map(em); |
---|
4860 | | - goto error; |
---|
| 5203 | + return ret; |
---|
4861 | 5204 | } |
---|
4862 | | - |
---|
4863 | | - list_add_tail(&em->list, &trans->transaction->pending_chunks); |
---|
4864 | | - refcount_inc(&em->refs); |
---|
4865 | 5205 | write_unlock(&em_tree->lock); |
---|
4866 | 5206 | |
---|
4867 | | - ret = btrfs_make_block_group(trans, 0, type, start, num_bytes); |
---|
| 5207 | + ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); |
---|
4868 | 5208 | if (ret) |
---|
4869 | 5209 | goto error_del_extent; |
---|
4870 | 5210 | |
---|
4871 | 5211 | for (i = 0; i < map->num_stripes; i++) { |
---|
4872 | | - num_bytes = map->stripes[i].dev->bytes_used + stripe_size; |
---|
4873 | | - btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); |
---|
4874 | | - map->stripes[i].dev->has_pending_chunks = true; |
---|
| 5212 | + struct btrfs_device *dev = map->stripes[i].dev; |
---|
| 5213 | + |
---|
| 5214 | + btrfs_device_set_bytes_used(dev, |
---|
| 5215 | + dev->bytes_used + ctl->stripe_size); |
---|
| 5216 | + if (list_empty(&dev->post_commit_list)) |
---|
| 5217 | + list_add_tail(&dev->post_commit_list, |
---|
| 5218 | + &trans->transaction->dev_update_list); |
---|
4875 | 5219 | } |
---|
4876 | 5220 | |
---|
4877 | | - atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); |
---|
| 5221 | + atomic64_sub(ctl->stripe_size * map->num_stripes, |
---|
| 5222 | + &info->free_chunk_space); |
---|
4878 | 5223 | |
---|
4879 | 5224 | free_extent_map(em); |
---|
4880 | 5225 | check_raid56_incompat_flag(info, type); |
---|
| 5226 | + check_raid1c34_incompat_flag(info, type); |
---|
4881 | 5227 | |
---|
4882 | | - kfree(devices_info); |
---|
4883 | 5228 | return 0; |
---|
4884 | 5229 | |
---|
4885 | 5230 | error_del_extent: |
---|
.. | .. |
---|
4891 | 5236 | free_extent_map(em); |
---|
4892 | 5237 | /* One for the tree reference */ |
---|
4893 | 5238 | free_extent_map(em); |
---|
4894 | | - /* One for the pending_chunks list reference */ |
---|
4895 | | - free_extent_map(em); |
---|
4896 | | -error: |
---|
| 5239 | + |
---|
| 5240 | + return ret; |
---|
| 5241 | +} |
---|
| 5242 | + |
---|
| 5243 | +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) |
---|
| 5244 | +{ |
---|
| 5245 | + struct btrfs_fs_info *info = trans->fs_info; |
---|
| 5246 | + struct btrfs_fs_devices *fs_devices = info->fs_devices; |
---|
| 5247 | + struct btrfs_device_info *devices_info = NULL; |
---|
| 5248 | + struct alloc_chunk_ctl ctl; |
---|
| 5249 | + int ret; |
---|
| 5250 | + |
---|
| 5251 | + lockdep_assert_held(&info->chunk_mutex); |
---|
| 5252 | + |
---|
| 5253 | + if (!alloc_profile_is_valid(type, 0)) { |
---|
| 5254 | + ASSERT(0); |
---|
| 5255 | + return -EINVAL; |
---|
| 5256 | + } |
---|
| 5257 | + |
---|
| 5258 | + if (list_empty(&fs_devices->alloc_list)) { |
---|
| 5259 | + if (btrfs_test_opt(info, ENOSPC_DEBUG)) |
---|
| 5260 | + btrfs_debug(info, "%s: no writable device", __func__); |
---|
| 5261 | + return -ENOSPC; |
---|
| 5262 | + } |
---|
| 5263 | + |
---|
| 5264 | + if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { |
---|
| 5265 | + btrfs_err(info, "invalid chunk type 0x%llx requested", type); |
---|
| 5266 | + ASSERT(0); |
---|
| 5267 | + return -EINVAL; |
---|
| 5268 | + } |
---|
| 5269 | + |
---|
| 5270 | + ctl.start = find_next_chunk(info); |
---|
| 5271 | + ctl.type = type; |
---|
| 5272 | + init_alloc_chunk_ctl(fs_devices, &ctl); |
---|
| 5273 | + |
---|
| 5274 | + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), |
---|
| 5275 | + GFP_NOFS); |
---|
| 5276 | + if (!devices_info) |
---|
| 5277 | + return -ENOMEM; |
---|
| 5278 | + |
---|
| 5279 | + ret = gather_device_info(fs_devices, &ctl, devices_info); |
---|
| 5280 | + if (ret < 0) |
---|
| 5281 | + goto out; |
---|
| 5282 | + |
---|
| 5283 | + ret = decide_stripe_size(fs_devices, &ctl, devices_info); |
---|
| 5284 | + if (ret < 0) |
---|
| 5285 | + goto out; |
---|
| 5286 | + |
---|
| 5287 | + ret = create_chunk(trans, &ctl, devices_info); |
---|
| 5288 | + |
---|
| 5289 | +out: |
---|
4897 | 5290 | kfree(devices_info); |
---|
4898 | 5291 | return ret; |
---|
4899 | 5292 | } |
---|
4900 | 5293 | |
---|
| 5294 | +/* |
---|
| 5295 | + * Chunk allocation falls into two parts. The first part does work |
---|
| 5296 | + * that makes the new allocated chunk usable, but does not do any operation |
---|
| 5297 | + * that modifies the chunk tree. The second part does the work that |
---|
| 5298 | + * requires modifying the chunk tree. This division is important for the |
---|
| 5299 | + * bootstrap process of adding storage to a seed btrfs. |
---|
| 5300 | + */ |
---|
4901 | 5301 | int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, |
---|
4902 | 5302 | u64 chunk_offset, u64 chunk_size) |
---|
4903 | 5303 | { |
---|
.. | .. |
---|
4916 | 5316 | int i = 0; |
---|
4917 | 5317 | int ret = 0; |
---|
4918 | 5318 | |
---|
4919 | | - em = get_chunk_map(fs_info, chunk_offset, chunk_size); |
---|
| 5319 | + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); |
---|
4920 | 5320 | if (IS_ERR(em)) |
---|
4921 | 5321 | return PTR_ERR(em); |
---|
4922 | 5322 | |
---|
.. | .. |
---|
4996 | 5396 | return ret; |
---|
4997 | 5397 | } |
---|
4998 | 5398 | |
---|
4999 | | -/* |
---|
5000 | | - * Chunk allocation falls into two parts. The first part does works |
---|
5001 | | - * that make the new allocated chunk useable, but not do any operation |
---|
5002 | | - * that modifies the chunk tree. The second part does the works that |
---|
5003 | | - * require modifying the chunk tree. This division is important for the |
---|
5004 | | - * bootstrap process of adding storage to a seed btrfs. |
---|
5005 | | - */ |
---|
5006 | | -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) |
---|
| 5399 | +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) |
---|
5007 | 5400 | { |
---|
5008 | | - u64 chunk_offset; |
---|
5009 | | - |
---|
5010 | | - lockdep_assert_held(&trans->fs_info->chunk_mutex); |
---|
5011 | | - chunk_offset = find_next_chunk(trans->fs_info); |
---|
5012 | | - return __btrfs_alloc_chunk(trans, chunk_offset, type); |
---|
5013 | | -} |
---|
5014 | | - |
---|
5015 | | -static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, |
---|
5016 | | - struct btrfs_fs_info *fs_info) |
---|
5017 | | -{ |
---|
5018 | | - u64 chunk_offset; |
---|
5019 | | - u64 sys_chunk_offset; |
---|
| 5401 | + struct btrfs_fs_info *fs_info = trans->fs_info; |
---|
5020 | 5402 | u64 alloc_profile; |
---|
5021 | 5403 | int ret; |
---|
5022 | 5404 | |
---|
5023 | | - chunk_offset = find_next_chunk(fs_info); |
---|
5024 | 5405 | alloc_profile = btrfs_metadata_alloc_profile(fs_info); |
---|
5025 | | - ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); |
---|
| 5406 | + ret = btrfs_alloc_chunk(trans, alloc_profile); |
---|
5026 | 5407 | if (ret) |
---|
5027 | 5408 | return ret; |
---|
5028 | 5409 | |
---|
5029 | | - sys_chunk_offset = find_next_chunk(fs_info); |
---|
5030 | 5410 | alloc_profile = btrfs_system_alloc_profile(fs_info); |
---|
5031 | | - ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); |
---|
| 5411 | + ret = btrfs_alloc_chunk(trans, alloc_profile); |
---|
5032 | 5412 | return ret; |
---|
5033 | 5413 | } |
---|
5034 | 5414 | |
---|
5035 | 5415 | static inline int btrfs_chunk_max_errors(struct map_lookup *map) |
---|
5036 | 5416 | { |
---|
5037 | | - int max_errors; |
---|
| 5417 | + const int index = btrfs_bg_flags_to_raid_index(map->type); |
---|
5038 | 5418 | |
---|
5039 | | - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
---|
5040 | | - BTRFS_BLOCK_GROUP_RAID10 | |
---|
5041 | | - BTRFS_BLOCK_GROUP_RAID5)) { |
---|
5042 | | - max_errors = 1; |
---|
5043 | | - } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { |
---|
5044 | | - max_errors = 2; |
---|
5045 | | - } else { |
---|
5046 | | - max_errors = 0; |
---|
5047 | | - } |
---|
5048 | | - |
---|
5049 | | - return max_errors; |
---|
| 5419 | + return btrfs_raid_array[index].tolerated_failures; |
---|
5050 | 5420 | } |
---|
5051 | 5421 | |
---|
5052 | 5422 | int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) |
---|
.. | .. |
---|
5057 | 5427 | int miss_ndevs = 0; |
---|
5058 | 5428 | int i; |
---|
5059 | 5429 | |
---|
5060 | | - em = get_chunk_map(fs_info, chunk_offset, 1); |
---|
| 5430 | + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); |
---|
5061 | 5431 | if (IS_ERR(em)) |
---|
5062 | 5432 | return 1; |
---|
5063 | 5433 | |
---|
.. | .. |
---|
5087 | 5457 | return readonly; |
---|
5088 | 5458 | } |
---|
5089 | 5459 | |
---|
5090 | | -void btrfs_mapping_init(struct btrfs_mapping_tree *tree) |
---|
5091 | | -{ |
---|
5092 | | - extent_map_tree_init(&tree->map_tree); |
---|
5093 | | -} |
---|
5094 | | - |
---|
5095 | | -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) |
---|
| 5460 | +void btrfs_mapping_tree_free(struct extent_map_tree *tree) |
---|
5096 | 5461 | { |
---|
5097 | 5462 | struct extent_map *em; |
---|
5098 | 5463 | |
---|
5099 | 5464 | while (1) { |
---|
5100 | | - write_lock(&tree->map_tree.lock); |
---|
5101 | | - em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); |
---|
| 5465 | + write_lock(&tree->lock); |
---|
| 5466 | + em = lookup_extent_mapping(tree, 0, (u64)-1); |
---|
5102 | 5467 | if (em) |
---|
5103 | | - remove_extent_mapping(&tree->map_tree, em); |
---|
5104 | | - write_unlock(&tree->map_tree.lock); |
---|
| 5468 | + remove_extent_mapping(tree, em); |
---|
| 5469 | + write_unlock(&tree->lock); |
---|
5105 | 5470 | if (!em) |
---|
5106 | 5471 | break; |
---|
5107 | 5472 | /* once for us */ |
---|
.. | .. |
---|
5117 | 5482 | struct map_lookup *map; |
---|
5118 | 5483 | int ret; |
---|
5119 | 5484 | |
---|
5120 | | - em = get_chunk_map(fs_info, logical, len); |
---|
| 5485 | + em = btrfs_get_chunk_map(fs_info, logical, len); |
---|
5121 | 5486 | if (IS_ERR(em)) |
---|
5122 | 5487 | /* |
---|
5123 | 5488 | * We could return errors for these cases, but that could get |
---|
.. | .. |
---|
5128 | 5493 | return 1; |
---|
5129 | 5494 | |
---|
5130 | 5495 | map = em->map_lookup; |
---|
5131 | | - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) |
---|
| 5496 | + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) |
---|
5132 | 5497 | ret = map->num_stripes; |
---|
5133 | 5498 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
---|
5134 | 5499 | ret = map->sub_stripes; |
---|
.. | .. |
---|
5147 | 5512 | ret = 1; |
---|
5148 | 5513 | free_extent_map(em); |
---|
5149 | 5514 | |
---|
5150 | | - btrfs_dev_replace_read_lock(&fs_info->dev_replace); |
---|
| 5515 | + down_read(&fs_info->dev_replace.rwsem); |
---|
5151 | 5516 | if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && |
---|
5152 | 5517 | fs_info->dev_replace.tgtdev) |
---|
5153 | 5518 | ret++; |
---|
5154 | | - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); |
---|
| 5519 | + up_read(&fs_info->dev_replace.rwsem); |
---|
5155 | 5520 | |
---|
5156 | 5521 | return ret; |
---|
5157 | 5522 | } |
---|
.. | .. |
---|
5163 | 5528 | struct map_lookup *map; |
---|
5164 | 5529 | unsigned long len = fs_info->sectorsize; |
---|
5165 | 5530 | |
---|
5166 | | - em = get_chunk_map(fs_info, logical, len); |
---|
| 5531 | + em = btrfs_get_chunk_map(fs_info, logical, len); |
---|
5167 | 5532 | |
---|
5168 | 5533 | if (!WARN_ON(IS_ERR(em))) { |
---|
5169 | 5534 | map = em->map_lookup; |
---|
.. | .. |
---|
5180 | 5545 | struct map_lookup *map; |
---|
5181 | 5546 | int ret = 0; |
---|
5182 | 5547 | |
---|
5183 | | - em = get_chunk_map(fs_info, logical, len); |
---|
| 5548 | + em = btrfs_get_chunk_map(fs_info, logical, len); |
---|
5184 | 5549 | |
---|
5185 | 5550 | if(!WARN_ON(IS_ERR(em))) { |
---|
5186 | 5551 | map = em->map_lookup; |
---|
.. | .. |
---|
5202 | 5567 | struct btrfs_device *srcdev; |
---|
5203 | 5568 | |
---|
5204 | 5569 | ASSERT((map->type & |
---|
5205 | | - (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); |
---|
| 5570 | + (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); |
---|
5206 | 5571 | |
---|
5207 | 5572 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
---|
5208 | 5573 | num_stripes = map->sub_stripes; |
---|
.. | .. |
---|
5240 | 5605 | return preferred_mirror; |
---|
5241 | 5606 | } |
---|
5242 | 5607 | |
---|
5243 | | -static inline int parity_smaller(u64 a, u64 b) |
---|
5244 | | -{ |
---|
5245 | | - return a > b; |
---|
5246 | | -} |
---|
5247 | | - |
---|
5248 | 5608 | /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ |
---|
5249 | 5609 | static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) |
---|
5250 | 5610 | { |
---|
5251 | | - struct btrfs_bio_stripe s; |
---|
5252 | 5611 | int i; |
---|
5253 | | - u64 l; |
---|
5254 | 5612 | int again = 1; |
---|
5255 | 5613 | |
---|
5256 | 5614 | while (again) { |
---|
5257 | 5615 | again = 0; |
---|
5258 | 5616 | for (i = 0; i < num_stripes - 1; i++) { |
---|
5259 | | - if (parity_smaller(bbio->raid_map[i], |
---|
5260 | | - bbio->raid_map[i+1])) { |
---|
5261 | | - s = bbio->stripes[i]; |
---|
5262 | | - l = bbio->raid_map[i]; |
---|
5263 | | - bbio->stripes[i] = bbio->stripes[i+1]; |
---|
5264 | | - bbio->raid_map[i] = bbio->raid_map[i+1]; |
---|
5265 | | - bbio->stripes[i+1] = s; |
---|
5266 | | - bbio->raid_map[i+1] = l; |
---|
5267 | | - |
---|
| 5617 | + /* Swap if parity is on a smaller index */ |
---|
| 5618 | + if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { |
---|
| 5619 | + swap(bbio->stripes[i], bbio->stripes[i + 1]); |
---|
| 5620 | + swap(bbio->raid_map[i], bbio->raid_map[i + 1]); |
---|
5268 | 5621 | again = 1; |
---|
5269 | 5622 | } |
---|
5270 | 5623 | } |
---|
.. | .. |
---|
5290 | 5643 | atomic_set(&bbio->error, 0); |
---|
5291 | 5644 | refcount_set(&bbio->refs, 1); |
---|
5292 | 5645 | |
---|
| 5646 | + bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); |
---|
| 5647 | + bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); |
---|
| 5648 | + |
---|
5293 | 5649 | return bbio; |
---|
5294 | 5650 | } |
---|
5295 | 5651 | |
---|
.. | .. |
---|
5313 | 5669 | * replace. |
---|
5314 | 5670 | */ |
---|
5315 | 5671 | static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, |
---|
5316 | | - u64 logical, u64 length, |
---|
| 5672 | + u64 logical, u64 *length_ret, |
---|
5317 | 5673 | struct btrfs_bio **bbio_ret) |
---|
5318 | 5674 | { |
---|
5319 | 5675 | struct extent_map *em; |
---|
5320 | 5676 | struct map_lookup *map; |
---|
5321 | 5677 | struct btrfs_bio *bbio; |
---|
| 5678 | + u64 length = *length_ret; |
---|
5322 | 5679 | u64 offset; |
---|
5323 | 5680 | u64 stripe_nr; |
---|
5324 | 5681 | u64 stripe_nr_end; |
---|
.. | .. |
---|
5339 | 5696 | /* discard always return a bbio */ |
---|
5340 | 5697 | ASSERT(bbio_ret); |
---|
5341 | 5698 | |
---|
5342 | | - em = get_chunk_map(fs_info, logical, length); |
---|
| 5699 | + em = btrfs_get_chunk_map(fs_info, logical, length); |
---|
5343 | 5700 | if (IS_ERR(em)) |
---|
5344 | 5701 | return PTR_ERR(em); |
---|
5345 | 5702 | |
---|
.. | .. |
---|
5351 | 5708 | } |
---|
5352 | 5709 | |
---|
5353 | 5710 | offset = logical - em->start; |
---|
5354 | | - length = min_t(u64, em->len - offset, length); |
---|
| 5711 | + length = min_t(u64, em->start + em->len - logical, length); |
---|
| 5712 | + *length_ret = length; |
---|
5355 | 5713 | |
---|
5356 | 5714 | stripe_len = map->stripe_len; |
---|
5357 | 5715 | /* |
---|
.. | .. |
---|
5391 | 5749 | &remaining_stripes); |
---|
5392 | 5750 | div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); |
---|
5393 | 5751 | last_stripe *= sub_stripes; |
---|
5394 | | - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
---|
| 5752 | + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | |
---|
5395 | 5753 | BTRFS_BLOCK_GROUP_DUP)) { |
---|
5396 | 5754 | num_stripes = map->num_stripes; |
---|
5397 | 5755 | } else { |
---|
.. | .. |
---|
5635 | 5993 | return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); |
---|
5636 | 5994 | } |
---|
5637 | 5995 | |
---|
| 5996 | +/* |
---|
| 5997 | + * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) |
---|
| 5998 | + * tuple. This information is used to calculate how big a |
---|
| 5999 | + * particular bio can get before it straddles a stripe. |
---|
| 6000 | + * |
---|
| 6001 | + * @fs_info - the filesystem |
---|
| 6002 | + * @logical - address that we want to figure out the geometry of |
---|
| 6003 | + * @len - the length of IO we are going to perform, starting at @logical |
---|
| 6004 | + * @op - type of operation - write or read |
---|
| 6005 | + * @io_geom - pointer used to return values |
---|
| 6006 | + * |
---|
| 6007 | + * Returns < 0 in case a chunk for the given logical address cannot be found, |
---|
| 6008 | + * usually shouldn't happen unless @logical is corrupted, 0 otherwise. |
---|
| 6009 | + */ |
---|
| 6010 | +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, |
---|
| 6011 | + u64 logical, u64 len, struct btrfs_io_geometry *io_geom) |
---|
| 6012 | +{ |
---|
| 6013 | + struct extent_map *em; |
---|
| 6014 | + struct map_lookup *map; |
---|
| 6015 | + u64 offset; |
---|
| 6016 | + u64 stripe_offset; |
---|
| 6017 | + u64 stripe_nr; |
---|
| 6018 | + u64 stripe_len; |
---|
| 6019 | + u64 raid56_full_stripe_start = (u64)-1; |
---|
| 6020 | + int data_stripes; |
---|
| 6021 | + int ret = 0; |
---|
| 6022 | + |
---|
| 6023 | + ASSERT(op != BTRFS_MAP_DISCARD); |
---|
| 6024 | + |
---|
| 6025 | + em = btrfs_get_chunk_map(fs_info, logical, len); |
---|
| 6026 | + if (IS_ERR(em)) |
---|
| 6027 | + return PTR_ERR(em); |
---|
| 6028 | + |
---|
| 6029 | + map = em->map_lookup; |
---|
| 6030 | + /* Offset of this logical address in the chunk */ |
---|
| 6031 | + offset = logical - em->start; |
---|
| 6032 | + /* Len of a stripe in a chunk */ |
---|
| 6033 | + stripe_len = map->stripe_len; |
---|
| 6034 | + /* Stripe wher this block falls in */ |
---|
| 6035 | + stripe_nr = div64_u64(offset, stripe_len); |
---|
| 6036 | + /* Offset of stripe in the chunk */ |
---|
| 6037 | + stripe_offset = stripe_nr * stripe_len; |
---|
| 6038 | + if (offset < stripe_offset) { |
---|
| 6039 | + btrfs_crit(fs_info, |
---|
| 6040 | +"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", |
---|
| 6041 | + stripe_offset, offset, em->start, logical, stripe_len); |
---|
| 6042 | + ret = -EINVAL; |
---|
| 6043 | + goto out; |
---|
| 6044 | + } |
---|
| 6045 | + |
---|
| 6046 | + /* stripe_offset is the offset of this block in its stripe */ |
---|
| 6047 | + stripe_offset = offset - stripe_offset; |
---|
| 6048 | + data_stripes = nr_data_stripes(map); |
---|
| 6049 | + |
---|
| 6050 | + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
---|
| 6051 | + u64 max_len = stripe_len - stripe_offset; |
---|
| 6052 | + |
---|
| 6053 | + /* |
---|
| 6054 | + * In case of raid56, we need to know the stripe aligned start |
---|
| 6055 | + */ |
---|
| 6056 | + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { |
---|
| 6057 | + unsigned long full_stripe_len = stripe_len * data_stripes; |
---|
| 6058 | + raid56_full_stripe_start = offset; |
---|
| 6059 | + |
---|
| 6060 | + /* |
---|
| 6061 | + * Allow a write of a full stripe, but make sure we |
---|
| 6062 | + * don't allow straddling of stripes |
---|
| 6063 | + */ |
---|
| 6064 | + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, |
---|
| 6065 | + full_stripe_len); |
---|
| 6066 | + raid56_full_stripe_start *= full_stripe_len; |
---|
| 6067 | + |
---|
| 6068 | + /* |
---|
| 6069 | + * For writes to RAID[56], allow a full stripeset across |
---|
| 6070 | + * all disks. For other RAID types and for RAID[56] |
---|
| 6071 | + * reads, just allow a single stripe (on a single disk). |
---|
| 6072 | + */ |
---|
| 6073 | + if (op == BTRFS_MAP_WRITE) { |
---|
| 6074 | + max_len = stripe_len * data_stripes - |
---|
| 6075 | + (offset - raid56_full_stripe_start); |
---|
| 6076 | + } |
---|
| 6077 | + } |
---|
| 6078 | + len = min_t(u64, em->len - offset, max_len); |
---|
| 6079 | + } else { |
---|
| 6080 | + len = em->len - offset; |
---|
| 6081 | + } |
---|
| 6082 | + |
---|
| 6083 | + io_geom->len = len; |
---|
| 6084 | + io_geom->offset = offset; |
---|
| 6085 | + io_geom->stripe_len = stripe_len; |
---|
| 6086 | + io_geom->stripe_nr = stripe_nr; |
---|
| 6087 | + io_geom->stripe_offset = stripe_offset; |
---|
| 6088 | + io_geom->raid56_stripe_offset = raid56_full_stripe_start; |
---|
| 6089 | + |
---|
| 6090 | +out: |
---|
| 6091 | + /* once for us */ |
---|
| 6092 | + free_extent_map(em); |
---|
| 6093 | + return ret; |
---|
| 6094 | +} |
---|
| 6095 | + |
---|
5638 | 6096 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, |
---|
5639 | 6097 | enum btrfs_map_op op, |
---|
5640 | 6098 | u64 logical, u64 *length, |
---|
.. | .. |
---|
5643 | 6101 | { |
---|
5644 | 6102 | struct extent_map *em; |
---|
5645 | 6103 | struct map_lookup *map; |
---|
5646 | | - u64 offset; |
---|
5647 | 6104 | u64 stripe_offset; |
---|
5648 | 6105 | u64 stripe_nr; |
---|
5649 | 6106 | u64 stripe_len; |
---|
5650 | 6107 | u32 stripe_index; |
---|
| 6108 | + int data_stripes; |
---|
5651 | 6109 | int i; |
---|
5652 | 6110 | int ret = 0; |
---|
5653 | 6111 | int num_stripes; |
---|
.. | .. |
---|
5660 | 6118 | int patch_the_first_stripe_for_dev_replace = 0; |
---|
5661 | 6119 | u64 physical_to_patch_in_first_stripe = 0; |
---|
5662 | 6120 | u64 raid56_full_stripe_start = (u64)-1; |
---|
| 6121 | + struct btrfs_io_geometry geom; |
---|
5663 | 6122 | |
---|
5664 | | - if (op == BTRFS_MAP_DISCARD) |
---|
5665 | | - return __btrfs_map_block_for_discard(fs_info, logical, |
---|
5666 | | - *length, bbio_ret); |
---|
| 6123 | + ASSERT(bbio_ret); |
---|
| 6124 | + ASSERT(op != BTRFS_MAP_DISCARD); |
---|
5667 | 6125 | |
---|
5668 | | - em = get_chunk_map(fs_info, logical, *length); |
---|
5669 | | - if (IS_ERR(em)) |
---|
5670 | | - return PTR_ERR(em); |
---|
| 6126 | + ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); |
---|
| 6127 | + if (ret < 0) |
---|
| 6128 | + return ret; |
---|
5671 | 6129 | |
---|
| 6130 | + em = btrfs_get_chunk_map(fs_info, logical, *length); |
---|
| 6131 | + ASSERT(!IS_ERR(em)); |
---|
5672 | 6132 | map = em->map_lookup; |
---|
5673 | | - offset = logical - em->start; |
---|
5674 | 6133 | |
---|
5675 | | - stripe_len = map->stripe_len; |
---|
5676 | | - stripe_nr = offset; |
---|
5677 | | - /* |
---|
5678 | | - * stripe_nr counts the total number of stripes we have to stride |
---|
5679 | | - * to get to this block |
---|
5680 | | - */ |
---|
5681 | | - stripe_nr = div64_u64(stripe_nr, stripe_len); |
---|
| 6134 | + *length = geom.len; |
---|
| 6135 | + stripe_len = geom.stripe_len; |
---|
| 6136 | + stripe_nr = geom.stripe_nr; |
---|
| 6137 | + stripe_offset = geom.stripe_offset; |
---|
| 6138 | + raid56_full_stripe_start = geom.raid56_stripe_offset; |
---|
| 6139 | + data_stripes = nr_data_stripes(map); |
---|
5682 | 6140 | |
---|
5683 | | - stripe_offset = stripe_nr * stripe_len; |
---|
5684 | | - if (offset < stripe_offset) { |
---|
5685 | | - btrfs_crit(fs_info, |
---|
5686 | | - "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", |
---|
5687 | | - stripe_offset, offset, em->start, logical, |
---|
5688 | | - stripe_len); |
---|
5689 | | - free_extent_map(em); |
---|
5690 | | - return -EINVAL; |
---|
5691 | | - } |
---|
5692 | | - |
---|
5693 | | - /* stripe_offset is the offset of this block in its stripe*/ |
---|
5694 | | - stripe_offset = offset - stripe_offset; |
---|
5695 | | - |
---|
5696 | | - /* if we're here for raid56, we need to know the stripe aligned start */ |
---|
5697 | | - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { |
---|
5698 | | - unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); |
---|
5699 | | - raid56_full_stripe_start = offset; |
---|
5700 | | - |
---|
5701 | | - /* allow a write of a full stripe, but make sure we don't |
---|
5702 | | - * allow straddling of stripes |
---|
5703 | | - */ |
---|
5704 | | - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, |
---|
5705 | | - full_stripe_len); |
---|
5706 | | - raid56_full_stripe_start *= full_stripe_len; |
---|
5707 | | - } |
---|
5708 | | - |
---|
5709 | | - if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
---|
5710 | | - u64 max_len; |
---|
5711 | | - /* For writes to RAID[56], allow a full stripeset across all disks. |
---|
5712 | | - For other RAID types and for RAID[56] reads, just allow a single |
---|
5713 | | - stripe (on a single disk). */ |
---|
5714 | | - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && |
---|
5715 | | - (op == BTRFS_MAP_WRITE)) { |
---|
5716 | | - max_len = stripe_len * nr_data_stripes(map) - |
---|
5717 | | - (offset - raid56_full_stripe_start); |
---|
5718 | | - } else { |
---|
5719 | | - /* we limit the length of each bio to what fits in a stripe */ |
---|
5720 | | - max_len = stripe_len - stripe_offset; |
---|
5721 | | - } |
---|
5722 | | - *length = min_t(u64, em->len - offset, max_len); |
---|
5723 | | - } else { |
---|
5724 | | - *length = em->len - offset; |
---|
5725 | | - } |
---|
5726 | | - |
---|
5727 | | - /* This is for when we're called from btrfs_merge_bio_hook() and all |
---|
5728 | | - it cares about is the length */ |
---|
5729 | | - if (!bbio_ret) |
---|
5730 | | - goto out; |
---|
5731 | | - |
---|
5732 | | - btrfs_dev_replace_read_lock(dev_replace); |
---|
| 6141 | + down_read(&dev_replace->rwsem); |
---|
5733 | 6142 | dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); |
---|
| 6143 | + /* |
---|
| 6144 | + * Hold the semaphore for read during the whole operation, write is |
---|
| 6145 | + * requested at commit time but must wait. |
---|
| 6146 | + */ |
---|
5734 | 6147 | if (!dev_replace_is_ongoing) |
---|
5735 | | - btrfs_dev_replace_read_unlock(dev_replace); |
---|
5736 | | - else |
---|
5737 | | - btrfs_dev_replace_set_lock_blocking(dev_replace); |
---|
| 6148 | + up_read(&dev_replace->rwsem); |
---|
5738 | 6149 | |
---|
5739 | 6150 | if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && |
---|
5740 | 6151 | !need_full_stripe(op) && dev_replace->tgtdev != NULL) { |
---|
.. | .. |
---|
5757 | 6168 | &stripe_index); |
---|
5758 | 6169 | if (!need_full_stripe(op)) |
---|
5759 | 6170 | mirror_num = 1; |
---|
5760 | | - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { |
---|
| 6171 | + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { |
---|
5761 | 6172 | if (need_full_stripe(op)) |
---|
5762 | 6173 | num_stripes = map->num_stripes; |
---|
5763 | 6174 | else if (mirror_num) |
---|
.. | .. |
---|
5799 | 6210 | if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { |
---|
5800 | 6211 | /* push stripe_nr back to the start of the full stripe */ |
---|
5801 | 6212 | stripe_nr = div64_u64(raid56_full_stripe_start, |
---|
5802 | | - stripe_len * nr_data_stripes(map)); |
---|
| 6213 | + stripe_len * data_stripes); |
---|
5803 | 6214 | |
---|
5804 | 6215 | /* RAID[56] write or recovery. Return all stripes */ |
---|
5805 | 6216 | num_stripes = map->num_stripes; |
---|
.. | .. |
---|
5815 | 6226 | * Mirror #3 is RAID6 Q block. |
---|
5816 | 6227 | */ |
---|
5817 | 6228 | stripe_nr = div_u64_rem(stripe_nr, |
---|
5818 | | - nr_data_stripes(map), &stripe_index); |
---|
| 6229 | + data_stripes, &stripe_index); |
---|
5819 | 6230 | if (mirror_num > 1) |
---|
5820 | | - stripe_index = nr_data_stripes(map) + |
---|
5821 | | - mirror_num - 2; |
---|
| 6231 | + stripe_index = data_stripes + mirror_num - 2; |
---|
5822 | 6232 | |
---|
5823 | 6233 | /* We distribute the parity blocks across stripes */ |
---|
5824 | 6234 | div_u64_rem(stripe_nr + stripe_index, map->num_stripes, |
---|
.. | .. |
---|
5858 | 6268 | ret = -ENOMEM; |
---|
5859 | 6269 | goto out; |
---|
5860 | 6270 | } |
---|
5861 | | - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) |
---|
5862 | | - bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); |
---|
| 6271 | + |
---|
| 6272 | + for (i = 0; i < num_stripes; i++) { |
---|
| 6273 | + bbio->stripes[i].physical = map->stripes[stripe_index].physical + |
---|
| 6274 | + stripe_offset + stripe_nr * map->stripe_len; |
---|
| 6275 | + bbio->stripes[i].dev = map->stripes[stripe_index].dev; |
---|
| 6276 | + stripe_index++; |
---|
| 6277 | + } |
---|
5863 | 6278 | |
---|
5864 | 6279 | /* build raid_map */ |
---|
5865 | 6280 | if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && |
---|
.. | .. |
---|
5867 | 6282 | u64 tmp; |
---|
5868 | 6283 | unsigned rot; |
---|
5869 | 6284 | |
---|
5870 | | - bbio->raid_map = (u64 *)((void *)bbio->stripes + |
---|
5871 | | - sizeof(struct btrfs_bio_stripe) * |
---|
5872 | | - num_alloc_stripes + |
---|
5873 | | - sizeof(int) * tgtdev_indexes); |
---|
5874 | | - |
---|
5875 | 6285 | /* Work out the disk rotation on this stripe-set */ |
---|
5876 | 6286 | div_u64_rem(stripe_nr, num_stripes, &rot); |
---|
5877 | 6287 | |
---|
5878 | 6288 | /* Fill in the logical address of each stripe */ |
---|
5879 | | - tmp = stripe_nr * nr_data_stripes(map); |
---|
5880 | | - for (i = 0; i < nr_data_stripes(map); i++) |
---|
| 6289 | + tmp = stripe_nr * data_stripes; |
---|
| 6290 | + for (i = 0; i < data_stripes; i++) |
---|
5881 | 6291 | bbio->raid_map[(i+rot) % num_stripes] = |
---|
5882 | 6292 | em->start + (tmp + i) * map->stripe_len; |
---|
5883 | 6293 | |
---|
.. | .. |
---|
5885 | 6295 | if (map->type & BTRFS_BLOCK_GROUP_RAID6) |
---|
5886 | 6296 | bbio->raid_map[(i+rot+1) % num_stripes] = |
---|
5887 | 6297 | RAID6_Q_STRIPE; |
---|
5888 | | - } |
---|
5889 | 6298 | |
---|
5890 | | - |
---|
5891 | | - for (i = 0; i < num_stripes; i++) { |
---|
5892 | | - bbio->stripes[i].physical = |
---|
5893 | | - map->stripes[stripe_index].physical + |
---|
5894 | | - stripe_offset + |
---|
5895 | | - stripe_nr * map->stripe_len; |
---|
5896 | | - bbio->stripes[i].dev = |
---|
5897 | | - map->stripes[stripe_index].dev; |
---|
5898 | | - stripe_index++; |
---|
| 6299 | + sort_parity_stripes(bbio, num_stripes); |
---|
5899 | 6300 | } |
---|
5900 | 6301 | |
---|
5901 | 6302 | if (need_full_stripe(op)) |
---|
5902 | 6303 | max_errors = btrfs_chunk_max_errors(map); |
---|
5903 | | - |
---|
5904 | | - if (bbio->raid_map) |
---|
5905 | | - sort_parity_stripes(bbio, num_stripes); |
---|
5906 | 6304 | |
---|
5907 | 6305 | if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && |
---|
5908 | 6306 | need_full_stripe(op)) { |
---|
.. | .. |
---|
5929 | 6327 | } |
---|
5930 | 6328 | out: |
---|
5931 | 6329 | if (dev_replace_is_ongoing) { |
---|
5932 | | - btrfs_dev_replace_clear_lock_blocking(dev_replace); |
---|
5933 | | - btrfs_dev_replace_read_unlock(dev_replace); |
---|
| 6330 | + lockdep_assert_held(&dev_replace->rwsem); |
---|
| 6331 | + /* Unlock and let waiting writers proceed */ |
---|
| 6332 | + up_read(&dev_replace->rwsem); |
---|
5934 | 6333 | } |
---|
5935 | 6334 | free_extent_map(em); |
---|
5936 | 6335 | return ret; |
---|
.. | .. |
---|
5940 | 6339 | u64 logical, u64 *length, |
---|
5941 | 6340 | struct btrfs_bio **bbio_ret, int mirror_num) |
---|
5942 | 6341 | { |
---|
| 6342 | + if (op == BTRFS_MAP_DISCARD) |
---|
| 6343 | + return __btrfs_map_block_for_discard(fs_info, logical, |
---|
| 6344 | + length, bbio_ret); |
---|
| 6345 | + |
---|
5943 | 6346 | return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, |
---|
5944 | 6347 | mirror_num, 0); |
---|
5945 | 6348 | } |
---|
.. | .. |
---|
5950 | 6353 | struct btrfs_bio **bbio_ret) |
---|
5951 | 6354 | { |
---|
5952 | 6355 | return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); |
---|
5953 | | -} |
---|
5954 | | - |
---|
5955 | | -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, |
---|
5956 | | - u64 physical, u64 **logical, int *naddrs, int *stripe_len) |
---|
5957 | | -{ |
---|
5958 | | - struct extent_map *em; |
---|
5959 | | - struct map_lookup *map; |
---|
5960 | | - u64 *buf; |
---|
5961 | | - u64 bytenr; |
---|
5962 | | - u64 length; |
---|
5963 | | - u64 stripe_nr; |
---|
5964 | | - u64 rmap_len; |
---|
5965 | | - int i, j, nr = 0; |
---|
5966 | | - |
---|
5967 | | - em = get_chunk_map(fs_info, chunk_start, 1); |
---|
5968 | | - if (IS_ERR(em)) |
---|
5969 | | - return -EIO; |
---|
5970 | | - |
---|
5971 | | - map = em->map_lookup; |
---|
5972 | | - length = em->len; |
---|
5973 | | - rmap_len = map->stripe_len; |
---|
5974 | | - |
---|
5975 | | - if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
---|
5976 | | - length = div_u64(length, map->num_stripes / map->sub_stripes); |
---|
5977 | | - else if (map->type & BTRFS_BLOCK_GROUP_RAID0) |
---|
5978 | | - length = div_u64(length, map->num_stripes); |
---|
5979 | | - else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { |
---|
5980 | | - length = div_u64(length, nr_data_stripes(map)); |
---|
5981 | | - rmap_len = map->stripe_len * nr_data_stripes(map); |
---|
5982 | | - } |
---|
5983 | | - |
---|
5984 | | - buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); |
---|
5985 | | - BUG_ON(!buf); /* -ENOMEM */ |
---|
5986 | | - |
---|
5987 | | - for (i = 0; i < map->num_stripes; i++) { |
---|
5988 | | - if (map->stripes[i].physical > physical || |
---|
5989 | | - map->stripes[i].physical + length <= physical) |
---|
5990 | | - continue; |
---|
5991 | | - |
---|
5992 | | - stripe_nr = physical - map->stripes[i].physical; |
---|
5993 | | - stripe_nr = div64_u64(stripe_nr, map->stripe_len); |
---|
5994 | | - |
---|
5995 | | - if (map->type & BTRFS_BLOCK_GROUP_RAID10) { |
---|
5996 | | - stripe_nr = stripe_nr * map->num_stripes + i; |
---|
5997 | | - stripe_nr = div_u64(stripe_nr, map->sub_stripes); |
---|
5998 | | - } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
---|
5999 | | - stripe_nr = stripe_nr * map->num_stripes + i; |
---|
6000 | | - } /* else if RAID[56], multiply by nr_data_stripes(). |
---|
6001 | | - * Alternatively, just use rmap_len below instead of |
---|
6002 | | - * map->stripe_len */ |
---|
6003 | | - |
---|
6004 | | - bytenr = chunk_start + stripe_nr * rmap_len; |
---|
6005 | | - WARN_ON(nr >= map->num_stripes); |
---|
6006 | | - for (j = 0; j < nr; j++) { |
---|
6007 | | - if (buf[j] == bytenr) |
---|
6008 | | - break; |
---|
6009 | | - } |
---|
6010 | | - if (j == nr) { |
---|
6011 | | - WARN_ON(nr >= map->num_stripes); |
---|
6012 | | - buf[nr++] = bytenr; |
---|
6013 | | - } |
---|
6014 | | - } |
---|
6015 | | - |
---|
6016 | | - *logical = buf; |
---|
6017 | | - *naddrs = nr; |
---|
6018 | | - *stripe_len = rmap_len; |
---|
6019 | | - |
---|
6020 | | - free_extent_map(em); |
---|
6021 | | - return 0; |
---|
6022 | 6356 | } |
---|
6023 | 6357 | |
---|
6024 | 6358 | static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) |
---|
.. | .. |
---|
6039 | 6373 | atomic_inc(&bbio->error); |
---|
6040 | 6374 | if (bio->bi_status == BLK_STS_IOERR || |
---|
6041 | 6375 | bio->bi_status == BLK_STS_TARGET) { |
---|
6042 | | - unsigned int stripe_index = |
---|
6043 | | - btrfs_io_bio(bio)->stripe_index; |
---|
6044 | | - struct btrfs_device *dev; |
---|
| 6376 | + struct btrfs_device *dev = btrfs_io_bio(bio)->device; |
---|
6045 | 6377 | |
---|
6046 | | - BUG_ON(stripe_index >= bbio->num_stripes); |
---|
6047 | | - dev = bbio->stripes[stripe_index].dev; |
---|
6048 | | - if (dev->bdev) { |
---|
6049 | | - if (bio_op(bio) == REQ_OP_WRITE) |
---|
6050 | | - btrfs_dev_stat_inc_and_print(dev, |
---|
| 6378 | + ASSERT(dev->bdev); |
---|
| 6379 | + if (bio_op(bio) == REQ_OP_WRITE) |
---|
| 6380 | + btrfs_dev_stat_inc_and_print(dev, |
---|
6051 | 6381 | BTRFS_DEV_STAT_WRITE_ERRS); |
---|
6052 | | - else if (!(bio->bi_opf & REQ_RAHEAD)) |
---|
6053 | | - btrfs_dev_stat_inc_and_print(dev, |
---|
| 6382 | + else if (!(bio->bi_opf & REQ_RAHEAD)) |
---|
| 6383 | + btrfs_dev_stat_inc_and_print(dev, |
---|
6054 | 6384 | BTRFS_DEV_STAT_READ_ERRS); |
---|
6055 | | - if (bio->bi_opf & REQ_PREFLUSH) |
---|
6056 | | - btrfs_dev_stat_inc_and_print(dev, |
---|
| 6385 | + if (bio->bi_opf & REQ_PREFLUSH) |
---|
| 6386 | + btrfs_dev_stat_inc_and_print(dev, |
---|
6057 | 6387 | BTRFS_DEV_STAT_FLUSH_ERRS); |
---|
6058 | | - } |
---|
6059 | 6388 | } |
---|
6060 | 6389 | } |
---|
6061 | 6390 | |
---|
.. | .. |
---|
6090 | 6419 | } |
---|
6091 | 6420 | } |
---|
6092 | 6421 | |
---|
6093 | | -/* |
---|
6094 | | - * see run_scheduled_bios for a description of why bios are collected for |
---|
6095 | | - * async submit. |
---|
6096 | | - * |
---|
6097 | | - * This will add one bio to the pending list for a device and make sure |
---|
6098 | | - * the work struct is scheduled. |
---|
6099 | | - */ |
---|
6100 | | -static noinline void btrfs_schedule_bio(struct btrfs_device *device, |
---|
6101 | | - struct bio *bio) |
---|
6102 | | -{ |
---|
6103 | | - struct btrfs_fs_info *fs_info = device->fs_info; |
---|
6104 | | - int should_queue = 1; |
---|
6105 | | - struct btrfs_pending_bios *pending_bios; |
---|
6106 | | - |
---|
6107 | | - /* don't bother with additional async steps for reads, right now */ |
---|
6108 | | - if (bio_op(bio) == REQ_OP_READ) { |
---|
6109 | | - btrfsic_submit_bio(bio); |
---|
6110 | | - return; |
---|
6111 | | - } |
---|
6112 | | - |
---|
6113 | | - WARN_ON(bio->bi_next); |
---|
6114 | | - bio->bi_next = NULL; |
---|
6115 | | - |
---|
6116 | | - spin_lock(&device->io_lock); |
---|
6117 | | - if (op_is_sync(bio->bi_opf)) |
---|
6118 | | - pending_bios = &device->pending_sync_bios; |
---|
6119 | | - else |
---|
6120 | | - pending_bios = &device->pending_bios; |
---|
6121 | | - |
---|
6122 | | - if (pending_bios->tail) |
---|
6123 | | - pending_bios->tail->bi_next = bio; |
---|
6124 | | - |
---|
6125 | | - pending_bios->tail = bio; |
---|
6126 | | - if (!pending_bios->head) |
---|
6127 | | - pending_bios->head = bio; |
---|
6128 | | - if (device->running_pending) |
---|
6129 | | - should_queue = 0; |
---|
6130 | | - |
---|
6131 | | - spin_unlock(&device->io_lock); |
---|
6132 | | - |
---|
6133 | | - if (should_queue) |
---|
6134 | | - btrfs_queue_work(fs_info->submit_workers, &device->work); |
---|
6135 | | -} |
---|
6136 | | - |
---|
6137 | 6422 | static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, |
---|
6138 | | - u64 physical, int dev_nr, int async) |
---|
| 6423 | + u64 physical, struct btrfs_device *dev) |
---|
6139 | 6424 | { |
---|
6140 | | - struct btrfs_device *dev = bbio->stripes[dev_nr].dev; |
---|
6141 | 6425 | struct btrfs_fs_info *fs_info = bbio->fs_info; |
---|
6142 | 6426 | |
---|
6143 | 6427 | bio->bi_private = bbio; |
---|
6144 | | - btrfs_io_bio(bio)->stripe_index = dev_nr; |
---|
| 6428 | + btrfs_io_bio(bio)->device = dev; |
---|
6145 | 6429 | bio->bi_end_io = btrfs_end_bio; |
---|
6146 | 6430 | bio->bi_iter.bi_sector = physical >> 9; |
---|
6147 | 6431 | btrfs_debug_in_rcu(fs_info, |
---|
6148 | 6432 | "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", |
---|
6149 | 6433 | bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, |
---|
6150 | | - (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, |
---|
6151 | | - bio->bi_iter.bi_size); |
---|
| 6434 | + (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), |
---|
| 6435 | + dev->devid, bio->bi_iter.bi_size); |
---|
6152 | 6436 | bio_set_dev(bio, dev->bdev); |
---|
6153 | 6437 | |
---|
6154 | 6438 | btrfs_bio_counter_inc_noblocked(fs_info); |
---|
6155 | 6439 | |
---|
6156 | | - if (async) |
---|
6157 | | - btrfs_schedule_bio(dev, bio); |
---|
6158 | | - else |
---|
6159 | | - btrfsic_submit_bio(bio); |
---|
| 6440 | + btrfsic_submit_bio(bio); |
---|
6160 | 6441 | } |
---|
6161 | 6442 | |
---|
6162 | 6443 | static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) |
---|
.. | .. |
---|
6177 | 6458 | } |
---|
6178 | 6459 | |
---|
6179 | 6460 | blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, |
---|
6180 | | - int mirror_num, int async_submit) |
---|
| 6461 | + int mirror_num) |
---|
6181 | 6462 | { |
---|
6182 | 6463 | struct btrfs_device *dev; |
---|
6183 | 6464 | struct bio *first_bio = bio; |
---|
.. | .. |
---|
6245 | 6526 | else |
---|
6246 | 6527 | bio = first_bio; |
---|
6247 | 6528 | |
---|
6248 | | - submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, |
---|
6249 | | - dev_nr, async_submit); |
---|
| 6529 | + submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); |
---|
6250 | 6530 | } |
---|
6251 | 6531 | btrfs_bio_counter_dec(fs_info); |
---|
6252 | 6532 | return BLK_STS_OK; |
---|
.. | .. |
---|
6262 | 6542 | * If @seed is true, traverse through the seed devices. |
---|
6263 | 6543 | */ |
---|
6264 | 6544 | struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, |
---|
6265 | | - u64 devid, u8 *uuid, u8 *fsid, |
---|
6266 | | - bool seed) |
---|
| 6545 | + u64 devid, u8 *uuid, u8 *fsid, |
---|
| 6546 | + bool seed) |
---|
6267 | 6547 | { |
---|
6268 | 6548 | struct btrfs_device *device; |
---|
| 6549 | + struct btrfs_fs_devices *seed_devs; |
---|
6269 | 6550 | |
---|
6270 | | - while (fs_devices) { |
---|
| 6551 | + if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { |
---|
| 6552 | + list_for_each_entry(device, &fs_devices->devices, dev_list) { |
---|
| 6553 | + if (device->devid == devid && |
---|
| 6554 | + (!uuid || memcmp(device->uuid, uuid, |
---|
| 6555 | + BTRFS_UUID_SIZE) == 0)) |
---|
| 6556 | + return device; |
---|
| 6557 | + } |
---|
| 6558 | + } |
---|
| 6559 | + |
---|
| 6560 | + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { |
---|
6271 | 6561 | if (!fsid || |
---|
6272 | | - !memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) { |
---|
6273 | | - list_for_each_entry(device, &fs_devices->devices, |
---|
| 6562 | + !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { |
---|
| 6563 | + list_for_each_entry(device, &seed_devs->devices, |
---|
6274 | 6564 | dev_list) { |
---|
6275 | 6565 | if (device->devid == devid && |
---|
6276 | 6566 | (!uuid || memcmp(device->uuid, uuid, |
---|
.. | .. |
---|
6278 | 6568 | return device; |
---|
6279 | 6569 | } |
---|
6280 | 6570 | } |
---|
6281 | | - if (seed) |
---|
6282 | | - fs_devices = fs_devices->seed; |
---|
6283 | | - else |
---|
6284 | | - return NULL; |
---|
6285 | 6571 | } |
---|
| 6572 | + |
---|
6286 | 6573 | return NULL; |
---|
6287 | 6574 | } |
---|
6288 | 6575 | |
---|
.. | .. |
---|
6337 | 6624 | if (WARN_ON(!devid && !fs_info)) |
---|
6338 | 6625 | return ERR_PTR(-EINVAL); |
---|
6339 | 6626 | |
---|
6340 | | - dev = __alloc_device(); |
---|
| 6627 | + dev = __alloc_device(fs_info); |
---|
6341 | 6628 | if (IS_ERR(dev)) |
---|
6342 | 6629 | return dev; |
---|
6343 | 6630 | |
---|
.. | .. |
---|
6359 | 6646 | else |
---|
6360 | 6647 | generate_random_uuid(dev->uuid); |
---|
6361 | 6648 | |
---|
6362 | | - btrfs_init_work(&dev->work, btrfs_submit_helper, |
---|
6363 | | - pending_bios_fn, NULL, NULL); |
---|
6364 | | - |
---|
6365 | 6649 | return dev; |
---|
6366 | 6650 | } |
---|
6367 | 6651 | |
---|
.. | .. |
---|
6376 | 6660 | devid, uuid); |
---|
6377 | 6661 | } |
---|
6378 | 6662 | |
---|
6379 | | -static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, |
---|
6380 | | - struct extent_buffer *leaf, |
---|
| 6663 | +static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) |
---|
| 6664 | +{ |
---|
| 6665 | + int index = btrfs_bg_flags_to_raid_index(type); |
---|
| 6666 | + int ncopies = btrfs_raid_array[index].ncopies; |
---|
| 6667 | + const int nparity = btrfs_raid_array[index].nparity; |
---|
| 6668 | + int data_stripes; |
---|
| 6669 | + |
---|
| 6670 | + if (nparity) |
---|
| 6671 | + data_stripes = num_stripes - nparity; |
---|
| 6672 | + else |
---|
| 6673 | + data_stripes = num_stripes / ncopies; |
---|
| 6674 | + |
---|
| 6675 | + return div_u64(chunk_len, data_stripes); |
---|
| 6676 | +} |
---|
| 6677 | + |
---|
| 6678 | +static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, |
---|
6381 | 6679 | struct btrfs_chunk *chunk) |
---|
6382 | 6680 | { |
---|
6383 | | - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; |
---|
| 6681 | + struct btrfs_fs_info *fs_info = leaf->fs_info; |
---|
| 6682 | + struct extent_map_tree *map_tree = &fs_info->mapping_tree; |
---|
6384 | 6683 | struct map_lookup *map; |
---|
6385 | 6684 | struct extent_map *em; |
---|
6386 | 6685 | u64 logical; |
---|
.. | .. |
---|
6400 | 6699 | * as chunk item in tree block is already verified by tree-checker. |
---|
6401 | 6700 | */ |
---|
6402 | 6701 | if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { |
---|
6403 | | - ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); |
---|
| 6702 | + ret = btrfs_check_chunk_valid(leaf, chunk, logical); |
---|
6404 | 6703 | if (ret) |
---|
6405 | 6704 | return ret; |
---|
6406 | 6705 | } |
---|
6407 | 6706 | |
---|
6408 | | - read_lock(&map_tree->map_tree.lock); |
---|
6409 | | - em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); |
---|
6410 | | - read_unlock(&map_tree->map_tree.lock); |
---|
| 6707 | + read_lock(&map_tree->lock); |
---|
| 6708 | + em = lookup_extent_mapping(map_tree, logical, 1); |
---|
| 6709 | + read_unlock(&map_tree->lock); |
---|
6411 | 6710 | |
---|
6412 | 6711 | /* already mapped? */ |
---|
6413 | 6712 | if (em && em->start <= logical && em->start + em->len > logical) { |
---|
.. | .. |
---|
6441 | 6740 | map->type = btrfs_chunk_type(leaf, chunk); |
---|
6442 | 6741 | map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); |
---|
6443 | 6742 | map->verified_stripes = 0; |
---|
| 6743 | + em->orig_block_len = calc_stripe_length(map->type, em->len, |
---|
| 6744 | + map->num_stripes); |
---|
6444 | 6745 | for (i = 0; i < num_stripes; i++) { |
---|
6445 | 6746 | map->stripes[i].physical = |
---|
6446 | 6747 | btrfs_stripe_offset_nr(leaf, chunk, i); |
---|
.. | .. |
---|
6449 | 6750 | btrfs_stripe_dev_uuid_nr(chunk, i), |
---|
6450 | 6751 | BTRFS_UUID_SIZE); |
---|
6451 | 6752 | map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, |
---|
6452 | | - devid, uuid, NULL, true); |
---|
| 6753 | + devid, uuid, NULL, true); |
---|
6453 | 6754 | if (!map->stripes[i].dev && |
---|
6454 | 6755 | !btrfs_test_opt(fs_info, DEGRADED)) { |
---|
6455 | 6756 | free_extent_map(em); |
---|
.. | .. |
---|
6474 | 6775 | |
---|
6475 | 6776 | } |
---|
6476 | 6777 | |
---|
6477 | | - write_lock(&map_tree->map_tree.lock); |
---|
6478 | | - ret = add_extent_mapping(&map_tree->map_tree, em, 0); |
---|
6479 | | - write_unlock(&map_tree->map_tree.lock); |
---|
| 6778 | + write_lock(&map_tree->lock); |
---|
| 6779 | + ret = add_extent_mapping(map_tree, em, 0); |
---|
| 6780 | + write_unlock(&map_tree->lock); |
---|
6480 | 6781 | if (ret < 0) { |
---|
6481 | 6782 | btrfs_err(fs_info, |
---|
6482 | 6783 | "failed to add chunk map, start=%llu len=%llu: %d", |
---|
.. | .. |
---|
6519 | 6820 | lockdep_assert_held(&uuid_mutex); |
---|
6520 | 6821 | ASSERT(fsid); |
---|
6521 | 6822 | |
---|
6522 | | - fs_devices = fs_info->fs_devices->seed; |
---|
6523 | | - while (fs_devices) { |
---|
| 6823 | + /* This will match only for multi-device seed fs */ |
---|
| 6824 | + list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) |
---|
6524 | 6825 | if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) |
---|
6525 | 6826 | return fs_devices; |
---|
6526 | 6827 | |
---|
6527 | | - fs_devices = fs_devices->seed; |
---|
6528 | | - } |
---|
6529 | 6828 | |
---|
6530 | | - fs_devices = find_fsid(fsid); |
---|
| 6829 | + fs_devices = find_fsid(fsid, NULL); |
---|
6531 | 6830 | if (!fs_devices) { |
---|
6532 | 6831 | if (!btrfs_test_opt(fs_info, DEGRADED)) |
---|
6533 | 6832 | return ERR_PTR(-ENOENT); |
---|
6534 | 6833 | |
---|
6535 | | - fs_devices = alloc_fs_devices(fsid); |
---|
| 6834 | + fs_devices = alloc_fs_devices(fsid, NULL); |
---|
6536 | 6835 | if (IS_ERR(fs_devices)) |
---|
6537 | 6836 | return fs_devices; |
---|
6538 | 6837 | |
---|
6539 | | - fs_devices->seeding = 1; |
---|
| 6838 | + fs_devices->seeding = true; |
---|
6540 | 6839 | fs_devices->opened = 1; |
---|
6541 | 6840 | return fs_devices; |
---|
6542 | 6841 | } |
---|
6543 | 6842 | |
---|
| 6843 | + /* |
---|
| 6844 | + * Upon first call for a seed fs fsid, just create a private copy of the |
---|
| 6845 | + * respective fs_devices and anchor it at fs_info->fs_devices->seed_list |
---|
| 6846 | + */ |
---|
6544 | 6847 | fs_devices = clone_fs_devices(fs_devices); |
---|
6545 | 6848 | if (IS_ERR(fs_devices)) |
---|
6546 | 6849 | return fs_devices; |
---|
.. | .. |
---|
6548 | 6851 | ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); |
---|
6549 | 6852 | if (ret) { |
---|
6550 | 6853 | free_fs_devices(fs_devices); |
---|
6551 | | - fs_devices = ERR_PTR(ret); |
---|
6552 | | - goto out; |
---|
| 6854 | + return ERR_PTR(ret); |
---|
6553 | 6855 | } |
---|
6554 | 6856 | |
---|
6555 | 6857 | if (!fs_devices->seeding) { |
---|
6556 | 6858 | close_fs_devices(fs_devices); |
---|
6557 | 6859 | free_fs_devices(fs_devices); |
---|
6558 | | - fs_devices = ERR_PTR(-EINVAL); |
---|
6559 | | - goto out; |
---|
| 6860 | + return ERR_PTR(-EINVAL); |
---|
6560 | 6861 | } |
---|
6561 | 6862 | |
---|
6562 | | - fs_devices->seed = fs_info->fs_devices->seed; |
---|
6563 | | - fs_info->fs_devices->seed = fs_devices; |
---|
6564 | | -out: |
---|
| 6863 | + list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); |
---|
| 6864 | + |
---|
6565 | 6865 | return fs_devices; |
---|
6566 | 6866 | } |
---|
6567 | 6867 | |
---|
6568 | | -static int read_one_dev(struct btrfs_fs_info *fs_info, |
---|
6569 | | - struct extent_buffer *leaf, |
---|
| 6868 | +static int read_one_dev(struct extent_buffer *leaf, |
---|
6570 | 6869 | struct btrfs_dev_item *dev_item) |
---|
6571 | 6870 | { |
---|
| 6871 | + struct btrfs_fs_info *fs_info = leaf->fs_info; |
---|
6572 | 6872 | struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
---|
6573 | 6873 | struct btrfs_device *device; |
---|
6574 | 6874 | u64 devid; |
---|
.. | .. |
---|
6582 | 6882 | read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), |
---|
6583 | 6883 | BTRFS_FSID_SIZE); |
---|
6584 | 6884 | |
---|
6585 | | - if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { |
---|
| 6885 | + if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { |
---|
6586 | 6886 | fs_devices = open_seed_devices(fs_info, fs_uuid); |
---|
6587 | 6887 | if (IS_ERR(fs_devices)) |
---|
6588 | 6888 | return PTR_ERR(fs_devices); |
---|
.. | .. |
---|
6725 | 7025 | sb_array_offset += len; |
---|
6726 | 7026 | cur_offset += len; |
---|
6727 | 7027 | |
---|
6728 | | - if (key.type == BTRFS_CHUNK_ITEM_KEY) { |
---|
6729 | | - chunk = (struct btrfs_chunk *)sb_array_offset; |
---|
6730 | | - /* |
---|
6731 | | - * At least one btrfs_chunk with one stripe must be |
---|
6732 | | - * present, exact stripe count check comes afterwards |
---|
6733 | | - */ |
---|
6734 | | - len = btrfs_chunk_item_size(1); |
---|
6735 | | - if (cur_offset + len > array_size) |
---|
6736 | | - goto out_short_read; |
---|
6737 | | - |
---|
6738 | | - num_stripes = btrfs_chunk_num_stripes(sb, chunk); |
---|
6739 | | - if (!num_stripes) { |
---|
6740 | | - btrfs_err(fs_info, |
---|
6741 | | - "invalid number of stripes %u in sys_array at offset %u", |
---|
6742 | | - num_stripes, cur_offset); |
---|
6743 | | - ret = -EIO; |
---|
6744 | | - break; |
---|
6745 | | - } |
---|
6746 | | - |
---|
6747 | | - type = btrfs_chunk_type(sb, chunk); |
---|
6748 | | - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { |
---|
6749 | | - btrfs_err(fs_info, |
---|
6750 | | - "invalid chunk type %llu in sys_array at offset %u", |
---|
6751 | | - type, cur_offset); |
---|
6752 | | - ret = -EIO; |
---|
6753 | | - break; |
---|
6754 | | - } |
---|
6755 | | - |
---|
6756 | | - len = btrfs_chunk_item_size(num_stripes); |
---|
6757 | | - if (cur_offset + len > array_size) |
---|
6758 | | - goto out_short_read; |
---|
6759 | | - |
---|
6760 | | - ret = read_one_chunk(fs_info, &key, sb, chunk); |
---|
6761 | | - if (ret) |
---|
6762 | | - break; |
---|
6763 | | - } else { |
---|
| 7028 | + if (key.type != BTRFS_CHUNK_ITEM_KEY) { |
---|
6764 | 7029 | btrfs_err(fs_info, |
---|
6765 | 7030 | "unexpected item type %u in sys_array at offset %u", |
---|
6766 | 7031 | (u32)key.type, cur_offset); |
---|
6767 | 7032 | ret = -EIO; |
---|
6768 | 7033 | break; |
---|
6769 | 7034 | } |
---|
| 7035 | + |
---|
| 7036 | + chunk = (struct btrfs_chunk *)sb_array_offset; |
---|
| 7037 | + /* |
---|
| 7038 | + * At least one btrfs_chunk with one stripe must be present, |
---|
| 7039 | + * exact stripe count check comes afterwards |
---|
| 7040 | + */ |
---|
| 7041 | + len = btrfs_chunk_item_size(1); |
---|
| 7042 | + if (cur_offset + len > array_size) |
---|
| 7043 | + goto out_short_read; |
---|
| 7044 | + |
---|
| 7045 | + num_stripes = btrfs_chunk_num_stripes(sb, chunk); |
---|
| 7046 | + if (!num_stripes) { |
---|
| 7047 | + btrfs_err(fs_info, |
---|
| 7048 | + "invalid number of stripes %u in sys_array at offset %u", |
---|
| 7049 | + num_stripes, cur_offset); |
---|
| 7050 | + ret = -EIO; |
---|
| 7051 | + break; |
---|
| 7052 | + } |
---|
| 7053 | + |
---|
| 7054 | + type = btrfs_chunk_type(sb, chunk); |
---|
| 7055 | + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { |
---|
| 7056 | + btrfs_err(fs_info, |
---|
| 7057 | + "invalid chunk type %llu in sys_array at offset %u", |
---|
| 7058 | + type, cur_offset); |
---|
| 7059 | + ret = -EIO; |
---|
| 7060 | + break; |
---|
| 7061 | + } |
---|
| 7062 | + |
---|
| 7063 | + len = btrfs_chunk_item_size(num_stripes); |
---|
| 7064 | + if (cur_offset + len > array_size) |
---|
| 7065 | + goto out_short_read; |
---|
| 7066 | + |
---|
| 7067 | + ret = read_one_chunk(&key, sb, chunk); |
---|
| 7068 | + if (ret) |
---|
| 7069 | + break; |
---|
| 7070 | + |
---|
6770 | 7071 | array_ptr += len; |
---|
6771 | 7072 | sb_array_offset += len; |
---|
6772 | 7073 | cur_offset += len; |
---|
.. | .. |
---|
6794 | 7095 | bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, |
---|
6795 | 7096 | struct btrfs_device *failing_dev) |
---|
6796 | 7097 | { |
---|
6797 | | - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; |
---|
| 7098 | + struct extent_map_tree *map_tree = &fs_info->mapping_tree; |
---|
6798 | 7099 | struct extent_map *em; |
---|
6799 | 7100 | u64 next_start = 0; |
---|
6800 | 7101 | bool ret = true; |
---|
6801 | 7102 | |
---|
6802 | | - read_lock(&map_tree->map_tree.lock); |
---|
6803 | | - em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); |
---|
6804 | | - read_unlock(&map_tree->map_tree.lock); |
---|
| 7103 | + read_lock(&map_tree->lock); |
---|
| 7104 | + em = lookup_extent_mapping(map_tree, 0, (u64)-1); |
---|
| 7105 | + read_unlock(&map_tree->lock); |
---|
6805 | 7106 | /* No chunk at all? Return false anyway */ |
---|
6806 | 7107 | if (!em) { |
---|
6807 | 7108 | ret = false; |
---|
.. | .. |
---|
6830 | 7131 | if (missing > max_tolerated) { |
---|
6831 | 7132 | if (!failing_dev) |
---|
6832 | 7133 | btrfs_warn(fs_info, |
---|
6833 | | - "chunk %llu missing %d devices, max tolerance is %d for writeable mount", |
---|
| 7134 | + "chunk %llu missing %d devices, max tolerance is %d for writable mount", |
---|
6834 | 7135 | em->start, missing, max_tolerated); |
---|
6835 | 7136 | free_extent_map(em); |
---|
6836 | 7137 | ret = false; |
---|
.. | .. |
---|
6839 | 7140 | next_start = extent_map_end(em); |
---|
6840 | 7141 | free_extent_map(em); |
---|
6841 | 7142 | |
---|
6842 | | - read_lock(&map_tree->map_tree.lock); |
---|
6843 | | - em = lookup_extent_mapping(&map_tree->map_tree, next_start, |
---|
| 7143 | + read_lock(&map_tree->lock); |
---|
| 7144 | + em = lookup_extent_mapping(map_tree, next_start, |
---|
6844 | 7145 | (u64)(-1) - next_start); |
---|
6845 | | - read_unlock(&map_tree->map_tree.lock); |
---|
| 7146 | + read_unlock(&map_tree->lock); |
---|
6846 | 7147 | } |
---|
6847 | 7148 | out: |
---|
6848 | 7149 | return ret; |
---|
| 7150 | +} |
---|
| 7151 | + |
---|
| 7152 | +static void readahead_tree_node_children(struct extent_buffer *node) |
---|
| 7153 | +{ |
---|
| 7154 | + int i; |
---|
| 7155 | + const int nr_items = btrfs_header_nritems(node); |
---|
| 7156 | + |
---|
| 7157 | + for (i = 0; i < nr_items; i++) { |
---|
| 7158 | + u64 start; |
---|
| 7159 | + |
---|
| 7160 | + start = btrfs_node_blockptr(node, i); |
---|
| 7161 | + readahead_tree_block(node->fs_info, start); |
---|
| 7162 | + } |
---|
6849 | 7163 | } |
---|
6850 | 7164 | |
---|
6851 | 7165 | int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) |
---|
.. | .. |
---|
6858 | 7172 | int ret; |
---|
6859 | 7173 | int slot; |
---|
6860 | 7174 | u64 total_dev = 0; |
---|
| 7175 | + u64 last_ra_node = 0; |
---|
6861 | 7176 | |
---|
6862 | 7177 | path = btrfs_alloc_path(); |
---|
6863 | 7178 | if (!path) |
---|
.. | .. |
---|
6868 | 7183 | * otherwise we don't need it. |
---|
6869 | 7184 | */ |
---|
6870 | 7185 | mutex_lock(&uuid_mutex); |
---|
6871 | | - mutex_lock(&fs_info->chunk_mutex); |
---|
6872 | 7186 | |
---|
6873 | 7187 | /* |
---|
6874 | 7188 | * It is possible for mount and umount to race in such a way that |
---|
.. | .. |
---|
6891 | 7205 | if (ret < 0) |
---|
6892 | 7206 | goto error; |
---|
6893 | 7207 | while (1) { |
---|
| 7208 | + struct extent_buffer *node; |
---|
| 7209 | + |
---|
6894 | 7210 | leaf = path->nodes[0]; |
---|
6895 | 7211 | slot = path->slots[0]; |
---|
6896 | 7212 | if (slot >= btrfs_header_nritems(leaf)) { |
---|
.. | .. |
---|
6901 | 7217 | goto error; |
---|
6902 | 7218 | break; |
---|
6903 | 7219 | } |
---|
| 7220 | + /* |
---|
| 7221 | + * The nodes on level 1 are not locked but we don't need to do |
---|
| 7222 | + * that during mount time as nothing else can access the tree |
---|
| 7223 | + */ |
---|
| 7224 | + node = path->nodes[1]; |
---|
| 7225 | + if (node) { |
---|
| 7226 | + if (last_ra_node != node->start) { |
---|
| 7227 | + readahead_tree_node_children(node); |
---|
| 7228 | + last_ra_node = node->start; |
---|
| 7229 | + } |
---|
| 7230 | + } |
---|
6904 | 7231 | btrfs_item_key_to_cpu(leaf, &found_key, slot); |
---|
6905 | 7232 | if (found_key.type == BTRFS_DEV_ITEM_KEY) { |
---|
6906 | 7233 | struct btrfs_dev_item *dev_item; |
---|
6907 | 7234 | dev_item = btrfs_item_ptr(leaf, slot, |
---|
6908 | 7235 | struct btrfs_dev_item); |
---|
6909 | | - ret = read_one_dev(fs_info, leaf, dev_item); |
---|
| 7236 | + ret = read_one_dev(leaf, dev_item); |
---|
6910 | 7237 | if (ret) |
---|
6911 | 7238 | goto error; |
---|
6912 | 7239 | total_dev++; |
---|
6913 | 7240 | } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { |
---|
6914 | 7241 | struct btrfs_chunk *chunk; |
---|
6915 | 7242 | chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); |
---|
6916 | | - ret = read_one_chunk(fs_info, &found_key, leaf, chunk); |
---|
| 7243 | + mutex_lock(&fs_info->chunk_mutex); |
---|
| 7244 | + ret = read_one_chunk(&found_key, leaf, chunk); |
---|
| 7245 | + mutex_unlock(&fs_info->chunk_mutex); |
---|
6917 | 7246 | if (ret) |
---|
6918 | 7247 | goto error; |
---|
6919 | 7248 | } |
---|
.. | .. |
---|
6925 | 7254 | * do another round of validation checks. |
---|
6926 | 7255 | */ |
---|
6927 | 7256 | if (total_dev != fs_info->fs_devices->total_devices) { |
---|
6928 | | - btrfs_err(fs_info, |
---|
6929 | | - "super_num_devices %llu mismatch with num_devices %llu found here", |
---|
| 7257 | + btrfs_warn(fs_info, |
---|
| 7258 | +"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", |
---|
6930 | 7259 | btrfs_super_num_devices(fs_info->super_copy), |
---|
6931 | 7260 | total_dev); |
---|
6932 | | - ret = -EINVAL; |
---|
6933 | | - goto error; |
---|
| 7261 | + fs_info->fs_devices->total_devices = total_dev; |
---|
| 7262 | + btrfs_set_super_num_devices(fs_info->super_copy, total_dev); |
---|
6934 | 7263 | } |
---|
6935 | 7264 | if (btrfs_super_total_bytes(fs_info->super_copy) < |
---|
6936 | 7265 | fs_info->fs_devices->total_rw_bytes) { |
---|
.. | .. |
---|
6943 | 7272 | } |
---|
6944 | 7273 | ret = 0; |
---|
6945 | 7274 | error: |
---|
6946 | | - mutex_unlock(&fs_info->chunk_mutex); |
---|
6947 | 7275 | mutex_unlock(&uuid_mutex); |
---|
6948 | 7276 | |
---|
6949 | 7277 | btrfs_free_path(path); |
---|
.. | .. |
---|
6952 | 7280 | |
---|
6953 | 7281 | void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) |
---|
6954 | 7282 | { |
---|
6955 | | - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
---|
| 7283 | + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; |
---|
6956 | 7284 | struct btrfs_device *device; |
---|
6957 | 7285 | |
---|
6958 | | - while (fs_devices) { |
---|
6959 | | - mutex_lock(&fs_devices->device_list_mutex); |
---|
6960 | | - list_for_each_entry(device, &fs_devices->devices, dev_list) |
---|
6961 | | - device->fs_info = fs_info; |
---|
6962 | | - mutex_unlock(&fs_devices->device_list_mutex); |
---|
| 7286 | + fs_devices->fs_info = fs_info; |
---|
6963 | 7287 | |
---|
6964 | | - fs_devices = fs_devices->seed; |
---|
| 7288 | + mutex_lock(&fs_devices->device_list_mutex); |
---|
| 7289 | + list_for_each_entry(device, &fs_devices->devices, dev_list) |
---|
| 7290 | + device->fs_info = fs_info; |
---|
| 7291 | + |
---|
| 7292 | + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { |
---|
| 7293 | + list_for_each_entry(device, &seed_devs->devices, dev_list) |
---|
| 7294 | + device->fs_info = fs_info; |
---|
| 7295 | + |
---|
| 7296 | + seed_devs->fs_info = fs_info; |
---|
6965 | 7297 | } |
---|
| 7298 | + mutex_unlock(&fs_devices->device_list_mutex); |
---|
6966 | 7299 | } |
---|
6967 | 7300 | |
---|
6968 | | -static void __btrfs_reset_dev_stats(struct btrfs_device *dev) |
---|
| 7301 | +static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, |
---|
| 7302 | + const struct btrfs_dev_stats_item *ptr, |
---|
| 7303 | + int index) |
---|
6969 | 7304 | { |
---|
6970 | | - int i; |
---|
| 7305 | + u64 val; |
---|
6971 | 7306 | |
---|
6972 | | - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) |
---|
6973 | | - btrfs_dev_stat_reset(dev, i); |
---|
| 7307 | + read_extent_buffer(eb, &val, |
---|
| 7308 | + offsetof(struct btrfs_dev_stats_item, values) + |
---|
| 7309 | + ((unsigned long)ptr) + (index * sizeof(u64)), |
---|
| 7310 | + sizeof(val)); |
---|
| 7311 | + return val; |
---|
| 7312 | +} |
---|
| 7313 | + |
---|
| 7314 | +static void btrfs_set_dev_stats_value(struct extent_buffer *eb, |
---|
| 7315 | + struct btrfs_dev_stats_item *ptr, |
---|
| 7316 | + int index, u64 val) |
---|
| 7317 | +{ |
---|
| 7318 | + write_extent_buffer(eb, &val, |
---|
| 7319 | + offsetof(struct btrfs_dev_stats_item, values) + |
---|
| 7320 | + ((unsigned long)ptr) + (index * sizeof(u64)), |
---|
| 7321 | + sizeof(val)); |
---|
| 7322 | +} |
---|
| 7323 | + |
---|
| 7324 | +static int btrfs_device_init_dev_stats(struct btrfs_device *device, |
---|
| 7325 | + struct btrfs_path *path) |
---|
| 7326 | +{ |
---|
| 7327 | + struct btrfs_dev_stats_item *ptr; |
---|
| 7328 | + struct extent_buffer *eb; |
---|
| 7329 | + struct btrfs_key key; |
---|
| 7330 | + int item_size; |
---|
| 7331 | + int i, ret, slot; |
---|
| 7332 | + |
---|
| 7333 | + key.objectid = BTRFS_DEV_STATS_OBJECTID; |
---|
| 7334 | + key.type = BTRFS_PERSISTENT_ITEM_KEY; |
---|
| 7335 | + key.offset = device->devid; |
---|
| 7336 | + ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); |
---|
| 7337 | + if (ret) { |
---|
| 7338 | + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) |
---|
| 7339 | + btrfs_dev_stat_set(device, i, 0); |
---|
| 7340 | + device->dev_stats_valid = 1; |
---|
| 7341 | + btrfs_release_path(path); |
---|
| 7342 | + return ret < 0 ? ret : 0; |
---|
| 7343 | + } |
---|
| 7344 | + slot = path->slots[0]; |
---|
| 7345 | + eb = path->nodes[0]; |
---|
| 7346 | + item_size = btrfs_item_size_nr(eb, slot); |
---|
| 7347 | + |
---|
| 7348 | + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); |
---|
| 7349 | + |
---|
| 7350 | + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { |
---|
| 7351 | + if (item_size >= (1 + i) * sizeof(__le64)) |
---|
| 7352 | + btrfs_dev_stat_set(device, i, |
---|
| 7353 | + btrfs_dev_stats_value(eb, ptr, i)); |
---|
| 7354 | + else |
---|
| 7355 | + btrfs_dev_stat_set(device, i, 0); |
---|
| 7356 | + } |
---|
| 7357 | + |
---|
| 7358 | + device->dev_stats_valid = 1; |
---|
| 7359 | + btrfs_dev_stat_print_on_load(device); |
---|
| 7360 | + btrfs_release_path(path); |
---|
| 7361 | + |
---|
| 7362 | + return 0; |
---|
6974 | 7363 | } |
---|
6975 | 7364 | |
---|
6976 | 7365 | int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) |
---|
6977 | 7366 | { |
---|
6978 | | - struct btrfs_key key; |
---|
6979 | | - struct btrfs_key found_key; |
---|
6980 | | - struct btrfs_root *dev_root = fs_info->dev_root; |
---|
6981 | | - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
---|
6982 | | - struct extent_buffer *eb; |
---|
6983 | | - int slot; |
---|
6984 | | - int ret = 0; |
---|
| 7367 | + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; |
---|
6985 | 7368 | struct btrfs_device *device; |
---|
6986 | 7369 | struct btrfs_path *path = NULL; |
---|
6987 | | - int i; |
---|
| 7370 | + int ret = 0; |
---|
6988 | 7371 | |
---|
6989 | 7372 | path = btrfs_alloc_path(); |
---|
6990 | | - if (!path) { |
---|
6991 | | - ret = -ENOMEM; |
---|
6992 | | - goto out; |
---|
6993 | | - } |
---|
| 7373 | + if (!path) |
---|
| 7374 | + return -ENOMEM; |
---|
6994 | 7375 | |
---|
6995 | 7376 | mutex_lock(&fs_devices->device_list_mutex); |
---|
6996 | 7377 | list_for_each_entry(device, &fs_devices->devices, dev_list) { |
---|
6997 | | - int item_size; |
---|
6998 | | - struct btrfs_dev_stats_item *ptr; |
---|
6999 | | - |
---|
7000 | | - key.objectid = BTRFS_DEV_STATS_OBJECTID; |
---|
7001 | | - key.type = BTRFS_PERSISTENT_ITEM_KEY; |
---|
7002 | | - key.offset = device->devid; |
---|
7003 | | - ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); |
---|
7004 | | - if (ret) { |
---|
7005 | | - __btrfs_reset_dev_stats(device); |
---|
7006 | | - device->dev_stats_valid = 1; |
---|
7007 | | - btrfs_release_path(path); |
---|
7008 | | - continue; |
---|
7009 | | - } |
---|
7010 | | - slot = path->slots[0]; |
---|
7011 | | - eb = path->nodes[0]; |
---|
7012 | | - btrfs_item_key_to_cpu(eb, &found_key, slot); |
---|
7013 | | - item_size = btrfs_item_size_nr(eb, slot); |
---|
7014 | | - |
---|
7015 | | - ptr = btrfs_item_ptr(eb, slot, |
---|
7016 | | - struct btrfs_dev_stats_item); |
---|
7017 | | - |
---|
7018 | | - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { |
---|
7019 | | - if (item_size >= (1 + i) * sizeof(__le64)) |
---|
7020 | | - btrfs_dev_stat_set(device, i, |
---|
7021 | | - btrfs_dev_stats_value(eb, ptr, i)); |
---|
7022 | | - else |
---|
7023 | | - btrfs_dev_stat_reset(device, i); |
---|
7024 | | - } |
---|
7025 | | - |
---|
7026 | | - device->dev_stats_valid = 1; |
---|
7027 | | - btrfs_dev_stat_print_on_load(device); |
---|
7028 | | - btrfs_release_path(path); |
---|
| 7378 | + ret = btrfs_device_init_dev_stats(device, path); |
---|
| 7379 | + if (ret) |
---|
| 7380 | + goto out; |
---|
7029 | 7381 | } |
---|
| 7382 | + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { |
---|
| 7383 | + list_for_each_entry(device, &seed_devs->devices, dev_list) { |
---|
| 7384 | + ret = btrfs_device_init_dev_stats(device, path); |
---|
| 7385 | + if (ret) |
---|
| 7386 | + goto out; |
---|
| 7387 | + } |
---|
| 7388 | + } |
---|
| 7389 | +out: |
---|
7030 | 7390 | mutex_unlock(&fs_devices->device_list_mutex); |
---|
7031 | 7391 | |
---|
7032 | | -out: |
---|
7033 | 7392 | btrfs_free_path(path); |
---|
7034 | | - return ret < 0 ? ret : 0; |
---|
| 7393 | + return ret; |
---|
7035 | 7394 | } |
---|
7036 | 7395 | |
---|
7037 | 7396 | static int update_dev_stat_item(struct btrfs_trans_handle *trans, |
---|
.. | .. |
---|
7102 | 7461 | /* |
---|
7103 | 7462 | * called from commit_transaction. Writes all changed device stats to disk. |
---|
7104 | 7463 | */ |
---|
7105 | | -int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, |
---|
7106 | | - struct btrfs_fs_info *fs_info) |
---|
| 7464 | +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) |
---|
7107 | 7465 | { |
---|
| 7466 | + struct btrfs_fs_info *fs_info = trans->fs_info; |
---|
7108 | 7467 | struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
---|
7109 | 7468 | struct btrfs_device *device; |
---|
7110 | 7469 | int stats_cnt; |
---|
.. | .. |
---|
7187 | 7546 | int i; |
---|
7188 | 7547 | |
---|
7189 | 7548 | mutex_lock(&fs_devices->device_list_mutex); |
---|
7190 | | - dev = btrfs_find_device(fs_info->fs_devices, stats->devid, |
---|
7191 | | - NULL, NULL, true); |
---|
| 7549 | + dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL, |
---|
| 7550 | + true); |
---|
7192 | 7551 | mutex_unlock(&fs_devices->device_list_mutex); |
---|
7193 | 7552 | |
---|
7194 | 7553 | if (!dev) { |
---|
.. | .. |
---|
7203 | 7562 | stats->values[i] = |
---|
7204 | 7563 | btrfs_dev_stat_read_and_reset(dev, i); |
---|
7205 | 7564 | else |
---|
7206 | | - btrfs_dev_stat_reset(dev, i); |
---|
| 7565 | + btrfs_dev_stat_set(dev, i, 0); |
---|
7207 | 7566 | } |
---|
7208 | 7567 | btrfs_info(fs_info, "device stats zeroed by %s (%d)", |
---|
7209 | 7568 | current->comm, task_pid_nr(current)); |
---|
.. | .. |
---|
7217 | 7576 | return 0; |
---|
7218 | 7577 | } |
---|
7219 | 7578 | |
---|
7220 | | -void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path) |
---|
7221 | | -{ |
---|
7222 | | - struct buffer_head *bh; |
---|
7223 | | - struct btrfs_super_block *disk_super; |
---|
7224 | | - int copy_num; |
---|
7225 | | - |
---|
7226 | | - if (!bdev) |
---|
7227 | | - return; |
---|
7228 | | - |
---|
7229 | | - for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; |
---|
7230 | | - copy_num++) { |
---|
7231 | | - |
---|
7232 | | - if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) |
---|
7233 | | - continue; |
---|
7234 | | - |
---|
7235 | | - disk_super = (struct btrfs_super_block *)bh->b_data; |
---|
7236 | | - |
---|
7237 | | - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); |
---|
7238 | | - set_buffer_dirty(bh); |
---|
7239 | | - sync_dirty_buffer(bh); |
---|
7240 | | - brelse(bh); |
---|
7241 | | - } |
---|
7242 | | - |
---|
7243 | | - /* Notify udev that device has changed */ |
---|
7244 | | - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); |
---|
7245 | | - |
---|
7246 | | - /* Update ctime/mtime for device path for libblkid */ |
---|
7247 | | - update_dev_time(device_path); |
---|
7248 | | -} |
---|
7249 | | - |
---|
7250 | 7579 | /* |
---|
7251 | | - * Update the size of all devices, which is used for writing out the |
---|
7252 | | - * super blocks. |
---|
| 7580 | + * Update the size and bytes used for each device where it changed. This is |
---|
| 7581 | + * delayed since we would otherwise get errors while writing out the |
---|
| 7582 | + * superblocks. |
---|
| 7583 | + * |
---|
| 7584 | + * Must be invoked during transaction commit. |
---|
7253 | 7585 | */ |
---|
7254 | | -void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) |
---|
| 7586 | +void btrfs_commit_device_sizes(struct btrfs_transaction *trans) |
---|
7255 | 7587 | { |
---|
7256 | | - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
---|
7257 | 7588 | struct btrfs_device *curr, *next; |
---|
7258 | 7589 | |
---|
7259 | | - if (list_empty(&fs_devices->resized_devices)) |
---|
| 7590 | + ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); |
---|
| 7591 | + |
---|
| 7592 | + if (list_empty(&trans->dev_update_list)) |
---|
7260 | 7593 | return; |
---|
7261 | 7594 | |
---|
7262 | | - mutex_lock(&fs_devices->device_list_mutex); |
---|
7263 | | - mutex_lock(&fs_info->chunk_mutex); |
---|
7264 | | - list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, |
---|
7265 | | - resized_list) { |
---|
7266 | | - list_del_init(&curr->resized_list); |
---|
| 7595 | + /* |
---|
| 7596 | + * We don't need the device_list_mutex here. This list is owned by the |
---|
| 7597 | + * transaction and the transaction must complete before the device is |
---|
| 7598 | + * released. |
---|
| 7599 | + */ |
---|
| 7600 | + mutex_lock(&trans->fs_info->chunk_mutex); |
---|
| 7601 | + list_for_each_entry_safe(curr, next, &trans->dev_update_list, |
---|
| 7602 | + post_commit_list) { |
---|
| 7603 | + list_del_init(&curr->post_commit_list); |
---|
7267 | 7604 | curr->commit_total_bytes = curr->disk_total_bytes; |
---|
| 7605 | + curr->commit_bytes_used = curr->bytes_used; |
---|
7268 | 7606 | } |
---|
7269 | | - mutex_unlock(&fs_info->chunk_mutex); |
---|
7270 | | - mutex_unlock(&fs_devices->device_list_mutex); |
---|
7271 | | -} |
---|
7272 | | - |
---|
7273 | | -/* Must be invoked during the transaction commit */ |
---|
7274 | | -void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) |
---|
7275 | | -{ |
---|
7276 | | - struct btrfs_fs_info *fs_info = trans->fs_info; |
---|
7277 | | - struct extent_map *em; |
---|
7278 | | - struct map_lookup *map; |
---|
7279 | | - struct btrfs_device *dev; |
---|
7280 | | - int i; |
---|
7281 | | - |
---|
7282 | | - if (list_empty(&trans->pending_chunks)) |
---|
7283 | | - return; |
---|
7284 | | - |
---|
7285 | | - /* In order to kick the device replace finish process */ |
---|
7286 | | - mutex_lock(&fs_info->chunk_mutex); |
---|
7287 | | - list_for_each_entry(em, &trans->pending_chunks, list) { |
---|
7288 | | - map = em->map_lookup; |
---|
7289 | | - |
---|
7290 | | - for (i = 0; i < map->num_stripes; i++) { |
---|
7291 | | - dev = map->stripes[i].dev; |
---|
7292 | | - dev->commit_bytes_used = dev->bytes_used; |
---|
7293 | | - dev->has_pending_chunks = false; |
---|
7294 | | - } |
---|
7295 | | - } |
---|
7296 | | - mutex_unlock(&fs_info->chunk_mutex); |
---|
7297 | | -} |
---|
7298 | | - |
---|
7299 | | -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) |
---|
7300 | | -{ |
---|
7301 | | - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
---|
7302 | | - while (fs_devices) { |
---|
7303 | | - fs_devices->fs_info = fs_info; |
---|
7304 | | - fs_devices = fs_devices->seed; |
---|
7305 | | - } |
---|
7306 | | -} |
---|
7307 | | - |
---|
7308 | | -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) |
---|
7309 | | -{ |
---|
7310 | | - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; |
---|
7311 | | - while (fs_devices) { |
---|
7312 | | - fs_devices->fs_info = NULL; |
---|
7313 | | - fs_devices = fs_devices->seed; |
---|
7314 | | - } |
---|
| 7607 | + mutex_unlock(&trans->fs_info->chunk_mutex); |
---|
7315 | 7608 | } |
---|
7316 | 7609 | |
---|
7317 | 7610 | /* |
---|
.. | .. |
---|
7319 | 7612 | */ |
---|
7320 | 7613 | int btrfs_bg_type_to_factor(u64 flags) |
---|
7321 | 7614 | { |
---|
7322 | | - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | |
---|
7323 | | - BTRFS_BLOCK_GROUP_RAID10)) |
---|
7324 | | - return 2; |
---|
7325 | | - return 1; |
---|
| 7615 | + const int index = btrfs_bg_flags_to_raid_index(flags); |
---|
| 7616 | + |
---|
| 7617 | + return btrfs_raid_array[index].ncopies; |
---|
7326 | 7618 | } |
---|
7327 | 7619 | |
---|
7328 | 7620 | |
---|
7329 | | -static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) |
---|
7330 | | -{ |
---|
7331 | | - int index = btrfs_bg_flags_to_raid_index(type); |
---|
7332 | | - int ncopies = btrfs_raid_array[index].ncopies; |
---|
7333 | | - int data_stripes; |
---|
7334 | | - |
---|
7335 | | - switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
---|
7336 | | - case BTRFS_BLOCK_GROUP_RAID5: |
---|
7337 | | - data_stripes = num_stripes - 1; |
---|
7338 | | - break; |
---|
7339 | | - case BTRFS_BLOCK_GROUP_RAID6: |
---|
7340 | | - data_stripes = num_stripes - 2; |
---|
7341 | | - break; |
---|
7342 | | - default: |
---|
7343 | | - data_stripes = num_stripes / ncopies; |
---|
7344 | | - break; |
---|
7345 | | - } |
---|
7346 | | - return div_u64(chunk_len, data_stripes); |
---|
7347 | | -} |
---|
7348 | 7621 | |
---|
7349 | 7622 | static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, |
---|
7350 | 7623 | u64 chunk_offset, u64 devid, |
---|
7351 | 7624 | u64 physical_offset, u64 physical_len) |
---|
7352 | 7625 | { |
---|
7353 | | - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; |
---|
| 7626 | + struct extent_map_tree *em_tree = &fs_info->mapping_tree; |
---|
7354 | 7627 | struct extent_map *em; |
---|
7355 | 7628 | struct map_lookup *map; |
---|
7356 | 7629 | struct btrfs_device *dev; |
---|
.. | .. |
---|
7414 | 7687 | |
---|
7415 | 7688 | /* It's possible this device is a dummy for seed device */ |
---|
7416 | 7689 | if (dev->disk_total_bytes == 0) { |
---|
7417 | | - dev = btrfs_find_device(fs_info->fs_devices->seed, devid, |
---|
7418 | | - NULL, NULL, false); |
---|
| 7690 | + struct btrfs_fs_devices *devs; |
---|
| 7691 | + |
---|
| 7692 | + devs = list_first_entry(&fs_info->fs_devices->seed_list, |
---|
| 7693 | + struct btrfs_fs_devices, seed_list); |
---|
| 7694 | + dev = btrfs_find_device(devs, devid, NULL, NULL, false); |
---|
7419 | 7695 | if (!dev) { |
---|
7420 | 7696 | btrfs_err(fs_info, "failed to find seed devid %llu", |
---|
7421 | 7697 | devid); |
---|
.. | .. |
---|
7439 | 7715 | |
---|
7440 | 7716 | static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) |
---|
7441 | 7717 | { |
---|
7442 | | - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; |
---|
| 7718 | + struct extent_map_tree *em_tree = &fs_info->mapping_tree; |
---|
7443 | 7719 | struct extent_map *em; |
---|
7444 | 7720 | struct rb_node *node; |
---|
7445 | 7721 | int ret = 0; |
---|
7446 | 7722 | |
---|
7447 | 7723 | read_lock(&em_tree->lock); |
---|
7448 | | - for (node = rb_first(&em_tree->map); node; node = rb_next(node)) { |
---|
| 7724 | + for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { |
---|
7449 | 7725 | em = rb_entry(node, struct extent_map, rb_node); |
---|
7450 | 7726 | if (em->map_lookup->num_stripes != |
---|
7451 | 7727 | em->map_lookup->verified_stripes) { |
---|
.. | .. |
---|
7551 | 7827 | btrfs_free_path(path); |
---|
7552 | 7828 | return ret; |
---|
7553 | 7829 | } |
---|
| 7830 | + |
---|
| 7831 | +/* |
---|
| 7832 | + * Check whether the given block group or device is pinned by any inode being |
---|
| 7833 | + * used as a swapfile. |
---|
| 7834 | + */ |
---|
| 7835 | +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) |
---|
| 7836 | +{ |
---|
| 7837 | + struct btrfs_swapfile_pin *sp; |
---|
| 7838 | + struct rb_node *node; |
---|
| 7839 | + |
---|
| 7840 | + spin_lock(&fs_info->swapfile_pins_lock); |
---|
| 7841 | + node = fs_info->swapfile_pins.rb_node; |
---|
| 7842 | + while (node) { |
---|
| 7843 | + sp = rb_entry(node, struct btrfs_swapfile_pin, node); |
---|
| 7844 | + if (ptr < sp->ptr) |
---|
| 7845 | + node = node->rb_left; |
---|
| 7846 | + else if (ptr > sp->ptr) |
---|
| 7847 | + node = node->rb_right; |
---|
| 7848 | + else |
---|
| 7849 | + break; |
---|
| 7850 | + } |
---|
| 7851 | + spin_unlock(&fs_info->swapfile_pins_lock); |
---|
| 7852 | + return node != NULL; |
---|
| 7853 | +} |
---|