forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/fs/btrfs/volumes.c
....@@ -7,7 +7,6 @@
77 #include <linux/sched/mm.h>
88 #include <linux/bio.h>
99 #include <linux/slab.h>
10
-#include <linux/buffer_head.h>
1110 #include <linux/blkdev.h>
1211 #include <linux/ratelimit.h>
1312 #include <linux/kthread.h>
....@@ -15,6 +14,8 @@
1514 #include <linux/semaphore.h>
1615 #include <linux/uuid.h>
1716 #include <linux/list_sort.h>
17
+#include <linux/namei.h>
18
+#include "misc.h"
1819 #include "ctree.h"
1920 #include "extent_map.h"
2021 #include "disk-io.h"
....@@ -25,10 +26,12 @@
2526 #include "async-thread.h"
2627 #include "check-integrity.h"
2728 #include "rcu-string.h"
28
-#include "math.h"
2929 #include "dev-replace.h"
3030 #include "sysfs.h"
3131 #include "tree-checker.h"
32
+#include "space-info.h"
33
+#include "block-group.h"
34
+#include "discard.h"
3235
3336 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3437 [BTRFS_RAID_RAID10] = {
....@@ -39,6 +42,7 @@
3942 .tolerated_failures = 1,
4043 .devs_increment = 2,
4144 .ncopies = 2,
45
+ .nparity = 0,
4246 .raid_name = "raid10",
4347 .bg_flag = BTRFS_BLOCK_GROUP_RAID10,
4448 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
....@@ -51,9 +55,36 @@
5155 .tolerated_failures = 1,
5256 .devs_increment = 2,
5357 .ncopies = 2,
58
+ .nparity = 0,
5459 .raid_name = "raid1",
5560 .bg_flag = BTRFS_BLOCK_GROUP_RAID1,
5661 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
62
+ },
63
+ [BTRFS_RAID_RAID1C3] = {
64
+ .sub_stripes = 1,
65
+ .dev_stripes = 1,
66
+ .devs_max = 3,
67
+ .devs_min = 3,
68
+ .tolerated_failures = 2,
69
+ .devs_increment = 3,
70
+ .ncopies = 3,
71
+ .nparity = 0,
72
+ .raid_name = "raid1c3",
73
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
74
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
75
+ },
76
+ [BTRFS_RAID_RAID1C4] = {
77
+ .sub_stripes = 1,
78
+ .dev_stripes = 1,
79
+ .devs_max = 4,
80
+ .devs_min = 4,
81
+ .tolerated_failures = 3,
82
+ .devs_increment = 4,
83
+ .ncopies = 4,
84
+ .nparity = 0,
85
+ .raid_name = "raid1c4",
86
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
87
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
5788 },
5889 [BTRFS_RAID_DUP] = {
5990 .sub_stripes = 1,
....@@ -63,6 +94,7 @@
6394 .tolerated_failures = 0,
6495 .devs_increment = 1,
6596 .ncopies = 2,
97
+ .nparity = 0,
6698 .raid_name = "dup",
6799 .bg_flag = BTRFS_BLOCK_GROUP_DUP,
68100 .mindev_error = 0,
....@@ -75,6 +107,7 @@
75107 .tolerated_failures = 0,
76108 .devs_increment = 1,
77109 .ncopies = 1,
110
+ .nparity = 0,
78111 .raid_name = "raid0",
79112 .bg_flag = BTRFS_BLOCK_GROUP_RAID0,
80113 .mindev_error = 0,
....@@ -87,6 +120,7 @@
87120 .tolerated_failures = 0,
88121 .devs_increment = 1,
89122 .ncopies = 1,
123
+ .nparity = 0,
90124 .raid_name = "single",
91125 .bg_flag = 0,
92126 .mindev_error = 0,
....@@ -99,6 +133,7 @@
99133 .tolerated_failures = 1,
100134 .devs_increment = 1,
101135 .ncopies = 1,
136
+ .nparity = 1,
102137 .raid_name = "raid5",
103138 .bg_flag = BTRFS_BLOCK_GROUP_RAID5,
104139 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
....@@ -111,24 +146,79 @@
111146 .tolerated_failures = 2,
112147 .devs_increment = 1,
113148 .ncopies = 1,
149
+ .nparity = 2,
114150 .raid_name = "raid6",
115151 .bg_flag = BTRFS_BLOCK_GROUP_RAID6,
116152 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
117153 },
118154 };
119155
120
-const char *get_raid_name(enum btrfs_raid_types type)
156
+const char *btrfs_bg_type_to_raid_name(u64 flags)
121157 {
122
- if (type >= BTRFS_NR_RAID_TYPES)
158
+ const int index = btrfs_bg_flags_to_raid_index(flags);
159
+
160
+ if (index >= BTRFS_NR_RAID_TYPES)
123161 return NULL;
124162
125
- return btrfs_raid_array[type].raid_name;
163
+ return btrfs_raid_array[index].raid_name;
126164 }
127165
128
-static int init_first_rw_device(struct btrfs_trans_handle *trans,
129
- struct btrfs_fs_info *fs_info);
166
+/*
167
+ * Fill @buf with textual description of @bg_flags, no more than @size_buf
168
+ * bytes including terminating null byte.
169
+ */
170
+void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
171
+{
172
+ int i;
173
+ int ret;
174
+ char *bp = buf;
175
+ u64 flags = bg_flags;
176
+ u32 size_bp = size_buf;
177
+
178
+ if (!flags) {
179
+ strcpy(bp, "NONE");
180
+ return;
181
+ }
182
+
183
+#define DESCRIBE_FLAG(flag, desc) \
184
+ do { \
185
+ if (flags & (flag)) { \
186
+ ret = snprintf(bp, size_bp, "%s|", (desc)); \
187
+ if (ret < 0 || ret >= size_bp) \
188
+ goto out_overflow; \
189
+ size_bp -= ret; \
190
+ bp += ret; \
191
+ flags &= ~(flag); \
192
+ } \
193
+ } while (0)
194
+
195
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
196
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
197
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
198
+
199
+ DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
200
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
201
+ DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
202
+ btrfs_raid_array[i].raid_name);
203
+#undef DESCRIBE_FLAG
204
+
205
+ if (flags) {
206
+ ret = snprintf(bp, size_bp, "0x%llx|", flags);
207
+ size_bp -= ret;
208
+ }
209
+
210
+ if (size_bp < size_buf)
211
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
212
+
213
+ /*
214
+ * The text is trimmed, it's up to the caller to provide sufficiently
215
+ * large buffer
216
+ */
217
+out_overflow:;
218
+}
219
+
220
+static int init_first_rw_device(struct btrfs_trans_handle *trans);
130221 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
131
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
132222 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
133223 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
134224 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
....@@ -153,7 +243,7 @@
153243 * the mutex can be very coarse and can cover long-running operations
154244 *
155245 * protects: updates to fs_devices counters like missing devices, rw devices,
156
- * seeding, structure cloning, openning/closing devices at mount/umount time
246
+ * seeding, structure cloning, opening/closing devices at mount/umount time
157247 *
158248 * global::fs_devs - add, remove, updates to the global list
159249 *
....@@ -183,7 +273,9 @@
183273 * chunk_mutex
184274 * -----------
185275 * protects chunks, adding or removing during allocation, trim or when a new
186
- * device is added/removed
276
+ * device is added/removed. Additionally it also protects post_commit_list of
277
+ * individual devices, since they can be added to the transaction's
278
+ * post_commit_list only with chunk_mutex held.
187279 *
188280 * cleaner_mutex
189281 * -------------
....@@ -195,14 +287,13 @@
195287 * ============
196288 *
197289 * uuid_mutex
198
- * volume_mutex
199
- * device_list_mutex
200
- * chunk_mutex
201
- * balance_mutex
290
+ * device_list_mutex
291
+ * chunk_mutex
292
+ * balance_mutex
202293 *
203294 *
204
- * Exclusive operations, BTRFS_FS_EXCL_OP
205
- * ======================================
295
+ * Exclusive operations
296
+ * ====================
206297 *
207298 * Maintains the exclusivity of the following operations that apply to the
208299 * whole filesystem and cannot run in parallel.
....@@ -228,30 +319,32 @@
228319 * - system power-cycle and filesystem mounted as read-only
229320 * - filesystem or device errors leading to forced read-only
230321 *
231
- * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
232
- * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
322
+ * The status of exclusive operation is set and cleared atomically.
323
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
233324 * A device operation in Paused or Running state can be canceled or resumed
234325 * either by ioctl (Balance only) or when remounted as read-write.
235
- * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
326
+ * The exclusive status is cleared when the device operation is canceled or
236327 * completed.
237328 */
238329
239330 DEFINE_MUTEX(uuid_mutex);
240331 static LIST_HEAD(fs_uuids);
241
-struct list_head *btrfs_get_fs_uuids(void)
332
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
242333 {
243334 return &fs_uuids;
244335 }
245336
246337 /*
247338 * alloc_fs_devices - allocate struct btrfs_fs_devices
248
- * @fsid: if not NULL, copy the uuid to fs_devices::fsid
339
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid
340
+ * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
249341 *
250342 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
251343 * The returned struct is not linked onto any lists and can be destroyed with
252344 * kfree() right away.
253345 */
254
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
346
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
347
+ const u8 *metadata_fsid)
255348 {
256349 struct btrfs_fs_devices *fs_devs;
257350
....@@ -262,18 +355,25 @@
262355 mutex_init(&fs_devs->device_list_mutex);
263356
264357 INIT_LIST_HEAD(&fs_devs->devices);
265
- INIT_LIST_HEAD(&fs_devs->resized_devices);
266358 INIT_LIST_HEAD(&fs_devs->alloc_list);
267359 INIT_LIST_HEAD(&fs_devs->fs_list);
360
+ INIT_LIST_HEAD(&fs_devs->seed_list);
268361 if (fsid)
269362 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
363
+
364
+ if (metadata_fsid)
365
+ memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
366
+ else if (fsid)
367
+ memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
270368
271369 return fs_devs;
272370 }
273371
274372 void btrfs_free_device(struct btrfs_device *device)
275373 {
374
+ WARN_ON(!list_empty(&device->post_commit_list));
276375 rcu_string_free(device->name);
376
+ extent_io_tree_release(&device->alloc_state);
277377 bio_put(device->flush_bio);
278378 kfree(device);
279379 }
....@@ -289,19 +389,6 @@
289389 btrfs_free_device(device);
290390 }
291391 kfree(fs_devices);
292
-}
293
-
294
-static void btrfs_kobject_uevent(struct block_device *bdev,
295
- enum kobject_action action)
296
-{
297
- int ret;
298
-
299
- ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
300
- if (ret)
301
- pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
302
- action,
303
- kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
304
- &disk_to_dev(bdev->bd_disk)->kobj);
305392 }
306393
307394 void __exit btrfs_cleanup_fs_uuids(void)
....@@ -321,7 +408,7 @@
321408 * Returned struct is not linked onto any lists and must be destroyed using
322409 * btrfs_free_device.
323410 */
324
-static struct btrfs_device *__alloc_device(void)
411
+static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
325412 {
326413 struct btrfs_device *dev;
327414
....@@ -341,34 +428,86 @@
341428
342429 INIT_LIST_HEAD(&dev->dev_list);
343430 INIT_LIST_HEAD(&dev->dev_alloc_list);
344
- INIT_LIST_HEAD(&dev->resized_list);
345
-
346
- spin_lock_init(&dev->io_lock);
431
+ INIT_LIST_HEAD(&dev->post_commit_list);
347432
348433 atomic_set(&dev->reada_in_flight, 0);
349434 atomic_set(&dev->dev_stats_ccnt, 0);
350435 btrfs_device_data_ordered_init(dev);
351436 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
352437 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
438
+ extent_io_tree_init(fs_info, &dev->alloc_state,
439
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
353440
354441 return dev;
355442 }
356443
357
-static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
444
+static noinline struct btrfs_fs_devices *find_fsid(
445
+ const u8 *fsid, const u8 *metadata_fsid)
358446 {
359447 struct btrfs_fs_devices *fs_devices;
360448
449
+ ASSERT(fsid);
450
+
451
+ /* Handle non-split brain cases */
361452 list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
362
- if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
363
- return fs_devices;
453
+ if (metadata_fsid) {
454
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
455
+ && memcmp(metadata_fsid, fs_devices->metadata_uuid,
456
+ BTRFS_FSID_SIZE) == 0)
457
+ return fs_devices;
458
+ } else {
459
+ if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
460
+ return fs_devices;
461
+ }
364462 }
365463 return NULL;
366464 }
367465
466
+static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
467
+ struct btrfs_super_block *disk_super)
468
+{
469
+
470
+ struct btrfs_fs_devices *fs_devices;
471
+
472
+ /*
473
+ * Handle scanned device having completed its fsid change but
474
+ * belonging to a fs_devices that was created by first scanning
475
+ * a device which didn't have its fsid/metadata_uuid changed
476
+ * at all and the CHANGING_FSID_V2 flag set.
477
+ */
478
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
479
+ if (fs_devices->fsid_change &&
480
+ memcmp(disk_super->metadata_uuid, fs_devices->fsid,
481
+ BTRFS_FSID_SIZE) == 0 &&
482
+ memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
483
+ BTRFS_FSID_SIZE) == 0) {
484
+ return fs_devices;
485
+ }
486
+ }
487
+ /*
488
+ * Handle scanned device having completed its fsid change but
489
+ * belonging to a fs_devices that was created by a device that
490
+ * has an outdated pair of fsid/metadata_uuid and
491
+ * CHANGING_FSID_V2 flag set.
492
+ */
493
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
494
+ if (fs_devices->fsid_change &&
495
+ memcmp(fs_devices->metadata_uuid,
496
+ fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
497
+ memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
498
+ BTRFS_FSID_SIZE) == 0) {
499
+ return fs_devices;
500
+ }
501
+ }
502
+
503
+ return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
504
+}
505
+
506
+
368507 static int
369508 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
370509 int flush, struct block_device **bdev,
371
- struct buffer_head **bh)
510
+ struct btrfs_super_block **disk_super)
372511 {
373512 int ret;
374513
....@@ -387,9 +526,9 @@
387526 goto error;
388527 }
389528 invalidate_bdev(*bdev);
390
- *bh = btrfs_read_dev_super(*bdev);
391
- if (IS_ERR(*bh)) {
392
- ret = PTR_ERR(*bh);
529
+ *disk_super = btrfs_read_dev_super(*bdev);
530
+ if (IS_ERR(*disk_super)) {
531
+ ret = PTR_ERR(*disk_super);
393532 blkdev_put(*bdev, flags);
394533 goto error;
395534 }
....@@ -398,214 +537,50 @@
398537
399538 error:
400539 *bdev = NULL;
401
- *bh = NULL;
402540 return ret;
403541 }
404542
405
-static void requeue_list(struct btrfs_pending_bios *pending_bios,
406
- struct bio *head, struct bio *tail)
407
-{
408
-
409
- struct bio *old_head;
410
-
411
- old_head = pending_bios->head;
412
- pending_bios->head = head;
413
- if (pending_bios->tail)
414
- tail->bi_next = old_head;
415
- else
416
- pending_bios->tail = tail;
417
-}
418
-
419543 /*
420
- * we try to collect pending bios for a device so we don't get a large
421
- * number of procs sending bios down to the same device. This greatly
422
- * improves the schedulers ability to collect and merge the bios.
544
+ * Check if the device in the path matches the device in the given struct device.
423545 *
424
- * But, it also turns into a long list of bios to process and that is sure
425
- * to eventually make the worker thread block. The solution here is to
426
- * make some progress and then put this work struct back at the end of
427
- * the list if the block device is congested. This way, multiple devices
428
- * can make progress from a single worker thread.
546
+ * Returns:
547
+ * true If it is the same device.
548
+ * false If it is not the same device or on error.
429549 */
430
-static noinline void run_scheduled_bios(struct btrfs_device *device)
550
+static bool device_matched(const struct btrfs_device *device, const char *path)
431551 {
432
- struct btrfs_fs_info *fs_info = device->fs_info;
433
- struct bio *pending;
434
- struct backing_dev_info *bdi;
435
- struct btrfs_pending_bios *pending_bios;
436
- struct bio *tail;
437
- struct bio *cur;
438
- int again = 0;
439
- unsigned long num_run;
440
- unsigned long batch_run = 0;
441
- unsigned long last_waited = 0;
442
- int force_reg = 0;
443
- int sync_pending = 0;
444
- struct blk_plug plug;
552
+ char *device_name;
553
+ struct block_device *bdev_old;
554
+ struct block_device *bdev_new;
445555
446556 /*
447
- * this function runs all the bios we've collected for
448
- * a particular device. We don't want to wander off to
449
- * another device without first sending all of these down.
450
- * So, setup a plug here and finish it off before we return
557
+ * If we are looking for a device with the matching dev_t, then skip
558
+ * device without a name (a missing device).
451559 */
452
- blk_start_plug(&plug);
560
+ if (!device->name)
561
+ return false;
453562
454
- bdi = device->bdev->bd_bdi;
563
+ device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
564
+ if (!device_name)
565
+ return false;
455566
456
-loop:
457
- spin_lock(&device->io_lock);
567
+ rcu_read_lock();
568
+ scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
569
+ rcu_read_unlock();
458570
459
-loop_lock:
460
- num_run = 0;
571
+ bdev_old = lookup_bdev(device_name);
572
+ kfree(device_name);
573
+ if (IS_ERR(bdev_old))
574
+ return false;
461575
462
- /* take all the bios off the list at once and process them
463
- * later on (without the lock held). But, remember the
464
- * tail and other pointers so the bios can be properly reinserted
465
- * into the list if we hit congestion
466
- */
467
- if (!force_reg && device->pending_sync_bios.head) {
468
- pending_bios = &device->pending_sync_bios;
469
- force_reg = 1;
470
- } else {
471
- pending_bios = &device->pending_bios;
472
- force_reg = 0;
473
- }
576
+ bdev_new = lookup_bdev(path);
577
+ if (IS_ERR(bdev_new))
578
+ return false;
474579
475
- pending = pending_bios->head;
476
- tail = pending_bios->tail;
477
- WARN_ON(pending && !tail);
580
+ if (bdev_old == bdev_new)
581
+ return true;
478582
479
- /*
480
- * if pending was null this time around, no bios need processing
481
- * at all and we can stop. Otherwise it'll loop back up again
482
- * and do an additional check so no bios are missed.
483
- *
484
- * device->running_pending is used to synchronize with the
485
- * schedule_bio code.
486
- */
487
- if (device->pending_sync_bios.head == NULL &&
488
- device->pending_bios.head == NULL) {
489
- again = 0;
490
- device->running_pending = 0;
491
- } else {
492
- again = 1;
493
- device->running_pending = 1;
494
- }
495
-
496
- pending_bios->head = NULL;
497
- pending_bios->tail = NULL;
498
-
499
- spin_unlock(&device->io_lock);
500
-
501
- while (pending) {
502
-
503
- rmb();
504
- /* we want to work on both lists, but do more bios on the
505
- * sync list than the regular list
506
- */
507
- if ((num_run > 32 &&
508
- pending_bios != &device->pending_sync_bios &&
509
- device->pending_sync_bios.head) ||
510
- (num_run > 64 && pending_bios == &device->pending_sync_bios &&
511
- device->pending_bios.head)) {
512
- spin_lock(&device->io_lock);
513
- requeue_list(pending_bios, pending, tail);
514
- goto loop_lock;
515
- }
516
-
517
- cur = pending;
518
- pending = pending->bi_next;
519
- cur->bi_next = NULL;
520
-
521
- BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
522
-
523
- /*
524
- * if we're doing the sync list, record that our
525
- * plug has some sync requests on it
526
- *
527
- * If we're doing the regular list and there are
528
- * sync requests sitting around, unplug before
529
- * we add more
530
- */
531
- if (pending_bios == &device->pending_sync_bios) {
532
- sync_pending = 1;
533
- } else if (sync_pending) {
534
- blk_finish_plug(&plug);
535
- blk_start_plug(&plug);
536
- sync_pending = 0;
537
- }
538
-
539
- btrfsic_submit_bio(cur);
540
- num_run++;
541
- batch_run++;
542
-
543
- cond_resched();
544
-
545
- /*
546
- * we made progress, there is more work to do and the bdi
547
- * is now congested. Back off and let other work structs
548
- * run instead
549
- */
550
- if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
551
- fs_info->fs_devices->open_devices > 1) {
552
- struct io_context *ioc;
553
-
554
- ioc = current->io_context;
555
-
556
- /*
557
- * the main goal here is that we don't want to
558
- * block if we're going to be able to submit
559
- * more requests without blocking.
560
- *
561
- * This code does two great things, it pokes into
562
- * the elevator code from a filesystem _and_
563
- * it makes assumptions about how batching works.
564
- */
565
- if (ioc && ioc->nr_batch_requests > 0 &&
566
- time_before(jiffies, ioc->last_waited + HZ/50UL) &&
567
- (last_waited == 0 ||
568
- ioc->last_waited == last_waited)) {
569
- /*
570
- * we want to go through our batch of
571
- * requests and stop. So, we copy out
572
- * the ioc->last_waited time and test
573
- * against it before looping
574
- */
575
- last_waited = ioc->last_waited;
576
- cond_resched();
577
- continue;
578
- }
579
- spin_lock(&device->io_lock);
580
- requeue_list(pending_bios, pending, tail);
581
- device->running_pending = 1;
582
-
583
- spin_unlock(&device->io_lock);
584
- btrfs_queue_work(fs_info->submit_workers,
585
- &device->work);
586
- goto done;
587
- }
588
- }
589
-
590
- cond_resched();
591
- if (again)
592
- goto loop;
593
-
594
- spin_lock(&device->io_lock);
595
- if (device->pending_bios.head || device->pending_sync_bios.head)
596
- goto loop_lock;
597
- spin_unlock(&device->io_lock);
598
-
599
-done:
600
- blk_finish_plug(&plug);
601
-}
602
-
603
-static void pending_bios_fn(struct btrfs_work *work)
604
-{
605
- struct btrfs_device *device;
606
-
607
- device = container_of(work, struct btrfs_device, work);
608
- run_scheduled_bios(device);
583
+ return false;
609584 }
610585
611586 /*
....@@ -615,52 +590,55 @@
615590 * matching this path only.
616591 * skip_dev: Optional. Will skip this device when searching for the stale
617592 * devices.
593
+ * Return: 0 for success or if @path is NULL.
594
+ * -EBUSY if @path is a mounted device.
595
+ * -ENOENT if @path does not match any device in the list.
618596 */
619
-static void btrfs_free_stale_devices(const char *path,
597
+static int btrfs_free_stale_devices(const char *path,
620598 struct btrfs_device *skip_device)
621599 {
622600 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
623601 struct btrfs_device *device, *tmp_device;
602
+ int ret = 0;
603
+
604
+ lockdep_assert_held(&uuid_mutex);
605
+
606
+ if (path)
607
+ ret = -ENOENT;
624608
625609 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
626
- mutex_lock(&fs_devices->device_list_mutex);
627
- if (fs_devices->opened) {
628
- mutex_unlock(&fs_devices->device_list_mutex);
629
- continue;
630
- }
631610
611
+ mutex_lock(&fs_devices->device_list_mutex);
632612 list_for_each_entry_safe(device, tmp_device,
633613 &fs_devices->devices, dev_list) {
634
- int not_found = 0;
635
-
636614 if (skip_device && skip_device == device)
637615 continue;
638
- if (path && !device->name)
616
+ if (path && !device_matched(device, path))
639617 continue;
640
-
641
- rcu_read_lock();
642
- if (path)
643
- not_found = strcmp(rcu_str_deref(device->name),
644
- path);
645
- rcu_read_unlock();
646
- if (not_found)
647
- continue;
618
+ if (fs_devices->opened) {
619
+ /* for an already deleted device return 0 */
620
+ if (path && ret != 0)
621
+ ret = -EBUSY;
622
+ break;
623
+ }
648624
649625 /* delete the stale device */
650626 fs_devices->num_devices--;
651627 list_del(&device->dev_list);
652628 btrfs_free_device(device);
653629
654
- if (fs_devices->num_devices == 0)
655
- break;
630
+ ret = 0;
656631 }
657632 mutex_unlock(&fs_devices->device_list_mutex);
633
+
658634 if (fs_devices->num_devices == 0) {
659635 btrfs_sysfs_remove_fsid(fs_devices);
660636 list_del(&fs_devices->fs_list);
661637 free_fs_devices(fs_devices);
662638 }
663639 }
640
+
641
+ return ret;
664642 }
665643
666644 /*
....@@ -674,7 +652,6 @@
674652 {
675653 struct request_queue *q;
676654 struct block_device *bdev;
677
- struct buffer_head *bh;
678655 struct btrfs_super_block *disk_super;
679656 u64 devid;
680657 int ret;
....@@ -685,23 +662,29 @@
685662 return -EINVAL;
686663
687664 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
688
- &bdev, &bh);
665
+ &bdev, &disk_super);
689666 if (ret)
690667 return ret;
691668
692
- disk_super = (struct btrfs_super_block *)bh->b_data;
693669 devid = btrfs_stack_device_id(&disk_super->dev_item);
694670 if (devid != device->devid)
695
- goto error_brelse;
671
+ goto error_free_page;
696672
697673 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
698
- goto error_brelse;
674
+ goto error_free_page;
699675
700676 device->generation = btrfs_super_generation(disk_super);
701677
702678 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
679
+ if (btrfs_super_incompat_flags(disk_super) &
680
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
681
+ pr_err(
682
+ "BTRFS: Invalid seeding and uuid-changed device detected\n");
683
+ goto error_free_page;
684
+ }
685
+
703686 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
704
- fs_devices->seeding = 1;
687
+ fs_devices->seeding = true;
705688 } else {
706689 if (bdev_read_only(bdev))
707690 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
....@@ -711,7 +694,7 @@
711694
712695 q = bdev_get_queue(bdev);
713696 if (!blk_queue_nonrot(q))
714
- fs_devices->rotating = 1;
697
+ fs_devices->rotating = true;
715698
716699 device->bdev = bdev;
717700 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
....@@ -723,17 +706,101 @@
723706 fs_devices->rw_devices++;
724707 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
725708 }
726
- brelse(bh);
709
+ btrfs_release_disk_super(disk_super);
727710
728711 return 0;
729712
730
-error_brelse:
731
- brelse(bh);
713
+error_free_page:
714
+ btrfs_release_disk_super(disk_super);
732715 blkdev_put(bdev, flags);
733716
734717 return -EINVAL;
735718 }
736719
720
+/*
721
+ * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
722
+ * being created with a disk that has already completed its fsid change. Such
723
+ * disk can belong to an fs which has its FSID changed or to one which doesn't.
724
+ * Handle both cases here.
725
+ */
726
+static struct btrfs_fs_devices *find_fsid_inprogress(
727
+ struct btrfs_super_block *disk_super)
728
+{
729
+ struct btrfs_fs_devices *fs_devices;
730
+
731
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
732
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
733
+ BTRFS_FSID_SIZE) != 0 &&
734
+ memcmp(fs_devices->metadata_uuid, disk_super->fsid,
735
+ BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
736
+ return fs_devices;
737
+ }
738
+ }
739
+
740
+ return find_fsid(disk_super->fsid, NULL);
741
+}
742
+
743
+
744
+static struct btrfs_fs_devices *find_fsid_changed(
745
+ struct btrfs_super_block *disk_super)
746
+{
747
+ struct btrfs_fs_devices *fs_devices;
748
+
749
+ /*
750
+ * Handles the case where scanned device is part of an fs that had
751
+ * multiple successful changes of FSID but curently device didn't
752
+ * observe it. Meaning our fsid will be different than theirs. We need
753
+ * to handle two subcases :
754
+ * 1 - The fs still continues to have different METADATA/FSID uuids.
755
+ * 2 - The fs is switched back to its original FSID (METADATA/FSID
756
+ * are equal).
757
+ */
758
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
759
+ /* Changed UUIDs */
760
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
761
+ BTRFS_FSID_SIZE) != 0 &&
762
+ memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
763
+ BTRFS_FSID_SIZE) == 0 &&
764
+ memcmp(fs_devices->fsid, disk_super->fsid,
765
+ BTRFS_FSID_SIZE) != 0)
766
+ return fs_devices;
767
+
768
+ /* Unchanged UUIDs */
769
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
770
+ BTRFS_FSID_SIZE) == 0 &&
771
+ memcmp(fs_devices->fsid, disk_super->metadata_uuid,
772
+ BTRFS_FSID_SIZE) == 0)
773
+ return fs_devices;
774
+ }
775
+
776
+ return NULL;
777
+}
778
+
779
+static struct btrfs_fs_devices *find_fsid_reverted_metadata(
780
+ struct btrfs_super_block *disk_super)
781
+{
782
+ struct btrfs_fs_devices *fs_devices;
783
+
784
+ /*
785
+ * Handle the case where the scanned device is part of an fs whose last
786
+ * metadata UUID change reverted it to the original FSID. At the same
787
+ * time * fs_devices was first created by another constitutent device
788
+ * which didn't fully observe the operation. This results in an
789
+ * btrfs_fs_devices created with metadata/fsid different AND
790
+ * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
791
+ * fs_devices equal to the FSID of the disk.
792
+ */
793
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
794
+ if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
795
+ BTRFS_FSID_SIZE) != 0 &&
796
+ memcmp(fs_devices->metadata_uuid, disk_super->fsid,
797
+ BTRFS_FSID_SIZE) == 0 &&
798
+ fs_devices->fsid_change)
799
+ return fs_devices;
800
+ }
801
+
802
+ return NULL;
803
+}
737804 /*
738805 * Add new device to list of registered devices
739806 *
....@@ -746,16 +813,40 @@
746813 bool *new_device_added)
747814 {
748815 struct btrfs_device *device;
749
- struct btrfs_fs_devices *fs_devices;
816
+ struct btrfs_fs_devices *fs_devices = NULL;
750817 struct rcu_string *name;
751818 u64 found_transid = btrfs_super_generation(disk_super);
752819 u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
820
+ bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
821
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
822
+ bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
823
+ BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
753824
754
- fs_devices = find_fsid(disk_super->fsid);
825
+ if (fsid_change_in_progress) {
826
+ if (!has_metadata_uuid)
827
+ fs_devices = find_fsid_inprogress(disk_super);
828
+ else
829
+ fs_devices = find_fsid_changed(disk_super);
830
+ } else if (has_metadata_uuid) {
831
+ fs_devices = find_fsid_with_metadata_uuid(disk_super);
832
+ } else {
833
+ fs_devices = find_fsid_reverted_metadata(disk_super);
834
+ if (!fs_devices)
835
+ fs_devices = find_fsid(disk_super->fsid, NULL);
836
+ }
837
+
838
+
755839 if (!fs_devices) {
756
- fs_devices = alloc_fs_devices(disk_super->fsid);
840
+ if (has_metadata_uuid)
841
+ fs_devices = alloc_fs_devices(disk_super->fsid,
842
+ disk_super->metadata_uuid);
843
+ else
844
+ fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
845
+
757846 if (IS_ERR(fs_devices))
758847 return ERR_CAST(fs_devices);
848
+
849
+ fs_devices->fsid_change = fsid_change_in_progress;
759850
760851 mutex_lock(&fs_devices->device_list_mutex);
761852 list_add(&fs_devices->fs_list, &fs_uuids);
....@@ -765,6 +856,27 @@
765856 mutex_lock(&fs_devices->device_list_mutex);
766857 device = btrfs_find_device(fs_devices, devid,
767858 disk_super->dev_item.uuid, NULL, false);
859
+
860
+ /*
861
+ * If this disk has been pulled into an fs devices created by
862
+ * a device which had the CHANGING_FSID_V2 flag then replace the
863
+ * metadata_uuid/fsid values of the fs_devices.
864
+ */
865
+ if (fs_devices->fsid_change &&
866
+ found_transid > fs_devices->latest_generation) {
867
+ memcpy(fs_devices->fsid, disk_super->fsid,
868
+ BTRFS_FSID_SIZE);
869
+
870
+ if (has_metadata_uuid)
871
+ memcpy(fs_devices->metadata_uuid,
872
+ disk_super->metadata_uuid,
873
+ BTRFS_FSID_SIZE);
874
+ else
875
+ memcpy(fs_devices->metadata_uuid,
876
+ disk_super->fsid, BTRFS_FSID_SIZE);
877
+
878
+ fs_devices->fsid_change = false;
879
+ }
768880 }
769881
770882 if (!device) {
....@@ -796,11 +908,15 @@
796908 *new_device_added = true;
797909
798910 if (disk_super->label[0])
799
- pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
800
- disk_super->label, devid, found_transid, path);
911
+ pr_info(
912
+ "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
913
+ disk_super->label, devid, found_transid, path,
914
+ current->comm, task_pid_nr(current));
801915 else
802
- pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
803
- disk_super->fsid, devid, found_transid, path);
916
+ pr_info(
917
+ "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
918
+ disk_super->fsid, devid, found_transid, path,
919
+ current->comm, task_pid_nr(current));
804920
805921 } else if (!device->name || strcmp(device->name->str, path)) {
806922 /*
....@@ -897,8 +1013,11 @@
8971013 * it back. We need it to pick the disk with largest generation
8981014 * (as above).
8991015 */
900
- if (!fs_devices->opened)
1016
+ if (!fs_devices->opened) {
9011017 device->generation = found_transid;
1018
+ fs_devices->latest_generation = max_t(u64, found_transid,
1019
+ fs_devices->latest_generation);
1020
+ }
9021021
9031022 fs_devices->total_devices = btrfs_super_num_devices(disk_super);
9041023
....@@ -911,22 +1030,25 @@
9111030 struct btrfs_fs_devices *fs_devices;
9121031 struct btrfs_device *device;
9131032 struct btrfs_device *orig_dev;
1033
+ int ret = 0;
9141034
915
- fs_devices = alloc_fs_devices(orig->fsid);
1035
+ lockdep_assert_held(&uuid_mutex);
1036
+
1037
+ fs_devices = alloc_fs_devices(orig->fsid, NULL);
9161038 if (IS_ERR(fs_devices))
9171039 return fs_devices;
9181040
919
- mutex_lock(&orig->device_list_mutex);
9201041 fs_devices->total_devices = orig->total_devices;
9211042
922
- /* We have held the volume lock, it is safe to get the devices. */
9231043 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
9241044 struct rcu_string *name;
9251045
9261046 device = btrfs_alloc_device(NULL, &orig_dev->devid,
9271047 orig_dev->uuid);
928
- if (IS_ERR(device))
1048
+ if (IS_ERR(device)) {
1049
+ ret = PTR_ERR(device);
9291050 goto error;
1051
+ }
9301052
9311053 /*
9321054 * This is ok to do without rcu read locked because we hold the
....@@ -937,6 +1059,7 @@
9371059 GFP_KERNEL);
9381060 if (!name) {
9391061 btrfs_free_device(device);
1062
+ ret = -ENOMEM;
9401063 goto error;
9411064 }
9421065 rcu_assign_pointer(device->name, name);
....@@ -946,36 +1069,27 @@
9461069 device->fs_devices = fs_devices;
9471070 fs_devices->num_devices++;
9481071 }
949
- mutex_unlock(&orig->device_list_mutex);
9501072 return fs_devices;
9511073 error:
952
- mutex_unlock(&orig->device_list_mutex);
9531074 free_fs_devices(fs_devices);
954
- return ERR_PTR(-ENOMEM);
1075
+ return ERR_PTR(ret);
9551076 }
9561077
957
-/*
958
- * After we have read the system tree and know devids belonging to
959
- * this filesystem, remove the device which does not belong there.
960
- */
961
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
1078
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1079
+ int step, struct btrfs_device **latest_dev)
9621080 {
9631081 struct btrfs_device *device, *next;
964
- struct btrfs_device *latest_dev = NULL;
9651082
966
- mutex_lock(&uuid_mutex);
967
-again:
9681083 /* This is the initialized path, it is safe to release the devices. */
9691084 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
970
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
971
- &device->dev_state)) {
1085
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
9721086 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
973
- &device->dev_state) &&
1087
+ &device->dev_state) &&
9741088 !test_bit(BTRFS_DEV_STATE_MISSING,
9751089 &device->dev_state) &&
976
- (!latest_dev ||
977
- device->generation > latest_dev->generation)) {
978
- latest_dev = device;
1090
+ (!*latest_dev ||
1091
+ device->generation > (*latest_dev)->generation)) {
1092
+ *latest_dev = device;
9791093 }
9801094 continue;
9811095 }
....@@ -1002,22 +1116,26 @@
10021116 btrfs_free_device(device);
10031117 }
10041118
1005
- if (fs_devices->seed) {
1006
- fs_devices = fs_devices->seed;
1007
- goto again;
1008
- }
1119
+}
1120
+
1121
+/*
1122
+ * After we have read the system tree and know devids belonging to this
1123
+ * filesystem, remove the device which does not belong there.
1124
+ */
1125
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
1126
+{
1127
+ struct btrfs_device *latest_dev = NULL;
1128
+ struct btrfs_fs_devices *seed_dev;
1129
+
1130
+ mutex_lock(&uuid_mutex);
1131
+ __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
1132
+
1133
+ list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1134
+ __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
10091135
10101136 fs_devices->latest_bdev = latest_dev->bdev;
10111137
10121138 mutex_unlock(&uuid_mutex);
1013
-}
1014
-
1015
-static void free_device_rcu(struct rcu_head *head)
1016
-{
1017
- struct btrfs_device *device;
1018
-
1019
- device = container_of(head, struct btrfs_device, rcu);
1020
- btrfs_free_device(device);
10211139 }
10221140
10231141 static void btrfs_close_bdev(struct btrfs_device *device)
....@@ -1036,11 +1154,6 @@
10361154 static void btrfs_close_one_device(struct btrfs_device *device)
10371155 {
10381156 struct btrfs_fs_devices *fs_devices = device->fs_devices;
1039
- struct btrfs_device *new_device;
1040
- struct rcu_string *name;
1041
-
1042
- if (device->bdev)
1043
- fs_devices->open_devices--;
10441157
10451158 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
10461159 device->devid != BTRFS_DEV_REPLACE_DEVID) {
....@@ -1057,65 +1170,72 @@
10571170 }
10581171
10591172 btrfs_close_bdev(device);
1060
-
1061
- new_device = btrfs_alloc_device(NULL, &device->devid,
1062
- device->uuid);
1063
- BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
1064
-
1065
- /* Safe because we are under uuid_mutex */
1066
- if (device->name) {
1067
- name = rcu_string_strdup(device->name->str, GFP_NOFS);
1068
- BUG_ON(!name); /* -ENOMEM */
1069
- rcu_assign_pointer(new_device->name, name);
1173
+ if (device->bdev) {
1174
+ fs_devices->open_devices--;
1175
+ device->bdev = NULL;
10701176 }
1177
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
10711178
1072
- list_replace_rcu(&device->dev_list, &new_device->dev_list);
1073
- new_device->fs_devices = device->fs_devices;
1179
+ device->fs_info = NULL;
1180
+ atomic_set(&device->dev_stats_ccnt, 0);
1181
+ extent_io_tree_release(&device->alloc_state);
10741182
1075
- call_rcu(&device->rcu, free_device_rcu);
1183
+ /*
1184
+ * Reset the flush error record. We might have a transient flush error
1185
+ * in this mount, and if so we aborted the current transaction and set
1186
+ * the fs to an error state, guaranteeing no super blocks can be further
1187
+ * committed. However that error might be transient and if we unmount the
1188
+ * filesystem and mount it again, we should allow the mount to succeed
1189
+ * (btrfs_check_rw_degradable() should not fail) - if after mounting the
1190
+ * filesystem again we still get flush errors, then we will again abort
1191
+ * any transaction and set the error state, guaranteeing no commits of
1192
+ * unsafe super blocks.
1193
+ */
1194
+ device->last_flush_error = 0;
1195
+
1196
+ /* Verify the device is back in a pristine state */
1197
+ ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1198
+ ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1199
+ ASSERT(list_empty(&device->dev_alloc_list));
1200
+ ASSERT(list_empty(&device->post_commit_list));
1201
+ ASSERT(atomic_read(&device->reada_in_flight) == 0);
10761202 }
10771203
1078
-static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
1204
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
10791205 {
10801206 struct btrfs_device *device, *tmp;
10811207
1082
- if (--fs_devices->opened > 0)
1083
- return 0;
1208
+ lockdep_assert_held(&uuid_mutex);
10841209
1085
- mutex_lock(&fs_devices->device_list_mutex);
1086
- list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
1210
+ if (--fs_devices->opened > 0)
1211
+ return;
1212
+
1213
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
10871214 btrfs_close_one_device(device);
1088
- }
1089
- mutex_unlock(&fs_devices->device_list_mutex);
10901215
10911216 WARN_ON(fs_devices->open_devices);
10921217 WARN_ON(fs_devices->rw_devices);
10931218 fs_devices->opened = 0;
1094
- fs_devices->seeding = 0;
1095
-
1096
- return 0;
1219
+ fs_devices->seeding = false;
1220
+ fs_devices->fs_info = NULL;
10971221 }
10981222
1099
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1223
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
11001224 {
1101
- struct btrfs_fs_devices *seed_devices = NULL;
1102
- int ret;
1225
+ LIST_HEAD(list);
1226
+ struct btrfs_fs_devices *tmp;
11031227
11041228 mutex_lock(&uuid_mutex);
1105
- ret = close_fs_devices(fs_devices);
1106
- if (!fs_devices->opened) {
1107
- seed_devices = fs_devices->seed;
1108
- fs_devices->seed = NULL;
1109
- }
1110
- mutex_unlock(&uuid_mutex);
1229
+ close_fs_devices(fs_devices);
1230
+ if (!fs_devices->opened)
1231
+ list_splice_init(&fs_devices->seed_list, &list);
11111232
1112
- while (seed_devices) {
1113
- fs_devices = seed_devices;
1114
- seed_devices = fs_devices->seed;
1233
+ list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
11151234 close_fs_devices(fs_devices);
1235
+ list_del(&fs_devices->seed_list);
11161236 free_fs_devices(fs_devices);
11171237 }
1118
- return ret;
1238
+ mutex_unlock(&uuid_mutex);
11191239 }
11201240
11211241 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
....@@ -1123,28 +1243,33 @@
11231243 {
11241244 struct btrfs_device *device;
11251245 struct btrfs_device *latest_dev = NULL;
1126
- int ret = 0;
1246
+ struct btrfs_device *tmp_device;
11271247
11281248 flags |= FMODE_EXCL;
11291249
1130
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
1131
- /* Just open everything we can; ignore failures here */
1132
- if (btrfs_open_one_device(fs_devices, device, flags, holder))
1133
- continue;
1250
+ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1251
+ dev_list) {
1252
+ int ret;
11341253
1135
- if (!latest_dev ||
1136
- device->generation > latest_dev->generation)
1254
+ ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1255
+ if (ret == 0 &&
1256
+ (!latest_dev || device->generation > latest_dev->generation)) {
11371257 latest_dev = device;
1258
+ } else if (ret == -ENODATA) {
1259
+ fs_devices->num_devices--;
1260
+ list_del(&device->dev_list);
1261
+ btrfs_free_device(device);
1262
+ }
11381263 }
1139
- if (fs_devices->open_devices == 0) {
1140
- ret = -EINVAL;
1141
- goto out;
1142
- }
1264
+ if (fs_devices->open_devices == 0)
1265
+ return -EINVAL;
1266
+
11431267 fs_devices->opened = 1;
11441268 fs_devices->latest_bdev = latest_dev->bdev;
11451269 fs_devices->total_rw_bytes = 0;
1146
-out:
1147
- return ret;
1270
+ fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1271
+
1272
+ return 0;
11481273 }
11491274
11501275 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
....@@ -1186,55 +1311,66 @@
11861311 return ret;
11871312 }
11881313
1189
-static void btrfs_release_disk_super(struct page *page)
1314
+void btrfs_release_disk_super(struct btrfs_super_block *super)
11901315 {
1191
- kunmap(page);
1316
+ struct page *page = virt_to_page(super);
1317
+
11921318 put_page(page);
11931319 }
11941320
1195
-static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
1196
- struct page **page,
1197
- struct btrfs_super_block **disk_super)
1321
+static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1322
+ u64 bytenr)
11981323 {
1324
+ struct btrfs_super_block *disk_super;
1325
+ struct page *page;
11991326 void *p;
12001327 pgoff_t index;
12011328
12021329 /* make sure our super fits in the device */
12031330 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1204
- return 1;
1331
+ return ERR_PTR(-EINVAL);
12051332
12061333 /* make sure our super fits in the page */
1207
- if (sizeof(**disk_super) > PAGE_SIZE)
1208
- return 1;
1334
+ if (sizeof(*disk_super) > PAGE_SIZE)
1335
+ return ERR_PTR(-EINVAL);
12091336
12101337 /* make sure our super doesn't straddle pages on disk */
12111338 index = bytenr >> PAGE_SHIFT;
1212
- if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
1213
- return 1;
1339
+ if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1340
+ return ERR_PTR(-EINVAL);
12141341
12151342 /* pull in the page with our super */
1216
- *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
1217
- index, GFP_KERNEL);
1343
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
12181344
1219
- if (IS_ERR_OR_NULL(*page))
1220
- return 1;
1345
+ if (IS_ERR(page))
1346
+ return ERR_CAST(page);
12211347
1222
- p = kmap(*page);
1348
+ p = page_address(page);
12231349
12241350 /* align our pointer to the offset of the super block */
1225
- *disk_super = p + (bytenr & ~PAGE_MASK);
1351
+ disk_super = p + offset_in_page(bytenr);
12261352
1227
- if (btrfs_super_bytenr(*disk_super) != bytenr ||
1228
- btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
1229
- btrfs_release_disk_super(*page);
1230
- return 1;
1353
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
1354
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1355
+ btrfs_release_disk_super(p);
1356
+ return ERR_PTR(-EINVAL);
12311357 }
12321358
1233
- if ((*disk_super)->label[0] &&
1234
- (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
1235
- (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
1359
+ if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1360
+ disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
12361361
1237
- return 0;
1362
+ return disk_super;
1363
+}
1364
+
1365
+int btrfs_forget_devices(const char *path)
1366
+{
1367
+ int ret;
1368
+
1369
+ mutex_lock(&uuid_mutex);
1370
+ ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1371
+ mutex_unlock(&uuid_mutex);
1372
+
1373
+ return ret;
12381374 }
12391375
12401376 /*
....@@ -1249,7 +1385,6 @@
12491385 bool new_device_added = false;
12501386 struct btrfs_device *device = NULL;
12511387 struct block_device *bdev;
1252
- struct page *page;
12531388 u64 bytenr;
12541389
12551390 lockdep_assert_held(&uuid_mutex);
....@@ -1267,8 +1402,9 @@
12671402 if (IS_ERR(bdev))
12681403 return ERR_CAST(bdev);
12691404
1270
- if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
1271
- device = ERR_PTR(-EINVAL);
1405
+ disk_super = btrfs_read_disk_super(bdev, bytenr);
1406
+ if (IS_ERR(disk_super)) {
1407
+ device = ERR_CAST(disk_super);
12721408 goto error_bdev_put;
12731409 }
12741410
....@@ -1278,7 +1414,7 @@
12781414 btrfs_free_stale_devices(path, device);
12791415 }
12801416
1281
- btrfs_release_disk_super(page);
1417
+ btrfs_release_disk_super(disk_super);
12821418
12831419 error_bdev_put:
12841420 blkdev_put(bdev, flags);
....@@ -1286,60 +1422,84 @@
12861422 return device;
12871423 }
12881424
1289
-static int contains_pending_extent(struct btrfs_transaction *transaction,
1290
- struct btrfs_device *device,
1291
- u64 *start, u64 len)
1425
+/*
1426
+ * Try to find a chunk that intersects [start, start + len] range and when one
1427
+ * such is found, record the end of it in *start
1428
+ */
1429
+static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1430
+ u64 len)
12921431 {
1293
- struct btrfs_fs_info *fs_info = device->fs_info;
1294
- struct extent_map *em;
1295
- struct list_head *search_list = &fs_info->pinned_chunks;
1296
- int ret = 0;
1297
- u64 physical_start = *start;
1432
+ u64 physical_start, physical_end;
12981433
1299
- if (transaction)
1300
- search_list = &transaction->pending_chunks;
1301
-again:
1302
- list_for_each_entry(em, search_list, list) {
1303
- struct map_lookup *map;
1304
- int i;
1434
+ lockdep_assert_held(&device->fs_info->chunk_mutex);
13051435
1306
- map = em->map_lookup;
1307
- for (i = 0; i < map->num_stripes; i++) {
1308
- u64 end;
1436
+ if (!find_first_extent_bit(&device->alloc_state, *start,
1437
+ &physical_start, &physical_end,
1438
+ CHUNK_ALLOCATED, NULL)) {
13091439
1310
- if (map->stripes[i].dev != device)
1311
- continue;
1312
- if (map->stripes[i].physical >= physical_start + len ||
1313
- map->stripes[i].physical + em->orig_block_len <=
1314
- physical_start)
1315
- continue;
1316
- /*
1317
- * Make sure that while processing the pinned list we do
1318
- * not override our *start with a lower value, because
1319
- * we can have pinned chunks that fall within this
1320
- * device hole and that have lower physical addresses
1321
- * than the pending chunks we processed before. If we
1322
- * do not take this special care we can end up getting
1323
- * 2 pending chunks that start at the same physical
1324
- * device offsets because the end offset of a pinned
1325
- * chunk can be equal to the start offset of some
1326
- * pending chunk.
1327
- */
1328
- end = map->stripes[i].physical + em->orig_block_len;
1329
- if (end > *start) {
1330
- *start = end;
1331
- ret = 1;
1332
- }
1440
+ if (in_range(physical_start, *start, len) ||
1441
+ in_range(*start, physical_start,
1442
+ physical_end - physical_start)) {
1443
+ *start = physical_end + 1;
1444
+ return true;
13331445 }
13341446 }
1335
- if (search_list != &fs_info->pinned_chunks) {
1336
- search_list = &fs_info->pinned_chunks;
1337
- goto again;
1338
- }
1339
-
1340
- return ret;
1447
+ return false;
13411448 }
13421449
1450
+static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1451
+{
1452
+ switch (device->fs_devices->chunk_alloc_policy) {
1453
+ case BTRFS_CHUNK_ALLOC_REGULAR:
1454
+ /*
1455
+ * We don't want to overwrite the superblock on the drive nor
1456
+ * any area used by the boot loader (grub for example), so we
1457
+ * make sure to start at an offset of at least 1MB.
1458
+ */
1459
+ return max_t(u64, start, SZ_1M);
1460
+ default:
1461
+ BUG();
1462
+ }
1463
+}
1464
+
1465
+/**
1466
+ * dev_extent_hole_check - check if specified hole is suitable for allocation
1467
+ * @device: the device which we have the hole
1468
+ * @hole_start: starting position of the hole
1469
+ * @hole_size: the size of the hole
1470
+ * @num_bytes: the size of the free space that we need
1471
+ *
1472
+ * This function may modify @hole_start and @hole_end to reflect the suitable
1473
+ * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
1474
+ */
1475
+static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1476
+ u64 *hole_size, u64 num_bytes)
1477
+{
1478
+ bool changed = false;
1479
+ u64 hole_end = *hole_start + *hole_size;
1480
+
1481
+ /*
1482
+ * Check before we set max_hole_start, otherwise we could end up
1483
+ * sending back this offset anyway.
1484
+ */
1485
+ if (contains_pending_extent(device, hole_start, *hole_size)) {
1486
+ if (hole_end >= *hole_start)
1487
+ *hole_size = hole_end - *hole_start;
1488
+ else
1489
+ *hole_size = 0;
1490
+ changed = true;
1491
+ }
1492
+
1493
+ switch (device->fs_devices->chunk_alloc_policy) {
1494
+ case BTRFS_CHUNK_ALLOC_REGULAR:
1495
+ /* No extra check */
1496
+ break;
1497
+ default:
1498
+ BUG();
1499
+ }
1500
+
1501
+ return changed;
1502
+}
13431503
13441504 /*
13451505 * find_free_dev_extent_start - find free space in the specified device
....@@ -1361,10 +1521,16 @@
13611521 * @len is used to store the size of the free space that we find.
13621522 * But if we don't find suitable free space, it is used to store the size of
13631523 * the max free space.
1524
+ *
1525
+ * NOTE: This function will search *commit* root of device tree, and does extra
1526
+ * check to ensure dev extents are not double allocated.
1527
+ * This makes the function safe to allocate dev extents but may not report
1528
+ * correct usable device space, as device extent freed in current transaction
1529
+ * is not reported as avaiable.
13641530 */
1365
-int find_free_dev_extent_start(struct btrfs_transaction *transaction,
1366
- struct btrfs_device *device, u64 num_bytes,
1367
- u64 search_start, u64 *start, u64 *len)
1531
+static int find_free_dev_extent_start(struct btrfs_device *device,
1532
+ u64 num_bytes, u64 search_start, u64 *start,
1533
+ u64 *len)
13681534 {
13691535 struct btrfs_fs_info *fs_info = device->fs_info;
13701536 struct btrfs_root *root = fs_info->dev_root;
....@@ -1380,12 +1546,7 @@
13801546 int slot;
13811547 struct extent_buffer *l;
13821548
1383
- /*
1384
- * We don't want to overwrite the superblock on the drive nor any area
1385
- * used by the boot loader (grub for example), so we make sure to start
1386
- * at an offset of at least 1MB.
1387
- */
1388
- search_start = max_t(u64, search_start, SZ_1M);
1549
+ search_start = dev_extent_search_start(device, search_start);
13891550
13901551 path = btrfs_alloc_path();
13911552 if (!path)
....@@ -1443,21 +1604,8 @@
14431604
14441605 if (key.offset > search_start) {
14451606 hole_size = key.offset - search_start;
1446
-
1447
- /*
1448
- * Have to check before we set max_hole_start, otherwise
1449
- * we could end up sending back this offset anyway.
1450
- */
1451
- if (contains_pending_extent(transaction, device,
1452
- &search_start,
1453
- hole_size)) {
1454
- if (key.offset >= search_start) {
1455
- hole_size = key.offset - search_start;
1456
- } else {
1457
- WARN_ON_ONCE(1);
1458
- hole_size = 0;
1459
- }
1460
- }
1607
+ dev_extent_hole_check(device, &search_start, &hole_size,
1608
+ num_bytes);
14611609
14621610 if (hole_size > max_hole_size) {
14631611 max_hole_start = search_start;
....@@ -1496,9 +1644,8 @@
14961644 */
14971645 if (search_end > search_start) {
14981646 hole_size = search_end - search_start;
1499
-
1500
- if (contains_pending_extent(transaction, device, &search_start,
1501
- hole_size)) {
1647
+ if (dev_extent_hole_check(device, &search_start, &hole_size,
1648
+ num_bytes)) {
15021649 btrfs_release_path(path);
15031650 goto again;
15041651 }
....@@ -1523,13 +1670,11 @@
15231670 return ret;
15241671 }
15251672
1526
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
1527
- struct btrfs_device *device, u64 num_bytes,
1673
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
15281674 u64 *start, u64 *len)
15291675 {
15301676 /* FIXME use last free of some kind */
1531
- return find_free_dev_extent_start(trans->transaction, device,
1532
- num_bytes, 0, start, len);
1677
+ return find_free_dev_extent_start(device, num_bytes, 0, start, len);
15331678 }
15341679
15351680 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
@@ -1640,9 +1785,9 @@
 	struct rb_node *n;
 	u64 ret = 0;

-	em_tree = &fs_info->mapping_tree.map_tree;
+	em_tree = &fs_info->mapping_tree;
 	read_lock(&em_tree->lock);
-	n = rb_last(&em_tree->map);
+	n = rb_last(&em_tree->map.rb_root);
 	if (n) {
 		em = rb_entry(n, struct extent_map, rb_node);
 		ret = em->start + em->len;
@@ -1672,7 +1817,12 @@
 	if (ret < 0)
 		goto error;

-	BUG_ON(ret == 0); /* Corruption */
+	if (ret == 0) {
+		/* Corruption */
+		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
+		ret = -EUCLEAN;
+		goto error;
+	}

 	ret = btrfs_previous_item(fs_info->chunk_root, path,
 				  BTRFS_DEV_ITEMS_OBJECTID,
@@ -1738,7 +1888,8 @@
 	ptr = btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 	ptr = btrfs_device_fsid(dev_item);
-	write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
+	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
+			    ptr, BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);

 	ret = 0;
@@ -1750,22 +1901,27 @@
 /*
  * Function to update ctime/mtime for a given device path.
  * Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here, this is just to be kind to userspace.
  */
-static void update_dev_time(const char *path_name)
+static void update_dev_time(const char *device_path)
 {
-	struct file *filp;
+	struct path path;
+	struct timespec64 now;
+	int ret;

-	filp = filp_open(path_name, O_RDWR, 0);
-	if (IS_ERR(filp))
+	ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+	if (ret)
 		return;
-	file_update_time(filp);
-	filp_close(filp, NULL);
+
+	now = current_time(d_inode(path.dentry));
+	inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+	path_put(&path);
 }

-static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
-			     struct btrfs_device *device)
+static int btrfs_rm_dev_item(struct btrfs_device *device)
 {
-	struct btrfs_root *root = fs_info->chunk_root;
+	struct btrfs_root *root = device->fs_info->chunk_root;
 	int ret;
 	struct btrfs_path *path;
 	struct btrfs_key key;
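
The new update_dev_time() resolves the path and bumps the inode timestamps directly instead of opening the device node. A userspace analogue of the same courtesy-to-libblkid idea is sketched below; it is not the kernel helper, and like it, errors are deliberately ignored (utimensat() with NULL times sets atime/mtime to now, and ctime changes as a side effect):

/* Userspace analogue: touch a device node so timestamp-based probes rescan. */
#include <fcntl.h>
#include <sys/stat.h>

static void touch_dev_time(const char *device_path)
{
	/* NULL times = set atime/mtime to the current time */
	(void)utimensat(AT_FDCWD, device_path, NULL, 0);
}
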
@@ -1862,17 +2018,14 @@
  * where this function called, there should be always be another device (or
  * this_dev) which is active.
  */
-void btrfs_assign_next_active_device(struct btrfs_device *device,
-				     struct btrfs_device *this_dev)
+void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
+					    struct btrfs_device *next_device)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct btrfs_device *next_device;

-	if (this_dev)
-		next_device = this_dev;
-	else
+	if (!next_device)
 		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
-							    device);
+							    device);
 	ASSERT(next_device);

 	if (fs_info->sb->s_bdev &&
@@ -1883,8 +2036,66 @@
 	fs_info->fs_devices->latest_bdev = next_device->bdev;
 }

+/*
+ * Return btrfs_fs_devices::num_devices excluding the device that's being
+ * currently replaced.
+ */
+static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
+{
+	u64 num_devices = fs_info->fs_devices->num_devices;
+
+	down_read(&fs_info->dev_replace.rwsem);
+	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+		ASSERT(num_devices > 1);
+		num_devices--;
+	}
+	up_read(&fs_info->dev_replace.rwsem);
+
+	return num_devices;
+}
+
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+			       struct block_device *bdev,
+			       const char *device_path)
+{
+	struct btrfs_super_block *disk_super;
+	int copy_num;
+
+	if (!bdev)
+		return;
+
+	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
+		struct page *page;
+		int ret;
+
+		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
+		if (IS_ERR(disk_super))
+			continue;
+
+		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+
+		page = virt_to_page(disk_super);
+		set_page_dirty(page);
+		lock_page(page);
+		/* write_one_page() unlocks the page */
+		ret = write_one_page(page);
+		if (ret)
+			btrfs_warn(fs_info,
+				"error clearing superblock number %d (%d)",
+				copy_num, ret);
+		btrfs_release_disk_super(disk_super);
+
+	}
+
+	/* Notify udev that device has changed */
+	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+	/* Update ctime/mtime for device path for libblkid */
+	update_dev_time(device_path);
+}
+
 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
-		u64 devid)
+		    u64 devid)
 {
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *cur_devices;
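
btrfs_scratch_superblocks() above wipes the magic from every on-disk superblock copy so scanners no longer recognize the device. The mirror offsets it visits are fixed by the format: the primary copy at 64 KiB and mirrors spaced by a shift of 12 per copy (64 MiB, 256 GiB); mirrors past the end of a small device simply do not exist. A standalone sketch that mirrors the logic of the kernel's btrfs_sb_offset() (constant names here are local stand-ins):

#include <stdint.h>
#include <stdio.h>

#define SUPER_INFO_OFFSET	65536ULL	/* primary copy at 64 KiB */
#define SUPER_MIRROR_MAX	3
#define SUPER_MIRROR_SHIFT	12

static uint64_t sb_offset(int mirror)
{
	if (mirror)
		return 16384ULL << (SUPER_MIRROR_SHIFT * mirror);
	return SUPER_INFO_OFFSET;
}

int main(void)
{
	/* prints 65536, 67108864, 274877906944 */
	for (int i = 0; i < SUPER_MIRROR_MAX; i++)
		printf("copy %d at %llu\n", i, (unsigned long long)sb_offset(i));
	return 0;
}
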
@@ -1892,24 +2103,35 @@
 	u64 num_devices;
 	int ret = 0;

-	mutex_lock(&uuid_mutex);
-
-	num_devices = fs_devices->num_devices;
-	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
-	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
-		WARN_ON(num_devices < 1);
-		num_devices--;
-	}
-	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+	/*
+	 * The device list in fs_devices is accessed without locks (neither
+	 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+	 * filesystem and another device rm cannot run.
+	 */
+	num_devices = btrfs_num_devices(fs_info);

 	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
 	if (ret)
 		goto out;

-	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
-					   &device);
-	if (ret)
+	device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
+
+	if (IS_ERR(device)) {
+		if (PTR_ERR(device) == -ENOENT &&
+		    device_path && strcmp(device_path, "missing") == 0)
+			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
+		else
+			ret = PTR_ERR(device);
 		goto out;
+	}
+
+	if (btrfs_pinned_by_swapfile(fs_info, device)) {
+		btrfs_warn_in_rcu(fs_info,
+	  "cannot remove device %s (devid %llu) due to active swapfile",
+				  rcu_str_deref(device->name), device->devid);
+		ret = -ETXTBSY;
+		goto out;
+	}

 	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
 		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
@@ -1929,9 +2151,9 @@
 		mutex_unlock(&fs_info->chunk_mutex);
 	}

-	mutex_unlock(&uuid_mutex);
 	ret = btrfs_shrink_device(device, 0);
-	mutex_lock(&uuid_mutex);
+	if (!ret)
+		btrfs_reada_remove_dev(device);
 	if (ret)
 		goto error_undo;

@@ -1940,12 +2162,12 @@
 	 * counter although write_all_supers() is not locked out. This
 	 * could give a filesystem state which requires a degraded mount.
 	 */
-	ret = btrfs_rm_dev_item(fs_info, device);
+	ret = btrfs_rm_dev_item(device);
 	if (ret)
 		goto error_undo;

 	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
-	btrfs_scrub_cancel_dev(fs_info, device);
+	btrfs_scrub_cancel_dev(device);

 	/*
 	 * the device list mutex makes sure that we don't change
@@ -1980,7 +2202,7 @@
 	if (device->bdev) {
 		cur_devices->open_devices--;
 		/* remove sysfs entry */
-		btrfs_sysfs_rm_device_link(fs_devices, device);
+		btrfs_sysfs_remove_device(device);
 	}

 	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
@@ -1993,29 +2215,24 @@
 	 * supers and free the device.
 	 */
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
-		btrfs_scratch_superblocks(device->bdev, device->name->str);
+		btrfs_scratch_superblocks(fs_info, device->bdev,
+					  device->name->str);

 	btrfs_close_bdev(device);
-	call_rcu(&device->rcu, free_device_rcu);
+	synchronize_rcu();
+	btrfs_free_device(device);

 	if (cur_devices->open_devices == 0) {
-		while (fs_devices) {
-			if (fs_devices->seed == cur_devices) {
-				fs_devices->seed = cur_devices->seed;
-				break;
-			}
-			fs_devices = fs_devices->seed;
-		}
-		cur_devices->seed = NULL;
+		list_del_init(&cur_devices->seed_list);
 		close_fs_devices(cur_devices);
 		free_fs_devices(cur_devices);
 	}

 out:
-	mutex_unlock(&uuid_mutex);
 	return ret;

 error_undo:
+	btrfs_reada_undo_remove_dev(device);
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		mutex_lock(&fs_info->chunk_mutex);
 		list_add(&device->dev_alloc_list,
@@ -2053,23 +2270,18 @@
 	fs_devices->open_devices--;
 }

-void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
-				      struct btrfs_device *srcdev)
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
 {
 	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

-	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
-		/* zero out the old super if it is writable */
-		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
-	}
+	mutex_lock(&uuid_mutex);

 	btrfs_close_bdev(srcdev);
-	call_rcu(&srcdev->rcu, free_device_rcu);
+	synchronize_rcu();
+	btrfs_free_device(srcdev);

 	/* if this is no devs we rather delete the fs_devices */
 	if (!fs_devices->num_devices) {
-		struct btrfs_fs_devices *tmp_fs_devices;
-
 		/*
 		 * On a mounted FS, num_devices can't be zero unless it's a
 		 * seed. In case of a seed device being replaced, the replace
@@ -2078,28 +2290,20 @@
 		 */
 		ASSERT(fs_devices->seeding);

-		tmp_fs_devices = fs_info->fs_devices;
-		while (tmp_fs_devices) {
-			if (tmp_fs_devices->seed == fs_devices) {
-				tmp_fs_devices->seed = fs_devices->seed;
-				break;
-			}
-			tmp_fs_devices = tmp_fs_devices->seed;
-		}
-		fs_devices->seed = NULL;
+		list_del_init(&fs_devices->seed_list);
 		close_fs_devices(fs_devices);
 		free_fs_devices(fs_devices);
 	}
+	mutex_unlock(&uuid_mutex);
 }

 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
 {
 	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;

-	WARN_ON(!tgtdev);
 	mutex_lock(&fs_devices->device_list_mutex);

-	btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
+	btrfs_sysfs_remove_device(tgtdev);

 	if (tgtdev->bdev)
 		fs_devices->open_devices--;
@@ -2119,90 +2323,77 @@
 	 * is already out of device list, so we don't have to hold
 	 * the device_list_mutex lock.
 	 */
-	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
+				  tgtdev->name->str);

 	btrfs_close_bdev(tgtdev);
-	call_rcu(&tgtdev->rcu, free_device_rcu);
+	synchronize_rcu();
+	btrfs_free_device(tgtdev);
 }

-static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
-				     const char *device_path,
-				     struct btrfs_device **device)
+static struct btrfs_device *btrfs_find_device_by_path(
+		struct btrfs_fs_info *fs_info, const char *device_path)
 {
 	int ret = 0;
 	struct btrfs_super_block *disk_super;
 	u64 devid;
 	u8 *dev_uuid;
 	struct block_device *bdev;
-	struct buffer_head *bh;
+	struct btrfs_device *device;

-	*device = NULL;
 	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
-				    fs_info->bdev_holder, 0, &bdev, &bh);
+				    fs_info->bdev_holder, 0, &bdev, &disk_super);
 	if (ret)
-		return ret;
-	disk_super = (struct btrfs_super_block *)bh->b_data;
+		return ERR_PTR(ret);
+
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	dev_uuid = disk_super->dev_item.uuid;
-	*device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
-				    disk_super->fsid, true);
-	brelse(bh);
-	if (!*device)
-		ret = -ENOENT;
+	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
+					   disk_super->metadata_uuid, true);
+	else
+		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
					   disk_super->fsid, true);
+
+	btrfs_release_disk_super(disk_super);
+	if (!device)
+		device = ERR_PTR(-ENOENT);
 	blkdev_put(bdev, FMODE_READ);
-	return ret;
-}
-
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
-					 const char *device_path,
-					 struct btrfs_device **device)
-{
-	*device = NULL;
-	if (strcmp(device_path, "missing") == 0) {
-		struct list_head *devices;
-		struct btrfs_device *tmp;
-
-		devices = &fs_info->fs_devices->devices;
-		list_for_each_entry(tmp, devices, dev_list) {
-			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
-					&tmp->dev_state) && !tmp->bdev) {
-				*device = tmp;
-				break;
-			}
-		}
-
-		if (!*device)
-			return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
-
-		return 0;
-	} else {
-		return btrfs_find_device_by_path(fs_info, device_path, device);
-	}
+	return device;
 }

 /*
  * Lookup a device given by device id, or the path if the id is 0.
  */
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
-				 const char *devpath,
-				 struct btrfs_device **device)
+struct btrfs_device *btrfs_find_device_by_devspec(
+		struct btrfs_fs_info *fs_info, u64 devid,
+		const char *device_path)
 {
-	int ret;
+	struct btrfs_device *device;

 	if (devid) {
-		ret = 0;
-		*device = btrfs_find_device(fs_info->fs_devices, devid,
-					    NULL, NULL, true);
-		if (!*device)
-			ret = -ENOENT;
-	} else {
-		if (!devpath || !devpath[0])
-			return -EINVAL;
-
-		ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
-							   device);
+		device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
+					   NULL, true);
+		if (!device)
+			return ERR_PTR(-ENOENT);
+		return device;
 	}
-	return ret;
+
+	if (!device_path || !device_path[0])
+		return ERR_PTR(-EINVAL);
+
+	if (strcmp(device_path, "missing") == 0) {
+		/* Find first missing device */
+		list_for_each_entry(device, &fs_info->fs_devices->devices,
+				    dev_list) {
+			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+				     &device->dev_state) && !device->bdev)
+				return device;
		}
+		return ERR_PTR(-ENOENT);
+	}
+
+	return btrfs_find_device_by_path(fs_info, device_path);
 }

 /*
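
The lookup helpers above now return a struct btrfs_device pointer with errors encoded via ERR_PTR() instead of an int return plus out-parameter. The ERR_PTR/IS_ERR/PTR_ERR macros are real kernel interfaces (include/linux/err.h); the standalone sketch below mimics them in userspace, and the device table is hypothetical:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(intptr_t)(err))
#define PTR_ERR(ptr)	((long)(intptr_t)(ptr))
#define IS_ERR(ptr)	((uintptr_t)(ptr) >= (uintptr_t)-MAX_ERRNO)

struct device { const char *name; };

static struct device devices[] = { { "sda" }, { "sdb" } };

static struct device *find_device(const char *name)
{
	if (!name || !name[0])
		return ERR_PTR(-EINVAL);	/* error rides in the pointer */
	for (size_t i = 0; i < sizeof(devices) / sizeof(devices[0]); i++)
		if (strcmp(devices[i].name, name) == 0)
			return &devices[i];
	return ERR_PTR(-ENOENT);	/* no match */
}

int main(void)
{
	struct device *dev = find_device("sdc");

	if (IS_ERR(dev))
		printf("lookup failed: %ld\n", PTR_ERR(dev));
	else
		printf("found %s\n", dev->name);
	return 0;
}
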
@@ -2221,10 +2412,20 @@
 	if (!fs_devices->seeding)
 		return -EINVAL;

-	seed_devices = alloc_fs_devices(NULL);
+	/*
+	 * Private copy of the seed devices, anchored at
+	 * fs_info->fs_devices->seed_list
+	 */
+	seed_devices = alloc_fs_devices(NULL, NULL);
 	if (IS_ERR(seed_devices))
 		return PTR_ERR(seed_devices);

+	/*
+	 * It's necessary to retain a copy of the original seed fs_devices in
+	 * fs_uuids so that filesystems which have been seeded can successfully
+	 * reference the seed device from open_seed_devices. This also supports
+	 * multiple fs seed.
+	 */
 	old_devices = clone_fs_devices(fs_devices);
 	if (IS_ERR(old_devices)) {
 		kfree(seed_devices);
@@ -2245,19 +2446,15 @@
 	list_for_each_entry(device, &seed_devices->devices, dev_list)
 		device->fs_devices = seed_devices;

-	mutex_lock(&fs_info->chunk_mutex);
-	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
-	mutex_unlock(&fs_info->chunk_mutex);
-
-	fs_devices->seeding = 0;
+	fs_devices->seeding = false;
 	fs_devices->num_devices = 0;
 	fs_devices->open_devices = 0;
 	fs_devices->missing_devices = 0;
-	fs_devices->rotating = 0;
-	fs_devices->seed = seed_devices;
+	fs_devices->rotating = false;
+	list_add(&seed_devices->seed_list, &fs_devices->seed_list);

 	generate_random_uuid(fs_devices->fsid);
-	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	mutex_unlock(&fs_devices->device_list_mutex);

@@ -2271,9 +2468,9 @@
 /*
  * Store the expected generation for seed devices in device items.
  */
-static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
-			       struct btrfs_fs_info *fs_info)
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
 {
+	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *root = fs_info->chunk_root;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
@@ -2357,7 +2554,7 @@
 	u64 orig_super_num_devices;
 	int seeding_dev = 0;
 	int ret = 0;
-	bool unlocked = false;
+	bool locked = false;

 	if (sb_rdonly(sb) && !fs_devices->seeding)
 		return -EROFS;
@@ -2371,20 +2568,20 @@
 		seeding_dev = 1;
 		down_write(&sb->s_umount);
 		mutex_lock(&uuid_mutex);
+		locked = true;
 	}

-	filemap_write_and_wait(bdev->bd_inode->i_mapping);
+	sync_blockdev(bdev);

-	mutex_lock(&fs_devices->device_list_mutex);
-	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
 		if (device->bdev == bdev) {
 			ret = -EEXIST;
-			mutex_unlock(
-				&fs_devices->device_list_mutex);
+			rcu_read_unlock();
 			goto error;
 		}
 	}
-	mutex_unlock(&fs_devices->device_list_mutex);
+	rcu_read_unlock();

 	device = btrfs_alloc_device(fs_info, NULL, NULL);
 	if (IS_ERR(device)) {
@@ -2448,7 +2645,7 @@
 	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

 	if (!blk_queue_nonrot(q))
-		fs_devices->rotating = 1;
+		fs_devices->rotating = true;

 	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
 	btrfs_set_super_total_bytes(fs_info->super_copy,
@@ -2468,13 +2665,13 @@
 	mutex_unlock(&fs_info->chunk_mutex);

 	/* Add sysfs device entry */
-	btrfs_sysfs_add_device_link(fs_devices, device);
+	btrfs_sysfs_add_device(device);

 	mutex_unlock(&fs_devices->device_list_mutex);

 	if (seeding_dev) {
 		mutex_lock(&fs_info->chunk_mutex);
-		ret = init_first_rw_device(trans, fs_info);
+		ret = init_first_rw_device(trans);
 		mutex_unlock(&fs_info->chunk_mutex);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
@@ -2489,22 +2686,17 @@
 	}

 	if (seeding_dev) {
-		char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
-
-		ret = btrfs_finish_sprout(trans, fs_info);
+		ret = btrfs_finish_sprout(trans);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			goto error_sysfs;
 		}

-		/* Sprouting would change fsid of the mounted root,
-		 * so rename the fsid on the sysfs
+		/*
+		 * fs_devices now represents the newly sprouted filesystem and
+		 * its fsid has been changed by btrfs_prepare_sprout
 		 */
-		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
-			 fs_info->fsid);
-		if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
-			btrfs_warn(fs_info,
-				   "sysfs: failed to create fsid for sprout");
+		btrfs_sysfs_update_sprout_fsid(fs_devices);
 	}

 	ret = btrfs_commit_transaction(trans);
@@ -2512,7 +2704,7 @@
 	if (seeding_dev) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
-		unlocked = true;
+		locked = false;

 		if (ret) /* transaction commit */
 			return ret;
@@ -2532,12 +2724,22 @@
 		ret = btrfs_commit_transaction(trans);
 	}

-	/* Update ctime/mtime for libblkid */
+	/*
+	 * Now that we have written a new super block to this device, check all
+	 * other fs_devices list if device_path alienates any other scanned
+	 * device.
+	 * We can ignore the return value as it typically returns -EINVAL and
+	 * only succeeds if the device was an alien.
+	 */
+	btrfs_forget_devices(device_path);
+
+	/* Update ctime/mtime for blkid or udev */
 	update_dev_time(device_path);
+
 	return ret;

 error_sysfs:
-	btrfs_sysfs_rm_device_link(fs_devices, device);
+	btrfs_sysfs_remove_device(device);
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	mutex_lock(&fs_info->chunk_mutex);
 	list_del_rcu(&device->dev_list);
@@ -2563,7 +2765,7 @@
 	btrfs_free_device(device);
 error:
 	blkdev_put(bdev, FMODE_EXCL);
-	if (seeding_dev && !unlocked) {
+	if (locked) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
 	}
@@ -2621,7 +2823,6 @@
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
 	struct btrfs_super_block *super_copy = fs_info->super_copy;
-	struct btrfs_fs_devices *fs_devices;
 	u64 old_total;
 	u64 diff;

@@ -2640,8 +2841,6 @@
 		return -EINVAL;
 	}

-	fs_devices = fs_info->fs_devices;
-
 	btrfs_set_super_total_bytes(super_copy,
 			round_down(old_total + diff, fs_info->sectorsize));
 	device->fs_devices->total_rw_bytes += diff;
@@ -2649,9 +2848,9 @@
 	btrfs_device_set_total_bytes(device, new_size);
 	btrfs_device_set_disk_total_bytes(device, new_size);
 	btrfs_clear_space_info_full(device->fs_info);
-	if (list_empty(&device->resized_list))
-		list_add_tail(&device->resized_list,
-			      &fs_devices->resized_devices);
+	if (list_empty(&device->post_commit_list))
+		list_add_tail(&device->post_commit_list,
+			      &trans->transaction->dev_update_list);
 	mutex_unlock(&fs_info->chunk_mutex);

 	return btrfs_update_device(trans, device);
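
The grow path above rounds the new total down to the filesystem sector size. For power-of-two alignments round_down() is just a bitmask; a self-contained illustration (macro definitions are local stand-ins for the kernel's, valid only for power-of-two alignment):

#include <stdint.h>
#include <stdio.h>

#define round_down(x, a)	((x) & ~((uint64_t)(a) - 1))
#define round_up(x, a)		(((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t new_size = 10737418240ULL + 123;	/* 10 GiB + 123 bytes */

	/* 4096 must be a power of two for the mask trick to be valid */
	printf("aligned: %llu\n",
	       (unsigned long long)round_down(new_size, 4096));
	return 0;
}
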
@@ -2739,13 +2938,20 @@
 	return ret;
 }

-static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
-					u64 logical, u64 length)
+/*
+ * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * @logical: Logical block offset in bytes.
+ * @length: Length of extent in bytes.
+ *
+ * Return: Chunk mapping or ERR_PTR.
+ */
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+				       u64 logical, u64 length)
 {
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;

-	em_tree = &fs_info->mapping_tree.map_tree;
+	em_tree = &fs_info->mapping_tree;
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, length);
 	read_unlock(&em_tree->lock);
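
btrfs_get_chunk_map() is a range lookup: find the mapping whose [start, start + len) interval covers the logical address. The kernel keeps the mappings in an rbtree under a rwlock; the userspace sketch below shows the same idea with a sorted array and binary search (all names are hypothetical):

#include <stdint.h>
#include <stdio.h>

struct chunk_map { uint64_t start, len; };

/* maps must be sorted by start and non-overlapping */
static const struct chunk_map *lookup_map(const struct chunk_map *maps,
					  size_t n, uint64_t logical)
{
	size_t lo = 0, hi = n;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (logical < maps[mid].start)
			hi = mid;
		else if (logical >= maps[mid].start + maps[mid].len)
			lo = mid + 1;
		else
			return &maps[mid];	/* interval contains logical */
	}
	return NULL;
}

int main(void)
{
	static const struct chunk_map maps[] = {
		{ 0, 1 << 20 }, { 1 << 20, 1 << 30 },
	};
	const struct chunk_map *m = lookup_map(maps, 2, (1 << 20) + 4096);

	if (m)
		printf("chunk starts at %llu\n", (unsigned long long)m->start);
	return 0;
}
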
@@ -2777,7 +2983,7 @@
 	int i, ret = 0;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

-	em = get_chunk_map(fs_info, chunk_offset, 1);
+	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 	if (IS_ERR(em)) {
 		/*
 		 * This is a logic error, but we don't want to just rely on the
@@ -2818,13 +3024,11 @@
 			mutex_unlock(&fs_info->chunk_mutex);
 		}

-		if (map->stripes[i].dev) {
-			ret = btrfs_update_device(trans, map->stripes[i].dev);
-			if (ret) {
-				mutex_unlock(&fs_devices->device_list_mutex);
-				btrfs_abort_transaction(trans, ret);
-				goto out;
-			}
+		ret = btrfs_update_device(trans, device);
+		if (ret) {
+			mutex_unlock(&fs_devices->device_list_mutex);
+			btrfs_abort_transaction(trans, ret);
+			goto out;
 		}
 	}
 	mutex_unlock(&fs_devices->device_list_mutex);
@@ -2861,6 +3065,7 @@
 {
 	struct btrfs_root *root = fs_info->chunk_root;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_block_group *block_group;
 	int ret;

 	/*
@@ -2877,10 +3082,6 @@
 	 */
 	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);

-	ret = btrfs_can_relocate(fs_info, chunk_offset);
-	if (ret)
-		return -ENOSPC;
-
 	/* step one, relocate all the extents inside this chunk */
 	btrfs_scrub_pause(fs_info);
 	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
@@ -2888,15 +3089,11 @@
 	if (ret)
 		return ret;

-	/*
-	 * We add the kobjects here (and after forcing data chunk creation)
-	 * since relocation is the only place we'll create chunks of a new
-	 * type at runtime. The only place where we'll remove the last
-	 * chunk of a type is the call immediately below this one. Even
-	 * so, we're protected against races with the cleaner thread since
-	 * we're covered by the delete_unused_bgs_mutex.
-	 */
-	btrfs_add_raid_kobjects(fs_info);
+	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
+	if (!block_group)
+		return -ENOENT;
+	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+	btrfs_put_block_group(block_group);

 	trans = btrfs_start_trans_remove_block_group(root->fs_info,
 						     chunk_offset);
@@ -2997,7 +3194,7 @@
 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
 				      u64 chunk_offset)
 {
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_block_group *cache;
 	u64 bytes_used;
 	u64 chunk_type;

@@ -3006,30 +3203,28 @@
 	chunk_type = cache->flags;
 	btrfs_put_block_group(cache);

-	if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
-		spin_lock(&fs_info->data_sinfo->lock);
-		bytes_used = fs_info->data_sinfo->bytes_used;
-		spin_unlock(&fs_info->data_sinfo->lock);
+	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
+		return 0;

-		if (!bytes_used) {
-			struct btrfs_trans_handle *trans;
-			int ret;
+	spin_lock(&fs_info->data_sinfo->lock);
+	bytes_used = fs_info->data_sinfo->bytes_used;
+	spin_unlock(&fs_info->data_sinfo->lock);

-			trans = btrfs_join_transaction(fs_info->tree_root);
-			if (IS_ERR(trans))
-				return PTR_ERR(trans);
+	if (!bytes_used) {
+		struct btrfs_trans_handle *trans;
+		int ret;

-			ret = btrfs_force_chunk_alloc(trans,
-						      BTRFS_BLOCK_GROUP_DATA);
-			btrfs_end_transaction(trans);
-			if (ret < 0)
-				return ret;
+		trans = btrfs_join_transaction(fs_info->tree_root);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);

-			btrfs_add_raid_kobjects(fs_info);
-
-			return 1;
-		}
+		ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
+		btrfs_end_transaction(trans);
+		if (ret < 0)
+			return ret;
+		return 1;
 	}
+
 	return 0;
 }

@@ -3099,7 +3294,7 @@
 	if (!path)
 		return -ENOMEM;

-	trans = btrfs_start_transaction(root, 0);
+	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
 	if (IS_ERR(trans)) {
 		btrfs_free_path(path);
 		return PTR_ERR(trans);
@@ -3208,28 +3403,28 @@
 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 				    struct btrfs_balance_args *bargs)
 {
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_block_group *cache;
 	u64 chunk_used;
 	u64 user_thresh_min;
 	u64 user_thresh_max;
 	int ret = 1;

 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
-	chunk_used = btrfs_block_group_used(&cache->item);
+	chunk_used = cache->used;

 	if (bargs->usage_min == 0)
 		user_thresh_min = 0;
 	else
-		user_thresh_min = div_factor_fine(cache->key.offset,
-						  bargs->usage_min);
+		user_thresh_min = div_factor_fine(cache->length,
+						  bargs->usage_min);

 	if (bargs->usage_max == 0)
 		user_thresh_max = 1;
 	else if (bargs->usage_max > 100)
-		user_thresh_max = cache->key.offset;
+		user_thresh_max = cache->length;
 	else
-		user_thresh_max = div_factor_fine(cache->key.offset,
-						  bargs->usage_max);
+		user_thresh_max = div_factor_fine(cache->length,
						  bargs->usage_max);

 	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
 		ret = 0;
@@ -3241,20 +3436,19 @@
 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
 		u64 chunk_offset, struct btrfs_balance_args *bargs)
 {
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_block_group *cache;
 	u64 chunk_used, user_thresh;
 	int ret = 1;

 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
-	chunk_used = btrfs_block_group_used(&cache->item);
+	chunk_used = cache->used;

 	if (bargs->usage_min == 0)
 		user_thresh = 1;
 	else if (bargs->usage > 100)
-		user_thresh = cache->key.offset;
+		user_thresh = cache->length;
 	else
-		user_thresh = div_factor_fine(cache->key.offset,
-					      bargs->usage);
+		user_thresh = div_factor_fine(cache->length, bargs->usage);

 	if (chunk_used < user_thresh)
 		ret = 0;
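
Both usage filters turn a percentage into a byte threshold against the block group length via div_factor_fine(num, factor), which computes num * factor / 100. A minimal standalone version (mind that num * factor can overflow near U64_MAX, the same caveat as the kernel helper):

#include <stdint.h>
#include <stdio.h>

static uint64_t div_factor_fine(uint64_t num, int factor)
{
	if (factor == 100)
		return num;
	return num * factor / 100;
}

int main(void)
{
	uint64_t length = 1073741824ULL;	/* 1 GiB block group */

	/* usage=75 keeps chunks that are less than 75% full */
	printf("threshold: %llu bytes\n",
	       (unsigned long long)div_factor_fine(length, 75));
	return 0;
}
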
@@ -3280,6 +3474,18 @@
 	return 1;
 }

+static u64 calc_data_stripes(u64 type, int num_stripes)
+{
+	const int index = btrfs_bg_flags_to_raid_index(type);
+	const int ncopies = btrfs_raid_array[index].ncopies;
+	const int nparity = btrfs_raid_array[index].nparity;
+
+	if (nparity)
+		return num_stripes - nparity;
+	else
+		return num_stripes / ncopies;
+}
+
 /* [pstart, pend) */
 static int chunk_drange_filter(struct extent_buffer *leaf,
 			       struct btrfs_chunk *chunk,
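
calc_data_stripes() reduces the old per-profile special cases to table arithmetic: parity profiles lose nparity stripes, mirrored profiles divide by ncopies. A standalone check with a trimmed-down, hypothetical copy of the raid table:

#include <stdio.h>

struct raid_attr { int ncopies, nparity; };

enum { RAID0, RAID1, RAID5, RAID6, RAID10 };

static const struct raid_attr raid_array[] = {
	[RAID0]  = { 1, 0 },
	[RAID1]  = { 2, 0 },
	[RAID5]  = { 1, 1 },
	[RAID6]  = { 1, 2 },
	[RAID10] = { 2, 0 },
};

static int calc_data_stripes(int index, int num_stripes)
{
	const int ncopies = raid_array[index].ncopies;
	const int nparity = raid_array[index].nparity;

	return nparity ? num_stripes - nparity : num_stripes / ncopies;
}

int main(void)
{
	/* 6-device raid6 chunk: 4 stripes carry data, 2 carry parity */
	printf("raid6/6:  %d data stripes\n", calc_data_stripes(RAID6, 6));
	/* 4-device raid10 chunk: 2 stripes worth of data */
	printf("raid10/4: %d data stripes\n", calc_data_stripes(RAID10, 4));
	return 0;
}
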
@@ -3289,22 +3495,15 @@
 	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 	u64 stripe_offset;
 	u64 stripe_length;
+	u64 type;
 	int factor;
 	int i;

 	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
 		return 0;

-	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
-	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
-		factor = num_stripes / 2;
-	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
-		factor = num_stripes - 1;
-	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
-		factor = num_stripes - 2;
-	} else {
-		factor = num_stripes;
-	}
+	type = btrfs_chunk_type(leaf, chunk);
+	factor = calc_data_stripes(type, num_stripes);

 	for (i = 0; i < num_stripes; i++) {
 		stripe = btrfs_stripe_nr(chunk, i);
@@ -3365,10 +3564,10 @@
 	return 0;
 }

-static int should_balance_chunk(struct btrfs_fs_info *fs_info,
-				struct extent_buffer *leaf,
+static int should_balance_chunk(struct extent_buffer *leaf,
 				struct btrfs_chunk *chunk, u64 chunk_offset)
 {
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 	struct btrfs_balance_args *bargs = NULL;
 	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
@@ -3458,17 +3657,11 @@
 {
 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 	struct btrfs_root *chunk_root = fs_info->chunk_root;
-	struct btrfs_root *dev_root = fs_info->dev_root;
-	struct list_head *devices;
-	struct btrfs_device *device;
-	u64 old_size;
-	u64 size_to_free;
 	u64 chunk_type;
 	struct btrfs_chunk *chunk;
 	struct btrfs_path *path = NULL;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	struct btrfs_trans_handle *trans;
 	struct extent_buffer *leaf;
 	int slot;
 	int ret;
3486
- /* step one make some room on all the devices */
3487
- devices = &fs_info->fs_devices->devices;
3488
- list_for_each_entry(device, devices, dev_list) {
3489
- old_size = btrfs_device_get_total_bytes(device);
3490
- size_to_free = div_factor(old_size, 1);
3491
- size_to_free = min_t(u64, size_to_free, SZ_1M);
3492
- if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
3493
- btrfs_device_get_total_bytes(device) -
3494
- btrfs_device_get_bytes_used(device) > size_to_free ||
3495
- test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
3496
- continue;
3497
-
3498
- ret = btrfs_shrink_device(device, old_size - size_to_free);
3499
- if (ret == -ENOSPC)
3500
- break;
3501
- if (ret) {
3502
- /* btrfs_shrink_device never returns ret > 0 */
3503
- WARN_ON(ret > 0);
3504
- goto error;
3505
- }
3506
-
3507
- trans = btrfs_start_transaction(dev_root, 0);
3508
- if (IS_ERR(trans)) {
3509
- ret = PTR_ERR(trans);
3510
- btrfs_info_in_rcu(fs_info,
3511
- "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
3512
- rcu_str_deref(device->name), ret,
3513
- old_size, old_size - size_to_free);
3514
- goto error;
3515
- }
3516
-
3517
- ret = btrfs_grow_device(trans, device, old_size);
3518
- if (ret) {
3519
- btrfs_end_transaction(trans);
3520
- /* btrfs_grow_device never returns ret > 0 */
3521
- WARN_ON(ret > 0);
3522
- btrfs_info_in_rcu(fs_info,
3523
- "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
3524
- rcu_str_deref(device->name), ret,
3525
- old_size, old_size - size_to_free);
3526
- goto error;
3527
- }
3528
-
3529
- btrfs_end_transaction(trans);
3530
- }
3531
-
3532
- /* step two, relocate all the chunks */
35333679 path = btrfs_alloc_path();
35343680 if (!path) {
35353681 ret = -ENOMEM;
....@@ -3601,8 +3747,7 @@
36013747 spin_unlock(&fs_info->balance_lock);
36023748 }
36033749
3604
- ret = should_balance_chunk(fs_info, leaf, chunk,
3605
- found_key.offset);
3750
+ ret = should_balance_chunk(leaf, chunk, found_key.offset);
36063751
36073752 btrfs_release_path(path);
36083753 if (!ret) {
....@@ -3659,10 +3804,15 @@
36593804
36603805 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
36613806 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3662
- if (ret && ret != -ENOSPC)
3663
- goto error;
36643807 if (ret == -ENOSPC) {
36653808 enospc_errors++;
3809
+ } else if (ret == -ETXTBSY) {
3810
+ btrfs_info(fs_info,
3811
+ "skipping relocation of block group %llu due to active swapfile",
3812
+ found_key.offset);
3813
+ ret = 0;
3814
+ } else if (ret) {
3815
+ goto error;
36663816 } else {
36673817 spin_lock(&fs_info->balance_lock);
36683818 bctl->stat.completed++;
....@@ -3711,8 +3861,7 @@
37113861 if (flags == 0)
37123862 return !extended; /* "0" is valid for usual profiles */
37133863
3714
- /* true if exactly one bit set */
3715
- return (flags & (flags - 1)) == 0;
3864
+ return has_single_bit_set(flags);
37163865 }
37173866
37183867 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
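
The open-coded test above is the classic single-bit trick: a value with exactly one bit set, ANDed with itself minus one, is zero. An equivalent standalone form (the kernel helper additionally rejects zero, which the caller here already handled):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool has_single_bit_set(uint64_t n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       has_single_bit_set(0),		/* 0: no bits set */
	       has_single_bit_set(1 << 5),	/* 1: exactly one bit */
	       has_single_bit_set(0x30));	/* 0: two bits set */
	return 0;
}
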
@@ -3723,13 +3872,179 @@
 	       atomic_read(&fs_info->balance_cancel_req) == 0);
 }

-/* Non-zero return value signifies invalidity */
-static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
-		u64 allowed)
+/*
+ * Validate target profile against allowed profiles and return true if it's OK.
+ * Otherwise print the error message and return false.
+ */
+static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
+		const struct btrfs_balance_args *bargs,
+		u64 allowed, const char *type)
 {
-	return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-		(!alloc_profile_is_valid(bctl_arg->target, 1) ||
-		 (bctl_arg->target & ~allowed)));
+	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+		return true;
+
+	/* Profile is valid and does not have bits outside of the allowed set */
+	if (alloc_profile_is_valid(bargs->target, 1) &&
+	    (bargs->target & ~allowed) == 0)
+		return true;
+
+	btrfs_err(fs_info, "balance: invalid convert %s profile %s",
+		  type, btrfs_bg_type_to_raid_name(bargs->target));
+	return false;
+}
+
+/*
+ * Fill @buf with textual description of balance filter flags @bargs, up to
+ * @size_buf including the terminating null. The output may be trimmed if it
+ * does not fit into the provided buffer.
+ */
+static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
+				 u32 size_buf)
+{
+	int ret;
+	u32 size_bp = size_buf;
+	char *bp = buf;
+	u64 flags = bargs->flags;
+	char tmp_buf[128] = {'\0'};
+
+	if (!flags)
+		return;
+
+#define CHECK_APPEND_NOARG(a)						\
+	do {								\
+		ret = snprintf(bp, size_bp, (a));			\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+#define CHECK_APPEND_1ARG(a, v1)					\
+	do {								\
+		ret = snprintf(bp, size_bp, (a), (v1));			\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+#define CHECK_APPEND_2ARG(a, v1, v2)					\
+	do {								\
+		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
+		CHECK_APPEND_1ARG("convert=%s,",
+				  btrfs_bg_type_to_raid_name(bargs->target));
+
+	if (flags & BTRFS_BALANCE_ARGS_SOFT)
+		CHECK_APPEND_NOARG("soft,");
+
+	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
+		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
+					    sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
+	}
+
+	if (flags & BTRFS_BALANCE_ARGS_USAGE)
+		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
+
+	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
+		CHECK_APPEND_2ARG("usage=%u..%u,",
+				  bargs->usage_min, bargs->usage_max);
+
+	if (flags & BTRFS_BALANCE_ARGS_DEVID)
+		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
+
+	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
+		CHECK_APPEND_2ARG("drange=%llu..%llu,",
+				  bargs->pstart, bargs->pend);
+
+	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
+		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
+				  bargs->vstart, bargs->vend);
+
+	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
+		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
+
+	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
+		CHECK_APPEND_2ARG("limit=%u..%u,",
+				  bargs->limit_min, bargs->limit_max);
+
+	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
+		CHECK_APPEND_2ARG("stripes=%u..%u,",
+				  bargs->stripes_min, bargs->stripes_max);
+
+#undef CHECK_APPEND_2ARG
+#undef CHECK_APPEND_1ARG
+#undef CHECK_APPEND_NOARG
+
+out_overflow:
+
+	if (size_bp < size_buf)
+		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
+	else
+		buf[0] = '\0';
+}
+
+static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
+{
+	u32 size_buf = 1024;
+	char tmp_buf[192] = {'\0'};
+	char *buf;
+	char *bp;
+	u32 size_bp = size_buf;
+	int ret;
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+	buf = kzalloc(size_buf, GFP_KERNEL);
+	if (!buf)
+		return;
+
+	bp = buf;
+
+#define CHECK_APPEND_1ARG(a, v1)					\
+	do {								\
+		ret = snprintf(bp, size_bp, (a), (v1));			\
+		if (ret < 0 || ret >= size_bp)				\
+			goto out_overflow;				\
+		size_bp -= ret;						\
+		bp += ret;						\
+	} while (0)
+
+	if (bctl->flags & BTRFS_BALANCE_FORCE)
+		CHECK_APPEND_1ARG("%s", "-f ");
+
+	if (bctl->flags & BTRFS_BALANCE_DATA) {
+		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
+	}
+
+	if (bctl->flags & BTRFS_BALANCE_METADATA) {
+		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
+	}
+
+	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
+		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
+		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
+	}
+
+#undef CHECK_APPEND_1ARG
+
+out_overflow:
+
+	if (size_bp < size_buf)
+		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
+	btrfs_info(fs_info, "balance: %s %s",
+		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
+		   "resume" : "start", buf);
+
+	kfree(buf);
 }

 /*
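
The CHECK_APPEND_* macros in the hunk above implement a bounded append: snprintf() returns the would-be length, so `ret >= size_bp` detects truncation and bails out of the loop. The same pattern as a plain userspace function (function and variable names are illustrative, not the kernel's):

#include <stdarg.h>
#include <stdio.h>

/* Returns 0 on success, -1 if the buffer would overflow. */
static int buf_append(char **bp, unsigned int *size_bp, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = vsnprintf(*bp, *size_bp, fmt, args);
	va_end(args);
	if (ret < 0 || (unsigned int)ret >= *size_bp)
		return -1;	/* output truncated, stop appending */
	*size_bp -= ret;
	*bp += ret;
	return 0;
}

int main(void)
{
	char buf[32];
	char *bp = buf;
	unsigned int left = sizeof(buf);

	if (buf_append(&bp, &left, "convert=%s,", "raid1") == 0 &&
	    buf_append(&bp, &left, "usage=%u,", 50) == 0)
		bp[-1] = '\0';	/* strip the trailing comma, as the hunk does */
	printf("%s\n", buf);	/* convert=raid1,usage=50 */
	return 0;
}
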
@@ -3745,11 +4060,12 @@
 	int ret;
 	u64 num_devices;
 	unsigned seq;
-	bool reducing_integrity;
+	bool reducing_redundancy;
+	int i;

 	if (btrfs_fs_closing(fs_info) ||
 	    atomic_read(&fs_info->balance_pause_req) ||
-	    atomic_read(&fs_info->balance_cancel_req)) {
+	    btrfs_should_cancel_balance(fs_info)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -3774,54 +4090,39 @@
 		}
 	}

-	num_devices = fs_info->fs_devices->num_devices;
-	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
-	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
-		BUG_ON(num_devices < 1);
-		num_devices--;
-	}
-	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
-	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
-	if (num_devices > 1)
-		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
-	if (num_devices > 2)
-		allowed |= BTRFS_BLOCK_GROUP_RAID5;
-	if (num_devices > 3)
-		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
-			    BTRFS_BLOCK_GROUP_RAID6);
-	if (validate_convert_profile(&bctl->data, allowed)) {
-		int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
+	/*
+	 * rw_devices will not change at the moment, device add/delete/replace
+	 * are exclusive
+	 */
+	num_devices = fs_info->fs_devices->rw_devices;

-		btrfs_err(fs_info,
-			  "balance: invalid convert data profile %s",
-			  get_raid_name(index));
-		ret = -EINVAL;
-		goto out;
-	}
-	if (validate_convert_profile(&bctl->meta, allowed)) {
-		int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
+	/*
+	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
+	 * special bit for it, to make it easier to distinguish. Thus we need
+	 * to set it manually, or balance would refuse the profile.
+	 */
+	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
+		if (num_devices >= btrfs_raid_array[i].devs_min)
+			allowed |= btrfs_raid_array[i].bg_flag;

-		btrfs_err(fs_info,
-			  "balance: invalid convert metadata profile %s",
-			  get_raid_name(index));
-		ret = -EINVAL;
-		goto out;
-	}
-	if (validate_convert_profile(&bctl->sys, allowed)) {
-		int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
-
-		btrfs_err(fs_info,
-			  "balance: invalid convert system profile %s",
-			  get_raid_name(index));
+	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
+	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
+	    !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
 		ret = -EINVAL;
 		goto out;
 	}

-	/* allow to reduce meta or sys integrity only if force set */
-	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
-		  BTRFS_BLOCK_GROUP_RAID10 |
-		  BTRFS_BLOCK_GROUP_RAID5 |
-		  BTRFS_BLOCK_GROUP_RAID6;
+	/*
+	 * Allow to reduce metadata or system integrity only if force set for
+	 * profiles with redundancy (copies, parity)
+	 */
+	allowed = 0;
+	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
+		if (btrfs_raid_array[i].ncopies >= 2 ||
+		    btrfs_raid_array[i].tolerated_failures >= 1)
+			allowed |= btrfs_raid_array[i].bg_flag;
+	}
 	do {
 		seq = read_seqbegin(&fs_info->profiles_lock);

@@ -3831,9 +4132,9 @@
 		     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 		      (fs_info->avail_metadata_alloc_bits & allowed) &&
 		      !(bctl->meta.target & allowed)))
-			reducing_integrity = true;
+			reducing_redundancy = true;
 		else
-			reducing_integrity = false;
+			reducing_redundancy = false;

 		/* if we're not converting, the target field is uninitialized */
 		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
@@ -3842,13 +4143,13 @@
 			bctl->data.target : fs_info->avail_data_alloc_bits;
 	} while (read_seqretry(&fs_info->profiles_lock, seq));

-	if (reducing_integrity) {
+	if (reducing_redundancy) {
 		if (bctl->flags & BTRFS_BALANCE_FORCE) {
 			btrfs_info(fs_info,
-				   "balance: force reducing metadata integrity");
+				   "balance: force reducing metadata redundancy");
 		} else {
 			btrfs_err(fs_info,
-	  "balance: reduces metadata integrity, use --force if you want this");
+	  "balance: reduces metadata redundancy, use --force if you want this");
 			ret = -EINVAL;
 			goto out;
 		}
@@ -3856,12 +4157,18 @@

 	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
 		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
-		int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
-		int data_index = btrfs_bg_flags_to_raid_index(data_target);
-
 		btrfs_warn(fs_info,
 	"balance: metadata profile %s has lower redundancy than data profile %s",
-			   get_raid_name(meta_index), get_raid_name(data_index));
+			   btrfs_bg_type_to_raid_name(meta_target),
+			   btrfs_bg_type_to_raid_name(data_target));
+	}
+
+	if (fs_info->send_in_progress) {
+		btrfs_warn_rl(fs_info,
+"cannot run balance while send operations are in progress (%d in progress)",
+			      fs_info->send_in_progress);
+		ret = -EAGAIN;
+		goto out;
 	}

 	ret = insert_balance_item(fs_info, bctl);
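
The validation above now derives the allowed convert targets from the raid table instead of hard-coded device-count checks: any profile whose devs_min fits the current rw device count is permitted. A standalone sketch with a trimmed, hypothetical copy of that table (flag values and minimums are illustrative):

#include <stdint.h>
#include <stdio.h>

struct raid_attr { const char *name; int devs_min; uint64_t bg_flag; };

static const struct raid_attr raid_array[] = {
	{ "single",  1, 1ULL << 0 },
	{ "dup",     1, 1ULL << 1 },
	{ "raid0",   2, 1ULL << 2 },
	{ "raid1",   2, 1ULL << 3 },
	{ "raid1c3", 3, 1ULL << 4 },
	{ "raid1c4", 4, 1ULL << 5 },
	{ "raid10",  4, 1ULL << 6 },
};

int main(void)
{
	uint64_t num_devices = 3, allowed = 0;
	size_t n = sizeof(raid_array) / sizeof(raid_array[0]);

	/* build the mask of profiles the device count can support */
	for (size_t i = 0; i < n; i++)
		if (num_devices >= (uint64_t)raid_array[i].devs_min)
			allowed |= raid_array[i].bg_flag;

	for (size_t i = 0; i < n; i++)
		if (allowed & raid_array[i].bg_flag)
			printf("%s allowed with %llu devices\n",
			       raid_array[i].name,
			       (unsigned long long)num_devices);
	return 0;
}
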
@@ -3883,11 +4190,34 @@

 	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
 	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
+	describe_balance_start_or_resume(fs_info);
 	mutex_unlock(&fs_info->balance_mutex);

 	ret = __btrfs_balance(fs_info);

 	mutex_lock(&fs_info->balance_mutex);
+	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
+		btrfs_info(fs_info, "balance: paused");
+	/*
+	 * Balance can be canceled by:
+	 *
+	 * - Regular cancel request
+	 *   Then ret == -ECANCELED and balance_cancel_req > 0
+	 *
+	 * - Fatal signal to "btrfs" process
+	 *   Either the signal caught by wait_reserve_ticket() and callers
+	 *   got -EINTR, or caught by btrfs_should_cancel_balance() and
+	 *   got -ECANCELED.
+	 *   Either way, in this case balance_cancel_req = 0, and
+	 *   ret == -EINTR or ret == -ECANCELED.
+	 *
+	 * So here we only check the return value to catch canceled balance.
+	 */
+	else if (ret == -ECANCELED || ret == -EINTR)
+		btrfs_info(fs_info, "balance: canceled");
+	else
+		btrfs_info(fs_info, "balance: ended with status: %d", ret);
+
 	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);

 	if (bargs) {
@@ -3898,7 +4228,7 @@
 	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
 	    balance_need_close(fs_info)) {
 		reset_balance_state(fs_info);
-		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+		btrfs_exclop_finish(fs_info);
 	}

 	wake_up(&fs_info->balance_wait_q);
@@ -3909,7 +4239,7 @@
 		reset_balance_state(fs_info);
 	else
 		kfree(bctl);
-	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+	btrfs_exclop_finish(fs_info);

 	return ret;
 }
@@ -3919,12 +4249,12 @@
 	struct btrfs_fs_info *fs_info = data;
 	int ret = 0;

+	sb_start_write(fs_info->sb);
 	mutex_lock(&fs_info->balance_mutex);
-	if (fs_info->balance_ctl) {
-		btrfs_info(fs_info, "balance: resuming");
+	if (fs_info->balance_ctl)
 		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
-	}
 	mutex_unlock(&fs_info->balance_mutex);
+	sb_end_write(fs_info->sb);

 	return ret;
 }
@@ -4013,7 +4343,7 @@
 	 * is in a paused state and must have fs_info::balance_ctl properly
 	 * set up.
 	 */
-	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
 		btrfs_warn(fs_info,
 	"balance: cannot set exclusive op status, resume manually");

@@ -4097,7 +4427,7 @@

 	if (fs_info->balance_ctl) {
 		reset_balance_state(fs_info);
-		clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+		btrfs_exclop_finish(fs_info);
 		btrfs_info(fs_info, "balance: canceled");
 	}
 }
@@ -4109,7 +4439,7 @@
 	return 0;
 }

-static int btrfs_uuid_scan_kthread(void *data)
+int btrfs_uuid_scan_kthread(void *data)
 {
 	struct btrfs_fs_info *fs_info = data;
 	struct btrfs_root *root = fs_info->tree_root;
@@ -4121,6 +4451,7 @@
 	struct btrfs_root_item root_item;
 	u32 item_size;
 	struct btrfs_trans_handle *trans = NULL;
+	bool closing = false;

 	path = btrfs_alloc_path();
 	if (!path) {
@@ -4133,6 +4464,10 @@
 	key.offset = 0;

 	while (1) {
+		if (btrfs_fs_closing(fs_info)) {
+			closing = true;
+			break;
+		}
 		ret = btrfs_search_forward(root, &key, path,
 					   BTRFS_OLDEST_GENERATION);
 		if (ret) {
@@ -4233,74 +4568,10 @@
 	btrfs_end_transaction(trans);
 	if (ret)
 		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
-	else
+	else if (!closing)
 		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
 	up(&fs_info->uuid_tree_rescan_sem);
 	return 0;
-}
-
-/*
- * Callback for btrfs_uuid_tree_iterate().
- * returns:
- * 0	check succeeded, the entry is not outdated.
- * < 0	if an error occurred.
- * > 0	if the check failed, which means the caller shall remove the entry.
- */
-static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
-				       u8 *uuid, u8 type, u64 subid)
-{
-	struct btrfs_key key;
-	int ret = 0;
-	struct btrfs_root *subvol_root;
-
-	if (type != BTRFS_UUID_KEY_SUBVOL &&
-	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
-		goto out;
-
-	key.objectid = subid;
-	key.type = BTRFS_ROOT_ITEM_KEY;
-	key.offset = (u64)-1;
-	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
-	if (IS_ERR(subvol_root)) {
-		ret = PTR_ERR(subvol_root);
-		if (ret == -ENOENT)
-			ret = 1;
-		goto out;
-	}
-
-	switch (type) {
-	case BTRFS_UUID_KEY_SUBVOL:
-		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
-			ret = 1;
-		break;
-	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
-		if (memcmp(uuid, subvol_root->root_item.received_uuid,
-			   BTRFS_UUID_SIZE))
-			ret = 1;
-		break;
-	}
-
-out:
-	return ret;
-}
-
-static int btrfs_uuid_rescan_kthread(void *data)
-{
-	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
-	int ret;
-
-	/*
-	 * 1st step is to iterate through the existing UUID tree and
-	 * to delete all entries that contain outdated data.
-	 * 2nd step is to add all missing entries to the UUID tree.
-	 */
-	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
-	if (ret < 0) {
-		btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
-		up(&fs_info->uuid_tree_rescan_sem);
-		return ret;
-	}
-	return btrfs_uuid_scan_kthread(data);
 }

 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
@@ -4319,8 +4590,7 @@
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);

-	uuid_root = btrfs_create_tree(trans, fs_info,
-				      BTRFS_UUID_TREE_OBJECTID);
+	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
 	if (IS_ERR(uuid_root)) {
 		ret = PTR_ERR(uuid_root);
 		btrfs_abort_transaction(trans, ret);
@@ -4346,22 +4616,6 @@
 	return 0;
 }

-int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
-{
-	struct task_struct *task;
-
-	down(&fs_info->uuid_tree_rescan_sem);
-	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
-	if (IS_ERR(task)) {
-		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
-		btrfs_warn(fs_info, "failed to start uuid_rescan task");
-		up(&fs_info->uuid_tree_rescan_sem);
-		return PTR_ERR(task);
-	}
-
-	return 0;
-}
-
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
@@ -4380,15 +4634,16 @@
 	int slot;
 	int failed = 0;
 	bool retried = false;
-	bool checked_pending_chunks = false;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	struct btrfs_super_block *super_copy = fs_info->super_copy;
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 old_size = btrfs_device_get_total_bytes(device);
 	u64 diff;
+	u64 start;

 	new_size = round_down(new_size, fs_info->sectorsize);
+	start = new_size;
 	diff = round_down(old_size - new_size, fs_info->sectorsize);

 	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
43944649 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
....@@ -4400,6 +4655,12 @@
44004655
44014656 path->reada = READA_BACK;
44024657
4658
+ trans = btrfs_start_transaction(root, 0);
4659
+ if (IS_ERR(trans)) {
4660
+ btrfs_free_path(path);
4661
+ return PTR_ERR(trans);
4662
+ }
4663
+
44034664 mutex_lock(&fs_info->chunk_mutex);
44044665
44054666 btrfs_device_set_total_bytes(device, new_size);
....@@ -4407,7 +4668,21 @@
44074668 device->fs_devices->total_rw_bytes -= diff;
44084669 atomic64_sub(diff, &fs_info->free_chunk_space);
44094670 }
4410
- mutex_unlock(&fs_info->chunk_mutex);
4671
+
4672
+ /*
4673
+ * Once the device's size has been set to the new size, ensure all
4674
+ * in-memory chunks are synced to disk so that the loop below sees them
4675
+ * and relocates them accordingly.
4676
+ */
4677
+ if (contains_pending_extent(device, &start, diff)) {
4678
+ mutex_unlock(&fs_info->chunk_mutex);
4679
+ ret = btrfs_commit_transaction(trans);
4680
+ if (ret)
4681
+ goto done;
4682
+ } else {
4683
+ mutex_unlock(&fs_info->chunk_mutex);
4684
+ btrfs_end_transaction(trans);
4685
+ }
44114686
44124687 again:
44134688 key.objectid = device->devid;
....@@ -4469,10 +4744,16 @@
44694744
44704745 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
44714746 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4472
- if (ret && ret != -ENOSPC)
4473
- goto done;
4474
- if (ret == -ENOSPC)
4747
+ if (ret == -ENOSPC) {
44754748 failed++;
4749
+ } else if (ret) {
4750
+ if (ret == -ETXTBSY) {
4751
+ btrfs_warn(fs_info,
4752
+ "could not shrink block group %llu due to active swapfile",
4753
+ chunk_offset);
4754
+ }
4755
+ goto done;
4756
+ }
44764757 } while (key.offset-- > 0);
44774758
44784759 if (failed && !retried) {
....@@ -4492,40 +4773,14 @@
44924773 }
44934774
44944775 mutex_lock(&fs_info->chunk_mutex);
4495
-
4496
- /*
4497
- * We checked in the above loop all device extents that were already in
4498
- * the device tree. However before we have updated the device's
4499
- * total_bytes to the new size, we might have had chunk allocations that
4500
- * have not complete yet (new block groups attached to transaction
4501
- * handles), and therefore their device extents were not yet in the
4502
- * device tree and we missed them in the loop above. So if we have any
4503
- * pending chunk using a device extent that overlaps the device range
4504
- * that we can not use anymore, commit the current transaction and
4505
- * repeat the search on the device tree - this way we guarantee we will
4506
- * not have chunks using device extents that end beyond 'new_size'.
4507
- */
4508
- if (!checked_pending_chunks) {
4509
- u64 start = new_size;
4510
- u64 len = old_size - new_size;
4511
-
4512
- if (contains_pending_extent(trans->transaction, device,
4513
- &start, len)) {
4514
- mutex_unlock(&fs_info->chunk_mutex);
4515
- checked_pending_chunks = true;
4516
- failed = 0;
4517
- retried = false;
4518
- ret = btrfs_commit_transaction(trans);
4519
- if (ret)
4520
- goto done;
4521
- goto again;
4522
- }
4523
- }
4776
+ /* Clear all state bits beyond the shrunk device size */
4777
+ clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4778
+ CHUNK_STATE_MASK);
45244779
45254780 btrfs_device_set_disk_total_bytes(device, new_size);
4526
- if (list_empty(&device->resized_list))
4527
- list_add_tail(&device->resized_list,
4528
- &fs_info->fs_devices->resized_devices);
4781
+ if (list_empty(&device->post_commit_list))
4782
+ list_add_tail(&device->post_commit_list,
4783
+ &trans->transaction->dev_update_list);
45294784
45304785 WARN_ON(diff > old_total);
45314786 btrfs_set_super_total_bytes(super_copy,
....@@ -4609,96 +4864,119 @@
46094864 btrfs_set_fs_incompat(info, RAID56);
46104865 }
46114866
4612
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4613
- u64 start, u64 type)
4867
+static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
46144868 {
4615
- struct btrfs_fs_info *info = trans->fs_info;
4616
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
4617
- struct btrfs_device *device;
4618
- struct map_lookup *map = NULL;
4619
- struct extent_map_tree *em_tree;
4620
- struct extent_map *em;
4621
- struct btrfs_device_info *devices_info = NULL;
4622
- u64 total_avail;
4623
- int num_stripes; /* total number of stripes to allocate */
4624
- int data_stripes; /* number of stripes that count for
4625
- block group size */
4626
- int sub_stripes; /* sub_stripes info for map */
4627
- int dev_stripes; /* stripes per dev */
4628
- int devs_max; /* max devs to use */
4629
- int devs_min; /* min devs needed */
4630
- int devs_increment; /* ndevs has to be a multiple of this */
4631
- int ncopies; /* how many copies to data has */
4632
- int ret;
4869
+ if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4870
+ return;
4871
+
4872
+ btrfs_set_fs_incompat(info, RAID1C34);
4873
+}
4874
+
4875
+/*
4876
+ * Structure used internally by btrfs_alloc_chunk() and its helpers.
4877
+ * Wraps the needed parameters.
4878
+ */
4879
+struct alloc_chunk_ctl {
4880
+ u64 start;
4881
+ u64 type;
4882
+ /* Total number of stripes to allocate */
4883
+ int num_stripes;
4884
+ /* sub_stripes info for map */
4885
+ int sub_stripes;
4886
+ /* Stripes per device */
4887
+ int dev_stripes;
4888
+ /* Maximum number of devices to use */
4889
+ int devs_max;
4890
+ /* Minimum number of devices to use */
4891
+ int devs_min;
4892
+ /* ndevs has to be a multiple of this */
4893
+ int devs_increment;
4894
+ /* Number of copies */
4895
+ int ncopies;
4896
+ /* Number of stripes worth of bytes to store parity information */
4897
+ int nparity;
46334898 u64 max_stripe_size;
46344899 u64 max_chunk_size;
4900
+ u64 dev_extent_min;
46354901 u64 stripe_size;
4636
- u64 num_bytes;
4902
+ u64 chunk_size;
46374903 int ndevs;
4638
- int i;
4639
- int j;
4640
- int index;
4904
+};
46414905
4642
- BUG_ON(!alloc_profile_is_valid(type, 0));
4643
-
4644
- if (list_empty(&fs_devices->alloc_list)) {
4645
- if (btrfs_test_opt(info, ENOSPC_DEBUG))
4646
- btrfs_debug(info, "%s: no writable device", __func__);
4647
- return -ENOSPC;
4648
- }
4649
-
4650
- index = btrfs_bg_flags_to_raid_index(type);
4651
-
4652
- sub_stripes = btrfs_raid_array[index].sub_stripes;
4653
- dev_stripes = btrfs_raid_array[index].dev_stripes;
4654
- devs_max = btrfs_raid_array[index].devs_max;
4655
- devs_min = btrfs_raid_array[index].devs_min;
4656
- devs_increment = btrfs_raid_array[index].devs_increment;
4657
- ncopies = btrfs_raid_array[index].ncopies;
4906
+static void init_alloc_chunk_ctl_policy_regular(
4907
+ struct btrfs_fs_devices *fs_devices,
4908
+ struct alloc_chunk_ctl *ctl)
4909
+{
4910
+ u64 type = ctl->type;
46584911
46594912 if (type & BTRFS_BLOCK_GROUP_DATA) {
4660
- max_stripe_size = SZ_1G;
4661
- max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4662
- if (!devs_max)
4663
- devs_max = BTRFS_MAX_DEVS(info);
4913
+ ctl->max_stripe_size = SZ_1G;
4914
+ ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
46644915 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4665
- /* for larger filesystems, use larger metadata chunks */
4916
+ /* For larger filesystems, use larger metadata chunks */
46664917 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4667
- max_stripe_size = SZ_1G;
4918
+ ctl->max_stripe_size = SZ_1G;
46684919 else
4669
- max_stripe_size = SZ_256M;
4670
- max_chunk_size = max_stripe_size;
4671
- if (!devs_max)
4672
- devs_max = BTRFS_MAX_DEVS(info);
4920
+ ctl->max_stripe_size = SZ_256M;
4921
+ ctl->max_chunk_size = ctl->max_stripe_size;
46734922 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4674
- max_stripe_size = SZ_32M;
4675
- max_chunk_size = 2 * max_stripe_size;
4676
- if (!devs_max)
4677
- devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4923
+ ctl->max_stripe_size = SZ_32M;
4924
+ ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4925
+ ctl->devs_max = min_t(int, ctl->devs_max,
4926
+ BTRFS_MAX_DEVS_SYS_CHUNK);
46784927 } else {
4679
- btrfs_err(info, "invalid chunk type 0x%llx requested",
4680
- type);
4681
- BUG_ON(1);
4928
+ BUG();
46824929 }
46834930
4684
- /* we don't want a chunk larger than 10% of writeable space */
4685
- max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4686
- max_chunk_size);
4931
+ /* We don't want a chunk larger than 10% of writable space */
4932
+ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4933
+ ctl->max_chunk_size);
4934
+ ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
4935
+}
46874936
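For context on the cap applied above: btrfs's div_factor(num, factor) computes num * factor / 10, so div_factor(fs_devices->total_rw_bytes, 1) is 10% of the writable space. A small userspace sketch of the clamp, assuming 100 GiB of writable space and the 10 GiB BTRFS_MAX_DATA_CHUNK_SIZE ceiling:

/* Illustrative restatement of the 10% cap; not kernel code. */
#include <stdio.h>
#include <stdint.h>

static uint64_t div_factor(uint64_t num, int factor)
{
	if (factor == 10)
		return num;
	return num * factor / 10;
}

int main(void)
{
	uint64_t total_rw_bytes = 100ULL << 30;		/* 100 GiB writable */
	uint64_t max_chunk_size = 10ULL << 30;		/* data chunk ceiling */
	uint64_t cap = div_factor(total_rw_bytes, 1);	/* 10% of the space */

	if (cap < max_chunk_size)
		max_chunk_size = cap;
	printf("chunk capped at %llu bytes\n", (unsigned long long)max_chunk_size);
	return 0;
}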
4688
- devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
4689
- GFP_NOFS);
4690
- if (!devices_info)
4691
- return -ENOMEM;
4937
+static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
4938
+ struct alloc_chunk_ctl *ctl)
4939
+{
4940
+ int index = btrfs_bg_flags_to_raid_index(ctl->type);
4941
+
4942
+ ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
4943
+ ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
4944
+ ctl->devs_max = btrfs_raid_array[index].devs_max;
4945
+ if (!ctl->devs_max)
4946
+ ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
4947
+ ctl->devs_min = btrfs_raid_array[index].devs_min;
4948
+ ctl->devs_increment = btrfs_raid_array[index].devs_increment;
4949
+ ctl->ncopies = btrfs_raid_array[index].ncopies;
4950
+ ctl->nparity = btrfs_raid_array[index].nparity;
4951
+ ctl->ndevs = 0;
4952
+
4953
+ switch (fs_devices->chunk_alloc_policy) {
4954
+ case BTRFS_CHUNK_ALLOC_REGULAR:
4955
+ init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
4956
+ break;
4957
+ default:
4958
+ BUG();
4959
+ }
4960
+}
4961
+
4962
+static int gather_device_info(struct btrfs_fs_devices *fs_devices,
4963
+ struct alloc_chunk_ctl *ctl,
4964
+ struct btrfs_device_info *devices_info)
4965
+{
4966
+ struct btrfs_fs_info *info = fs_devices->fs_info;
4967
+ struct btrfs_device *device;
4968
+ u64 total_avail;
4969
+ u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
4970
+ int ret;
4971
+ int ndevs = 0;
4972
+ u64 max_avail;
4973
+ u64 dev_offset;
46924974
46934975 /*
46944976 * in the first pass through the devices list, we gather information
46954977 * about the available holes on each device.
46964978 */
4697
- ndevs = 0;
46984979 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4699
- u64 max_avail;
4700
- u64 dev_offset;
4701
-
47024980 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
47034981 WARN(1, KERN_ERR
47044982 "BTRFS: read-only device in alloc_list\n");
....@@ -4716,24 +4994,23 @@
47164994 total_avail = 0;
47174995
47184996 /* If there is no space on this device, skip it. */
4719
- if (total_avail == 0)
4997
+ if (total_avail < ctl->dev_extent_min)
47204998 continue;
47214999
4722
- ret = find_free_dev_extent(trans, device,
4723
- max_stripe_size * dev_stripes,
4724
- &dev_offset, &max_avail);
5000
+ ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5001
+ &max_avail);
47255002 if (ret && ret != -ENOSPC)
4726
- goto error;
5003
+ return ret;
47275004
47285005 if (ret == 0)
4729
- max_avail = max_stripe_size * dev_stripes;
5006
+ max_avail = dev_extent_want;
47305007
4731
- if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
5008
+ if (max_avail < ctl->dev_extent_min) {
47325009 if (btrfs_test_opt(info, ENOSPC_DEBUG))
47335010 btrfs_debug(info,
4734
- "%s: devid %llu has no free space, have=%llu want=%u",
5011
+ "%s: devid %llu has no free space, have=%llu want=%llu",
47355012 __func__, device->devid, max_avail,
4736
- BTRFS_STRIPE_LEN * dev_stripes);
5013
+ ctl->dev_extent_min);
47375014 continue;
47385015 }
47395016
....@@ -4748,6 +5025,7 @@
47485025 devices_info[ndevs].dev = device;
47495026 ++ndevs;
47505027 }
5028
+ ctl->ndevs = ndevs;
47515029
47525030 /*
47535031 * now sort the devices by hole size / available space
....@@ -4755,20 +5033,14 @@
47555033 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
47565034 btrfs_cmp_device_info, NULL);
47575035
4758
- /* round down to number of usable stripes */
4759
- ndevs = round_down(ndevs, devs_increment);
5036
+ return 0;
5037
+}
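The sort just above orders the candidate devices by hole size, largest first, so devices_info[ndevs - 1] is the chosen device with the smallest usable hole; that hole bounds the stripe size picked later. A userspace model of the ordering, with qsort() standing in for the kernel's sort() and made-up device data:

/* Sketch of the "sort by hole size" step; data and types are invented. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct dev_info { int devid; uint64_t max_avail; };

static int cmp_dev_info(const void *a, const void *b)
{
	const struct dev_info *da = a, *db = b;

	if (da->max_avail < db->max_avail)
		return 1;	/* biggest hole first */
	if (da->max_avail > db->max_avail)
		return -1;
	return 0;
}

int main(void)
{
	struct dev_info devs[] = { { 1, 10 }, { 2, 40 }, { 3, 25 } };

	qsort(devs, 3, sizeof(devs[0]), cmp_dev_info);
	/* The last chosen entry has the smallest hole and therefore limits
	 * the stripe size shared by all devices. */
	printf("limiting device: devid %d\n", devs[2].devid);
	return 0;
}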
47605038
4761
- if (ndevs < devs_min) {
4762
- ret = -ENOSPC;
4763
- if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
4764
- btrfs_debug(info,
4765
- "%s: not enough devices with free space: have=%d minimum required=%d",
4766
- __func__, ndevs, devs_min);
4767
- }
4768
- goto error;
4769
- }
4770
-
4771
- ndevs = min(ndevs, devs_max);
5039
+static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5040
+ struct btrfs_device_info *devices_info)
5041
+{
5042
+ /* Number of stripes that count for block group size */
5043
+ int data_stripes;
47725044
47735045 /*
47745046 * The primary goal is to maximize the number of stripes, so use as
....@@ -4777,109 +5049,148 @@
47775049 * The DUP profile stores more than one stripe per device, the
47785050 * max_avail is the total size so we have to adjust.
47795051 */
4780
- stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
4781
- num_stripes = ndevs * dev_stripes;
5052
+ ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5053
+ ctl->dev_stripes);
5054
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5055
+
5056
+ /* This will have to be fixed for RAID1 and RAID10 over more drives */
5057
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
47825058
47835059 /*
4784
- * this will have to be fixed for RAID1 and RAID10 over
4785
- * more drives
5060
+ * Use the number of data stripes to figure out how big this chunk is
5061
+ * really going to be in terms of logical address space, and compare
5062
+ * that answer with the max chunk size. If it's higher, we try to
5063
+ * reduce stripe_size.
47865064 */
4787
- data_stripes = num_stripes / ncopies;
4788
-
4789
- if (type & BTRFS_BLOCK_GROUP_RAID5)
4790
- data_stripes = num_stripes - 1;
4791
-
4792
- if (type & BTRFS_BLOCK_GROUP_RAID6)
4793
- data_stripes = num_stripes - 2;
4794
-
4795
- /*
4796
- * Use the number of data stripes to figure out how big this chunk
4797
- * is really going to be in terms of logical address space,
4798
- * and compare that answer with the max chunk size. If it's higher,
4799
- * we try to reduce stripe_size.
4800
- */
4801
- if (stripe_size * data_stripes > max_chunk_size) {
5065
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
48025066 /*
48035067 * Reduce stripe_size, round it up to a 16MB boundary again and
48045068 * then use it, unless it ends up being even bigger than the
48055069 * previous value we had already.
48065070 */
4807
- stripe_size = min(round_up(div_u64(max_chunk_size,
4808
- data_stripes), SZ_16M),
4809
- stripe_size);
5071
+ ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5072
+ data_stripes), SZ_16M),
5073
+ ctl->stripe_size);
48105074 }
48115075
4812
- /* align to BTRFS_STRIPE_LEN */
4813
- stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
5076
+ /* Align to BTRFS_STRIPE_LEN */
5077
+ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5078
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
48145079
4815
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4816
- if (!map) {
4817
- ret = -ENOMEM;
4818
- goto error;
5080
+ return 0;
5081
+}
5082
+
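Worked numbers for the data_stripes formula above, with nparity and ncopies as in the raid table and hypothetical device counts; note how the single expression (num_stripes - nparity) / ncopies covers what used to be separate RAID5/RAID6 special cases:

/* Worked example only; the real parameters come from btrfs_raid_array. */
#include <stdio.h>

struct profile { const char *name; int num_stripes, nparity, ncopies; };

int main(void)
{
	struct profile p[] = {
		{ "raid0 x4", 4, 0, 1 },	/* 4 data stripes */
		{ "raid1 x2", 2, 0, 2 },	/* 1 data stripe  */
		{ "raid5 x5", 5, 1, 1 },	/* 4 data stripes */
		{ "raid6 x6", 6, 2, 1 },	/* 4 data stripes */
	};

	for (int i = 0; i < 4; i++) {
		int data_stripes = (p[i].num_stripes - p[i].nparity) /
				   p[i].ncopies;
		printf("%-8s -> %d data stripes\n", p[i].name, data_stripes);
	}
	return 0;
}

The chunk's logical size is then stripe_size * data_stripes, which is what gets compared against max_chunk_size.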
5083
+static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5084
+ struct alloc_chunk_ctl *ctl,
5085
+ struct btrfs_device_info *devices_info)
5086
+{
5087
+ struct btrfs_fs_info *info = fs_devices->fs_info;
5088
+
5089
+ /*
5090
+ * Round down to the number of usable stripes. devs_increment can be any
5091
+ * number, so we can't use round_down(), which requires a power of 2;
5092
+ * rounddown() is safe for any value.
5093
+ */
5094
+ ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5095
+
5096
+ if (ctl->ndevs < ctl->devs_min) {
5097
+ if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5098
+ btrfs_debug(info,
5099
+ "%s: not enough devices with free space: have=%d minimum required=%d",
5100
+ __func__, ctl->ndevs, ctl->devs_min);
5101
+ }
5102
+ return -ENOSPC;
48195103 }
4820
- map->num_stripes = num_stripes;
48215104
4822
- for (i = 0; i < ndevs; ++i) {
4823
- for (j = 0; j < dev_stripes; ++j) {
4824
- int s = i * dev_stripes + j;
5105
+ ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5106
+
5107
+ switch (fs_devices->chunk_alloc_policy) {
5108
+ case BTRFS_CHUNK_ALLOC_REGULAR:
5109
+ return decide_stripe_size_regular(ctl, devices_info);
5110
+ default:
5111
+ BUG();
5112
+ }
5113
+}
5114
+
5115
+static int create_chunk(struct btrfs_trans_handle *trans,
5116
+ struct alloc_chunk_ctl *ctl,
5117
+ struct btrfs_device_info *devices_info)
5118
+{
5119
+ struct btrfs_fs_info *info = trans->fs_info;
5120
+ struct map_lookup *map = NULL;
5121
+ struct extent_map_tree *em_tree;
5122
+ struct extent_map *em;
5123
+ u64 start = ctl->start;
5124
+ u64 type = ctl->type;
5125
+ int ret;
5126
+ int i;
5127
+ int j;
5128
+
5129
+ map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5130
+ if (!map)
5131
+ return -ENOMEM;
5132
+ map->num_stripes = ctl->num_stripes;
5133
+
5134
+ for (i = 0; i < ctl->ndevs; ++i) {
5135
+ for (j = 0; j < ctl->dev_stripes; ++j) {
5136
+ int s = i * ctl->dev_stripes + j;
48255137 map->stripes[s].dev = devices_info[i].dev;
48265138 map->stripes[s].physical = devices_info[i].dev_offset +
4827
- j * stripe_size;
5139
+ j * ctl->stripe_size;
48285140 }
48295141 }
48305142 map->stripe_len = BTRFS_STRIPE_LEN;
48315143 map->io_align = BTRFS_STRIPE_LEN;
48325144 map->io_width = BTRFS_STRIPE_LEN;
48335145 map->type = type;
4834
- map->sub_stripes = sub_stripes;
5146
+ map->sub_stripes = ctl->sub_stripes;
48355147
4836
- num_bytes = stripe_size * data_stripes;
4837
-
4838
- trace_btrfs_chunk_alloc(info, map, start, num_bytes);
5148
+ trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
48395149
48405150 em = alloc_extent_map();
48415151 if (!em) {
48425152 kfree(map);
4843
- ret = -ENOMEM;
4844
- goto error;
5153
+ return -ENOMEM;
48455154 }
48465155 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
48475156 em->map_lookup = map;
48485157 em->start = start;
4849
- em->len = num_bytes;
5158
+ em->len = ctl->chunk_size;
48505159 em->block_start = 0;
48515160 em->block_len = em->len;
4852
- em->orig_block_len = stripe_size;
5161
+ em->orig_block_len = ctl->stripe_size;
48535162
4854
- em_tree = &info->mapping_tree.map_tree;
5163
+ em_tree = &info->mapping_tree;
48555164 write_lock(&em_tree->lock);
48565165 ret = add_extent_mapping(em_tree, em, 0);
48575166 if (ret) {
48585167 write_unlock(&em_tree->lock);
48595168 free_extent_map(em);
4860
- goto error;
5169
+ return ret;
48615170 }
4862
-
4863
- list_add_tail(&em->list, &trans->transaction->pending_chunks);
4864
- refcount_inc(&em->refs);
48655171 write_unlock(&em_tree->lock);
48665172
4867
- ret = btrfs_make_block_group(trans, 0, type, start, num_bytes);
5173
+ ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
48685174 if (ret)
48695175 goto error_del_extent;
48705176
48715177 for (i = 0; i < map->num_stripes; i++) {
4872
- num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4873
- btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4874
- map->stripes[i].dev->has_pending_chunks = true;
5178
+ struct btrfs_device *dev = map->stripes[i].dev;
5179
+
5180
+ btrfs_device_set_bytes_used(dev,
5181
+ dev->bytes_used + ctl->stripe_size);
5182
+ if (list_empty(&dev->post_commit_list))
5183
+ list_add_tail(&dev->post_commit_list,
5184
+ &trans->transaction->dev_update_list);
48755185 }
48765186
4877
- atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
5187
+ atomic64_sub(ctl->stripe_size * map->num_stripes,
5188
+ &info->free_chunk_space);
48785189
48795190 free_extent_map(em);
48805191 check_raid56_incompat_flag(info, type);
5192
+ check_raid1c34_incompat_flag(info, type);
48815193
4882
- kfree(devices_info);
48835194 return 0;
48845195
48855196 error_del_extent:
....@@ -4891,13 +5202,68 @@
48915202 free_extent_map(em);
48925203 /* One for the tree reference */
48935204 free_extent_map(em);
4894
- /* One for the pending_chunks list reference */
4895
- free_extent_map(em);
4896
-error:
5205
+
5206
+ return ret;
5207
+}
5208
+
5209
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5210
+{
5211
+ struct btrfs_fs_info *info = trans->fs_info;
5212
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
5213
+ struct btrfs_device_info *devices_info = NULL;
5214
+ struct alloc_chunk_ctl ctl;
5215
+ int ret;
5216
+
5217
+ lockdep_assert_held(&info->chunk_mutex);
5218
+
5219
+ if (!alloc_profile_is_valid(type, 0)) {
5220
+ ASSERT(0);
5221
+ return -EINVAL;
5222
+ }
5223
+
5224
+ if (list_empty(&fs_devices->alloc_list)) {
5225
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
5226
+ btrfs_debug(info, "%s: no writable device", __func__);
5227
+ return -ENOSPC;
5228
+ }
5229
+
5230
+ if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5231
+ btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5232
+ ASSERT(0);
5233
+ return -EINVAL;
5234
+ }
5235
+
5236
+ ctl.start = find_next_chunk(info);
5237
+ ctl.type = type;
5238
+ init_alloc_chunk_ctl(fs_devices, &ctl);
5239
+
5240
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5241
+ GFP_NOFS);
5242
+ if (!devices_info)
5243
+ return -ENOMEM;
5244
+
5245
+ ret = gather_device_info(fs_devices, &ctl, devices_info);
5246
+ if (ret < 0)
5247
+ goto out;
5248
+
5249
+ ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5250
+ if (ret < 0)
5251
+ goto out;
5252
+
5253
+ ret = create_chunk(trans, &ctl, devices_info);
5254
+
5255
+out:
48975256 kfree(devices_info);
48985257 return ret;
48995258 }
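btrfs_alloc_chunk() above is a fixed pipeline: initialize the control structure, gather per-device holes, decide the stripe geometry, then create the chunk, with the scratch devices_info array freed at out: on success and failure alike. A compressed sketch of that staging, with stub bodies standing in for the real helpers:

/* Stage stubs only; they mimic the flow, not the real work. */
#include <stdio.h>
#include <stdlib.h>

struct ctl { int ndevs; long long chunk_size; };

static int gather(struct ctl *c) { c->ndevs = 4; return 0; }
static int decide(struct ctl *c)
{
	if (c->ndevs < 1)
		return -1;	/* plays the -ENOSPC role */
	c->chunk_size = c->ndevs * (1LL << 30);
	return 0;
}
static int create(struct ctl *c) { (void)c; return 0; }

int main(void)
{
	struct ctl c = { 0 };
	int *scratch = calloc(16, sizeof(*scratch));	/* devices_info role */
	int ret;

	if (!scratch)
		return 1;
	ret = gather(&c);
	if (ret == 0)
		ret = decide(&c);
	if (ret == 0)
		ret = create(&c);
	free(scratch);	/* single exit point, like the out: label */
	printf("ret=%d chunk=%lld\n", ret, c.chunk_size);
	return 0;
}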
49005259
5260
+/*
5261
+ * Chunk allocation falls into two parts. The first part does work
5262
+ * that makes the newly allocated chunk usable, but does not do any operation
5263
+ * that modifies the chunk tree. The second part does the work that
5264
+ * requires modifying the chunk tree. This division is important for the
5265
+ * bootstrap process of adding storage to a seed btrfs.
5266
+ */
49015267 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
49025268 u64 chunk_offset, u64 chunk_size)
49035269 {
....@@ -4916,7 +5282,7 @@
49165282 int i = 0;
49175283 int ret = 0;
49185284
4919
- em = get_chunk_map(fs_info, chunk_offset, chunk_size);
5285
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
49205286 if (IS_ERR(em))
49215287 return PTR_ERR(em);
49225288
....@@ -4996,57 +5362,27 @@
49965362 return ret;
49975363 }
49985364
4999
-/*
5000
- * Chunk allocation falls into two parts. The first part does works
5001
- * that make the new allocated chunk useable, but not do any operation
5002
- * that modifies the chunk tree. The second part does the works that
5003
- * require modifying the chunk tree. This division is important for the
5004
- * bootstrap process of adding storage to a seed btrfs.
5005
- */
5006
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5365
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
50075366 {
5008
- u64 chunk_offset;
5009
-
5010
- lockdep_assert_held(&trans->fs_info->chunk_mutex);
5011
- chunk_offset = find_next_chunk(trans->fs_info);
5012
- return __btrfs_alloc_chunk(trans, chunk_offset, type);
5013
-}
5014
-
5015
-static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
5016
- struct btrfs_fs_info *fs_info)
5017
-{
5018
- u64 chunk_offset;
5019
- u64 sys_chunk_offset;
5367
+ struct btrfs_fs_info *fs_info = trans->fs_info;
50205368 u64 alloc_profile;
50215369 int ret;
50225370
5023
- chunk_offset = find_next_chunk(fs_info);
50245371 alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5025
- ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
5372
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
50265373 if (ret)
50275374 return ret;
50285375
5029
- sys_chunk_offset = find_next_chunk(fs_info);
50305376 alloc_profile = btrfs_system_alloc_profile(fs_info);
5031
- ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
5377
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
50325378 return ret;
50335379 }
50345380
50355381 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
50365382 {
5037
- int max_errors;
5383
+ const int index = btrfs_bg_flags_to_raid_index(map->type);
50385384
5039
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5040
- BTRFS_BLOCK_GROUP_RAID10 |
5041
- BTRFS_BLOCK_GROUP_RAID5)) {
5042
- max_errors = 1;
5043
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5044
- max_errors = 2;
5045
- } else {
5046
- max_errors = 0;
5047
- }
5048
-
5049
- return max_errors;
5385
+ return btrfs_raid_array[index].tolerated_failures;
50505386 }
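The one-line body above replaces hardcoded per-profile branches with a table lookup, so newly added profiles (raid1c3 and raid1c4 tolerate two and three lost devices) get correct answers without touching this function. A reduced model of the lookup; the enum and values restate the profiles for illustration only:

/* Table-lookup pattern in miniature; the real table is btrfs_raid_array. */
#include <stdio.h>

enum raid_idx { R10, R1, R1C3, R1C4, R5, R6, NR_RAID };

static const int tolerated_failures[NR_RAID] = {
	[R10] = 1, [R1] = 1, [R1C3] = 2, [R1C4] = 3, [R5] = 1, [R6] = 2,
};

int main(void)
{
	/* One lookup instead of a chain of conditionals. */
	printf("raid1c4 tolerates %d failures\n", tolerated_failures[R1C4]);
	return 0;
}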
50515387
50525388 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
....@@ -5057,7 +5393,7 @@
50575393 int miss_ndevs = 0;
50585394 int i;
50595395
5060
- em = get_chunk_map(fs_info, chunk_offset, 1);
5396
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
50615397 if (IS_ERR(em))
50625398 return 1;
50635399
....@@ -5087,21 +5423,16 @@
50875423 return readonly;
50885424 }
50895425
5090
-void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
5091
-{
5092
- extent_map_tree_init(&tree->map_tree);
5093
-}
5094
-
5095
-void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
5426
+void btrfs_mapping_tree_free(struct extent_map_tree *tree)
50965427 {
50975428 struct extent_map *em;
50985429
50995430 while (1) {
5100
- write_lock(&tree->map_tree.lock);
5101
- em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
5431
+ write_lock(&tree->lock);
5432
+ em = lookup_extent_mapping(tree, 0, (u64)-1);
51025433 if (em)
5103
- remove_extent_mapping(&tree->map_tree, em);
5104
- write_unlock(&tree->map_tree.lock);
5434
+ remove_extent_mapping(tree, em);
5435
+ write_unlock(&tree->lock);
51055436 if (!em)
51065437 break;
51075438 /* once for us */
....@@ -5117,7 +5448,7 @@
51175448 struct map_lookup *map;
51185449 int ret;
51195450
5120
- em = get_chunk_map(fs_info, logical, len);
5451
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51215452 if (IS_ERR(em))
51225453 /*
51235454 * We could return errors for these cases, but that could get
....@@ -5128,7 +5459,7 @@
51285459 return 1;
51295460
51305461 map = em->map_lookup;
5131
- if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
5462
+ if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
51325463 ret = map->num_stripes;
51335464 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
51345465 ret = map->sub_stripes;
....@@ -5147,11 +5478,11 @@
51475478 ret = 1;
51485479 free_extent_map(em);
51495480
5150
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
5481
+ down_read(&fs_info->dev_replace.rwsem);
51515482 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
51525483 fs_info->dev_replace.tgtdev)
51535484 ret++;
5154
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
5485
+ up_read(&fs_info->dev_replace.rwsem);
51555486
51565487 return ret;
51575488 }
....@@ -5163,7 +5494,7 @@
51635494 struct map_lookup *map;
51645495 unsigned long len = fs_info->sectorsize;
51655496
5166
- em = get_chunk_map(fs_info, logical, len);
5497
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51675498
51685499 if (!WARN_ON(IS_ERR(em))) {
51695500 map = em->map_lookup;
....@@ -5180,7 +5511,7 @@
51805511 struct map_lookup *map;
51815512 int ret = 0;
51825513
5183
- em = get_chunk_map(fs_info, logical, len);
5514
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51845515
51855516 if(!WARN_ON(IS_ERR(em))) {
51865517 map = em->map_lookup;
....@@ -5202,7 +5533,7 @@
52025533 struct btrfs_device *srcdev;
52035534
52045535 ASSERT((map->type &
5205
- (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
5536
+ (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
52065537
52075538 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
52085539 num_stripes = map->sub_stripes;
....@@ -5240,31 +5571,19 @@
52405571 return preferred_mirror;
52415572 }
52425573
5243
-static inline int parity_smaller(u64 a, u64 b)
5244
-{
5245
- return a > b;
5246
-}
5247
-
52485574 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
52495575 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
52505576 {
5251
- struct btrfs_bio_stripe s;
52525577 int i;
5253
- u64 l;
52545578 int again = 1;
52555579
52565580 while (again) {
52575581 again = 0;
52585582 for (i = 0; i < num_stripes - 1; i++) {
5259
- if (parity_smaller(bbio->raid_map[i],
5260
- bbio->raid_map[i+1])) {
5261
- s = bbio->stripes[i];
5262
- l = bbio->raid_map[i];
5263
- bbio->stripes[i] = bbio->stripes[i+1];
5264
- bbio->raid_map[i] = bbio->raid_map[i+1];
5265
- bbio->stripes[i+1] = s;
5266
- bbio->raid_map[i+1] = l;
5267
-
5583
+ /* Swap if parity is on a smaller index */
5584
+ if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5585
+ swap(bbio->stripes[i], bbio->stripes[i + 1]);
5586
+ swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
52685587 again = 1;
52695588 }
52705589 }
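Why ascending order does the job: the raid_map entries for parity use the sentinels RAID5_P_STRIPE and RAID6_Q_STRIPE, defined in the kernel as the two largest u64 values, so sorting by raid_map pushes them past every real logical address. A toy run of the same bubble pass:

/* Toy model; the sentinel definitions mirror the kernel's. */
#include <stdio.h>
#include <stdint.h>

#define RAID5_P_STRIPE ((uint64_t)-2)
#define RAID6_Q_STRIPE ((uint64_t)-1)

int main(void)
{
	uint64_t raid_map[4] = { RAID6_Q_STRIPE, 65536, RAID5_P_STRIPE, 0 };
	int n = 4, again = 1;

	while (again) {
		again = 0;
		for (int i = 0; i < n - 1; i++) {
			if (raid_map[i] > raid_map[i + 1]) {
				uint64_t tmp = raid_map[i];

				raid_map[i] = raid_map[i + 1];
				raid_map[i + 1] = tmp;
				again = 1;
			}
		}
	}
	for (int i = 0; i < n; i++)	/* prints 0, 65536, then the sentinels */
		printf("%llu\n", (unsigned long long)raid_map[i]);
	return 0;
}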
....@@ -5290,6 +5609,9 @@
52905609 atomic_set(&bbio->error, 0);
52915610 refcount_set(&bbio->refs, 1);
52925611
5612
+ bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5613
+ bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5614
+
52935615 return bbio;
52945616 }
52955617
....@@ -5313,12 +5635,13 @@
53135635 * replace.
53145636 */
53155637 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5316
- u64 logical, u64 length,
5638
+ u64 logical, u64 *length_ret,
53175639 struct btrfs_bio **bbio_ret)
53185640 {
53195641 struct extent_map *em;
53205642 struct map_lookup *map;
53215643 struct btrfs_bio *bbio;
5644
+ u64 length = *length_ret;
53225645 u64 offset;
53235646 u64 stripe_nr;
53245647 u64 stripe_nr_end;
....@@ -5339,7 +5662,7 @@
53395662 /* discard always return a bbio */
53405663 ASSERT(bbio_ret);
53415664
5342
- em = get_chunk_map(fs_info, logical, length);
5665
+ em = btrfs_get_chunk_map(fs_info, logical, length);
53435666 if (IS_ERR(em))
53445667 return PTR_ERR(em);
53455668
....@@ -5351,7 +5674,8 @@
53515674 }
53525675
53535676 offset = logical - em->start;
5354
- length = min_t(u64, em->len - offset, length);
5677
+ length = min_t(u64, em->start + em->len - logical, length);
5678
+ *length_ret = length;
53555679
53565680 stripe_len = map->stripe_len;
53575681 /*
....@@ -5391,7 +5715,7 @@
53915715 &remaining_stripes);
53925716 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
53935717 last_stripe *= sub_stripes;
5394
- } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5718
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
53955719 BTRFS_BLOCK_GROUP_DUP)) {
53965720 num_stripes = map->num_stripes;
53975721 } else {
....@@ -5635,6 +5959,106 @@
56355959 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
56365960 }
56375961
5962
+/*
5963
+ * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
5964
+ * tuple. This information is used to calculate how big a
5965
+ * particular bio can get before it straddles a stripe.
5966
+ *
5967
+ * @fs_info - the filesystem
5968
+ * @logical - address that we want to figure out the geometry of
5969
+ * @len - the length of IO we are going to perform, starting at @logical
5970
+ * @op - type of operation - write or read
5971
+ * @io_geom - pointer used to return values
5972
+ *
5973
+ * Returns < 0 if a chunk for the given logical address cannot be found
5974
+ * (which usually shouldn't happen unless @logical is corrupted), 0 otherwise.
5975
+ */
5976
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5977
+ u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
5978
+{
5979
+ struct extent_map *em;
5980
+ struct map_lookup *map;
5981
+ u64 offset;
5982
+ u64 stripe_offset;
5983
+ u64 stripe_nr;
5984
+ u64 stripe_len;
5985
+ u64 raid56_full_stripe_start = (u64)-1;
5986
+ int data_stripes;
5987
+ int ret = 0;
5988
+
5989
+ ASSERT(op != BTRFS_MAP_DISCARD);
5990
+
5991
+ em = btrfs_get_chunk_map(fs_info, logical, len);
5992
+ if (IS_ERR(em))
5993
+ return PTR_ERR(em);
5994
+
5995
+ map = em->map_lookup;
5996
+ /* Offset of this logical address in the chunk */
5997
+ offset = logical - em->start;
5998
+ /* Len of a stripe in a chunk */
5999
+ stripe_len = map->stripe_len;
6000
+ /* Stripe where this block falls in */
6001
+ stripe_nr = div64_u64(offset, stripe_len);
6002
+ /* Offset of stripe in the chunk */
6003
+ stripe_offset = stripe_nr * stripe_len;
6004
+ if (offset < stripe_offset) {
6005
+ btrfs_crit(fs_info,
6006
+"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
6007
+ stripe_offset, offset, em->start, logical, stripe_len);
6008
+ ret = -EINVAL;
6009
+ goto out;
6010
+ }
6011
+
6012
+ /* stripe_offset is the offset of this block in its stripe */
6013
+ stripe_offset = offset - stripe_offset;
6014
+ data_stripes = nr_data_stripes(map);
6015
+
6016
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6017
+ u64 max_len = stripe_len - stripe_offset;
6018
+
6019
+ /*
6020
+ * In case of raid56, we need to know the stripe-aligned start
6021
+ */
6022
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6023
+ unsigned long full_stripe_len = stripe_len * data_stripes;
6024
+ raid56_full_stripe_start = offset;
6025
+
6026
+ /*
6027
+ * Allow a write of a full stripe, but make sure we
6028
+ * don't allow straddling of stripes
6029
+ */
6030
+ raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6031
+ full_stripe_len);
6032
+ raid56_full_stripe_start *= full_stripe_len;
6033
+
6034
+ /*
6035
+ * For writes to RAID[56], allow a full stripeset across
6036
+ * all disks. For other RAID types and for RAID[56]
6037
+ * reads, just allow a single stripe (on a single disk).
6038
+ */
6039
+ if (op == BTRFS_MAP_WRITE) {
6040
+ max_len = stripe_len * data_stripes -
6041
+ (offset - raid56_full_stripe_start);
6042
+ }
6043
+ }
6044
+ len = min_t(u64, em->len - offset, max_len);
6045
+ } else {
6046
+ len = em->len - offset;
6047
+ }
6048
+
6049
+ io_geom->len = len;
6050
+ io_geom->offset = offset;
6051
+ io_geom->stripe_len = stripe_len;
6052
+ io_geom->stripe_nr = stripe_nr;
6053
+ io_geom->stripe_offset = stripe_offset;
6054
+ io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6055
+
6056
+out:
6057
+ /* once for us */
6058
+ free_extent_map(em);
6059
+ return ret;
6060
+}
6061
+
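Concrete numbers for the math above: with the 64 KiB BTRFS_STRIPE_LEN and an IO starting 200 KiB into a chunk, the IO lands 8 KiB into stripe number 3 and can cover at most 56 KiB before straddling into the next stripe. A standalone check of that arithmetic (chunk start chosen arbitrarily):

/* Worked example of the geometry computation; all offsets hypothetical. */
#include <stdio.h>
#include <stdint.h>

#define BTRFS_STRIPE_LEN (64 * 1024ULL)

int main(void)
{
	uint64_t chunk_start = 1048576;			/* em->start */
	uint64_t logical = chunk_start + 204800;	/* 200 KiB in */

	uint64_t offset = logical - chunk_start;
	uint64_t stripe_nr = offset / BTRFS_STRIPE_LEN;			/* 3 */
	uint64_t stripe_offset = offset - stripe_nr * BTRFS_STRIPE_LEN;	/* 8 KiB */
	uint64_t max_len = BTRFS_STRIPE_LEN - stripe_offset;		/* 56 KiB */

	printf("stripe_nr=%llu stripe_offset=%llu max_len=%llu\n",
	       (unsigned long long)stripe_nr,
	       (unsigned long long)stripe_offset,
	       (unsigned long long)max_len);
	return 0;
}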
56386062 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
56396063 enum btrfs_map_op op,
56406064 u64 logical, u64 *length,
....@@ -5643,11 +6067,11 @@
56436067 {
56446068 struct extent_map *em;
56456069 struct map_lookup *map;
5646
- u64 offset;
56476070 u64 stripe_offset;
56486071 u64 stripe_nr;
56496072 u64 stripe_len;
56506073 u32 stripe_index;
6074
+ int data_stripes;
56516075 int i;
56526076 int ret = 0;
56536077 int num_stripes;
....@@ -5660,81 +6084,34 @@
56606084 int patch_the_first_stripe_for_dev_replace = 0;
56616085 u64 physical_to_patch_in_first_stripe = 0;
56626086 u64 raid56_full_stripe_start = (u64)-1;
6087
+ struct btrfs_io_geometry geom;
56636088
5664
- if (op == BTRFS_MAP_DISCARD)
5665
- return __btrfs_map_block_for_discard(fs_info, logical,
5666
- *length, bbio_ret);
6089
+ ASSERT(bbio_ret);
6090
+ ASSERT(op != BTRFS_MAP_DISCARD);
56676091
5668
- em = get_chunk_map(fs_info, logical, *length);
5669
- if (IS_ERR(em))
5670
- return PTR_ERR(em);
6092
+ ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
6093
+ if (ret < 0)
6094
+ return ret;
56716095
6096
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
6097
+ ASSERT(!IS_ERR(em));
56726098 map = em->map_lookup;
5673
- offset = logical - em->start;
56746099
5675
- stripe_len = map->stripe_len;
5676
- stripe_nr = offset;
5677
- /*
5678
- * stripe_nr counts the total number of stripes we have to stride
5679
- * to get to this block
5680
- */
5681
- stripe_nr = div64_u64(stripe_nr, stripe_len);
6100
+ *length = geom.len;
6101
+ stripe_len = geom.stripe_len;
6102
+ stripe_nr = geom.stripe_nr;
6103
+ stripe_offset = geom.stripe_offset;
6104
+ raid56_full_stripe_start = geom.raid56_stripe_offset;
6105
+ data_stripes = nr_data_stripes(map);
56826106
5683
- stripe_offset = stripe_nr * stripe_len;
5684
- if (offset < stripe_offset) {
5685
- btrfs_crit(fs_info,
5686
- "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
5687
- stripe_offset, offset, em->start, logical,
5688
- stripe_len);
5689
- free_extent_map(em);
5690
- return -EINVAL;
5691
- }
5692
-
5693
- /* stripe_offset is the offset of this block in its stripe*/
5694
- stripe_offset = offset - stripe_offset;
5695
-
5696
- /* if we're here for raid56, we need to know the stripe aligned start */
5697
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5698
- unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
5699
- raid56_full_stripe_start = offset;
5700
-
5701
- /* allow a write of a full stripe, but make sure we don't
5702
- * allow straddling of stripes
5703
- */
5704
- raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5705
- full_stripe_len);
5706
- raid56_full_stripe_start *= full_stripe_len;
5707
- }
5708
-
5709
- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5710
- u64 max_len;
5711
- /* For writes to RAID[56], allow a full stripeset across all disks.
5712
- For other RAID types and for RAID[56] reads, just allow a single
5713
- stripe (on a single disk). */
5714
- if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
5715
- (op == BTRFS_MAP_WRITE)) {
5716
- max_len = stripe_len * nr_data_stripes(map) -
5717
- (offset - raid56_full_stripe_start);
5718
- } else {
5719
- /* we limit the length of each bio to what fits in a stripe */
5720
- max_len = stripe_len - stripe_offset;
5721
- }
5722
- *length = min_t(u64, em->len - offset, max_len);
5723
- } else {
5724
- *length = em->len - offset;
5725
- }
5726
-
5727
- /* This is for when we're called from btrfs_merge_bio_hook() and all
5728
- it cares about is the length */
5729
- if (!bbio_ret)
5730
- goto out;
5731
-
5732
- btrfs_dev_replace_read_lock(dev_replace);
6107
+ down_read(&dev_replace->rwsem);
57336108 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6109
+ /*
6110
+ * Hold the semaphore for read during the whole operation; write is
6111
+ * requested at commit time but must wait.
6112
+ */
57346113 if (!dev_replace_is_ongoing)
5735
- btrfs_dev_replace_read_unlock(dev_replace);
5736
- else
5737
- btrfs_dev_replace_set_lock_blocking(dev_replace);
6114
+ up_read(&dev_replace->rwsem);
57386115
57396116 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
57406117 !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
....@@ -5757,7 +6134,7 @@
57576134 &stripe_index);
57586135 if (!need_full_stripe(op))
57596136 mirror_num = 1;
5760
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
6137
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
57616138 if (need_full_stripe(op))
57626139 num_stripes = map->num_stripes;
57636140 else if (mirror_num)
....@@ -5799,7 +6176,7 @@
57996176 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
58006177 /* push stripe_nr back to the start of the full stripe */
58016178 stripe_nr = div64_u64(raid56_full_stripe_start,
5802
- stripe_len * nr_data_stripes(map));
6179
+ stripe_len * data_stripes);
58036180
58046181 /* RAID[56] write or recovery. Return all stripes */
58056182 num_stripes = map->num_stripes;
....@@ -5815,10 +6192,9 @@
58156192 * Mirror #3 is RAID6 Q block.
58166193 */
58176194 stripe_nr = div_u64_rem(stripe_nr,
5818
- nr_data_stripes(map), &stripe_index);
6195
+ data_stripes, &stripe_index);
58196196 if (mirror_num > 1)
5820
- stripe_index = nr_data_stripes(map) +
5821
- mirror_num - 2;
6197
+ stripe_index = data_stripes + mirror_num - 2;
58226198
58236199 /* We distribute the parity blocks across stripes */
58246200 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
....@@ -5858,8 +6234,13 @@
58586234 ret = -ENOMEM;
58596235 goto out;
58606236 }
5861
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
5862
- bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
6237
+
6238
+ for (i = 0; i < num_stripes; i++) {
6239
+ bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6240
+ stripe_offset + stripe_nr * map->stripe_len;
6241
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6242
+ stripe_index++;
6243
+ }
58636244
58646245 /* build raid_map */
58656246 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
....@@ -5867,17 +6248,12 @@
58676248 u64 tmp;
58686249 unsigned rot;
58696250
5870
- bbio->raid_map = (u64 *)((void *)bbio->stripes +
5871
- sizeof(struct btrfs_bio_stripe) *
5872
- num_alloc_stripes +
5873
- sizeof(int) * tgtdev_indexes);
5874
-
58756251 /* Work out the disk rotation on this stripe-set */
58766252 div_u64_rem(stripe_nr, num_stripes, &rot);
58776253
58786254 /* Fill in the logical address of each stripe */
5879
- tmp = stripe_nr * nr_data_stripes(map);
5880
- for (i = 0; i < nr_data_stripes(map); i++)
6255
+ tmp = stripe_nr * data_stripes;
6256
+ for (i = 0; i < data_stripes; i++)
58816257 bbio->raid_map[(i+rot) % num_stripes] =
58826258 em->start + (tmp + i) * map->stripe_len;
58836259
....@@ -5885,24 +6261,12 @@
58856261 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
58866262 bbio->raid_map[(i+rot+1) % num_stripes] =
58876263 RAID6_Q_STRIPE;
5888
- }
58896264
5890
-
5891
- for (i = 0; i < num_stripes; i++) {
5892
- bbio->stripes[i].physical =
5893
- map->stripes[stripe_index].physical +
5894
- stripe_offset +
5895
- stripe_nr * map->stripe_len;
5896
- bbio->stripes[i].dev =
5897
- map->stripes[stripe_index].dev;
5898
- stripe_index++;
6265
+ sort_parity_stripes(bbio, num_stripes);
58996266 }
59006267
59016268 if (need_full_stripe(op))
59026269 max_errors = btrfs_chunk_max_errors(map);
5903
-
5904
- if (bbio->raid_map)
5905
- sort_parity_stripes(bbio, num_stripes);
59066270
59076271 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
59086272 need_full_stripe(op)) {
....@@ -5929,8 +6293,9 @@
59296293 }
59306294 out:
59316295 if (dev_replace_is_ongoing) {
5932
- btrfs_dev_replace_clear_lock_blocking(dev_replace);
5933
- btrfs_dev_replace_read_unlock(dev_replace);
6296
+ lockdep_assert_held(&dev_replace->rwsem);
6297
+ /* Unlock and let waiting writers proceed */
6298
+ up_read(&dev_replace->rwsem);
59346299 }
59356300 free_extent_map(em);
59366301 return ret;
....@@ -5940,6 +6305,10 @@
59406305 u64 logical, u64 *length,
59416306 struct btrfs_bio **bbio_ret, int mirror_num)
59426307 {
6308
+ if (op == BTRFS_MAP_DISCARD)
6309
+ return __btrfs_map_block_for_discard(fs_info, logical,
6310
+ length, bbio_ret);
6311
+
59436312 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
59446313 mirror_num, 0);
59456314 }
....@@ -5950,75 +6319,6 @@
59506319 struct btrfs_bio **bbio_ret)
59516320 {
59526321 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
5953
-}
5954
-
5955
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
5956
- u64 physical, u64 **logical, int *naddrs, int *stripe_len)
5957
-{
5958
- struct extent_map *em;
5959
- struct map_lookup *map;
5960
- u64 *buf;
5961
- u64 bytenr;
5962
- u64 length;
5963
- u64 stripe_nr;
5964
- u64 rmap_len;
5965
- int i, j, nr = 0;
5966
-
5967
- em = get_chunk_map(fs_info, chunk_start, 1);
5968
- if (IS_ERR(em))
5969
- return -EIO;
5970
-
5971
- map = em->map_lookup;
5972
- length = em->len;
5973
- rmap_len = map->stripe_len;
5974
-
5975
- if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5976
- length = div_u64(length, map->num_stripes / map->sub_stripes);
5977
- else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5978
- length = div_u64(length, map->num_stripes);
5979
- else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5980
- length = div_u64(length, nr_data_stripes(map));
5981
- rmap_len = map->stripe_len * nr_data_stripes(map);
5982
- }
5983
-
5984
- buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
5985
- BUG_ON(!buf); /* -ENOMEM */
5986
-
5987
- for (i = 0; i < map->num_stripes; i++) {
5988
- if (map->stripes[i].physical > physical ||
5989
- map->stripes[i].physical + length <= physical)
5990
- continue;
5991
-
5992
- stripe_nr = physical - map->stripes[i].physical;
5993
- stripe_nr = div64_u64(stripe_nr, map->stripe_len);
5994
-
5995
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5996
- stripe_nr = stripe_nr * map->num_stripes + i;
5997
- stripe_nr = div_u64(stripe_nr, map->sub_stripes);
5998
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5999
- stripe_nr = stripe_nr * map->num_stripes + i;
6000
- } /* else if RAID[56], multiply by nr_data_stripes().
6001
- * Alternatively, just use rmap_len below instead of
6002
- * map->stripe_len */
6003
-
6004
- bytenr = chunk_start + stripe_nr * rmap_len;
6005
- WARN_ON(nr >= map->num_stripes);
6006
- for (j = 0; j < nr; j++) {
6007
- if (buf[j] == bytenr)
6008
- break;
6009
- }
6010
- if (j == nr) {
6011
- WARN_ON(nr >= map->num_stripes);
6012
- buf[nr++] = bytenr;
6013
- }
6014
- }
6015
-
6016
- *logical = buf;
6017
- *naddrs = nr;
6018
- *stripe_len = rmap_len;
6019
-
6020
- free_extent_map(em);
6021
- return 0;
60226322 }
60236323
60246324 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
....@@ -6039,23 +6339,18 @@
60396339 atomic_inc(&bbio->error);
60406340 if (bio->bi_status == BLK_STS_IOERR ||
60416341 bio->bi_status == BLK_STS_TARGET) {
6042
- unsigned int stripe_index =
6043
- btrfs_io_bio(bio)->stripe_index;
6044
- struct btrfs_device *dev;
6342
+ struct btrfs_device *dev = btrfs_io_bio(bio)->device;
60456343
6046
- BUG_ON(stripe_index >= bbio->num_stripes);
6047
- dev = bbio->stripes[stripe_index].dev;
6048
- if (dev->bdev) {
6049
- if (bio_op(bio) == REQ_OP_WRITE)
6050
- btrfs_dev_stat_inc_and_print(dev,
6344
+ ASSERT(dev->bdev);
6345
+ if (bio_op(bio) == REQ_OP_WRITE)
6346
+ btrfs_dev_stat_inc_and_print(dev,
60516347 BTRFS_DEV_STAT_WRITE_ERRS);
6052
- else if (!(bio->bi_opf & REQ_RAHEAD))
6053
- btrfs_dev_stat_inc_and_print(dev,
6348
+ else if (!(bio->bi_opf & REQ_RAHEAD))
6349
+ btrfs_dev_stat_inc_and_print(dev,
60546350 BTRFS_DEV_STAT_READ_ERRS);
6055
- if (bio->bi_opf & REQ_PREFLUSH)
6056
- btrfs_dev_stat_inc_and_print(dev,
6351
+ if (bio->bi_opf & REQ_PREFLUSH)
6352
+ btrfs_dev_stat_inc_and_print(dev,
60576353 BTRFS_DEV_STAT_FLUSH_ERRS);
6058
- }
60596354 }
60606355 }
60616356
....@@ -6090,73 +6385,25 @@
60906385 }
60916386 }
60926387
6093
-/*
6094
- * see run_scheduled_bios for a description of why bios are collected for
6095
- * async submit.
6096
- *
6097
- * This will add one bio to the pending list for a device and make sure
6098
- * the work struct is scheduled.
6099
- */
6100
-static noinline void btrfs_schedule_bio(struct btrfs_device *device,
6101
- struct bio *bio)
6102
-{
6103
- struct btrfs_fs_info *fs_info = device->fs_info;
6104
- int should_queue = 1;
6105
- struct btrfs_pending_bios *pending_bios;
6106
-
6107
- /* don't bother with additional async steps for reads, right now */
6108
- if (bio_op(bio) == REQ_OP_READ) {
6109
- btrfsic_submit_bio(bio);
6110
- return;
6111
- }
6112
-
6113
- WARN_ON(bio->bi_next);
6114
- bio->bi_next = NULL;
6115
-
6116
- spin_lock(&device->io_lock);
6117
- if (op_is_sync(bio->bi_opf))
6118
- pending_bios = &device->pending_sync_bios;
6119
- else
6120
- pending_bios = &device->pending_bios;
6121
-
6122
- if (pending_bios->tail)
6123
- pending_bios->tail->bi_next = bio;
6124
-
6125
- pending_bios->tail = bio;
6126
- if (!pending_bios->head)
6127
- pending_bios->head = bio;
6128
- if (device->running_pending)
6129
- should_queue = 0;
6130
-
6131
- spin_unlock(&device->io_lock);
6132
-
6133
- if (should_queue)
6134
- btrfs_queue_work(fs_info->submit_workers, &device->work);
6135
-}
6136
-
61376388 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6138
- u64 physical, int dev_nr, int async)
6389
+ u64 physical, struct btrfs_device *dev)
61396390 {
6140
- struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
61416391 struct btrfs_fs_info *fs_info = bbio->fs_info;
61426392
61436393 bio->bi_private = bbio;
6144
- btrfs_io_bio(bio)->stripe_index = dev_nr;
6394
+ btrfs_io_bio(bio)->device = dev;
61456395 bio->bi_end_io = btrfs_end_bio;
61466396 bio->bi_iter.bi_sector = physical >> 9;
61476397 btrfs_debug_in_rcu(fs_info,
61486398 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
61496399 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
6150
- (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
6151
- bio->bi_iter.bi_size);
6400
+ (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6401
+ dev->devid, bio->bi_iter.bi_size);
61526402 bio_set_dev(bio, dev->bdev);
61536403
61546404 btrfs_bio_counter_inc_noblocked(fs_info);
61556405
6156
- if (async)
6157
- btrfs_schedule_bio(dev, bio);
6158
- else
6159
- btrfsic_submit_bio(bio);
6406
+ btrfsic_submit_bio(bio);
61606407 }
61616408
61626409 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
....@@ -6177,7 +6424,7 @@
61776424 }
61786425
61796426 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6180
- int mirror_num, int async_submit)
6427
+ int mirror_num)
61816428 {
61826429 struct btrfs_device *dev;
61836430 struct bio *first_bio = bio;
....@@ -6245,8 +6492,7 @@
62456492 else
62466493 bio = first_bio;
62476494
6248
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
6249
- dev_nr, async_submit);
6495
+ submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
62506496 }
62516497 btrfs_bio_counter_dec(fs_info);
62526498 return BLK_STS_OK;
....@@ -6262,15 +6508,25 @@
62626508 * If @seed is true, traverse through the seed devices.
62636509 */
62646510 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6265
- u64 devid, u8 *uuid, u8 *fsid,
6266
- bool seed)
6511
+ u64 devid, u8 *uuid, u8 *fsid,
6512
+ bool seed)
62676513 {
62686514 struct btrfs_device *device;
6515
+ struct btrfs_fs_devices *seed_devs;
62696516
6270
- while (fs_devices) {
6517
+ if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6518
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
6519
+ if (device->devid == devid &&
6520
+ (!uuid || memcmp(device->uuid, uuid,
6521
+ BTRFS_UUID_SIZE) == 0))
6522
+ return device;
6523
+ }
6524
+ }
6525
+
6526
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
62716527 if (!fsid ||
6272
- !memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
6273
- list_for_each_entry(device, &fs_devices->devices,
6528
+ !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6529
+ list_for_each_entry(device, &seed_devs->devices,
62746530 dev_list) {
62756531 if (device->devid == devid &&
62766532 (!uuid || memcmp(device->uuid, uuid,
....@@ -6278,11 +6534,8 @@
62786534 return device;
62796535 }
62806536 }
6281
- if (seed)
6282
- fs_devices = fs_devices->seed;
6283
- else
6284
- return NULL;
62856537 }
6538
+
62866539 return NULL;
62876540 }
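The lookup above now runs in two phases: the filesystem's own device list first, then the device list of each seed filesystem anchored on seed_list. A compact userspace model of that shape, with invented types and fixed arrays standing in for the kernel's linked lists:

/* Two-phase lookup sketch; all types and data are made up. */
#include <stdio.h>
#include <stddef.h>

struct device { unsigned long long devid; };

struct fs_devices {
	struct device devs[2];
	int ndevs;
	struct fs_devices *seeds[2];	/* stands in for seed_list */
	int nseeds;
};

static struct device *find_device(struct fs_devices *fsd,
				  unsigned long long devid)
{
	for (int i = 0; i < fsd->ndevs; i++)
		if (fsd->devs[i].devid == devid)
			return &fsd->devs[i];
	for (int s = 0; s < fsd->nseeds; s++)
		for (int i = 0; i < fsd->seeds[s]->ndevs; i++)
			if (fsd->seeds[s]->devs[i].devid == devid)
				return &fsd->seeds[s]->devs[i];
	return NULL;
}

int main(void)
{
	struct fs_devices seed = { .devs = { { 1 } }, .ndevs = 1 };
	struct fs_devices fs = { .devs = { { 2 }, { 3 } }, .ndevs = 2,
				 .seeds = { &seed }, .nseeds = 1 };

	printf("devid 1 %s\n", find_device(&fs, 1) ? "found" : "missing");
	return 0;
}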
62886541
....@@ -6337,7 +6590,7 @@
63376590 if (WARN_ON(!devid && !fs_info))
63386591 return ERR_PTR(-EINVAL);
63396592
6340
- dev = __alloc_device();
6593
+ dev = __alloc_device(fs_info);
63416594 if (IS_ERR(dev))
63426595 return dev;
63436596
....@@ -6359,9 +6612,6 @@
63596612 else
63606613 generate_random_uuid(dev->uuid);
63616614
6362
- btrfs_init_work(&dev->work, btrfs_submit_helper,
6363
- pending_bios_fn, NULL, NULL);
6364
-
63656615 return dev;
63666616 }
63676617
....@@ -6376,11 +6626,26 @@
63766626 devid, uuid);
63776627 }
63786628
6379
-static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
6380
- struct extent_buffer *leaf,
6629
+static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6630
+{
6631
+ int index = btrfs_bg_flags_to_raid_index(type);
6632
+ int ncopies = btrfs_raid_array[index].ncopies;
6633
+ const int nparity = btrfs_raid_array[index].nparity;
6634
+ int data_stripes;
6635
+
6636
+ if (nparity)
6637
+ data_stripes = num_stripes - nparity;
6638
+ else
6639
+ data_stripes = num_stripes / ncopies;
6640
+
6641
+ return div_u64(chunk_len, data_stripes);
6642
+}
6643
+
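Worked values for calc_stripe_length() above: a 2 GiB RAID10 chunk with four stripes and two copies puts 1 GiB on each device, while a 2 GiB RAID6 chunk with six stripes and two parity stripes puts 512 MiB on each. A standalone version of the computation, with the profile parameters passed in directly instead of read from btrfs_raid_array:

/* Worked example; parameters mirror the raid table entries. */
#include <stdio.h>
#include <stdint.h>

static uint64_t calc_stripe_length(uint64_t chunk_len, int num_stripes,
				   int ncopies, int nparity)
{
	int data_stripes = nparity ? num_stripes - nparity
				   : num_stripes / ncopies;

	return chunk_len / data_stripes;
}

int main(void)
{
	uint64_t chunk_len = 2ULL << 30;	/* a 2 GiB chunk */

	/* RAID10: 4 stripes, 2 copies -> 1 GiB per device */
	printf("raid10: %llu\n",
	       (unsigned long long)calc_stripe_length(chunk_len, 4, 2, 0));
	/* RAID6: 6 stripes, 2 parity -> 512 MiB per device */
	printf("raid6:  %llu\n",
	       (unsigned long long)calc_stripe_length(chunk_len, 6, 1, 2));
	return 0;
}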
6644
+static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
63816645 struct btrfs_chunk *chunk)
63826646 {
6383
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
6647
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
6648
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
63846649 struct map_lookup *map;
63856650 struct extent_map *em;
63866651 u64 logical;
....@@ -6400,14 +6665,14 @@
64006665 * as chunk item in tree block is already verified by tree-checker.
64016666 */
64026667 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6403
- ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
6668
+ ret = btrfs_check_chunk_valid(leaf, chunk, logical);
64046669 if (ret)
64056670 return ret;
64066671 }
64076672
6408
- read_lock(&map_tree->map_tree.lock);
6409
- em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
6410
- read_unlock(&map_tree->map_tree.lock);
6673
+ read_lock(&map_tree->lock);
6674
+ em = lookup_extent_mapping(map_tree, logical, 1);
6675
+ read_unlock(&map_tree->lock);
64116676
64126677 /* already mapped? */
64136678 if (em && em->start <= logical && em->start + em->len > logical) {
....@@ -6441,6 +6706,8 @@
64416706 map->type = btrfs_chunk_type(leaf, chunk);
64426707 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
64436708 map->verified_stripes = 0;
6709
+ em->orig_block_len = calc_stripe_length(map->type, em->len,
6710
+ map->num_stripes);
64446711 for (i = 0; i < num_stripes; i++) {
64456712 map->stripes[i].physical =
64466713 btrfs_stripe_offset_nr(leaf, chunk, i);
....@@ -6449,7 +6716,7 @@
64496716 btrfs_stripe_dev_uuid_nr(chunk, i),
64506717 BTRFS_UUID_SIZE);
64516718 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6452
- devid, uuid, NULL, true);
6719
+ devid, uuid, NULL, true);
64536720 if (!map->stripes[i].dev &&
64546721 !btrfs_test_opt(fs_info, DEGRADED)) {
64556722 free_extent_map(em);
....@@ -6474,9 +6741,9 @@
64746741
64756742 }
64766743
6477
- write_lock(&map_tree->map_tree.lock);
6478
- ret = add_extent_mapping(&map_tree->map_tree, em, 0);
6479
- write_unlock(&map_tree->map_tree.lock);
6744
+ write_lock(&map_tree->lock);
6745
+ ret = add_extent_mapping(map_tree, em, 0);
6746
+ write_unlock(&map_tree->lock);
64806747 if (ret < 0) {
64816748 btrfs_err(fs_info,
64826749 "failed to add chunk map, start=%llu len=%llu: %d",
....@@ -6519,28 +6786,30 @@
65196786 lockdep_assert_held(&uuid_mutex);
65206787 ASSERT(fsid);
65216788
6522
- fs_devices = fs_info->fs_devices->seed;
6523
- while (fs_devices) {
6789
+ /* This will match only for multi-device seed fs */
6790
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
65246791 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
65256792 return fs_devices;
65266793
6527
- fs_devices = fs_devices->seed;
6528
- }
65296794
6530
- fs_devices = find_fsid(fsid);
6795
+ fs_devices = find_fsid(fsid, NULL);
65316796 if (!fs_devices) {
65326797 if (!btrfs_test_opt(fs_info, DEGRADED))
65336798 return ERR_PTR(-ENOENT);
65346799
6535
- fs_devices = alloc_fs_devices(fsid);
6800
+ fs_devices = alloc_fs_devices(fsid, NULL);
65366801 if (IS_ERR(fs_devices))
65376802 return fs_devices;
65386803
6539
- fs_devices->seeding = 1;
6804
+ fs_devices->seeding = true;
65406805 fs_devices->opened = 1;
65416806 return fs_devices;
65426807 }
65436808
6809
+ /*
6810
+ * Upon first call for a seed fs fsid, just create a private copy of the
6811
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6812
+ */
65446813 fs_devices = clone_fs_devices(fs_devices);
65456814 if (IS_ERR(fs_devices))
65466815 return fs_devices;
....@@ -6548,27 +6817,24 @@
65486817 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
65496818 if (ret) {
65506819 free_fs_devices(fs_devices);
6551
- fs_devices = ERR_PTR(ret);
6552
- goto out;
6820
+ return ERR_PTR(ret);
65536821 }
65546822
65556823 if (!fs_devices->seeding) {
65566824 close_fs_devices(fs_devices);
65576825 free_fs_devices(fs_devices);
6558
- fs_devices = ERR_PTR(-EINVAL);
6559
- goto out;
6826
+ return ERR_PTR(-EINVAL);
65606827 }
65616828
6562
- fs_devices->seed = fs_info->fs_devices->seed;
6563
- fs_info->fs_devices->seed = fs_devices;
6564
-out:
6829
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
6830
+
65656831 return fs_devices;
65666832 }
65676833
6568
-static int read_one_dev(struct btrfs_fs_info *fs_info,
6569
- struct extent_buffer *leaf,
6834
+static int read_one_dev(struct extent_buffer *leaf,
65706835 struct btrfs_dev_item *dev_item)
65716836 {
6837
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
65726838 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
65736839 struct btrfs_device *device;
65746840 u64 devid;
....@@ -6582,7 +6848,7 @@
65826848 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
65836849 BTRFS_FSID_SIZE);
65846850
6585
- if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
6851
+ if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
65866852 fs_devices = open_seed_devices(fs_info, fs_uuid);
65876853 if (IS_ERR(fs_devices))
65886854 return PTR_ERR(fs_devices);
....@@ -6725,48 +6991,49 @@
67256991 sb_array_offset += len;
67266992 cur_offset += len;
67276993
6728
- if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6729
- chunk = (struct btrfs_chunk *)sb_array_offset;
6730
- /*
6731
- * At least one btrfs_chunk with one stripe must be
6732
- * present, exact stripe count check comes afterwards
6733
- */
6734
- len = btrfs_chunk_item_size(1);
6735
- if (cur_offset + len > array_size)
6736
- goto out_short_read;
6737
-
6738
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6739
- if (!num_stripes) {
6740
- btrfs_err(fs_info,
6741
- "invalid number of stripes %u in sys_array at offset %u",
6742
- num_stripes, cur_offset);
6743
- ret = -EIO;
6744
- break;
6745
- }
6746
-
6747
- type = btrfs_chunk_type(sb, chunk);
6748
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
6749
- btrfs_err(fs_info,
6750
- "invalid chunk type %llu in sys_array at offset %u",
6751
- type, cur_offset);
6752
- ret = -EIO;
6753
- break;
6754
- }
6755
-
6756
- len = btrfs_chunk_item_size(num_stripes);
6757
- if (cur_offset + len > array_size)
6758
- goto out_short_read;
6759
-
6760
- ret = read_one_chunk(fs_info, &key, sb, chunk);
6761
- if (ret)
6762
- break;
6763
- } else {
6994
+ if (key.type != BTRFS_CHUNK_ITEM_KEY) {
67646995 btrfs_err(fs_info,
67656996 "unexpected item type %u in sys_array at offset %u",
67666997 (u32)key.type, cur_offset);
67676998 ret = -EIO;
67686999 break;
67697000 }
7001
+
7002
+ chunk = (struct btrfs_chunk *)sb_array_offset;
7003
+ /*
7004
+ * At least one btrfs_chunk with one stripe must be present;
7005
+ * the exact stripe count check comes afterwards
7006
+ */
7007
+ len = btrfs_chunk_item_size(1);
7008
+ if (cur_offset + len > array_size)
7009
+ goto out_short_read;
7010
+
7011
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7012
+ if (!num_stripes) {
7013
+ btrfs_err(fs_info,
7014
+ "invalid number of stripes %u in sys_array at offset %u",
7015
+ num_stripes, cur_offset);
7016
+ ret = -EIO;
7017
+ break;
7018
+ }
7019
+
7020
+ type = btrfs_chunk_type(sb, chunk);
7021
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7022
+ btrfs_err(fs_info,
7023
+ "invalid chunk type %llu in sys_array at offset %u",
7024
+ type, cur_offset);
7025
+ ret = -EIO;
7026
+ break;
7027
+ }
7028
+
7029
+ len = btrfs_chunk_item_size(num_stripes);
7030
+ if (cur_offset + len > array_size)
7031
+ goto out_short_read;
7032
+
7033
+ ret = read_one_chunk(&key, sb, chunk);
7034
+ if (ret)
7035
+ break;
7036
+
67707037 array_ptr += len;
67717038 sb_array_offset += len;
67727039 cur_offset += len;
....@@ -6794,14 +7061,14 @@
67947061 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
67957062 struct btrfs_device *failing_dev)
67967063 {
6797
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
7064
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
67987065 struct extent_map *em;
67997066 u64 next_start = 0;
68007067 bool ret = true;
68017068
6802
- read_lock(&map_tree->map_tree.lock);
6803
- em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
6804
- read_unlock(&map_tree->map_tree.lock);
7069
+ read_lock(&map_tree->lock);
7070
+ em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7071
+ read_unlock(&map_tree->lock);
68057072 /* No chunk at all? Return false anyway */
68067073 if (!em) {
68077074 ret = false;
....@@ -6830,7 +7097,7 @@
68307097 if (missing > max_tolerated) {
68317098 if (!failing_dev)
68327099 btrfs_warn(fs_info,
6833
- "chunk %llu missing %d devices, max tolerance is %d for writeable mount",
7100
+ "chunk %llu missing %d devices, max tolerance is %d for writable mount",
68347101 em->start, missing, max_tolerated);
68357102 free_extent_map(em);
68367103 ret = false;
....@@ -6839,13 +7106,26 @@
68397106 next_start = extent_map_end(em);
68407107 free_extent_map(em);
68417108
6842
- read_lock(&map_tree->map_tree.lock);
6843
- em = lookup_extent_mapping(&map_tree->map_tree, next_start,
7109
+ read_lock(&map_tree->lock);
7110
+ em = lookup_extent_mapping(map_tree, next_start,
68447111 (u64)(-1) - next_start);
6845
- read_unlock(&map_tree->map_tree.lock);
7112
+ read_unlock(&map_tree->lock);
68467113 }
68477114 out:
68487115 return ret;
7116
+}
7117
+
7118
+static void readahead_tree_node_children(struct extent_buffer *node)
7119
+{
7120
+ int i;
7121
+ const int nr_items = btrfs_header_nritems(node);
7122
+
7123
+ for (i = 0; i < nr_items; i++) {
7124
+ u64 start;
7125
+
7126
+ start = btrfs_node_blockptr(node, i);
7127
+ readahead_tree_block(node->fs_info, start);
7128
+ }
68497129 }
68507130
68517131 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
....@@ -6858,6 +7138,7 @@
68587138 int ret;
68597139 int slot;
68607140 u64 total_dev = 0;
7141
+ u64 last_ra_node = 0;
68617142
68627143 path = btrfs_alloc_path();
68637144 if (!path)
....@@ -6868,7 +7149,6 @@
68687149 * otherwise we don't need it.
68697150 */
68707151 mutex_lock(&uuid_mutex);
6871
- mutex_lock(&fs_info->chunk_mutex);
68727152
68737153 /*
68747154 * It is possible for mount and umount to race in such a way that
....@@ -6891,6 +7171,8 @@
68917171 if (ret < 0)
68927172 goto error;
68937173 while (1) {
7174
+ struct extent_buffer *node;
7175
+
68947176 leaf = path->nodes[0];
68957177 slot = path->slots[0];
68967178 if (slot >= btrfs_header_nritems(leaf)) {
....@@ -6901,19 +7183,32 @@
69017183 goto error;
69027184 break;
69037185 }
7186
+ /*
7187
+ * The nodes on level 1 are not locked, but we don't need to lock
7188
+ * them at mount time because nothing else can access the tree yet.
7189
+ */
7190
+ node = path->nodes[1];
7191
+ if (node) {
7192
+ if (last_ra_node != node->start) {
7193
+ readahead_tree_node_children(node);
7194
+ last_ra_node = node->start;
7195
+ }
7196
+ }
69047197 btrfs_item_key_to_cpu(leaf, &found_key, slot);
69057198 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
69067199 struct btrfs_dev_item *dev_item;
69077200 dev_item = btrfs_item_ptr(leaf, slot,
69087201 struct btrfs_dev_item);
6909
- ret = read_one_dev(fs_info, leaf, dev_item);
7202
+ ret = read_one_dev(leaf, dev_item);
69107203 if (ret)
69117204 goto error;
69127205 total_dev++;
69137206 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
69147207 struct btrfs_chunk *chunk;
69157208 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6916
- ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
7209
+ mutex_lock(&fs_info->chunk_mutex);
7210
+ ret = read_one_chunk(&found_key, leaf, chunk);
7211
+ mutex_unlock(&fs_info->chunk_mutex);
69177212 if (ret)
69187213 goto error;
69197214 }
....@@ -6925,12 +7220,12 @@
69257220 * do another round of validation checks.
69267221 */
69277222 if (total_dev != fs_info->fs_devices->total_devices) {
6928
- btrfs_err(fs_info,
6929
- "super_num_devices %llu mismatch with num_devices %llu found here",
7223
+ btrfs_warn(fs_info,
7224
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
69307225 btrfs_super_num_devices(fs_info->super_copy),
69317226 total_dev);
6932
- ret = -EINVAL;
6933
- goto error;
7227
+ fs_info->fs_devices->total_devices = total_dev;
7228
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
69347229 }
69357230 if (btrfs_super_total_bytes(fs_info->super_copy) <
69367231 fs_info->fs_devices->total_rw_bytes) {
....@@ -6943,7 +7238,6 @@
69437238 }
69447239 ret = 0;
69457240 error:
6946
- mutex_unlock(&fs_info->chunk_mutex);
69477241 mutex_unlock(&uuid_mutex);
69487242
69497243 btrfs_free_path(path);
....@@ -6952,86 +7246,117 @@
69527246
69537247 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
69547248 {
6955
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7249
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
69567250 struct btrfs_device *device;
69577251
6958
- while (fs_devices) {
6959
- mutex_lock(&fs_devices->device_list_mutex);
6960
- list_for_each_entry(device, &fs_devices->devices, dev_list)
6961
- device->fs_info = fs_info;
6962
- mutex_unlock(&fs_devices->device_list_mutex);
7252
+ fs_devices->fs_info = fs_info;
69637253
6964
- fs_devices = fs_devices->seed;
7254
+ mutex_lock(&fs_devices->device_list_mutex);
7255
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
7256
+ device->fs_info = fs_info;
7257
+
7258
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7259
+ list_for_each_entry(device, &seed_devs->devices, dev_list)
7260
+ device->fs_info = fs_info;
7261
+
7262
+ seed_devs->fs_info = fs_info;
69657263 }
7264
+ mutex_unlock(&fs_devices->device_list_mutex);
69667265 }
69677266
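btrfs_init_devices_late() now reaches seed device sets through a regular list_head (seed_list) instead of chasing the old fs_devices->seed pointer chain, so every walk becomes a plain list_for_each_entry(). A kernel-style sketch of the new shape, with illustrative struct names and assuming <linux/list.h>:

#include <linux/list.h>

/* Illustrative mirror of the relevant part of struct btrfs_fs_devices:
 * in the sprouted filesystem, seed_list anchors the list; in each seed
 * device set, the same member is the link on that list. */
struct fs_devices_sketch {
	struct list_head seed_list;
};

static void for_each_seed_sketch(struct fs_devices_sketch *fs_devices)
{
	struct fs_devices_sketch *seed;

	list_for_each_entry(seed, &fs_devices->seed_list, seed_list)
		;	/* visit each seed device set, in list order */
}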
6968
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
7267
+static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7268
+ const struct btrfs_dev_stats_item *ptr,
7269
+ int index)
69697270 {
6970
- int i;
7271
+ u64 val;
69717272
6972
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6973
- btrfs_dev_stat_reset(dev, i);
7273
+ read_extent_buffer(eb, &val,
7274
+ offsetof(struct btrfs_dev_stats_item, values) +
7275
+ ((unsigned long)ptr) + (index * sizeof(u64)),
7276
+ sizeof(val));
7277
+ return val;
7278
+}
7279
+
7280
+static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7281
+ struct btrfs_dev_stats_item *ptr,
7282
+ int index, u64 val)
7283
+{
7284
+ write_extent_buffer(eb, &val,
7285
+ offsetof(struct btrfs_dev_stats_item, values) +
7286
+ ((unsigned long)ptr) + (index * sizeof(u64)),
7287
+ sizeof(val));
7288
+}
7289
+
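Both helpers above locate values[index] inside the on-disk item by plain offset arithmetic before handing the position to the extent-buffer accessors. The same computation in isolation, with an illustrative struct rather than the real on-disk layout:

#include <stdint.h>
#include <stddef.h>

struct dev_stats_item_sketch {
	uint64_t values[5];	/* BTRFS_DEV_STAT_VALUES_MAX */
};

/* Byte offset of values[index] within the leaf, given the item's own
 * start offset. */
static size_t stat_value_offset(size_t item_start, int index)
{
	return item_start +
	       offsetof(struct dev_stats_item_sketch, values) +
	       (size_t)index * sizeof(uint64_t);
}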
7290
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7291
+ struct btrfs_path *path)
7292
+{
7293
+ struct btrfs_dev_stats_item *ptr;
7294
+ struct extent_buffer *eb;
7295
+ struct btrfs_key key;
7296
+ int item_size;
7297
+ int i, ret, slot;
7298
+
7299
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
7300
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
7301
+ key.offset = device->devid;
7302
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7303
+ if (ret) {
7304
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7305
+ btrfs_dev_stat_set(device, i, 0);
7306
+ device->dev_stats_valid = 1;
7307
+ btrfs_release_path(path);
7308
+ return ret < 0 ? ret : 0;
7309
+ }
7310
+ slot = path->slots[0];
7311
+ eb = path->nodes[0];
7312
+ item_size = btrfs_item_size_nr(eb, slot);
7313
+
7314
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7315
+
7316
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7317
+ if (item_size >= (1 + i) * sizeof(__le64))
7318
+ btrfs_dev_stat_set(device, i,
7319
+ btrfs_dev_stats_value(eb, ptr, i));
7320
+ else
7321
+ btrfs_dev_stat_set(device, i, 0);
7322
+ }
7323
+
7324
+ device->dev_stats_valid = 1;
7325
+ btrfs_dev_stat_print_on_load(device);
7326
+ btrfs_release_path(path);
7327
+
7328
+ return 0;
69747329 }
69757330
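The `item_size >= (1 + i) * sizeof(__le64)` test above makes the read compatible across versions: an item written by an older filesystem may carry fewer counters, and the missing slots are simply zeroed. A freestanding sketch, assuming the on-disk values have already been byte-swapped:

#include <stdint.h>
#include <stddef.h>

#define MAX_VALUES 5	/* BTRFS_DEV_STAT_VALUES_MAX */

static void load_dev_stats(const uint64_t *disk, size_t item_size,
			   uint64_t stats[MAX_VALUES])
{
	int i;

	for (i = 0; i < MAX_VALUES; i++) {
		if (item_size >= (size_t)(1 + i) * sizeof(uint64_t))
			stats[i] = disk[i];	/* counter present on disk */
		else
			stats[i] = 0;		/* older item: treat as zero */
	}
}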
69767331 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
69777332 {
6978
- struct btrfs_key key;
6979
- struct btrfs_key found_key;
6980
- struct btrfs_root *dev_root = fs_info->dev_root;
6981
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6982
- struct extent_buffer *eb;
6983
- int slot;
6984
- int ret = 0;
7333
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
69857334 struct btrfs_device *device;
69867335 struct btrfs_path *path = NULL;
6987
- int i;
7336
+ int ret = 0;
69887337
69897338 path = btrfs_alloc_path();
6990
- if (!path) {
6991
- ret = -ENOMEM;
6992
- goto out;
6993
- }
7339
+ if (!path)
7340
+ return -ENOMEM;
69947341
69957342 mutex_lock(&fs_devices->device_list_mutex);
69967343 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6997
- int item_size;
6998
- struct btrfs_dev_stats_item *ptr;
6999
-
7000
- key.objectid = BTRFS_DEV_STATS_OBJECTID;
7001
- key.type = BTRFS_PERSISTENT_ITEM_KEY;
7002
- key.offset = device->devid;
7003
- ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
7004
- if (ret) {
7005
- __btrfs_reset_dev_stats(device);
7006
- device->dev_stats_valid = 1;
7007
- btrfs_release_path(path);
7008
- continue;
7009
- }
7010
- slot = path->slots[0];
7011
- eb = path->nodes[0];
7012
- btrfs_item_key_to_cpu(eb, &found_key, slot);
7013
- item_size = btrfs_item_size_nr(eb, slot);
7014
-
7015
- ptr = btrfs_item_ptr(eb, slot,
7016
- struct btrfs_dev_stats_item);
7017
-
7018
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7019
- if (item_size >= (1 + i) * sizeof(__le64))
7020
- btrfs_dev_stat_set(device, i,
7021
- btrfs_dev_stats_value(eb, ptr, i));
7022
- else
7023
- btrfs_dev_stat_reset(device, i);
7024
- }
7025
-
7026
- device->dev_stats_valid = 1;
7027
- btrfs_dev_stat_print_on_load(device);
7028
- btrfs_release_path(path);
7344
+ ret = btrfs_device_init_dev_stats(device, path);
7345
+ if (ret)
7346
+ goto out;
70297347 }
7348
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7349
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
7350
+ ret = btrfs_device_init_dev_stats(device, path);
7351
+ if (ret)
7352
+ goto out;
7353
+ }
7354
+ }
7355
+out:
70307356 mutex_unlock(&fs_devices->device_list_mutex);
70317357
7032
-out:
70337358 btrfs_free_path(path);
7034
- return ret < 0 ? ret : 0;
7359
+ return ret;
70357360 }
70367361
70377362 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
....@@ -7102,9 +7427,9 @@
71027427 /*
71037428 * Called from commit_transaction(). Writes all changed device stats to disk.
71047429 */
7105
-int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
7106
- struct btrfs_fs_info *fs_info)
7430
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
71077431 {
7432
+ struct btrfs_fs_info *fs_info = trans->fs_info;
71087433 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
71097434 struct btrfs_device *device;
71107435 int stats_cnt;
....@@ -7187,8 +7512,8 @@
71877512 int i;
71887513
71897514 mutex_lock(&fs_devices->device_list_mutex);
7190
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid,
7191
- NULL, NULL, true);
7515
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
7516
+ true);
71927517 mutex_unlock(&fs_devices->device_list_mutex);
71937518
71947519 if (!dev) {
....@@ -7203,7 +7528,7 @@
72037528 stats->values[i] =
72047529 btrfs_dev_stat_read_and_reset(dev, i);
72057530 else
7206
- btrfs_dev_stat_reset(dev, i);
7531
+ btrfs_dev_stat_set(dev, i, 0);
72077532 }
72087533 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
72097534 current->comm, task_pid_nr(current));
....@@ -7217,101 +7542,35 @@
72177542 return 0;
72187543 }
72197544
7220
-void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
7221
-{
7222
- struct buffer_head *bh;
7223
- struct btrfs_super_block *disk_super;
7224
- int copy_num;
7225
-
7226
- if (!bdev)
7227
- return;
7228
-
7229
- for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
7230
- copy_num++) {
7231
-
7232
- if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
7233
- continue;
7234
-
7235
- disk_super = (struct btrfs_super_block *)bh->b_data;
7236
-
7237
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
7238
- set_buffer_dirty(bh);
7239
- sync_dirty_buffer(bh);
7240
- brelse(bh);
7241
- }
7242
-
7243
- /* Notify udev that device has changed */
7244
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
7245
-
7246
- /* Update ctime/mtime for device path for libblkid */
7247
- update_dev_time(device_path);
7248
-}
7249
-
72507545 /*
7251
- * Update the size of all devices, which is used for writing out the
7252
- * super blocks.
7546
+ * Update the size and bytes used for each device where it changed. This is
7547
+ * delayed since we would otherwise get errors while writing out the
7548
+ * superblocks.
7549
+ *
7550
+ * Must be invoked during transaction commit.
72537551 */
7254
-void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
7552
+void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
72557553 {
7256
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
72577554 struct btrfs_device *curr, *next;
72587555
7259
- if (list_empty(&fs_devices->resized_devices))
7556
+ ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7557
+
7558
+ if (list_empty(&trans->dev_update_list))
72607559 return;
72617560
7262
- mutex_lock(&fs_devices->device_list_mutex);
7263
- mutex_lock(&fs_info->chunk_mutex);
7264
- list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
7265
- resized_list) {
7266
- list_del_init(&curr->resized_list);
7561
+ /*
7562
+ * We don't need the device_list_mutex here. This list is owned by the
7563
+ * transaction and the transaction must complete before the device is
7564
+ * released.
7565
+ */
7566
+ mutex_lock(&trans->fs_info->chunk_mutex);
7567
+ list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7568
+ post_commit_list) {
7569
+ list_del_init(&curr->post_commit_list);
72677570 curr->commit_total_bytes = curr->disk_total_bytes;
7571
+ curr->commit_bytes_used = curr->bytes_used;
72687572 }
7269
- mutex_unlock(&fs_info->chunk_mutex);
7270
- mutex_unlock(&fs_devices->device_list_mutex);
7271
-}
7272
-
7273
-/* Must be invoked during the transaction commit */
7274
-void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
7275
-{
7276
- struct btrfs_fs_info *fs_info = trans->fs_info;
7277
- struct extent_map *em;
7278
- struct map_lookup *map;
7279
- struct btrfs_device *dev;
7280
- int i;
7281
-
7282
- if (list_empty(&trans->pending_chunks))
7283
- return;
7284
-
7285
- /* In order to kick the device replace finish process */
7286
- mutex_lock(&fs_info->chunk_mutex);
7287
- list_for_each_entry(em, &trans->pending_chunks, list) {
7288
- map = em->map_lookup;
7289
-
7290
- for (i = 0; i < map->num_stripes; i++) {
7291
- dev = map->stripes[i].dev;
7292
- dev->commit_bytes_used = dev->bytes_used;
7293
- dev->has_pending_chunks = false;
7294
- }
7295
- }
7296
- mutex_unlock(&fs_info->chunk_mutex);
7297
-}
7298
-
7299
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
7300
-{
7301
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7302
- while (fs_devices) {
7303
- fs_devices->fs_info = fs_info;
7304
- fs_devices = fs_devices->seed;
7305
- }
7306
-}
7307
-
7308
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
7309
-{
7310
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7311
- while (fs_devices) {
7312
- fs_devices->fs_info = NULL;
7313
- fs_devices = fs_devices->seed;
7314
- }
7573
+ mutex_unlock(&trans->fs_info->chunk_mutex);
73157574 }
73167575
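btrfs_commit_device_sizes() consumes trans->dev_update_list with the usual unlink-while-walking idiom; the _safe iterator is what allows list_del_init() inside the loop. The pattern in isolation (kernel style, assuming <linux/list.h>; names are illustrative):

#include <linux/list.h>

/* 'link' stands in for post_commit_list. */
struct dev_update_sketch {
	struct list_head link;
};

static void consume_updates(struct list_head *pending)
{
	struct dev_update_sketch *curr, *next;

	/* _safe: 'next' is fetched before 'curr' is unlinked. */
	list_for_each_entry_safe(curr, next, pending, link) {
		list_del_init(&curr->link);
		/* ... publish curr's committed sizes here ... */
	}
}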
73177576 /*
....@@ -7319,38 +7578,18 @@
73197578 */
73207579 int btrfs_bg_type_to_factor(u64 flags)
73217580 {
7322
- if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
7323
- BTRFS_BLOCK_GROUP_RAID10))
7324
- return 2;
7325
- return 1;
7581
+ const int index = btrfs_bg_flags_to_raid_index(flags);
7582
+
7583
+ return btrfs_raid_array[index].ncopies;
73267584 }
73277585
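The factor is simply ncopies for the profile, so the table-driven version also covers the raid1c3/raid1c4 profiles (3 and 4) added in this release, while RAID5/6 still report 1 because parity stripes are not copies. A worked example of what the factor means:

#include <stdint.h>

/* e.g. a 1 GiB RAID1 chunk (factor = ncopies = 2) occupies 2 GiB of
 * raw disk space. */
static uint64_t chunk_physical_bytes(uint64_t logical_len, int factor)
{
	return logical_len * (uint64_t)factor;
}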
73287586
7329
-static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
7330
-{
7331
- int index = btrfs_bg_flags_to_raid_index(type);
7332
- int ncopies = btrfs_raid_array[index].ncopies;
7333
- int data_stripes;
7334
-
7335
- switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
7336
- case BTRFS_BLOCK_GROUP_RAID5:
7337
- data_stripes = num_stripes - 1;
7338
- break;
7339
- case BTRFS_BLOCK_GROUP_RAID6:
7340
- data_stripes = num_stripes - 2;
7341
- break;
7342
- default:
7343
- data_stripes = num_stripes / ncopies;
7344
- break;
7345
- }
7346
- return div_u64(chunk_len, data_stripes);
7347
-}
73487587
73497588 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
73507589 u64 chunk_offset, u64 devid,
73517590 u64 physical_offset, u64 physical_len)
73527591 {
7353
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
7592
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
73547593 struct extent_map *em;
73557594 struct map_lookup *map;
73567595 struct btrfs_device *dev;
....@@ -7414,8 +7653,11 @@
74147653
74157654 /* It's possible this device is a dummy for seed device */
74167655 if (dev->disk_total_bytes == 0) {
7417
- dev = btrfs_find_device(fs_info->fs_devices->seed, devid,
7418
- NULL, NULL, false);
7656
+ struct btrfs_fs_devices *devs;
7657
+
7658
+ devs = list_first_entry(&fs_info->fs_devices->seed_list,
7659
+ struct btrfs_fs_devices, seed_list);
7660
+ dev = btrfs_find_device(devs, devid, NULL, NULL, false);
74197661 if (!dev) {
74207662 btrfs_err(fs_info, "failed to find seed devid %llu",
74217663 devid);
....@@ -7439,13 +7681,13 @@
74397681
74407682 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
74417683 {
7442
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
7684
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
74437685 struct extent_map *em;
74447686 struct rb_node *node;
74457687 int ret = 0;
74467688
74477689 read_lock(&em_tree->lock);
7448
- for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
7690
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
74497691 em = rb_entry(node, struct extent_map, rb_node);
74507692 if (em->map_lookup->num_stripes !=
74517693 em->map_lookup->verified_stripes) {
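The mapping tree is now an rb_root_cached, which keeps a pointer to the leftmost node so the first step of an in-order walk is O(1). A sketch of the iteration pattern (kernel style, assuming <linux/rbtree.h>):

#include <linux/rbtree.h>

static void walk_in_order(struct rb_root_cached *root)
{
	struct rb_node *node;

	/* rb_first_cached() just reads the cached leftmost pointer. */
	for (node = rb_first_cached(root); node; node = rb_next(node))
		;	/* visit rb_entry(node, ...) here */
}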
....@@ -7551,3 +7793,27 @@
75517793 btrfs_free_path(path);
75527794 return ret;
75537795 }
7796
+
7797
+/*
7798
+ * Check whether the given block group or device is pinned by any inode being
7799
+ * used as a swapfile.
7800
+ */
7801
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7802
+{
7803
+ struct btrfs_swapfile_pin *sp;
7804
+ struct rb_node *node;
7805
+
7806
+ spin_lock(&fs_info->swapfile_pins_lock);
7807
+ node = fs_info->swapfile_pins.rb_node;
7808
+ while (node) {
7809
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7810
+ if (ptr < sp->ptr)
7811
+ node = node->rb_left;
7812
+ else if (ptr > sp->ptr)
7813
+ node = node->rb_right;
7814
+ else
7815
+ break;
7816
+ }
7817
+ spin_unlock(&fs_info->swapfile_pins_lock);
7818
+ return node != NULL;
7819
+}
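Both block groups and devices are looked up by raw pointer value in the swapfile_pins tree, so the resize and remove paths can refuse to touch anything an active swapfile has pinned. A hypothetical caller sketch (assuming the btrfs internal headers; -ETXTBSY mirrors how btrfs reports an active swapfile elsewhere):

#include <linux/errno.h>

static int shrink_device_sketch(struct btrfs_fs_info *fs_info,
				struct btrfs_device *device)
{
	if (btrfs_pinned_by_swapfile(fs_info, device))
		return -ETXTBSY;	/* device busy with active swapfile */
	/* ... safe to proceed with the shrink ... */
	return 0;
}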