hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/fs/btrfs/volumes.c
@@ -7,7 +7,6 @@
 #include <linux/sched/mm.h>
 #include <linux/bio.h>
 #include <linux/slab.h>
-#include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
@@ -15,6 +14,8 @@
 #include <linux/semaphore.h>
 #include <linux/uuid.h>
 #include <linux/list_sort.h>
+#include <linux/namei.h>
+#include "misc.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -25,10 +26,12 @@
 #include "async-thread.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
-#include "math.h"
 #include "dev-replace.h"
 #include "sysfs.h"
 #include "tree-checker.h"
+#include "space-info.h"
+#include "block-group.h"
+#include "discard.h"

 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	[BTRFS_RAID_RAID10] = {
@@ -39,6 +42,7 @@
 		.tolerated_failures = 1,
 		.devs_increment = 2,
 		.ncopies = 2,
+		.nparity = 0,
 		.raid_name = "raid10",
 		.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
 		.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
@@ -51,9 +55,36 @@
 		.tolerated_failures = 1,
 		.devs_increment = 2,
 		.ncopies = 2,
+		.nparity = 0,
 		.raid_name = "raid1",
 		.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
 		.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+	},
+	[BTRFS_RAID_RAID1C3] = {
+		.sub_stripes = 1,
+		.dev_stripes = 1,
+		.devs_max = 3,
+		.devs_min = 3,
+		.tolerated_failures = 2,
+		.devs_increment = 3,
+		.ncopies = 3,
+		.nparity = 0,
+		.raid_name = "raid1c3",
+		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
+		.mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
+	},
+	[BTRFS_RAID_RAID1C4] = {
+		.sub_stripes = 1,
+		.dev_stripes = 1,
+		.devs_max = 4,
+		.devs_min = 4,
+		.tolerated_failures = 3,
+		.devs_increment = 4,
+		.ncopies = 4,
+		.nparity = 0,
+		.raid_name = "raid1c4",
+		.bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
+		.mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
 	},
 	[BTRFS_RAID_DUP] = {
 		.sub_stripes = 1,
@@ -63,6 +94,7 @@
 		.tolerated_failures = 0,
 		.devs_increment = 1,
 		.ncopies = 2,
+		.nparity = 0,
 		.raid_name = "dup",
 		.bg_flag = BTRFS_BLOCK_GROUP_DUP,
 		.mindev_error = 0,
@@ -75,6 +107,7 @@
 		.tolerated_failures = 0,
 		.devs_increment = 1,
 		.ncopies = 1,
+		.nparity = 0,
 		.raid_name = "raid0",
 		.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
 		.mindev_error = 0,
@@ -87,6 +120,7 @@
 		.tolerated_failures = 0,
 		.devs_increment = 1,
 		.ncopies = 1,
+		.nparity = 0,
 		.raid_name = "single",
 		.bg_flag = 0,
 		.mindev_error = 0,
@@ -99,6 +133,7 @@
 		.tolerated_failures = 1,
 		.devs_increment = 1,
 		.ncopies = 1,
+		.nparity = 1,
 		.raid_name = "raid5",
 		.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
 		.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
@@ -111,24 +146,79 @@
 		.tolerated_failures = 2,
 		.devs_increment = 1,
 		.ncopies = 1,
+		.nparity = 2,
 		.raid_name = "raid6",
 		.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
 		.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 	},
 };

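The nparity field added throughout this table records how many stripes of a chunk hold parity rather than data (1 for raid5, 2 for raid6, 0 everywhere else). As a rough illustration of why the allocator wants nparity next to ncopies, here is a minimal standalone C sketch (not part of the patch; the helper and values are illustrative) of how usable chunk capacity falls out of the two fields:

#include <stdio.h>
#include <stdint.h>

/* Illustrative mirror of the fields used by btrfs_raid_array. */
struct raid_attr {
	const char *name;
	int ncopies;	/* how many copies of the data are stored */
	int nparity;	/* stripes holding parity, not data (raid5/6) */
};

/* Hypothetical helper: usable bytes of a chunk striped over num_stripes
 * devices contributing stripe_len bytes each. Parity stripes store no
 * file data; the remaining stripes are divided among the copies. */
static uint64_t usable_bytes(const struct raid_attr *a, int num_stripes,
			     uint64_t stripe_len)
{
	return (uint64_t)(num_stripes - a->nparity) * stripe_len / a->ncopies;
}

int main(void)
{
	struct raid_attr raid6 = { "raid6", 1, 2 };
	struct raid_attr raid1c3 = { "raid1c3", 3, 0 };

	printf("%s: %llu\n", raid6.name,
	       (unsigned long long)usable_bytes(&raid6, 6, 1 << 20));
	printf("%s: %llu\n", raid1c3.name,
	       (unsigned long long)usable_bytes(&raid1c3, 3, 1 << 20));
	return 0;
}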
-const char *get_raid_name(enum btrfs_raid_types type)
+const char *btrfs_bg_type_to_raid_name(u64 flags)
 {
-	if (type >= BTRFS_NR_RAID_TYPES)
+	const int index = btrfs_bg_flags_to_raid_index(flags);
+
+	if (index >= BTRFS_NR_RAID_TYPES)
 		return NULL;

-	return btrfs_raid_array[type].raid_name;
+	return btrfs_raid_array[index].raid_name;
 }

-static int init_first_rw_device(struct btrfs_trans_handle *trans,
-				struct btrfs_fs_info *fs_info);
+/*
+ * Fill @buf with textual description of @bg_flags, no more than @size_buf
+ * bytes including the terminating null byte.
+ */
+void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
+{
+	int i;
+	int ret;
+	char *bp = buf;
+	u64 flags = bg_flags;
+	u32 size_bp = size_buf;
+
+	if (!flags) {
+		strcpy(bp, "NONE");
+		return;
+	}
+
+#define DESCRIBE_FLAG(flag, desc)					\
+	do {								\
+		if (flags & (flag)) {					\
+			ret = snprintf(bp, size_bp, "%s|", (desc));	\
+			if (ret < 0 || ret >= size_bp)			\
+				goto out_overflow;			\
+			size_bp -= ret;					\
+			bp += ret;					\
+			flags &= ~(flag);				\
+		}							\
+	} while (0)
+
+	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
+	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
+	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+
+	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
+			      btrfs_raid_array[i].raid_name);
+#undef DESCRIBE_FLAG
+
+	if (flags) {
+		ret = snprintf(bp, size_bp, "0x%llx|", flags);
+		size_bp -= ret;
+	}
+
+	if (size_bp < size_buf)
+		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
+
+	/*
+	 * The text is trimmed, it's up to the caller to provide a
+	 * sufficiently large buffer.
+	 */
+out_overflow:;
+}
+
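The DESCRIBE_FLAG macro consumes each known bit and appends "name|"; leftover bits are then printed as hex and the trailing '|' is stripped. A self-contained userspace model of the same pattern (flag values and names are assumptions for the demo, not the kernel's):

#include <stdio.h>
#include <stdint.h>

/* Userspace model of DESCRIBE_FLAG: consume known bits, emit "name|"
 * tokens, dump any leftover bits as hex, then drop the trailing '|'. */
static void describe(uint64_t flags, char *buf, unsigned size_buf)
{
	static const struct { uint64_t bit; const char *name; } table[] = {
		{ 1ULL << 0, "data" }, { 1ULL << 1, "system" },
		{ 1ULL << 2, "metadata" },
	};
	char *bp = buf;
	unsigned size_bp = size_buf;
	int ret;

	for (unsigned i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		if (!(flags & table[i].bit))
			continue;
		ret = snprintf(bp, size_bp, "%s|", table[i].name);
		if (ret < 0 || (unsigned)ret >= size_bp)
			return;		/* trimmed, like out_overflow */
		bp += ret;
		size_bp -= ret;
		flags &= ~table[i].bit;
	}
	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", (unsigned long long)flags);
		if (ret < 0 || (unsigned)ret >= size_bp)
			return;
		size_bp -= ret;
	}
	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0';	/* remove last '|' */
}

int main(void)
{
	char buf[64];

	describe((1ULL << 0) | (1ULL << 2) | (1ULL << 40), buf, sizeof(buf));
	puts(buf);	/* data|metadata|0x10000000000 */
	return 0;
}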
+static int init_first_rw_device(struct btrfs_trans_handle *trans);
 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
@@ -153,7 +243,7 @@
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
- * seeding, structure cloning, openning/closing devices at mount/umount time
+ * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
@@ -183,7 +273,9 @@
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
- * device is added/removed
+ * device is added/removed. Additionally it also protects post_commit_list of
+ * individual devices, since they can be added to the transaction's
+ * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
@@ -195,14 +287,13 @@
 * ============
 *
 * uuid_mutex
- *   volume_mutex
- *     device_list_mutex
- *       chunk_mutex
- *     balance_mutex
+ *   device_list_mutex
+ *     chunk_mutex
+ *   balance_mutex
 *
 *
- * Exclusive operations, BTRFS_FS_EXCL_OP
- * ======================================
+ * Exclusive operations
+ * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
@@ -228,30 +319,32 @@
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
- * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
- * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * The status of exclusive operation is set and cleared atomically.
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
- * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */
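The nesting list above (uuid_mutex outermost, device_list_mutex inside it, chunk_mutex innermost) only avoids deadlock if every path acquires in the same order. A tiny pthread-based model of the rule, purely illustrative and not kernel code:

#include <pthread.h>

/* Userspace model of the documented lock order: always take the outer
 * mutex before the inner one, never the reverse. */
static pthread_mutex_t uuid_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t device_list_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t chunk_mutex = PTHREAD_MUTEX_INITIALIZER;

static void device_op(void)
{
	pthread_mutex_lock(&uuid_mutex);		/* outermost */
	pthread_mutex_lock(&device_list_mutex);
	pthread_mutex_lock(&chunk_mutex);		/* innermost */

	/* ... modify devices and chunks ... */

	pthread_mutex_unlock(&chunk_mutex);
	pthread_mutex_unlock(&device_list_mutex);
	pthread_mutex_unlock(&uuid_mutex);
}

int main(void)
{
	device_op();
	return 0;
}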

 DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
-struct list_head *btrfs_get_fs_uuids(void)
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
 {
 	return &fs_uuids;
 }

 /*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid: if not NULL, copy the uuid to fs_devices::fsid
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid
+ * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
+						 const u8 *metadata_fsid)
 {
 	struct btrfs_fs_devices *fs_devs;

@@ -262,18 +355,25 @@
 	mutex_init(&fs_devs->device_list_mutex);

 	INIT_LIST_HEAD(&fs_devs->devices);
-	INIT_LIST_HEAD(&fs_devs->resized_devices);
 	INIT_LIST_HEAD(&fs_devs->alloc_list);
 	INIT_LIST_HEAD(&fs_devs->fs_list);
+	INIT_LIST_HEAD(&fs_devs->seed_list);
 	if (fsid)
 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+
+	if (metadata_fsid)
+		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
+	else if (fsid)
+		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

 	return fs_devs;
 }

 void btrfs_free_device(struct btrfs_device *device)
 {
+	WARN_ON(!list_empty(&device->post_commit_list));
 	rcu_string_free(device->name);
+	extent_io_tree_release(&device->alloc_state);
 	bio_put(device->flush_bio);
 	kfree(device);
 }
@@ -281,6 +381,7 @@
 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct btrfs_device *device;
+
 	WARN_ON(fs_devices->opened);
 	while (!list_empty(&fs_devices->devices)) {
 		device = list_entry(fs_devices->devices.next,
@@ -289,19 +390,6 @@
 		btrfs_free_device(device);
 	}
 	kfree(fs_devices);
-}
-
-static void btrfs_kobject_uevent(struct block_device *bdev,
-				 enum kobject_action action)
-{
-	int ret;
-
-	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
-	if (ret)
-		pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
-			action,
-			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
-			&disk_to_dev(bdev->bd_disk)->kobj);
 }

 void __exit btrfs_cleanup_fs_uuids(void)
@@ -321,7 +409,7 @@
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
-static struct btrfs_device *__alloc_device(void)
+static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_device *dev;

@@ -341,34 +429,86 @@

 	INIT_LIST_HEAD(&dev->dev_list);
 	INIT_LIST_HEAD(&dev->dev_alloc_list);
-	INIT_LIST_HEAD(&dev->resized_list);
-
-	spin_lock_init(&dev->io_lock);
+	INIT_LIST_HEAD(&dev->post_commit_list);

 	atomic_set(&dev->reada_in_flight, 0);
 	atomic_set(&dev->dev_stats_ccnt, 0);
 	btrfs_device_data_ordered_init(dev);
 	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+	extent_io_tree_init(fs_info, &dev->alloc_state,
+			    IO_TREE_DEVICE_ALLOC_STATE, NULL);

 	return dev;
 }

-static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
+static noinline struct btrfs_fs_devices *find_fsid(
+		const u8 *fsid, const u8 *metadata_fsid)
 {
 	struct btrfs_fs_devices *fs_devices;

+	ASSERT(fsid);
+
+	/* Handle non-split brain cases */
 	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
-		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
-			return fs_devices;
+		if (metadata_fsid) {
+			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
+			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
+				      BTRFS_FSID_SIZE) == 0)
+				return fs_devices;
+		} else {
+			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+				return fs_devices;
+		}
 	}
 	return NULL;
 }

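find_fsid() now takes an optional metadata_fsid: when non-NULL both UUIDs must match, otherwise only fsid is compared. A compact userspace model of that predicate (FSID_SIZE and the sample values are assumptions for the demo):

#include <string.h>
#include <stdio.h>

#define FSID_SIZE 16

struct fs_devs {
	unsigned char fsid[FSID_SIZE];
	unsigned char metadata_uuid[FSID_SIZE];
};

/* Model of find_fsid(): with a metadata_fsid both UUIDs must match,
 * otherwise only fsid is compared. */
static int matches(const struct fs_devs *fd, const unsigned char *fsid,
		   const unsigned char *metadata_fsid)
{
	if (memcmp(fsid, fd->fsid, FSID_SIZE) != 0)
		return 0;
	if (metadata_fsid &&
	    memcmp(metadata_fsid, fd->metadata_uuid, FSID_SIZE) != 0)
		return 0;
	return 1;
}

int main(void)
{
	struct fs_devs fd = { "aaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbb" };

	printf("%d\n", matches(&fd, (unsigned char *)"aaaaaaaaaaaaaaa", NULL));
	return 0;
}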
+static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
+				struct btrfs_super_block *disk_super)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * Handle scanned device having completed its fsid change but
+	 * belonging to a fs_devices that was created by first scanning
+	 * a device which didn't have its fsid/metadata_uuid changed
+	 * at all and the CHANGING_FSID_V2 flag set.
+	 */
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		if (fs_devices->fsid_change &&
+		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
+			   BTRFS_FSID_SIZE) == 0 &&
+		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+			   BTRFS_FSID_SIZE) == 0) {
+			return fs_devices;
+		}
+	}
+	/*
+	 * Handle scanned device having completed its fsid change but
+	 * belonging to a fs_devices that was created by a device that
+	 * has an outdated pair of fsid/metadata_uuid and
+	 * CHANGING_FSID_V2 flag set.
+	 */
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		if (fs_devices->fsid_change &&
+		    memcmp(fs_devices->metadata_uuid,
+			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
+		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
+			   BTRFS_FSID_SIZE) == 0) {
+			return fs_devices;
+		}
+	}
+
+	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
+}
+
 static int
 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 		      int flush, struct block_device **bdev,
-		      struct buffer_head **bh)
+		      struct btrfs_super_block **disk_super)
 {
 	int ret;

@@ -387,9 +527,9 @@
 		goto error;
 	}
 	invalidate_bdev(*bdev);
-	*bh = btrfs_read_dev_super(*bdev);
-	if (IS_ERR(*bh)) {
-		ret = PTR_ERR(*bh);
+	*disk_super = btrfs_read_dev_super(*bdev);
+	if (IS_ERR(*disk_super)) {
+		ret = PTR_ERR(*disk_super);
 		blkdev_put(*bdev, flags);
 		goto error;
 	}
@@ -398,214 +538,50 @@

 error:
 	*bdev = NULL;
-	*bh = NULL;
 	return ret;
 }

-static void requeue_list(struct btrfs_pending_bios *pending_bios,
-			 struct bio *head, struct bio *tail)
-{
-
-	struct bio *old_head;
-
-	old_head = pending_bios->head;
-	pending_bios->head = head;
-	if (pending_bios->tail)
-		tail->bi_next = old_head;
-	else
-		pending_bios->tail = tail;
-}
-
 /*
- * we try to collect pending bios for a device so we don't get a large
- * number of procs sending bios down to the same device. This greatly
- * improves the schedulers ability to collect and merge the bios.
+ * Check if the device in the path matches the device in the given struct device.
 *
- * But, it also turns into a long list of bios to process and that is sure
- * to eventually make the worker thread block. The solution here is to
- * make some progress and then put this work struct back at the end of
- * the list if the block device is congested. This way, multiple devices
- * can make progress from a single worker thread.
+ * Returns:
+ *   true  If it is the same device.
+ *   false If it is not the same device or on error.
 */
-static noinline void run_scheduled_bios(struct btrfs_device *device)
+static bool device_matched(const struct btrfs_device *device, const char *path)
 {
-	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct bio *pending;
-	struct backing_dev_info *bdi;
-	struct btrfs_pending_bios *pending_bios;
-	struct bio *tail;
-	struct bio *cur;
-	int again = 0;
-	unsigned long num_run;
-	unsigned long batch_run = 0;
-	unsigned long last_waited = 0;
-	int force_reg = 0;
-	int sync_pending = 0;
-	struct blk_plug plug;
+	char *device_name;
+	struct block_device *bdev_old;
+	struct block_device *bdev_new;

 	/*
-	 * this function runs all the bios we've collected for
-	 * a particular device. We don't want to wander off to
-	 * another device without first sending all of these down.
-	 * So, setup a plug here and finish it off before we return
+	 * If we are looking for a device with the matching dev_t, then skip
+	 * device without a name (a missing device).
 	 */
-	blk_start_plug(&plug);
+	if (!device->name)
+		return false;

-	bdi = device->bdev->bd_bdi;
+	device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
+	if (!device_name)
+		return false;

-loop:
-	spin_lock(&device->io_lock);
+	rcu_read_lock();
+	scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
+	rcu_read_unlock();

-loop_lock:
-	num_run = 0;
+	bdev_old = lookup_bdev(device_name);
+	kfree(device_name);
+	if (IS_ERR(bdev_old))
+		return false;

-	/* take all the bios off the list at once and process them
-	 * later on (without the lock held). But, remember the
-	 * tail and other pointers so the bios can be properly reinserted
-	 * into the list if we hit congestion
-	 */
-	if (!force_reg && device->pending_sync_bios.head) {
-		pending_bios = &device->pending_sync_bios;
-		force_reg = 1;
-	} else {
-		pending_bios = &device->pending_bios;
-		force_reg = 0;
-	}
+	bdev_new = lookup_bdev(path);
+	if (IS_ERR(bdev_new))
+		return false;

-	pending = pending_bios->head;
-	tail = pending_bios->tail;
-	WARN_ON(pending && !tail);
+	if (bdev_old == bdev_new)
+		return true;

-	/*
-	 * if pending was null this time around, no bios need processing
-	 * at all and we can stop. Otherwise it'll loop back up again
-	 * and do an additional check so no bios are missed.
-	 *
-	 * device->running_pending is used to synchronize with the
-	 * schedule_bio code.
-	 */
-	if (device->pending_sync_bios.head == NULL &&
-	    device->pending_bios.head == NULL) {
-		again = 0;
-		device->running_pending = 0;
-	} else {
-		again = 1;
-		device->running_pending = 1;
-	}
-
-	pending_bios->head = NULL;
-	pending_bios->tail = NULL;
-
-	spin_unlock(&device->io_lock);
-
-	while (pending) {
-
-		rmb();
-		/* we want to work on both lists, but do more bios on the
-		 * sync list than the regular list
-		 */
-		if ((num_run > 32 &&
-		     pending_bios != &device->pending_sync_bios &&
-		     device->pending_sync_bios.head) ||
-		    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
-		     device->pending_bios.head)) {
-			spin_lock(&device->io_lock);
-			requeue_list(pending_bios, pending, tail);
-			goto loop_lock;
-		}
-
-		cur = pending;
-		pending = pending->bi_next;
-		cur->bi_next = NULL;
-
-		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
-
-		/*
-		 * if we're doing the sync list, record that our
-		 * plug has some sync requests on it
-		 *
-		 * If we're doing the regular list and there are
-		 * sync requests sitting around, unplug before
-		 * we add more
-		 */
-		if (pending_bios == &device->pending_sync_bios) {
-			sync_pending = 1;
-		} else if (sync_pending) {
-			blk_finish_plug(&plug);
-			blk_start_plug(&plug);
-			sync_pending = 0;
-		}
-
-		btrfsic_submit_bio(cur);
-		num_run++;
-		batch_run++;
-
-		cond_resched();
-
-		/*
-		 * we made progress, there is more work to do and the bdi
-		 * is now congested. Back off and let other work structs
-		 * run instead
-		 */
-		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
-		    fs_info->fs_devices->open_devices > 1) {
-			struct io_context *ioc;
-
-			ioc = current->io_context;
-
-			/*
-			 * the main goal here is that we don't want to
-			 * block if we're going to be able to submit
-			 * more requests without blocking.
-			 *
-			 * This code does two great things, it pokes into
-			 * the elevator code from a filesystem _and_
-			 * it makes assumptions about how batching works.
-			 */
-			if (ioc && ioc->nr_batch_requests > 0 &&
-			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
-			    (last_waited == 0 ||
-			     ioc->last_waited == last_waited)) {
-				/*
-				 * we want to go through our batch of
-				 * requests and stop. So, we copy out
-				 * the ioc->last_waited time and test
-				 * against it before looping
-				 */
-				last_waited = ioc->last_waited;
-				cond_resched();
-				continue;
-			}
-			spin_lock(&device->io_lock);
-			requeue_list(pending_bios, pending, tail);
-			device->running_pending = 1;
-
-			spin_unlock(&device->io_lock);
-			btrfs_queue_work(fs_info->submit_workers,
-					 &device->work);
-			goto done;
-		}
-	}
-
-	cond_resched();
-	if (again)
-		goto loop;
-
-	spin_lock(&device->io_lock);
-	if (device->pending_bios.head || device->pending_sync_bios.head)
-		goto loop_lock;
-	spin_unlock(&device->io_lock);
-
-done:
-	blk_finish_plug(&plug);
-}
-
-static void pending_bios_fn(struct btrfs_work *work)
-{
-	struct btrfs_device *device;
-
-	device = container_of(work, struct btrfs_device, work);
-	run_scheduled_bios(device);
+	return false;
 }

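device_matched() resolves both the stored name and the candidate path to a block device before comparing, so two different spellings of the same node (say, a symlink and its target) still match. A rough userspace analogue using stat(2), where comparing st_rdev stands in for the lookup_bdev() comparison (an assumption for the demo):

#include <sys/stat.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of device_matched(): two paths refer to the same block device
 * if they resolve to the same device number. */
static bool paths_match(const char *a, const char *b)
{
	struct stat sa, sb;

	if (stat(a, &sa) || stat(b, &sb))
		return false;	/* errors count as "no match" */
	return S_ISBLK(sa.st_mode) && S_ISBLK(sb.st_mode) &&
	       sa.st_rdev == sb.st_rdev;
}

int main(int argc, char **argv)
{
	if (argc == 3)
		printf("%s\n", paths_match(argv[1], argv[2]) ? "same" : "different");
	return 0;
}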
 /*
@@ -615,52 +591,55 @@
 * matching this path only.
 * skip_dev: Optional. Will skip this device when searching for the stale
 *           devices.
+ * Return:	0 for success or if @path is NULL.
+ *		-EBUSY if @path is a mounted device.
+ *		-ENOENT if @path does not match any device in the list.
 */
-static void btrfs_free_stale_devices(const char *path,
+static int btrfs_free_stale_devices(const char *path,
				     struct btrfs_device *skip_device)
 {
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
+	int ret = 0;
+
+	lockdep_assert_held(&uuid_mutex);
+
+	if (path)
+		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
-		mutex_lock(&fs_devices->device_list_mutex);
-		if (fs_devices->opened) {
-			mutex_unlock(&fs_devices->device_list_mutex);
-			continue;
-		}

+		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
-			int not_found = 0;
-
			if (skip_device && skip_device == device)
				continue;
-			if (path && !device->name)
+			if (path && !device_matched(device, path))
				continue;
-
-			rcu_read_lock();
-			if (path)
-				not_found = strcmp(rcu_str_deref(device->name),
						   path);
-			rcu_read_unlock();
-			if (not_found)
-				continue;
+			if (fs_devices->opened) {
+				/* for an already deleted device return 0 */
+				if (path && ret != 0)
+					ret = -EBUSY;
+				break;
+			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

-			if (fs_devices->num_devices == 0)
-				break;
+			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);
+
		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}
+
+	return ret;
 }

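With the new return contract (0 on success or a NULL @path, -EBUSY for a mounted device, -ENOENT when nothing matches), callers can report precise errors. A small standalone model of that contract, with made-up predicates standing in for the real list walk:

#include <errno.h>
#include <stdio.h>

/* Userspace model of the new btrfs_free_stale_devices() contract:
 * 0 on success or NULL path, -EBUSY for a mounted (opened) device,
 * -ENOENT when no registered device matches. */
static int free_stale(const char *path, int registered, int opened)
{
	if (!path)
		return 0;
	if (!registered)
		return -ENOENT;
	if (opened)
		return -EBUSY;
	return 0;	/* stale entry dropped */
}

int main(void)
{
	printf("%d %d %d\n",
	       free_stale("/dev/sdx", 0, 0),	/* -ENOENT */
	       free_stale("/dev/sdy", 1, 1),	/* -EBUSY */
	       free_stale("/dev/sdz", 1, 0));	/* 0 */
	return 0;
}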
 /*
@@ -674,7 +653,6 @@
 {
	struct request_queue *q;
	struct block_device *bdev;
-	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;
@@ -685,23 +663,29 @@
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
-				    &bdev, &bh);
+				    &bdev, &disk_super);
	if (ret)
		return ret;

-	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
-		goto error_brelse;
+		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
-		goto error_brelse;
+		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+		if (btrfs_super_incompat_flags(disk_super) &
+		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
+			pr_err(
+		"BTRFS: Invalid seeding and uuid-changed device detected\n");
+			goto error_free_page;
+		}
+
		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
-		fs_devices->seeding = 1;
+		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
@@ -711,7 +695,7 @@

	q = bdev_get_queue(bdev);
	if (!blk_queue_nonrot(q))
-		fs_devices->rotating = 1;
+		fs_devices->rotating = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -723,17 +707,109 @@
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
-	brelse(bh);
+	btrfs_release_disk_super(disk_super);

	return 0;

-error_brelse:
-	brelse(bh);
+error_free_page:
+	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
 }

+u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
+{
+	bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
+				  BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+
+	return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
+}
+
+/*
+ * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
+ * being created with a disk that has already completed its fsid change. Such
+ * disk can belong to an fs which has its FSID changed or to one which doesn't.
+ * Handle both cases here.
+ */
+static struct btrfs_fs_devices *find_fsid_inprogress(
+					struct btrfs_super_block *disk_super)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+			   BTRFS_FSID_SIZE) != 0 &&
+		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
+			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
+			return fs_devices;
+		}
+	}
+
+	return find_fsid(disk_super->fsid, NULL);
+}
+
+static struct btrfs_fs_devices *find_fsid_changed(
+					struct btrfs_super_block *disk_super)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * Handles the case where the scanned device is part of an fs that had
+	 * multiple successful changes of FSID but currently the device didn't
+	 * observe it. Meaning our fsid will be different than theirs. We need
+	 * to handle two subcases:
+	 * 1 - The fs still continues to have different METADATA/FSID uuids.
+	 * 2 - The fs is switched back to its original FSID (METADATA/FSID
+	 *     are equal).
+	 */
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		/* Changed UUIDs */
+		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+			   BTRFS_FSID_SIZE) != 0 &&
+		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
+			   BTRFS_FSID_SIZE) == 0 &&
+		    memcmp(fs_devices->fsid, disk_super->fsid,
+			   BTRFS_FSID_SIZE) != 0)
+			return fs_devices;
+
+		/* Unchanged UUIDs */
+		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+			   BTRFS_FSID_SIZE) == 0 &&
+		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
+			   BTRFS_FSID_SIZE) == 0)
+			return fs_devices;
+	}
+
+	return NULL;
+}
+
+static struct btrfs_fs_devices *find_fsid_reverted_metadata(
+					struct btrfs_super_block *disk_super)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * Handle the case where the scanned device is part of an fs whose last
+	 * metadata UUID change reverted it to the original FSID. At the same
+	 * time fs_devices was first created by another constituent device
+	 * which didn't fully observe the operation. This results in a
+	 * btrfs_fs_devices created with metadata/fsid different AND
+	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
+	 * fs_devices equal to the FSID of the disk.
+	 */
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+			   BTRFS_FSID_SIZE) != 0 &&
+		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
+			   BTRFS_FSID_SIZE) == 0 &&
+		    fs_devices->fsid_change)
+			return fs_devices;
+	}
+
+	return NULL;
+}
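Together with device_list_add() below, the three finders partition the fsid-change state space by two bits of the scanned super block: the CHANGING_FSID_V2 flag and the METADATA_UUID incompat flag. This sketch mirrors that dispatch (strings are illustrative, the branching follows the code below):

#include <stdbool.h>
#include <stdio.h>

/* Model of the finder dispatch in device_list_add(): which lookup
 * handles a scanned super block, based on its two flags. */
static const char *pick_finder(bool changing_fsid_v2, bool has_metadata_uuid)
{
	if (changing_fsid_v2)
		return has_metadata_uuid ? "find_fsid_changed"
					 : "find_fsid_inprogress";
	if (has_metadata_uuid)
		return "find_fsid_with_metadata_uuid";
	return "find_fsid_reverted_metadata, then plain find_fsid";
}

int main(void)
{
	printf("%s\n", pick_finder(true, false));
	printf("%s\n", pick_finder(false, true));
	return 0;
}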
 /*
 * Add new device to list of registered devices
 *
@@ -746,16 +822,40 @@
		bool *new_device_added)
 {
	struct btrfs_device *device;
-	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
+					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

-	fs_devices = find_fsid(disk_super->fsid);
+	if (fsid_change_in_progress) {
+		if (!has_metadata_uuid)
+			fs_devices = find_fsid_inprogress(disk_super);
+		else
+			fs_devices = find_fsid_changed(disk_super);
+	} else if (has_metadata_uuid) {
+		fs_devices = find_fsid_with_metadata_uuid(disk_super);
+	} else {
+		fs_devices = find_fsid_reverted_metadata(disk_super);
+		if (!fs_devices)
+			fs_devices = find_fsid(disk_super->fsid, NULL);
+	}
+
	if (!fs_devices) {
-		fs_devices = alloc_fs_devices(disk_super->fsid);
+		if (has_metadata_uuid)
+			fs_devices = alloc_fs_devices(disk_super->fsid,
+						      disk_super->metadata_uuid);
+		else
+			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
+
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);
+
+		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);
@@ -765,6 +865,27 @@
		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, devid,
				disk_super->dev_item.uuid, NULL, false);
+
+		/*
+		 * If this disk has been pulled into an fs devices created by
+		 * a device which had the CHANGING_FSID_V2 flag then replace the
+		 * metadata_uuid/fsid values of the fs_devices.
+		 */
+		if (fs_devices->fsid_change &&
+		    found_transid > fs_devices->latest_generation) {
+			memcpy(fs_devices->fsid, disk_super->fsid,
+					BTRFS_FSID_SIZE);
+
+			if (has_metadata_uuid)
+				memcpy(fs_devices->metadata_uuid,
+				       disk_super->metadata_uuid,
+				       BTRFS_FSID_SIZE);
+			else
+				memcpy(fs_devices->metadata_uuid,
+				       disk_super->fsid, BTRFS_FSID_SIZE);
+
+			fs_devices->fsid_change = false;
+		}
	}

	if (!device) {
@@ -796,11 +917,15 @@
		*new_device_added = true;

		if (disk_super->label[0])
-			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
-				disk_super->label, devid, found_transid, path);
+			pr_info(
+	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
+				disk_super->label, devid, found_transid, path,
+				current->comm, task_pid_nr(current));
		else
-			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
-				disk_super->fsid, devid, found_transid, path);
+			pr_info(
+	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
+				disk_super->fsid, devid, found_transid, path,
+				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
@@ -897,8 +1022,11 @@
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
-	if (!fs_devices->opened)
+	if (!fs_devices->opened) {
		device->generation = found_transid;
+		fs_devices->latest_generation = max_t(u64, found_transid,
+						fs_devices->latest_generation);
+	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

@@ -911,22 +1039,25 @@
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
+	int ret = 0;

-	fs_devices = alloc_fs_devices(orig->fsid);
+	lockdep_assert_held(&uuid_mutex);
+
+	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

-	mutex_lock(&orig->device_list_mutex);
	fs_devices->total_devices = orig->total_devices;

-	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
-		if (IS_ERR(device))
+		if (IS_ERR(device)) {
+			ret = PTR_ERR(device);
			goto error;
+		}

		/*
		 * This is ok to do without rcu read locked because we hold the
@@ -937,6 +1068,7 @@
				GFP_KERNEL);
		if (!name) {
			btrfs_free_device(device);
+			ret = -ENOMEM;
			goto error;
		}
		rcu_assign_pointer(device->name, name);
@@ -946,36 +1078,27 @@
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
-	mutex_unlock(&orig->device_list_mutex);
	return fs_devices;
 error:
-	mutex_unlock(&orig->device_list_mutex);
	free_fs_devices(fs_devices);
-	return ERR_PTR(-ENOMEM);
+	return ERR_PTR(ret);
 }

-/*
- * After we have read the system tree and know devids belonging to
- * this filesystem, remove the device which does not belong there.
- */
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
+				      int step, struct btrfs_device **latest_dev)
 {
	struct btrfs_device *device, *next;
-	struct btrfs_device *latest_dev = NULL;

-	mutex_lock(&uuid_mutex);
-again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
-		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
-			     &device->dev_state)) {
+		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
-				      &device->dev_state) &&
+				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
-			    (!latest_dev ||
-			     device->generation > latest_dev->generation)) {
-				latest_dev = device;
+			    (!*latest_dev ||
+			     device->generation > (*latest_dev)->generation)) {
+				*latest_dev = device;
			}
			continue;
		}
@@ -1002,22 +1125,26 @@
		btrfs_free_device(device);
	}

-	if (fs_devices->seed) {
-		fs_devices = fs_devices->seed;
-		goto again;
-	}
+}
+
+/*
+ * After we have read the system tree and know devids belonging to this
+ * filesystem, remove the device which does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+{
+	struct btrfs_device *latest_dev = NULL;
+	struct btrfs_fs_devices *seed_dev;
+
+	mutex_lock(&uuid_mutex);
+	__btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+
+	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
+		__btrfs_free_extra_devids(seed_dev, step, &latest_dev);

	fs_devices->latest_bdev = latest_dev->bdev;

	mutex_unlock(&uuid_mutex);
-}
-
-static void free_device_rcu(struct rcu_head *head)
-{
-	struct btrfs_device *device;
-
-	device = container_of(head, struct btrfs_device, rcu);
-	btrfs_free_device(device);
 }

 static void btrfs_close_bdev(struct btrfs_device *device)
@@ -1036,11 +1163,6 @@
 static void btrfs_close_one_device(struct btrfs_device *device)
 {
	struct btrfs_fs_devices *fs_devices = device->fs_devices;
-	struct btrfs_device *new_device;
-	struct rcu_string *name;
-
-	if (device->bdev)
-		fs_devices->open_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -1057,65 +1179,85 @@
	}

	btrfs_close_bdev(device);
-
-	new_device = btrfs_alloc_device(NULL, &device->devid,
-					device->uuid);
-	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
-	/* Safe because we are under uuid_mutex */
-	if (device->name) {
-		name = rcu_string_strdup(device->name->str, GFP_NOFS);
-		BUG_ON(!name); /* -ENOMEM */
-		rcu_assign_pointer(new_device->name, name);
+	if (device->bdev) {
+		fs_devices->open_devices--;
+		device->bdev = NULL;
	}
+	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);

-	list_replace_rcu(&device->dev_list, &new_device->dev_list);
-	new_device->fs_devices = device->fs_devices;
+	device->fs_info = NULL;
+	atomic_set(&device->dev_stats_ccnt, 0);
+	extent_io_tree_release(&device->alloc_state);

-	call_rcu(&device->rcu, free_device_rcu);
+	/*
+	 * Reset the flush error record. We might have a transient flush error
+	 * in this mount, and if so we aborted the current transaction and set
+	 * the fs to an error state, guaranteeing no super blocks can be further
+	 * committed. However that error might be transient and if we unmount the
+	 * filesystem and mount it again, we should allow the mount to succeed
+	 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
+	 * filesystem again we still get flush errors, then we will again abort
+	 * any transaction and set the error state, guaranteeing no commits of
+	 * unsafe super blocks.
+	 */
+	device->last_flush_error = 0;
+
+	/* Verify the device is back in a pristine state */
+	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
+	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+	ASSERT(list_empty(&device->dev_alloc_list));
+	ASSERT(list_empty(&device->post_commit_list));
+	ASSERT(atomic_read(&device->reada_in_flight) == 0);
 }

-static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
 {
	struct btrfs_device *device, *tmp;

-	if (--fs_devices->opened > 0)
-		return 0;
+	lockdep_assert_held(&uuid_mutex);

-	mutex_lock(&fs_devices->device_list_mutex);
-	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
+	if (--fs_devices->opened > 0)
+		return;
+
+	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);
-	}
-	mutex_unlock(&fs_devices->device_list_mutex);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
-	fs_devices->seeding = 0;
-
-	return 0;
+	fs_devices->seeding = false;
+	fs_devices->fs_info = NULL;
 }

-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct btrfs_fs_devices *seed_devices = NULL;
-	int ret;
+	LIST_HEAD(list);
+	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
-	ret = close_fs_devices(fs_devices);
+	close_fs_devices(fs_devices);
	if (!fs_devices->opened) {
-		seed_devices = fs_devices->seed;
-		fs_devices->seed = NULL;
-	}
-	mutex_unlock(&uuid_mutex);
+		list_splice_init(&fs_devices->seed_list, &list);

-	while (seed_devices) {
-		fs_devices = seed_devices;
-		seed_devices = fs_devices->seed;
+		/*
+		 * If the struct btrfs_fs_devices is not assembled with any
+		 * other device, it can be re-initialized during the next mount
+		 * without needing the device-scan step. Therefore, it can be
+		 * fully freed.
+		 */
+		if (fs_devices->num_devices == 1) {
+			list_del(&fs_devices->fs_list);
+			free_fs_devices(fs_devices);
+		}
+	}
+
+	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
+		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
-	return ret;
+	mutex_unlock(&uuid_mutex);
 }

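btrfs_close_devices() now detaches the whole seed list onto a local list first and only then closes and frees each entry, so the primary fs_devices is never walked while being torn down. A minimal userspace model of that splice-then-free pattern (the singly linked list is a stand-in for the kernel's list_head):

#include <stdio.h>
#include <stdlib.h>

struct seed {
	struct seed *next;
	int id;
};

/* Model of the splice-then-free pattern: detach the whole seed chain
 * first, then close and free each entry independently. */
static void close_all(struct seed **seeds)
{
	struct seed *list = *seeds;	/* splice onto a local list */
	*seeds = NULL;

	while (list) {
		struct seed *tmp = list->next;

		printf("closing seed %d\n", list->id);
		free(list);
		list = tmp;
	}
}

int main(void)
{
	struct seed *b = malloc(sizeof(*b));
	struct seed *a = malloc(sizeof(*a));

	b->next = NULL; b->id = 2;
	a->next = b; a->id = 1;
	close_all(&a);
	return 0;
}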
 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
@@ -1123,28 +1265,33 @@
 {
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
-	int ret = 0;
+	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

-	list_for_each_entry(device, &fs_devices->devices, dev_list) {
-		/* Just open everything we can; ignore failures here */
-		if (btrfs_open_one_device(fs_devices, device, flags, holder))
-			continue;
+	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
+		int ret;

-		if (!latest_dev ||
-		    device->generation > latest_dev->generation)
+		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
+		if (ret == 0 &&
+		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
+		} else if (ret == -ENODATA) {
+			fs_devices->num_devices--;
+			list_del(&device->dev_list);
+			btrfs_free_device(device);
+		}
	}
-	if (fs_devices->open_devices == 0) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (fs_devices->open_devices == 0)
+		return -EINVAL;
+
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_dev->bdev;
	fs_devices->total_rw_bytes = 0;
-out:
-	return ret;
+	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+
+	return 0;
 }

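The reworked open loop treats -ENODATA (no btrfs super block found) specially: such a device is dropped from the list rather than kept as a failed open, and only if nothing at all opened does the function fail with -EINVAL. A toy model of that policy (devid 2 standing in for a wiped device):

#include <errno.h>
#include <stdio.h>

/* Model of the new open loop: a device whose probe returns -ENODATA
 * is dropped from the list instead of being kept around. */
static int open_one(int devid)
{
	return devid == 2 ? -ENODATA : 0;	/* pretend devid 2 was wiped */
}

int main(void)
{
	int open_devices = 0;

	for (int devid = 1; devid <= 3; devid++) {
		int ret = open_one(devid);

		if (ret == 0)
			open_devices++;
		else if (ret == -ENODATA)
			printf("dropping devid %d\n", devid);
	}
	printf("%d devices opened%s\n", open_devices,
	       open_devices == 0 ? " -> would return -EINVAL" : "");
	return 0;
}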
 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1186,55 +1333,66 @@
	return ret;
 }

-static void btrfs_release_disk_super(struct page *page)
+void btrfs_release_disk_super(struct btrfs_super_block *super)
 {
-	kunmap(page);
+	struct page *page = virt_to_page(super);
+
	put_page(page);
 }

-static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
-				 struct page **page,
-				 struct btrfs_super_block **disk_super)
+static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr)
 {
+	struct btrfs_super_block *disk_super;
+	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
-		return 1;
+		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
-	if (sizeof(**disk_super) > PAGE_SIZE)
-		return 1;
+	if (sizeof(*disk_super) > PAGE_SIZE)
+		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
-	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
-		return 1;
+	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
+		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
-	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
-				    index, GFP_KERNEL);
+	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

-	if (IS_ERR_OR_NULL(*page))
-		return 1;
+	if (IS_ERR(page))
+		return ERR_CAST(page);

-	p = kmap(*page);
+	p = page_address(page);

	/* align our pointer to the offset of the super block */
-	*disk_super = p + (bytenr & ~PAGE_MASK);
+	disk_super = p + offset_in_page(bytenr);

-	if (btrfs_super_bytenr(*disk_super) != bytenr ||
-	    btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
-		btrfs_release_disk_super(*page);
-		return 1;
+	if (btrfs_super_bytenr(disk_super) != bytenr ||
+	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+		btrfs_release_disk_super(p);
+		return ERR_PTR(-EINVAL);
	}

-	if ((*disk_super)->label[0] &&
-	    (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
-		(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
+	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
+		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

-	return 0;
+	return disk_super;
+}
+
+int btrfs_forget_devices(const char *path)
+{
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+	mutex_unlock(&uuid_mutex);
+
+	return ret;
 }

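The three checks at the top of btrfs_read_disk_super() are pure arithmetic: the super block (4096 bytes, at bytenr 65536 for the primary copy) must fit in the device, fit in one page, and not straddle a page boundary. Worked out for 4 KiB pages (values assumed for the example):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12			/* assume 4 KiB pages */
#define PAGE_SIZE (1UL << PAGE_SHIFT)

int main(void)
{
	uint64_t bytenr = 65536;	/* primary super block offset */
	uint64_t sb_size = 4096;	/* sizeof(struct btrfs_super_block) */

	uint64_t index = bytenr >> PAGE_SHIFT;		/* page 16 */
	uint64_t last = (bytenr + sb_size - 1) >> PAGE_SHIFT;
	uint64_t off = bytenr & (PAGE_SIZE - 1);	/* offset_in_page */

	printf("page index %llu, straddles: %s, offset in page %llu\n",
	       (unsigned long long)index, index == last ? "no" : "yes",
	       (unsigned long long)off);
	return 0;
}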
 /*
@@ -1249,7 +1407,6 @@
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
-	struct page *page;
	u64 bytenr;

	lockdep_assert_held(&uuid_mutex);
@@ -1261,14 +1418,24 @@
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	bytenr = btrfs_sb_offset(0);

-	flags |= FMODE_EXCL;
+	/*
+	 * Avoid an exclusive open here (flags |= FMODE_EXCL), as systemd-udev
+	 * may initiate a device scan that races with the user's mount or mkfs
+	 * command, resulting in failure.
+	 * Since the device scan is solely for reading purposes, there is no
+	 * need for FMODE_EXCL. Additionally, the devices are read again
+	 * during the mount process. It is ok to get some inconsistent
+	 * values temporarily, as the device paths of the fsid are the only
+	 * required information for assembling the volume.
+	 */
	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

-	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
-		device = ERR_PTR(-EINVAL);
+	disk_super = btrfs_read_disk_super(bdev, bytenr);
+	if (IS_ERR(disk_super)) {
+		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}
@@ -1278,7 +1445,7 @@
		btrfs_free_stale_devices(path, device);
	}

-	btrfs_release_disk_super(page);
+	btrfs_release_disk_super(disk_super);

 error_bdev_put:
	blkdev_put(bdev, flags);
@@ -1286,60 +1453,84 @@
	return device;
 }

-static int contains_pending_extent(struct btrfs_transaction *transaction,
-				   struct btrfs_device *device,
-				   u64 *start, u64 len)
+/*
+ * Try to find a chunk that intersects [start, start + len] range and when one
+ * such is found, record the end of it in *start
+ */
+static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
+				    u64 len)
 {
-	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct extent_map *em;
-	struct list_head *search_list = &fs_info->pinned_chunks;
-	int ret = 0;
-	u64 physical_start = *start;
+	u64 physical_start, physical_end;

-	if (transaction)
-		search_list = &transaction->pending_chunks;
-again:
-	list_for_each_entry(em, search_list, list) {
-		struct map_lookup *map;
-		int i;
+	lockdep_assert_held(&device->fs_info->chunk_mutex);

-		map = em->map_lookup;
-		for (i = 0; i < map->num_stripes; i++) {
-			u64 end;
+	if (!find_first_extent_bit(&device->alloc_state, *start,
+				   &physical_start, &physical_end,
+				   CHUNK_ALLOCATED, NULL)) {

-			if (map->stripes[i].dev != device)
-				continue;
-			if (map->stripes[i].physical >= physical_start + len ||
-			    map->stripes[i].physical + em->orig_block_len <=
-			    physical_start)
-				continue;
-			/*
-			 * Make sure that while processing the pinned list we do
-			 * not override our *start with a lower value, because
-			 * we can have pinned chunks that fall within this
-			 * device hole and that have lower physical addresses
-			 * than the pending chunks we processed before. If we
-			 * do not take this special care we can end up getting
-			 * 2 pending chunks that start at the same physical
-			 * device offsets because the end offset of a pinned
-			 * chunk can be equal to the start offset of some
-			 * pending chunk.
-			 */
-			end = map->stripes[i].physical + em->orig_block_len;
-			if (end > *start) {
-				*start = end;
-				ret = 1;
-			}
+		if (in_range(physical_start, *start, len) ||
+		    in_range(*start, physical_start,
+			     physical_end - physical_start)) {
+			*start = physical_end + 1;
+			return true;
		}
	}
-	if (search_list != &fs_info->pinned_chunks) {
-		search_list = &fs_info->pinned_chunks;
-		goto again;
-	}
-
-	return ret;
+	return false;
 }

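The overlap test above relies on the kernel's in_range(x, start, len) helper: two ranges intersect exactly when either one's start lies inside the other. A standalone sketch of that check with sample numbers:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

/* in_range(x, start, len): is x within [start, start + len)? */
static bool in_range(uint64_t x, uint64_t start, uint64_t len)
{
	return x >= start && x < start + len;
}

/* Two ranges overlap iff either one's start lies inside the other. */
static bool overlaps(uint64_t a, uint64_t alen, uint64_t b, uint64_t blen)
{
	return in_range(b, a, alen) || in_range(a, b, blen);
}

int main(void)
{
	/* hole at [100, 200), allocated chunk at [150, 300) -> overlap */
	printf("%d\n", overlaps(100, 100, 150, 150));
	return 0;
}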
+static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
+{
+	switch (device->fs_devices->chunk_alloc_policy) {
+	case BTRFS_CHUNK_ALLOC_REGULAR:
+		/*
+		 * We don't want to overwrite the superblock on the drive nor
+		 * any area used by the boot loader (grub for example), so we
+		 * make sure to start at an offset of at least 1MB.
+		 */
+		return max_t(u64, start, SZ_1M);
+	default:
+		BUG();
+	}
+}
+
+/**
+ * dev_extent_hole_check - check if specified hole is suitable for allocation
+ * @device:	the device which we have the hole
+ * @hole_start:	starting position of the hole
+ * @hole_size:	the size of the hole
+ * @num_bytes:	the size of the free space that we need
+ *
+ * This function may modify @hole_start and @hole_size to reflect the suitable
+ * position for allocation. Returns true if the hole position is updated,
+ * false otherwise.
+ */
+static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
+				  u64 *hole_size, u64 num_bytes)
+{
+	bool changed = false;
+	u64 hole_end = *hole_start + *hole_size;
+
+	/*
+	 * Check before we set max_hole_start, otherwise we could end up
+	 * sending back this offset anyway.
+	 */
+	if (contains_pending_extent(device, hole_start, *hole_size)) {
+		if (hole_end >= *hole_start)
+			*hole_size = hole_end - *hole_start;
+		else
+			*hole_size = 0;
+		changed = true;
+	}
+
+	switch (device->fs_devices->chunk_alloc_policy) {
+	case BTRFS_CHUNK_ALLOC_REGULAR:
+		/* No extra check */
+		break;
+	default:
+		BUG();
+	}
+
+	return changed;
+}

 /*
 * find_free_dev_extent_start - find free space in the specified device
@@ -1361,10 +1552,16 @@
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
+ *
+ * NOTE: This function will search the *commit* root of the device tree, and
+ * does an extra check to ensure dev extents are not double allocated.
+ * This makes the function safe to allocate dev extents but may not report
+ * correct usable device space, as device extents freed in the current
+ * transaction are not reported as available.
 */
-int find_free_dev_extent_start(struct btrfs_transaction *transaction,
-			       struct btrfs_device *device, u64 num_bytes,
-			       u64 search_start, u64 *start, u64 *len)
+static int find_free_dev_extent_start(struct btrfs_device *device,
+				      u64 num_bytes, u64 search_start,
+				      u64 *start, u64 *len)
 {
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
@@ -1380,12 +1577,7 @@
	int slot;
	struct extent_buffer *l;

-	/*
-	 * We don't want to overwrite the superblock on the drive nor any area
-	 * used by the boot loader (grub for example), so we make sure to start
-	 * at an offset of at least 1MB.
-	 */
-	search_start = max_t(u64, search_start, SZ_1M);
+	search_start = dev_extent_search_start(device, search_start);

	path = btrfs_alloc_path();
	if (!path)
@@ -1418,7 +1610,7 @@
		goto out;
	}

-	while (1) {
+	while (search_start < search_end) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
@@ -1441,23 +1633,13 @@
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

+		if (key.offset > search_end)
+			break;
+
		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
-
-			/*
-			 * Have to check before we set max_hole_start, otherwise
-			 * we could end up sending back this offset anyway.
-			 */
-			if (contains_pending_extent(transaction, device,
-						    &search_start,
-						    hole_size)) {
-				if (key.offset >= search_start) {
-					hole_size = key.offset - search_start;
-				} else {
-					WARN_ON_ONCE(1);
-					hole_size = 0;
-				}
-			}
+			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
@@ -1496,9 +1678,8 @@
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
-
-		if (contains_pending_extent(transaction, device, &search_start,
-					    hole_size)) {
+		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}
@@ -1515,6 +1696,7 @@
	else
		ret = 0;

+	ASSERT(max_hole_start + max_hole_size <= search_end);
 out:
	btrfs_free_path(path);
	*start = max_hole_start;
@@ -1523,13 +1705,11 @@
	return ret;
 }

-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
 {
	/* FIXME use last free of some kind */
-	return find_free_dev_extent_start(trans->transaction, device,
-					  num_bytes, 0, start, len);
+	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
 }

 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
....@@ -1640,9 +1820,9 @@
16401820 struct rb_node *n;
16411821 u64 ret = 0;
16421822
1643
- em_tree = &fs_info->mapping_tree.map_tree;
1823
+ em_tree = &fs_info->mapping_tree;
16441824 read_lock(&em_tree->lock);
1645
- n = rb_last(&em_tree->map);
1825
+ n = rb_last(&em_tree->map.rb_root);
16461826 if (n) {
16471827 em = rb_entry(n, struct extent_map, rb_node);
16481828 ret = em->start + em->len;
....@@ -1672,7 +1852,12 @@
16721852 if (ret < 0)
16731853 goto error;
16741854
1675
- BUG_ON(ret == 0); /* Corruption */
1855
+ if (ret == 0) {
1856
+ /* Corruption */
1857
+ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1858
+ ret = -EUCLEAN;
1859
+ goto error;
1860
+ }
16761861
16771862 ret = btrfs_previous_item(fs_info->chunk_root, path,
16781863 BTRFS_DEV_ITEMS_OBJECTID,
....@@ -1738,7 +1923,8 @@
17381923 ptr = btrfs_device_uuid(dev_item);
17391924 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
17401925 ptr = btrfs_device_fsid(dev_item);
1741
- write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1926
+ write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1927
+ ptr, BTRFS_FSID_SIZE);
17421928 btrfs_mark_buffer_dirty(leaf);
17431929
17441930 ret = 0;
....@@ -1750,22 +1936,27 @@
17501936 /*
17511937 * Function to update ctime/mtime for a given device path.
17521938  * Mainly used for ctime/mtime based probes like libblkid.
1939
+ *
1940
+ * We don't care about errors here, this is just to be kind to userspace.
17531941 */
1754
-static void update_dev_time(const char *path_name)
1942
+static void update_dev_time(const char *device_path)
17551943 {
1756
- struct file *filp;
1944
+ struct path path;
1945
+ struct timespec64 now;
1946
+ int ret;
17571947
1758
- filp = filp_open(path_name, O_RDWR, 0);
1759
- if (IS_ERR(filp))
1948
+ ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
1949
+ if (ret)
17601950 return;
1761
- file_update_time(filp);
1762
- filp_close(filp, NULL);
1951
+
1952
+ now = current_time(d_inode(path.dentry));
1953
+ inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
1954
+ path_put(&path);
17631955 }
17641956
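The reworked helper above updates the timestamps in-kernel through kern_path() and inode_update_time() instead of round-tripping through filp_open(). As a rough userspace analogue (a sketch only; the device path below is hypothetical), bumping mtime with utimensat() is enough for timestamp-based probes, since the kernel refreshes ctime as a side effect:

/* Illustrative userspace analogue of update_dev_time(); not part of the patch. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

static int touch_dev(const char *device_path)
{
	/* times == NULL sets atime/mtime to now; ctime is updated implicitly */
	if (utimensat(AT_FDCWD, device_path, NULL, 0) != 0) {
		perror("utimensat");
		return -1;
	}
	return 0;
}

int main(void)
{
	return touch_dev("/dev/loop0") ? 1 : 0;	/* hypothetical device node */
}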
1765
-static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1766
- struct btrfs_device *device)
1957
+static int btrfs_rm_dev_item(struct btrfs_device *device)
17671958 {
1768
- struct btrfs_root *root = fs_info->chunk_root;
1959
+ struct btrfs_root *root = device->fs_info->chunk_root;
17691960 int ret;
17701961 struct btrfs_path *path;
17711962 struct btrfs_key key;
....@@ -1862,17 +2053,14 @@
18622053  * where this function is called, there should always be another device (or
18632054  * this_dev) which is active.
18642055 */
1865
-void btrfs_assign_next_active_device(struct btrfs_device *device,
1866
- struct btrfs_device *this_dev)
2056
+void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
2057
+ struct btrfs_device *next_device)
18672058 {
18682059 struct btrfs_fs_info *fs_info = device->fs_info;
1869
- struct btrfs_device *next_device;
18702060
1871
- if (this_dev)
1872
- next_device = this_dev;
1873
- else
2061
+ if (!next_device)
18742062 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1875
- device);
2063
+ device);
18762064 ASSERT(next_device);
18772065
18782066 if (fs_info->sb->s_bdev &&
....@@ -1883,8 +2071,66 @@
18832071 fs_info->fs_devices->latest_bdev = next_device->bdev;
18842072 }
18852073
2074
+/*
2075
+ * Return btrfs_fs_devices::num_devices excluding the device that's being
2076
+ * currently replaced.
2077
+ */
2078
+static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2079
+{
2080
+ u64 num_devices = fs_info->fs_devices->num_devices;
2081
+
2082
+ down_read(&fs_info->dev_replace.rwsem);
2083
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2084
+ ASSERT(num_devices > 1);
2085
+ num_devices--;
2086
+ }
2087
+ up_read(&fs_info->dev_replace.rwsem);
2088
+
2089
+ return num_devices;
2090
+}
2091
+
2092
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2093
+ struct block_device *bdev,
2094
+ const char *device_path)
2095
+{
2096
+ struct btrfs_super_block *disk_super;
2097
+ int copy_num;
2098
+
2099
+ if (!bdev)
2100
+ return;
2101
+
2102
+ for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2103
+ struct page *page;
2104
+ int ret;
2105
+
2106
+ disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2107
+ if (IS_ERR(disk_super))
2108
+ continue;
2109
+
2110
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2111
+
2112
+ page = virt_to_page(disk_super);
2113
+ set_page_dirty(page);
2114
+ lock_page(page);
2115
+		/* write_one_page() unlocks the page */
2116
+ ret = write_one_page(page);
2117
+ if (ret)
2118
+ btrfs_warn(fs_info,
2119
+ "error clearing superblock number %d (%d)",
2120
+ copy_num, ret);
2121
+ btrfs_release_disk_super(disk_super);
2122
+
2123
+ }
2124
+
2125
+ /* Notify udev that device has changed */
2126
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2127
+
2128
+ /* Update ctime/mtime for device path for libblkid */
2129
+ update_dev_time(device_path);
2130
+}
2131
+
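For orientation, the copies this loop wipes sit at fixed, exponentially spaced offsets on the device. A standalone sketch of that derivation, modeled on the kernel's btrfs_sb_offset() (the constants are quoted from the btrfs on-disk format and shown here for illustration):

#include <stdio.h>

#define SZ_16K				(16 * 1024ULL)
#define BTRFS_SUPER_INFO_OFFSET		(64 * 1024ULL)
#define BTRFS_SUPER_MIRROR_MAX		3
#define BTRFS_SUPER_MIRROR_SHIFT	12

static unsigned long long sb_offset(int mirror)
{
	if (mirror)
		return SZ_16K << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
	return BTRFS_SUPER_INFO_OFFSET;	/* primary copy at 64KiB */
}

int main(void)
{
	/* prints 65536 (64KiB), 67108864 (64MiB), 274877906944 (256GiB) */
	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++)
		printf("copy %d at %llu bytes\n", i, sb_offset(i));
	return 0;
}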
18862132 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1887
- u64 devid)
2133
+ u64 devid)
18882134 {
18892135 struct btrfs_device *device;
18902136 struct btrfs_fs_devices *cur_devices;
....@@ -1892,24 +2138,35 @@
18922138 u64 num_devices;
18932139 int ret = 0;
18942140
1895
- mutex_lock(&uuid_mutex);
1896
-
1897
- num_devices = fs_devices->num_devices;
1898
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
1899
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1900
- WARN_ON(num_devices < 1);
1901
- num_devices--;
1902
- }
1903
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
2141
+ /*
2142
+ * The device list in fs_devices is accessed without locks (neither
2143
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
2144
+ * filesystem and another device rm cannot run.
2145
+ */
2146
+ num_devices = btrfs_num_devices(fs_info);
19042147
19052148 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
19062149 if (ret)
19072150 goto out;
19082151
1909
- ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1910
- &device);
1911
- if (ret)
2152
+ device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2153
+
2154
+ if (IS_ERR(device)) {
2155
+ if (PTR_ERR(device) == -ENOENT &&
2156
+ device_path && strcmp(device_path, "missing") == 0)
2157
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2158
+ else
2159
+ ret = PTR_ERR(device);
19122160 goto out;
2161
+ }
2162
+
2163
+ if (btrfs_pinned_by_swapfile(fs_info, device)) {
2164
+ btrfs_warn_in_rcu(fs_info,
2165
+ "cannot remove device %s (devid %llu) due to active swapfile",
2166
+ rcu_str_deref(device->name), device->devid);
2167
+ ret = -ETXTBSY;
2168
+ goto out;
2169
+ }
19132170
19142171 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
19152172 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
....@@ -1929,9 +2186,9 @@
19292186 mutex_unlock(&fs_info->chunk_mutex);
19302187 }
19312188
1932
- mutex_unlock(&uuid_mutex);
19332189 ret = btrfs_shrink_device(device, 0);
1934
- mutex_lock(&uuid_mutex);
2190
+ if (!ret)
2191
+ btrfs_reada_remove_dev(device);
19352192 if (ret)
19362193 goto error_undo;
19372194
....@@ -1940,12 +2197,12 @@
19402197 * counter although write_all_supers() is not locked out. This
19412198 * could give a filesystem state which requires a degraded mount.
19422199 */
1943
- ret = btrfs_rm_dev_item(fs_info, device);
2200
+ ret = btrfs_rm_dev_item(device);
19442201 if (ret)
19452202 goto error_undo;
19462203
19472204 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
1948
- btrfs_scrub_cancel_dev(fs_info, device);
2205
+ btrfs_scrub_cancel_dev(device);
19492206
19502207 /*
19512208 * the device list mutex makes sure that we don't change
....@@ -1980,7 +2237,7 @@
19802237 if (device->bdev) {
19812238 cur_devices->open_devices--;
19822239 /* remove sysfs entry */
1983
- btrfs_sysfs_rm_device_link(fs_devices, device);
2240
+ btrfs_sysfs_remove_device(device);
19842241 }
19852242
19862243 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
....@@ -1993,29 +2250,24 @@
19932250 * supers and free the device.
19942251 */
19952252 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
1996
- btrfs_scratch_superblocks(device->bdev, device->name->str);
2253
+ btrfs_scratch_superblocks(fs_info, device->bdev,
2254
+ device->name->str);
19972255
19982256 btrfs_close_bdev(device);
1999
- call_rcu(&device->rcu, free_device_rcu);
2257
+ synchronize_rcu();
2258
+ btrfs_free_device(device);
20002259
20012260 if (cur_devices->open_devices == 0) {
2002
- while (fs_devices) {
2003
- if (fs_devices->seed == cur_devices) {
2004
- fs_devices->seed = cur_devices->seed;
2005
- break;
2006
- }
2007
- fs_devices = fs_devices->seed;
2008
- }
2009
- cur_devices->seed = NULL;
2261
+ list_del_init(&cur_devices->seed_list);
20102262 close_fs_devices(cur_devices);
20112263 free_fs_devices(cur_devices);
20122264 }
20132265
20142266 out:
2015
- mutex_unlock(&uuid_mutex);
20162267 return ret;
20172268
20182269 error_undo:
2270
+ btrfs_reada_undo_remove_dev(device);
20192271 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
20202272 mutex_lock(&fs_info->chunk_mutex);
20212273 list_add(&device->dev_alloc_list,
....@@ -2053,23 +2305,18 @@
20532305 fs_devices->open_devices--;
20542306 }
20552307
2056
-void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2057
- struct btrfs_device *srcdev)
2308
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
20582309 {
20592310 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
20602311
2061
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2062
- /* zero out the old super if it is writable */
2063
- btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2064
- }
2312
+ mutex_lock(&uuid_mutex);
20652313
20662314 btrfs_close_bdev(srcdev);
2067
- call_rcu(&srcdev->rcu, free_device_rcu);
2315
+ synchronize_rcu();
2316
+ btrfs_free_device(srcdev);
20682317
20692318 	/* if there are no devs we'd rather delete the fs_devices */
20702319 if (!fs_devices->num_devices) {
2071
- struct btrfs_fs_devices *tmp_fs_devices;
2072
-
20732320 /*
20742321 * On a mounted FS, num_devices can't be zero unless it's a
20752322 * seed. In case of a seed device being replaced, the replace
....@@ -2078,28 +2325,20 @@
20782325 */
20792326 ASSERT(fs_devices->seeding);
20802327
2081
- tmp_fs_devices = fs_info->fs_devices;
2082
- while (tmp_fs_devices) {
2083
- if (tmp_fs_devices->seed == fs_devices) {
2084
- tmp_fs_devices->seed = fs_devices->seed;
2085
- break;
2086
- }
2087
- tmp_fs_devices = tmp_fs_devices->seed;
2088
- }
2089
- fs_devices->seed = NULL;
2328
+ list_del_init(&fs_devices->seed_list);
20902329 close_fs_devices(fs_devices);
20912330 free_fs_devices(fs_devices);
20922331 }
2332
+ mutex_unlock(&uuid_mutex);
20932333 }
20942334
20952335 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
20962336 {
20972337 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
20982338
2099
- WARN_ON(!tgtdev);
21002339 mutex_lock(&fs_devices->device_list_mutex);
21012340
2102
- btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
2341
+ btrfs_sysfs_remove_device(tgtdev);
21032342
21042343 if (tgtdev->bdev)
21052344 fs_devices->open_devices--;
....@@ -2119,90 +2358,77 @@
21192358 * is already out of device list, so we don't have to hold
21202359 * the device_list_mutex lock.
21212360 */
2122
- btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2361
+ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2362
+ tgtdev->name->str);
21232363
21242364 btrfs_close_bdev(tgtdev);
2125
- call_rcu(&tgtdev->rcu, free_device_rcu);
2365
+ synchronize_rcu();
2366
+ btrfs_free_device(tgtdev);
21262367 }
21272368
2128
-static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2129
- const char *device_path,
2130
- struct btrfs_device **device)
2369
+static struct btrfs_device *btrfs_find_device_by_path(
2370
+ struct btrfs_fs_info *fs_info, const char *device_path)
21312371 {
21322372 int ret = 0;
21332373 struct btrfs_super_block *disk_super;
21342374 u64 devid;
21352375 u8 *dev_uuid;
21362376 struct block_device *bdev;
2137
- struct buffer_head *bh;
2377
+ struct btrfs_device *device;
21382378
2139
- *device = NULL;
21402379 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2141
- fs_info->bdev_holder, 0, &bdev, &bh);
2380
+ fs_info->bdev_holder, 0, &bdev, &disk_super);
21422381 if (ret)
2143
- return ret;
2144
- disk_super = (struct btrfs_super_block *)bh->b_data;
2382
+ return ERR_PTR(ret);
2383
+
21452384 devid = btrfs_stack_device_id(&disk_super->dev_item);
21462385 dev_uuid = disk_super->dev_item.uuid;
2147
- *device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2148
- disk_super->fsid, true);
2149
- brelse(bh);
2150
- if (!*device)
2151
- ret = -ENOENT;
2386
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2387
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2388
+ disk_super->metadata_uuid, true);
2389
+ else
2390
+ device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2391
+ disk_super->fsid, true);
2392
+
2393
+ btrfs_release_disk_super(disk_super);
2394
+ if (!device)
2395
+ device = ERR_PTR(-ENOENT);
21522396 blkdev_put(bdev, FMODE_READ);
2153
- return ret;
2154
-}
2155
-
2156
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2157
- const char *device_path,
2158
- struct btrfs_device **device)
2159
-{
2160
- *device = NULL;
2161
- if (strcmp(device_path, "missing") == 0) {
2162
- struct list_head *devices;
2163
- struct btrfs_device *tmp;
2164
-
2165
- devices = &fs_info->fs_devices->devices;
2166
- list_for_each_entry(tmp, devices, dev_list) {
2167
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2168
- &tmp->dev_state) && !tmp->bdev) {
2169
- *device = tmp;
2170
- break;
2171
- }
2172
- }
2173
-
2174
- if (!*device)
2175
- return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2176
-
2177
- return 0;
2178
- } else {
2179
- return btrfs_find_device_by_path(fs_info, device_path, device);
2180
- }
2397
+ return device;
21812398 }
21822399
21832400 /*
21842401 * Lookup a device given by device id, or the path if the id is 0.
21852402 */
2186
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2187
- const char *devpath,
2188
- struct btrfs_device **device)
2403
+struct btrfs_device *btrfs_find_device_by_devspec(
2404
+ struct btrfs_fs_info *fs_info, u64 devid,
2405
+ const char *device_path)
21892406 {
2190
- int ret;
2407
+ struct btrfs_device *device;
21912408
21922409 if (devid) {
2193
- ret = 0;
2194
- *device = btrfs_find_device(fs_info->fs_devices, devid,
2195
- NULL, NULL, true);
2196
- if (!*device)
2197
- ret = -ENOENT;
2198
- } else {
2199
- if (!devpath || !devpath[0])
2200
- return -EINVAL;
2201
-
2202
- ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2203
- device);
2410
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2411
+ NULL, true);
2412
+ if (!device)
2413
+ return ERR_PTR(-ENOENT);
2414
+ return device;
22042415 }
2205
- return ret;
2416
+
2417
+ if (!device_path || !device_path[0])
2418
+ return ERR_PTR(-EINVAL);
2419
+
2420
+ if (strcmp(device_path, "missing") == 0) {
2421
+ /* Find first missing device */
2422
+ list_for_each_entry(device, &fs_info->fs_devices->devices,
2423
+ dev_list) {
2424
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2425
+ &device->dev_state) && !device->bdev)
2426
+ return device;
2427
+ }
2428
+ return ERR_PTR(-ENOENT);
2429
+ }
2430
+
2431
+ return btrfs_find_device_by_path(fs_info, device_path);
22062432 }
22072433
22082434 /*
....@@ -2221,10 +2447,20 @@
22212447 if (!fs_devices->seeding)
22222448 return -EINVAL;
22232449
2224
- seed_devices = alloc_fs_devices(NULL);
2450
+ /*
2451
+ * Private copy of the seed devices, anchored at
2452
+ * fs_info->fs_devices->seed_list
2453
+ */
2454
+ seed_devices = alloc_fs_devices(NULL, NULL);
22252455 if (IS_ERR(seed_devices))
22262456 return PTR_ERR(seed_devices);
22272457
2458
+ /*
2459
+ * It's necessary to retain a copy of the original seed fs_devices in
2460
+ * fs_uuids so that filesystems which have been seeded can successfully
2461
+ * reference the seed device from open_seed_devices. This also supports
2462
+	 * multiple seed filesystems.
2463
+ */
22282464 old_devices = clone_fs_devices(fs_devices);
22292465 if (IS_ERR(old_devices)) {
22302466 kfree(seed_devices);
....@@ -2245,19 +2481,15 @@
22452481 list_for_each_entry(device, &seed_devices->devices, dev_list)
22462482 device->fs_devices = seed_devices;
22472483
2248
- mutex_lock(&fs_info->chunk_mutex);
2249
- list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2250
- mutex_unlock(&fs_info->chunk_mutex);
2251
-
2252
- fs_devices->seeding = 0;
2484
+ fs_devices->seeding = false;
22532485 fs_devices->num_devices = 0;
22542486 fs_devices->open_devices = 0;
22552487 fs_devices->missing_devices = 0;
2256
- fs_devices->rotating = 0;
2257
- fs_devices->seed = seed_devices;
2488
+ fs_devices->rotating = false;
2489
+ list_add(&seed_devices->seed_list, &fs_devices->seed_list);
22582490
22592491 generate_random_uuid(fs_devices->fsid);
2260
- memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2492
+ memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
22612493 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
22622494 mutex_unlock(&fs_devices->device_list_mutex);
22632495
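A loose userspace model of this handoff (illustrative only; the struct stands in for btrfs_fs_devices and models just the device-count bookkeeping, not the locking, list splicing, or UUID handling):

#include <stdio.h>

struct fs_devices {
	int num_devices;
	int seeding;
	struct fs_devices *seed;	/* stands in for the seed_list link */
};

int main(void)
{
	static struct fs_devices seed_copy;
	struct fs_devices fsd = { .num_devices = 1, .seeding = 1 };

	seed_copy = fsd;		/* private copy takes over the devices */
	fsd.num_devices = 0;		/* the sprout starts with no devices */
	fsd.seeding = 0;		/* and is itself writable */
	fsd.seed = &seed_copy;		/* the seed stays reachable for reads */

	printf("sprout=%d devs, seed=%d devs\n",
	       fsd.num_devices, fsd.seed->num_devices);
	return 0;
}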
....@@ -2271,9 +2503,9 @@
22712503 /*
22722504 * Store the expected generation for seed devices in device items.
22732505 */
2274
-static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2275
- struct btrfs_fs_info *fs_info)
2506
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
22762507 {
2508
+ struct btrfs_fs_info *fs_info = trans->fs_info;
22772509 struct btrfs_root *root = fs_info->chunk_root;
22782510 struct btrfs_path *path;
22792511 struct extent_buffer *leaf;
....@@ -2357,7 +2589,7 @@
23572589 u64 orig_super_num_devices;
23582590 int seeding_dev = 0;
23592591 int ret = 0;
2360
- bool unlocked = false;
2592
+ bool locked = false;
23612593
23622594 if (sb_rdonly(sb) && !fs_devices->seeding)
23632595 return -EROFS;
....@@ -2371,20 +2603,20 @@
23712603 seeding_dev = 1;
23722604 down_write(&sb->s_umount);
23732605 mutex_lock(&uuid_mutex);
2606
+ locked = true;
23742607 }
23752608
2376
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
2609
+ sync_blockdev(bdev);
23772610
2378
- mutex_lock(&fs_devices->device_list_mutex);
2379
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
2611
+ rcu_read_lock();
2612
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
23802613 if (device->bdev == bdev) {
23812614 ret = -EEXIST;
2382
- mutex_unlock(
2383
- &fs_devices->device_list_mutex);
2615
+ rcu_read_unlock();
23842616 goto error;
23852617 }
23862618 }
2387
- mutex_unlock(&fs_devices->device_list_mutex);
2619
+ rcu_read_unlock();
23882620
23892621 device = btrfs_alloc_device(fs_info, NULL, NULL);
23902622 if (IS_ERR(device)) {
....@@ -2448,7 +2680,7 @@
24482680 atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
24492681
24502682 if (!blk_queue_nonrot(q))
2451
- fs_devices->rotating = 1;
2683
+ fs_devices->rotating = true;
24522684
24532685 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
24542686 btrfs_set_super_total_bytes(fs_info->super_copy,
....@@ -2468,13 +2700,13 @@
24682700 mutex_unlock(&fs_info->chunk_mutex);
24692701
24702702 /* Add sysfs device entry */
2471
- btrfs_sysfs_add_device_link(fs_devices, device);
2703
+ btrfs_sysfs_add_device(device);
24722704
24732705 mutex_unlock(&fs_devices->device_list_mutex);
24742706
24752707 if (seeding_dev) {
24762708 mutex_lock(&fs_info->chunk_mutex);
2477
- ret = init_first_rw_device(trans, fs_info);
2709
+ ret = init_first_rw_device(trans);
24782710 mutex_unlock(&fs_info->chunk_mutex);
24792711 if (ret) {
24802712 btrfs_abort_transaction(trans, ret);
....@@ -2489,22 +2721,17 @@
24892721 }
24902722
24912723 if (seeding_dev) {
2492
- char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2493
-
2494
- ret = btrfs_finish_sprout(trans, fs_info);
2724
+ ret = btrfs_finish_sprout(trans);
24952725 if (ret) {
24962726 btrfs_abort_transaction(trans, ret);
24972727 goto error_sysfs;
24982728 }
24992729
2500
- /* Sprouting would change fsid of the mounted root,
2501
- * so rename the fsid on the sysfs
2730
+ /*
2731
+ * fs_devices now represents the newly sprouted filesystem and
2732
+ * its fsid has been changed by btrfs_prepare_sprout
25022733 */
2503
- snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2504
- fs_info->fsid);
2505
- if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
2506
- btrfs_warn(fs_info,
2507
- "sysfs: failed to create fsid for sprout");
2734
+ btrfs_sysfs_update_sprout_fsid(fs_devices);
25082735 }
25092736
25102737 ret = btrfs_commit_transaction(trans);
....@@ -2512,7 +2739,7 @@
25122739 if (seeding_dev) {
25132740 mutex_unlock(&uuid_mutex);
25142741 up_write(&sb->s_umount);
2515
- unlocked = true;
2742
+ locked = false;
25162743
25172744 if (ret) /* transaction commit */
25182745 return ret;
....@@ -2532,12 +2759,22 @@
25322759 ret = btrfs_commit_transaction(trans);
25332760 }
25342761
2535
- /* Update ctime/mtime for libblkid */
2762
+ /*
2763
+ * Now that we have written a new super block to this device, check all
2764
+	 * other fs_devices lists to see whether device_path alienates any other
2765
+	 * scanned device.
2766
+ * We can ignore the return value as it typically returns -EINVAL and
2767
+ * only succeeds if the device was an alien.
2768
+ */
2769
+ btrfs_forget_devices(device_path);
2770
+
2771
+ /* Update ctime/mtime for blkid or udev */
25362772 update_dev_time(device_path);
2773
+
25372774 return ret;
25382775
25392776 error_sysfs:
2540
- btrfs_sysfs_rm_device_link(fs_devices, device);
2777
+ btrfs_sysfs_remove_device(device);
25412778 mutex_lock(&fs_info->fs_devices->device_list_mutex);
25422779 mutex_lock(&fs_info->chunk_mutex);
25432780 list_del_rcu(&device->dev_list);
....@@ -2563,7 +2800,7 @@
25632800 btrfs_free_device(device);
25642801 error:
25652802 blkdev_put(bdev, FMODE_EXCL);
2566
- if (seeding_dev && !unlocked) {
2803
+ if (locked) {
25672804 mutex_unlock(&uuid_mutex);
25682805 up_write(&sb->s_umount);
25692806 }
....@@ -2621,7 +2858,6 @@
26212858 {
26222859 struct btrfs_fs_info *fs_info = device->fs_info;
26232860 struct btrfs_super_block *super_copy = fs_info->super_copy;
2624
- struct btrfs_fs_devices *fs_devices;
26252861 u64 old_total;
26262862 u64 diff;
26272863
....@@ -2640,8 +2876,6 @@
26402876 return -EINVAL;
26412877 }
26422878
2643
- fs_devices = fs_info->fs_devices;
2644
-
26452879 btrfs_set_super_total_bytes(super_copy,
26462880 round_down(old_total + diff, fs_info->sectorsize));
26472881 device->fs_devices->total_rw_bytes += diff;
....@@ -2649,9 +2883,9 @@
26492883 btrfs_device_set_total_bytes(device, new_size);
26502884 btrfs_device_set_disk_total_bytes(device, new_size);
26512885 btrfs_clear_space_info_full(device->fs_info);
2652
- if (list_empty(&device->resized_list))
2653
- list_add_tail(&device->resized_list,
2654
- &fs_devices->resized_devices);
2886
+ if (list_empty(&device->post_commit_list))
2887
+ list_add_tail(&device->post_commit_list,
2888
+ &trans->transaction->dev_update_list);
26552889 mutex_unlock(&fs_info->chunk_mutex);
26562890
26572891 return btrfs_update_device(trans, device);
....@@ -2739,13 +2973,20 @@
27392973 return ret;
27402974 }
27412975
2742
-static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2743
- u64 logical, u64 length)
2976
+/*
2977
+ * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2978
+ * @logical: Logical block offset in bytes.
2979
+ * @length: Length of extent in bytes.
2980
+ *
2981
+ * Return: Chunk mapping or ERR_PTR.
2982
+ */
2983
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2984
+ u64 logical, u64 length)
27442985 {
27452986 struct extent_map_tree *em_tree;
27462987 struct extent_map *em;
27472988
2748
- em_tree = &fs_info->mapping_tree.map_tree;
2989
+ em_tree = &fs_info->mapping_tree;
27492990 read_lock(&em_tree->lock);
27502991 em = lookup_extent_mapping(em_tree, logical, length);
27512992 read_unlock(&em_tree->lock);
....@@ -2777,7 +3018,7 @@
27773018 int i, ret = 0;
27783019 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
27793020
2780
- em = get_chunk_map(fs_info, chunk_offset, 1);
3021
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
27813022 if (IS_ERR(em)) {
27823023 /*
27833024 * This is a logic error, but we don't want to just rely on the
....@@ -2818,13 +3059,11 @@
28183059 mutex_unlock(&fs_info->chunk_mutex);
28193060 }
28203061
2821
- if (map->stripes[i].dev) {
2822
- ret = btrfs_update_device(trans, map->stripes[i].dev);
2823
- if (ret) {
2824
- mutex_unlock(&fs_devices->device_list_mutex);
2825
- btrfs_abort_transaction(trans, ret);
2826
- goto out;
2827
- }
3062
+ ret = btrfs_update_device(trans, device);
3063
+ if (ret) {
3064
+ mutex_unlock(&fs_devices->device_list_mutex);
3065
+ btrfs_abort_transaction(trans, ret);
3066
+ goto out;
28283067 }
28293068 }
28303069 mutex_unlock(&fs_devices->device_list_mutex);
....@@ -2861,6 +3100,7 @@
28613100 {
28623101 struct btrfs_root *root = fs_info->chunk_root;
28633102 struct btrfs_trans_handle *trans;
3103
+ struct btrfs_block_group *block_group;
28643104 int ret;
28653105
28663106 /*
....@@ -2877,10 +3117,6 @@
28773117 */
28783118 lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
28793119
2880
- ret = btrfs_can_relocate(fs_info, chunk_offset);
2881
- if (ret)
2882
- return -ENOSPC;
2883
-
28843120 /* step one, relocate all the extents inside this chunk */
28853121 btrfs_scrub_pause(fs_info);
28863122 ret = btrfs_relocate_block_group(fs_info, chunk_offset);
....@@ -2888,15 +3124,11 @@
28883124 if (ret)
28893125 return ret;
28903126
2891
- /*
2892
- * We add the kobjects here (and after forcing data chunk creation)
2893
- * since relocation is the only place we'll create chunks of a new
2894
- * type at runtime. The only place where we'll remove the last
2895
- * chunk of a type is the call immediately below this one. Even
2896
- * so, we're protected against races with the cleaner thread since
2897
- * we're covered by the delete_unused_bgs_mutex.
2898
- */
2899
- btrfs_add_raid_kobjects(fs_info);
3127
+ block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3128
+ if (!block_group)
3129
+ return -ENOENT;
3130
+ btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3131
+ btrfs_put_block_group(block_group);
29003132
29013133 trans = btrfs_start_trans_remove_block_group(root->fs_info,
29023134 chunk_offset);
....@@ -2997,7 +3229,7 @@
29973229 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
29983230 u64 chunk_offset)
29993231 {
3000
- struct btrfs_block_group_cache *cache;
3232
+ struct btrfs_block_group *cache;
30013233 u64 bytes_used;
30023234 u64 chunk_type;
30033235
....@@ -3006,30 +3238,28 @@
30063238 chunk_type = cache->flags;
30073239 btrfs_put_block_group(cache);
30083240
3009
- if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
3010
- spin_lock(&fs_info->data_sinfo->lock);
3011
- bytes_used = fs_info->data_sinfo->bytes_used;
3012
- spin_unlock(&fs_info->data_sinfo->lock);
3241
+ if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3242
+ return 0;
30133243
3014
- if (!bytes_used) {
3015
- struct btrfs_trans_handle *trans;
3016
- int ret;
3244
+ spin_lock(&fs_info->data_sinfo->lock);
3245
+ bytes_used = fs_info->data_sinfo->bytes_used;
3246
+ spin_unlock(&fs_info->data_sinfo->lock);
30173247
3018
- trans = btrfs_join_transaction(fs_info->tree_root);
3019
- if (IS_ERR(trans))
3020
- return PTR_ERR(trans);
3248
+ if (!bytes_used) {
3249
+ struct btrfs_trans_handle *trans;
3250
+ int ret;
30213251
3022
- ret = btrfs_force_chunk_alloc(trans,
3023
- BTRFS_BLOCK_GROUP_DATA);
3024
- btrfs_end_transaction(trans);
3025
- if (ret < 0)
3026
- return ret;
3252
+ trans = btrfs_join_transaction(fs_info->tree_root);
3253
+ if (IS_ERR(trans))
3254
+ return PTR_ERR(trans);
30273255
3028
- btrfs_add_raid_kobjects(fs_info);
3029
-
3030
- return 1;
3031
- }
3256
+ ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3257
+ btrfs_end_transaction(trans);
3258
+ if (ret < 0)
3259
+ return ret;
3260
+ return 1;
30323261 }
3262
+
30333263 return 0;
30343264 }
30353265
....@@ -3099,7 +3329,7 @@
30993329 if (!path)
31003330 return -ENOMEM;
31013331
3102
- trans = btrfs_start_transaction(root, 0);
3332
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
31033333 if (IS_ERR(trans)) {
31043334 btrfs_free_path(path);
31053335 return PTR_ERR(trans);
....@@ -3208,28 +3438,28 @@
32083438 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
32093439 struct btrfs_balance_args *bargs)
32103440 {
3211
- struct btrfs_block_group_cache *cache;
3441
+ struct btrfs_block_group *cache;
32123442 u64 chunk_used;
32133443 u64 user_thresh_min;
32143444 u64 user_thresh_max;
32153445 int ret = 1;
32163446
32173447 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3218
- chunk_used = btrfs_block_group_used(&cache->item);
3448
+ chunk_used = cache->used;
32193449
32203450 if (bargs->usage_min == 0)
32213451 user_thresh_min = 0;
32223452 else
3223
- user_thresh_min = div_factor_fine(cache->key.offset,
3224
- bargs->usage_min);
3453
+ user_thresh_min = div_factor_fine(cache->length,
3454
+ bargs->usage_min);
32253455
32263456 if (bargs->usage_max == 0)
32273457 user_thresh_max = 1;
32283458 else if (bargs->usage_max > 100)
3229
- user_thresh_max = cache->key.offset;
3459
+ user_thresh_max = cache->length;
32303460 else
3231
- user_thresh_max = div_factor_fine(cache->key.offset,
3232
- bargs->usage_max);
3461
+ user_thresh_max = div_factor_fine(cache->length,
3462
+ bargs->usage_max);
32333463
32343464 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
32353465 ret = 0;
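Both usage filters reduce to the same arithmetic: div_factor_fine(num, factor) computes num * factor / 100. A worked standalone example of the range check, with made-up numbers and a local reimplementation of the helper:

#include <stdio.h>

typedef unsigned long long u64;

static u64 div_factor_fine(u64 num, int factor)
{
	if (factor == 100)
		return num;
	return num * factor / 100;
}

int main(void)
{
	u64 length = 1ULL << 30;		/* 1 GiB block group */
	u64 used = 200ULL << 20;		/* 200 MiB used */
	u64 lo = div_factor_fine(length, 10);	/* usage_min=10 -> ~102 MiB */
	u64 hi = div_factor_fine(length, 50);	/* usage_max=50 -> 512 MiB */

	/* prints "balance": 102 MiB <= 200 MiB < 512 MiB */
	printf("min=%llu max=%llu used=%llu -> %s\n", lo, hi, used,
	       (lo <= used && used < hi) ? "balance" : "skip");
	return 0;
}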
....@@ -3241,20 +3471,19 @@
32413471 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
32423472 u64 chunk_offset, struct btrfs_balance_args *bargs)
32433473 {
3244
- struct btrfs_block_group_cache *cache;
3474
+ struct btrfs_block_group *cache;
32453475 u64 chunk_used, user_thresh;
32463476 int ret = 1;
32473477
32483478 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3249
- chunk_used = btrfs_block_group_used(&cache->item);
3479
+ chunk_used = cache->used;
32503480
32513481 if (bargs->usage_min == 0)
32523482 user_thresh = 1;
32533483 else if (bargs->usage > 100)
3254
- user_thresh = cache->key.offset;
3484
+ user_thresh = cache->length;
32553485 else
3256
- user_thresh = div_factor_fine(cache->key.offset,
3257
- bargs->usage);
3486
+ user_thresh = div_factor_fine(cache->length, bargs->usage);
32583487
32593488 if (chunk_used < user_thresh)
32603489 ret = 0;
....@@ -3280,6 +3509,18 @@
32803509 return 1;
32813510 }
32823511
3512
+static u64 calc_data_stripes(u64 type, int num_stripes)
3513
+{
3514
+ const int index = btrfs_bg_flags_to_raid_index(type);
3515
+ const int ncopies = btrfs_raid_array[index].ncopies;
3516
+ const int nparity = btrfs_raid_array[index].nparity;
3517
+
3518
+ if (nparity)
3519
+ return num_stripes - nparity;
3520
+ else
3521
+ return num_stripes / ncopies;
3522
+}
3523
+
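A quick standalone check of this helper, with ncopies/nparity values matching the raid table at the top of this file:

#include <stdio.h>

typedef unsigned long long u64;

static u64 calc_data_stripes(int ncopies, int nparity, int num_stripes)
{
	if (nparity)
		return num_stripes - nparity;
	return num_stripes / ncopies;
}

int main(void)
{
	/* raid1 (ncopies=2): 2 stripes carry 1 stripe of data */
	printf("raid1:  %llu\n", calc_data_stripes(2, 0, 2));
	/* raid6 (nparity=2): 6 stripes carry 4 stripes of data */
	printf("raid6:  %llu\n", calc_data_stripes(1, 2, 6));
	/* raid10 (ncopies=2): 8 stripes carry 4 stripes of data */
	printf("raid10: %llu\n", calc_data_stripes(2, 0, 8));
	return 0;
}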
32833524 /* [pstart, pend) */
32843525 static int chunk_drange_filter(struct extent_buffer *leaf,
32853526 struct btrfs_chunk *chunk,
....@@ -3289,22 +3530,15 @@
32893530 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
32903531 u64 stripe_offset;
32913532 u64 stripe_length;
3533
+ u64 type;
32923534 int factor;
32933535 int i;
32943536
32953537 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
32963538 return 0;
32973539
3298
- if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
3299
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3300
- factor = num_stripes / 2;
3301
- } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3302
- factor = num_stripes - 1;
3303
- } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3304
- factor = num_stripes - 2;
3305
- } else {
3306
- factor = num_stripes;
3307
- }
3540
+ type = btrfs_chunk_type(leaf, chunk);
3541
+ factor = calc_data_stripes(type, num_stripes);
33083542
33093543 for (i = 0; i < num_stripes; i++) {
33103544 stripe = btrfs_stripe_nr(chunk, i);
....@@ -3365,10 +3599,10 @@
33653599 return 0;
33663600 }
33673601
3368
-static int should_balance_chunk(struct btrfs_fs_info *fs_info,
3369
- struct extent_buffer *leaf,
3602
+static int should_balance_chunk(struct extent_buffer *leaf,
33703603 struct btrfs_chunk *chunk, u64 chunk_offset)
33713604 {
3605
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
33723606 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
33733607 struct btrfs_balance_args *bargs = NULL;
33743608 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
....@@ -3458,17 +3692,11 @@
34583692 {
34593693 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
34603694 struct btrfs_root *chunk_root = fs_info->chunk_root;
3461
- struct btrfs_root *dev_root = fs_info->dev_root;
3462
- struct list_head *devices;
3463
- struct btrfs_device *device;
3464
- u64 old_size;
3465
- u64 size_to_free;
34663695 u64 chunk_type;
34673696 struct btrfs_chunk *chunk;
34683697 struct btrfs_path *path = NULL;
34693698 struct btrfs_key key;
34703699 struct btrfs_key found_key;
3471
- struct btrfs_trans_handle *trans;
34723700 struct extent_buffer *leaf;
34733701 int slot;
34743702 int ret;
....@@ -3483,53 +3711,6 @@
34833711 u32 count_sys = 0;
34843712 int chunk_reserved = 0;
34853713
3486
- /* step one make some room on all the devices */
3487
- devices = &fs_info->fs_devices->devices;
3488
- list_for_each_entry(device, devices, dev_list) {
3489
- old_size = btrfs_device_get_total_bytes(device);
3490
- size_to_free = div_factor(old_size, 1);
3491
- size_to_free = min_t(u64, size_to_free, SZ_1M);
3492
- if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
3493
- btrfs_device_get_total_bytes(device) -
3494
- btrfs_device_get_bytes_used(device) > size_to_free ||
3495
- test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
3496
- continue;
3497
-
3498
- ret = btrfs_shrink_device(device, old_size - size_to_free);
3499
- if (ret == -ENOSPC)
3500
- break;
3501
- if (ret) {
3502
- /* btrfs_shrink_device never returns ret > 0 */
3503
- WARN_ON(ret > 0);
3504
- goto error;
3505
- }
3506
-
3507
- trans = btrfs_start_transaction(dev_root, 0);
3508
- if (IS_ERR(trans)) {
3509
- ret = PTR_ERR(trans);
3510
- btrfs_info_in_rcu(fs_info,
3511
- "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
3512
- rcu_str_deref(device->name), ret,
3513
- old_size, old_size - size_to_free);
3514
- goto error;
3515
- }
3516
-
3517
- ret = btrfs_grow_device(trans, device, old_size);
3518
- if (ret) {
3519
- btrfs_end_transaction(trans);
3520
- /* btrfs_grow_device never returns ret > 0 */
3521
- WARN_ON(ret > 0);
3522
- btrfs_info_in_rcu(fs_info,
3523
- "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
3524
- rcu_str_deref(device->name), ret,
3525
- old_size, old_size - size_to_free);
3526
- goto error;
3527
- }
3528
-
3529
- btrfs_end_transaction(trans);
3530
- }
3531
-
3532
- /* step two, relocate all the chunks */
35333714 path = btrfs_alloc_path();
35343715 if (!path) {
35353716 ret = -ENOMEM;
....@@ -3601,8 +3782,7 @@
36013782 spin_unlock(&fs_info->balance_lock);
36023783 }
36033784
3604
- ret = should_balance_chunk(fs_info, leaf, chunk,
3605
- found_key.offset);
3785
+ ret = should_balance_chunk(leaf, chunk, found_key.offset);
36063786
36073787 btrfs_release_path(path);
36083788 if (!ret) {
....@@ -3659,10 +3839,15 @@
36593839
36603840 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
36613841 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3662
- if (ret && ret != -ENOSPC)
3663
- goto error;
36643842 if (ret == -ENOSPC) {
36653843 enospc_errors++;
3844
+ } else if (ret == -ETXTBSY) {
3845
+ btrfs_info(fs_info,
3846
+ "skipping relocation of block group %llu due to active swapfile",
3847
+ found_key.offset);
3848
+ ret = 0;
3849
+ } else if (ret) {
3850
+ goto error;
36663851 } else {
36673852 spin_lock(&fs_info->balance_lock);
36683853 bctl->stat.completed++;
....@@ -3711,8 +3896,7 @@
37113896 if (flags == 0)
37123897 return !extended; /* "0" is valid for usual profiles */
37133898
3714
- /* true if exactly one bit set */
3715
- return (flags & (flags - 1)) == 0;
3899
+ return has_single_bit_set(flags);
37163900 }
37173901
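has_single_bit_set() wraps the same classic power-of-two test that the removed line spelled out: clearing the lowest set bit with n & (n - 1) leaves zero iff exactly one bit was set. A minimal demonstration (flags == 0 is filtered out by the caller above, so the helper here rejects it explicitly):

#include <assert.h>

static int single_bit_set(unsigned long long flags)
{
	return flags != 0 && (flags & (flags - 1)) == 0;
}

int main(void)
{
	assert(single_bit_set(0x8));	/* one profile bit: valid */
	assert(!single_bit_set(0xA));	/* two bits set: invalid mix */
	assert(!single_bit_set(0));	/* zero is handled separately */
	return 0;
}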
37183902 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
....@@ -3723,13 +3907,179 @@
37233907 atomic_read(&fs_info->balance_cancel_req) == 0);
37243908 }
37253909
3726
-/* Non-zero return value signifies invalidity */
3727
-static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
3728
- u64 allowed)
3910
+/*
3911
+ * Validate target profile against allowed profiles and return true if it's OK.
3912
+ * Otherwise print the error message and return false.
3913
+ */
3914
+static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3915
+ const struct btrfs_balance_args *bargs,
3916
+ u64 allowed, const char *type)
37293917 {
3730
- return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3731
- (!alloc_profile_is_valid(bctl_arg->target, 1) ||
3732
- (bctl_arg->target & ~allowed)));
3918
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3919
+ return true;
3920
+
3921
+ /* Profile is valid and does not have bits outside of the allowed set */
3922
+ if (alloc_profile_is_valid(bargs->target, 1) &&
3923
+ (bargs->target & ~allowed) == 0)
3924
+ return true;
3925
+
3926
+ btrfs_err(fs_info, "balance: invalid convert %s profile %s",
3927
+ type, btrfs_bg_type_to_raid_name(bargs->target));
3928
+ return false;
3929
+}
3930
+
3931
+/*
3932
+ * Fill @buf with textual description of balance filter flags @bargs, up to
3933
+ * @size_buf including the terminating null. The output may be trimmed if it
3934
+ * does not fit into the provided buffer.
3935
+ */
3936
+static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3937
+ u32 size_buf)
3938
+{
3939
+ int ret;
3940
+ u32 size_bp = size_buf;
3941
+ char *bp = buf;
3942
+ u64 flags = bargs->flags;
3943
+ char tmp_buf[128] = {'\0'};
3944
+
3945
+ if (!flags)
3946
+ return;
3947
+
3948
+#define CHECK_APPEND_NOARG(a) \
3949
+ do { \
3950
+ ret = snprintf(bp, size_bp, (a)); \
3951
+ if (ret < 0 || ret >= size_bp) \
3952
+ goto out_overflow; \
3953
+ size_bp -= ret; \
3954
+ bp += ret; \
3955
+ } while (0)
3956
+
3957
+#define CHECK_APPEND_1ARG(a, v1) \
3958
+ do { \
3959
+ ret = snprintf(bp, size_bp, (a), (v1)); \
3960
+ if (ret < 0 || ret >= size_bp) \
3961
+ goto out_overflow; \
3962
+ size_bp -= ret; \
3963
+ bp += ret; \
3964
+ } while (0)
3965
+
3966
+#define CHECK_APPEND_2ARG(a, v1, v2) \
3967
+ do { \
3968
+ ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
3969
+ if (ret < 0 || ret >= size_bp) \
3970
+ goto out_overflow; \
3971
+ size_bp -= ret; \
3972
+ bp += ret; \
3973
+ } while (0)
3974
+
3975
+ if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3976
+ CHECK_APPEND_1ARG("convert=%s,",
3977
+ btrfs_bg_type_to_raid_name(bargs->target));
3978
+
3979
+ if (flags & BTRFS_BALANCE_ARGS_SOFT)
3980
+ CHECK_APPEND_NOARG("soft,");
3981
+
3982
+ if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3983
+ btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3984
+ sizeof(tmp_buf));
3985
+ CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3986
+ }
3987
+
3988
+ if (flags & BTRFS_BALANCE_ARGS_USAGE)
3989
+ CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
3990
+
3991
+ if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
3992
+ CHECK_APPEND_2ARG("usage=%u..%u,",
3993
+ bargs->usage_min, bargs->usage_max);
3994
+
3995
+ if (flags & BTRFS_BALANCE_ARGS_DEVID)
3996
+ CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
3997
+
3998
+ if (flags & BTRFS_BALANCE_ARGS_DRANGE)
3999
+ CHECK_APPEND_2ARG("drange=%llu..%llu,",
4000
+ bargs->pstart, bargs->pend);
4001
+
4002
+ if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4003
+ CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4004
+ bargs->vstart, bargs->vend);
4005
+
4006
+ if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4007
+ CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4008
+
4009
+ if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4010
+ CHECK_APPEND_2ARG("limit=%u..%u,",
4011
+ bargs->limit_min, bargs->limit_max);
4012
+
4013
+ if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4014
+ CHECK_APPEND_2ARG("stripes=%u..%u,",
4015
+ bargs->stripes_min, bargs->stripes_max);
4016
+
4017
+#undef CHECK_APPEND_2ARG
4018
+#undef CHECK_APPEND_1ARG
4019
+#undef CHECK_APPEND_NOARG
4020
+
4021
+out_overflow:
4022
+
4023
+ if (size_bp < size_buf)
4024
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
4025
+ else
4026
+ buf[0] = '\0';
4027
+}
4028
+
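A minimal userspace sketch of the append pattern behind these macros, assuming nothing beyond snprintf(): a return value at or past the remaining space signals truncation, so the writer jumps to the overflow label and the trailing separator is trimmed:

#include <stdio.h>

static void describe(char *buf, unsigned int size_buf)
{
	char *bp = buf;
	unsigned int size_bp = size_buf;
	int ret;

	ret = snprintf(bp, size_bp, "convert=%s,", "raid1");
	if (ret < 0 || (unsigned int)ret >= size_bp)
		goto out_overflow;
	size_bp -= ret;
	bp += ret;

	ret = snprintf(bp, size_bp, "usage=%u..%u,", 0, 50);
	if (ret < 0 || (unsigned int)ret >= size_bp)
		goto out_overflow;
	size_bp -= ret;
	bp += ret;

out_overflow:
	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0';	/* remove last ',' */
	else
		buf[0] = '\0';
}

int main(void)
{
	char buf[64];

	describe(buf, sizeof(buf));
	printf("%s\n", buf);	/* prints: convert=raid1,usage=0..50 */
	return 0;
}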
4029
+static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4030
+{
4031
+ u32 size_buf = 1024;
4032
+ char tmp_buf[192] = {'\0'};
4033
+ char *buf;
4034
+ char *bp;
4035
+ u32 size_bp = size_buf;
4036
+ int ret;
4037
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4038
+
4039
+ buf = kzalloc(size_buf, GFP_KERNEL);
4040
+ if (!buf)
4041
+ return;
4042
+
4043
+ bp = buf;
4044
+
4045
+#define CHECK_APPEND_1ARG(a, v1) \
4046
+ do { \
4047
+ ret = snprintf(bp, size_bp, (a), (v1)); \
4048
+ if (ret < 0 || ret >= size_bp) \
4049
+ goto out_overflow; \
4050
+ size_bp -= ret; \
4051
+ bp += ret; \
4052
+ } while (0)
4053
+
4054
+ if (bctl->flags & BTRFS_BALANCE_FORCE)
4055
+ CHECK_APPEND_1ARG("%s", "-f ");
4056
+
4057
+ if (bctl->flags & BTRFS_BALANCE_DATA) {
4058
+ describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4059
+ CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4060
+ }
4061
+
4062
+ if (bctl->flags & BTRFS_BALANCE_METADATA) {
4063
+ describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4064
+ CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4065
+ }
4066
+
4067
+ if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4068
+ describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4069
+ CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4070
+ }
4071
+
4072
+#undef CHECK_APPEND_1ARG
4073
+
4074
+out_overflow:
4075
+
4076
+ if (size_bp < size_buf)
4077
+ buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4078
+ btrfs_info(fs_info, "balance: %s %s",
4079
+ (bctl->flags & BTRFS_BALANCE_RESUME) ?
4080
+ "resume" : "start", buf);
4081
+
4082
+ kfree(buf);
37334083 }
37344084
37354085 /*
....@@ -3745,11 +4095,12 @@
37454095 int ret;
37464096 u64 num_devices;
37474097 unsigned seq;
3748
- bool reducing_integrity;
4098
+ bool reducing_redundancy;
4099
+ int i;
37494100
37504101 if (btrfs_fs_closing(fs_info) ||
37514102 atomic_read(&fs_info->balance_pause_req) ||
3752
- atomic_read(&fs_info->balance_cancel_req)) {
4103
+ btrfs_should_cancel_balance(fs_info)) {
37534104 ret = -EINVAL;
37544105 goto out;
37554106 }
....@@ -3774,54 +4125,39 @@
37744125 }
37754126 }
37764127
3777
- num_devices = fs_info->fs_devices->num_devices;
3778
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
3779
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3780
- BUG_ON(num_devices < 1);
3781
- num_devices--;
3782
- }
3783
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
3784
- allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
3785
- if (num_devices > 1)
3786
- allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3787
- if (num_devices > 2)
3788
- allowed |= BTRFS_BLOCK_GROUP_RAID5;
3789
- if (num_devices > 3)
3790
- allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3791
- BTRFS_BLOCK_GROUP_RAID6);
3792
- if (validate_convert_profile(&bctl->data, allowed)) {
3793
- int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
4128
+ /*
4129
+ * rw_devices will not change at the moment, device add/delete/replace
4130
+ * are exclusive
4131
+ */
4132
+ num_devices = fs_info->fs_devices->rw_devices;
37944133
3795
- btrfs_err(fs_info,
3796
- "balance: invalid convert data profile %s",
3797
- get_raid_name(index));
3798
- ret = -EINVAL;
3799
- goto out;
3800
- }
3801
- if (validate_convert_profile(&bctl->meta, allowed)) {
3802
- int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
4134
+ /*
4135
+ * SINGLE profile on-disk has no profile bit, but in-memory we have a
4136
+ * special bit for it, to make it easier to distinguish. Thus we need
4137
+ * to set it manually, or balance would refuse the profile.
4138
+ */
4139
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4140
+ for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4141
+ if (num_devices >= btrfs_raid_array[i].devs_min)
4142
+ allowed |= btrfs_raid_array[i].bg_flag;
38034143
3804
- btrfs_err(fs_info,
3805
- "balance: invalid convert metadata profile %s",
3806
- get_raid_name(index));
3807
- ret = -EINVAL;
3808
- goto out;
3809
- }
3810
- if (validate_convert_profile(&bctl->sys, allowed)) {
3811
- int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
3812
-
3813
- btrfs_err(fs_info,
3814
- "balance: invalid convert system profile %s",
3815
- get_raid_name(index));
4144
+ if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4145
+ !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4146
+ !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
38164147 ret = -EINVAL;
38174148 goto out;
38184149 }
38194150
3820
- /* allow to reduce meta or sys integrity only if force set */
3821
- allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3822
- BTRFS_BLOCK_GROUP_RAID10 |
3823
- BTRFS_BLOCK_GROUP_RAID5 |
3824
- BTRFS_BLOCK_GROUP_RAID6;
4151
+ /*
4152
+ * Allow to reduce metadata or system integrity only if force set for
4153
+ * profiles with redundancy (copies, parity)
4154
+ */
4155
+ allowed = 0;
4156
+ for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4157
+ if (btrfs_raid_array[i].ncopies >= 2 ||
4158
+ btrfs_raid_array[i].tolerated_failures >= 1)
4159
+ allowed |= btrfs_raid_array[i].bg_flag;
4160
+ }
38254161 do {
38264162 seq = read_seqbegin(&fs_info->profiles_lock);
38274163
....@@ -3831,9 +4167,9 @@
38314167 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
38324168 (fs_info->avail_metadata_alloc_bits & allowed) &&
38334169 !(bctl->meta.target & allowed)))
3834
- reducing_integrity = true;
4170
+ reducing_redundancy = true;
38354171 else
3836
- reducing_integrity = false;
4172
+ reducing_redundancy = false;
38374173
38384174 /* if we're not converting, the target field is uninitialized */
38394175 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
....@@ -3842,13 +4178,13 @@
38424178 bctl->data.target : fs_info->avail_data_alloc_bits;
38434179 } while (read_seqretry(&fs_info->profiles_lock, seq));
38444180
3845
- if (reducing_integrity) {
4181
+ if (reducing_redundancy) {
38464182 if (bctl->flags & BTRFS_BALANCE_FORCE) {
38474183 btrfs_info(fs_info,
3848
- "balance: force reducing metadata integrity");
4184
+ "balance: force reducing metadata redundancy");
38494185 } else {
38504186 btrfs_err(fs_info,
3851
- "balance: reduces metadata integrity, use --force if you want this");
4187
+ "balance: reduces metadata redundancy, use --force if you want this");
38524188 ret = -EINVAL;
38534189 goto out;
38544190 }
....@@ -3856,12 +4192,18 @@
38564192
38574193 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
38584194 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
3859
- int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
3860
- int data_index = btrfs_bg_flags_to_raid_index(data_target);
3861
-
38624195 btrfs_warn(fs_info,
38634196 "balance: metadata profile %s has lower redundancy than data profile %s",
3864
- get_raid_name(meta_index), get_raid_name(data_index));
4197
+ btrfs_bg_type_to_raid_name(meta_target),
4198
+ btrfs_bg_type_to_raid_name(data_target));
4199
+ }
4200
+
4201
+ if (fs_info->send_in_progress) {
4202
+ btrfs_warn_rl(fs_info,
4203
+"cannot run balance while send operations are in progress (%d in progress)",
4204
+ fs_info->send_in_progress);
4205
+ ret = -EAGAIN;
4206
+ goto out;
38654207 }
38664208
38674209 ret = insert_balance_item(fs_info, bctl);
....@@ -3883,11 +4225,34 @@
38834225
38844226 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
38854227 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4228
+ describe_balance_start_or_resume(fs_info);
38864229 mutex_unlock(&fs_info->balance_mutex);
38874230
38884231 ret = __btrfs_balance(fs_info);
38894232
38904233 mutex_lock(&fs_info->balance_mutex);
4234
+ if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4235
+ btrfs_info(fs_info, "balance: paused");
4236
+ /*
4237
+ * Balance can be canceled by:
4238
+ *
4239
+ * - Regular cancel request
4240
+ * Then ret == -ECANCELED and balance_cancel_req > 0
4241
+ *
4242
+ * - Fatal signal to "btrfs" process
4243
+	 *   Either the signal is caught by wait_reserve_ticket() and callers
4244
+	 *   get -EINTR, or it is caught by btrfs_should_cancel_balance() and
4245
+	 *   they get -ECANCELED.
4246
+ * Either way, in this case balance_cancel_req = 0, and
4247
+ * ret == -EINTR or ret == -ECANCELED.
4248
+ *
4249
+ * So here we only check the return value to catch canceled balance.
4250
+ */
4251
+ else if (ret == -ECANCELED || ret == -EINTR)
4252
+ btrfs_info(fs_info, "balance: canceled");
4253
+ else
4254
+ btrfs_info(fs_info, "balance: ended with status: %d", ret);
4255
+
38914256 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
38924257
38934258 if (bargs) {
....@@ -3898,7 +4263,7 @@
38984263 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
38994264 balance_need_close(fs_info)) {
39004265 reset_balance_state(fs_info);
3901
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4266
+ btrfs_exclop_finish(fs_info);
39024267 }
39034268
39044269 wake_up(&fs_info->balance_wait_q);
....@@ -3909,7 +4274,7 @@
39094274 reset_balance_state(fs_info);
39104275 else
39114276 kfree(bctl);
3912
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4277
+ btrfs_exclop_finish(fs_info);
39134278
39144279 return ret;
39154280 }
....@@ -3919,12 +4284,12 @@
39194284 struct btrfs_fs_info *fs_info = data;
39204285 int ret = 0;
39214286
4287
+ sb_start_write(fs_info->sb);
39224288 mutex_lock(&fs_info->balance_mutex);
3923
- if (fs_info->balance_ctl) {
3924
- btrfs_info(fs_info, "balance: resuming");
4289
+ if (fs_info->balance_ctl)
39254290 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
3926
- }
39274291 mutex_unlock(&fs_info->balance_mutex);
4292
+ sb_end_write(fs_info->sb);
39284293
39294294 return ret;
39304295 }
....@@ -4013,7 +4378,7 @@
40134378 * is in a paused state and must have fs_info::balance_ctl properly
40144379 * set up.
40154380 */
4016
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
4381
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
40174382 btrfs_warn(fs_info,
40184383 "balance: cannot set exclusive op status, resume manually");
40194384
....@@ -4097,19 +4462,18 @@
40974462
40984463 if (fs_info->balance_ctl) {
40994464 reset_balance_state(fs_info);
4100
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4465
+ btrfs_exclop_finish(fs_info);
41014466 btrfs_info(fs_info, "balance: canceled");
41024467 }
41034468 }
41044469
4105
- BUG_ON(fs_info->balance_ctl ||
4106
- test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4470
+ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
41074471 atomic_dec(&fs_info->balance_cancel_req);
41084472 mutex_unlock(&fs_info->balance_mutex);
41094473 return 0;
41104474 }
41114475
4112
-static int btrfs_uuid_scan_kthread(void *data)
4476
+int btrfs_uuid_scan_kthread(void *data)
41134477 {
41144478 struct btrfs_fs_info *fs_info = data;
41154479 struct btrfs_root *root = fs_info->tree_root;
....@@ -4121,6 +4485,7 @@
41214485 struct btrfs_root_item root_item;
41224486 u32 item_size;
41234487 struct btrfs_trans_handle *trans = NULL;
4488
+ bool closing = false;
41244489
41254490 path = btrfs_alloc_path();
41264491 if (!path) {
....@@ -4133,6 +4498,10 @@
41334498 key.offset = 0;
41344499
41354500 while (1) {
4501
+ if (btrfs_fs_closing(fs_info)) {
4502
+ closing = true;
4503
+ break;
4504
+ }
41364505 ret = btrfs_search_forward(root, &key, path,
41374506 BTRFS_OLDEST_GENERATION);
41384507 if (ret) {
....@@ -4233,74 +4602,10 @@
42334602 btrfs_end_transaction(trans);
42344603 if (ret)
42354604 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4236
- else
4605
+ else if (!closing)
42374606 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
42384607 up(&fs_info->uuid_tree_rescan_sem);
42394608 return 0;
4240
-}
4241
-
4242
-/*
4243
- * Callback for btrfs_uuid_tree_iterate().
4244
- * returns:
4245
- * 0 check succeeded, the entry is not outdated.
4246
- * < 0 if an error occurred.
4247
- * > 0 if the check failed, which means the caller shall remove the entry.
4248
- */
4249
-static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
4250
- u8 *uuid, u8 type, u64 subid)
4251
-{
4252
- struct btrfs_key key;
4253
- int ret = 0;
4254
- struct btrfs_root *subvol_root;
4255
-
4256
- if (type != BTRFS_UUID_KEY_SUBVOL &&
4257
- type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
4258
- goto out;
4259
-
4260
- key.objectid = subid;
4261
- key.type = BTRFS_ROOT_ITEM_KEY;
4262
- key.offset = (u64)-1;
4263
- subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
4264
- if (IS_ERR(subvol_root)) {
4265
- ret = PTR_ERR(subvol_root);
4266
- if (ret == -ENOENT)
4267
- ret = 1;
4268
- goto out;
4269
- }
4270
-
4271
- switch (type) {
4272
- case BTRFS_UUID_KEY_SUBVOL:
4273
- if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
4274
- ret = 1;
4275
- break;
4276
- case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
4277
- if (memcmp(uuid, subvol_root->root_item.received_uuid,
4278
- BTRFS_UUID_SIZE))
4279
- ret = 1;
4280
- break;
4281
- }
4282
-
4283
-out:
4284
- return ret;
4285
-}
4286
-
4287
-static int btrfs_uuid_rescan_kthread(void *data)
4288
-{
4289
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
4290
- int ret;
4291
-
4292
- /*
4293
- * 1st step is to iterate through the existing UUID tree and
4294
- * to delete all entries that contain outdated data.
4295
- * 2nd step is to add all missing entries to the UUID tree.
4296
- */
4297
- ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
4298
- if (ret < 0) {
4299
- btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
4300
- up(&fs_info->uuid_tree_rescan_sem);
4301
- return ret;
4302
- }
4303
- return btrfs_uuid_scan_kthread(data);
43044609 }
43054610
43064611 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
....@@ -4319,8 +4624,7 @@
43194624 if (IS_ERR(trans))
43204625 return PTR_ERR(trans);
43214626
4322
- uuid_root = btrfs_create_tree(trans, fs_info,
4323
- BTRFS_UUID_TREE_OBJECTID);
4627
+ uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
43244628 if (IS_ERR(uuid_root)) {
43254629 ret = PTR_ERR(uuid_root);
43264630 btrfs_abort_transaction(trans, ret);
....@@ -4346,22 +4650,6 @@
43464650 return 0;
43474651 }
43484652
4349
-int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
4350
-{
4351
- struct task_struct *task;
4352
-
4353
- down(&fs_info->uuid_tree_rescan_sem);
4354
- task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
4355
- if (IS_ERR(task)) {
4356
- /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4357
- btrfs_warn(fs_info, "failed to start uuid_rescan task");
4358
- up(&fs_info->uuid_tree_rescan_sem);
4359
- return PTR_ERR(task);
4360
- }
4361
-
4362
- return 0;
4363
-}
4364
-
43654653 /*
43664654 * shrinking a device means finding all of the device extents past
43674655 * the new size, and then following the back refs to the chunks.
....@@ -4380,15 +4668,16 @@
43804668 int slot;
43814669 int failed = 0;
43824670 bool retried = false;
4383
- bool checked_pending_chunks = false;
43844671 struct extent_buffer *l;
43854672 struct btrfs_key key;
43864673 struct btrfs_super_block *super_copy = fs_info->super_copy;
43874674 u64 old_total = btrfs_super_total_bytes(super_copy);
43884675 u64 old_size = btrfs_device_get_total_bytes(device);
43894676 u64 diff;
4677
+ u64 start;
43904678
43914679 new_size = round_down(new_size, fs_info->sectorsize);
4680
+ start = new_size;
43924681 diff = round_down(old_size - new_size, fs_info->sectorsize);
43934682
43944683 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
....@@ -4400,6 +4689,12 @@
44004689
44014690 path->reada = READA_BACK;
44024691
4692
+ trans = btrfs_start_transaction(root, 0);
4693
+ if (IS_ERR(trans)) {
4694
+ btrfs_free_path(path);
4695
+ return PTR_ERR(trans);
4696
+ }
4697
+
44034698 mutex_lock(&fs_info->chunk_mutex);
44044699
44054700 btrfs_device_set_total_bytes(device, new_size);
....@@ -4407,7 +4702,21 @@
44074702 device->fs_devices->total_rw_bytes -= diff;
44084703 atomic64_sub(diff, &fs_info->free_chunk_space);
44094704 }
4410
- mutex_unlock(&fs_info->chunk_mutex);
4705
+
4706
+ /*
4707
+ * Once the device's size has been set to the new size, ensure all
4708
+ * in-memory chunks are synced to disk so that the loop below sees them
4709
+ * and relocates them accordingly.
4710
+ */
4711
+ if (contains_pending_extent(device, &start, diff)) {
4712
+ mutex_unlock(&fs_info->chunk_mutex);
4713
+ ret = btrfs_commit_transaction(trans);
4714
+ if (ret)
4715
+ goto done;
4716
+ } else {
4717
+ mutex_unlock(&fs_info->chunk_mutex);
4718
+ btrfs_end_transaction(trans);
4719
+ }
44114720
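The branch above commits the running transaction only when contains_pending_extent() reports an allocation inside the range being trimmed off the device; otherwise ending the transaction is enough. As a rough user-space analogy of that overlap test (a plain array in place of the device's alloc_state extent-state tree, and without the start-cursor advance the real helper performs), the check amounts to:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct extent { uint64_t start; uint64_t len; };

/*
 * Return true if any allocated extent overlaps [start, start + len).
 * Hypothetical stand-in for contains_pending_extent(); the kernel walks
 * an extent-state tree rather than an array.
 */
static bool overlaps_pending(const struct extent *ext, size_t n,
			     uint64_t start, uint64_t len)
{
	for (size_t i = 0; i < n; i++) {
		if (ext[i].start < start + len &&
		    start < ext[i].start + ext[i].len)
			return true;
	}
	return false;
}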
44124721 again:
44134722 key.objectid = device->devid;
....@@ -4469,10 +4778,16 @@
44694778
44704779 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
44714780 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4472
- if (ret && ret != -ENOSPC)
4473
- goto done;
4474
- if (ret == -ENOSPC)
4781
+ if (ret == -ENOSPC) {
44754782 failed++;
4783
+ } else if (ret) {
4784
+ if (ret == -ETXTBSY) {
4785
+ btrfs_warn(fs_info,
4786
+ "could not shrink block group %llu due to active swapfile",
4787
+ chunk_offset);
4788
+ }
4789
+ goto done;
4790
+ }
44764791 } while (key.offset-- > 0);
44774792
44784793 if (failed && !retried) {
....@@ -4492,40 +4807,14 @@
44924807 }
44934808
44944809 mutex_lock(&fs_info->chunk_mutex);
4495
-
4496
- /*
4497
- * We checked in the above loop all device extents that were already in
4498
- * the device tree. However before we have updated the device's
4499
- * total_bytes to the new size, we might have had chunk allocations that
4500
- * have not complete yet (new block groups attached to transaction
4501
- * handles), and therefore their device extents were not yet in the
4502
- * device tree and we missed them in the loop above. So if we have any
4503
- * pending chunk using a device extent that overlaps the device range
4504
- * that we can not use anymore, commit the current transaction and
4505
- * repeat the search on the device tree - this way we guarantee we will
4506
- * not have chunks using device extents that end beyond 'new_size'.
4507
- */
4508
- if (!checked_pending_chunks) {
4509
- u64 start = new_size;
4510
- u64 len = old_size - new_size;
4511
-
4512
- if (contains_pending_extent(trans->transaction, device,
4513
- &start, len)) {
4514
- mutex_unlock(&fs_info->chunk_mutex);
4515
- checked_pending_chunks = true;
4516
- failed = 0;
4517
- retried = false;
4518
- ret = btrfs_commit_transaction(trans);
4519
- if (ret)
4520
- goto done;
4521
- goto again;
4522
- }
4523
- }
4810
+ /* Clear all state bits beyond the shrunk device size */
4811
+ clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4812
+ CHUNK_STATE_MASK);
45244813
45254814 btrfs_device_set_disk_total_bytes(device, new_size);
4526
- if (list_empty(&device->resized_list))
4527
- list_add_tail(&device->resized_list,
4528
- &fs_info->fs_devices->resized_devices);
4815
+ if (list_empty(&device->post_commit_list))
4816
+ list_add_tail(&device->post_commit_list,
4817
+ &trans->transaction->dev_update_list);
45294818
45304819 WARN_ON(diff > old_total);
45314820 btrfs_set_super_total_bytes(super_copy,
....@@ -4609,96 +4898,119 @@
46094898 btrfs_set_fs_incompat(info, RAID56);
46104899 }
46114900
4612
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4613
- u64 start, u64 type)
4901
+static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
46144902 {
4615
- struct btrfs_fs_info *info = trans->fs_info;
4616
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
4617
- struct btrfs_device *device;
4618
- struct map_lookup *map = NULL;
4619
- struct extent_map_tree *em_tree;
4620
- struct extent_map *em;
4621
- struct btrfs_device_info *devices_info = NULL;
4622
- u64 total_avail;
4623
- int num_stripes; /* total number of stripes to allocate */
4624
- int data_stripes; /* number of stripes that count for
4625
- block group size */
4626
- int sub_stripes; /* sub_stripes info for map */
4627
- int dev_stripes; /* stripes per dev */
4628
- int devs_max; /* max devs to use */
4629
- int devs_min; /* min devs needed */
4630
- int devs_increment; /* ndevs has to be a multiple of this */
4631
- int ncopies; /* how many copies to data has */
4632
- int ret;
4903
+ if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4904
+ return;
4905
+
4906
+ btrfs_set_fs_incompat(info, RAID1C34);
4907
+}
4908
+
4909
+/*
4910
+ * Structure used internally by btrfs_alloc_chunk() and its helpers.
4911
+ * Wraps the needed parameters.
4912
+ */
4913
+struct alloc_chunk_ctl {
4914
+ u64 start;
4915
+ u64 type;
4916
+ /* Total number of stripes to allocate */
4917
+ int num_stripes;
4918
+ /* sub_stripes info for map */
4919
+ int sub_stripes;
4920
+ /* Stripes per device */
4921
+ int dev_stripes;
4922
+ /* Maximum number of devices to use */
4923
+ int devs_max;
4924
+ /* Minimum number of devices to use */
4925
+ int devs_min;
4926
+ /* ndevs has to be a multiple of this */
4927
+ int devs_increment;
4928
+ /* Number of copies */
4929
+ int ncopies;
4930
+ /* Number of stripes worth of bytes to store parity information */
4931
+ int nparity;
46334932 u64 max_stripe_size;
46344933 u64 max_chunk_size;
4934
+ u64 dev_extent_min;
46354935 u64 stripe_size;
4636
- u64 num_bytes;
4936
+ u64 chunk_size;
46374937 int ndevs;
4638
- int i;
4639
- int j;
4640
- int index;
4938
+};
46414939
4642
- BUG_ON(!alloc_profile_is_valid(type, 0));
4643
-
4644
- if (list_empty(&fs_devices->alloc_list)) {
4645
- if (btrfs_test_opt(info, ENOSPC_DEBUG))
4646
- btrfs_debug(info, "%s: no writable device", __func__);
4647
- return -ENOSPC;
4648
- }
4649
-
4650
- index = btrfs_bg_flags_to_raid_index(type);
4651
-
4652
- sub_stripes = btrfs_raid_array[index].sub_stripes;
4653
- dev_stripes = btrfs_raid_array[index].dev_stripes;
4654
- devs_max = btrfs_raid_array[index].devs_max;
4655
- devs_min = btrfs_raid_array[index].devs_min;
4656
- devs_increment = btrfs_raid_array[index].devs_increment;
4657
- ncopies = btrfs_raid_array[index].ncopies;
4940
+static void init_alloc_chunk_ctl_policy_regular(
4941
+ struct btrfs_fs_devices *fs_devices,
4942
+ struct alloc_chunk_ctl *ctl)
4943
+{
4944
+ u64 type = ctl->type;
46584945
46594946 if (type & BTRFS_BLOCK_GROUP_DATA) {
4660
- max_stripe_size = SZ_1G;
4661
- max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4662
- if (!devs_max)
4663
- devs_max = BTRFS_MAX_DEVS(info);
4947
+ ctl->max_stripe_size = SZ_1G;
4948
+ ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
46644949 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4665
- /* for larger filesystems, use larger metadata chunks */
4950
+ /* For larger filesystems, use larger metadata chunks */
46664951 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4667
- max_stripe_size = SZ_1G;
4952
+ ctl->max_stripe_size = SZ_1G;
46684953 else
4669
- max_stripe_size = SZ_256M;
4670
- max_chunk_size = max_stripe_size;
4671
- if (!devs_max)
4672
- devs_max = BTRFS_MAX_DEVS(info);
4954
+ ctl->max_stripe_size = SZ_256M;
4955
+ ctl->max_chunk_size = ctl->max_stripe_size;
46734956 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4674
- max_stripe_size = SZ_32M;
4675
- max_chunk_size = 2 * max_stripe_size;
4676
- if (!devs_max)
4677
- devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4957
+ ctl->max_stripe_size = SZ_32M;
4958
+ ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4959
+ ctl->devs_max = min_t(int, ctl->devs_max,
4960
+ BTRFS_MAX_DEVS_SYS_CHUNK);
46784961 } else {
4679
- btrfs_err(info, "invalid chunk type 0x%llx requested",
4680
- type);
4681
- BUG_ON(1);
4962
+ BUG();
46824963 }
46834964
4684
- /* we don't want a chunk larger than 10% of writeable space */
4685
- max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4686
- max_chunk_size);
4965
+ /* We don't want a chunk larger than 10% of writable space */
4966
+ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4967
+ ctl->max_chunk_size);
4968
+ ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
4969
+}
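The 10% cap above works because div_factor(num, 1) evaluates to num / 10: the helper multiplies by the factor and divides by ten. A stand-alone sketch of the capping arithmetic, taking the 10 GiB value of BTRFS_MAX_DATA_CHUNK_SIZE as given:

#include <stdint.h>
#include <stdio.h>

/* Mirrors div_factor(): num * factor / 10 */
static uint64_t div_factor(uint64_t num, int factor)
{
	if (factor == 10)
		return num;
	return num * factor / 10;
}

int main(void)
{
	uint64_t total_rw_bytes = 4ULL << 40;	/* 4 TiB of writable space */
	uint64_t max_chunk_size = 10ULL << 30;	/* BTRFS_MAX_DATA_CHUNK_SIZE */
	uint64_t cap = div_factor(total_rw_bytes, 1);	/* ~409.6 GiB */

	if (cap < max_chunk_size)
		max_chunk_size = cap;
	/* Here the profile ceiling wins: 10 GiB */
	printf("max chunk: %llu bytes\n", (unsigned long long)max_chunk_size);
	return 0;
}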
46874970
4688
- devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
4689
- GFP_NOFS);
4690
- if (!devices_info)
4691
- return -ENOMEM;
4971
+static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
4972
+ struct alloc_chunk_ctl *ctl)
4973
+{
4974
+ int index = btrfs_bg_flags_to_raid_index(ctl->type);
4975
+
4976
+ ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
4977
+ ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
4978
+ ctl->devs_max = btrfs_raid_array[index].devs_max;
4979
+ if (!ctl->devs_max)
4980
+ ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
4981
+ ctl->devs_min = btrfs_raid_array[index].devs_min;
4982
+ ctl->devs_increment = btrfs_raid_array[index].devs_increment;
4983
+ ctl->ncopies = btrfs_raid_array[index].ncopies;
4984
+ ctl->nparity = btrfs_raid_array[index].nparity;
4985
+ ctl->ndevs = 0;
4986
+
4987
+ switch (fs_devices->chunk_alloc_policy) {
4988
+ case BTRFS_CHUNK_ALLOC_REGULAR:
4989
+ init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
4990
+ break;
4991
+ default:
4992
+ BUG();
4993
+ }
4994
+}
4995
+
4996
+static int gather_device_info(struct btrfs_fs_devices *fs_devices,
4997
+ struct alloc_chunk_ctl *ctl,
4998
+ struct btrfs_device_info *devices_info)
4999
+{
5000
+ struct btrfs_fs_info *info = fs_devices->fs_info;
5001
+ struct btrfs_device *device;
5002
+ u64 total_avail;
5003
+ u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5004
+ int ret;
5005
+ int ndevs = 0;
5006
+ u64 max_avail;
5007
+ u64 dev_offset;
46925008
46935009 /*
46945010 * in the first pass through the devices list, we gather information
46955011 * about the available holes on each device.
46965012 */
4697
- ndevs = 0;
46985013 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4699
- u64 max_avail;
4700
- u64 dev_offset;
4701
-
47025014 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
47035015 WARN(1, KERN_ERR
47045016 "BTRFS: read-only device in alloc_list\n");
....@@ -4716,24 +5028,23 @@
47165028 total_avail = 0;
47175029
47185030 /* If there is not enough space on this device, skip it. */
4719
- if (total_avail == 0)
5031
+ if (total_avail < ctl->dev_extent_min)
47205032 continue;
47215033
4722
- ret = find_free_dev_extent(trans, device,
4723
- max_stripe_size * dev_stripes,
4724
- &dev_offset, &max_avail);
5034
+ ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5035
+ &max_avail);
47255036 if (ret && ret != -ENOSPC)
4726
- goto error;
5037
+ return ret;
47275038
47285039 if (ret == 0)
4729
- max_avail = max_stripe_size * dev_stripes;
5040
+ max_avail = dev_extent_want;
47305041
4731
- if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
5042
+ if (max_avail < ctl->dev_extent_min) {
47325043 if (btrfs_test_opt(info, ENOSPC_DEBUG))
47335044 btrfs_debug(info,
4734
- "%s: devid %llu has no free space, have=%llu want=%u",
5045
+ "%s: devid %llu has no free space, have=%llu want=%llu",
47355046 __func__, device->devid, max_avail,
4736
- BTRFS_STRIPE_LEN * dev_stripes);
5047
+ ctl->dev_extent_min);
47375048 continue;
47385049 }
47395050
....@@ -4748,6 +5059,7 @@
47485059 devices_info[ndevs].dev = device;
47495060 ++ndevs;
47505061 }
5062
+ ctl->ndevs = ndevs;
47515063
47525064 /*
47535065 * now sort the devices by hole size / available space
....@@ -4755,20 +5067,14 @@
47555067 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
47565068 btrfs_cmp_device_info, NULL);
47575069
4758
- /* round down to number of usable stripes */
4759
- ndevs = round_down(ndevs, devs_increment);
5070
+ return 0;
5071
+}
47605072
4761
- if (ndevs < devs_min) {
4762
- ret = -ENOSPC;
4763
- if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
4764
- btrfs_debug(info,
4765
- "%s: not enough devices with free space: have=%d minimum required=%d",
4766
- __func__, ndevs, devs_min);
4767
- }
4768
- goto error;
4769
- }
4770
-
4771
- ndevs = min(ndevs, devs_max);
5073
+static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5074
+ struct btrfs_device_info *devices_info)
5075
+{
5076
+ /* Number of stripes that count for block group size */
5077
+ int data_stripes;
47725078
47735079 /*
47745080 * The primary goal is to maximize the number of stripes, so use as
....@@ -4777,109 +5083,148 @@
47775083 * The DUP profile stores more than one stripe per device, the
47785084 * max_avail is the total size so we have to adjust.
47795085 */
4780
- stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
4781
- num_stripes = ndevs * dev_stripes;
5086
+ ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5087
+ ctl->dev_stripes);
5088
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5089
+
5090
+ /* This will have to be fixed for RAID1 and RAID10 over more drives */
5091
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
47825092
47835093 /*
4784
- * this will have to be fixed for RAID1 and RAID10 over
4785
- * more drives
5094
+ * Use the number of data stripes to figure out how big this chunk is
5095
+ * really going to be in terms of logical address space, and compare
5096
+ * that answer with the max chunk size. If it's higher, we try to
5097
+ * reduce stripe_size.
47865098 */
4787
- data_stripes = num_stripes / ncopies;
4788
-
4789
- if (type & BTRFS_BLOCK_GROUP_RAID5)
4790
- data_stripes = num_stripes - 1;
4791
-
4792
- if (type & BTRFS_BLOCK_GROUP_RAID6)
4793
- data_stripes = num_stripes - 2;
4794
-
4795
- /*
4796
- * Use the number of data stripes to figure out how big this chunk
4797
- * is really going to be in terms of logical address space,
4798
- * and compare that answer with the max chunk size. If it's higher,
4799
- * we try to reduce stripe_size.
4800
- */
4801
- if (stripe_size * data_stripes > max_chunk_size) {
5099
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
48025100 /*
48035101 * Reduce stripe_size, round it up to a 16MB boundary again and
48045102 * then use it, unless it ends up being even bigger than the
48055103 * previous value we had already.
48065104 */
4807
- stripe_size = min(round_up(div_u64(max_chunk_size,
4808
- data_stripes), SZ_16M),
4809
- stripe_size);
5105
+ ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5106
+ data_stripes), SZ_16M),
5107
+ ctl->stripe_size);
48105108 }
48115109
4812
- /* align to BTRFS_STRIPE_LEN */
4813
- stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
5110
+ /* Align to BTRFS_STRIPE_LEN */
5111
+ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5112
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
48145113
4815
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4816
- if (!map) {
4817
- ret = -ENOMEM;
4818
- goto error;
5114
+ return 0;
5115
+}
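Worked numbers make the sizing easier to follow. A user-space sketch of the same arithmetic, assuming a hypothetical RAID6 data chunk over six devices whose smallest hole is 300 GiB (dev_stripes = 1, nparity = 2, ncopies = 1):

#include <stdint.h>
#include <stdio.h>

#define BTRFS_STRIPE_LEN	(64ULL * 1024)
#define SZ_16M			(16ULL * 1024 * 1024)

static uint64_t round_up_u64(uint64_t v, uint64_t a)
{
	return (v + a - 1) / a * a;
}

int main(void)
{
	int dev_stripes = 1, nparity = 2, ncopies = 1;
	int num_stripes = 6 * dev_stripes;			/* 6 */
	int data_stripes = (num_stripes - nparity) / ncopies;	/* 4 */
	uint64_t stripe_size = (300ULL << 30) / dev_stripes;	/* smallest hole */
	uint64_t max_chunk_size = 10ULL << 30;			/* data cap */

	if (stripe_size * data_stripes > max_chunk_size) {
		uint64_t reduced = round_up_u64(max_chunk_size / data_stripes,
						SZ_16M);	/* 2.5 GiB */
		if (reduced < stripe_size)
			stripe_size = reduced;
	}
	stripe_size = stripe_size / BTRFS_STRIPE_LEN * BTRFS_STRIPE_LEN;

	printf("stripe_size=%llu chunk_size=%llu\n",
	       (unsigned long long)stripe_size,
	       (unsigned long long)(stripe_size * data_stripes));
	return 0;
}

The printed chunk_size comes out to 10 GiB: four data stripes of 2.5 GiB each, with the two parity stripes consuming extra raw space on top.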
5116
+
5117
+static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5118
+ struct alloc_chunk_ctl *ctl,
5119
+ struct btrfs_device_info *devices_info)
5120
+{
5121
+ struct btrfs_fs_info *info = fs_devices->fs_info;
5122
+
5123
+ /*
5124
+	 * Round down to the number of usable stripes. devs_increment can be
5125
+	 * any number, so we can't use round_down(), which requires a power of
5126
+	 * 2; rounddown() is safe for any value.
5127
+ */
5128
+ ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5129
+
5130
+ if (ctl->ndevs < ctl->devs_min) {
5131
+ if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5132
+ btrfs_debug(info,
5133
+ "%s: not enough devices with free space: have=%d minimum required=%d",
5134
+ __func__, ctl->ndevs, ctl->devs_min);
5135
+ }
5136
+ return -ENOSPC;
48195137 }
4820
- map->num_stripes = num_stripes;
48215138
4822
- for (i = 0; i < ndevs; ++i) {
4823
- for (j = 0; j < dev_stripes; ++j) {
4824
- int s = i * dev_stripes + j;
5139
+ ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5140
+
5141
+ switch (fs_devices->chunk_alloc_policy) {
5142
+ case BTRFS_CHUNK_ALLOC_REGULAR:
5143
+ return decide_stripe_size_regular(ctl, devices_info);
5144
+ default:
5145
+ BUG();
5146
+ }
5147
+}
5148
+
5149
+static int create_chunk(struct btrfs_trans_handle *trans,
5150
+ struct alloc_chunk_ctl *ctl,
5151
+ struct btrfs_device_info *devices_info)
5152
+{
5153
+ struct btrfs_fs_info *info = trans->fs_info;
5154
+ struct map_lookup *map = NULL;
5155
+ struct extent_map_tree *em_tree;
5156
+ struct extent_map *em;
5157
+ u64 start = ctl->start;
5158
+ u64 type = ctl->type;
5159
+ int ret;
5160
+ int i;
5161
+ int j;
5162
+
5163
+ map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5164
+ if (!map)
5165
+ return -ENOMEM;
5166
+ map->num_stripes = ctl->num_stripes;
5167
+
5168
+ for (i = 0; i < ctl->ndevs; ++i) {
5169
+ for (j = 0; j < ctl->dev_stripes; ++j) {
5170
+ int s = i * ctl->dev_stripes + j;
48255171 map->stripes[s].dev = devices_info[i].dev;
48265172 map->stripes[s].physical = devices_info[i].dev_offset +
4827
- j * stripe_size;
5173
+ j * ctl->stripe_size;
48285174 }
48295175 }
48305176 map->stripe_len = BTRFS_STRIPE_LEN;
48315177 map->io_align = BTRFS_STRIPE_LEN;
48325178 map->io_width = BTRFS_STRIPE_LEN;
48335179 map->type = type;
4834
- map->sub_stripes = sub_stripes;
5180
+ map->sub_stripes = ctl->sub_stripes;
48355181
4836
- num_bytes = stripe_size * data_stripes;
4837
-
4838
- trace_btrfs_chunk_alloc(info, map, start, num_bytes);
5182
+ trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
48395183
48405184 em = alloc_extent_map();
48415185 if (!em) {
48425186 kfree(map);
4843
- ret = -ENOMEM;
4844
- goto error;
5187
+ return -ENOMEM;
48455188 }
48465189 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
48475190 em->map_lookup = map;
48485191 em->start = start;
4849
- em->len = num_bytes;
5192
+ em->len = ctl->chunk_size;
48505193 em->block_start = 0;
48515194 em->block_len = em->len;
4852
- em->orig_block_len = stripe_size;
5195
+ em->orig_block_len = ctl->stripe_size;
48535196
4854
- em_tree = &info->mapping_tree.map_tree;
5197
+ em_tree = &info->mapping_tree;
48555198 write_lock(&em_tree->lock);
48565199 ret = add_extent_mapping(em_tree, em, 0);
48575200 if (ret) {
48585201 write_unlock(&em_tree->lock);
48595202 free_extent_map(em);
4860
- goto error;
5203
+ return ret;
48615204 }
4862
-
4863
- list_add_tail(&em->list, &trans->transaction->pending_chunks);
4864
- refcount_inc(&em->refs);
48655205 write_unlock(&em_tree->lock);
48665206
4867
- ret = btrfs_make_block_group(trans, 0, type, start, num_bytes);
5207
+ ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
48685208 if (ret)
48695209 goto error_del_extent;
48705210
48715211 for (i = 0; i < map->num_stripes; i++) {
4872
- num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4873
- btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4874
- map->stripes[i].dev->has_pending_chunks = true;
5212
+ struct btrfs_device *dev = map->stripes[i].dev;
5213
+
5214
+ btrfs_device_set_bytes_used(dev,
5215
+ dev->bytes_used + ctl->stripe_size);
5216
+ if (list_empty(&dev->post_commit_list))
5217
+ list_add_tail(&dev->post_commit_list,
5218
+ &trans->transaction->dev_update_list);
48755219 }
48765220
4877
- atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
5221
+ atomic64_sub(ctl->stripe_size * map->num_stripes,
5222
+ &info->free_chunk_space);
48785223
48795224 free_extent_map(em);
48805225 check_raid56_incompat_flag(info, type);
5226
+ check_raid1c34_incompat_flag(info, type);
48815227
4882
- kfree(devices_info);
48835228 return 0;
48845229
48855230 error_del_extent:
....@@ -4891,13 +5236,68 @@
48915236 free_extent_map(em);
48925237 /* One for the tree reference */
48935238 free_extent_map(em);
4894
- /* One for the pending_chunks list reference */
4895
- free_extent_map(em);
4896
-error:
5239
+
5240
+ return ret;
5241
+}
5242
+
5243
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5244
+{
5245
+ struct btrfs_fs_info *info = trans->fs_info;
5246
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
5247
+ struct btrfs_device_info *devices_info = NULL;
5248
+ struct alloc_chunk_ctl ctl;
5249
+ int ret;
5250
+
5251
+ lockdep_assert_held(&info->chunk_mutex);
5252
+
5253
+ if (!alloc_profile_is_valid(type, 0)) {
5254
+ ASSERT(0);
5255
+ return -EINVAL;
5256
+ }
5257
+
5258
+ if (list_empty(&fs_devices->alloc_list)) {
5259
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
5260
+ btrfs_debug(info, "%s: no writable device", __func__);
5261
+ return -ENOSPC;
5262
+ }
5263
+
5264
+ if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5265
+ btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5266
+ ASSERT(0);
5267
+ return -EINVAL;
5268
+ }
5269
+
5270
+ ctl.start = find_next_chunk(info);
5271
+ ctl.type = type;
5272
+ init_alloc_chunk_ctl(fs_devices, &ctl);
5273
+
5274
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5275
+ GFP_NOFS);
5276
+ if (!devices_info)
5277
+ return -ENOMEM;
5278
+
5279
+ ret = gather_device_info(fs_devices, &ctl, devices_info);
5280
+ if (ret < 0)
5281
+ goto out;
5282
+
5283
+ ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5284
+ if (ret < 0)
5285
+ goto out;
5286
+
5287
+ ret = create_chunk(trans, &ctl, devices_info);
5288
+
5289
+out:
48975290 kfree(devices_info);
48985291 return ret;
48995292 }
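The two ASSERT gates in btrfs_alloc_chunk() only confirm that the requested flags form a plausible chunk type. A minimal sketch of the type-mask half of that validation, using the standard block group type bits and omitting the profile bits that alloc_profile_is_valid() additionally inspects:

#include <stdbool.h>
#include <stdint.h>

#define BTRFS_BLOCK_GROUP_DATA     (1ULL << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM   (1ULL << 1)
#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
#define BTRFS_BLOCK_GROUP_TYPE_MASK \
	(BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | \
	 BTRFS_BLOCK_GROUP_METADATA)

/* A chunk type must carry at least one of the three type bits */
static bool chunk_type_is_plausible(uint64_t type)
{
	return (type & BTRFS_BLOCK_GROUP_TYPE_MASK) != 0;
}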
49005293
5294
+/*
5295
+ * Chunk allocation falls into two parts. The first part does work
5296
+ * that makes the newly allocated chunk usable, but does not do any operation
5297
+ * that modifies the chunk tree. The second part does the work that
5298
+ * requires modifying the chunk tree. This division is important for the
5299
+ * bootstrap process of adding storage to a seed btrfs.
5300
+ */
49015301 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
49025302 u64 chunk_offset, u64 chunk_size)
49035303 {
....@@ -4916,7 +5316,7 @@
49165316 int i = 0;
49175317 int ret = 0;
49185318
4919
- em = get_chunk_map(fs_info, chunk_offset, chunk_size);
5319
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
49205320 if (IS_ERR(em))
49215321 return PTR_ERR(em);
49225322
....@@ -4996,57 +5396,27 @@
49965396 return ret;
49975397 }
49985398
4999
-/*
5000
- * Chunk allocation falls into two parts. The first part does works
5001
- * that make the new allocated chunk useable, but not do any operation
5002
- * that modifies the chunk tree. The second part does the works that
5003
- * require modifying the chunk tree. This division is important for the
5004
- * bootstrap process of adding storage to a seed btrfs.
5005
- */
5006
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5399
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
50075400 {
5008
- u64 chunk_offset;
5009
-
5010
- lockdep_assert_held(&trans->fs_info->chunk_mutex);
5011
- chunk_offset = find_next_chunk(trans->fs_info);
5012
- return __btrfs_alloc_chunk(trans, chunk_offset, type);
5013
-}
5014
-
5015
-static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
5016
- struct btrfs_fs_info *fs_info)
5017
-{
5018
- u64 chunk_offset;
5019
- u64 sys_chunk_offset;
5401
+ struct btrfs_fs_info *fs_info = trans->fs_info;
50205402 u64 alloc_profile;
50215403 int ret;
50225404
5023
- chunk_offset = find_next_chunk(fs_info);
50245405 alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5025
- ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
5406
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
50265407 if (ret)
50275408 return ret;
50285409
5029
- sys_chunk_offset = find_next_chunk(fs_info);
50305410 alloc_profile = btrfs_system_alloc_profile(fs_info);
5031
- ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
5411
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
50325412 return ret;
50335413 }
50345414
50355415 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
50365416 {
5037
- int max_errors;
5417
+ const int index = btrfs_bg_flags_to_raid_index(map->type);
50385418
5039
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5040
- BTRFS_BLOCK_GROUP_RAID10 |
5041
- BTRFS_BLOCK_GROUP_RAID5)) {
5042
- max_errors = 1;
5043
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5044
- max_errors = 2;
5045
- } else {
5046
- max_errors = 0;
5047
- }
5048
-
5049
- return max_errors;
5419
+ return btrfs_raid_array[index].tolerated_failures;
50505420 }
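The per-profile table replaces the old if/else ladder outright. A trimmed stand-in shows the shape of the lookup; the values follow the usual RAID failure tolerances, and the real btrfs_raid_array carries many more fields per entry:

#include <stdint.h>

enum raid_index { IDX_RAID10, IDX_RAID1, IDX_RAID5, IDX_RAID6, IDX_SINGLE,
		  NR_IDX };

/* Hypothetical cut-down table; only the tolerated-failures column */
static const int tolerated_failures[NR_IDX] = {
	[IDX_RAID10] = 1,
	[IDX_RAID1]  = 1,
	[IDX_RAID5]  = 1,	/* single parity */
	[IDX_RAID6]  = 2,	/* double parity */
	[IDX_SINGLE] = 0,
};

static int chunk_max_errors(enum raid_index index)
{
	return tolerated_failures[index];
}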
50515421
50525422 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
....@@ -5057,7 +5427,7 @@
50575427 int miss_ndevs = 0;
50585428 int i;
50595429
5060
- em = get_chunk_map(fs_info, chunk_offset, 1);
5430
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
50615431 if (IS_ERR(em))
50625432 return 1;
50635433
....@@ -5087,21 +5457,16 @@
50875457 return readonly;
50885458 }
50895459
5090
-void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
5091
-{
5092
- extent_map_tree_init(&tree->map_tree);
5093
-}
5094
-
5095
-void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
5460
+void btrfs_mapping_tree_free(struct extent_map_tree *tree)
50965461 {
50975462 struct extent_map *em;
50985463
50995464 while (1) {
5100
- write_lock(&tree->map_tree.lock);
5101
- em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
5465
+ write_lock(&tree->lock);
5466
+ em = lookup_extent_mapping(tree, 0, (u64)-1);
51025467 if (em)
5103
- remove_extent_mapping(&tree->map_tree, em);
5104
- write_unlock(&tree->map_tree.lock);
5468
+ remove_extent_mapping(tree, em);
5469
+ write_unlock(&tree->lock);
51055470 if (!em)
51065471 break;
51075472 /* once for us */
....@@ -5117,7 +5482,7 @@
51175482 struct map_lookup *map;
51185483 int ret;
51195484
5120
- em = get_chunk_map(fs_info, logical, len);
5485
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51215486 if (IS_ERR(em))
51225487 /*
51235488 * We could return errors for these cases, but that could get
....@@ -5128,7 +5493,7 @@
51285493 return 1;
51295494
51305495 map = em->map_lookup;
5131
- if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
5496
+ if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
51325497 ret = map->num_stripes;
51335498 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
51345499 ret = map->sub_stripes;
....@@ -5147,11 +5512,11 @@
51475512 ret = 1;
51485513 free_extent_map(em);
51495514
5150
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
5515
+ down_read(&fs_info->dev_replace.rwsem);
51515516 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
51525517 fs_info->dev_replace.tgtdev)
51535518 ret++;
5154
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
5519
+ up_read(&fs_info->dev_replace.rwsem);
51555520
51565521 return ret;
51575522 }
....@@ -5163,7 +5528,7 @@
51635528 struct map_lookup *map;
51645529 unsigned long len = fs_info->sectorsize;
51655530
5166
- em = get_chunk_map(fs_info, logical, len);
5531
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51675532
51685533 if (!WARN_ON(IS_ERR(em))) {
51695534 map = em->map_lookup;
....@@ -5180,7 +5545,7 @@
51805545 struct map_lookup *map;
51815546 int ret = 0;
51825547
5183
- em = get_chunk_map(fs_info, logical, len);
5548
+ em = btrfs_get_chunk_map(fs_info, logical, len);
51845549
51855550 if(!WARN_ON(IS_ERR(em))) {
51865551 map = em->map_lookup;
....@@ -5202,7 +5567,7 @@
52025567 struct btrfs_device *srcdev;
52035568
52045569 ASSERT((map->type &
5205
- (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
5570
+ (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
52065571
52075572 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
52085573 num_stripes = map->sub_stripes;
....@@ -5240,31 +5605,19 @@
52405605 return preferred_mirror;
52415606 }
52425607
5243
-static inline int parity_smaller(u64 a, u64 b)
5244
-{
5245
- return a > b;
5246
-}
5247
-
52485608 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
52495609 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
52505610 {
5251
- struct btrfs_bio_stripe s;
52525611 int i;
5253
- u64 l;
52545612 int again = 1;
52555613
52565614 while (again) {
52575615 again = 0;
52585616 for (i = 0; i < num_stripes - 1; i++) {
5259
- if (parity_smaller(bbio->raid_map[i],
5260
- bbio->raid_map[i+1])) {
5261
- s = bbio->stripes[i];
5262
- l = bbio->raid_map[i];
5263
- bbio->stripes[i] = bbio->stripes[i+1];
5264
- bbio->raid_map[i] = bbio->raid_map[i+1];
5265
- bbio->stripes[i+1] = s;
5266
- bbio->raid_map[i+1] = l;
5267
-
5617
+ /* Swap if parity is on a smaller index */
5618
+ if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5619
+ swap(bbio->stripes[i], bbio->stripes[i + 1]);
5620
+ swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
52685621 again = 1;
52695622 }
52705623 }
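Parity slots in raid_map hold sentinel values (RAID5_P_STRIPE and RAID6_Q_STRIPE) that compare greater than any real logical address, which is why an ascending bubble sort pushes them to the back. A user-space rendering of the simplified loop, with a GCC-style SWAP macro standing in for the kernel's swap():

#include <stdint.h>

#define SWAP(a, b) \
	do { __typeof__(a) _t = (a); (a) = (b); (b) = _t; } while (0)

/* Bubble-sort stripes[] in lockstep with raid_map[]; parity ends up last */
static void sort_parity(uint64_t *raid_map, int *stripes, int n)
{
	int again = 1;

	while (again) {
		again = 0;
		for (int i = 0; i < n - 1; i++) {
			if (raid_map[i] > raid_map[i + 1]) {
				SWAP(stripes[i], stripes[i + 1]);
				SWAP(raid_map[i], raid_map[i + 1]);
				again = 1;
			}
		}
	}
}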
....@@ -5290,6 +5643,9 @@
52905643 atomic_set(&bbio->error, 0);
52915644 refcount_set(&bbio->refs, 1);
52925645
5646
+ bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5647
+ bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5648
+
52935649 return bbio;
52945650 }
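With tgtdev_map and raid_map now initialized unconditionally, all three variable-length arrays live behind the struct in a single allocation. A reduced user-space sketch of that tail layout; the types are pared down, and the casts mirror the kernel's, so 8-byte alignment of raid_map implicitly depends on the stripe counts:

#include <stdint.h>
#include <stdlib.h>

struct stripe { uint64_t physical; void *dev; };

struct bbio {
	int num_stripes;
	int *tgtdev_map;		/* points into the tail allocation */
	uint64_t *raid_map;		/* likewise */
	struct stripe stripes[];	/* flexible array, allocated inline */
};

static struct bbio *alloc_bbio(int total_stripes, int real_stripes)
{
	struct bbio *b = calloc(1, sizeof(*b) +
				sizeof(struct stripe) * total_stripes +
				sizeof(int) * real_stripes +
				sizeof(uint64_t) * total_stripes);

	if (!b)
		return NULL;
	b->num_stripes = total_stripes;
	b->tgtdev_map = (int *)(b->stripes + total_stripes);
	b->raid_map = (uint64_t *)(b->tgtdev_map + real_stripes);
	return b;
}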
52955651
....@@ -5313,12 +5669,13 @@
53135669 * replace.
53145670 */
53155671 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5316
- u64 logical, u64 length,
5672
+ u64 logical, u64 *length_ret,
53175673 struct btrfs_bio **bbio_ret)
53185674 {
53195675 struct extent_map *em;
53205676 struct map_lookup *map;
53215677 struct btrfs_bio *bbio;
5678
+ u64 length = *length_ret;
53225679 u64 offset;
53235680 u64 stripe_nr;
53245681 u64 stripe_nr_end;
....@@ -5339,7 +5696,7 @@
53395696 /* discard always returns a bbio */
53405697 ASSERT(bbio_ret);
53415698
5342
- em = get_chunk_map(fs_info, logical, length);
5699
+ em = btrfs_get_chunk_map(fs_info, logical, length);
53435700 if (IS_ERR(em))
53445701 return PTR_ERR(em);
53455702
....@@ -5351,7 +5708,8 @@
53515708 }
53525709
53535710 offset = logical - em->start;
5354
- length = min_t(u64, em->len - offset, length);
5711
+ length = min_t(u64, em->start + em->len - logical, length);
5712
+ *length_ret = length;
53555713
53565714 stripe_len = map->stripe_len;
53575715 /*
....@@ -5391,7 +5749,7 @@
53915749 &remaining_stripes);
53925750 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
53935751 last_stripe *= sub_stripes;
5394
- } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5752
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
53955753 BTRFS_BLOCK_GROUP_DUP)) {
53965754 num_stripes = map->num_stripes;
53975755 } else {
....@@ -5635,6 +5993,106 @@
56355993 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
56365994 }
56375995
5996
+/*
5997
+ * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
5998
+ * tuple. This information is used to calculate how big a
5999
+ * particular bio can get before it straddles a stripe.
6000
+ *
6001
+ * @fs_info - the filesystem
6002
+ * @logical - address that we want to figure out the geometry of
6003
+ * @len - the length of IO we are going to perform, starting at @logical
6004
+ * @op - type of operation - write or read
6005
+ * @io_geom - pointer used to return values
6006
+ *
6007
+ * Returns < 0 in case a chunk for the given logical address cannot be found,
6008
+ * which usually shouldn't happen unless @logical is corrupted; 0 otherwise.
6009
+ */
6010
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6011
+ u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
6012
+{
6013
+ struct extent_map *em;
6014
+ struct map_lookup *map;
6015
+ u64 offset;
6016
+ u64 stripe_offset;
6017
+ u64 stripe_nr;
6018
+ u64 stripe_len;
6019
+ u64 raid56_full_stripe_start = (u64)-1;
6020
+ int data_stripes;
6021
+ int ret = 0;
6022
+
6023
+ ASSERT(op != BTRFS_MAP_DISCARD);
6024
+
6025
+ em = btrfs_get_chunk_map(fs_info, logical, len);
6026
+ if (IS_ERR(em))
6027
+ return PTR_ERR(em);
6028
+
6029
+ map = em->map_lookup;
6030
+ /* Offset of this logical address in the chunk */
6031
+ offset = logical - em->start;
6032
+ /* Len of a stripe in a chunk */
6033
+ stripe_len = map->stripe_len;
6034
+	/* Stripe where this block falls in */
6035
+ stripe_nr = div64_u64(offset, stripe_len);
6036
+ /* Offset of stripe in the chunk */
6037
+ stripe_offset = stripe_nr * stripe_len;
6038
+ if (offset < stripe_offset) {
6039
+ btrfs_crit(fs_info,
6040
+"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
6041
+ stripe_offset, offset, em->start, logical, stripe_len);
6042
+ ret = -EINVAL;
6043
+ goto out;
6044
+ }
6045
+
6046
+ /* stripe_offset is the offset of this block in its stripe */
6047
+ stripe_offset = offset - stripe_offset;
6048
+ data_stripes = nr_data_stripes(map);
6049
+
6050
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6051
+ u64 max_len = stripe_len - stripe_offset;
6052
+
6053
+ /*
6054
+		 * In case of RAID56, we need to know the stripe-aligned start
6055
+ */
6056
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6057
+ unsigned long full_stripe_len = stripe_len * data_stripes;
6058
+ raid56_full_stripe_start = offset;
6059
+
6060
+ /*
6061
+ * Allow a write of a full stripe, but make sure we
6062
+ * don't allow straddling of stripes
6063
+ */
6064
+ raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6065
+ full_stripe_len);
6066
+ raid56_full_stripe_start *= full_stripe_len;
6067
+
6068
+ /*
6069
+ * For writes to RAID[56], allow a full stripeset across
6070
+ * all disks. For other RAID types and for RAID[56]
6071
+ * reads, just allow a single stripe (on a single disk).
6072
+ */
6073
+ if (op == BTRFS_MAP_WRITE) {
6074
+ max_len = stripe_len * data_stripes -
6075
+ (offset - raid56_full_stripe_start);
6076
+ }
6077
+ }
6078
+ len = min_t(u64, em->len - offset, max_len);
6079
+ } else {
6080
+ len = em->len - offset;
6081
+ }
6082
+
6083
+ io_geom->len = len;
6084
+ io_geom->offset = offset;
6085
+ io_geom->stripe_len = stripe_len;
6086
+ io_geom->stripe_nr = stripe_nr;
6087
+ io_geom->stripe_offset = stripe_offset;
6088
+ io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6089
+
6090
+out:
6091
+ /* once for us */
6092
+ free_extent_map(em);
6093
+ return ret;
6094
+}
6095
+
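A worked example of the geometry math, assuming a plain striped (non-RAID56) chunk with 64 KiB stripes and an I/O that starts 300 KiB into the chunk:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stripe_len = 64 * 1024;	/* BTRFS_STRIPE_LEN */
	uint64_t offset = 300 * 1024;		/* logical - em->start */

	uint64_t stripe_nr = offset / stripe_len;		/* 4 */
	uint64_t stripe_start = stripe_nr * stripe_len;		/* 256 KiB */
	uint64_t stripe_offset = offset - stripe_start;		/* 44 KiB */
	uint64_t max_len = stripe_len - stripe_offset;		/* 20 KiB */

	/* A bio starting here may span at most 20 KiB without straddling */
	printf("stripe_nr=%llu stripe_offset=%llu max_len=%llu\n",
	       (unsigned long long)stripe_nr,
	       (unsigned long long)stripe_offset,
	       (unsigned long long)max_len);
	return 0;
}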
56386096 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
56396097 enum btrfs_map_op op,
56406098 u64 logical, u64 *length,
....@@ -5643,11 +6101,11 @@
56436101 {
56446102 struct extent_map *em;
56456103 struct map_lookup *map;
5646
- u64 offset;
56476104 u64 stripe_offset;
56486105 u64 stripe_nr;
56496106 u64 stripe_len;
56506107 u32 stripe_index;
6108
+ int data_stripes;
56516109 int i;
56526110 int ret = 0;
56536111 int num_stripes;
....@@ -5660,81 +6118,34 @@
56606118 int patch_the_first_stripe_for_dev_replace = 0;
56616119 u64 physical_to_patch_in_first_stripe = 0;
56626120 u64 raid56_full_stripe_start = (u64)-1;
6121
+ struct btrfs_io_geometry geom;
56636122
5664
- if (op == BTRFS_MAP_DISCARD)
5665
- return __btrfs_map_block_for_discard(fs_info, logical,
5666
- *length, bbio_ret);
6123
+ ASSERT(bbio_ret);
6124
+ ASSERT(op != BTRFS_MAP_DISCARD);
56676125
5668
- em = get_chunk_map(fs_info, logical, *length);
5669
- if (IS_ERR(em))
5670
- return PTR_ERR(em);
6126
+ ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
6127
+ if (ret < 0)
6128
+ return ret;
56716129
6130
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
6131
+ ASSERT(!IS_ERR(em));
56726132 map = em->map_lookup;
5673
- offset = logical - em->start;
56746133
5675
- stripe_len = map->stripe_len;
5676
- stripe_nr = offset;
5677
- /*
5678
- * stripe_nr counts the total number of stripes we have to stride
5679
- * to get to this block
5680
- */
5681
- stripe_nr = div64_u64(stripe_nr, stripe_len);
6134
+ *length = geom.len;
6135
+ stripe_len = geom.stripe_len;
6136
+ stripe_nr = geom.stripe_nr;
6137
+ stripe_offset = geom.stripe_offset;
6138
+ raid56_full_stripe_start = geom.raid56_stripe_offset;
6139
+ data_stripes = nr_data_stripes(map);
56826140
5683
- stripe_offset = stripe_nr * stripe_len;
5684
- if (offset < stripe_offset) {
5685
- btrfs_crit(fs_info,
5686
- "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
5687
- stripe_offset, offset, em->start, logical,
5688
- stripe_len);
5689
- free_extent_map(em);
5690
- return -EINVAL;
5691
- }
5692
-
5693
- /* stripe_offset is the offset of this block in its stripe*/
5694
- stripe_offset = offset - stripe_offset;
5695
-
5696
- /* if we're here for raid56, we need to know the stripe aligned start */
5697
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5698
- unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
5699
- raid56_full_stripe_start = offset;
5700
-
5701
- /* allow a write of a full stripe, but make sure we don't
5702
- * allow straddling of stripes
5703
- */
5704
- raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5705
- full_stripe_len);
5706
- raid56_full_stripe_start *= full_stripe_len;
5707
- }
5708
-
5709
- if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5710
- u64 max_len;
5711
- /* For writes to RAID[56], allow a full stripeset across all disks.
5712
- For other RAID types and for RAID[56] reads, just allow a single
5713
- stripe (on a single disk). */
5714
- if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
5715
- (op == BTRFS_MAP_WRITE)) {
5716
- max_len = stripe_len * nr_data_stripes(map) -
5717
- (offset - raid56_full_stripe_start);
5718
- } else {
5719
- /* we limit the length of each bio to what fits in a stripe */
5720
- max_len = stripe_len - stripe_offset;
5721
- }
5722
- *length = min_t(u64, em->len - offset, max_len);
5723
- } else {
5724
- *length = em->len - offset;
5725
- }
5726
-
5727
- /* This is for when we're called from btrfs_merge_bio_hook() and all
5728
- it cares about is the length */
5729
- if (!bbio_ret)
5730
- goto out;
5731
-
5732
- btrfs_dev_replace_read_lock(dev_replace);
6141
+ down_read(&dev_replace->rwsem);
57336142 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6143
+ /*
6144
+ * Hold the semaphore for read during the whole operation, write is
6145
+ * requested at commit time but must wait.
6146
+ */
57346147 if (!dev_replace_is_ongoing)
5735
- btrfs_dev_replace_read_unlock(dev_replace);
5736
- else
5737
- btrfs_dev_replace_set_lock_blocking(dev_replace);
6148
+ up_read(&dev_replace->rwsem);
57386149
57396150 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
57406151 !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
....@@ -5757,7 +6168,7 @@
57576168 &stripe_index);
57586169 if (!need_full_stripe(op))
57596170 mirror_num = 1;
5760
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
6171
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
57616172 if (need_full_stripe(op))
57626173 num_stripes = map->num_stripes;
57636174 else if (mirror_num)
....@@ -5799,7 +6210,7 @@
57996210 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
58006211 /* push stripe_nr back to the start of the full stripe */
58016212 stripe_nr = div64_u64(raid56_full_stripe_start,
5802
- stripe_len * nr_data_stripes(map));
6213
+ stripe_len * data_stripes);
58036214
58046215 /* RAID[56] write or recovery. Return all stripes */
58056216 num_stripes = map->num_stripes;
....@@ -5815,10 +6226,9 @@
58156226 * Mirror #3 is RAID6 Q block.
58166227 */
58176228 stripe_nr = div_u64_rem(stripe_nr,
5818
- nr_data_stripes(map), &stripe_index);
6229
+ data_stripes, &stripe_index);
58196230 if (mirror_num > 1)
5820
- stripe_index = nr_data_stripes(map) +
5821
- mirror_num - 2;
6231
+ stripe_index = data_stripes + mirror_num - 2;
58226232
58236233 /* We distribute the parity blocks across stripes */
58246234 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
....@@ -5858,8 +6268,13 @@
58586268 ret = -ENOMEM;
58596269 goto out;
58606270 }
5861
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
5862
- bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
6271
+
6272
+ for (i = 0; i < num_stripes; i++) {
6273
+ bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6274
+ stripe_offset + stripe_nr * map->stripe_len;
6275
+ bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6276
+ stripe_index++;
6277
+ }
58636278
58646279 /* build raid_map */
58656280 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
....@@ -5867,17 +6282,12 @@
58676282 u64 tmp;
58686283 unsigned rot;
58696284
5870
- bbio->raid_map = (u64 *)((void *)bbio->stripes +
5871
- sizeof(struct btrfs_bio_stripe) *
5872
- num_alloc_stripes +
5873
- sizeof(int) * tgtdev_indexes);
5874
-
58756285 /* Work out the disk rotation on this stripe-set */
58766286 div_u64_rem(stripe_nr, num_stripes, &rot);
58776287
58786288 /* Fill in the logical address of each stripe */
5879
- tmp = stripe_nr * nr_data_stripes(map);
5880
- for (i = 0; i < nr_data_stripes(map); i++)
6289
+ tmp = stripe_nr * data_stripes;
6290
+ for (i = 0; i < data_stripes; i++)
58816291 bbio->raid_map[(i+rot) % num_stripes] =
58826292 em->start + (tmp + i) * map->stripe_len;
58836293
....@@ -5885,24 +6295,12 @@
58856295 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
58866296 bbio->raid_map[(i+rot+1) % num_stripes] =
58876297 RAID6_Q_STRIPE;
5888
- }
58896298
5890
-
5891
- for (i = 0; i < num_stripes; i++) {
5892
- bbio->stripes[i].physical =
5893
- map->stripes[stripe_index].physical +
5894
- stripe_offset +
5895
- stripe_nr * map->stripe_len;
5896
- bbio->stripes[i].dev =
5897
- map->stripes[stripe_index].dev;
5898
- stripe_index++;
6299
+ sort_parity_stripes(bbio, num_stripes);
58996300 }
59006301
59016302 if (need_full_stripe(op))
59026303 max_errors = btrfs_chunk_max_errors(map);
5903
-
5904
- if (bbio->raid_map)
5905
- sort_parity_stripes(bbio, num_stripes);
59066304
59076305 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
59086306 need_full_stripe(op)) {
....@@ -5929,8 +6327,9 @@
59296327 }
59306328 out:
59316329 if (dev_replace_is_ongoing) {
5932
- btrfs_dev_replace_clear_lock_blocking(dev_replace);
5933
- btrfs_dev_replace_read_unlock(dev_replace);
6330
+ lockdep_assert_held(&dev_replace->rwsem);
6331
+ /* Unlock and let waiting writers proceed */
6332
+ up_read(&dev_replace->rwsem);
59346333 }
59356334 free_extent_map(em);
59366335 return ret;
....@@ -5940,6 +6339,10 @@
59406339 u64 logical, u64 *length,
59416340 struct btrfs_bio **bbio_ret, int mirror_num)
59426341 {
6342
+ if (op == BTRFS_MAP_DISCARD)
6343
+ return __btrfs_map_block_for_discard(fs_info, logical,
6344
+ length, bbio_ret);
6345
+
59436346 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
59446347 mirror_num, 0);
59456348 }
....@@ -5950,75 +6353,6 @@
59506353 struct btrfs_bio **bbio_ret)
59516354 {
59526355 return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
5953
-}
5954
-
5955
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
5956
- u64 physical, u64 **logical, int *naddrs, int *stripe_len)
5957
-{
5958
- struct extent_map *em;
5959
- struct map_lookup *map;
5960
- u64 *buf;
5961
- u64 bytenr;
5962
- u64 length;
5963
- u64 stripe_nr;
5964
- u64 rmap_len;
5965
- int i, j, nr = 0;
5966
-
5967
- em = get_chunk_map(fs_info, chunk_start, 1);
5968
- if (IS_ERR(em))
5969
- return -EIO;
5970
-
5971
- map = em->map_lookup;
5972
- length = em->len;
5973
- rmap_len = map->stripe_len;
5974
-
5975
- if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5976
- length = div_u64(length, map->num_stripes / map->sub_stripes);
5977
- else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5978
- length = div_u64(length, map->num_stripes);
5979
- else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5980
- length = div_u64(length, nr_data_stripes(map));
5981
- rmap_len = map->stripe_len * nr_data_stripes(map);
5982
- }
5983
-
5984
- buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
5985
- BUG_ON(!buf); /* -ENOMEM */
5986
-
5987
- for (i = 0; i < map->num_stripes; i++) {
5988
- if (map->stripes[i].physical > physical ||
5989
- map->stripes[i].physical + length <= physical)
5990
- continue;
5991
-
5992
- stripe_nr = physical - map->stripes[i].physical;
5993
- stripe_nr = div64_u64(stripe_nr, map->stripe_len);
5994
-
5995
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5996
- stripe_nr = stripe_nr * map->num_stripes + i;
5997
- stripe_nr = div_u64(stripe_nr, map->sub_stripes);
5998
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5999
- stripe_nr = stripe_nr * map->num_stripes + i;
6000
- } /* else if RAID[56], multiply by nr_data_stripes().
6001
- * Alternatively, just use rmap_len below instead of
6002
- * map->stripe_len */
6003
-
6004
- bytenr = chunk_start + stripe_nr * rmap_len;
6005
- WARN_ON(nr >= map->num_stripes);
6006
- for (j = 0; j < nr; j++) {
6007
- if (buf[j] == bytenr)
6008
- break;
6009
- }
6010
- if (j == nr) {
6011
- WARN_ON(nr >= map->num_stripes);
6012
- buf[nr++] = bytenr;
6013
- }
6014
- }
6015
-
6016
- *logical = buf;
6017
- *naddrs = nr;
6018
- *stripe_len = rmap_len;
6019
-
6020
- free_extent_map(em);
6021
- return 0;
60226356 }
60236357
60246358 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
....@@ -6039,23 +6373,18 @@
60396373 atomic_inc(&bbio->error);
60406374 if (bio->bi_status == BLK_STS_IOERR ||
60416375 bio->bi_status == BLK_STS_TARGET) {
6042
- unsigned int stripe_index =
6043
- btrfs_io_bio(bio)->stripe_index;
6044
- struct btrfs_device *dev;
6376
+ struct btrfs_device *dev = btrfs_io_bio(bio)->device;
60456377
6046
- BUG_ON(stripe_index >= bbio->num_stripes);
6047
- dev = bbio->stripes[stripe_index].dev;
6048
- if (dev->bdev) {
6049
- if (bio_op(bio) == REQ_OP_WRITE)
6050
- btrfs_dev_stat_inc_and_print(dev,
6378
+ ASSERT(dev->bdev);
6379
+ if (bio_op(bio) == REQ_OP_WRITE)
6380
+ btrfs_dev_stat_inc_and_print(dev,
60516381 BTRFS_DEV_STAT_WRITE_ERRS);
6052
- else if (!(bio->bi_opf & REQ_RAHEAD))
6053
- btrfs_dev_stat_inc_and_print(dev,
6382
+ else if (!(bio->bi_opf & REQ_RAHEAD))
6383
+ btrfs_dev_stat_inc_and_print(dev,
60546384 BTRFS_DEV_STAT_READ_ERRS);
6055
- if (bio->bi_opf & REQ_PREFLUSH)
6056
- btrfs_dev_stat_inc_and_print(dev,
6385
+ if (bio->bi_opf & REQ_PREFLUSH)
6386
+ btrfs_dev_stat_inc_and_print(dev,
60576387 BTRFS_DEV_STAT_FLUSH_ERRS);
6058
- }
60596388 }
60606389 }
60616390
....@@ -6090,73 +6419,25 @@
60906419 }
60916420 }
60926421
6093
-/*
6094
- * see run_scheduled_bios for a description of why bios are collected for
6095
- * async submit.
6096
- *
6097
- * This will add one bio to the pending list for a device and make sure
6098
- * the work struct is scheduled.
6099
- */
6100
-static noinline void btrfs_schedule_bio(struct btrfs_device *device,
6101
- struct bio *bio)
6102
-{
6103
- struct btrfs_fs_info *fs_info = device->fs_info;
6104
- int should_queue = 1;
6105
- struct btrfs_pending_bios *pending_bios;
6106
-
6107
- /* don't bother with additional async steps for reads, right now */
6108
- if (bio_op(bio) == REQ_OP_READ) {
6109
- btrfsic_submit_bio(bio);
6110
- return;
6111
- }
6112
-
6113
- WARN_ON(bio->bi_next);
6114
- bio->bi_next = NULL;
6115
-
6116
- spin_lock(&device->io_lock);
6117
- if (op_is_sync(bio->bi_opf))
6118
- pending_bios = &device->pending_sync_bios;
6119
- else
6120
- pending_bios = &device->pending_bios;
6121
-
6122
- if (pending_bios->tail)
6123
- pending_bios->tail->bi_next = bio;
6124
-
6125
- pending_bios->tail = bio;
6126
- if (!pending_bios->head)
6127
- pending_bios->head = bio;
6128
- if (device->running_pending)
6129
- should_queue = 0;
6130
-
6131
- spin_unlock(&device->io_lock);
6132
-
6133
- if (should_queue)
6134
- btrfs_queue_work(fs_info->submit_workers, &device->work);
6135
-}
6136
-
61376422 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6138
- u64 physical, int dev_nr, int async)
6423
+ u64 physical, struct btrfs_device *dev)
61396424 {
6140
- struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
61416425 struct btrfs_fs_info *fs_info = bbio->fs_info;
61426426
61436427 bio->bi_private = bbio;
6144
- btrfs_io_bio(bio)->stripe_index = dev_nr;
6428
+ btrfs_io_bio(bio)->device = dev;
61456429 bio->bi_end_io = btrfs_end_bio;
61466430 bio->bi_iter.bi_sector = physical >> 9;
61476431 btrfs_debug_in_rcu(fs_info,
61486432 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
61496433 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
6150
- (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
6151
- bio->bi_iter.bi_size);
6434
+ (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6435
+ dev->devid, bio->bi_iter.bi_size);
61526436 bio_set_dev(bio, dev->bdev);
61536437
61546438 btrfs_bio_counter_inc_noblocked(fs_info);
61556439
6156
- if (async)
6157
- btrfs_schedule_bio(dev, bio);
6158
- else
6159
- btrfsic_submit_bio(bio);
6440
+ btrfsic_submit_bio(bio);
61606441 }
61616442
61626443 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
....@@ -6177,7 +6458,7 @@
61776458 }
61786459
61796460 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6180
- int mirror_num, int async_submit)
6461
+ int mirror_num)
61816462 {
61826463 struct btrfs_device *dev;
61836464 struct bio *first_bio = bio;
....@@ -6245,8 +6526,7 @@
62456526 else
62466527 bio = first_bio;
62476528
6248
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
6249
- dev_nr, async_submit);
6529
+ submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
62506530 }
62516531 btrfs_bio_counter_dec(fs_info);
62526532 return BLK_STS_OK;
....@@ -6262,15 +6542,25 @@
62626542 * If @seed is true, traverse through the seed devices.
62636543 */
62646544 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6265
- u64 devid, u8 *uuid, u8 *fsid,
6266
- bool seed)
6545
+ u64 devid, u8 *uuid, u8 *fsid,
6546
+ bool seed)
62676547 {
62686548 struct btrfs_device *device;
6549
+ struct btrfs_fs_devices *seed_devs;
62696550
6270
- while (fs_devices) {
6551
+ if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6552
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
6553
+ if (device->devid == devid &&
6554
+ (!uuid || memcmp(device->uuid, uuid,
6555
+ BTRFS_UUID_SIZE) == 0))
6556
+ return device;
6557
+ }
6558
+ }
6559
+
6560
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
62716561 if (!fsid ||
6272
- !memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
6273
- list_for_each_entry(device, &fs_devices->devices,
6562
+ !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6563
+ list_for_each_entry(device, &seed_devs->devices,
62746564 dev_list) {
62756565 if (device->devid == devid &&
62766566 (!uuid || memcmp(device->uuid, uuid,
....@@ -6278,11 +6568,8 @@
62786568 return device;
62796569 }
62806570 }
6281
- if (seed)
6282
- fs_devices = fs_devices->seed;
6283
- else
6284
- return NULL;
62856571 }
6572
+
62866573 return NULL;
62876574 }
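The flat seed_list iteration replaces the old recursive hop through fs_devices->seed. A reduced sketch of the two-level search, matching on devid only where the real helper also compares the device uuid and the fsid:

#include <stddef.h>
#include <stdint.h>

struct device { uint64_t devid; struct device *next; };
struct devset { struct device *devices; struct devset *next_seed; };

/* Search the main set first, then every seed set on the flat list */
static struct device *find_device(struct devset *main_set, uint64_t devid)
{
	for (struct device *d = main_set->devices; d; d = d->next)
		if (d->devid == devid)
			return d;
	for (struct devset *s = main_set->next_seed; s; s = s->next_seed)
		for (struct device *d = s->devices; d; d = d->next)
			if (d->devid == devid)
				return d;
	return NULL;
}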
62886575
....@@ -6337,7 +6624,7 @@
63376624 if (WARN_ON(!devid && !fs_info))
63386625 return ERR_PTR(-EINVAL);
63396626
6340
- dev = __alloc_device();
6627
+ dev = __alloc_device(fs_info);
63416628 if (IS_ERR(dev))
63426629 return dev;
63436630
....@@ -6359,9 +6646,6 @@
63596646 else
63606647 generate_random_uuid(dev->uuid);
63616648
6362
- btrfs_init_work(&dev->work, btrfs_submit_helper,
6363
- pending_bios_fn, NULL, NULL);
6364
-
63656649 return dev;
63666650 }
63676651
....@@ -6376,11 +6660,26 @@
63766660 devid, uuid);
63776661 }
63786662
6379
-static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
6380
- struct extent_buffer *leaf,
6663
+static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6664
+{
6665
+ int index = btrfs_bg_flags_to_raid_index(type);
6666
+ int ncopies = btrfs_raid_array[index].ncopies;
6667
+ const int nparity = btrfs_raid_array[index].nparity;
6668
+ int data_stripes;
6669
+
6670
+ if (nparity)
6671
+ data_stripes = num_stripes - nparity;
6672
+ else
6673
+ data_stripes = num_stripes / ncopies;
6674
+
6675
+ return div_u64(chunk_len, data_stripes);
6676
+}
6677
+
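Two worked cases for the stripe-length recovery, restating the nparity-first, ncopies-otherwise arithmetic in stand-alone form:

#include <stdint.h>
#include <stdio.h>

static uint64_t calc_stripe_length(uint64_t chunk_len, int num_stripes,
				   int ncopies, int nparity)
{
	int data_stripes = nparity ? num_stripes - nparity
				   : num_stripes / ncopies;
	return chunk_len / data_stripes;
}

int main(void)
{
	/* RAID10: 4 stripes, 2 copies -> 2 data stripes, 512 MiB each */
	printf("%llu\n", (unsigned long long)
	       calc_stripe_length(1ULL << 30, 4, 2, 0));
	/* RAID6: 6 stripes, 2 parity -> 4 data stripes, 256 MiB each */
	printf("%llu\n", (unsigned long long)
	       calc_stripe_length(1ULL << 30, 6, 1, 2));
	return 0;
}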
6678
+static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
63816679 struct btrfs_chunk *chunk)
63826680 {
6383
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
6681
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
6682
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
63846683 struct map_lookup *map;
63856684 struct extent_map *em;
63866685 u64 logical;
....@@ -6400,14 +6699,14 @@
64006699 * as chunk item in tree block is already verified by tree-checker.
64016700 */
64026701 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6403
- ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
6702
+ ret = btrfs_check_chunk_valid(leaf, chunk, logical);
64046703 if (ret)
64056704 return ret;
64066705 }
64076706
6408
- read_lock(&map_tree->map_tree.lock);
6409
- em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
6410
- read_unlock(&map_tree->map_tree.lock);
6707
+ read_lock(&map_tree->lock);
6708
+ em = lookup_extent_mapping(map_tree, logical, 1);
6709
+ read_unlock(&map_tree->lock);
64116710
64126711 /* already mapped? */
64136712 if (em && em->start <= logical && em->start + em->len > logical) {
....@@ -6441,6 +6740,8 @@
64416740 map->type = btrfs_chunk_type(leaf, chunk);
64426741 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
64436742 map->verified_stripes = 0;
6743
+ em->orig_block_len = calc_stripe_length(map->type, em->len,
6744
+ map->num_stripes);
64446745 for (i = 0; i < num_stripes; i++) {
64456746 map->stripes[i].physical =
64466747 btrfs_stripe_offset_nr(leaf, chunk, i);
....@@ -6449,7 +6750,7 @@
64496750 btrfs_stripe_dev_uuid_nr(chunk, i),
64506751 BTRFS_UUID_SIZE);
64516752 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6452
- devid, uuid, NULL, true);
6753
+ devid, uuid, NULL, true);
64536754 if (!map->stripes[i].dev &&
64546755 !btrfs_test_opt(fs_info, DEGRADED)) {
64556756 free_extent_map(em);
....@@ -6474,9 +6775,9 @@
64746775
64756776 }
64766777
6477
- write_lock(&map_tree->map_tree.lock);
6478
- ret = add_extent_mapping(&map_tree->map_tree, em, 0);
6479
- write_unlock(&map_tree->map_tree.lock);
6778
+ write_lock(&map_tree->lock);
6779
+ ret = add_extent_mapping(map_tree, em, 0);
6780
+ write_unlock(&map_tree->lock);
64806781 if (ret < 0) {
64816782 btrfs_err(fs_info,
64826783 "failed to add chunk map, start=%llu len=%llu: %d",
....@@ -6519,28 +6820,30 @@
65196820 lockdep_assert_held(&uuid_mutex);
65206821 ASSERT(fsid);
65216822
6522
- fs_devices = fs_info->fs_devices->seed;
6523
- while (fs_devices) {
6823
+ /* This will match only for multi-device seed fs */
6824
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
65246825 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
65256826 return fs_devices;
65266827
6527
- fs_devices = fs_devices->seed;
6528
- }
65296828
6530
- fs_devices = find_fsid(fsid);
6829
+ fs_devices = find_fsid(fsid, NULL);
65316830 if (!fs_devices) {
65326831 if (!btrfs_test_opt(fs_info, DEGRADED))
65336832 return ERR_PTR(-ENOENT);
65346833
6535
- fs_devices = alloc_fs_devices(fsid);
6834
+ fs_devices = alloc_fs_devices(fsid, NULL);
65366835 if (IS_ERR(fs_devices))
65376836 return fs_devices;
65386837
6539
- fs_devices->seeding = 1;
6838
+ fs_devices->seeding = true;
65406839 fs_devices->opened = 1;
65416840 return fs_devices;
65426841 }
65436842
6843
+ /*
6844
+ * Upon first call for a seed fs fsid, just create a private copy of the
6845
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6846
+ */
65446847 fs_devices = clone_fs_devices(fs_devices);
65456848 if (IS_ERR(fs_devices))
65466849 return fs_devices;
....@@ -6548,27 +6851,24 @@
65486851 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
65496852 if (ret) {
65506853 free_fs_devices(fs_devices);
6551
- fs_devices = ERR_PTR(ret);
6552
- goto out;
6854
+ return ERR_PTR(ret);
65536855 }
65546856
65556857 if (!fs_devices->seeding) {
65566858 close_fs_devices(fs_devices);
65576859 free_fs_devices(fs_devices);
6558
- fs_devices = ERR_PTR(-EINVAL);
6559
- goto out;
6860
+ return ERR_PTR(-EINVAL);
65606861 }
65616862
6562
- fs_devices->seed = fs_info->fs_devices->seed;
6563
- fs_info->fs_devices->seed = fs_devices;
6564
-out:
6863
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
6864
+
65656865 return fs_devices;
65666866 }
65676867
6568
-static int read_one_dev(struct btrfs_fs_info *fs_info,
6569
- struct extent_buffer *leaf,
6868
+static int read_one_dev(struct extent_buffer *leaf,
65706869 struct btrfs_dev_item *dev_item)
65716870 {
6871
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
65726872 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
65736873 struct btrfs_device *device;
65746874 u64 devid;
....@@ -6582,7 +6882,7 @@
65826882 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
65836883 BTRFS_FSID_SIZE);
65846884
6585
- if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
6885
+ if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
65866886 fs_devices = open_seed_devices(fs_info, fs_uuid);
65876887 if (IS_ERR(fs_devices))
65886888 return PTR_ERR(fs_devices);
....@@ -6725,48 +7025,49 @@
67257025 sb_array_offset += len;
67267026 cur_offset += len;
67277027
6728
- if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6729
- chunk = (struct btrfs_chunk *)sb_array_offset;
6730
- /*
6731
- * At least one btrfs_chunk with one stripe must be
6732
- * present, exact stripe count check comes afterwards
6733
- */
6734
- len = btrfs_chunk_item_size(1);
6735
- if (cur_offset + len > array_size)
6736
- goto out_short_read;
6737
-
6738
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6739
- if (!num_stripes) {
6740
- btrfs_err(fs_info,
6741
- "invalid number of stripes %u in sys_array at offset %u",
6742
- num_stripes, cur_offset);
6743
- ret = -EIO;
6744
- break;
6745
- }
6746
-
6747
- type = btrfs_chunk_type(sb, chunk);
6748
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
6749
- btrfs_err(fs_info,
6750
- "invalid chunk type %llu in sys_array at offset %u",
6751
- type, cur_offset);
6752
- ret = -EIO;
6753
- break;
6754
- }
6755
-
6756
- len = btrfs_chunk_item_size(num_stripes);
6757
- if (cur_offset + len > array_size)
6758
- goto out_short_read;
6759
-
6760
- ret = read_one_chunk(fs_info, &key, sb, chunk);
6761
- if (ret)
6762
- break;
6763
- } else {
7028
+ if (key.type != BTRFS_CHUNK_ITEM_KEY) {
67647029 btrfs_err(fs_info,
67657030 "unexpected item type %u in sys_array at offset %u",
67667031 (u32)key.type, cur_offset);
67677032 ret = -EIO;
67687033 break;
67697034 }
7035
+
7036
+ chunk = (struct btrfs_chunk *)sb_array_offset;
7037
+ /*
7038
+	 * At least one btrfs_chunk with one stripe must be present;
7039
+	 * the exact stripe count check comes afterwards
7040
+ */
7041
+ len = btrfs_chunk_item_size(1);
7042
+ if (cur_offset + len > array_size)
7043
+ goto out_short_read;
7044
+
7045
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7046
+ if (!num_stripes) {
7047
+ btrfs_err(fs_info,
7048
+ "invalid number of stripes %u in sys_array at offset %u",
7049
+ num_stripes, cur_offset);
7050
+ ret = -EIO;
7051
+ break;
7052
+ }
7053
+
7054
+ type = btrfs_chunk_type(sb, chunk);
7055
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7056
+ btrfs_err(fs_info,
7057
+ "invalid chunk type %llu in sys_array at offset %u",
7058
+ type, cur_offset);
7059
+ ret = -EIO;
7060
+ break;
7061
+ }
7062
+
7063
+ len = btrfs_chunk_item_size(num_stripes);
7064
+ if (cur_offset + len > array_size)
7065
+ goto out_short_read;
7066
+
7067
+ ret = read_one_chunk(&key, sb, chunk);
7068
+ if (ret)
7069
+ break;
7070
+
67707071 array_ptr += len;
67717072 sb_array_offset += len;
67727073 cur_offset += len;
....@@ -6794,14 +7095,14 @@
67947095 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
67957096 struct btrfs_device *failing_dev)
67967097 {
6797
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
7098
+ struct extent_map_tree *map_tree = &fs_info->mapping_tree;
67987099 struct extent_map *em;
67997100 u64 next_start = 0;
68007101 bool ret = true;
68017102
6802
- read_lock(&map_tree->map_tree.lock);
6803
- em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
6804
- read_unlock(&map_tree->map_tree.lock);
7103
+ read_lock(&map_tree->lock);
7104
+ em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7105
+ read_unlock(&map_tree->lock);
68057106 /* No chunk at all? Return false anyway */
68067107 if (!em) {
68077108 ret = false;
....@@ -6830,7 +7131,7 @@
68307131 if (missing > max_tolerated) {
68317132 if (!failing_dev)
68327133 btrfs_warn(fs_info,
6833
- "chunk %llu missing %d devices, max tolerance is %d for writeable mount",
7134
+ "chunk %llu missing %d devices, max tolerance is %d for writable mount",
68347135 em->start, missing, max_tolerated);
68357136 free_extent_map(em);
68367137 ret = false;
....@@ -6839,13 +7140,26 @@
68397140 next_start = extent_map_end(em);
68407141 free_extent_map(em);
68417142
6842
- read_lock(&map_tree->map_tree.lock);
6843
- em = lookup_extent_mapping(&map_tree->map_tree, next_start,
7143
+ read_lock(&map_tree->lock);
7144
+ em = lookup_extent_mapping(map_tree, next_start,
68447145 (u64)(-1) - next_start);
6845
- read_unlock(&map_tree->map_tree.lock);
7146
+ read_unlock(&map_tree->lock);
68467147 }
68477148 out:
68487149 return ret;
7150
+}
7151
+
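+/*
+ * Trigger readahead on every child block of @node so that the chunk tree
+ * scan in btrfs_read_chunk_tree() below finds the leaves already cached.
+ */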
7152
+static void readahead_tree_node_children(struct extent_buffer *node)
7153
+{
7154
+ int i;
7155
+ const int nr_items = btrfs_header_nritems(node);
7156
+
7157
+ for (i = 0; i < nr_items; i++) {
7158
+ u64 start;
7159
+
7160
+ start = btrfs_node_blockptr(node, i);
7161
+ readahead_tree_block(node->fs_info, start);
7162
+ }
68497163 }
68507164
68517165 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
....@@ -6858,6 +7172,7 @@
68587172 int ret;
68597173 int slot;
68607174 u64 total_dev = 0;
7175
+ u64 last_ra_node = 0;
68617176
68627177 path = btrfs_alloc_path();
68637178 if (!path)
....@@ -6868,7 +7183,6 @@
68687183 * otherwise we don't need it.
68697184 */
68707185 mutex_lock(&uuid_mutex);
6871
- mutex_lock(&fs_info->chunk_mutex);
68727186
68737187 /*
68747188 * It is possible for mount and umount to race in such a way that
....@@ -6891,6 +7205,8 @@
68917205 if (ret < 0)
68927206 goto error;
68937207 while (1) {
7208
+ struct extent_buffer *node;
7209
+
68947210 leaf = path->nodes[0];
68957211 slot = path->slots[0];
68967212 if (slot >= btrfs_header_nritems(leaf)) {
....@@ -6901,19 +7217,32 @@
69017217 goto error;
69027218 break;
69037219 }
7220
+ /*
7221
+	 * The nodes on level 1 are not locked, but we don't need locking
7222
+	 * during mount time as nothing else can access the tree
7223
+ */
7224
+ node = path->nodes[1];
7225
+ if (node) {
7226
+ if (last_ra_node != node->start) {
7227
+ readahead_tree_node_children(node);
7228
+ last_ra_node = node->start;
7229
+ }
7230
+ }
69047231 btrfs_item_key_to_cpu(leaf, &found_key, slot);
69057232 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
69067233 struct btrfs_dev_item *dev_item;
69077234 dev_item = btrfs_item_ptr(leaf, slot,
69087235 struct btrfs_dev_item);
6909
- ret = read_one_dev(fs_info, leaf, dev_item);
7236
+ ret = read_one_dev(leaf, dev_item);
69107237 if (ret)
69117238 goto error;
69127239 total_dev++;
69137240 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
69147241 struct btrfs_chunk *chunk;
69157242 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6916
- ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
7243
+ mutex_lock(&fs_info->chunk_mutex);
7244
+ ret = read_one_chunk(&found_key, leaf, chunk);
7245
+ mutex_unlock(&fs_info->chunk_mutex);
69177246 if (ret)
69187247 goto error;
69197248 }
....@@ -6925,12 +7254,12 @@
69257254 * do another round of validation checks.
69267255 */
69277256 if (total_dev != fs_info->fs_devices->total_devices) {
6928
- btrfs_err(fs_info,
6929
- "super_num_devices %llu mismatch with num_devices %llu found here",
7257
+ btrfs_warn(fs_info,
7258
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
69307259 btrfs_super_num_devices(fs_info->super_copy),
69317260 total_dev);
6932
- ret = -EINVAL;
6933
- goto error;
7261
+ fs_info->fs_devices->total_devices = total_dev;
7262
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
69347263 }
69357264 if (btrfs_super_total_bytes(fs_info->super_copy) <
69367265 fs_info->fs_devices->total_rw_bytes) {
....@@ -6943,7 +7272,6 @@
69437272 }
69447273 ret = 0;
69457274 error:
6946
- mutex_unlock(&fs_info->chunk_mutex);
69477275 mutex_unlock(&uuid_mutex);
69487276
69497277 btrfs_free_path(path);
....@@ -6952,86 +7280,117 @@
69527280
69537281 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
69547282 {
6955
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7283
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
69567284 struct btrfs_device *device;
69577285
6958
- while (fs_devices) {
6959
- mutex_lock(&fs_devices->device_list_mutex);
6960
- list_for_each_entry(device, &fs_devices->devices, dev_list)
6961
- device->fs_info = fs_info;
6962
- mutex_unlock(&fs_devices->device_list_mutex);
7286
+ fs_devices->fs_info = fs_info;
69637287
6964
- fs_devices = fs_devices->seed;
7288
+ mutex_lock(&fs_devices->device_list_mutex);
7289
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
7290
+ device->fs_info = fs_info;
7291
+
7292
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7293
+ list_for_each_entry(device, &seed_devs->devices, dev_list)
7294
+ device->fs_info = fs_info;
7295
+
7296
+ seed_devs->fs_info = fs_info;
69657297 }
7298
+ mutex_unlock(&fs_devices->device_list_mutex);
69667299 }
69677300
6968
-static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
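+/*
+ * Accessors for a single u64 counter of a btrfs_dev_stats_item, which
+ * stores BTRFS_DEV_STAT_VALUES_MAX little-endian values back to back.
+ */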
7301
+static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7302
+ const struct btrfs_dev_stats_item *ptr,
7303
+ int index)
69697304 {
6970
- int i;
7305
+ u64 val;
69717306
6972
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6973
- btrfs_dev_stat_reset(dev, i);
7307
+ read_extent_buffer(eb, &val,
7308
+ offsetof(struct btrfs_dev_stats_item, values) +
7309
+ ((unsigned long)ptr) + (index * sizeof(u64)),
7310
+ sizeof(val));
7311
+ return val;
7312
+}
7313
+
7314
+static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7315
+ struct btrfs_dev_stats_item *ptr,
7316
+ int index, u64 val)
7317
+{
7318
+ write_extent_buffer(eb, &val,
7319
+ offsetof(struct btrfs_dev_stats_item, values) +
7320
+ ((unsigned long)ptr) + (index * sizeof(u64)),
7321
+ sizeof(val));
7322
+}
7323
+
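+/*
+ * Load the persisted stats item of @device, or zero all counters when no
+ * item is found; the in-memory stats are marked valid in both cases.
+ */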
7324
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7325
+ struct btrfs_path *path)
7326
+{
7327
+ struct btrfs_dev_stats_item *ptr;
7328
+ struct extent_buffer *eb;
7329
+ struct btrfs_key key;
7330
+ int item_size;
7331
+ int i, ret, slot;
7332
+
7333
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
7334
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
7335
+ key.offset = device->devid;
7336
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7337
+ if (ret) {
7338
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7339
+ btrfs_dev_stat_set(device, i, 0);
7340
+ device->dev_stats_valid = 1;
7341
+ btrfs_release_path(path);
7342
+ return ret < 0 ? ret : 0;
7343
+ }
7344
+ slot = path->slots[0];
7345
+ eb = path->nodes[0];
7346
+ item_size = btrfs_item_size_nr(eb, slot);
7347
+
7348
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7349
+
7350
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7351
+ if (item_size >= (1 + i) * sizeof(__le64))
7352
+ btrfs_dev_stat_set(device, i,
7353
+ btrfs_dev_stats_value(eb, ptr, i));
7354
+ else
7355
+ btrfs_dev_stat_set(device, i, 0);
7356
+ }
7357
+
7358
+ device->dev_stats_valid = 1;
7359
+ btrfs_dev_stat_print_on_load(device);
7360
+ btrfs_release_path(path);
7361
+
7362
+ return 0;
69747363 }
69757364
69767365 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
69777366 {
6978
- struct btrfs_key key;
6979
- struct btrfs_key found_key;
6980
- struct btrfs_root *dev_root = fs_info->dev_root;
6981
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6982
- struct extent_buffer *eb;
6983
- int slot;
6984
- int ret = 0;
7367
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
69857368 struct btrfs_device *device;
69867369 struct btrfs_path *path = NULL;
6987
- int i;
7370
+ int ret = 0;
69887371
69897372 path = btrfs_alloc_path();
6990
- if (!path) {
6991
- ret = -ENOMEM;
6992
- goto out;
6993
- }
7373
+ if (!path)
7374
+ return -ENOMEM;
69947375
69957376 mutex_lock(&fs_devices->device_list_mutex);
69967377 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6997
- int item_size;
6998
- struct btrfs_dev_stats_item *ptr;
6999
-
7000
- key.objectid = BTRFS_DEV_STATS_OBJECTID;
7001
- key.type = BTRFS_PERSISTENT_ITEM_KEY;
7002
- key.offset = device->devid;
7003
- ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
7004
- if (ret) {
7005
- __btrfs_reset_dev_stats(device);
7006
- device->dev_stats_valid = 1;
7007
- btrfs_release_path(path);
7008
- continue;
7009
- }
7010
- slot = path->slots[0];
7011
- eb = path->nodes[0];
7012
- btrfs_item_key_to_cpu(eb, &found_key, slot);
7013
- item_size = btrfs_item_size_nr(eb, slot);
7014
-
7015
- ptr = btrfs_item_ptr(eb, slot,
7016
- struct btrfs_dev_stats_item);
7017
-
7018
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7019
- if (item_size >= (1 + i) * sizeof(__le64))
7020
- btrfs_dev_stat_set(device, i,
7021
- btrfs_dev_stats_value(eb, ptr, i));
7022
- else
7023
- btrfs_dev_stat_reset(device, i);
7024
- }
7025
-
7026
- device->dev_stats_valid = 1;
7027
- btrfs_dev_stat_print_on_load(device);
7028
- btrfs_release_path(path);
7378
+ ret = btrfs_device_init_dev_stats(device, path);
7379
+ if (ret)
7380
+ goto out;
70297381 }
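+	/* Repeat for the devices of every seed filesystem */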
7382
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7383
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
7384
+ ret = btrfs_device_init_dev_stats(device, path);
7385
+ if (ret)
7386
+ goto out;
7387
+ }
7388
+ }
7389
+out:
70307390 mutex_unlock(&fs_devices->device_list_mutex);
70317391
7032
-out:
70337392 btrfs_free_path(path);
7034
- return ret < 0 ? ret : 0;
7393
+ return ret;
70357394 }
70367395
70377396 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
....@@ -7102,9 +7461,9 @@
71027461 /*
71037462  * Called from commit_transaction. Writes all changed device stats to disk.
71047463 */
7105
-int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
7106
- struct btrfs_fs_info *fs_info)
7464
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
71077465 {
7466
+ struct btrfs_fs_info *fs_info = trans->fs_info;
71087467 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
71097468 struct btrfs_device *device;
71107469 int stats_cnt;
....@@ -7187,8 +7546,8 @@
71877546 int i;
71887547
71897548 mutex_lock(&fs_devices->device_list_mutex);
7190
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid,
7191
- NULL, NULL, true);
7549
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
7550
+ true);
71927551 mutex_unlock(&fs_devices->device_list_mutex);
71937552
71947553 if (!dev) {
....@@ -7203,7 +7562,7 @@
72037562 stats->values[i] =
72047563 btrfs_dev_stat_read_and_reset(dev, i);
72057564 else
7206
- btrfs_dev_stat_reset(dev, i);
7565
+ btrfs_dev_stat_set(dev, i, 0);
72077566 }
72087567 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
72097568 current->comm, task_pid_nr(current));
....@@ -7217,101 +7576,35 @@
72177576 return 0;
72187577 }
72197578
7220
-void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
7221
-{
7222
- struct buffer_head *bh;
7223
- struct btrfs_super_block *disk_super;
7224
- int copy_num;
7225
-
7226
- if (!bdev)
7227
- return;
7228
-
7229
- for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
7230
- copy_num++) {
7231
-
7232
- if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
7233
- continue;
7234
-
7235
- disk_super = (struct btrfs_super_block *)bh->b_data;
7236
-
7237
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
7238
- set_buffer_dirty(bh);
7239
- sync_dirty_buffer(bh);
7240
- brelse(bh);
7241
- }
7242
-
7243
- /* Notify udev that device has changed */
7244
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
7245
-
7246
- /* Update ctime/mtime for device path for libblkid */
7247
- update_dev_time(device_path);
7248
-}
7249
-
72507579 /*
7251
- * Update the size of all devices, which is used for writing out the
7252
- * super blocks.
7580
+ * Update the size and bytes used for each device where it changed. This is
7581
+ * delayed since we would otherwise get errors while writing out the
7582
+ * superblocks.
7583
+ *
7584
+ * Must be invoked during transaction commit.
72537585 */
7254
-void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
7586
+void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
72557587 {
7256
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
72577588 struct btrfs_device *curr, *next;
72587589
7259
- if (list_empty(&fs_devices->resized_devices))
7590
+ ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7591
+
7592
+ if (list_empty(&trans->dev_update_list))
72607593 return;
72617594
7262
- mutex_lock(&fs_devices->device_list_mutex);
7263
- mutex_lock(&fs_info->chunk_mutex);
7264
- list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
7265
- resized_list) {
7266
- list_del_init(&curr->resized_list);
7595
+ /*
7596
+ * We don't need the device_list_mutex here. This list is owned by the
7597
+ * transaction and the transaction must complete before the device is
7598
+ * released.
7599
+ */
7600
+ mutex_lock(&trans->fs_info->chunk_mutex);
7601
+ list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7602
+ post_commit_list) {
7603
+ list_del_init(&curr->post_commit_list);
72677604 curr->commit_total_bytes = curr->disk_total_bytes;
7605
+ curr->commit_bytes_used = curr->bytes_used;
72687606 }
7269
- mutex_unlock(&fs_info->chunk_mutex);
7270
- mutex_unlock(&fs_devices->device_list_mutex);
7271
-}
7272
-
7273
-/* Must be invoked during the transaction commit */
7274
-void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
7275
-{
7276
- struct btrfs_fs_info *fs_info = trans->fs_info;
7277
- struct extent_map *em;
7278
- struct map_lookup *map;
7279
- struct btrfs_device *dev;
7280
- int i;
7281
-
7282
- if (list_empty(&trans->pending_chunks))
7283
- return;
7284
-
7285
- /* In order to kick the device replace finish process */
7286
- mutex_lock(&fs_info->chunk_mutex);
7287
- list_for_each_entry(em, &trans->pending_chunks, list) {
7288
- map = em->map_lookup;
7289
-
7290
- for (i = 0; i < map->num_stripes; i++) {
7291
- dev = map->stripes[i].dev;
7292
- dev->commit_bytes_used = dev->bytes_used;
7293
- dev->has_pending_chunks = false;
7294
- }
7295
- }
7296
- mutex_unlock(&fs_info->chunk_mutex);
7297
-}
7298
-
7299
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
7300
-{
7301
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7302
- while (fs_devices) {
7303
- fs_devices->fs_info = fs_info;
7304
- fs_devices = fs_devices->seed;
7305
- }
7306
-}
7307
-
7308
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
7309
-{
7310
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7311
- while (fs_devices) {
7312
- fs_devices->fs_info = NULL;
7313
- fs_devices = fs_devices->seed;
7314
- }
7607
+ mutex_unlock(&trans->fs_info->chunk_mutex);
73157608 }
73167609
73177610 /*
....@@ -7319,38 +7612,18 @@
73197612 */
73207613 int btrfs_bg_type_to_factor(u64 flags)
73217614 {
7322
- if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
7323
- BTRFS_BLOCK_GROUP_RAID10))
7324
- return 2;
7325
- return 1;
7615
+ const int index = btrfs_bg_flags_to_raid_index(flags);
7616
+
7617
+ return btrfs_raid_array[index].ncopies;
73267618 }
73277619
73287620
7329
-static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
7330
-{
7331
- int index = btrfs_bg_flags_to_raid_index(type);
7332
- int ncopies = btrfs_raid_array[index].ncopies;
7333
- int data_stripes;
7334
-
7335
- switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
7336
- case BTRFS_BLOCK_GROUP_RAID5:
7337
- data_stripes = num_stripes - 1;
7338
- break;
7339
- case BTRFS_BLOCK_GROUP_RAID6:
7340
- data_stripes = num_stripes - 2;
7341
- break;
7342
- default:
7343
- data_stripes = num_stripes / ncopies;
7344
- break;
7345
- }
7346
- return div_u64(chunk_len, data_stripes);
7347
-}
73487621
73497622 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
73507623 u64 chunk_offset, u64 devid,
73517624 u64 physical_offset, u64 physical_len)
73527625 {
7353
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
7626
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
73547627 struct extent_map *em;
73557628 struct map_lookup *map;
73567629 struct btrfs_device *dev;
....@@ -7414,8 +7687,11 @@
74147687
74157688 /* It's possible this device is a dummy for seed device */
74167689 if (dev->disk_total_bytes == 0) {
7417
- dev = btrfs_find_device(fs_info->fs_devices->seed, devid,
7418
- NULL, NULL, false);
7690
+ struct btrfs_fs_devices *devs;
7691
+
7692
+ devs = list_first_entry(&fs_info->fs_devices->seed_list,
7693
+ struct btrfs_fs_devices, seed_list);
7694
+ dev = btrfs_find_device(devs, devid, NULL, NULL, false);
74197695 if (!dev) {
74207696 btrfs_err(fs_info, "failed to find seed devid %llu",
74217697 devid);
....@@ -7439,13 +7715,13 @@
74397715
74407716 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
74417717 {
7442
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
7718
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
74437719 struct extent_map *em;
74447720 struct rb_node *node;
74457721 int ret = 0;
74467722
74477723 read_lock(&em_tree->lock);
7448
- for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
7724
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
74497725 em = rb_entry(node, struct extent_map, rb_node);
74507726 if (em->map_lookup->num_stripes !=
74517727 em->map_lookup->verified_stripes) {
....@@ -7551,3 +7827,27 @@
75517827 btrfs_free_path(path);
75527828 return ret;
75537829 }
7830
+
7831
+/*
7832
+ * Check whether the given block group or device is pinned by any inode being
7833
+ * used as a swapfile.
7834
+ */
7835
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7836
+{
7837
+ struct btrfs_swapfile_pin *sp;
7838
+ struct rb_node *node;
7839
+
7840
+ spin_lock(&fs_info->swapfile_pins_lock);
7841
+ node = fs_info->swapfile_pins.rb_node;
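+	/* Plain rb-tree search keyed on the pointer value */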
7842
+ while (node) {
7843
+ sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7844
+ if (ptr < sp->ptr)
7845
+ node = node->rb_left;
7846
+ else if (ptr > sp->ptr)
7847
+ node = node->rb_right;
7848
+ else
7849
+ break;
7850
+ }
7851
+ spin_unlock(&fs_info->swapfile_pins_lock);
7852
+ return node != NULL;
7853
+}