forked from ~ljy/RK356X_SDK_RELEASE

hc · 2023-12-08 · commit 01573e231f18eb2d99162747186f59511f56b64d
kernel/fs/btrfs/extent-tree.c
@@ -16,6 +16,7 @@
 #include <linux/percpu_counter.h>
 #include <linux/lockdep.h>
 #include <linux/crc32c.h>
+#include "misc.h"
 #include "tree-log.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -24,32 +25,18 @@
 #include "locking.h"
 #include "free-space-cache.h"
 #include "free-space-tree.h"
-#include "math.h"
 #include "sysfs.h"
 #include "qgroup.h"
 #include "ref-verify.h"
+#include "space-info.h"
+#include "block-rsv.h"
+#include "delalloc-space.h"
+#include "block-group.h"
+#include "discard.h"
+#include "rcu-string.h"

 #undef SCRAMBLE_DELAYED_REFS

-/*
- * control flags for do_chunk_alloc's force field
- * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
- * if we really need one.
- *
- * CHUNK_ALLOC_LIMITED means to only try and allocate one
- * if we have very few chunks already allocated. This is
- * used as part of the clustering code to help make sure
- * we have a good pool of storage to cluster in, without
- * filling the FS with empty chunks
- *
- * CHUNK_ALLOC_FORCE means it must try to allocate one
- *
- */
-enum {
- CHUNK_ALLOC_NO_FORCE = 0,
- CHUNK_ALLOC_LIMITED = 1,
- CHUNK_ALLOC_FORCE = 2,
-};

 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 struct btrfs_delayed_ref_node *node, u64 parent,
@@ -66,712 +53,33 @@
6653 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6754 struct btrfs_delayed_ref_node *node,
6855 struct btrfs_delayed_extent_op *extent_op);
69
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
70
- int force);
7156 static int find_next_key(struct btrfs_path *path, int level,
7257 struct btrfs_key *key);
73
-static void dump_space_info(struct btrfs_fs_info *fs_info,
74
- struct btrfs_space_info *info, u64 bytes,
75
- int dump_block_groups);
76
-static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
77
- u64 num_bytes);
78
-static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
79
- struct btrfs_space_info *space_info,
80
- u64 num_bytes);
81
-static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
82
- struct btrfs_space_info *space_info,
83
- u64 num_bytes);
8458
85
-static noinline int
86
-block_group_cache_done(struct btrfs_block_group_cache *cache)
87
-{
88
- smp_mb();
89
- return cache->cached == BTRFS_CACHE_FINISHED ||
90
- cache->cached == BTRFS_CACHE_ERROR;
91
-}
92
-
93
-static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
59
+static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
9460 {
9561 return (cache->flags & bits) == bits;
9662 }
9763
98
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
99
-{
100
- atomic_inc(&cache->count);
101
-}
102
-
103
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104
-{
105
- if (atomic_dec_and_test(&cache->count)) {
106
- WARN_ON(cache->pinned > 0);
107
- WARN_ON(cache->reserved > 0);
108
-
109
- /*
110
- * If not empty, someone is still holding mutex of
111
- * full_stripe_lock, which can only be released by caller.
112
- * And it will definitely cause use-after-free when caller
113
- * tries to release full stripe lock.
114
- *
115
- * No better way to resolve, but only to warn.
116
- */
117
- WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
118
- kfree(cache->free_space_ctl);
119
- kfree(cache);
120
- }
121
-}
122
-
123
-/*
124
- * this adds the block group to the fs_info rb tree for the block group
125
- * cache
126
- */
127
-static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
128
- struct btrfs_block_group_cache *block_group)
129
-{
130
- struct rb_node **p;
131
- struct rb_node *parent = NULL;
132
- struct btrfs_block_group_cache *cache;
133
-
134
- spin_lock(&info->block_group_cache_lock);
135
- p = &info->block_group_cache_tree.rb_node;
136
-
137
- while (*p) {
138
- parent = *p;
139
- cache = rb_entry(parent, struct btrfs_block_group_cache,
140
- cache_node);
141
- if (block_group->key.objectid < cache->key.objectid) {
142
- p = &(*p)->rb_left;
143
- } else if (block_group->key.objectid > cache->key.objectid) {
144
- p = &(*p)->rb_right;
145
- } else {
146
- spin_unlock(&info->block_group_cache_lock);
147
- return -EEXIST;
148
- }
149
- }
150
-
151
- rb_link_node(&block_group->cache_node, parent, p);
152
- rb_insert_color(&block_group->cache_node,
153
- &info->block_group_cache_tree);
154
-
155
- if (info->first_logical_byte > block_group->key.objectid)
156
- info->first_logical_byte = block_group->key.objectid;
157
-
158
- spin_unlock(&info->block_group_cache_lock);
159
-
160
- return 0;
161
-}
162
-
163
-/*
164
- * This will return the block group at or after bytenr if contains is 0, else
165
- * it will return the block group that contains the bytenr
166
- */
167
-static struct btrfs_block_group_cache *
168
-block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
169
- int contains)
170
-{
171
- struct btrfs_block_group_cache *cache, *ret = NULL;
172
- struct rb_node *n;
173
- u64 end, start;
174
-
175
- spin_lock(&info->block_group_cache_lock);
176
- n = info->block_group_cache_tree.rb_node;
177
-
178
- while (n) {
179
- cache = rb_entry(n, struct btrfs_block_group_cache,
180
- cache_node);
181
- end = cache->key.objectid + cache->key.offset - 1;
182
- start = cache->key.objectid;
183
-
184
- if (bytenr < start) {
185
- if (!contains && (!ret || start < ret->key.objectid))
186
- ret = cache;
187
- n = n->rb_left;
188
- } else if (bytenr > start) {
189
- if (contains && bytenr <= end) {
190
- ret = cache;
191
- break;
192
- }
193
- n = n->rb_right;
194
- } else {
195
- ret = cache;
196
- break;
197
- }
198
- }
199
- if (ret) {
200
- btrfs_get_block_group(ret);
201
- if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
202
- info->first_logical_byte = ret->key.objectid;
203
- }
204
- spin_unlock(&info->block_group_cache_lock);
205
-
206
- return ret;
207
-}
208
-
209
-static int add_excluded_extent(struct btrfs_fs_info *fs_info,
210
- u64 start, u64 num_bytes)
64
+int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
65
+ u64 start, u64 num_bytes)
21166 {
21267 u64 end = start + num_bytes - 1;
213
- set_extent_bits(&fs_info->freed_extents[0],
214
- start, end, EXTENT_UPTODATE);
215
- set_extent_bits(&fs_info->freed_extents[1],
216
- start, end, EXTENT_UPTODATE);
68
+ set_extent_bits(&fs_info->excluded_extents, start, end,
69
+ EXTENT_UPTODATE);
21770 return 0;
21871 }
21972
220
-static void free_excluded_extents(struct btrfs_block_group_cache *cache)
73
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
22174 {
22275 struct btrfs_fs_info *fs_info = cache->fs_info;
22376 u64 start, end;
22477
225
- start = cache->key.objectid;
226
- end = start + cache->key.offset - 1;
78
+ start = cache->start;
79
+ end = start + cache->length - 1;
22780
228
- clear_extent_bits(&fs_info->freed_extents[0],
229
- start, end, EXTENT_UPTODATE);
230
- clear_extent_bits(&fs_info->freed_extents[1],
231
- start, end, EXTENT_UPTODATE);
232
-}
233
-
234
-static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
235
-{
236
- struct btrfs_fs_info *fs_info = cache->fs_info;
237
- u64 bytenr;
238
- u64 *logical;
239
- int stripe_len;
240
- int i, nr, ret;
241
-
242
- if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
243
- stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
244
- cache->bytes_super += stripe_len;
245
- ret = add_excluded_extent(fs_info, cache->key.objectid,
246
- stripe_len);
247
- if (ret)
248
- return ret;
249
- }
250
-
251
- for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
252
- bytenr = btrfs_sb_offset(i);
253
- ret = btrfs_rmap_block(fs_info, cache->key.objectid,
254
- bytenr, &logical, &nr, &stripe_len);
255
- if (ret)
256
- return ret;
257
-
258
- while (nr--) {
259
- u64 start, len;
260
-
261
- if (logical[nr] > cache->key.objectid +
262
- cache->key.offset)
263
- continue;
264
-
265
- if (logical[nr] + stripe_len <= cache->key.objectid)
266
- continue;
267
-
268
- start = logical[nr];
269
- if (start < cache->key.objectid) {
270
- start = cache->key.objectid;
271
- len = (logical[nr] + stripe_len) - start;
272
- } else {
273
- len = min_t(u64, stripe_len,
274
- cache->key.objectid +
275
- cache->key.offset - start);
276
- }
277
-
278
- cache->bytes_super += len;
279
- ret = add_excluded_extent(fs_info, start, len);
280
- if (ret) {
281
- kfree(logical);
282
- return ret;
283
- }
284
- }
285
-
286
- kfree(logical);
287
- }
288
- return 0;
289
-}
290
-
291
-static struct btrfs_caching_control *
292
-get_caching_control(struct btrfs_block_group_cache *cache)
293
-{
294
- struct btrfs_caching_control *ctl;
295
-
296
- spin_lock(&cache->lock);
297
- if (!cache->caching_ctl) {
298
- spin_unlock(&cache->lock);
299
- return NULL;
300
- }
301
-
302
- ctl = cache->caching_ctl;
303
- refcount_inc(&ctl->count);
304
- spin_unlock(&cache->lock);
305
- return ctl;
306
-}
307
-
308
-static void put_caching_control(struct btrfs_caching_control *ctl)
309
-{
310
- if (refcount_dec_and_test(&ctl->count))
311
- kfree(ctl);
312
-}
313
-
314
-#ifdef CONFIG_BTRFS_DEBUG
315
-static void fragment_free_space(struct btrfs_block_group_cache *block_group)
316
-{
317
- struct btrfs_fs_info *fs_info = block_group->fs_info;
318
- u64 start = block_group->key.objectid;
319
- u64 len = block_group->key.offset;
320
- u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
321
- fs_info->nodesize : fs_info->sectorsize;
322
- u64 step = chunk << 1;
323
-
324
- while (len > chunk) {
325
- btrfs_remove_free_space(block_group, start, chunk);
326
- start += step;
327
- if (len < step)
328
- len = 0;
329
- else
330
- len -= step;
331
- }
332
-}
333
-#endif
334
-
335
-/*
336
- * this is only called by cache_block_group, since we could have freed extents
337
- * we need to check the pinned_extents for any extents that can't be used yet
338
- * since their free space will be released as soon as the transaction commits.
339
- */
340
-u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
341
- u64 start, u64 end)
342
-{
343
- struct btrfs_fs_info *info = block_group->fs_info;
344
- u64 extent_start, extent_end, size, total_added = 0;
345
- int ret;
346
-
347
- while (start < end) {
348
- ret = find_first_extent_bit(info->pinned_extents, start,
349
- &extent_start, &extent_end,
350
- EXTENT_DIRTY | EXTENT_UPTODATE,
351
- NULL);
352
- if (ret)
353
- break;
354
-
355
- if (extent_start <= start) {
356
- start = extent_end + 1;
357
- } else if (extent_start > start && extent_start < end) {
358
- size = extent_start - start;
359
- total_added += size;
360
- ret = btrfs_add_free_space(block_group, start,
361
- size);
362
- BUG_ON(ret); /* -ENOMEM or logic error */
363
- start = extent_end + 1;
364
- } else {
365
- break;
366
- }
367
- }
368
-
369
- if (start < end) {
370
- size = end - start;
371
- total_added += size;
372
- ret = btrfs_add_free_space(block_group, start, size);
373
- BUG_ON(ret); /* -ENOMEM or logic error */
374
- }
375
-
376
- return total_added;
377
-}
378
-
379
-static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
380
-{
381
- struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
382
- struct btrfs_fs_info *fs_info = block_group->fs_info;
383
- struct btrfs_root *extent_root = fs_info->extent_root;
384
- struct btrfs_path *path;
385
- struct extent_buffer *leaf;
386
- struct btrfs_key key;
387
- u64 total_found = 0;
388
- u64 last = 0;
389
- u32 nritems;
390
- int ret;
391
- bool wakeup = true;
392
-
393
- path = btrfs_alloc_path();
394
- if (!path)
395
- return -ENOMEM;
396
-
397
- last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
398
-
399
-#ifdef CONFIG_BTRFS_DEBUG
400
- /*
401
- * If we're fragmenting we don't want to make anybody think we can
402
- * allocate from this block group until we've had a chance to fragment
403
- * the free space.
404
- */
405
- if (btrfs_should_fragment_free_space(block_group))
406
- wakeup = false;
407
-#endif
408
- /*
409
- * We don't want to deadlock with somebody trying to allocate a new
410
- * extent for the extent root while also trying to search the extent
411
- * root to add free space. So we skip locking and search the commit
412
- * root, since its read-only
413
- */
414
- path->skip_locking = 1;
415
- path->search_commit_root = 1;
416
- path->reada = READA_FORWARD;
417
-
418
- key.objectid = last;
419
- key.offset = 0;
420
- key.type = BTRFS_EXTENT_ITEM_KEY;
421
-
422
-next:
423
- ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
424
- if (ret < 0)
425
- goto out;
426
-
427
- leaf = path->nodes[0];
428
- nritems = btrfs_header_nritems(leaf);
429
-
430
- while (1) {
431
- if (btrfs_fs_closing(fs_info) > 1) {
432
- last = (u64)-1;
433
- break;
434
- }
435
-
436
- if (path->slots[0] < nritems) {
437
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
438
- } else {
439
- ret = find_next_key(path, 0, &key);
440
- if (ret)
441
- break;
442
-
443
- if (need_resched() ||
444
- rwsem_is_contended(&fs_info->commit_root_sem)) {
445
- if (wakeup)
446
- caching_ctl->progress = last;
447
- btrfs_release_path(path);
448
- up_read(&fs_info->commit_root_sem);
449
- mutex_unlock(&caching_ctl->mutex);
450
- cond_resched();
451
- mutex_lock(&caching_ctl->mutex);
452
- down_read(&fs_info->commit_root_sem);
453
- goto next;
454
- }
455
-
456
- ret = btrfs_next_leaf(extent_root, path);
457
- if (ret < 0)
458
- goto out;
459
- if (ret)
460
- break;
461
- leaf = path->nodes[0];
462
- nritems = btrfs_header_nritems(leaf);
463
- continue;
464
- }
465
-
466
- if (key.objectid < last) {
467
- key.objectid = last;
468
- key.offset = 0;
469
- key.type = BTRFS_EXTENT_ITEM_KEY;
470
-
471
- if (wakeup)
472
- caching_ctl->progress = last;
473
- btrfs_release_path(path);
474
- goto next;
475
- }
476
-
477
- if (key.objectid < block_group->key.objectid) {
478
- path->slots[0]++;
479
- continue;
480
- }
481
-
482
- if (key.objectid >= block_group->key.objectid +
483
- block_group->key.offset)
484
- break;
485
-
486
- if (key.type == BTRFS_EXTENT_ITEM_KEY ||
487
- key.type == BTRFS_METADATA_ITEM_KEY) {
488
- total_found += add_new_free_space(block_group, last,
489
- key.objectid);
490
- if (key.type == BTRFS_METADATA_ITEM_KEY)
491
- last = key.objectid +
492
- fs_info->nodesize;
493
- else
494
- last = key.objectid + key.offset;
495
-
496
- if (total_found > CACHING_CTL_WAKE_UP) {
497
- total_found = 0;
498
- if (wakeup)
499
- wake_up(&caching_ctl->wait);
500
- }
501
- }
502
- path->slots[0]++;
503
- }
504
- ret = 0;
505
-
506
- total_found += add_new_free_space(block_group, last,
507
- block_group->key.objectid +
508
- block_group->key.offset);
509
- caching_ctl->progress = (u64)-1;
510
-
511
-out:
512
- btrfs_free_path(path);
513
- return ret;
514
-}
515
-
516
-static noinline void caching_thread(struct btrfs_work *work)
517
-{
518
- struct btrfs_block_group_cache *block_group;
519
- struct btrfs_fs_info *fs_info;
520
- struct btrfs_caching_control *caching_ctl;
521
- int ret;
522
-
523
- caching_ctl = container_of(work, struct btrfs_caching_control, work);
524
- block_group = caching_ctl->block_group;
525
- fs_info = block_group->fs_info;
526
-
527
- mutex_lock(&caching_ctl->mutex);
528
- down_read(&fs_info->commit_root_sem);
529
-
530
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
531
- ret = load_free_space_tree(caching_ctl);
532
- else
533
- ret = load_extent_tree_free(caching_ctl);
534
-
535
- spin_lock(&block_group->lock);
536
- block_group->caching_ctl = NULL;
537
- block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
538
- spin_unlock(&block_group->lock);
539
-
540
-#ifdef CONFIG_BTRFS_DEBUG
541
- if (btrfs_should_fragment_free_space(block_group)) {
542
- u64 bytes_used;
543
-
544
- spin_lock(&block_group->space_info->lock);
545
- spin_lock(&block_group->lock);
546
- bytes_used = block_group->key.offset -
547
- btrfs_block_group_used(&block_group->item);
548
- block_group->space_info->bytes_used += bytes_used >> 1;
549
- spin_unlock(&block_group->lock);
550
- spin_unlock(&block_group->space_info->lock);
551
- fragment_free_space(block_group);
552
- }
553
-#endif
554
-
555
- caching_ctl->progress = (u64)-1;
556
-
557
- up_read(&fs_info->commit_root_sem);
558
- free_excluded_extents(block_group);
559
- mutex_unlock(&caching_ctl->mutex);
560
-
561
- wake_up(&caching_ctl->wait);
562
-
563
- put_caching_control(caching_ctl);
564
- btrfs_put_block_group(block_group);
565
-}
566
-
567
-static int cache_block_group(struct btrfs_block_group_cache *cache,
568
- int load_cache_only)
569
-{
570
- DEFINE_WAIT(wait);
571
- struct btrfs_fs_info *fs_info = cache->fs_info;
572
- struct btrfs_caching_control *caching_ctl;
573
- int ret = 0;
574
-
575
- caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
576
- if (!caching_ctl)
577
- return -ENOMEM;
578
-
579
- INIT_LIST_HEAD(&caching_ctl->list);
580
- mutex_init(&caching_ctl->mutex);
581
- init_waitqueue_head(&caching_ctl->wait);
582
- caching_ctl->block_group = cache;
583
- caching_ctl->progress = cache->key.objectid;
584
- refcount_set(&caching_ctl->count, 1);
585
- btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
586
- caching_thread, NULL, NULL);
587
-
588
- spin_lock(&cache->lock);
589
- /*
590
- * This should be a rare occasion, but this could happen I think in the
591
- * case where one thread starts to load the space cache info, and then
592
- * some other thread starts a transaction commit which tries to do an
593
- * allocation while the other thread is still loading the space cache
594
- * info. The previous loop should have kept us from choosing this block
595
- * group, but if we've moved to the state where we will wait on caching
596
- * block groups we need to first check if we're doing a fast load here,
597
- * so we can wait for it to finish, otherwise we could end up allocating
598
- * from a block group who's cache gets evicted for one reason or
599
- * another.
600
- */
601
- while (cache->cached == BTRFS_CACHE_FAST) {
602
- struct btrfs_caching_control *ctl;
603
-
604
- ctl = cache->caching_ctl;
605
- refcount_inc(&ctl->count);
606
- prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
607
- spin_unlock(&cache->lock);
608
-
609
- schedule();
610
-
611
- finish_wait(&ctl->wait, &wait);
612
- put_caching_control(ctl);
613
- spin_lock(&cache->lock);
614
- }
615
-
616
- if (cache->cached != BTRFS_CACHE_NO) {
617
- spin_unlock(&cache->lock);
618
- kfree(caching_ctl);
619
- return 0;
620
- }
621
- WARN_ON(cache->caching_ctl);
622
- cache->caching_ctl = caching_ctl;
623
- cache->cached = BTRFS_CACHE_FAST;
624
- spin_unlock(&cache->lock);
625
-
626
- if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
627
- mutex_lock(&caching_ctl->mutex);
628
- ret = load_free_space_cache(fs_info, cache);
629
-
630
- spin_lock(&cache->lock);
631
- if (ret == 1) {
632
- cache->caching_ctl = NULL;
633
- cache->cached = BTRFS_CACHE_FINISHED;
634
- cache->last_byte_to_unpin = (u64)-1;
635
- caching_ctl->progress = (u64)-1;
636
- } else {
637
- if (load_cache_only) {
638
- cache->caching_ctl = NULL;
639
- cache->cached = BTRFS_CACHE_NO;
640
- } else {
641
- cache->cached = BTRFS_CACHE_STARTED;
642
- cache->has_caching_ctl = 1;
643
- }
644
- }
645
- spin_unlock(&cache->lock);
646
-#ifdef CONFIG_BTRFS_DEBUG
647
- if (ret == 1 &&
648
- btrfs_should_fragment_free_space(cache)) {
649
- u64 bytes_used;
650
-
651
- spin_lock(&cache->space_info->lock);
652
- spin_lock(&cache->lock);
653
- bytes_used = cache->key.offset -
654
- btrfs_block_group_used(&cache->item);
655
- cache->space_info->bytes_used += bytes_used >> 1;
656
- spin_unlock(&cache->lock);
657
- spin_unlock(&cache->space_info->lock);
658
- fragment_free_space(cache);
659
- }
660
-#endif
661
- mutex_unlock(&caching_ctl->mutex);
662
-
663
- wake_up(&caching_ctl->wait);
664
- if (ret == 1) {
665
- put_caching_control(caching_ctl);
666
- free_excluded_extents(cache);
667
- return 0;
668
- }
669
- } else {
670
- /*
671
- * We're either using the free space tree or no caching at all.
672
- * Set cached to the appropriate value and wakeup any waiters.
673
- */
674
- spin_lock(&cache->lock);
675
- if (load_cache_only) {
676
- cache->caching_ctl = NULL;
677
- cache->cached = BTRFS_CACHE_NO;
678
- } else {
679
- cache->cached = BTRFS_CACHE_STARTED;
680
- cache->has_caching_ctl = 1;
681
- }
682
- spin_unlock(&cache->lock);
683
- wake_up(&caching_ctl->wait);
684
- }
685
-
686
- if (load_cache_only) {
687
- put_caching_control(caching_ctl);
688
- return 0;
689
- }
690
-
691
- down_write(&fs_info->commit_root_sem);
692
- refcount_inc(&caching_ctl->count);
693
- list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
694
- up_write(&fs_info->commit_root_sem);
695
-
696
- btrfs_get_block_group(cache);
697
-
698
- btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
699
-
700
- return ret;
701
-}
702
-
703
-/*
704
- * return the block group that starts at or after bytenr
705
- */
706
-static struct btrfs_block_group_cache *
707
-btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
708
-{
709
- return block_group_cache_tree_search(info, bytenr, 0);
710
-}
711
-
712
-/*
713
- * return the block group that contains the given bytenr
714
- */
715
-struct btrfs_block_group_cache *btrfs_lookup_block_group(
716
- struct btrfs_fs_info *info,
717
- u64 bytenr)
718
-{
719
- return block_group_cache_tree_search(info, bytenr, 1);
720
-}
721
-
722
-static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
723
- u64 flags)
724
-{
725
- struct list_head *head = &info->space_info;
726
- struct btrfs_space_info *found;
727
-
728
- flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
729
-
730
- rcu_read_lock();
731
- list_for_each_entry_rcu(found, head, list) {
732
- if (found->flags & flags) {
733
- rcu_read_unlock();
734
- return found;
735
- }
736
- }
737
- rcu_read_unlock();
738
- return NULL;
739
-}
740
-
741
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
742
- bool metadata, u64 root_objectid)
743
-{
744
- struct btrfs_space_info *space_info;
745
- u64 flags;
746
-
747
- if (metadata) {
748
- if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
749
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
750
- else
751
- flags = BTRFS_BLOCK_GROUP_METADATA;
752
- } else {
753
- flags = BTRFS_BLOCK_GROUP_DATA;
754
- }
755
-
756
- space_info = __find_space_info(fs_info, flags);
757
- ASSERT(space_info);
758
- percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
759
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
760
-}
761
-
762
-/*
763
- * after adding space to the filesystem, we need to clear the full flags
764
- * on all the space infos.
765
- */
766
-void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
767
-{
768
- struct list_head *head = &info->space_info;
769
- struct btrfs_space_info *found;
770
-
771
- rcu_read_lock();
772
- list_for_each_entry_rcu(found, head, list)
773
- found->full = 0;
774
- rcu_read_unlock();
81
+ clear_extent_bits(&fs_info->excluded_extents, start, end,
82
+ EXTENT_UPTODATE);
77583 }
77684
77785 /* simple helper to search for an existing data extent at a given offset */
@@ -1037,7 +345,7 @@
1037345
1038346 /*
1039347 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1040
- * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
348
+ * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
1041349 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1042350 */
1043351 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
@@ -1092,18 +400,18 @@
1092400 return BTRFS_REF_TYPE_INVALID;
1093401 }
1094402
1095
-static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
403
+u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1096404 {
1097405 u32 high_crc = ~(u32)0;
1098406 u32 low_crc = ~(u32)0;
1099407 __le64 lenum;
1100408
1101409 lenum = cpu_to_le64(root_objectid);
1102
- high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
410
+ high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1103411 lenum = cpu_to_le64(owner);
1104
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
412
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1105413 lenum = cpu_to_le64(offset);
1106
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
414
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1107415
1108416 return ((u64)high_crc << 31) ^ (u64)low_crc;
1109417 }
@@ -1685,7 +993,7 @@
1685993 type = extent_ref_type(parent, owner);
1686994 size = btrfs_extent_inline_ref_size(type);
1687995
1688
- btrfs_extend_item(fs_info, path, size);
996
+ btrfs_extend_item(path, size);
1689997
1690998 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1691999 refs = btrfs_extent_refs(leaf, ei);
@@ -1760,7 +1068,6 @@
17601068 int *last_ref)
17611069 {
17621070 struct extent_buffer *leaf = path->nodes[0];
1763
- struct btrfs_fs_info *fs_info = leaf->fs_info;
17641071 struct btrfs_extent_item *ei;
17651072 struct btrfs_extent_data_ref *dref = NULL;
17661073 struct btrfs_shared_data_ref *sref = NULL;
@@ -1815,7 +1122,7 @@
18151122 memmove_extent_buffer(leaf, ptr, ptr + size,
18161123 end - ptr - size);
18171124 item_size -= size;
1818
- btrfs_truncate_item(fs_info, path, item_size, 1);
1125
+ btrfs_truncate_item(path, item_size, 1);
18191126 }
18201127 btrfs_mark_buffer_dirty(leaf);
18211128 }
@@ -1835,7 +1142,22 @@
18351142 num_bytes, parent, root_objectid,
18361143 owner, offset, 1);
18371144 if (ret == 0) {
1838
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1145
+ /*
1146
+ * We're adding refs to a tree block we already own, this
1147
+ * should not happen at all.
1148
+ */
1149
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1150
+ btrfs_crit(trans->fs_info,
1151
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
1152
+ bytenr, num_bytes, root_objectid);
1153
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
1154
+ WARN_ON(1);
1155
+ btrfs_crit(trans->fs_info,
1156
+ "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
1157
+ btrfs_print_leaf(path->nodes[0]);
1158
+ }
1159
+ return -EUCLEAN;
1160
+ }
18391161 update_inline_extent_backref(path, iref, refs_to_add,
18401162 extent_op, NULL);
18411163 } else if (ret == -ENOENT) {
@@ -1843,24 +1165,6 @@
18431165 root_objectid, owner, offset,
18441166 refs_to_add, extent_op);
18451167 ret = 0;
1846
- }
1847
- return ret;
1848
-}
1849
-
1850
-static int insert_extent_backref(struct btrfs_trans_handle *trans,
1851
- struct btrfs_path *path,
1852
- u64 bytenr, u64 parent, u64 root_objectid,
1853
- u64 owner, u64 offset, int refs_to_add)
1854
-{
1855
- int ret;
1856
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1857
- BUG_ON(refs_to_add != 1);
1858
- ret = insert_tree_block_ref(trans, path, bytenr, parent,
1859
- root_objectid);
1860
- } else {
1861
- ret = insert_extent_data_ref(trans, path, bytenr, parent,
1862
- root_objectid, owner, offset,
1863
- refs_to_add);
18641168 }
18651169 return ret;
18661170 }
@@ -1886,7 +1190,6 @@
18861190 return ret;
18871191 }
18881192
1889
-#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
18901193 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
18911194 u64 *discarded_bytes)
18921195 {
@@ -1962,8 +1265,10 @@
19621265 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
19631266 u64 num_bytes, u64 *actual_bytes)
19641267 {
1965
- int ret;
1268
+ int ret = 0;
19661269 u64 discarded_bytes = 0;
1270
+ u64 end = bytenr + num_bytes;
1271
+ u64 cur = bytenr;
19671272 struct btrfs_bio *bbio = NULL;
19681273
19691274
@@ -1972,15 +1277,23 @@
19721277 * associated to its stripes that don't go away while we are discarding.
19731278 */
19741279 btrfs_bio_counter_inc_blocked(fs_info);
1975
- /* Tell the block device(s) that the sectors can be discarded */
1976
- ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1977
- &bbio, 0);
1978
- /* Error condition is -ENOMEM */
1979
- if (!ret) {
1980
- struct btrfs_bio_stripe *stripe = bbio->stripes;
1280
+ while (cur < end) {
1281
+ struct btrfs_bio_stripe *stripe;
19811282 int i;
19821283
1284
+ num_bytes = end - cur;
1285
+ /* Tell the block device(s) that the sectors can be discarded */
1286
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
1287
+ &num_bytes, &bbio, 0);
1288
+ /*
1289
+ * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
1290
+ * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
1291
+ * thus we can't continue anyway.
1292
+ */
1293
+ if (ret < 0)
1294
+ goto out;
19831295
1296
+ stripe = bbio->stripes;
19841297 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
19851298 u64 bytes;
19861299 struct request_queue *req_q;
@@ -2001,10 +1314,19 @@
20011314 stripe->physical,
20021315 stripe->length,
20031316 &bytes);
2004
- if (!ret)
1317
+ if (!ret) {
20051318 discarded_bytes += bytes;
2006
- else if (ret != -EOPNOTSUPP)
2007
- break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1319
+ } else if (ret != -EOPNOTSUPP) {
1320
+ /*
1321
+ * Logic errors or -ENOMEM, or -EIO, but
1322
+ * unlikely to happen.
1323
+ *
1324
+ * And since there are two loops, explicitly
1325
+ * go to out to avoid confusion.
1326
+ */
1327
+ btrfs_put_bbio(bbio);
1328
+ goto out;
1329
+ }
20081330
20091331 /*
20101332 * Just in case we get back EOPNOTSUPP for some reason,
@@ -2014,7 +1336,9 @@
20141336 ret = 0;
20151337 }
20161338 btrfs_put_bbio(bbio);
1339
+ cur += num_bytes;
20171340 }
1341
+out:
20181342 btrfs_bio_counter_dec(fs_info);
20191343
20201344 if (actual_bytes)
@@ -2028,45 +1352,31 @@
20281352
20291353 /* Can return -ENOMEM */
20301354 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2031
- struct btrfs_root *root,
2032
- u64 bytenr, u64 num_bytes, u64 parent,
2033
- u64 root_objectid, u64 owner, u64 offset)
1355
+ struct btrfs_ref *generic_ref)
20341356 {
2035
- struct btrfs_fs_info *fs_info = root->fs_info;
2036
- int old_ref_mod, new_ref_mod;
1357
+ struct btrfs_fs_info *fs_info = trans->fs_info;
20371358 int ret;
20381359
2039
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2040
- root_objectid == BTRFS_TREE_LOG_OBJECTID);
1360
+ ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1361
+ generic_ref->action);
1362
+ BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1363
+ generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
20411364
2042
- btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2043
- owner, offset, BTRFS_ADD_DELAYED_REF);
1365
+ if (generic_ref->type == BTRFS_REF_METADATA)
1366
+ ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1367
+ else
1368
+ ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
20441369
2045
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2046
- ret = btrfs_add_delayed_tree_ref(trans, bytenr,
2047
- num_bytes, parent,
2048
- root_objectid, (int)owner,
2049
- BTRFS_ADD_DELAYED_REF, NULL,
2050
- &old_ref_mod, &new_ref_mod);
2051
- } else {
2052
- ret = btrfs_add_delayed_data_ref(trans, bytenr,
2053
- num_bytes, parent,
2054
- root_objectid, owner, offset,
2055
- 0, BTRFS_ADD_DELAYED_REF,
2056
- &old_ref_mod, &new_ref_mod);
2057
- }
2058
-
2059
- if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
2060
- bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
2061
-
2062
- add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
2063
- }
1370
+ btrfs_ref_tree_mod(fs_info, generic_ref);
20641371
20651372 return ret;
20661373 }
20671374
20681375 /*
20691376 * __btrfs_inc_extent_ref - insert backreference for a given extent
1377
+ *
1378
+ * The counterpart is in __btrfs_free_extent(), with examples and more details
1379
+ * how it works.
20701380 *
20711381 * @trans: Handle of transaction
20721382 *
@@ -2118,7 +1428,6 @@
21181428 if (!path)
21191429 return -ENOMEM;
21201430
2121
- path->reada = READA_FORWARD;
21221431 path->leave_spinning = 1;
21231432 /* this will setup the path even if it fails to insert the back ref */
21241433 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
@@ -2143,11 +1452,17 @@
21431452 btrfs_mark_buffer_dirty(leaf);
21441453 btrfs_release_path(path);
21451454
2146
- path->reada = READA_FORWARD;
21471455 path->leave_spinning = 1;
21481456 /* now insert the actual backref */
2149
- ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2150
- owner, offset, refs_to_add);
1457
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1458
+ BUG_ON(refs_to_add != 1);
1459
+ ret = insert_tree_block_ref(trans, path, bytenr, parent,
1460
+ root_objectid);
1461
+ } else {
1462
+ ret = insert_extent_data_ref(trans, path, bytenr, parent,
1463
+ root_objectid, owner, offset,
1464
+ refs_to_add);
1465
+ }
21511466 if (ret)
21521467 btrfs_abort_transaction(trans, ret);
21531468 out:
@@ -2232,7 +1547,7 @@
22321547 int err = 0;
22331548 int metadata = !extent_op->is_data;
22341549
2235
- if (trans->aborted)
1550
+ if (TRANS_ABORTED(trans))
22361551 return 0;
22371552
22381553 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
@@ -2253,7 +1568,6 @@
22531568 }
22541569
22551570 again:
2256
- path->reada = READA_FORWARD;
22571571 path->leave_spinning = 1;
22581572 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
22591573 if (ret < 0) {
@@ -2352,10 +1666,9 @@
23521666 {
23531667 int ret = 0;
23541668
2355
- if (trans->aborted) {
1669
+ if (TRANS_ABORTED(trans)) {
23561670 if (insert_reserved)
2357
- btrfs_pin_extent(trans->fs_info, node->bytenr,
2358
- node->num_bytes, 1);
1671
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
23591672 return 0;
23601673 }
23611674
....@@ -2370,8 +1683,7 @@
23701683 else
23711684 BUG();
23721685 if (ret && insert_reserved)
2373
- btrfs_pin_extent(trans->fs_info, node->bytenr,
2374
- node->num_bytes, 1);
1686
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
23751687 return ret;
23761688 }
23771689
@@ -2380,7 +1692,7 @@
23801692 {
23811693 struct btrfs_delayed_ref_node *ref;
23821694
2383
- if (RB_EMPTY_ROOT(&head->ref_tree))
1695
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
23841696 return NULL;
23851697
23861698 /*
@@ -2393,7 +1705,7 @@
23931705 return list_first_entry(&head->ref_add_list,
23941706 struct btrfs_delayed_ref_node, add_list);
23951707
2396
- ref = rb_entry(rb_first(&head->ref_tree),
1708
+ ref = rb_entry(rb_first_cached(&head->ref_tree),
23971709 struct btrfs_delayed_ref_node, ref_node);
23981710 ASSERT(list_empty(&ref->add_list));
23991711 return ref;
@@ -2409,23 +1721,69 @@
24091721 btrfs_delayed_ref_unlock(head);
24101722 }
24111723
2412
-static int cleanup_extent_op(struct btrfs_trans_handle *trans,
2413
- struct btrfs_delayed_ref_head *head)
1724
+static struct btrfs_delayed_extent_op *cleanup_extent_op(
1725
+ struct btrfs_delayed_ref_head *head)
24141726 {
24151727 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1728
+
1729
+ if (!extent_op)
1730
+ return NULL;
1731
+
1732
+ if (head->must_insert_reserved) {
1733
+ head->extent_op = NULL;
1734
+ btrfs_free_delayed_extent_op(extent_op);
1735
+ return NULL;
1736
+ }
1737
+ return extent_op;
1738
+}
1739
+
1740
+static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1741
+ struct btrfs_delayed_ref_head *head)
1742
+{
1743
+ struct btrfs_delayed_extent_op *extent_op;
24161744 int ret;
24171745
1746
+ extent_op = cleanup_extent_op(head);
24181747 if (!extent_op)
24191748 return 0;
24201749 head->extent_op = NULL;
2421
- if (head->must_insert_reserved) {
2422
- btrfs_free_delayed_extent_op(extent_op);
2423
- return 0;
2424
- }
24251750 spin_unlock(&head->lock);
24261751 ret = run_delayed_extent_op(trans, head, extent_op);
24271752 btrfs_free_delayed_extent_op(extent_op);
24281753 return ret ? ret : 1;
1754
+}
1755
+
1756
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1757
+ struct btrfs_delayed_ref_root *delayed_refs,
1758
+ struct btrfs_delayed_ref_head *head)
1759
+{
1760
+ int nr_items = 1; /* Dropping this ref head update. */
1761
+
1762
+ /*
1763
+ * We had csum deletions accounted for in our delayed refs rsv, we need
1764
+ * to drop the csum leaves for this update from our delayed_refs_rsv.
1765
+ */
1766
+ if (head->total_ref_mod < 0 && head->is_data) {
1767
+ spin_lock(&delayed_refs->lock);
1768
+ delayed_refs->pending_csums -= head->num_bytes;
1769
+ spin_unlock(&delayed_refs->lock);
1770
+ nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1771
+ }
1772
+
1773
+ /*
1774
+ * We were dropping refs, or had a new ref and dropped it, and thus must
1775
+ * adjust down our total_bytes_pinned, the space may or may not have
1776
+ * been pinned and so is accounted for properly in the pinned space by
1777
+ * now.
1778
+ */
1779
+ if (head->total_ref_mod < 0 ||
1780
+ (head->total_ref_mod == 0 && head->must_insert_reserved)) {
1781
+ u64 flags = btrfs_ref_head_to_space_flags(head);
1782
+
1783
+ btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
1784
+ }
1785
+
1786
+ btrfs_delayed_refs_rsv_release(fs_info, nr_items);
24291787 }
24301788
24311789 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
@@ -2438,7 +1796,7 @@
24381796
24391797 delayed_refs = &trans->transaction->delayed_refs;
24401798
2441
- ret = cleanup_extent_op(trans, head);
1799
+ ret = run_and_cleanup_extent_op(trans, head);
24421800 if (ret < 0) {
24431801 unselect_delayed_ref_head(delayed_refs, head);
24441802 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
@@ -2454,156 +1812,91 @@
24541812 spin_unlock(&head->lock);
24551813 spin_lock(&delayed_refs->lock);
24561814 spin_lock(&head->lock);
2457
- if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
1815
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
24581816 spin_unlock(&head->lock);
24591817 spin_unlock(&delayed_refs->lock);
24601818 return 1;
24611819 }
2462
- delayed_refs->num_heads--;
2463
- rb_erase(&head->href_node, &delayed_refs->href_root);
2464
- RB_CLEAR_NODE(&head->href_node);
1820
+ btrfs_delete_ref_head(delayed_refs, head);
24651821 spin_unlock(&head->lock);
24661822 spin_unlock(&delayed_refs->lock);
2467
- atomic_dec(&delayed_refs->num_entries);
2468
-
2469
- trace_run_delayed_ref_head(fs_info, head, 0);
2470
-
2471
- if (head->total_ref_mod < 0) {
2472
- struct btrfs_space_info *space_info;
2473
- u64 flags;
2474
-
2475
- if (head->is_data)
2476
- flags = BTRFS_BLOCK_GROUP_DATA;
2477
- else if (head->is_system)
2478
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
2479
- else
2480
- flags = BTRFS_BLOCK_GROUP_METADATA;
2481
- space_info = __find_space_info(fs_info, flags);
2482
- ASSERT(space_info);
2483
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
2484
- -head->num_bytes,
2485
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
2486
-
2487
- if (head->is_data) {
2488
- spin_lock(&delayed_refs->lock);
2489
- delayed_refs->pending_csums -= head->num_bytes;
2490
- spin_unlock(&delayed_refs->lock);
2491
- }
2492
- }
24931823
24941824 if (head->must_insert_reserved) {
2495
- btrfs_pin_extent(fs_info, head->bytenr,
2496
- head->num_bytes, 1);
1825
+ btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
24971826 if (head->is_data) {
24981827 ret = btrfs_del_csums(trans, fs_info->csum_root,
24991828 head->bytenr, head->num_bytes);
25001829 }
25011830 }
25021831
2503
- /* Also free its reserved qgroup space */
2504
- btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2505
- head->qgroup_reserved);
1832
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1833
+
1834
+ trace_run_delayed_ref_head(fs_info, head, 0);
25061835 btrfs_delayed_ref_unlock(head);
25071836 btrfs_put_delayed_ref_head(head);
25081837 return ret;
25091838 }
25101839
2511
-/*
2512
- * Returns 0 on success or if called with an already aborted transaction.
2513
- * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2514
- */
2515
-static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2516
- unsigned long nr)
1840
+static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1841
+ struct btrfs_trans_handle *trans)
1842
+{
1843
+ struct btrfs_delayed_ref_root *delayed_refs =
1844
+ &trans->transaction->delayed_refs;
1845
+ struct btrfs_delayed_ref_head *head = NULL;
1846
+ int ret;
1847
+
1848
+ spin_lock(&delayed_refs->lock);
1849
+ head = btrfs_select_ref_head(delayed_refs);
1850
+ if (!head) {
1851
+ spin_unlock(&delayed_refs->lock);
1852
+ return head;
1853
+ }
1854
+
1855
+ /*
1856
+ * Grab the lock that says we are going to process all the refs for
1857
+ * this head
1858
+ */
1859
+ ret = btrfs_delayed_ref_lock(delayed_refs, head);
1860
+ spin_unlock(&delayed_refs->lock);
1861
+
1862
+ /*
1863
+ * We may have dropped the spin lock to get the head mutex lock, and
1864
+ * that might have given someone else time to free the head. If that's
1865
+ * true, it has been removed from our list and we can move on.
1866
+ */
1867
+ if (ret == -EAGAIN)
1868
+ head = ERR_PTR(-EAGAIN);
1869
+
1870
+ return head;
1871
+}
1872
+
1873
+static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1874
+ struct btrfs_delayed_ref_head *locked_ref,
1875
+ unsigned long *run_refs)
25171876 {
25181877 struct btrfs_fs_info *fs_info = trans->fs_info;
25191878 struct btrfs_delayed_ref_root *delayed_refs;
2520
- struct btrfs_delayed_ref_node *ref;
2521
- struct btrfs_delayed_ref_head *locked_ref = NULL;
25221879 struct btrfs_delayed_extent_op *extent_op;
2523
- ktime_t start = ktime_get();
2524
- int ret;
2525
- unsigned long count = 0;
2526
- unsigned long actual_count = 0;
1880
+ struct btrfs_delayed_ref_node *ref;
25271881 int must_insert_reserved = 0;
1882
+ int ret;
25281883
25291884 delayed_refs = &trans->transaction->delayed_refs;
2530
- while (1) {
2531
- if (!locked_ref) {
2532
- if (count >= nr)
2533
- break;
25341885
2535
- spin_lock(&delayed_refs->lock);
2536
- locked_ref = btrfs_select_ref_head(trans);
2537
- if (!locked_ref) {
2538
- spin_unlock(&delayed_refs->lock);
2539
- break;
2540
- }
1886
+ lockdep_assert_held(&locked_ref->mutex);
1887
+ lockdep_assert_held(&locked_ref->lock);
25411888
2542
- /* grab the lock that says we are going to process
2543
- * all the refs for this head */
2544
- ret = btrfs_delayed_ref_lock(trans, locked_ref);
2545
- spin_unlock(&delayed_refs->lock);
2546
- /*
2547
- * we may have dropped the spin lock to get the head
2548
- * mutex lock, and that might have given someone else
2549
- * time to free the head. If that's true, it has been
2550
- * removed from our list and we can move on.
2551
- */
2552
- if (ret == -EAGAIN) {
2553
- locked_ref = NULL;
2554
- count++;
2555
- continue;
2556
- }
2557
- }
2558
-
2559
- /*
2560
- * We need to try and merge add/drops of the same ref since we
2561
- * can run into issues with relocate dropping the implicit ref
2562
- * and then it being added back again before the drop can
2563
- * finish. If we merged anything we need to re-loop so we can
2564
- * get a good ref.
2565
- * Or we can get node references of the same type that weren't
2566
- * merged when created due to bumps in the tree mod seq, and
2567
- * we need to merge them to prevent adding an inline extent
2568
- * backref before dropping it (triggering a BUG_ON at
2569
- * insert_inline_extent_backref()).
2570
- */
2571
- spin_lock(&locked_ref->lock);
2572
- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2573
-
2574
- ref = select_delayed_ref(locked_ref);
2575
-
2576
- if (ref && ref->seq &&
1889
+ while ((ref = select_delayed_ref(locked_ref))) {
1890
+ if (ref->seq &&
25771891 btrfs_check_delayed_seq(fs_info, ref->seq)) {
25781892 spin_unlock(&locked_ref->lock);
25791893 unselect_delayed_ref_head(delayed_refs, locked_ref);
2580
- locked_ref = NULL;
2581
- cond_resched();
2582
- count++;
2583
- continue;
1894
+ return -EAGAIN;
25841895 }
25851896
2586
- /*
2587
- * We're done processing refs in this ref_head, clean everything
2588
- * up and move on to the next ref_head.
2589
- */
2590
- if (!ref) {
2591
- ret = cleanup_ref_head(trans, locked_ref);
2592
- if (ret > 0 ) {
2593
- /* We dropped our lock, we need to loop. */
2594
- ret = 0;
2595
- continue;
2596
- } else if (ret) {
2597
- return ret;
2598
- }
2599
- locked_ref = NULL;
2600
- count++;
2601
- continue;
2602
- }
2603
-
2604
- actual_count++;
1897
+ (*run_refs)++;
26051898 ref->in_tree = 0;
2606
- rb_erase(&ref->ref_node, &locked_ref->ref_tree);
1899
+ rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
26071900 RB_CLEAR_NODE(&ref->ref_node);
26081901 if (!list_empty(&ref->add_list))
26091902 list_del(&ref->add_list);
@@ -2625,8 +1918,8 @@
26251918 atomic_dec(&delayed_refs->num_entries);
26261919
26271920 /*
2628
- * Record the must-insert_reserved flag before we drop the spin
2629
- * lock.
1921
+ * Record the must_insert_reserved flag before we drop the
1922
+ * spin lock.
26301923 */
26311924 must_insert_reserved = locked_ref->must_insert_reserved;
26321925 locked_ref->must_insert_reserved = 0;
@@ -2648,9 +1941,89 @@
26481941 }
26491942
26501943 btrfs_put_delayed_ref(ref);
2651
- count++;
26521944 cond_resched();
1945
+
1946
+ spin_lock(&locked_ref->lock);
1947
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
26531948 }
1949
+
1950
+ return 0;
1951
+}
1952
+
1953
+/*
1954
+ * Returns 0 on success or if called with an already aborted transaction.
1955
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
1956
+ */
1957
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1958
+ unsigned long nr)
1959
+{
1960
+ struct btrfs_fs_info *fs_info = trans->fs_info;
1961
+ struct btrfs_delayed_ref_root *delayed_refs;
1962
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
1963
+ ktime_t start = ktime_get();
1964
+ int ret;
1965
+ unsigned long count = 0;
1966
+ unsigned long actual_count = 0;
1967
+
1968
+ delayed_refs = &trans->transaction->delayed_refs;
1969
+ do {
1970
+ if (!locked_ref) {
1971
+ locked_ref = btrfs_obtain_ref_head(trans);
1972
+ if (IS_ERR_OR_NULL(locked_ref)) {
1973
+ if (PTR_ERR(locked_ref) == -EAGAIN) {
1974
+ continue;
1975
+ } else {
1976
+ break;
1977
+ }
1978
+ }
1979
+ count++;
1980
+ }
1981
+ /*
1982
+ * We need to try and merge add/drops of the same ref since we
1983
+ * can run into issues with relocate dropping the implicit ref
1984
+ * and then it being added back again before the drop can
1985
+ * finish. If we merged anything we need to re-loop so we can
1986
+ * get a good ref.
1987
+ * Or we can get node references of the same type that weren't
1988
+ * merged when created due to bumps in the tree mod seq, and
1989
+ * we need to merge them to prevent adding an inline extent
1990
+ * backref before dropping it (triggering a BUG_ON at
1991
+ * insert_inline_extent_backref()).
1992
+ */
1993
+ spin_lock(&locked_ref->lock);
1994
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
1995
+
1996
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
1997
+ &actual_count);
1998
+ if (ret < 0 && ret != -EAGAIN) {
1999
+ /*
2000
+ * Error, btrfs_run_delayed_refs_for_head already
2001
+ * unlocked everything so just bail out
2002
+ */
2003
+ return ret;
2004
+ } else if (!ret) {
2005
+ /*
2006
+ * Success, perform the usual cleanup of a processed
2007
+ * head
2008
+ */
2009
+ ret = cleanup_ref_head(trans, locked_ref);
2010
+ if (ret > 0 ) {
2011
+ /* We dropped our lock, we need to loop. */
2012
+ ret = 0;
2013
+ continue;
2014
+ } else if (ret) {
2015
+ return ret;
2016
+ }
2017
+ }
2018
+
2019
+ /*
2020
+ * Either success case or btrfs_run_delayed_refs_for_head
2021
+ * returned -EAGAIN, meaning we need to select another head
2022
+ */
2023
+
2024
+ locked_ref = NULL;
2025
+ cond_resched();
2026
+ } while ((nr != -1 && count < nr) || locked_ref);
26542027
26552028 /*
26562029 * We don't want to include ref heads since we can have empty ref heads
@@ -2716,22 +2089,6 @@
27162089 }
27172090 #endif
27182091
2719
-static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2720
-{
2721
- u64 num_bytes;
2722
-
2723
- num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2724
- sizeof(struct btrfs_extent_inline_ref));
2725
- if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2726
- num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2727
-
2728
- /*
2729
- * We don't ever fill up leaves all the way so multiply by 2 just to be
2730
- * closer to what we're really going to want to use.
2731
- */
2732
- return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2733
-}
2734
-
27352092 /*
27362093 * Takes the number of bytes to be csumm'ed and figures out how many leaves it
27372094 * would require to store the csums for that many bytes.
@@ -2749,153 +2106,6 @@
27492106 num_csums += num_csums_per_leaf - 1;
27502107 num_csums = div64_u64(num_csums, num_csums_per_leaf);
27512108 return num_csums;
2752
-}
2753
-
2754
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2755
- struct btrfs_fs_info *fs_info)
2756
-{
2757
- struct btrfs_block_rsv *global_rsv;
2758
- u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2759
- u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2760
- unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
2761
- u64 num_bytes, num_dirty_bgs_bytes;
2762
- int ret = 0;
2763
-
2764
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
2765
- num_heads = heads_to_leaves(fs_info, num_heads);
2766
- if (num_heads > 1)
2767
- num_bytes += (num_heads - 1) * fs_info->nodesize;
2768
- num_bytes <<= 1;
2769
- num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
2770
- fs_info->nodesize;
2771
- num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
2772
- num_dirty_bgs);
2773
- global_rsv = &fs_info->global_block_rsv;
2774
-
2775
- /*
2776
- * If we can't allocate any more chunks lets make sure we have _lots_ of
2777
- * wiggle room since running delayed refs can create more delayed refs.
2778
- */
2779
- if (global_rsv->space_info->full) {
2780
- num_dirty_bgs_bytes <<= 1;
2781
- num_bytes <<= 1;
2782
- }
2783
-
2784
- spin_lock(&global_rsv->lock);
2785
- if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2786
- ret = 1;
2787
- spin_unlock(&global_rsv->lock);
2788
- return ret;
2789
-}
2790
-
2791
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2792
- struct btrfs_fs_info *fs_info)
2793
-{
2794
- u64 num_entries =
2795
- atomic_read(&trans->transaction->delayed_refs.num_entries);
2796
- u64 avg_runtime;
2797
- u64 val;
2798
-
2799
- smp_mb();
2800
- avg_runtime = fs_info->avg_delayed_ref_runtime;
2801
- val = num_entries * avg_runtime;
2802
- if (val >= NSEC_PER_SEC)
2803
- return 1;
2804
- if (val >= NSEC_PER_SEC / 2)
2805
- return 2;
2806
-
2807
- return btrfs_check_space_for_delayed_refs(trans, fs_info);
2808
-}
2809
-
2810
-struct async_delayed_refs {
2811
- struct btrfs_root *root;
2812
- u64 transid;
2813
- int count;
2814
- int error;
2815
- int sync;
2816
- struct completion wait;
2817
- struct btrfs_work work;
2818
-};
2819
-
2820
-static inline struct async_delayed_refs *
2821
-to_async_delayed_refs(struct btrfs_work *work)
2822
-{
2823
- return container_of(work, struct async_delayed_refs, work);
2824
-}
2825
-
2826
-static void delayed_ref_async_start(struct btrfs_work *work)
2827
-{
2828
- struct async_delayed_refs *async = to_async_delayed_refs(work);
2829
- struct btrfs_trans_handle *trans;
2830
- struct btrfs_fs_info *fs_info = async->root->fs_info;
2831
- int ret;
2832
-
2833
- /* if the commit is already started, we don't need to wait here */
2834
- if (btrfs_transaction_blocked(fs_info))
2835
- goto done;
2836
-
2837
- trans = btrfs_join_transaction(async->root);
2838
- if (IS_ERR(trans)) {
2839
- async->error = PTR_ERR(trans);
2840
- goto done;
2841
- }
2842
-
2843
- /*
2844
- * trans->sync means that when we call end_transaction, we won't
2845
- * wait on delayed refs
2846
- */
2847
- trans->sync = true;
2848
-
2849
- /* Don't bother flushing if we got into a different transaction */
2850
- if (trans->transid > async->transid)
2851
- goto end;
2852
-
2853
- ret = btrfs_run_delayed_refs(trans, async->count);
2854
- if (ret)
2855
- async->error = ret;
2856
-end:
2857
- ret = btrfs_end_transaction(trans);
2858
- if (ret && !async->error)
2859
- async->error = ret;
2860
-done:
2861
- if (async->sync)
2862
- complete(&async->wait);
2863
- else
2864
- kfree(async);
2865
-}
2866
-
2867
-int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
2868
- unsigned long count, u64 transid, int wait)
2869
-{
2870
- struct async_delayed_refs *async;
2871
- int ret;
2872
-
2873
- async = kmalloc(sizeof(*async), GFP_NOFS);
2874
- if (!async)
2875
- return -ENOMEM;
2876
-
2877
- async->root = fs_info->tree_root;
2878
- async->count = count;
2879
- async->error = 0;
2880
- async->transid = transid;
2881
- if (wait)
2882
- async->sync = 1;
2883
- else
2884
- async->sync = 0;
2885
- init_completion(&async->wait);
2886
-
2887
- btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2888
- delayed_ref_async_start, NULL, NULL);
2889
-
2890
- btrfs_queue_work(fs_info->extent_workers, &async->work);
2891
-
2892
- if (wait) {
2893
- wait_for_completion(&async->wait);
2894
- ret = async->error;
2895
- kfree(async);
2896
- return ret;
2897
- }
2898
- return 0;
28992109 }
29002110
29012111 /*
@@ -2919,7 +2129,7 @@
29192129 int run_all = count == (unsigned long)-1;
29202130
29212131 /* We'll clean this up in btrfs_cleanup_transaction */
2922
- if (trans->aborted)
2132
+ if (TRANS_ABORTED(trans))
29232133 return 0;
29242134
29252135 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
@@ -2940,11 +2150,10 @@
29402150 }
29412151
29422152 if (run_all) {
2943
- if (!list_empty(&trans->new_bgs))
2944
- btrfs_create_pending_block_groups(trans);
2153
+ btrfs_create_pending_block_groups(trans);
29452154
29462155 spin_lock(&delayed_refs->lock);
2947
- node = rb_first(&delayed_refs->href_root);
2156
+ node = rb_first_cached(&delayed_refs->href_root);
29482157 if (!node) {
29492158 spin_unlock(&delayed_refs->lock);
29502159 goto out;
@@ -2967,8 +2176,7 @@
29672176 }
29682177
29692178 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2970
- struct btrfs_fs_info *fs_info,
2971
- u64 bytenr, u64 num_bytes, u64 flags,
2179
+ struct extent_buffer *eb, u64 flags,
29722180 int level, int is_data)
29732181 {
29742182 struct btrfs_delayed_extent_op *extent_op;
@@ -2984,8 +2192,7 @@
29842192 extent_op->is_data = is_data ? true : false;
29852193 extent_op->level = level;
29862194
2987
- ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
2988
- num_bytes, extent_op);
2195
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
29892196 if (ret)
29902197 btrfs_free_delayed_extent_op(extent_op);
29912198 return ret;
@@ -3043,7 +2250,8 @@
30432250 * XXX: We should replace this with a proper search function in the
30442251 * future.
30452252 */
3046
- for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
2253
+ for (node = rb_first_cached(&head->ref_tree); node;
2254
+ node = rb_next(node)) {
30472255 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
30482256 /* If it's a shared ref we know a cross reference exists */
30492257 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -3072,7 +2280,8 @@
30722280
30732281 static noinline int check_committed_ref(struct btrfs_root *root,
30742282 struct btrfs_path *path,
3075
- u64 objectid, u64 offset, u64 bytenr)
2283
+ u64 objectid, u64 offset, u64 bytenr,
2284
+ bool strict)
30762285 {
30772286 struct btrfs_fs_info *fs_info = root->fs_info;
30782287 struct btrfs_root *extent_root = fs_info->extent_root;
@@ -3109,16 +2318,23 @@
31092318 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
31102319 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
31112320
2321
+ /* If extent item has more than 1 inline ref then it's shared */
31122322 if (item_size != sizeof(*ei) +
31132323 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
31142324 goto out;
31152325
3116
- if (btrfs_extent_generation(leaf, ei) <=
3117
- btrfs_root_last_snapshot(&root->root_item))
2326
+ /*
2327
+ * If extent created before last snapshot => it's shared unless the
2328
+ * snapshot has been deleted. Use the heuristic if strict is false.
2329
+ */
2330
+ if (!strict &&
2331
+ (btrfs_extent_generation(leaf, ei) <=
2332
+ btrfs_root_last_snapshot(&root->root_item)))
31182333 goto out;
31192334
31202335 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
31212336
2337
+ /* If this extent has SHARED_DATA_REF then it's shared */
31222338 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
31232339 if (type != BTRFS_EXTENT_DATA_REF_KEY)
31242340 goto out;
....@@ -3138,11 +2354,10 @@
31382354 }
31392355
31402356 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3141
- u64 bytenr)
2357
+ u64 bytenr, bool strict)
31422358 {
31432359 struct btrfs_path *path;
31442360 int ret;
3145
- int ret2;
31462361
31472362 path = btrfs_alloc_path();
31482363 if (!path)
....@@ -3150,21 +2365,13 @@
31502365
31512366 do {
31522367 ret = check_committed_ref(root, path, objectid,
3153
- offset, bytenr);
2368
+ offset, bytenr, strict);
31542369 if (ret && ret != -ENOENT)
31552370 goto out;
31562371
3157
- ret2 = check_delayed_ref(root, path, objectid,
3158
- offset, bytenr);
3159
- } while (ret2 == -EAGAIN);
2372
+ ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2373
+ } while (ret == -EAGAIN);
31602374
3161
- if (ret2 && ret2 != -ENOENT) {
3162
- ret = ret2;
3163
- goto out;
3164
- }
3165
-
3166
- if (ret != -ENOENT || ret2 != -ENOENT)
3167
- ret = 0;
31682375 out:
31692376 btrfs_free_path(path);
31702377 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
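After this rework a single ret carries the verdict: a positive return indicates a cross reference (the extent may be shared), 0 means none was found, and negative values are errors. The new strict flag turns off the generation-vs-last-snapshot shortcut for callers that cannot tolerate a stale "not shared" answer. A minimal caller sketch under those assumptions (variable names are illustrative):

	/* decide whether a NOCOW overwrite of disk_bytenr is safe */
	ret = btrfs_cross_ref_exist(root, ino, file_offset, disk_bytenr, strict);
	if (ret < 0)
		return ret;	/* lookup failed */
	if (ret > 0)
		goto force_cow;	/* extent may be shared, must COW */
	/* ret == 0: no other reference seen, NOCOW is allowed */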
....@@ -3185,13 +2392,12 @@
31852392 u32 nritems;
31862393 struct btrfs_key key;
31872394 struct btrfs_file_extent_item *fi;
2395
+ struct btrfs_ref generic_ref = { 0 };
2396
+ bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
31882397 int i;
2398
+ int action;
31892399 int level;
31902400 int ret = 0;
3191
- int (*process_func)(struct btrfs_trans_handle *,
3192
- struct btrfs_root *,
3193
- u64, u64, u64, u64, u64, u64);
3194
-
31952401
31962402 if (btrfs_is_testing(fs_info))
31972403 return 0;
....@@ -3200,18 +2406,17 @@
32002406 nritems = btrfs_header_nritems(buf);
32012407 level = btrfs_header_level(buf);
32022408
3203
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
2409
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
32042410 return 0;
3205
-
3206
- if (inc)
3207
- process_func = btrfs_inc_extent_ref;
3208
- else
3209
- process_func = btrfs_free_extent;
32102411
32112412 if (full_backref)
32122413 parent = buf->start;
32132414 else
32142415 parent = 0;
2416
+ if (inc)
2417
+ action = BTRFS_ADD_DELAYED_REF;
2418
+ else
2419
+ action = BTRFS_DROP_DELAYED_REF;
32152420
32162421 for (i = 0; i < nritems; i++) {
32172422 if (level == 0) {
....@@ -3229,16 +2434,30 @@
32292434
32302435 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
32312436 key.offset -= btrfs_file_extent_offset(buf, fi);
3232
- ret = process_func(trans, root, bytenr, num_bytes,
3233
- parent, ref_root, key.objectid,
3234
- key.offset);
2437
+ btrfs_init_generic_ref(&generic_ref, action, bytenr,
2438
+ num_bytes, parent);
2439
+ generic_ref.real_root = root->root_key.objectid;
2440
+ btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2441
+ key.offset);
2442
+ generic_ref.skip_qgroup = for_reloc;
2443
+ if (inc)
2444
+ ret = btrfs_inc_extent_ref(trans, &generic_ref);
2445
+ else
2446
+ ret = btrfs_free_extent(trans, &generic_ref);
32352447 if (ret)
32362448 goto fail;
32372449 } else {
32382450 bytenr = btrfs_node_blockptr(buf, i);
32392451 num_bytes = fs_info->nodesize;
3240
- ret = process_func(trans, root, bytenr, num_bytes,
3241
- parent, ref_root, level - 1, 0);
2452
+ btrfs_init_generic_ref(&generic_ref, action, bytenr,
2453
+ num_bytes, parent);
2454
+ generic_ref.real_root = root->root_key.objectid;
2455
+ btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
2456
+ generic_ref.skip_qgroup = for_reloc;
2457
+ if (inc)
2458
+ ret = btrfs_inc_extent_ref(trans, &generic_ref);
2459
+ else
2460
+ ret = btrfs_free_extent(trans, &generic_ref);
32422461 if (ret)
32432462 goto fail;
32442463 }
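The conversion above replaces the process_func indirection with a btrfs_ref description that the delayed-ref code consumes. Shown in isolation, the same init pattern used in this hunk for a single tree-block reference (a minimal sketch built only from the calls visible in the diff):

	struct btrfs_ref ref = { 0 };

	/* describe the reference: action, block start, length, parent (0 = keyed ref) */
	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, eb->start, eb->len, 0);
	/* it refers to a tree block at this level, owned by this root */
	btrfs_init_tree_ref(&ref, btrfs_header_level(eb), root->root_key.objectid);
	ref.skip_qgroup = for_reloc;

	ret = btrfs_inc_extent_ref(trans, &ref);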
....@@ -3260,555 +2479,9 @@
32602479 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
32612480 }
32622481
3263
-static int write_one_cache_group(struct btrfs_trans_handle *trans,
3264
- struct btrfs_fs_info *fs_info,
3265
- struct btrfs_path *path,
3266
- struct btrfs_block_group_cache *cache)
3267
-{
3268
- int ret;
3269
- struct btrfs_root *extent_root = fs_info->extent_root;
3270
- unsigned long bi;
3271
- struct extent_buffer *leaf;
3272
-
3273
- ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3274
- if (ret) {
3275
- if (ret > 0)
3276
- ret = -ENOENT;
3277
- goto fail;
3278
- }
3279
-
3280
- leaf = path->nodes[0];
3281
- bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3282
- write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3283
- btrfs_mark_buffer_dirty(leaf);
3284
-fail:
3285
- btrfs_release_path(path);
3286
- return ret;
3287
-
3288
-}
3289
-
3290
-static struct btrfs_block_group_cache *
3291
-next_block_group(struct btrfs_fs_info *fs_info,
3292
- struct btrfs_block_group_cache *cache)
3293
-{
3294
- struct rb_node *node;
3295
-
3296
- spin_lock(&fs_info->block_group_cache_lock);
3297
-
3298
- /* If our block group was removed, we need a full search. */
3299
- if (RB_EMPTY_NODE(&cache->cache_node)) {
3300
- const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3301
-
3302
- spin_unlock(&fs_info->block_group_cache_lock);
3303
- btrfs_put_block_group(cache);
3304
- cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
3305
- }
3306
- node = rb_next(&cache->cache_node);
3307
- btrfs_put_block_group(cache);
3308
- if (node) {
3309
- cache = rb_entry(node, struct btrfs_block_group_cache,
3310
- cache_node);
3311
- btrfs_get_block_group(cache);
3312
- } else
3313
- cache = NULL;
3314
- spin_unlock(&fs_info->block_group_cache_lock);
3315
- return cache;
3316
-}
3317
-
3318
-static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3319
- struct btrfs_trans_handle *trans,
3320
- struct btrfs_path *path)
3321
-{
3322
- struct btrfs_fs_info *fs_info = block_group->fs_info;
3323
- struct btrfs_root *root = fs_info->tree_root;
3324
- struct inode *inode = NULL;
3325
- struct extent_changeset *data_reserved = NULL;
3326
- u64 alloc_hint = 0;
3327
- int dcs = BTRFS_DC_ERROR;
3328
- u64 num_pages = 0;
3329
- int retries = 0;
3330
- int ret = 0;
3331
-
3332
- /*
3333
- * If this block group is smaller than 100 megs don't bother caching the
3334
- * block group.
3335
- */
3336
- if (block_group->key.offset < (100 * SZ_1M)) {
3337
- spin_lock(&block_group->lock);
3338
- block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3339
- spin_unlock(&block_group->lock);
3340
- return 0;
3341
- }
3342
-
3343
- if (trans->aborted)
3344
- return 0;
3345
-again:
3346
- inode = lookup_free_space_inode(fs_info, block_group, path);
3347
- if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3348
- ret = PTR_ERR(inode);
3349
- btrfs_release_path(path);
3350
- goto out;
3351
- }
3352
-
3353
- if (IS_ERR(inode)) {
3354
- BUG_ON(retries);
3355
- retries++;
3356
-
3357
- if (block_group->ro)
3358
- goto out_free;
3359
-
3360
- ret = create_free_space_inode(fs_info, trans, block_group,
3361
- path);
3362
- if (ret)
3363
- goto out_free;
3364
- goto again;
3365
- }
3366
-
3367
- /*
3368
- * We want to set the generation to 0, that way if anything goes wrong
3369
- * from here on out we know not to trust this cache when we load up next
3370
- * time.
3371
- */
3372
- BTRFS_I(inode)->generation = 0;
3373
- ret = btrfs_update_inode(trans, root, inode);
3374
- if (ret) {
3375
- /*
3376
- * So theoretically we could recover from this, simply set the
3377
- * super cache generation to 0 so we know to invalidate the
3378
- * cache, but then we'd have to keep track of the block groups
3379
- * that fail this way so we know we _have_ to reset this cache
3380
- * before the next commit or risk reading stale cache. So to
3381
- * limit our exposure to horrible edge cases lets just abort the
3382
- * transaction, this only happens in really bad situations
3383
- * anyway.
3384
- */
3385
- btrfs_abort_transaction(trans, ret);
3386
- goto out_put;
3387
- }
3388
- WARN_ON(ret);
3389
-
3390
- /* We've already setup this transaction, go ahead and exit */
3391
- if (block_group->cache_generation == trans->transid &&
3392
- i_size_read(inode)) {
3393
- dcs = BTRFS_DC_SETUP;
3394
- goto out_put;
3395
- }
3396
-
3397
- if (i_size_read(inode) > 0) {
3398
- ret = btrfs_check_trunc_cache_free_space(fs_info,
3399
- &fs_info->global_block_rsv);
3400
- if (ret)
3401
- goto out_put;
3402
-
3403
- ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3404
- if (ret)
3405
- goto out_put;
3406
- }
3407
-
3408
- spin_lock(&block_group->lock);
3409
- if (block_group->cached != BTRFS_CACHE_FINISHED ||
3410
- !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3411
- /*
3412
- * don't bother trying to write stuff out _if_
3413
- * a) we're not cached,
3414
- * b) we're with nospace_cache mount option,
3415
- * c) we're with v2 space_cache (FREE_SPACE_TREE).
3416
- */
3417
- dcs = BTRFS_DC_WRITTEN;
3418
- spin_unlock(&block_group->lock);
3419
- goto out_put;
3420
- }
3421
- spin_unlock(&block_group->lock);
3422
-
3423
- /*
3424
- * We hit an ENOSPC when setting up the cache in this transaction, just
3425
- * skip doing the setup, we've already cleared the cache so we're safe.
3426
- */
3427
- if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3428
- ret = -ENOSPC;
3429
- goto out_put;
3430
- }
3431
-
3432
- /*
3433
- * Try to preallocate enough space based on how big the block group is.
3434
- * Keep in mind this has to include any pinned space which could end up
3435
- * taking up quite a bit since it's not folded into the other space
3436
- * cache.
3437
- */
3438
- num_pages = div_u64(block_group->key.offset, SZ_256M);
3439
- if (!num_pages)
3440
- num_pages = 1;
3441
-
3442
- num_pages *= 16;
3443
- num_pages *= PAGE_SIZE;
3444
-
3445
- ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3446
- if (ret)
3447
- goto out_put;
3448
-
3449
- ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3450
- num_pages, num_pages,
3451
- &alloc_hint);
3452
- /*
3453
- * Our cache requires contiguous chunks so that we don't modify a bunch
3454
- * of metadata or split extents when writing the cache out, which means
3455
- * we can enospc if we are heavily fragmented in addition to just normal
3456
- * out of space conditions. So if we hit this just skip setting up any
3457
- * other block groups for this transaction, maybe we'll unpin enough
3458
- * space the next time around.
3459
- */
3460
- if (!ret)
3461
- dcs = BTRFS_DC_SETUP;
3462
- else if (ret == -ENOSPC)
3463
- set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3464
-
3465
-out_put:
3466
- iput(inode);
3467
-out_free:
3468
- btrfs_release_path(path);
3469
-out:
3470
- spin_lock(&block_group->lock);
3471
- if (!ret && dcs == BTRFS_DC_SETUP)
3472
- block_group->cache_generation = trans->transid;
3473
- block_group->disk_cache_state = dcs;
3474
- spin_unlock(&block_group->lock);
3475
-
3476
- extent_changeset_free(data_reserved);
3477
- return ret;
3478
-}
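The preallocation sizing in the function above scales with the block group size. A worked example for a 1GiB block group with 4KiB pages:

	/*
	 * num_pages = div_u64(SZ_1G, SZ_256M)  ->  4
	 * num_pages *= 16                      ->  64
	 * num_pages *= PAGE_SIZE (4096)        ->  262144 bytes
	 *
	 * i.e. 256KiB of data space is reserved and preallocated for that
	 * block group's v1 free space cache inode.
	 */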
3479
-
3480
-int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3481
- struct btrfs_fs_info *fs_info)
3482
-{
3483
- struct btrfs_block_group_cache *cache, *tmp;
3484
- struct btrfs_transaction *cur_trans = trans->transaction;
3485
- struct btrfs_path *path;
3486
-
3487
- if (list_empty(&cur_trans->dirty_bgs) ||
3488
- !btrfs_test_opt(fs_info, SPACE_CACHE))
3489
- return 0;
3490
-
3491
- path = btrfs_alloc_path();
3492
- if (!path)
3493
- return -ENOMEM;
3494
-
3495
- /* Could add new block groups, use _safe just in case */
3496
- list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3497
- dirty_list) {
3498
- if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3499
- cache_save_setup(cache, trans, path);
3500
- }
3501
-
3502
- btrfs_free_path(path);
3503
- return 0;
3504
-}
3505
-
3506
-/*
3507
- * transaction commit does final block group cache writeback during a
3508
- * critical section where nothing is allowed to change the FS. This is
3509
- * required in order for the cache to actually match the block group,
3510
- * but can introduce a lot of latency into the commit.
3511
- *
3512
- * So, btrfs_start_dirty_block_groups is here to kick off block group
3513
- * cache IO. There's a chance we'll have to redo some of it if the
3514
- * block group changes again during the commit, but it greatly reduces
3515
- * the commit latency by getting rid of the easy block groups while
3516
- * we're still allowing others to join the commit.
3517
- */
3518
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3519
-{
3520
- struct btrfs_fs_info *fs_info = trans->fs_info;
3521
- struct btrfs_block_group_cache *cache;
3522
- struct btrfs_transaction *cur_trans = trans->transaction;
3523
- int ret = 0;
3524
- int should_put;
3525
- struct btrfs_path *path = NULL;
3526
- LIST_HEAD(dirty);
3527
- struct list_head *io = &cur_trans->io_bgs;
3528
- int num_started = 0;
3529
- int loops = 0;
3530
-
3531
- spin_lock(&cur_trans->dirty_bgs_lock);
3532
- if (list_empty(&cur_trans->dirty_bgs)) {
3533
- spin_unlock(&cur_trans->dirty_bgs_lock);
3534
- return 0;
3535
- }
3536
- list_splice_init(&cur_trans->dirty_bgs, &dirty);
3537
- spin_unlock(&cur_trans->dirty_bgs_lock);
3538
-
3539
-again:
3540
- /*
3541
- * make sure all the block groups on our dirty list actually
3542
- * exist
3543
- */
3544
- btrfs_create_pending_block_groups(trans);
3545
-
3546
- if (!path) {
3547
- path = btrfs_alloc_path();
3548
- if (!path)
3549
- return -ENOMEM;
3550
- }
3551
-
3552
- /*
3553
- * cache_write_mutex is here only to save us from balance or automatic
3554
- * removal of empty block groups deleting this block group while we are
3555
- * writing out the cache
3556
- */
3557
- mutex_lock(&trans->transaction->cache_write_mutex);
3558
- while (!list_empty(&dirty)) {
3559
- cache = list_first_entry(&dirty,
3560
- struct btrfs_block_group_cache,
3561
- dirty_list);
3562
- /*
3563
- * this can happen if something re-dirties a block
3564
- * group that is already under IO. Just wait for it to
3565
- * finish and then do it all again
3566
- */
3567
- if (!list_empty(&cache->io_list)) {
3568
- list_del_init(&cache->io_list);
3569
- btrfs_wait_cache_io(trans, cache, path);
3570
- btrfs_put_block_group(cache);
3571
- }
3572
-
3573
-
3574
- /*
3575
- * btrfs_wait_cache_io uses the cache->dirty_list to decide
3576
- * if it should update the cache_state. Don't delete
3577
- * until after we wait.
3578
- *
3579
- * Since we're not running in the commit critical section
3580
- * we need the dirty_bgs_lock to protect from update_block_group
3581
- */
3582
- spin_lock(&cur_trans->dirty_bgs_lock);
3583
- list_del_init(&cache->dirty_list);
3584
- spin_unlock(&cur_trans->dirty_bgs_lock);
3585
-
3586
- should_put = 1;
3587
-
3588
- cache_save_setup(cache, trans, path);
3589
-
3590
- if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3591
- cache->io_ctl.inode = NULL;
3592
- ret = btrfs_write_out_cache(fs_info, trans,
3593
- cache, path);
3594
- if (ret == 0 && cache->io_ctl.inode) {
3595
- num_started++;
3596
- should_put = 0;
3597
-
3598
- /*
3599
- * The cache_write_mutex is protecting the
3600
- * io_list, also refer to the definition of
3601
- * btrfs_transaction::io_bgs for more details
3602
- */
3603
- list_add_tail(&cache->io_list, io);
3604
- } else {
3605
- /*
3606
- * if we failed to write the cache, the
3607
- * generation will be bad and life goes on
3608
- */
3609
- ret = 0;
3610
- }
3611
- }
3612
- if (!ret) {
3613
- ret = write_one_cache_group(trans, fs_info,
3614
- path, cache);
3615
- /*
3616
- * Our block group might still be attached to the list
3617
- * of new block groups in the transaction handle of some
3618
- * other task (struct btrfs_trans_handle->new_bgs). This
3619
- * means its block group item isn't yet in the extent
3620
- * tree. If this happens ignore the error, as we will
3621
- * try again later in the critical section of the
3622
- * transaction commit.
3623
- */
3624
- if (ret == -ENOENT) {
3625
- ret = 0;
3626
- spin_lock(&cur_trans->dirty_bgs_lock);
3627
- if (list_empty(&cache->dirty_list)) {
3628
- list_add_tail(&cache->dirty_list,
3629
- &cur_trans->dirty_bgs);
3630
- btrfs_get_block_group(cache);
3631
- }
3632
- spin_unlock(&cur_trans->dirty_bgs_lock);
3633
- } else if (ret) {
3634
- btrfs_abort_transaction(trans, ret);
3635
- }
3636
- }
3637
-
3638
- /* if its not on the io list, we need to put the block group */
3639
- if (should_put)
3640
- btrfs_put_block_group(cache);
3641
-
3642
- if (ret)
3643
- break;
3644
-
3645
- /*
3646
- * Avoid blocking other tasks for too long. It might even save
3647
- * us from writing caches for block groups that are going to be
3648
- * removed.
3649
- */
3650
- mutex_unlock(&trans->transaction->cache_write_mutex);
3651
- mutex_lock(&trans->transaction->cache_write_mutex);
3652
- }
3653
- mutex_unlock(&trans->transaction->cache_write_mutex);
3654
-
3655
- /*
3656
- * go through delayed refs for all the stuff we've just kicked off
3657
- * and then loop back (just once)
3658
- */
3659
- ret = btrfs_run_delayed_refs(trans, 0);
3660
- if (!ret && loops == 0) {
3661
- loops++;
3662
- spin_lock(&cur_trans->dirty_bgs_lock);
3663
- list_splice_init(&cur_trans->dirty_bgs, &dirty);
3664
- /*
3665
- * dirty_bgs_lock protects us from concurrent block group
3666
- * deletes too (not just cache_write_mutex).
3667
- */
3668
- if (!list_empty(&dirty)) {
3669
- spin_unlock(&cur_trans->dirty_bgs_lock);
3670
- goto again;
3671
- }
3672
- spin_unlock(&cur_trans->dirty_bgs_lock);
3673
- } else if (ret < 0) {
3674
- btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3675
- }
3676
-
3677
- btrfs_free_path(path);
3678
- return ret;
3679
-}
3680
-
3681
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3682
- struct btrfs_fs_info *fs_info)
3683
-{
3684
- struct btrfs_block_group_cache *cache;
3685
- struct btrfs_transaction *cur_trans = trans->transaction;
3686
- int ret = 0;
3687
- int should_put;
3688
- struct btrfs_path *path;
3689
- struct list_head *io = &cur_trans->io_bgs;
3690
- int num_started = 0;
3691
-
3692
- path = btrfs_alloc_path();
3693
- if (!path)
3694
- return -ENOMEM;
3695
-
3696
- /*
3697
- * Even though we are in the critical section of the transaction commit,
3698
- * we can still have concurrent tasks adding elements to this
3699
- * transaction's list of dirty block groups. These tasks correspond to
3700
- * endio free space workers started when writeback finishes for a
3701
- * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3702
- * allocate new block groups as a result of COWing nodes of the root
3703
- * tree when updating the free space inode. The writeback for the space
3704
- * caches is triggered by an earlier call to
3705
- * btrfs_start_dirty_block_groups() and iterations of the following
3706
- * loop.
3707
- * Also we want to do the cache_save_setup first and then run the
3708
- * delayed refs to make sure we have the best chance at doing this all
3709
- * in one shot.
3710
- */
3711
- spin_lock(&cur_trans->dirty_bgs_lock);
3712
- while (!list_empty(&cur_trans->dirty_bgs)) {
3713
- cache = list_first_entry(&cur_trans->dirty_bgs,
3714
- struct btrfs_block_group_cache,
3715
- dirty_list);
3716
-
3717
- /*
3718
- * this can happen if cache_save_setup re-dirties a block
3719
- * group that is already under IO. Just wait for it to
3720
- * finish and then do it all again
3721
- */
3722
- if (!list_empty(&cache->io_list)) {
3723
- spin_unlock(&cur_trans->dirty_bgs_lock);
3724
- list_del_init(&cache->io_list);
3725
- btrfs_wait_cache_io(trans, cache, path);
3726
- btrfs_put_block_group(cache);
3727
- spin_lock(&cur_trans->dirty_bgs_lock);
3728
- }
3729
-
3730
- /*
3731
- * don't remove from the dirty list until after we've waited
3732
- * on any pending IO
3733
- */
3734
- list_del_init(&cache->dirty_list);
3735
- spin_unlock(&cur_trans->dirty_bgs_lock);
3736
- should_put = 1;
3737
-
3738
- cache_save_setup(cache, trans, path);
3739
-
3740
- if (!ret)
3741
- ret = btrfs_run_delayed_refs(trans,
3742
- (unsigned long) -1);
3743
-
3744
- if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3745
- cache->io_ctl.inode = NULL;
3746
- ret = btrfs_write_out_cache(fs_info, trans,
3747
- cache, path);
3748
- if (ret == 0 && cache->io_ctl.inode) {
3749
- num_started++;
3750
- should_put = 0;
3751
- list_add_tail(&cache->io_list, io);
3752
- } else {
3753
- /*
3754
- * if we failed to write the cache, the
3755
- * generation will be bad and life goes on
3756
- */
3757
- ret = 0;
3758
- }
3759
- }
3760
- if (!ret) {
3761
- ret = write_one_cache_group(trans, fs_info,
3762
- path, cache);
3763
- /*
3764
- * One of the free space endio workers might have
3765
- * created a new block group while updating a free space
3766
- * cache's inode (at inode.c:btrfs_finish_ordered_io())
3767
- * and hasn't released its transaction handle yet, in
3768
- * which case the new block group is still attached to
3769
- * its transaction handle and its creation has not
3770
- * finished yet (no block group item in the extent tree
3771
- * yet, etc). If this is the case, wait for all free
3772
- * space endio workers to finish and retry. This is a
3773
- * a very rare case so no need for a more efficient and
3774
- * complex approach.
3775
- */
3776
- if (ret == -ENOENT) {
3777
- wait_event(cur_trans->writer_wait,
3778
- atomic_read(&cur_trans->num_writers) == 1);
3779
- ret = write_one_cache_group(trans, fs_info,
3780
- path, cache);
3781
- }
3782
- if (ret)
3783
- btrfs_abort_transaction(trans, ret);
3784
- }
3785
-
3786
- /* if its not on the io list, we need to put the block group */
3787
- if (should_put)
3788
- btrfs_put_block_group(cache);
3789
- spin_lock(&cur_trans->dirty_bgs_lock);
3790
- }
3791
- spin_unlock(&cur_trans->dirty_bgs_lock);
3792
-
3793
- /*
3794
- * Refer to the definition of io_bgs member for details why it's safe
3795
- * to use it without any locking
3796
- */
3797
- while (!list_empty(io)) {
3798
- cache = list_first_entry(io, struct btrfs_block_group_cache,
3799
- io_list);
3800
- list_del_init(&cache->io_list);
3801
- btrfs_wait_cache_io(trans, cache, path);
3802
- btrfs_put_block_group(cache);
3803
- }
3804
-
3805
- btrfs_free_path(path);
3806
- return ret;
3807
-}
3808
-
38092482 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
38102483 {
3811
- struct btrfs_block_group_cache *block_group;
2484
+ struct btrfs_block_group *block_group;
38122485 int readonly = 0;
38132486
38142487 block_group = btrfs_lookup_block_group(fs_info, bytenr);
....@@ -3817,253 +2490,6 @@
38172490 if (block_group)
38182491 btrfs_put_block_group(block_group);
38192492 return readonly;
3820
-}
3821
-
3822
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3823
-{
3824
- struct btrfs_block_group_cache *bg;
3825
- bool ret = true;
3826
-
3827
- bg = btrfs_lookup_block_group(fs_info, bytenr);
3828
- if (!bg)
3829
- return false;
3830
-
3831
- spin_lock(&bg->lock);
3832
- if (bg->ro)
3833
- ret = false;
3834
- else
3835
- atomic_inc(&bg->nocow_writers);
3836
- spin_unlock(&bg->lock);
3837
-
3838
- /* no put on block group, done by btrfs_dec_nocow_writers */
3839
- if (!ret)
3840
- btrfs_put_block_group(bg);
3841
-
3842
- return ret;
3843
-
3844
-}
3845
-
3846
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3847
-{
3848
- struct btrfs_block_group_cache *bg;
3849
-
3850
- bg = btrfs_lookup_block_group(fs_info, bytenr);
3851
- ASSERT(bg);
3852
- if (atomic_dec_and_test(&bg->nocow_writers))
3853
- wake_up_var(&bg->nocow_writers);
3854
- /*
3855
- * Once for our lookup and once for the lookup done by a previous call
3856
- * to btrfs_inc_nocow_writers()
3857
- */
3858
- btrfs_put_block_group(bg);
3859
- btrfs_put_block_group(bg);
3860
-}
3861
-
3862
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3863
-{
3864
- wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3865
-}
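The inc/dec/wait trio above keeps a block group from being flipped read-only underneath an in-flight NOCOW writer. The intended pairing, sketched from the comments in these helpers (the write itself is a placeholder; the surrounding write-path details are assumptions):

	if (btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
		/* block group is writable and now pinned; do the NOCOW write */
		do_the_nocow_write();	/* placeholder for the real write path */
		btrfs_dec_nocow_writers(fs_info, disk_bytenr);
	} else {
		/* block group is read-only (or being removed): fall back to COW */
	}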
3866
-
3867
-static const char *alloc_name(u64 flags)
3868
-{
3869
- switch (flags) {
3870
- case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3871
- return "mixed";
3872
- case BTRFS_BLOCK_GROUP_METADATA:
3873
- return "metadata";
3874
- case BTRFS_BLOCK_GROUP_DATA:
3875
- return "data";
3876
- case BTRFS_BLOCK_GROUP_SYSTEM:
3877
- return "system";
3878
- default:
3879
- WARN_ON(1);
3880
- return "invalid-combination";
3881
- };
3882
-}
3883
-
3884
-static int create_space_info(struct btrfs_fs_info *info, u64 flags)
3885
-{
3886
-
3887
- struct btrfs_space_info *space_info;
3888
- int i;
3889
- int ret;
3890
-
3891
- space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
3892
- if (!space_info)
3893
- return -ENOMEM;
3894
-
3895
- ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
3896
- GFP_KERNEL);
3897
- if (ret) {
3898
- kfree(space_info);
3899
- return ret;
3900
- }
3901
-
3902
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3903
- INIT_LIST_HEAD(&space_info->block_groups[i]);
3904
- init_rwsem(&space_info->groups_sem);
3905
- spin_lock_init(&space_info->lock);
3906
- space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3907
- space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3908
- init_waitqueue_head(&space_info->wait);
3909
- INIT_LIST_HEAD(&space_info->ro_bgs);
3910
- INIT_LIST_HEAD(&space_info->tickets);
3911
- INIT_LIST_HEAD(&space_info->priority_tickets);
3912
-
3913
- ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
3914
- info->space_info_kobj, "%s",
3915
- alloc_name(space_info->flags));
3916
- if (ret) {
3917
- kobject_put(&space_info->kobj);
3918
- return ret;
3919
- }
3920
-
3921
- list_add_rcu(&space_info->list, &info->space_info);
3922
- if (flags & BTRFS_BLOCK_GROUP_DATA)
3923
- info->data_sinfo = space_info;
3924
-
3925
- return ret;
3926
-}
3927
-
3928
-static void update_space_info(struct btrfs_fs_info *info, u64 flags,
3929
- u64 total_bytes, u64 bytes_used,
3930
- u64 bytes_readonly,
3931
- struct btrfs_space_info **space_info)
3932
-{
3933
- struct btrfs_space_info *found;
3934
- int factor;
3935
-
3936
- factor = btrfs_bg_type_to_factor(flags);
3937
-
3938
- found = __find_space_info(info, flags);
3939
- ASSERT(found);
3940
- spin_lock(&found->lock);
3941
- found->total_bytes += total_bytes;
3942
- found->disk_total += total_bytes * factor;
3943
- found->bytes_used += bytes_used;
3944
- found->disk_used += bytes_used * factor;
3945
- found->bytes_readonly += bytes_readonly;
3946
- if (total_bytes > 0)
3947
- found->full = 0;
3948
- space_info_add_new_bytes(info, found, total_bytes -
3949
- bytes_used - bytes_readonly);
3950
- spin_unlock(&found->lock);
3951
- *space_info = found;
3952
-}
3953
-
3954
-static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3955
-{
3956
- u64 extra_flags = chunk_to_extended(flags) &
3957
- BTRFS_EXTENDED_PROFILE_MASK;
3958
-
3959
- write_seqlock(&fs_info->profiles_lock);
3960
- if (flags & BTRFS_BLOCK_GROUP_DATA)
3961
- fs_info->avail_data_alloc_bits |= extra_flags;
3962
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
3963
- fs_info->avail_metadata_alloc_bits |= extra_flags;
3964
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3965
- fs_info->avail_system_alloc_bits |= extra_flags;
3966
- write_sequnlock(&fs_info->profiles_lock);
3967
-}
3968
-
3969
-/*
3970
- * returns target flags in extended format or 0 if restripe for this
3971
- * chunk_type is not in progress
3972
- *
3973
- * should be called with balance_lock held
3974
- */
3975
-static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3976
-{
3977
- struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3978
- u64 target = 0;
3979
-
3980
- if (!bctl)
3981
- return 0;
3982
-
3983
- if (flags & BTRFS_BLOCK_GROUP_DATA &&
3984
- bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3985
- target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3986
- } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3987
- bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3988
- target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3989
- } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3990
- bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3991
- target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3992
- }
3993
-
3994
- return target;
3995
-}
3996
-
3997
-/*
3998
- * @flags: available profiles in extended format (see ctree.h)
3999
- *
4000
- * Returns reduced profile in chunk format. If profile changing is in
4001
- * progress (either running or paused) picks the target profile (if it's
4002
- * already available), otherwise falls back to plain reducing.
4003
- */
4004
-static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
4005
-{
4006
- u64 num_devices = fs_info->fs_devices->rw_devices;
4007
- u64 target;
4008
- u64 raid_type;
4009
- u64 allowed = 0;
4010
-
4011
- /*
4012
- * see if restripe for this chunk_type is in progress, if so
4013
- * try to reduce to the target profile
4014
- */
4015
- spin_lock(&fs_info->balance_lock);
4016
- target = get_restripe_target(fs_info, flags);
4017
- if (target) {
4018
- /* pick target profile only if it's already available */
4019
- if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4020
- spin_unlock(&fs_info->balance_lock);
4021
- return extended_to_chunk(target);
4022
- }
4023
- }
4024
- spin_unlock(&fs_info->balance_lock);
4025
-
4026
- /* First, mask out the RAID levels which aren't possible */
4027
- for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4028
- if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4029
- allowed |= btrfs_raid_array[raid_type].bg_flag;
4030
- }
4031
- allowed &= flags;
4032
-
4033
- if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4034
- allowed = BTRFS_BLOCK_GROUP_RAID6;
4035
- else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4036
- allowed = BTRFS_BLOCK_GROUP_RAID5;
4037
- else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4038
- allowed = BTRFS_BLOCK_GROUP_RAID10;
4039
- else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4040
- allowed = BTRFS_BLOCK_GROUP_RAID1;
4041
- else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4042
- allowed = BTRFS_BLOCK_GROUP_RAID0;
4043
-
4044
- flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4045
-
4046
- return extended_to_chunk(flags | allowed);
4047
-}
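A worked reduction for the function above, with two rw devices and an available-profile mask that contains both RAID1 and RAID0:

	/*
	 * Example: flags = DATA | RAID1 | RAID0, rw_devices = 2
	 *   devs_min mask keeps both RAID1 and RAID0        -> allowed = RAID1 | RAID0
	 *   priority RAID6 > RAID5 > RAID10 > RAID1 > RAID0 -> allowed = RAID1
	 *   result: extended_to_chunk(DATA | RAID1)
	 */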
4048
-
4049
-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4050
-{
4051
- unsigned seq;
4052
- u64 flags;
4053
-
4054
- do {
4055
- flags = orig_flags;
4056
- seq = read_seqbegin(&fs_info->profiles_lock);
4057
-
4058
- if (flags & BTRFS_BLOCK_GROUP_DATA)
4059
- flags |= fs_info->avail_data_alloc_bits;
4060
- else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4061
- flags |= fs_info->avail_system_alloc_bits;
4062
- else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4063
- flags |= fs_info->avail_metadata_alloc_bits;
4064
- } while (read_seqretry(&fs_info->profiles_lock, seq));
4065
-
4066
- return btrfs_reduce_alloc_profile(fs_info, flags);
40672493 }
40682494
40692495 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
....@@ -4079,2091 +2505,13 @@
40792505 else
40802506 flags = BTRFS_BLOCK_GROUP_METADATA;
40812507
4082
- ret = get_alloc_profile(fs_info, flags);
2508
+ ret = btrfs_get_alloc_profile(fs_info, flags);
40832509 return ret;
4084
-}
4085
-
4086
-u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4087
-{
4088
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4089
-}
4090
-
4091
-u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4092
-{
4093
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4094
-}
4095
-
4096
-u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4097
-{
4098
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4099
-}
4100
-
4101
-static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4102
- bool may_use_included)
4103
-{
4104
- ASSERT(s_info);
4105
- return s_info->bytes_used + s_info->bytes_reserved +
4106
- s_info->bytes_pinned + s_info->bytes_readonly +
4107
- (may_use_included ? s_info->bytes_may_use : 0);
4108
-}
4109
-
4110
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
4111
-{
4112
- struct btrfs_root *root = inode->root;
4113
- struct btrfs_fs_info *fs_info = root->fs_info;
4114
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
4115
- u64 used;
4116
- int ret = 0;
4117
- int need_commit = 2;
4118
- int have_pinned_space;
4119
-
4120
- /* make sure bytes are sectorsize aligned */
4121
- bytes = ALIGN(bytes, fs_info->sectorsize);
4122
-
4123
- if (btrfs_is_free_space_inode(inode)) {
4124
- need_commit = 0;
4125
- ASSERT(current->journal_info);
4126
- }
4127
-
4128
-again:
4129
- /* make sure we have enough space to handle the data first */
4130
- spin_lock(&data_sinfo->lock);
4131
- used = btrfs_space_info_used(data_sinfo, true);
4132
-
4133
- if (used + bytes > data_sinfo->total_bytes) {
4134
- struct btrfs_trans_handle *trans;
4135
-
4136
- /*
4137
- * if we don't have enough free bytes in this space then we need
4138
- * to alloc a new chunk.
4139
- */
4140
- if (!data_sinfo->full) {
4141
- u64 alloc_target;
4142
-
4143
- data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4144
- spin_unlock(&data_sinfo->lock);
4145
-
4146
- alloc_target = btrfs_data_alloc_profile(fs_info);
4147
- /*
4148
- * It is ugly that we don't call nolock join
4149
- * transaction for the free space inode case here.
4150
- * But it is safe because we only do the data space
4151
- * reservation for the free space cache in the
4152
- * transaction context, the common join transaction
4153
- * just increase the counter of the current transaction
4154
- * handler, doesn't try to acquire the trans_lock of
4155
- * the fs.
4156
- */
4157
- trans = btrfs_join_transaction(root);
4158
- if (IS_ERR(trans))
4159
- return PTR_ERR(trans);
4160
-
4161
- ret = do_chunk_alloc(trans, alloc_target,
4162
- CHUNK_ALLOC_NO_FORCE);
4163
- btrfs_end_transaction(trans);
4164
- if (ret < 0) {
4165
- if (ret != -ENOSPC)
4166
- return ret;
4167
- else {
4168
- have_pinned_space = 1;
4169
- goto commit_trans;
4170
- }
4171
- }
4172
-
4173
- goto again;
4174
- }
4175
-
4176
- /*
4177
- * If we don't have enough pinned space to deal with this
4178
- * allocation, and no removed chunk in current transaction,
4179
- * don't bother committing the transaction.
4180
- */
4181
- have_pinned_space = __percpu_counter_compare(
4182
- &data_sinfo->total_bytes_pinned,
4183
- used + bytes - data_sinfo->total_bytes,
4184
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
4185
- spin_unlock(&data_sinfo->lock);
4186
-
4187
- /* commit the current transaction and try again */
4188
-commit_trans:
4189
- if (need_commit) {
4190
- need_commit--;
4191
-
4192
- if (need_commit > 0) {
4193
- btrfs_start_delalloc_roots(fs_info, -1);
4194
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4195
- (u64)-1);
4196
- }
4197
-
4198
- trans = btrfs_join_transaction(root);
4199
- if (IS_ERR(trans))
4200
- return PTR_ERR(trans);
4201
- if (have_pinned_space >= 0 ||
4202
- test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4203
- &trans->transaction->flags) ||
4204
- need_commit > 0) {
4205
- ret = btrfs_commit_transaction(trans);
4206
- if (ret)
4207
- return ret;
4208
- /*
4209
- * The cleaner kthread might still be doing iput
4210
- * operations. Wait for it to finish so that
4211
- * more space is released.
4212
- */
4213
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
4214
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
4215
- goto again;
4216
- } else {
4217
- btrfs_end_transaction(trans);
4218
- }
4219
- }
4220
-
4221
- trace_btrfs_space_reservation(fs_info,
4222
- "space_info:enospc",
4223
- data_sinfo->flags, bytes, 1);
4224
- return -ENOSPC;
4225
- }
4226
- data_sinfo->bytes_may_use += bytes;
4227
- trace_btrfs_space_reservation(fs_info, "space_info",
4228
- data_sinfo->flags, bytes, 1);
4229
- spin_unlock(&data_sinfo->lock);
4230
-
4231
- return 0;
4232
-}
4233
-
4234
-int btrfs_check_data_free_space(struct inode *inode,
4235
- struct extent_changeset **reserved, u64 start, u64 len)
4236
-{
4237
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4238
- int ret;
4239
-
4240
- /* align the range */
4241
- len = round_up(start + len, fs_info->sectorsize) -
4242
- round_down(start, fs_info->sectorsize);
4243
- start = round_down(start, fs_info->sectorsize);
4244
-
4245
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4246
- if (ret < 0)
4247
- return ret;
4248
-
4249
- /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4250
- ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4251
- if (ret < 0)
4252
- btrfs_free_reserved_data_space_noquota(inode, start, len);
4253
- else
4254
- ret = 0;
4255
- return ret;
4256
-}
4257
-
4258
-/*
4259
- * Called if we need to clear a data reservation for this inode
4260
- * Normally in a error case.
4261
- *
4262
- * This one will *NOT* use accurate qgroup reserved space API, just for case
4263
- * which we can't sleep and is sure it won't affect qgroup reserved space.
4264
- * Like clear_bit_hook().
4265
- */
4266
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4267
- u64 len)
4268
-{
4269
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4270
- struct btrfs_space_info *data_sinfo;
4271
-
4272
- /* Make sure the range is aligned to sectorsize */
4273
- len = round_up(start + len, fs_info->sectorsize) -
4274
- round_down(start, fs_info->sectorsize);
4275
- start = round_down(start, fs_info->sectorsize);
4276
-
4277
- data_sinfo = fs_info->data_sinfo;
4278
- spin_lock(&data_sinfo->lock);
4279
- if (WARN_ON(data_sinfo->bytes_may_use < len))
4280
- data_sinfo->bytes_may_use = 0;
4281
- else
4282
- data_sinfo->bytes_may_use -= len;
4283
- trace_btrfs_space_reservation(fs_info, "space_info",
4284
- data_sinfo->flags, len, 0);
4285
- spin_unlock(&data_sinfo->lock);
4286
-}
4287
-
4288
-/*
4289
- * Called if we need to clear a data reservation for this inode
4290
- * Normally in a error case.
4291
- *
4292
- * This one will handle the per-inode data rsv map for accurate reserved
4293
- * space framework.
4294
- */
4295
-void btrfs_free_reserved_data_space(struct inode *inode,
4296
- struct extent_changeset *reserved, u64 start, u64 len)
4297
-{
4298
- struct btrfs_root *root = BTRFS_I(inode)->root;
4299
-
4300
- /* Make sure the range is aligned to sectorsize */
4301
- len = round_up(start + len, root->fs_info->sectorsize) -
4302
- round_down(start, root->fs_info->sectorsize);
4303
- start = round_down(start, root->fs_info->sectorsize);
4304
-
4305
- btrfs_free_reserved_data_space_noquota(inode, start, len);
4306
- btrfs_qgroup_free_data(inode, reserved, start, len);
4307
-}
4308
-
4309
-static void force_metadata_allocation(struct btrfs_fs_info *info)
4310
-{
4311
- struct list_head *head = &info->space_info;
4312
- struct btrfs_space_info *found;
4313
-
4314
- rcu_read_lock();
4315
- list_for_each_entry_rcu(found, head, list) {
4316
- if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4317
- found->force_alloc = CHUNK_ALLOC_FORCE;
4318
- }
4319
- rcu_read_unlock();
4320
-}
4321
-
4322
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4323
-{
4324
- return (global->size << 1);
4325
-}
4326
-
4327
-static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4328
- struct btrfs_space_info *sinfo, int force)
4329
-{
4330
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4331
- u64 bytes_used = btrfs_space_info_used(sinfo, false);
4332
- u64 thresh;
4333
-
4334
- if (force == CHUNK_ALLOC_FORCE)
4335
- return 1;
4336
-
4337
- /*
4338
- * We need to take into account the global rsv because for all intents
4339
- * and purposes it's used space. Don't worry about locking the
4340
- * global_rsv, it doesn't change except when the transaction commits.
4341
- */
4342
- if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4343
- bytes_used += calc_global_rsv_need_space(global_rsv);
4344
-
4345
- /*
4346
- * in limited mode, we want to have some free space up to
4347
- * about 1% of the FS size.
4348
- */
4349
- if (force == CHUNK_ALLOC_LIMITED) {
4350
- thresh = btrfs_super_total_bytes(fs_info->super_copy);
4351
- thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4352
-
4353
- if (sinfo->total_bytes - bytes_used < thresh)
4354
- return 1;
4355
- }
4356
-
4357
- if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4358
- return 0;
4359
- return 1;
4360
-}
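Concrete numbers for the thresholds above, assuming a 1TiB filesystem:

	/*
	 * CHUNK_ALLOC_LIMITED: thresh = max(SZ_64M, 1% of 1TiB) ~= 10.2GiB,
	 *   so a chunk is allocated while this space_info still covers less
	 *   than ~10.2GiB of free space (total_bytes - bytes_used < thresh).
	 *
	 * CHUNK_ALLOC_NO_FORCE: allocation only proceeds once
	 *   bytes_used + 2MiB >= 80% of total_bytes (div_factor(total, 8)).
	 */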
4361
-
4362
-static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4363
-{
4364
- u64 num_dev;
4365
-
4366
- if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4367
- BTRFS_BLOCK_GROUP_RAID0 |
4368
- BTRFS_BLOCK_GROUP_RAID5 |
4369
- BTRFS_BLOCK_GROUP_RAID6))
4370
- num_dev = fs_info->fs_devices->rw_devices;
4371
- else if (type & BTRFS_BLOCK_GROUP_RAID1)
4372
- num_dev = 2;
4373
- else
4374
- num_dev = 1; /* DUP or single */
4375
-
4376
- return num_dev;
4377
-}
4378
-
4379
-/*
4380
- * If @is_allocation is true, reserve space in the system space info necessary
4381
- * for allocating a chunk, otherwise if it's false, reserve space necessary for
4382
- * removing a chunk.
4383
- */
4384
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4385
-{
4386
- struct btrfs_fs_info *fs_info = trans->fs_info;
4387
- struct btrfs_space_info *info;
4388
- u64 left;
4389
- u64 thresh;
4390
- int ret = 0;
4391
- u64 num_devs;
4392
-
4393
- /*
4394
- * Needed because we can end up allocating a system chunk and for an
4395
- * atomic and race free space reservation in the chunk block reserve.
4396
- */
4397
- lockdep_assert_held(&fs_info->chunk_mutex);
4398
-
4399
- info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4400
- spin_lock(&info->lock);
4401
- left = info->total_bytes - btrfs_space_info_used(info, true);
4402
- spin_unlock(&info->lock);
4403
-
4404
- num_devs = get_profile_num_devs(fs_info, type);
4405
-
4406
- /* num_devs device items to update and 1 chunk item to add or remove */
4407
- thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4408
- btrfs_calc_trans_metadata_size(fs_info, 1);
4409
-
4410
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4411
- btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4412
- left, thresh, type);
4413
- dump_space_info(fs_info, info, 0, 0);
4414
- }
4415
-
4416
- if (left < thresh) {
4417
- u64 flags = btrfs_system_alloc_profile(fs_info);
4418
-
4419
- /*
4420
- * Ignore failure to create system chunk. We might end up not
4421
- * needing it, as we might not need to COW all nodes/leafs from
4422
- * the paths we visit in the chunk tree (they were already COWed
4423
- * or created in the current transaction for example).
4424
- */
4425
- ret = btrfs_alloc_chunk(trans, flags);
4426
- }
4427
-
4428
- if (!ret) {
4429
- ret = btrfs_block_rsv_add(fs_info->chunk_root,
4430
- &fs_info->chunk_block_rsv,
4431
- thresh, BTRFS_RESERVE_NO_FLUSH);
4432
- if (!ret)
4433
- trans->chunk_bytes_reserved += thresh;
4434
- }
4435
-}
4436
-
4437
-/*
4438
- * If force is CHUNK_ALLOC_FORCE:
4439
- * - return 1 if it successfully allocates a chunk,
4440
- * - return errors including -ENOSPC otherwise.
4441
- * If force is NOT CHUNK_ALLOC_FORCE:
4442
- * - return 0 if it doesn't need to allocate a new chunk,
4443
- * - return 1 if it successfully allocates a chunk,
4444
- * - return errors including -ENOSPC otherwise.
4445
- */
4446
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4447
- int force)
4448
-{
4449
- struct btrfs_fs_info *fs_info = trans->fs_info;
4450
- struct btrfs_space_info *space_info;
4451
- bool wait_for_alloc = false;
4452
- bool should_alloc = false;
4453
- int ret = 0;
4454
-
4455
- /* Don't re-enter if we're already allocating a chunk */
4456
- if (trans->allocating_chunk)
4457
- return -ENOSPC;
4458
-
4459
- space_info = __find_space_info(fs_info, flags);
4460
- ASSERT(space_info);
4461
-
4462
- do {
4463
- spin_lock(&space_info->lock);
4464
- if (force < space_info->force_alloc)
4465
- force = space_info->force_alloc;
4466
- should_alloc = should_alloc_chunk(fs_info, space_info, force);
4467
- if (space_info->full) {
4468
- /* No more free physical space */
4469
- if (should_alloc)
4470
- ret = -ENOSPC;
4471
- else
4472
- ret = 0;
4473
- spin_unlock(&space_info->lock);
4474
- return ret;
4475
- } else if (!should_alloc) {
4476
- spin_unlock(&space_info->lock);
4477
- return 0;
4478
- } else if (space_info->chunk_alloc) {
4479
- /*
4480
- * Someone is already allocating, so we need to block
4481
- * until this someone is finished and then loop to
4482
- * recheck if we should continue with our allocation
4483
- * attempt.
4484
- */
4485
- wait_for_alloc = true;
4486
- spin_unlock(&space_info->lock);
4487
- mutex_lock(&fs_info->chunk_mutex);
4488
- mutex_unlock(&fs_info->chunk_mutex);
4489
- } else {
4490
- /* Proceed with allocation */
4491
- space_info->chunk_alloc = 1;
4492
- wait_for_alloc = false;
4493
- spin_unlock(&space_info->lock);
4494
- }
4495
-
4496
- cond_resched();
4497
- } while (wait_for_alloc);
4498
-
4499
- mutex_lock(&fs_info->chunk_mutex);
4500
- trans->allocating_chunk = true;
4501
-
4502
- /*
4503
- * If we have mixed data/metadata chunks we want to make sure we keep
4504
- * allocating mixed chunks instead of individual chunks.
4505
- */
4506
- if (btrfs_mixed_space_info(space_info))
4507
- flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4508
-
4509
- /*
4510
- * if we're doing a data chunk, go ahead and make sure that
4511
- * we keep a reasonable number of metadata chunks allocated in the
4512
- * FS as well.
4513
- */
4514
- if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4515
- fs_info->data_chunk_allocations++;
4516
- if (!(fs_info->data_chunk_allocations %
4517
- fs_info->metadata_ratio))
4518
- force_metadata_allocation(fs_info);
4519
- }
4520
-
4521
- /*
4522
- * Check if we have enough space in SYSTEM chunk because we may need
4523
- * to update devices.
4524
- */
4525
- check_system_chunk(trans, flags);
4526
-
4527
- ret = btrfs_alloc_chunk(trans, flags);
4528
- trans->allocating_chunk = false;
4529
-
4530
- spin_lock(&space_info->lock);
4531
- if (ret < 0) {
4532
- if (ret == -ENOSPC)
4533
- space_info->full = 1;
4534
- else
4535
- goto out;
4536
- } else {
4537
- ret = 1;
4538
- space_info->max_extent_size = 0;
4539
- }
4540
-
4541
- space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4542
-out:
4543
- space_info->chunk_alloc = 0;
4544
- spin_unlock(&space_info->lock);
4545
- mutex_unlock(&fs_info->chunk_mutex);
4546
- /*
4547
- * When we allocate a new chunk we reserve space in the chunk block
4548
- * reserve to make sure we can COW nodes/leafs in the chunk tree or
4549
- * add new nodes/leafs to it if we end up needing to do it when
4550
- * inserting the chunk item and updating device items as part of the
4551
- * second phase of chunk allocation, performed by
4552
- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4553
- * large number of new block groups to create in our transaction
4554
- * handle's new_bgs list to avoid exhausting the chunk block reserve
4555
- * in extreme cases - like having a single transaction create many new
4556
- * block groups when starting to write out the free space caches of all
4557
- * the block groups that were made dirty during the lifetime of the
4558
- * transaction.
4559
- */
4560
- if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4561
- btrfs_create_pending_block_groups(trans);
4562
-
4563
- return ret;
4564
-}
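The return convention documented above is what the callers in this file key off. A condensed sketch of the pattern used by the data reservation path earlier in this diff (error handling trimmed, labels as in that path):

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = do_chunk_alloc(trans, btrfs_data_alloc_profile(fs_info),
			     CHUNK_ALLOC_NO_FORCE);
	btrfs_end_transaction(trans);

	if (ret >= 0)
		goto again;		/* chunk allocated (1) or not needed (0): retry */
	else if (ret == -ENOSPC)
		goto commit_trans;	/* no room for a new chunk: reclaim pinned space */
	else
		return ret;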
4565
-
4566
-static int can_overcommit(struct btrfs_fs_info *fs_info,
4567
- struct btrfs_space_info *space_info, u64 bytes,
4568
- enum btrfs_reserve_flush_enum flush,
4569
- bool system_chunk)
4570
-{
4571
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4572
- u64 profile;
4573
- u64 space_size;
4574
- u64 avail;
4575
- u64 used;
4576
- int factor;
4577
-
4578
- /* Don't overcommit when in mixed mode. */
4579
- if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4580
- return 0;
4581
-
4582
- if (system_chunk)
4583
- profile = btrfs_system_alloc_profile(fs_info);
4584
- else
4585
- profile = btrfs_metadata_alloc_profile(fs_info);
4586
-
4587
- used = btrfs_space_info_used(space_info, false);
4588
-
4589
- /*
4590
- * We only want to allow over committing if we have lots of actual space
4591
- * free, but if we don't have enough space to handle the global reserve
4592
- * space then we could end up having a real enospc problem when trying
4593
- * to allocate a chunk or some other such important allocation.
4594
- */
4595
- spin_lock(&global_rsv->lock);
4596
- space_size = calc_global_rsv_need_space(global_rsv);
4597
- spin_unlock(&global_rsv->lock);
4598
- if (used + space_size >= space_info->total_bytes)
4599
- return 0;
4600
-
4601
- used += space_info->bytes_may_use;
4602
-
4603
- avail = atomic64_read(&fs_info->free_chunk_space);
4604
-
4605
- /*
4606
- * If we have dup, raid1 or raid10 then only half of the free
4607
- * space is actually useable. For raid56, the space info used
4608
- * doesn't include the parity drive, so we don't have to
4609
- * change the math
4610
- */
4611
- factor = btrfs_bg_type_to_factor(profile);
4612
- avail = div_u64(avail, factor);
4613
-
4614
- /*
4615
- * If we aren't flushing all things, let us overcommit up to
4616
- * 1/2th of the space. If we can flush, don't let us overcommit
4617
- * too much, let it overcommit up to 1/8 of the space.
4618
- */
4619
- if (flush == BTRFS_RESERVE_FLUSH_ALL)
4620
- avail >>= 3;
4621
- else
4622
- avail >>= 1;
4623
-
4624
- if (used + bytes < space_info->total_bytes + avail)
4625
- return 1;
4626
- return 0;
4627
-}
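To make the overcommit math above concrete (the numbers are illustrative):

	/*
	 * metadata profile RAID1 -> factor 2, free_chunk_space = 8GiB:
	 *   avail = 8GiB / 2 = 4GiB
	 *   BTRFS_RESERVE_FLUSH_ALL:  avail >>= 3  -> 512MiB
	 *   other flush modes:        avail >>= 1  -> 2GiB
	 * The reservation is allowed while
	 *   used + bytes < space_info->total_bytes + avail.
	 */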
4628
-
4629
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4630
- unsigned long nr_pages, int nr_items)
4631
-{
4632
- struct super_block *sb = fs_info->sb;
4633
-
4634
- if (down_read_trylock(&sb->s_umount)) {
4635
- writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4636
- up_read(&sb->s_umount);
4637
- } else {
4638
- /*
4639
- * We needn't worry the filesystem going from r/w to r/o though
4640
- * we don't acquire ->s_umount mutex, because the filesystem
4641
- * should guarantee the delalloc inodes list be empty after
4642
- * the filesystem is readonly(all dirty pages are written to
4643
- * the disk).
4644
- */
4645
- btrfs_start_delalloc_roots(fs_info, nr_items);
4646
- if (!current->journal_info)
4647
- btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4648
- }
4649
-}
4650
-
4651
-static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4652
- u64 to_reclaim)
4653
-{
4654
- u64 bytes;
4655
- u64 nr;
4656
-
4657
- bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4658
- nr = div64_u64(to_reclaim, bytes);
4659
- if (!nr)
4660
- nr = 1;
4661
- return nr;
4662
-}
4663
-
4664
-#define EXTENT_SIZE_PER_ITEM SZ_256K
4665
-
4666
-/*
4667
- * shrink metadata reservation for delalloc
4668
- */
4669
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4670
- u64 orig, bool wait_ordered)
4671
-{
4672
- struct btrfs_space_info *space_info;
4673
- struct btrfs_trans_handle *trans;
4674
- u64 delalloc_bytes;
4675
- u64 max_reclaim;
4676
- u64 items;
4677
- long time_left;
4678
- unsigned long nr_pages;
4679
- int loops;
4680
-
4681
- /* Calc the number of the pages we need flush for space reservation */
4682
- items = calc_reclaim_items_nr(fs_info, to_reclaim);
4683
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4684
-
4685
- trans = (struct btrfs_trans_handle *)current->journal_info;
4686
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4687
-
4688
- delalloc_bytes = percpu_counter_sum_positive(
4689
- &fs_info->delalloc_bytes);
4690
- if (delalloc_bytes == 0) {
4691
- if (trans)
4692
- return;
4693
- if (wait_ordered)
4694
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4695
- return;
4696
- }
4697
-
4698
- loops = 0;
4699
- while (delalloc_bytes && loops < 3) {
4700
- max_reclaim = min(delalloc_bytes, to_reclaim);
4701
- nr_pages = max_reclaim >> PAGE_SHIFT;
4702
- btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4703
- /*
4704
- * We need to wait for the async pages to actually start before
4705
- * we do anything.
4706
- */
4707
- max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
4708
- if (!max_reclaim)
4709
- goto skip_async;
4710
-
4711
- if (max_reclaim <= nr_pages)
4712
- max_reclaim = 0;
4713
- else
4714
- max_reclaim -= nr_pages;
4715
-
4716
- wait_event(fs_info->async_submit_wait,
4717
- atomic_read(&fs_info->async_delalloc_pages) <=
4718
- (int)max_reclaim);
4719
-skip_async:
4720
- spin_lock(&space_info->lock);
4721
- if (list_empty(&space_info->tickets) &&
4722
- list_empty(&space_info->priority_tickets)) {
4723
- spin_unlock(&space_info->lock);
4724
- break;
4725
- }
4726
- spin_unlock(&space_info->lock);
4727
-
4728
- loops++;
4729
- if (wait_ordered && !trans) {
4730
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4731
- } else {
4732
- time_left = schedule_timeout_killable(1);
4733
- if (time_left)
4734
- break;
4735
- }
4736
- delalloc_bytes = percpu_counter_sum_positive(
4737
- &fs_info->delalloc_bytes);
4738
- }
4739
-}
4740
-
4741
-struct reserve_ticket {
4742
- u64 bytes;
4743
- int error;
4744
- struct list_head list;
4745
- wait_queue_head_t wait;
4746
-};
4747
-
4748
-/**
4749
- * maybe_commit_transaction - possibly commit the transaction if its ok to
4750
- * @root - the root we're allocating for
4751
- * @bytes - the number of bytes we want to reserve
4752
- * @force - force the commit
4753
- *
4754
- * This will check to make sure that committing the transaction will actually
4755
- * get us somewhere and then commit the transaction if it does. Otherwise it
4756
- * will return -ENOSPC.
4757
- */
4758
-static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4759
- struct btrfs_space_info *space_info)
4760
-{
4761
- struct reserve_ticket *ticket = NULL;
4762
- struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4763
- struct btrfs_trans_handle *trans;
4764
- u64 bytes;
4765
-
4766
- trans = (struct btrfs_trans_handle *)current->journal_info;
4767
- if (trans)
4768
- return -EAGAIN;
4769
-
4770
- spin_lock(&space_info->lock);
4771
- if (!list_empty(&space_info->priority_tickets))
4772
- ticket = list_first_entry(&space_info->priority_tickets,
4773
- struct reserve_ticket, list);
4774
- else if (!list_empty(&space_info->tickets))
4775
- ticket = list_first_entry(&space_info->tickets,
4776
- struct reserve_ticket, list);
4777
- bytes = (ticket) ? ticket->bytes : 0;
4778
- spin_unlock(&space_info->lock);
4779
-
4780
- if (!bytes)
4781
- return 0;
4782
-
4783
- /* See if there is enough pinned space to make this reservation */
4784
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4785
- bytes,
4786
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4787
- goto commit;
4788
-
4789
- /*
4790
- * See if there is some space in the delayed insertion reservation for
4791
- * this reservation.
4792
- */
4793
- if (space_info != delayed_rsv->space_info)
4794
- return -ENOSPC;
4795
-
4796
- spin_lock(&delayed_rsv->lock);
4797
- if (delayed_rsv->size > bytes)
4798
- bytes = 0;
4799
- else
4800
- bytes -= delayed_rsv->size;
4801
- spin_unlock(&delayed_rsv->lock);
4802
-
4803
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4804
- bytes,
4805
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
4806
- return -ENOSPC;
4807
- }
4808
-
4809
-commit:
4810
- trans = btrfs_join_transaction(fs_info->extent_root);
4811
- if (IS_ERR(trans))
4812
- return -ENOSPC;
4813
-
4814
- return btrfs_commit_transaction(trans);
4815
-}
4816
-
4817
-/*
4818
- * Try to flush some data based on policy set by @state. This is only advisory
4819
- * and may fail for various reasons. The caller is supposed to examine the
4820
- * state of @space_info to detect the outcome.
4821
- */
4822
-static void flush_space(struct btrfs_fs_info *fs_info,
4823
- struct btrfs_space_info *space_info, u64 num_bytes,
4824
- int state)
4825
-{
4826
- struct btrfs_root *root = fs_info->extent_root;
4827
- struct btrfs_trans_handle *trans;
4828
- int nr;
4829
- int ret = 0;
4830
-
4831
- switch (state) {
4832
- case FLUSH_DELAYED_ITEMS_NR:
4833
- case FLUSH_DELAYED_ITEMS:
4834
- if (state == FLUSH_DELAYED_ITEMS_NR)
4835
- nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4836
- else
4837
- nr = -1;
4838
-
4839
- trans = btrfs_join_transaction(root);
4840
- if (IS_ERR(trans)) {
4841
- ret = PTR_ERR(trans);
4842
- break;
4843
- }
4844
- ret = btrfs_run_delayed_items_nr(trans, nr);
4845
- btrfs_end_transaction(trans);
4846
- break;
4847
- case FLUSH_DELALLOC:
4848
- case FLUSH_DELALLOC_WAIT:
4849
- shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4850
- state == FLUSH_DELALLOC_WAIT);
4851
- break;
4852
- case ALLOC_CHUNK:
4853
- trans = btrfs_join_transaction(root);
4854
- if (IS_ERR(trans)) {
4855
- ret = PTR_ERR(trans);
4856
- break;
4857
- }
4858
- ret = do_chunk_alloc(trans,
4859
- btrfs_metadata_alloc_profile(fs_info),
4860
- CHUNK_ALLOC_NO_FORCE);
4861
- btrfs_end_transaction(trans);
4862
- if (ret > 0 || ret == -ENOSPC)
4863
- ret = 0;
4864
- break;
4865
- case COMMIT_TRANS:
4866
- ret = may_commit_transaction(fs_info, space_info);
4867
- break;
4868
- default:
4869
- ret = -ENOSPC;
4870
- break;
4871
- }
4872
-
4873
- trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
4874
- ret);
4875
- return;
4876
-}
4877
-
4878
-static inline u64
4879
-btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4880
- struct btrfs_space_info *space_info,
4881
- bool system_chunk)
4882
-{
4883
- struct reserve_ticket *ticket;
4884
- u64 used;
4885
- u64 expected;
4886
- u64 to_reclaim = 0;
4887
-
4888
- list_for_each_entry(ticket, &space_info->tickets, list)
4889
- to_reclaim += ticket->bytes;
4890
- list_for_each_entry(ticket, &space_info->priority_tickets, list)
4891
- to_reclaim += ticket->bytes;
4892
- if (to_reclaim)
4893
- return to_reclaim;
4894
-
4895
- to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4896
- if (can_overcommit(fs_info, space_info, to_reclaim,
4897
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4898
- return 0;
4899
-
4900
- used = btrfs_space_info_used(space_info, true);
4901
-
4902
- if (can_overcommit(fs_info, space_info, SZ_1M,
4903
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4904
- expected = div_factor_fine(space_info->total_bytes, 95);
4905
- else
4906
- expected = div_factor_fine(space_info->total_bytes, 90);
4907
-
4908
- if (used > expected)
4909
- to_reclaim = used - expected;
4910
- else
4911
- to_reclaim = 0;
4912
- to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4913
- space_info->bytes_reserved);
4914
- return to_reclaim;
4915
-}
4916
-
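
With no tickets queued, the removed btrfs_calc_reclaim_metadata_size() falls back to reclaiming whatever usage exceeds roughly 90-95% of the space_info, clamped to what can actually be reclaimed (bytes_may_use + bytes_reserved). A stand-alone sketch of that arithmetic; div_factor_fine() mirrors the kernel helper (num * factor / 100) and the numbers in main() are invented:

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as the kernel's div_factor_fine(): num * factor / 100. */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
        return num * factor / 100;
}

static uint64_t calc_reclaim_target(uint64_t total, uint64_t used,
                                    uint64_t may_use, uint64_t reserved,
                                    int can_still_overcommit)
{
        uint64_t expected = div_factor_fine(total,
                                            can_still_overcommit ? 95 : 90);
        uint64_t to_reclaim = used > expected ? used - expected : 0;

        /* only bytes_may_use and bytes_reserved can actually be reclaimed */
        if (to_reclaim > may_use + reserved)
                to_reclaim = may_use + reserved;
        return to_reclaim;
}

int main(void)
{
        uint64_t total = 10ULL << 30;           /* 10 GiB of metadata space */
        uint64_t used = 9800ULL << 20;          /* roughly 9.6 GiB in use */

        printf("%llu bytes to reclaim\n", (unsigned long long)
               calc_reclaim_target(total, used, 1ULL << 30, 512ULL << 20, 1));
        return 0;
}
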
4917
-static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
4918
- struct btrfs_space_info *space_info,
4919
- u64 used, bool system_chunk)
4920
-{
4921
- u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4922
-
4923
- /* If we're just plain full then async reclaim just slows us down. */
4924
- if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4925
- return 0;
4926
-
4927
- if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4928
- system_chunk))
4929
- return 0;
4930
-
4931
- return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4932
- !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4933
-}
4934
-
4935
-static void wake_all_tickets(struct list_head *head)
4936
-{
4937
- struct reserve_ticket *ticket;
4938
-
4939
- while (!list_empty(head)) {
4940
- ticket = list_first_entry(head, struct reserve_ticket, list);
4941
- list_del_init(&ticket->list);
4942
- ticket->error = -ENOSPC;
4943
- wake_up(&ticket->wait);
4944
- }
4945
-}
4946
-
4947
-/*
4948
- * This is for normal flushers, we can wait all goddamned day if we want to. We
4949
- * will loop and continuously try to flush as long as we are making progress.
4950
- * We count progress as clearing off tickets each time we have to loop.
4951
- */
4952
-static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4953
-{
4954
- struct btrfs_fs_info *fs_info;
4955
- struct btrfs_space_info *space_info;
4956
- u64 to_reclaim;
4957
- int flush_state;
4958
- int commit_cycles = 0;
4959
- u64 last_tickets_id;
4960
-
4961
- fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4962
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4963
-
4964
- spin_lock(&space_info->lock);
4965
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4966
- false);
4967
- if (!to_reclaim) {
4968
- space_info->flush = 0;
4969
- spin_unlock(&space_info->lock);
4970
- return;
4971
- }
4972
- last_tickets_id = space_info->tickets_id;
4973
- spin_unlock(&space_info->lock);
4974
-
4975
- flush_state = FLUSH_DELAYED_ITEMS_NR;
4976
- do {
4977
- flush_space(fs_info, space_info, to_reclaim, flush_state);
4978
- spin_lock(&space_info->lock);
4979
- if (list_empty(&space_info->tickets)) {
4980
- space_info->flush = 0;
4981
- spin_unlock(&space_info->lock);
4982
- return;
4983
- }
4984
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
4985
- space_info,
4986
- false);
4987
- if (last_tickets_id == space_info->tickets_id) {
4988
- flush_state++;
4989
- } else {
4990
- last_tickets_id = space_info->tickets_id;
4991
- flush_state = FLUSH_DELAYED_ITEMS_NR;
4992
- if (commit_cycles)
4993
- commit_cycles--;
4994
- }
4995
-
4996
- if (flush_state > COMMIT_TRANS) {
4997
- commit_cycles++;
4998
- if (commit_cycles > 2) {
4999
- wake_all_tickets(&space_info->tickets);
5000
- space_info->flush = 0;
5001
- } else {
5002
- flush_state = FLUSH_DELAYED_ITEMS_NR;
5003
- }
5004
- }
5005
- spin_unlock(&space_info->lock);
5006
- } while (flush_state <= COMMIT_TRANS);
5007
-}
5008
-
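
The async reclaim worker above climbs a fixed ladder of flush states, restarting from the cheapest state whenever a ticket is satisfied, and gives up (failing the remaining tickets with -ENOSPC) once it has gone past COMMIT_TRANS three times without progress. A toy model of just that state progression; the enum lists the states handled by the removed flush_space(), and no real flushing happens here:

#include <stdio.h>

enum flush_state {
        FLUSH_DELAYED_ITEMS_NR = 1,
        FLUSH_DELAYED_ITEMS,
        FLUSH_DELALLOC,
        FLUSH_DELALLOC_WAIT,
        ALLOC_CHUNK,
        COMMIT_TRANS,
};

int main(void)
{
        int state = FLUSH_DELAYED_ITEMS_NR;
        int commit_cycles = 0;

        /* Pretend no flush ever satisfies a ticket: climb the ladder until
         * the commit_cycles > 2 cutoff gives up, as the worker above does. */
        do {
                printf("flushing with state %d (cycle %d)\n",
                       state, commit_cycles);
                state++;
                if (state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
                                printf("giving up, waking tickets with ENOSPC\n");
                                break;
                        }
                        state = FLUSH_DELAYED_ITEMS_NR;
                }
        } while (state <= COMMIT_TRANS);
        return 0;
}
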
5009
-void btrfs_init_async_reclaim_work(struct work_struct *work)
5010
-{
5011
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5012
-}
5013
-
5014
-static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5015
- struct btrfs_space_info *space_info,
5016
- struct reserve_ticket *ticket)
5017
-{
5018
- u64 to_reclaim;
5019
- int flush_state = FLUSH_DELAYED_ITEMS_NR;
5020
-
5021
- spin_lock(&space_info->lock);
5022
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5023
- false);
5024
- if (!to_reclaim) {
5025
- spin_unlock(&space_info->lock);
5026
- return;
5027
- }
5028
- spin_unlock(&space_info->lock);
5029
-
5030
- do {
5031
- flush_space(fs_info, space_info, to_reclaim, flush_state);
5032
- flush_state++;
5033
- spin_lock(&space_info->lock);
5034
- if (ticket->bytes == 0) {
5035
- spin_unlock(&space_info->lock);
5036
- return;
5037
- }
5038
- spin_unlock(&space_info->lock);
5039
-
5040
- /*
5041
- * Priority flushers can't wait on delalloc without
5042
- * deadlocking.
5043
- */
5044
- if (flush_state == FLUSH_DELALLOC ||
5045
- flush_state == FLUSH_DELALLOC_WAIT)
5046
- flush_state = ALLOC_CHUNK;
5047
- } while (flush_state < COMMIT_TRANS);
5048
-}
5049
-
5050
-static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5051
- struct btrfs_space_info *space_info,
5052
- struct reserve_ticket *ticket, u64 orig_bytes)
5053
-
5054
-{
5055
- DEFINE_WAIT(wait);
5056
- int ret = 0;
5057
-
5058
- spin_lock(&space_info->lock);
5059
- while (ticket->bytes > 0 && ticket->error == 0) {
5060
- ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5061
- if (ret) {
5062
- ret = -EINTR;
5063
- break;
5064
- }
5065
- spin_unlock(&space_info->lock);
5066
-
5067
- schedule();
5068
-
5069
- finish_wait(&ticket->wait, &wait);
5070
- spin_lock(&space_info->lock);
5071
- }
5072
- if (!ret)
5073
- ret = ticket->error;
5074
- if (!list_empty(&ticket->list))
5075
- list_del_init(&ticket->list);
5076
- if (ticket->bytes && ticket->bytes < orig_bytes) {
5077
- u64 num_bytes = orig_bytes - ticket->bytes;
5078
- space_info->bytes_may_use -= num_bytes;
5079
- trace_btrfs_space_reservation(fs_info, "space_info",
5080
- space_info->flags, num_bytes, 0);
5081
- }
5082
- spin_unlock(&space_info->lock);
5083
-
5084
- return ret;
5085
-}
5086
-
5087
-/**
5088
- * __reserve_metadata_bytes - try to reserve bytes from the space_info's space
5089
- * @root - the root we're allocating for
5090
- * @space_info - the space info we want to allocate from
5091
- * @orig_bytes - the number of bytes we want
5092
- * @flush - whether or not we can flush to make our reservation
5093
- *
5094
- * This will reserve orig_bytes number of bytes from the space info associated
5095
- * with the block_rsv. If there is not enough space it will make an attempt to
5096
- * flush out space to make room. It will do this by flushing delalloc if
5097
- * possible or committing the transaction. If flush is 0 then no attempts to
5098
- * regain reservations will be made and this will fail if there is not enough
5099
- * space already.
5100
- */
5101
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5102
- struct btrfs_space_info *space_info,
5103
- u64 orig_bytes,
5104
- enum btrfs_reserve_flush_enum flush,
5105
- bool system_chunk)
5106
-{
5107
- struct reserve_ticket ticket;
5108
- u64 used;
5109
- int ret = 0;
5110
-
5111
- ASSERT(orig_bytes);
5112
- ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5113
-
5114
- spin_lock(&space_info->lock);
5115
- ret = -ENOSPC;
5116
- used = btrfs_space_info_used(space_info, true);
5117
-
5118
- /*
5119
- * If we have enough space then hooray, make our reservation and carry
5120
- * on. If not see if we can overcommit, and if we can, hooray carry on.
5121
- * If not things get more complicated.
5122
- */
5123
- if (used + orig_bytes <= space_info->total_bytes) {
5124
- space_info->bytes_may_use += orig_bytes;
5125
- trace_btrfs_space_reservation(fs_info, "space_info",
5126
- space_info->flags, orig_bytes, 1);
5127
- ret = 0;
5128
- } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5129
- system_chunk)) {
5130
- space_info->bytes_may_use += orig_bytes;
5131
- trace_btrfs_space_reservation(fs_info, "space_info",
5132
- space_info->flags, orig_bytes, 1);
5133
- ret = 0;
5134
- }
5135
-
5136
- /*
5137
- * If we couldn't make a reservation then setup our reservation ticket
5138
- * and kick the async worker if it's not already running.
5139
- *
5140
- * If we are a priority flusher then we just need to add our ticket to
5141
- * the list and we will do our own flushing further down.
5142
- */
5143
- if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5144
- ticket.bytes = orig_bytes;
5145
- ticket.error = 0;
5146
- init_waitqueue_head(&ticket.wait);
5147
- if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5148
- list_add_tail(&ticket.list, &space_info->tickets);
5149
- if (!space_info->flush) {
5150
- space_info->flush = 1;
5151
- trace_btrfs_trigger_flush(fs_info,
5152
- space_info->flags,
5153
- orig_bytes, flush,
5154
- "enospc");
5155
- queue_work(system_unbound_wq,
5156
- &fs_info->async_reclaim_work);
5157
- }
5158
- } else {
5159
- list_add_tail(&ticket.list,
5160
- &space_info->priority_tickets);
5161
- }
5162
- } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5163
- used += orig_bytes;
5164
- /*
5165
- * We will do the space reservation dance during log replay,
5166
- * which means we won't have fs_info->fs_root set, so don't do
5167
- * the async reclaim as we will panic.
5168
- */
5169
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5170
- need_do_async_reclaim(fs_info, space_info,
5171
- used, system_chunk) &&
5172
- !work_busy(&fs_info->async_reclaim_work)) {
5173
- trace_btrfs_trigger_flush(fs_info, space_info->flags,
5174
- orig_bytes, flush, "preempt");
5175
- queue_work(system_unbound_wq,
5176
- &fs_info->async_reclaim_work);
5177
- }
5178
- }
5179
- spin_unlock(&space_info->lock);
5180
- if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5181
- return ret;
5182
-
5183
- if (flush == BTRFS_RESERVE_FLUSH_ALL)
5184
- return wait_reserve_ticket(fs_info, space_info, &ticket,
5185
- orig_bytes);
5186
-
5187
- ret = 0;
5188
- priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5189
- spin_lock(&space_info->lock);
5190
- if (ticket.bytes) {
5191
- if (ticket.bytes < orig_bytes) {
5192
- u64 num_bytes = orig_bytes - ticket.bytes;
5193
- space_info->bytes_may_use -= num_bytes;
5194
- trace_btrfs_space_reservation(fs_info, "space_info",
5195
- space_info->flags,
5196
- num_bytes, 0);
5197
-
5198
- }
5199
- list_del_init(&ticket.list);
5200
- ret = -ENOSPC;
5201
- }
5202
- spin_unlock(&space_info->lock);
5203
- ASSERT(list_empty(&ticket.list));
5204
- return ret;
5205
-}
5206
-
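
The reservation fast path above is plain accounting: if used + orig_bytes still fits in total_bytes (or overcommit allows it), bump bytes_may_use and return; otherwise queue a ticket and flush. A user-space model of that fast path with a simplified space_info struct (overcommit and ticketing are left out):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Modeled space_info accounting; a stand-in, not the kernel struct. */
struct space_info_model {
        uint64_t total_bytes;
        uint64_t bytes_used;
        uint64_t bytes_reserved;
        uint64_t bytes_pinned;
        uint64_t bytes_readonly;
        uint64_t bytes_may_use;
};

static uint64_t space_info_used(const struct space_info_model *s)
{
        return s->bytes_used + s->bytes_reserved + s->bytes_pinned +
               s->bytes_readonly + s->bytes_may_use;
}

/* Returns true if the reservation fits outright; a real caller would
 * otherwise try overcommit and then queue a reserve_ticket. */
static bool try_reserve(struct space_info_model *s, uint64_t bytes)
{
        if (space_info_used(s) + bytes <= s->total_bytes) {
                s->bytes_may_use += bytes;
                return true;
        }
        return false;
}

int main(void)
{
        struct space_info_model s = { .total_bytes = 1ULL << 30,
                                      .bytes_used = 700ULL << 20 };

        printf("reserved: %d\n", try_reserve(&s, 200ULL << 20)); /* fits */
        printf("reserved: %d\n", try_reserve(&s, 200ULL << 20)); /* must flush */
        return 0;
}
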
5207
-/**
5208
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5209
- * @root - the root we're allocating for
5210
- * @block_rsv - the block_rsv we're allocating for
5211
- * @orig_bytes - the number of bytes we want
5212
- * @flush - whether or not we can flush to make our reservation
5213
- *
5214
- * This will reserve orig_bytes number of bytes from the space info associated
5215
- * with the block_rsv. If there is not enough space it will make an attempt to
5216
- * flush out space to make room. It will do this by flushing delalloc if
5217
- * possible or committing the transaction. If flush is 0 then no attempts to
5218
- * regain reservations will be made and this will fail if there is not enough
5219
- * space already.
5220
- */
5221
-static int reserve_metadata_bytes(struct btrfs_root *root,
5222
- struct btrfs_block_rsv *block_rsv,
5223
- u64 orig_bytes,
5224
- enum btrfs_reserve_flush_enum flush)
5225
-{
5226
- struct btrfs_fs_info *fs_info = root->fs_info;
5227
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5228
- int ret;
5229
- bool system_chunk = (root == fs_info->chunk_root);
5230
-
5231
- ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5232
- orig_bytes, flush, system_chunk);
5233
- if (ret == -ENOSPC &&
5234
- unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5235
- if (block_rsv != global_rsv &&
5236
- !block_rsv_use_bytes(global_rsv, orig_bytes))
5237
- ret = 0;
5238
- }
5239
- if (ret == -ENOSPC) {
5240
- trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5241
- block_rsv->space_info->flags,
5242
- orig_bytes, 1);
5243
-
5244
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5245
- dump_space_info(fs_info, block_rsv->space_info,
5246
- orig_bytes, 0);
5247
- }
5248
- return ret;
5249
-}
5250
-
5251
-static struct btrfs_block_rsv *get_block_rsv(
5252
- const struct btrfs_trans_handle *trans,
5253
- const struct btrfs_root *root)
5254
-{
5255
- struct btrfs_fs_info *fs_info = root->fs_info;
5256
- struct btrfs_block_rsv *block_rsv = NULL;
5257
-
5258
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5259
- (root == fs_info->csum_root && trans->adding_csums) ||
5260
- (root == fs_info->uuid_root))
5261
- block_rsv = trans->block_rsv;
5262
-
5263
- if (!block_rsv)
5264
- block_rsv = root->block_rsv;
5265
-
5266
- if (!block_rsv)
5267
- block_rsv = &fs_info->empty_block_rsv;
5268
-
5269
- return block_rsv;
5270
-}
5271
-
5272
-static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5273
- u64 num_bytes)
5274
-{
5275
- int ret = -ENOSPC;
5276
- spin_lock(&block_rsv->lock);
5277
- if (block_rsv->reserved >= num_bytes) {
5278
- block_rsv->reserved -= num_bytes;
5279
- if (block_rsv->reserved < block_rsv->size)
5280
- block_rsv->full = 0;
5281
- ret = 0;
5282
- }
5283
- spin_unlock(&block_rsv->lock);
5284
- return ret;
5285
-}
5286
-
5287
-static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5288
- u64 num_bytes, int update_size)
5289
-{
5290
- spin_lock(&block_rsv->lock);
5291
- block_rsv->reserved += num_bytes;
5292
- if (update_size)
5293
- block_rsv->size += num_bytes;
5294
- else if (block_rsv->reserved >= block_rsv->size)
5295
- block_rsv->full = 1;
5296
- spin_unlock(&block_rsv->lock);
5297
-}
5298
-
5299
-int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5300
- struct btrfs_block_rsv *dest, u64 num_bytes,
5301
- int min_factor)
5302
-{
5303
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5304
- u64 min_bytes;
5305
-
5306
- if (global_rsv->space_info != dest->space_info)
5307
- return -ENOSPC;
5308
-
5309
- spin_lock(&global_rsv->lock);
5310
- min_bytes = div_factor(global_rsv->size, min_factor);
5311
- if (global_rsv->reserved < min_bytes + num_bytes) {
5312
- spin_unlock(&global_rsv->lock);
5313
- return -ENOSPC;
5314
- }
5315
- global_rsv->reserved -= num_bytes;
5316
- if (global_rsv->reserved < global_rsv->size)
5317
- global_rsv->full = 0;
5318
- spin_unlock(&global_rsv->lock);
5319
-
5320
- block_rsv_add_bytes(dest, num_bytes, 1);
5321
- return 0;
5322
-}
5323
-
5324
-/*
5325
- * This is for space we already have accounted in space_info->bytes_may_use, so
5326
- * basically when we're returning space from block_rsv's.
5327
- */
5328
-static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5329
- struct btrfs_space_info *space_info,
5330
- u64 num_bytes)
5331
-{
5332
- struct reserve_ticket *ticket;
5333
- struct list_head *head;
5334
- u64 used;
5335
- enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5336
- bool check_overcommit = false;
5337
-
5338
- spin_lock(&space_info->lock);
5339
- head = &space_info->priority_tickets;
5340
-
5341
- /*
5342
- * If we are over our limit then we need to check and see if we can
5343
- * overcommit, and if we can't then we just need to free up our space
5344
- * and not satisfy any requests.
5345
- */
5346
- used = btrfs_space_info_used(space_info, true);
5347
- if (used - num_bytes >= space_info->total_bytes)
5348
- check_overcommit = true;
5349
-again:
5350
- while (!list_empty(head) && num_bytes) {
5351
- ticket = list_first_entry(head, struct reserve_ticket,
5352
- list);
5353
- /*
5354
- * We use 0 bytes because this space is already reserved, so
5355
- * adding the ticket space would be a double count.
5356
- */
5357
- if (check_overcommit &&
5358
- !can_overcommit(fs_info, space_info, 0, flush, false))
5359
- break;
5360
- if (num_bytes >= ticket->bytes) {
5361
- list_del_init(&ticket->list);
5362
- num_bytes -= ticket->bytes;
5363
- ticket->bytes = 0;
5364
- space_info->tickets_id++;
5365
- wake_up(&ticket->wait);
5366
- } else {
5367
- ticket->bytes -= num_bytes;
5368
- num_bytes = 0;
5369
- }
5370
- }
5371
-
5372
- if (num_bytes && head == &space_info->priority_tickets) {
5373
- head = &space_info->tickets;
5374
- flush = BTRFS_RESERVE_FLUSH_ALL;
5375
- goto again;
5376
- }
5377
- space_info->bytes_may_use -= num_bytes;
5378
- trace_btrfs_space_reservation(fs_info, "space_info",
5379
- space_info->flags, num_bytes, 0);
5380
- spin_unlock(&space_info->lock);
5381
-}
5382
-
5383
-/*
5384
- * This is for newly allocated space that isn't accounted in
5385
- * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
5386
- * we use this helper.
5387
- */
5388
-static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5389
- struct btrfs_space_info *space_info,
5390
- u64 num_bytes)
5391
-{
5392
- struct reserve_ticket *ticket;
5393
- struct list_head *head = &space_info->priority_tickets;
5394
-
5395
-again:
5396
- while (!list_empty(head) && num_bytes) {
5397
- ticket = list_first_entry(head, struct reserve_ticket,
5398
- list);
5399
- if (num_bytes >= ticket->bytes) {
5400
- trace_btrfs_space_reservation(fs_info, "space_info",
5401
- space_info->flags,
5402
- ticket->bytes, 1);
5403
- list_del_init(&ticket->list);
5404
- num_bytes -= ticket->bytes;
5405
- space_info->bytes_may_use += ticket->bytes;
5406
- ticket->bytes = 0;
5407
- space_info->tickets_id++;
5408
- wake_up(&ticket->wait);
5409
- } else {
5410
- trace_btrfs_space_reservation(fs_info, "space_info",
5411
- space_info->flags,
5412
- num_bytes, 1);
5413
- space_info->bytes_may_use += num_bytes;
5414
- ticket->bytes -= num_bytes;
5415
- num_bytes = 0;
5416
- }
5417
- }
5418
-
5419
- if (num_bytes && head == &space_info->priority_tickets) {
5420
- head = &space_info->tickets;
5421
- goto again;
5422
- }
5423
-}
5424
-
5425
-static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5426
- struct btrfs_block_rsv *block_rsv,
5427
- struct btrfs_block_rsv *dest, u64 num_bytes,
5428
- u64 *qgroup_to_release_ret)
5429
-{
5430
- struct btrfs_space_info *space_info = block_rsv->space_info;
5431
- u64 qgroup_to_release = 0;
5432
- u64 ret;
5433
-
5434
- spin_lock(&block_rsv->lock);
5435
- if (num_bytes == (u64)-1) {
5436
- num_bytes = block_rsv->size;
5437
- qgroup_to_release = block_rsv->qgroup_rsv_size;
5438
- }
5439
- block_rsv->size -= num_bytes;
5440
- if (block_rsv->reserved >= block_rsv->size) {
5441
- num_bytes = block_rsv->reserved - block_rsv->size;
5442
- block_rsv->reserved = block_rsv->size;
5443
- block_rsv->full = 1;
5444
- } else {
5445
- num_bytes = 0;
5446
- }
5447
- if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5448
- qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5449
- block_rsv->qgroup_rsv_size;
5450
- block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5451
- } else {
5452
- qgroup_to_release = 0;
5453
- }
5454
- spin_unlock(&block_rsv->lock);
5455
-
5456
- ret = num_bytes;
5457
- if (num_bytes > 0) {
5458
- if (dest) {
5459
- spin_lock(&dest->lock);
5460
- if (!dest->full) {
5461
- u64 bytes_to_add;
5462
-
5463
- bytes_to_add = dest->size - dest->reserved;
5464
- bytes_to_add = min(num_bytes, bytes_to_add);
5465
- dest->reserved += bytes_to_add;
5466
- if (dest->reserved >= dest->size)
5467
- dest->full = 1;
5468
- num_bytes -= bytes_to_add;
5469
- }
5470
- spin_unlock(&dest->lock);
5471
- }
5472
- if (num_bytes)
5473
- space_info_add_old_bytes(fs_info, space_info,
5474
- num_bytes);
5475
- }
5476
- if (qgroup_to_release_ret)
5477
- *qgroup_to_release_ret = qgroup_to_release;
5478
- return ret;
5479
-}
5480
-
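
block_rsv_release_bytes(), removed above, first trims the rsv's target size and then returns any reserved bytes above the new size, either to a destination rsv or back to the space_info. The size/reserved trimming is the core of it; a small sketch without the locking, qgroup and space_info parts:

#include <stdint.h>
#include <stdio.h>

/* Minimal model of a block reservation: a target size and what is reserved. */
struct block_rsv_model {
        uint64_t size;
        uint64_t reserved;
        int full;
};

/* Mirrors the size/reserved trimming of the removed block_rsv_release_bytes(). */
static uint64_t rsv_shrink(struct block_rsv_model *rsv, uint64_t num_bytes)
{
        uint64_t excess = 0;

        if (num_bytes == (uint64_t)-1)
                num_bytes = rsv->size;
        rsv->size -= num_bytes;
        if (rsv->reserved >= rsv->size) {
                excess = rsv->reserved - rsv->size;
                rsv->reserved = rsv->size;
                rsv->full = 1;
        }
        return excess;  /* bytes handed back to the global rsv / space_info */
}

int main(void)
{
        struct block_rsv_model rsv = { .size = 8 << 20, .reserved = 6 << 20 };

        printf("excess %llu\n", (unsigned long long)rsv_shrink(&rsv, 4 << 20));
        return 0;
}
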
5481
-int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5482
- struct btrfs_block_rsv *dst, u64 num_bytes,
5483
- int update_size)
5484
-{
5485
- int ret;
5486
-
5487
- ret = block_rsv_use_bytes(src, num_bytes);
5488
- if (ret)
5489
- return ret;
5490
-
5491
- block_rsv_add_bytes(dst, num_bytes, update_size);
5492
- return 0;
5493
-}
5494
-
5495
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5496
-{
5497
- memset(rsv, 0, sizeof(*rsv));
5498
- spin_lock_init(&rsv->lock);
5499
- rsv->type = type;
5500
-}
5501
-
5502
-void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5503
- struct btrfs_block_rsv *rsv,
5504
- unsigned short type)
5505
-{
5506
- btrfs_init_block_rsv(rsv, type);
5507
- rsv->space_info = __find_space_info(fs_info,
5508
- BTRFS_BLOCK_GROUP_METADATA);
5509
-}
5510
-
5511
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5512
- unsigned short type)
5513
-{
5514
- struct btrfs_block_rsv *block_rsv;
5515
-
5516
- block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5517
- if (!block_rsv)
5518
- return NULL;
5519
-
5520
- btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5521
- return block_rsv;
5522
-}
5523
-
5524
-void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5525
- struct btrfs_block_rsv *rsv)
5526
-{
5527
- if (!rsv)
5528
- return;
5529
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5530
- kfree(rsv);
5531
-}
5532
-
5533
-int btrfs_block_rsv_add(struct btrfs_root *root,
5534
- struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5535
- enum btrfs_reserve_flush_enum flush)
5536
-{
5537
- int ret;
5538
-
5539
- if (num_bytes == 0)
5540
- return 0;
5541
-
5542
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5543
- if (!ret) {
5544
- block_rsv_add_bytes(block_rsv, num_bytes, 1);
5545
- return 0;
5546
- }
5547
-
5548
- return ret;
5549
-}
5550
-
5551
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5552
-{
5553
- u64 num_bytes = 0;
5554
- int ret = -ENOSPC;
5555
-
5556
- if (!block_rsv)
5557
- return 0;
5558
-
5559
- spin_lock(&block_rsv->lock);
5560
- num_bytes = div_factor(block_rsv->size, min_factor);
5561
- if (block_rsv->reserved >= num_bytes)
5562
- ret = 0;
5563
- spin_unlock(&block_rsv->lock);
5564
-
5565
- return ret;
5566
-}
5567
-
5568
-int btrfs_block_rsv_refill(struct btrfs_root *root,
5569
- struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5570
- enum btrfs_reserve_flush_enum flush)
5571
-{
5572
- u64 num_bytes = 0;
5573
- int ret = -ENOSPC;
5574
-
5575
- if (!block_rsv)
5576
- return 0;
5577
-
5578
- spin_lock(&block_rsv->lock);
5579
- num_bytes = min_reserved;
5580
- if (block_rsv->reserved >= num_bytes)
5581
- ret = 0;
5582
- else
5583
- num_bytes -= block_rsv->reserved;
5584
- spin_unlock(&block_rsv->lock);
5585
-
5586
- if (!ret)
5587
- return 0;
5588
-
5589
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5590
- if (!ret) {
5591
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
5592
- return 0;
5593
- }
5594
-
5595
- return ret;
5596
-}
5597
-
5598
-/**
5599
- * btrfs_inode_rsv_refill - refill the inode block rsv.
5600
- * @inode - the inode we are refilling.
5601
- * @flush - the flushing restriction.
5602
- *
5603
- * Essentially the same as btrfs_block_rsv_refill, except it uses the
5604
- * block_rsv->size as the minimum size. We'll either refill the missing amount
5605
- * or return if we already have enough space. This will also handle the reserve
5606
- * tracepoint for the reserved amount.
5607
- */
5608
-static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5609
- enum btrfs_reserve_flush_enum flush)
5610
-{
5611
- struct btrfs_root *root = inode->root;
5612
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5613
- u64 num_bytes = 0;
5614
- u64 qgroup_num_bytes = 0;
5615
- int ret = -ENOSPC;
5616
-
5617
- spin_lock(&block_rsv->lock);
5618
- if (block_rsv->reserved < block_rsv->size)
5619
- num_bytes = block_rsv->size - block_rsv->reserved;
5620
- if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
5621
- qgroup_num_bytes = block_rsv->qgroup_rsv_size -
5622
- block_rsv->qgroup_rsv_reserved;
5623
- spin_unlock(&block_rsv->lock);
5624
-
5625
- if (num_bytes == 0)
5626
- return 0;
5627
-
5628
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
5629
- if (ret)
5630
- return ret;
5631
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5632
- if (!ret) {
5633
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
5634
- trace_btrfs_space_reservation(root->fs_info, "delalloc",
5635
- btrfs_ino(inode), num_bytes, 1);
5636
-
5637
- /* Don't forget to increase qgroup_rsv_reserved */
5638
- spin_lock(&block_rsv->lock);
5639
- block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
5640
- spin_unlock(&block_rsv->lock);
5641
- } else
5642
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5643
- return ret;
5644
-}
5645
-
5646
-/**
5647
- * btrfs_inode_rsv_release - release any excessive reservation.
5648
- * @inode - the inode we need to release from.
5649
- * @qgroup_free - free or convert qgroup meta.
5650
- * Unlike normal operation, qgroup meta reservation needs to know if we are
5651
- * freeing qgroup reservation or just converting it into per-trans. Normally
5652
- * @qgroup_free is true for error handling, and false for normal release.
5653
- *
5654
- * This is the same as btrfs_block_rsv_release, except that it handles the
5655
- * tracepoint for the reservation.
5656
- */
5657
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5658
-{
5659
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5660
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5661
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5662
- u64 released = 0;
5663
- u64 qgroup_to_release = 0;
5664
-
5665
- /*
5666
- * Since we statically set the block_rsv->size we just want to say we
5667
- * are releasing 0 bytes, and then we'll just get the reservation over
5668
- * the size free'd.
5669
- */
5670
- released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
5671
- &qgroup_to_release);
5672
- if (released > 0)
5673
- trace_btrfs_space_reservation(fs_info, "delalloc",
5674
- btrfs_ino(inode), released, 0);
5675
- if (qgroup_free)
5676
- btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5677
- else
5678
- btrfs_qgroup_convert_reserved_meta(inode->root,
5679
- qgroup_to_release);
5680
-}
5681
-
5682
-void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5683
- struct btrfs_block_rsv *block_rsv,
5684
- u64 num_bytes)
5685
-{
5686
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5687
-
5688
- if (global_rsv == block_rsv ||
5689
- block_rsv->space_info != global_rsv->space_info)
5690
- global_rsv = NULL;
5691
- block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
5692
-}
5693
-
5694
-static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5695
-{
5696
- struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5697
- struct btrfs_space_info *sinfo = block_rsv->space_info;
5698
- u64 num_bytes;
5699
-
5700
- /*
5701
- * The global block rsv is based on the size of the extent tree, the
5702
- * checksum tree and the root tree. If the fs is empty we want to set
5703
- * it to a minimal amount for safety.
5704
- */
5705
- num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5706
- btrfs_root_used(&fs_info->csum_root->root_item) +
5707
- btrfs_root_used(&fs_info->tree_root->root_item);
5708
- num_bytes = max_t(u64, num_bytes, SZ_16M);
5709
-
5710
- spin_lock(&sinfo->lock);
5711
- spin_lock(&block_rsv->lock);
5712
-
5713
- block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5714
-
5715
- if (block_rsv->reserved < block_rsv->size) {
5716
- num_bytes = btrfs_space_info_used(sinfo, true);
5717
- if (sinfo->total_bytes > num_bytes) {
5718
- num_bytes = sinfo->total_bytes - num_bytes;
5719
- num_bytes = min(num_bytes,
5720
- block_rsv->size - block_rsv->reserved);
5721
- block_rsv->reserved += num_bytes;
5722
- sinfo->bytes_may_use += num_bytes;
5723
- trace_btrfs_space_reservation(fs_info, "space_info",
5724
- sinfo->flags, num_bytes,
5725
- 1);
5726
- }
5727
- } else if (block_rsv->reserved > block_rsv->size) {
5728
- num_bytes = block_rsv->reserved - block_rsv->size;
5729
- sinfo->bytes_may_use -= num_bytes;
5730
- trace_btrfs_space_reservation(fs_info, "space_info",
5731
- sinfo->flags, num_bytes, 0);
5732
- block_rsv->reserved = block_rsv->size;
5733
- }
5734
-
5735
- if (block_rsv->reserved == block_rsv->size)
5736
- block_rsv->full = 1;
5737
- else
5738
- block_rsv->full = 0;
5739
-
5740
- spin_unlock(&block_rsv->lock);
5741
- spin_unlock(&sinfo->lock);
5742
-}
5743
-
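
The removed update_global_block_rsv() sizes the global reserve from the extent, csum and root tree usage, clamped between 16 MiB and 512 MiB, and then tops up its reserved bytes from whatever is still free in the metadata space_info. A tiny sketch of the clamp; the tree sizes in main() are invented:

#include <stdint.h>
#include <stdio.h>

#define SZ_16M  (16ULL * 1024 * 1024)
#define SZ_512M (512ULL * 1024 * 1024)

/* Clamp used by the removed update_global_block_rsv(). */
static uint64_t global_rsv_size(uint64_t extent_root_used,
                                uint64_t csum_root_used,
                                uint64_t tree_root_used)
{
        uint64_t num_bytes = extent_root_used + csum_root_used +
                             tree_root_used;

        if (num_bytes < SZ_16M)
                num_bytes = SZ_16M;
        if (num_bytes > SZ_512M)
                num_bytes = SZ_512M;
        return num_bytes;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)
               global_rsv_size(3ULL << 20, 1ULL << 20, 1ULL << 20));
        printf("%llu\n", (unsigned long long)
               global_rsv_size(600ULL << 20, 80ULL << 20, 4ULL << 20));
        return 0;
}
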
5744
-static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5745
-{
5746
- struct btrfs_space_info *space_info;
5747
-
5748
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5749
- fs_info->chunk_block_rsv.space_info = space_info;
5750
-
5751
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5752
- fs_info->global_block_rsv.space_info = space_info;
5753
- fs_info->trans_block_rsv.space_info = space_info;
5754
- fs_info->empty_block_rsv.space_info = space_info;
5755
- fs_info->delayed_block_rsv.space_info = space_info;
5756
-
5757
- fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5758
- fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5759
- fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5760
- fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5761
- if (fs_info->quota_root)
5762
- fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5763
- fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5764
-
5765
- update_global_block_rsv(fs_info);
5766
-}
5767
-
5768
-static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5769
-{
5770
- block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5771
- (u64)-1, NULL);
5772
- WARN_ON(fs_info->trans_block_rsv.size > 0);
5773
- WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5774
- WARN_ON(fs_info->chunk_block_rsv.size > 0);
5775
- WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5776
- WARN_ON(fs_info->delayed_block_rsv.size > 0);
5777
- WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5778
-}
5779
-
5780
-
5781
-/*
5782
- * To be called after all the new block groups attached to the transaction
5783
- * handle have been created (btrfs_create_pending_block_groups()).
5784
- */
5785
-void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5786
-{
5787
- struct btrfs_fs_info *fs_info = trans->fs_info;
5788
-
5789
- if (!trans->chunk_bytes_reserved)
5790
- return;
5791
-
5792
- WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5793
-
5794
- block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5795
- trans->chunk_bytes_reserved, NULL);
5796
- trans->chunk_bytes_reserved = 0;
5797
-}
5798
-
5799
-/*
5800
- * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5801
- * root: the root of the parent directory
5802
- * rsv: block reservation
5803
- * items: the number of items that we need to reserve
5804
- * use_global_rsv: allow fallback to the global block reservation
5805
- *
5806
- * This function is used to reserve the space for snapshot/subvolume
5807
- * creation and deletion. Those operations are different from the
5808
- * common file/directory operations: they change two fs/file trees
5809
- * and the root tree, and the number of items that the qgroup reserves is
5810
- * different from the free space reservation. So we cannot use
5811
- * the space reservation mechanism in start_transaction().
5812
- */
5813
-int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5814
- struct btrfs_block_rsv *rsv, int items,
5815
- bool use_global_rsv)
5816
-{
5817
- u64 qgroup_num_bytes = 0;
5818
- u64 num_bytes;
5819
- int ret;
5820
- struct btrfs_fs_info *fs_info = root->fs_info;
5821
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5822
-
5823
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
5824
- /* One for parent inode, two for dir entries */
5825
- qgroup_num_bytes = 3 * fs_info->nodesize;
5826
- ret = btrfs_qgroup_reserve_meta_prealloc(root,
5827
- qgroup_num_bytes, true);
5828
- if (ret)
5829
- return ret;
5830
- }
5831
-
5832
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
5833
- rsv->space_info = __find_space_info(fs_info,
5834
- BTRFS_BLOCK_GROUP_METADATA);
5835
- ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5836
- BTRFS_RESERVE_FLUSH_ALL);
5837
-
5838
- if (ret == -ENOSPC && use_global_rsv)
5839
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
5840
-
5841
- if (ret && qgroup_num_bytes)
5842
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5843
-
5844
- return ret;
5845
-}
5846
-
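
Reserving for a snapshot or subvolume above amounts to one qgroup prealloc of 3 * nodesize (the parent inode plus two dir entries) plus the normal per-item metadata reservation. A sketch of the byte math only; calc_trans_metadata_size() below assumes the usual "nodesize * 2 * BTRFS_MAX_LEVEL per item" definition of btrfs_calc_trans_metadata_size(), which is worth double-checking against ctree.h for this kernel:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8

/* Assumed per-item metadata cost: a full tree path, CoWed, in two trees. */
static uint64_t calc_trans_metadata_size(uint32_t nodesize, unsigned items)
{
        return (uint64_t)nodesize * 2 * BTRFS_MAX_LEVEL * items;
}

int main(void)
{
        uint32_t nodesize = 16384;      /* default nodesize */
        unsigned items = 8;             /* say, items for a snapshot creation */

        /* one for the parent inode, two for the dir entries */
        uint64_t qgroup_bytes = 3ULL * nodesize;
        uint64_t meta_bytes = calc_trans_metadata_size(nodesize, items);

        printf("qgroup prealloc: %llu bytes, metadata rsv: %llu bytes\n",
               (unsigned long long)qgroup_bytes,
               (unsigned long long)meta_bytes);
        return 0;
}
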
5847
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5848
- struct btrfs_block_rsv *rsv)
5849
-{
5850
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5851
-}
5852
-
5853
-static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
5854
- struct btrfs_inode *inode)
5855
-{
5856
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5857
- u64 reserve_size = 0;
5858
- u64 qgroup_rsv_size = 0;
5859
- u64 csum_leaves;
5860
- unsigned outstanding_extents;
5861
-
5862
- lockdep_assert_held(&inode->lock);
5863
- outstanding_extents = inode->outstanding_extents;
5864
- if (outstanding_extents)
5865
- reserve_size = btrfs_calc_trans_metadata_size(fs_info,
5866
- outstanding_extents + 1);
5867
- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
5868
- inode->csum_bytes);
5869
- reserve_size += btrfs_calc_trans_metadata_size(fs_info,
5870
- csum_leaves);
5871
- /*
5872
- * For qgroup rsv, the calculation is very simple:
5873
- * account one nodesize for each outstanding extent
5874
- *
5875
- * This is overestimating in most cases.
5876
- */
5877
- qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
5878
-
5879
- spin_lock(&block_rsv->lock);
5880
- block_rsv->size = reserve_size;
5881
- block_rsv->qgroup_rsv_size = qgroup_rsv_size;
5882
- spin_unlock(&block_rsv->lock);
5883
-}
5884
-
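
The inode rsv size computed above is (outstanding_extents + 1) items of metadata plus enough leaves to hold the checksums for csum_bytes, while the qgroup side is simply one nodesize per outstanding extent. A user-space sketch of the shape of that calculation; csum_leaves_for() is a simplified stand-in for btrfs_csum_bytes_to_leaves(), not the kernel formula, and the per-item cost carries the same assumption as the previous sketch:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8

static uint64_t calc_trans_metadata_size(uint32_t nodesize, uint64_t items)
{
        /* assumed per-item cost: a CoWed path in two trees */
        return (uint64_t)nodesize * 2 * BTRFS_MAX_LEVEL * items;
}

/* Simplified: how many leaves the checksums for `bytes` might need. */
static uint64_t csum_leaves_for(uint64_t bytes, uint64_t csums_per_leaf,
                                uint32_t sectorsize)
{
        uint64_t csums = (bytes + sectorsize - 1) / sectorsize;

        return (csums + csums_per_leaf - 1) / csums_per_leaf;
}

int main(void)
{
        uint32_t nodesize = 16384, sectorsize = 4096;
        uint64_t outstanding_extents = 4, csum_bytes = 8ULL << 20;
        uint64_t csums_per_leaf = 2000; /* rough, depends on leaf layout */

        uint64_t reserve = calc_trans_metadata_size(nodesize,
                                                    outstanding_extents + 1);
        reserve += calc_trans_metadata_size(nodesize,
                        csum_leaves_for(csum_bytes, csums_per_leaf,
                                        sectorsize));
        uint64_t qgroup = outstanding_extents * nodesize;

        printf("rsv size %llu, qgroup rsv %llu\n",
               (unsigned long long)reserve, (unsigned long long)qgroup);
        return 0;
}
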
5885
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
5886
-{
5887
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5888
- unsigned nr_extents;
5889
- enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5890
- int ret = 0;
5891
- bool delalloc_lock = true;
5892
-
5893
- /* If we are a free space inode we need to not flush since we will be in
5894
- * the middle of a transaction commit. We also don't need the delalloc
5895
- * mutex since we won't race with anybody. We need this mostly to make
5896
- * lockdep shut its filthy mouth.
5897
- *
5898
- * If we have a transaction open (can happen if we call truncate_block
5899
- * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5900
- */
5901
- if (btrfs_is_free_space_inode(inode)) {
5902
- flush = BTRFS_RESERVE_NO_FLUSH;
5903
- delalloc_lock = false;
5904
- } else {
5905
- if (current->journal_info)
5906
- flush = BTRFS_RESERVE_FLUSH_LIMIT;
5907
-
5908
- if (btrfs_transaction_in_commit(fs_info))
5909
- schedule_timeout(1);
5910
- }
5911
-
5912
- if (delalloc_lock)
5913
- mutex_lock(&inode->delalloc_mutex);
5914
-
5915
- num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5916
-
5917
- /* Add our new extents and calculate the new rsv size. */
5918
- spin_lock(&inode->lock);
5919
- nr_extents = count_max_extents(num_bytes);
5920
- btrfs_mod_outstanding_extents(inode, nr_extents);
5921
- inode->csum_bytes += num_bytes;
5922
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5923
- spin_unlock(&inode->lock);
5924
-
5925
- ret = btrfs_inode_rsv_refill(inode, flush);
5926
- if (unlikely(ret))
5927
- goto out_fail;
5928
-
5929
- if (delalloc_lock)
5930
- mutex_unlock(&inode->delalloc_mutex);
5931
- return 0;
5932
-
5933
-out_fail:
5934
- spin_lock(&inode->lock);
5935
- nr_extents = count_max_extents(num_bytes);
5936
- btrfs_mod_outstanding_extents(inode, -nr_extents);
5937
- inode->csum_bytes -= num_bytes;
5938
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5939
- spin_unlock(&inode->lock);
5940
-
5941
- btrfs_inode_rsv_release(inode, true);
5942
- if (delalloc_lock)
5943
- mutex_unlock(&inode->delalloc_mutex);
5944
- return ret;
5945
-}
5946
-
5947
-/**
5948
- * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5949
- * @inode: the inode to release the reservation for.
5950
- * @num_bytes: the number of bytes we are releasing.
5951
- * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
5952
- *
5953
- * This will release the metadata reservation for an inode. This can be called
5954
- * once we complete IO for a given set of bytes to release their metadata
5955
- * reservations, or on error for the same reason.
5956
- */
5957
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
5958
- bool qgroup_free)
5959
-{
5960
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5961
-
5962
- num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5963
- spin_lock(&inode->lock);
5964
- inode->csum_bytes -= num_bytes;
5965
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5966
- spin_unlock(&inode->lock);
5967
-
5968
- if (btrfs_is_testing(fs_info))
5969
- return;
5970
-
5971
- btrfs_inode_rsv_release(inode, qgroup_free);
5972
-}
5973
-
5974
-/**
5975
- * btrfs_delalloc_release_extents - release our outstanding_extents
5976
- * @inode: the inode to balance the reservation for.
5977
- * @num_bytes: the number of bytes we originally reserved with
5978
- * @qgroup_free: do we need to free qgroup meta reservation or convert them.
5979
- *
5980
- * When we reserve space we increase outstanding_extents for the extents we may
5981
- * add. Once we've set the range as delalloc or created our ordered extents we
5982
- * have outstanding_extents to track the real usage, so we use this to free our
5983
- * temporarily tracked outstanding_extents. This _must_ be used in conjunction
5984
- * with btrfs_delalloc_reserve_metadata.
5985
- */
5986
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
5987
-{
5988
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5989
- unsigned num_extents;
5990
-
5991
- spin_lock(&inode->lock);
5992
- num_extents = count_max_extents(num_bytes);
5993
- btrfs_mod_outstanding_extents(inode, -num_extents);
5994
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5995
- spin_unlock(&inode->lock);
5996
-
5997
- if (btrfs_is_testing(fs_info))
5998
- return;
5999
-
6000
- btrfs_inode_rsv_release(inode, true);
6001
-}
6002
-
6003
-/**
6004
- * btrfs_delalloc_reserve_space - reserve data and metadata space for
6005
- * delalloc
6006
- * @inode: inode we're writing to
6007
- * @start: start range we are writing to
6008
- * @len: the length of the range we are writing to
6009
- * @reserved: mandatory parameter, records the qgroup ranges actually
6010
- * reserved by the current reservation.
6011
- *
6012
- * This will do the following things
6013
- *
6014
- * o reserve space in data space info for num bytes
6015
- * and reserve precious corresponding qgroup space
6016
- * (Done in check_data_free_space)
6017
- *
6018
- * o reserve space for metadata space, based on the number of outstanding
6019
- * extents and how many csums will be needed
6020
- * also reserve metadata space in a per root over-reserve method.
6021
- * o add to the inodes->delalloc_bytes
6022
- * o add it to the fs_info's delalloc inodes list.
6023
- * (Above 3 all done in delalloc_reserve_metadata)
6024
- *
6025
- * Return 0 for success
6026
- * Return <0 for error (-ENOSPC or -EDQUOT)
6027
- */
6028
-int btrfs_delalloc_reserve_space(struct inode *inode,
6029
- struct extent_changeset **reserved, u64 start, u64 len)
6030
-{
6031
- int ret;
6032
-
6033
- ret = btrfs_check_data_free_space(inode, reserved, start, len);
6034
- if (ret < 0)
6035
- return ret;
6036
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6037
- if (ret < 0)
6038
- btrfs_free_reserved_data_space(inode, *reserved, start, len);
6039
- return ret;
6040
-}
6041
-
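
The removed wrapper simply chains the two reservations, data first and then metadata, and unwinds the data reservation if the metadata side fails; the ordering and the unwind are the whole point. A small sketch of that error-handling pattern; reserve_data(), release_data() and reserve_metadata() are placeholders, not btrfs APIs:

#include <stdio.h>

/* Stand-ins for the data and metadata reservation steps. */
static int reserve_data(unsigned long long len)   { (void)len; return 0; }
static int release_data(unsigned long long len)   { (void)len; return 0; }
static int reserve_metadata(unsigned long long len)
{
        (void)len;
        return -28;     /* pretend -ENOSPC to exercise the unwind path */
}

static int delalloc_reserve_space(unsigned long long len)
{
        int ret = reserve_data(len);

        if (ret < 0)
                return ret;
        ret = reserve_metadata(len);
        if (ret < 0)
                release_data(len);      /* unwind the data reservation */
        return ret;
}

int main(void)
{
        printf("reserve: %d\n", delalloc_reserve_space(1 << 20));
        return 0;
}
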
6042
-/**
6043
- * btrfs_delalloc_release_space - release data and metadata space for delalloc
6044
- * @inode: inode we're releasing space for
6045
- * @start: start position of the space already reserved
6046
- * @len: the len of the space already reserved
6047
- * @release_bytes: the len of the space we consumed or didn't use
6048
- *
6049
- * This function will release the metadata space that was not used and will
6050
- * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6051
- * list if there are no delalloc bytes left.
6052
- * Also it will handle the qgroup reserved space.
6053
- */
6054
-void btrfs_delalloc_release_space(struct inode *inode,
6055
- struct extent_changeset *reserved,
6056
- u64 start, u64 len, bool qgroup_free)
6057
-{
6058
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6059
- btrfs_free_reserved_data_space(inode, reserved, start, len);
6060
-}
6061
-
6062
-static int update_block_group(struct btrfs_trans_handle *trans,
6063
- struct btrfs_fs_info *info, u64 bytenr,
6064
- u64 num_bytes, int alloc)
6065
-{
6066
- struct btrfs_block_group_cache *cache = NULL;
6067
- u64 total = num_bytes;
6068
- u64 old_val;
6069
- u64 byte_in_group;
6070
- int factor;
6071
-
6072
- /* block accounting for super block */
6073
- spin_lock(&info->delalloc_root_lock);
6074
- old_val = btrfs_super_bytes_used(info->super_copy);
6075
- if (alloc)
6076
- old_val += num_bytes;
6077
- else
6078
- old_val -= num_bytes;
6079
- btrfs_set_super_bytes_used(info->super_copy, old_val);
6080
- spin_unlock(&info->delalloc_root_lock);
6081
-
6082
- while (total) {
6083
- cache = btrfs_lookup_block_group(info, bytenr);
6084
- if (!cache)
6085
- return -ENOENT;
6086
- factor = btrfs_bg_type_to_factor(cache->flags);
6087
-
6088
- /*
6089
- * If this block group has free space cache written out, we
6090
- * need to make sure to load it if we are removing space. This
6091
- * is because we need the unpinning stage to actually add the
6092
- * space back to the block group, otherwise we will leak space.
6093
- */
6094
- if (!alloc && cache->cached == BTRFS_CACHE_NO)
6095
- cache_block_group(cache, 1);
6096
-
6097
- byte_in_group = bytenr - cache->key.objectid;
6098
- WARN_ON(byte_in_group > cache->key.offset);
6099
-
6100
- spin_lock(&cache->space_info->lock);
6101
- spin_lock(&cache->lock);
6102
-
6103
- if (btrfs_test_opt(info, SPACE_CACHE) &&
6104
- cache->disk_cache_state < BTRFS_DC_CLEAR)
6105
- cache->disk_cache_state = BTRFS_DC_CLEAR;
6106
-
6107
- old_val = btrfs_block_group_used(&cache->item);
6108
- num_bytes = min(total, cache->key.offset - byte_in_group);
6109
- if (alloc) {
6110
- old_val += num_bytes;
6111
- btrfs_set_block_group_used(&cache->item, old_val);
6112
- cache->reserved -= num_bytes;
6113
- cache->space_info->bytes_reserved -= num_bytes;
6114
- cache->space_info->bytes_used += num_bytes;
6115
- cache->space_info->disk_used += num_bytes * factor;
6116
- spin_unlock(&cache->lock);
6117
- spin_unlock(&cache->space_info->lock);
6118
- } else {
6119
- old_val -= num_bytes;
6120
- btrfs_set_block_group_used(&cache->item, old_val);
6121
- cache->pinned += num_bytes;
6122
- cache->space_info->bytes_pinned += num_bytes;
6123
- cache->space_info->bytes_used -= num_bytes;
6124
- cache->space_info->disk_used -= num_bytes * factor;
6125
- spin_unlock(&cache->lock);
6126
- spin_unlock(&cache->space_info->lock);
6127
-
6128
- trace_btrfs_space_reservation(info, "pinned",
6129
- cache->space_info->flags,
6130
- num_bytes, 1);
6131
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6132
- num_bytes,
6133
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
6134
- set_extent_dirty(info->pinned_extents,
6135
- bytenr, bytenr + num_bytes - 1,
6136
- GFP_NOFS | __GFP_NOFAIL);
6137
- }
6138
-
6139
- spin_lock(&trans->transaction->dirty_bgs_lock);
6140
- if (list_empty(&cache->dirty_list)) {
6141
- list_add_tail(&cache->dirty_list,
6142
- &trans->transaction->dirty_bgs);
6143
- trans->transaction->num_dirty_bgs++;
6144
- btrfs_get_block_group(cache);
6145
- }
6146
- spin_unlock(&trans->transaction->dirty_bgs_lock);
6147
-
6148
- /*
6149
- * No longer have used bytes in this block group, queue it for
6150
- * deletion. We do this after adding the block group to the
6151
- * dirty list to avoid races between cleaner kthread and space
6152
- * cache writeout.
6153
- */
6154
- if (!alloc && old_val == 0)
6155
- btrfs_mark_bg_unused(cache);
6156
-
6157
- btrfs_put_block_group(cache);
6158
- total -= num_bytes;
6159
- bytenr += num_bytes;
6160
- }
6161
- return 0;
61622510 }
61632511
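
update_block_group(), deleted above, adjusts the superblock's bytes_used and then, per affected block group, moves num_bytes from reserved to used on allocation or from used to pinned on free, scaling disk usage by the RAID factor. A sketch of just that bookkeeping; the struct is a stand-in, and factor plays the role of btrfs_bg_type_to_factor() (2 for mirrored profiles, 1 otherwise):

#include <stdint.h>
#include <stdio.h>

struct bg_model {
        uint64_t used;
        uint64_t reserved;
        uint64_t pinned;
        uint64_t disk_used;
};

static void update_block_group_model(struct bg_model *bg, uint64_t num_bytes,
                                     int alloc, int factor)
{
        if (alloc) {
                /* reserved space becomes used space */
                bg->used += num_bytes;
                bg->reserved -= num_bytes;
        } else {
                /* freed space stays pinned until the transaction commits */
                bg->used -= num_bytes;
                bg->pinned += num_bytes;
        }
        bg->disk_used = bg->used * factor;
}

int main(void)
{
        struct bg_model bg = { .used = 64ULL << 20, .reserved = 16ULL << 20 };

        update_block_group_model(&bg, 4ULL << 20, 1, 2); /* allocate 4 MiB */
        update_block_group_model(&bg, 8ULL << 20, 0, 2); /* free 8 MiB */
        printf("used %llu reserved %llu pinned %llu disk_used %llu\n",
               (unsigned long long)bg.used, (unsigned long long)bg.reserved,
               (unsigned long long)bg.pinned, (unsigned long long)bg.disk_used);
        return 0;
}
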
61642512 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
61652513 {
6166
- struct btrfs_block_group_cache *cache;
2514
+ struct btrfs_block_group *cache;
61672515 u64 bytenr;
61682516
61692517 spin_lock(&fs_info->block_group_cache_lock);
@@ -6177,20 +2525,23 @@
61772525 if (!cache)
61782526 return 0;
61792527
6180
- bytenr = cache->key.objectid;
2528
+ bytenr = cache->start;
61812529 btrfs_put_block_group(cache);
61822530
61832531 return bytenr;
61842532 }
61852533
6186
-static int pin_down_extent(struct btrfs_fs_info *fs_info,
6187
- struct btrfs_block_group_cache *cache,
2534
+static int pin_down_extent(struct btrfs_trans_handle *trans,
2535
+ struct btrfs_block_group *cache,
61882536 u64 bytenr, u64 num_bytes, int reserved)
61892537 {
2538
+ struct btrfs_fs_info *fs_info = cache->fs_info;
2539
+
61902540 spin_lock(&cache->space_info->lock);
61912541 spin_lock(&cache->lock);
61922542 cache->pinned += num_bytes;
6193
- cache->space_info->bytes_pinned += num_bytes;
2543
+ btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2544
+ num_bytes);
61942545 if (reserved) {
61952546 cache->reserved -= num_bytes;
61962547 cache->space_info->bytes_reserved -= num_bytes;
@@ -6198,27 +2549,21 @@
61982549 spin_unlock(&cache->lock);
61992550 spin_unlock(&cache->space_info->lock);
62002551
6201
- trace_btrfs_space_reservation(fs_info, "pinned",
6202
- cache->space_info->flags, num_bytes, 1);
6203
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6204
- num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6205
- set_extent_dirty(fs_info->pinned_extents, bytenr,
2552
+ __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
2553
+ set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
62062554 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
62072555 return 0;
62082556 }
62092557
6210
-/*
6211
- * this function must be called within transaction
6212
- */
6213
-int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
2558
+int btrfs_pin_extent(struct btrfs_trans_handle *trans,
62142559 u64 bytenr, u64 num_bytes, int reserved)
62152560 {
6216
- struct btrfs_block_group_cache *cache;
2561
+ struct btrfs_block_group *cache;
62172562
6218
- cache = btrfs_lookup_block_group(fs_info, bytenr);
2563
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
62192564 BUG_ON(!cache); /* Logic error */
62202565
6221
- pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
2566
+ pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
62222567
62232568 btrfs_put_block_group(cache);
62242569 return 0;
@@ -6227,13 +2572,15 @@
62272572 /*
62282573 * this function must be called within transaction
62292574 */
6230
-int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
2575
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
62312576 u64 bytenr, u64 num_bytes)
62322577 {
6233
- struct btrfs_block_group_cache *cache;
2578
+ struct btrfs_block_group *cache;
62342579 int ret;
62352580
6236
- cache = btrfs_lookup_block_group(fs_info, bytenr);
2581
+ btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
2582
+
2583
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
62372584 if (!cache)
62382585 return -EINVAL;
62392586
@@ -6243,9 +2590,9 @@
62432590 * to one because the slow code to read in the free extents does check
62442591 * the pinned extents.
62452592 */
6246
- cache_block_group(cache, 1);
2593
+ btrfs_cache_block_group(cache, 1);
62472594
6248
- pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
2595
+ pin_down_extent(trans, cache, bytenr, num_bytes, 0);
62492596
62502597 /* remove us from the free space cache (if we're there at all) */
62512598 ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
@@ -6257,25 +2604,26 @@
62572604 u64 start, u64 num_bytes)
62582605 {
62592606 int ret;
6260
- struct btrfs_block_group_cache *block_group;
2607
+ struct btrfs_block_group *block_group;
62612608 struct btrfs_caching_control *caching_ctl;
62622609
62632610 block_group = btrfs_lookup_block_group(fs_info, start);
62642611 if (!block_group)
62652612 return -EINVAL;
62662613
6267
- cache_block_group(block_group, 0);
6268
- caching_ctl = get_caching_control(block_group);
2614
+ btrfs_cache_block_group(block_group, 0);
2615
+ caching_ctl = btrfs_get_caching_control(block_group);
62692616
62702617 if (!caching_ctl) {
62712618 /* Logic error */
6272
- BUG_ON(!block_group_cache_done(block_group));
2619
+ BUG_ON(!btrfs_block_group_done(block_group));
62732620 ret = btrfs_remove_free_space(block_group, start, num_bytes);
62742621 } else {
62752622 mutex_lock(&caching_ctl->mutex);
62762623
62772624 if (start >= caching_ctl->progress) {
6278
- ret = add_excluded_extent(fs_info, start, num_bytes);
2625
+ ret = btrfs_add_excluded_extent(fs_info, start,
2626
+ num_bytes);
62792627 } else if (start + num_bytes <= caching_ctl->progress) {
62802628 ret = btrfs_remove_free_space(block_group,
62812629 start, num_bytes);
@@ -6289,19 +2637,20 @@
62892637 num_bytes = (start + num_bytes) -
62902638 caching_ctl->progress;
62912639 start = caching_ctl->progress;
6292
- ret = add_excluded_extent(fs_info, start, num_bytes);
2640
+ ret = btrfs_add_excluded_extent(fs_info, start,
2641
+ num_bytes);
62932642 }
62942643 out_lock:
62952644 mutex_unlock(&caching_ctl->mutex);
6296
- put_caching_control(caching_ctl);
2645
+ btrfs_put_caching_control(caching_ctl);
62972646 }
62982647 btrfs_put_block_group(block_group);
62992648 return ret;
63002649 }
63012650
6302
-int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
6303
- struct extent_buffer *eb)
2651
+int btrfs_exclude_logged_extents(struct extent_buffer *eb)
63042652 {
2653
+ struct btrfs_fs_info *fs_info = eb->fs_info;
63052654 struct btrfs_file_extent_item *item;
63062655 struct btrfs_key key;
63072656 int found_type;
@@ -6332,146 +2681,9 @@
63322681 }
63332682
63342683 static void
6335
-btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
2684
+btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
63362685 {
63372686 atomic_inc(&bg->reservations);
6338
-}
6339
-
6340
-void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6341
- const u64 start)
6342
-{
6343
- struct btrfs_block_group_cache *bg;
6344
-
6345
- bg = btrfs_lookup_block_group(fs_info, start);
6346
- ASSERT(bg);
6347
- if (atomic_dec_and_test(&bg->reservations))
6348
- wake_up_var(&bg->reservations);
6349
- btrfs_put_block_group(bg);
6350
-}
6351
-
6352
-void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6353
-{
6354
- struct btrfs_space_info *space_info = bg->space_info;
6355
-
6356
- ASSERT(bg->ro);
6357
-
6358
- if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6359
- return;
6360
-
6361
- /*
6362
- * Our block group is read only but before we set it to read only,
6363
- * some task might have allocated an extent from it already, but it
6364
- * has not yet created a respective ordered extent (and added it to a
6365
- * root's list of ordered extents).
6366
- * Therefore wait for any task currently allocating extents, since the
6367
- * block group's reservations counter is incremented while a read lock
6368
- * on the groups' semaphore is held and decremented after releasing
6369
- * the read access on that semaphore and creating the ordered extent.
6370
- */
6371
- down_write(&space_info->groups_sem);
6372
- up_write(&space_info->groups_sem);
6373
-
6374
- wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6375
-}
6376
-
6377
-/**
6378
- * btrfs_add_reserved_bytes - update the block_group and space info counters
6379
- * @cache: The cache we are manipulating
6380
- * @ram_bytes: The number of bytes of file content, and will be the same as
6381
- * @num_bytes except for the compress path.
6382
- * @num_bytes: The number of bytes in question
6383
- * @delalloc: The blocks are allocated for the delalloc write
6384
- *
6385
- * This is called by the allocator when it reserves space. If this is a
6386
- * reservation and the block group has become read only we cannot make the
6387
- * reservation and return -EAGAIN, otherwise this function always succeeds.
6388
- */
6389
-static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6390
- u64 ram_bytes, u64 num_bytes, int delalloc)
6391
-{
6392
- struct btrfs_space_info *space_info = cache->space_info;
6393
- int ret = 0;
6394
-
6395
- spin_lock(&space_info->lock);
6396
- spin_lock(&cache->lock);
6397
- if (cache->ro) {
6398
- ret = -EAGAIN;
6399
- } else {
6400
- cache->reserved += num_bytes;
6401
- space_info->bytes_reserved += num_bytes;
6402
-
6403
- trace_btrfs_space_reservation(cache->fs_info,
6404
- "space_info", space_info->flags,
6405
- ram_bytes, 0);
6406
- space_info->bytes_may_use -= ram_bytes;
6407
- if (delalloc)
6408
- cache->delalloc_bytes += num_bytes;
6409
- }
6410
- spin_unlock(&cache->lock);
6411
- spin_unlock(&space_info->lock);
6412
- return ret;
6413
-}
6414
-
6415
-/**
6416
- * btrfs_free_reserved_bytes - update the block_group and space info counters
6417
- * @cache: The cache we are manipulating
6418
- * @num_bytes: The number of bytes in question
6419
- * @delalloc: The blocks are allocated for the delalloc write
6420
- *
6421
- * This is called by somebody who is freeing space that was never actually used
6422
- * on disk. For example if you reserve some space for a new leaf in transaction
6423
- * A and before transaction A commits you free that leaf, you call this with
6424
- * reserve set to 0 in order to clear the reservation.
6425
- */
6426
-
6427
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6428
- u64 num_bytes, int delalloc)
6429
-{
6430
- struct btrfs_space_info *space_info = cache->space_info;
6431
- int ret = 0;
6432
-
6433
- spin_lock(&space_info->lock);
6434
- spin_lock(&cache->lock);
6435
- if (cache->ro)
6436
- space_info->bytes_readonly += num_bytes;
6437
- cache->reserved -= num_bytes;
6438
- space_info->bytes_reserved -= num_bytes;
6439
- space_info->max_extent_size = 0;
6440
-
6441
- if (delalloc)
6442
- cache->delalloc_bytes -= num_bytes;
6443
- spin_unlock(&cache->lock);
6444
- spin_unlock(&space_info->lock);
6445
- return ret;
6446
-}
6447
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6448
-{
6449
- struct btrfs_caching_control *next;
6450
- struct btrfs_caching_control *caching_ctl;
6451
- struct btrfs_block_group_cache *cache;
6452
-
6453
- down_write(&fs_info->commit_root_sem);
6454
-
6455
- list_for_each_entry_safe(caching_ctl, next,
6456
- &fs_info->caching_block_groups, list) {
6457
- cache = caching_ctl->block_group;
6458
- if (block_group_cache_done(cache)) {
6459
- cache->last_byte_to_unpin = (u64)-1;
6460
- list_del_init(&caching_ctl->list);
6461
- put_caching_control(caching_ctl);
6462
- } else {
6463
- cache->last_byte_to_unpin = caching_ctl->progress;
6464
- }
6465
- }
6466
-
6467
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6468
- fs_info->pinned_extents = &fs_info->freed_extents[1];
6469
- else
6470
- fs_info->pinned_extents = &fs_info->freed_extents[0];
6471
-
6472
- up_write(&fs_info->commit_root_sem);
6473
-
6474
- update_global_block_rsv(fs_info);
64752687 }
64762688
64772689 /*
....@@ -6507,7 +2719,7 @@
65072719 u64 start, u64 end,
65082720 const bool return_free_space)
65092721 {
6510
- struct btrfs_block_group_cache *cache = NULL;
2722
+ struct btrfs_block_group *cache = NULL;
65112723 struct btrfs_space_info *space_info;
65122724 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
65132725 struct btrfs_free_cluster *cluster = NULL;
....@@ -6519,7 +2731,7 @@
65192731 while (start <= end) {
65202732 readonly = false;
65212733 if (!cache ||
6522
- start >= cache->key.objectid + cache->key.offset) {
2734
+ start >= cache->start + cache->length) {
65232735 if (cache)
65242736 btrfs_put_block_group(cache);
65252737 total_unpinned = 0;
....@@ -6532,13 +2744,13 @@
65322744 empty_cluster <<= 1;
65332745 }
65342746
6535
- len = cache->key.objectid + cache->key.offset - start;
2747
+ len = cache->start + cache->length - start;
65362748 len = min(len, end + 1 - start);
65372749
6538
- if (start < cache->last_byte_to_unpin) {
6539
- len = min(len, cache->last_byte_to_unpin - start);
6540
- if (return_free_space)
6541
- btrfs_add_free_space(cache, start, len);
2750
+ if (start < cache->last_byte_to_unpin && return_free_space) {
2751
+ u64 add_len = min(len, cache->last_byte_to_unpin - start);
2752
+
2753
+ btrfs_add_free_space(cache, start, add_len);
65422754 }
65432755
65442756 start += len;
....@@ -6561,13 +2773,9 @@
65612773 spin_lock(&space_info->lock);
65622774 spin_lock(&cache->lock);
65632775 cache->pinned -= len;
6564
- space_info->bytes_pinned -= len;
6565
-
6566
- trace_btrfs_space_reservation(fs_info, "pinned",
6567
- space_info->flags, len, 0);
2776
+ btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
65682777 space_info->max_extent_size = 0;
6569
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
6570
- -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
2778
+ __btrfs_mod_total_bytes_pinned(space_info, -len);
65712779 if (cache->ro) {
65722780 space_info->bytes_readonly += len;
65732781 readonly = true;
....@@ -6582,21 +2790,17 @@
65822790 to_add = min(len, global_rsv->size -
65832791 global_rsv->reserved);
65842792 global_rsv->reserved += to_add;
6585
- space_info->bytes_may_use += to_add;
2793
+ btrfs_space_info_update_bytes_may_use(fs_info,
2794
+ space_info, to_add);
65862795 if (global_rsv->reserved >= global_rsv->size)
65872796 global_rsv->full = 1;
6588
- trace_btrfs_space_reservation(fs_info,
6589
- "space_info",
6590
- space_info->flags,
6591
- to_add, 1);
65922797 len -= to_add;
65932798 }
65942799 spin_unlock(&global_rsv->lock);
6595
- /* Add to any tickets we may have */
6596
- if (len)
6597
- space_info_add_new_bytes(fs_info, space_info,
6598
- len);
65992800 }
2801
+ /* Add to any tickets we may have */
2802
+ if (!readonly && return_free_space && len)
2803
+ btrfs_try_granting_tickets(fs_info, space_info);
66002804 spin_unlock(&space_info->lock);
66012805 }
66022806
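To make the global reserve top-up above concrete, here is the same arithmetic with illustrative numbers (the sizes are invented for this sketch; the calls are the ones visible in the hunk):

        /* say global_rsv->size = 512M, global_rsv->reserved = 500M, len = 100M */
        to_add = min(len, global_rsv->size - global_rsv->reserved);    /* 12M */
        global_rsv->reserved += to_add;          /* 512M, so global_rsv->full = 1 */
        btrfs_space_info_update_bytes_may_use(fs_info, space_info, to_add);
        len -= to_add;                           /* 88M */
        /* the remaining 88M is what btrfs_try_granting_tickets() can hand out */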
....@@ -6608,19 +2812,16 @@
66082812 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
66092813 {
66102814 struct btrfs_fs_info *fs_info = trans->fs_info;
6611
- struct btrfs_block_group_cache *block_group, *tmp;
2815
+ struct btrfs_block_group *block_group, *tmp;
66122816 struct list_head *deleted_bgs;
66132817 struct extent_io_tree *unpin;
66142818 u64 start;
66152819 u64 end;
66162820 int ret;
66172821
6618
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6619
- unpin = &fs_info->freed_extents[1];
6620
- else
6621
- unpin = &fs_info->freed_extents[0];
2822
+ unpin = &trans->transaction->pinned_extents;
66222823
6623
- while (!trans->aborted) {
2824
+ while (!TRANS_ABORTED(trans)) {
66242825 struct extent_state *cached_state = NULL;
66252826
66262827 mutex_lock(&fs_info->unused_bg_unpin_mutex);
....@@ -6630,8 +2831,11 @@
66302831 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
66312832 break;
66322833 }
2834
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
2835
+ clear_extent_bits(&fs_info->excluded_extents, start,
2836
+ end, EXTENT_UPTODATE);
66332837
6634
- if (btrfs_test_opt(fs_info, DISCARD))
2838
+ if (btrfs_test_opt(fs_info, DISCARD_SYNC))
66352839 ret = btrfs_discard_extent(fs_info, start,
66362840 end + 1 - start, NULL);
66372841
....@@ -6640,6 +2844,11 @@
66402844 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
66412845 free_extent_state(cached_state);
66422846 cond_resched();
2847
+ }
2848
+
2849
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2850
+ btrfs_discard_calc_delay(&fs_info->discard_ctl);
2851
+ btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
66432852 }
66442853
66452854 /*
....@@ -6652,14 +2861,14 @@
66522861 u64 trimmed = 0;
66532862
66542863 ret = -EROFS;
6655
- if (!trans->aborted)
2864
+ if (!TRANS_ABORTED(trans))
66562865 ret = btrfs_discard_extent(fs_info,
6657
- block_group->key.objectid,
6658
- block_group->key.offset,
2866
+ block_group->start,
2867
+ block_group->length,
66592868 &trimmed);
66602869
66612870 list_del_init(&block_group->bg_list);
6662
- btrfs_put_block_group_trimming(block_group);
2871
+ btrfs_unfreeze_block_group(block_group);
66632872 btrfs_put_block_group(block_group);
66642873
66652874 if (ret) {
....@@ -6673,6 +2882,65 @@
66732882 return 0;
66742883 }
66752884
2885
+/*
2886
+ * Drop one or more refs of @node.
2887
+ *
2888
+ * 1. Locate the extent refs.
2889
+ * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
2890
+ * Locate it, then reduce the refs number or remove the ref line completely.
2891
+ *
2892
+ * 2. Update the refs count in EXTENT/METADATA_ITEM
2893
+ *
2894
+ * Inline backref case:
2895
+ *
2896
+ * in extent tree we have:
2897
+ *
2898
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2899
+ * refs 2 gen 6 flags DATA
2900
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
2901
+ * extent data backref root FS_TREE objectid 257 offset 0 count 1
2902
+ *
2903
+ * This function gets called with:
2904
+ *
2905
+ * node->bytenr = 13631488
2906
+ * node->num_bytes = 1048576
2907
+ * root_objectid = FS_TREE
2908
+ * owner_objectid = 257
2909
+ * owner_offset = 0
2910
+ * refs_to_drop = 1
2911
+ *
2912
+ * Then we should get something like:
2913
+ *
2914
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2915
+ * refs 1 gen 6 flags DATA
2916
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
2917
+ *
2918
+ * Keyed backref case:
2919
+ *
2920
+ * in extent tree we have:
2921
+ *
2922
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2923
+ * refs 754 gen 6 flags DATA
2924
+ * [...]
2925
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2926
+ * extent data backref root FS_TREE objectid 866 offset 0 count 1
2927
+ *
2928
+ * This function gets called with:
2929
+ *
2930
+ * node->bytenr = 13631488
2931
+ * node->num_bytes = 1048576
2932
+ * root_objectid = FS_TREE
2933
+ * owner_objectid = 866
2934
+ * owner_offset = 0
2935
+ * refs_to_drop = 1
2936
+ *
2937
+ * Then we should get something like:
2938
+ *
2939
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2940
+ * refs 753 gen 6 flags DATA
2941
+ *
2942
+ * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2943
+ */
66762944 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
66772945 struct btrfs_delayed_ref_node *node, u64 parent,
66782946 u64 root_objectid, u64 owner_objectid,
....@@ -6702,11 +2970,18 @@
67022970 if (!path)
67032971 return -ENOMEM;
67042972
6705
- path->reada = READA_FORWARD;
67062973 path->leave_spinning = 1;
67072974
67082975 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6709
- BUG_ON(!is_data && refs_to_drop != 1);
2976
+
2977
+ if (!is_data && refs_to_drop != 1) {
2978
+ btrfs_crit(info,
2979
+"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
2980
+ node->bytenr, refs_to_drop);
2981
+ ret = -EINVAL;
2982
+ btrfs_abort_transaction(trans, ret);
2983
+ goto out;
2984
+ }
67102985
67112986 if (is_data)
67122987 skinny_metadata = false;
....@@ -6715,6 +2990,13 @@
67152990 parent, root_objectid, owner_objectid,
67162991 owner_offset);
67172992 if (ret == 0) {
2993
+ /*
2994
+ * Either the inline backref or the SHARED_DATA_REF/
2995
+ * SHARED_BLOCK_REF is found
2996
+ *
2997
+ * Here is a quick path to locate EXTENT/METADATA_ITEM.
2998
+ * It's possible the EXTENT/METADATA_ITEM is near current slot.
2999
+ */
67183000 extent_slot = path->slots[0];
67193001 while (extent_slot >= 0) {
67203002 btrfs_item_key_to_cpu(path->nodes[0], &key,
....@@ -6731,13 +3013,21 @@
67313013 found_extent = 1;
67323014 break;
67333015 }
3016
+
3017
+ /* Quick path didn't find the EXTENT/METADATA_ITEM */
67343018 if (path->slots[0] - extent_slot > 5)
67353019 break;
67363020 extent_slot--;
67373021 }
67383022
67393023 if (!found_extent) {
6740
- BUG_ON(iref);
3024
+ if (iref) {
3025
+ btrfs_crit(info,
3026
+"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
3027
+ btrfs_abort_transaction(trans, -EUCLEAN);
3028
+ goto err_dump;
3029
+ }
3030
+ /* Must be SHARED_* item, remove the backref first */
67413031 ret = remove_extent_backref(trans, path, NULL,
67423032 refs_to_drop,
67433033 is_data, &last_ref);
....@@ -6748,6 +3038,7 @@
67483038 btrfs_release_path(path);
67493039 path->leave_spinning = 1;
67503040
3041
+ /* Slow path to locate EXTENT/METADATA_ITEM */
67513042 key.objectid = bytenr;
67523043 key.type = BTRFS_EXTENT_ITEM_KEY;
67533044 key.offset = num_bytes;
....@@ -6822,19 +3113,26 @@
68223113 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
68233114 key.type == BTRFS_EXTENT_ITEM_KEY) {
68243115 struct btrfs_tree_block_info *bi;
6825
- BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
3116
+ if (item_size < sizeof(*ei) + sizeof(*bi)) {
3117
+ btrfs_crit(info,
3118
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
3119
+ key.objectid, key.type, key.offset,
3120
+ owner_objectid, item_size,
3121
+ sizeof(*ei) + sizeof(*bi));
3122
+ btrfs_abort_transaction(trans, -EUCLEAN);
3123
+ goto err_dump;
3124
+ }
68263125 bi = (struct btrfs_tree_block_info *)(ei + 1);
68273126 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
68283127 }
68293128
68303129 refs = btrfs_extent_refs(leaf, ei);
68313130 if (refs < refs_to_drop) {
6832
- btrfs_err(info,
6833
- "trying to drop %d refs but we only have %Lu for bytenr %Lu",
3131
+ btrfs_crit(info,
3132
+ "trying to drop %d refs but we only have %llu for bytenr %llu",
68343133 refs_to_drop, refs, bytenr);
6835
- ret = -EINVAL;
6836
- btrfs_abort_transaction(trans, ret);
6837
- goto out;
3134
+ btrfs_abort_transaction(trans, -EUCLEAN);
3135
+ goto err_dump;
68383136 }
68393137 refs -= refs_to_drop;
68403138
....@@ -6846,7 +3144,12 @@
68463144 * be updated by remove_extent_backref
68473145 */
68483146 if (iref) {
6849
- BUG_ON(!found_extent);
3147
+ if (!found_extent) {
3148
+ btrfs_crit(info,
3149
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
3150
+ btrfs_abort_transaction(trans, -EUCLEAN);
3151
+ goto err_dump;
3152
+ }
68503153 } else {
68513154 btrfs_set_extent_refs(leaf, ei, refs);
68523155 btrfs_mark_buffer_dirty(leaf);
....@@ -6861,13 +3164,39 @@
68613164 }
68623165 }
68633166 } else {
3167
+ /* In this branch refs == 1 */
68643168 if (found_extent) {
6865
- BUG_ON(is_data && refs_to_drop !=
6866
- extent_data_ref_count(path, iref));
3169
+ if (is_data && refs_to_drop !=
3170
+ extent_data_ref_count(path, iref)) {
3171
+ btrfs_crit(info,
3172
+ "invalid refs_to_drop, current refs %u refs_to_drop %u",
3173
+ extent_data_ref_count(path, iref),
3174
+ refs_to_drop);
3175
+ btrfs_abort_transaction(trans, -EUCLEAN);
3176
+ goto err_dump;
3177
+ }
68673178 if (iref) {
6868
- BUG_ON(path->slots[0] != extent_slot);
3179
+ if (path->slots[0] != extent_slot) {
3180
+ btrfs_crit(info,
3181
+"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
3182
+ key.objectid, key.type,
3183
+ key.offset);
3184
+ btrfs_abort_transaction(trans, -EUCLEAN);
3185
+ goto err_dump;
3186
+ }
68693187 } else {
6870
- BUG_ON(path->slots[0] != extent_slot + 1);
3188
+ /*
3189
+ * No inline ref, we must be at a SHARED_* item,
3190
+ * and it's a single ref, it must be:
3191
+ * | extent_slot ||extent_slot + 1|
3192
+ * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3193
+ */
3194
+ if (path->slots[0] != extent_slot + 1) {
3195
+ btrfs_crit(info,
3196
+ "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
3197
+ btrfs_abort_transaction(trans, -EUCLEAN);
3198
+ goto err_dump;
3199
+ }
68713200 path->slots[0] = extent_slot;
68723201 num_to_del = 2;
68733202 }
....@@ -6897,7 +3226,7 @@
68973226 goto out;
68983227 }
68993228
6900
- ret = update_block_group(trans, info, bytenr, num_bytes, 0);
3229
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
69013230 if (ret) {
69023231 btrfs_abort_transaction(trans, ret);
69033232 goto out;
....@@ -6908,6 +3237,19 @@
69083237 out:
69093238 btrfs_free_path(path);
69103239 return ret;
3240
+err_dump:
3241
+ /*
3242
+ * Leaf dump can take up a lot of log buffer, so we only do full leaf
3243
+ * dump for debug build.
3244
+ */
3245
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
3246
+ btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
3247
+ path->slots[0], extent_slot);
3248
+ btrfs_print_leaf(path->nodes[0]);
3249
+ }
3250
+
3251
+ btrfs_free_path(path);
3252
+ return -EUCLEAN;
69113253 }
69123254
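Taken together, the hunks above convert the BUG_ON() sanity checks in __btrfs_free_extent() into recoverable errors. Most of the converted checks follow the same shape; this instance is copied from the hunks above:

        if (!found_extent) {
                btrfs_crit(info,
        "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
                btrfs_abort_transaction(trans, -EUCLEAN);
                goto err_dump;  /* dumps the leaf under CONFIG_BTRFS_DEBUG, returns -EUCLEAN */
        }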
69133255 /*
....@@ -6930,15 +3272,11 @@
69303272 goto out_delayed_unlock;
69313273
69323274 spin_lock(&head->lock);
6933
- if (!RB_EMPTY_ROOT(&head->ref_tree))
3275
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
69343276 goto out;
69353277
6936
- if (head->extent_op) {
6937
- if (!head->must_insert_reserved)
6938
- goto out;
6939
- btrfs_free_delayed_extent_op(head->extent_op);
6940
- head->extent_op = NULL;
6941
- }
3278
+ if (cleanup_extent_op(head) != NULL)
3279
+ goto out;
69423280
69433281 /*
69443282 * waiting for the lock here would deadlock. If someone else has it
....@@ -6947,22 +3285,9 @@
69473285 if (!mutex_trylock(&head->mutex))
69483286 goto out;
69493287
6950
- /*
6951
- * at this point we have a head with no other entries. Go
6952
- * ahead and process it.
6953
- */
6954
- rb_erase(&head->href_node, &delayed_refs->href_root);
6955
- RB_CLEAR_NODE(&head->href_node);
6956
- atomic_dec(&delayed_refs->num_entries);
6957
-
6958
- /*
6959
- * we don't take a ref on the node because we're removing it from the
6960
- * tree, so we just steal the ref the tree was holding.
6961
- */
6962
- delayed_refs->num_heads--;
6963
- if (head->processing == 0)
6964
- delayed_refs->num_heads_ready--;
3288
+ btrfs_delete_ref_head(delayed_refs, head);
69653289 head->processing = 0;
3290
+
69663291 spin_unlock(&head->lock);
69673292 spin_unlock(&delayed_refs->lock);
69683293
....@@ -6970,6 +3295,7 @@
69703295 if (head->must_insert_reserved)
69713296 ret = 1;
69723297
3298
+ btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
69733299 mutex_unlock(&head->mutex);
69743300 btrfs_put_delayed_ref_head(head);
69753301 return ret;
....@@ -6987,28 +3313,22 @@
69873313 u64 parent, int last_ref)
69883314 {
69893315 struct btrfs_fs_info *fs_info = root->fs_info;
6990
- int pin = 1;
3316
+ struct btrfs_ref generic_ref = { 0 };
69913317 int ret;
69923318
6993
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6994
- int old_ref_mod, new_ref_mod;
3319
+ btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3320
+ buf->start, buf->len, parent);
3321
+ btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3322
+ root->root_key.objectid);
69953323
6996
- btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
6997
- root->root_key.objectid,
6998
- btrfs_header_level(buf), 0,
6999
- BTRFS_DROP_DELAYED_REF);
7000
- ret = btrfs_add_delayed_tree_ref(trans, buf->start,
7001
- buf->len, parent,
7002
- root->root_key.objectid,
7003
- btrfs_header_level(buf),
7004
- BTRFS_DROP_DELAYED_REF, NULL,
7005
- &old_ref_mod, &new_ref_mod);
3324
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3325
+ btrfs_ref_tree_mod(fs_info, &generic_ref);
3326
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
70063327 BUG_ON(ret); /* -ENOMEM */
7007
- pin = old_ref_mod >= 0 && new_ref_mod < 0;
70083328 }
70093329
70103330 if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7011
- struct btrfs_block_group_cache *cache;
3331
+ struct btrfs_block_group *cache;
70123332
70133333 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
70143334 ret = check_ref_cleanup(trans, buf->start);
....@@ -7016,12 +3336,10 @@
70163336 goto out;
70173337 }
70183338
7019
- pin = 0;
70203339 cache = btrfs_lookup_block_group(fs_info, buf->start);
70213340
70223341 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7023
- pin_down_extent(fs_info, cache, buf->start,
7024
- buf->len, 1);
3342
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
70253343 btrfs_put_block_group(cache);
70263344 goto out;
70273345 }
....@@ -7034,10 +3352,6 @@
70343352 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
70353353 }
70363354 out:
7037
- if (pin)
7038
- add_pinned_bytes(fs_info, buf->len, true,
7039
- root->root_key.objectid);
7040
-
70413355 if (last_ref) {
70423356 /*
70433357 * Deleting the buffer, clear the corrupt flag since it doesn't
....@@ -7048,120 +3362,56 @@
70483362 }
70493363
70503364 /* Can return -ENOMEM */
7051
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
7052
- struct btrfs_root *root,
7053
- u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7054
- u64 owner, u64 offset)
3365
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
70553366 {
7056
- struct btrfs_fs_info *fs_info = root->fs_info;
7057
- int old_ref_mod, new_ref_mod;
3367
+ struct btrfs_fs_info *fs_info = trans->fs_info;
70583368 int ret;
70593369
70603370 if (btrfs_is_testing(fs_info))
70613371 return 0;
70623372
7063
- if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
7064
- btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
7065
- root_objectid, owner, offset,
7066
- BTRFS_DROP_DELAYED_REF);
7067
-
70683373 /*
70693374 * tree log blocks never actually go into the extent allocation
70703375 * tree, just update pinning info and exit early.
70713376 */
7072
- if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7073
- WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
3377
+ if ((ref->type == BTRFS_REF_METADATA &&
3378
+ ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3379
+ (ref->type == BTRFS_REF_DATA &&
3380
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
70743381 /* unlocks the pinned mutex */
7075
- btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7076
- old_ref_mod = new_ref_mod = 0;
3382
+ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
70773383 ret = 0;
7078
- } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7079
- ret = btrfs_add_delayed_tree_ref(trans, bytenr,
7080
- num_bytes, parent,
7081
- root_objectid, (int)owner,
7082
- BTRFS_DROP_DELAYED_REF, NULL,
7083
- &old_ref_mod, &new_ref_mod);
3384
+ } else if (ref->type == BTRFS_REF_METADATA) {
3385
+ ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
70843386 } else {
7085
- ret = btrfs_add_delayed_data_ref(trans, bytenr,
7086
- num_bytes, parent,
7087
- root_objectid, owner, offset,
7088
- 0, BTRFS_DROP_DELAYED_REF,
7089
- &old_ref_mod, &new_ref_mod);
3387
+ ret = btrfs_add_delayed_data_ref(trans, ref, 0);
70903388 }
70913389
7092
- if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
7093
- bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
3390
+ if (!((ref->type == BTRFS_REF_METADATA &&
3391
+ ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3392
+ (ref->type == BTRFS_REF_DATA &&
3393
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
3394
+ btrfs_ref_tree_mod(fs_info, ref);
70943395
7095
- add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
7096
- }
7097
-
7098
- return ret;
7099
-}
7100
-
7101
-/*
7102
- * when we wait for progress in the block group caching, its because
7103
- * our allocation attempt failed at least once. So, we must sleep
7104
- * and let some progress happen before we try again.
7105
- *
7106
- * This function will sleep at least once waiting for new free space to
7107
- * show up, and then it will check the block group free space numbers
7108
- * for our min num_bytes. Another option is to have it go ahead
7109
- * and look in the rbtree for a free extent of a given size, but this
7110
- * is a good start.
7111
- *
7112
- * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7113
- * any of the information in this block group.
7114
- */
7115
-static noinline void
7116
-wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7117
- u64 num_bytes)
7118
-{
7119
- struct btrfs_caching_control *caching_ctl;
7120
-
7121
- caching_ctl = get_caching_control(cache);
7122
- if (!caching_ctl)
7123
- return;
7124
-
7125
- wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7126
- (cache->free_space_ctl->free_space >= num_bytes));
7127
-
7128
- put_caching_control(caching_ctl);
7129
-}
7130
-
7131
-static noinline int
7132
-wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7133
-{
7134
- struct btrfs_caching_control *caching_ctl;
7135
- int ret = 0;
7136
-
7137
- caching_ctl = get_caching_control(cache);
7138
- if (!caching_ctl)
7139
- return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7140
-
7141
- wait_event(caching_ctl->wait, block_group_cache_done(cache));
7142
- if (cache->cached == BTRFS_CACHE_ERROR)
7143
- ret = -EIO;
7144
- put_caching_control(caching_ctl);
71453396 return ret;
71463397 }
71473398
71483399 enum btrfs_loop_type {
7149
- LOOP_CACHING_NOWAIT = 0,
7150
- LOOP_CACHING_WAIT = 1,
7151
- LOOP_ALLOC_CHUNK = 2,
7152
- LOOP_NO_EMPTY_SIZE = 3,
3400
+ LOOP_CACHING_NOWAIT,
3401
+ LOOP_CACHING_WAIT,
3402
+ LOOP_ALLOC_CHUNK,
3403
+ LOOP_NO_EMPTY_SIZE,
71533404 };
71543405
71553406 static inline void
7156
-btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
3407
+btrfs_lock_block_group(struct btrfs_block_group *cache,
71573408 int delalloc)
71583409 {
71593410 if (delalloc)
71603411 down_read(&cache->data_rwsem);
71613412 }
71623413
7163
-static inline void
7164
-btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
3414
+static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
71653415 int delalloc)
71663416 {
71673417 btrfs_get_block_group(cache);
....@@ -7169,12 +3419,13 @@
71693419 down_read(&cache->data_rwsem);
71703420 }
71713421
7172
-static struct btrfs_block_group_cache *
7173
-btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
3422
+static struct btrfs_block_group *btrfs_lock_cluster(
3423
+ struct btrfs_block_group *block_group,
71743424 struct btrfs_free_cluster *cluster,
71753425 int delalloc)
3426
+ __acquires(&cluster->refill_lock)
71763427 {
7177
- struct btrfs_block_group_cache *used_bg = NULL;
3428
+ struct btrfs_block_group *used_bg = NULL;
71783429
71793430 spin_lock(&cluster->refill_lock);
71803431 while (1) {
....@@ -7208,12 +3459,503 @@
72083459 }
72093460
72103461 static inline void
7211
-btrfs_release_block_group(struct btrfs_block_group_cache *cache,
3462
+btrfs_release_block_group(struct btrfs_block_group *cache,
72123463 int delalloc)
72133464 {
72143465 if (delalloc)
72153466 up_read(&cache->data_rwsem);
72163467 btrfs_put_block_group(cache);
3468
+}
3469
+
3470
+enum btrfs_extent_allocation_policy {
3471
+ BTRFS_EXTENT_ALLOC_CLUSTERED,
3472
+};
3473
+
3474
+/*
3475
+ * Structure used internally for find_free_extent() function. Wraps needed
3476
+ * parameters.
3477
+ */
3478
+struct find_free_extent_ctl {
3479
+ /* Basic allocation info */
3480
+ u64 num_bytes;
3481
+ u64 empty_size;
3482
+ u64 flags;
3483
+ int delalloc;
3484
+
3485
+ /* Where to start the search inside the bg */
3486
+ u64 search_start;
3487
+
3488
+ /* For clustered allocation */
3489
+ u64 empty_cluster;
3490
+ struct btrfs_free_cluster *last_ptr;
3491
+ bool use_cluster;
3492
+
3493
+ bool have_caching_bg;
3494
+ bool orig_have_caching_bg;
3495
+
3496
+ /* RAID index, converted from flags */
3497
+ int index;
3498
+
3499
+ /*
3500
+ * Current loop number, check find_free_extent_update_loop() for details
3501
+ */
3502
+ int loop;
3503
+
3504
+ /*
3505
+ * Whether we're refilling a cluster, if true we need to re-search
3506
+ * current block group but don't try to refill the cluster again.
3507
+ */
3508
+ bool retry_clustered;
3509
+
3510
+ /*
3511
+ * Whether we're updating free space cache, if true we need to re-search
3512
+ * current block group but don't try updating free space cache again.
3513
+ */
3514
+ bool retry_unclustered;
3515
+
3516
+ /* If current block group is cached */
3517
+ int cached;
3518
+
3519
+ /* Max contiguous hole found */
3520
+ u64 max_extent_size;
3521
+
3522
+ /* Total free space from free space cache, not always contiguous */
3523
+ u64 total_free_space;
3524
+
3525
+ /* Found result */
3526
+ u64 found_offset;
3527
+
3528
+ /* Hint where to start looking for an empty space */
3529
+ u64 hint_byte;
3530
+
3531
+ /* Allocation policy */
3532
+ enum btrfs_extent_allocation_policy policy;
3533
+};
3534
+
3535
+
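For orientation, the find_free_extent() hunk further down fills this control structure in before the search loop starts; condensed from that hunk:

        struct find_free_extent_ctl ffe_ctl = {0};

        ffe_ctl.num_bytes = num_bytes;
        ffe_ctl.empty_size = empty_size;
        ffe_ctl.flags = flags;
        ffe_ctl.delalloc = delalloc;
        ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
        ffe_ctl.hint_byte = hint_byte_orig;
        ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
        /* clustered allocation state */
        ffe_ctl.last_ptr = NULL;
        ffe_ctl.use_cluster = true;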
3536
+/*
3537
+ * Helper function for find_free_extent().
3538
+ *
3539
+ * Return -ENOENT to inform caller that we need to fall back to unclustered mode.
3540
+ * Return -EAGAIN to inform caller that we need to re-search this block group
3541
+ * Return >0 to inform caller that we find nothing
3542
+ * Return 0 means we have found a location and set ffe_ctl->found_offset.
3543
+ */
3544
+static int find_free_extent_clustered(struct btrfs_block_group *bg,
3545
+ struct find_free_extent_ctl *ffe_ctl,
3546
+ struct btrfs_block_group **cluster_bg_ret)
3547
+{
3548
+ struct btrfs_block_group *cluster_bg;
3549
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3550
+ u64 aligned_cluster;
3551
+ u64 offset;
3552
+ int ret;
3553
+
3554
+ cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3555
+ if (!cluster_bg)
3556
+ goto refill_cluster;
3557
+ if (cluster_bg != bg && (cluster_bg->ro ||
3558
+ !block_group_bits(cluster_bg, ffe_ctl->flags)))
3559
+ goto release_cluster;
3560
+
3561
+ offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3562
+ ffe_ctl->num_bytes, cluster_bg->start,
3563
+ &ffe_ctl->max_extent_size);
3564
+ if (offset) {
3565
+ /* We have a block, we're done */
3566
+ spin_unlock(&last_ptr->refill_lock);
3567
+ trace_btrfs_reserve_extent_cluster(cluster_bg,
3568
+ ffe_ctl->search_start, ffe_ctl->num_bytes);
3569
+ *cluster_bg_ret = cluster_bg;
3570
+ ffe_ctl->found_offset = offset;
3571
+ return 0;
3572
+ }
3573
+ WARN_ON(last_ptr->block_group != cluster_bg);
3574
+
3575
+release_cluster:
3576
+ /*
3577
+ * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3578
+ * lets just skip it and let the allocator find whatever block it can
3579
+ * find. If we reach this point, we will have tried the cluster
3580
+ * allocator plenty of times and not have found anything, so we are
3581
+ * likely way too fragmented for the clustering stuff to find anything.
3582
+ *
3583
+ * However, if the cluster is taken from the current block group,
3584
+ * release the cluster first, so that we stand a better chance of
3585
+ * succeeding in the unclustered allocation.
3586
+ */
3587
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3588
+ spin_unlock(&last_ptr->refill_lock);
3589
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3590
+ return -ENOENT;
3591
+ }
3592
+
3593
+ /* This cluster didn't work out, free it and start over */
3594
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
3595
+
3596
+ if (cluster_bg != bg)
3597
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3598
+
3599
+refill_cluster:
3600
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3601
+ spin_unlock(&last_ptr->refill_lock);
3602
+ return -ENOENT;
3603
+ }
3604
+
3605
+ aligned_cluster = max_t(u64,
3606
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3607
+ bg->full_stripe_len);
3608
+ ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3609
+ ffe_ctl->num_bytes, aligned_cluster);
3610
+ if (ret == 0) {
3611
+ /* Now pull our allocation out of this cluster */
3612
+ offset = btrfs_alloc_from_cluster(bg, last_ptr,
3613
+ ffe_ctl->num_bytes, ffe_ctl->search_start,
3614
+ &ffe_ctl->max_extent_size);
3615
+ if (offset) {
3616
+ /* We found one, proceed */
3617
+ spin_unlock(&last_ptr->refill_lock);
3618
+ trace_btrfs_reserve_extent_cluster(bg,
3619
+ ffe_ctl->search_start,
3620
+ ffe_ctl->num_bytes);
3621
+ ffe_ctl->found_offset = offset;
3622
+ return 0;
3623
+ }
3624
+ } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
3625
+ !ffe_ctl->retry_clustered) {
3626
+ spin_unlock(&last_ptr->refill_lock);
3627
+
3628
+ ffe_ctl->retry_clustered = true;
3629
+ btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3630
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size);
3631
+ return -EAGAIN;
3632
+ }
3633
+ /*
3634
+ * At this point we either didn't find a cluster or we weren't able to
3635
+ * allocate a block from our cluster. Free the cluster we've been
3636
+ * trying to use, and go to the next block group.
3637
+ */
3638
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
3639
+ spin_unlock(&last_ptr->refill_lock);
3640
+ return 1;
3641
+}
3642
+
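The -ENOENT return above is what pushes the caller onto the unclustered path; condensed from the do_allocation_clustered() hunk below:

        if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
                ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
                if (ret >= 0 || ret == -EAGAIN)
                        return ret;
                /* ret == -ENOENT case falls through */
        }
        return find_free_extent_unclustered(block_group, ffe_ctl);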
3643
+/*
3644
+ * Return >0 to inform caller that we find nothing
3645
+ * Return 0 when we found a free extent and set ffe_ctl->found_offset
3646
+ * Return -EAGAIN to inform caller that we need to re-search this block group
3647
+ */
3648
+static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3649
+ struct find_free_extent_ctl *ffe_ctl)
3650
+{
3651
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3652
+ u64 offset;
3653
+
3654
+ /*
3655
+ * We are doing an unclustered allocation, set the fragmented flag so
3656
+ * we don't bother trying to setup a cluster again until we get more
3657
+ * space.
3658
+ */
3659
+ if (unlikely(last_ptr)) {
3660
+ spin_lock(&last_ptr->lock);
3661
+ last_ptr->fragmented = 1;
3662
+ spin_unlock(&last_ptr->lock);
3663
+ }
3664
+ if (ffe_ctl->cached) {
3665
+ struct btrfs_free_space_ctl *free_space_ctl;
3666
+
3667
+ free_space_ctl = bg->free_space_ctl;
3668
+ spin_lock(&free_space_ctl->tree_lock);
3669
+ if (free_space_ctl->free_space <
3670
+ ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3671
+ ffe_ctl->empty_size) {
3672
+ ffe_ctl->total_free_space = max_t(u64,
3673
+ ffe_ctl->total_free_space,
3674
+ free_space_ctl->free_space);
3675
+ spin_unlock(&free_space_ctl->tree_lock);
3676
+ return 1;
3677
+ }
3678
+ spin_unlock(&free_space_ctl->tree_lock);
3679
+ }
3680
+
3681
+ offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3682
+ ffe_ctl->num_bytes, ffe_ctl->empty_size,
3683
+ &ffe_ctl->max_extent_size);
3684
+
3685
+ /*
3686
+ * If we didn't find a chunk, and we haven't failed on this block group
3687
+ * before, and this block group is in the middle of caching and we are
3688
+ * ok with waiting, then go ahead and wait for progress to be made, and
3689
+ * set @retry_unclustered to true.
3690
+ *
3691
+ * If @retry_unclustered is true then we've already waited on this
3692
+ * block group once and should move on to the next block group.
3693
+ */
3694
+ if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
3695
+ ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
3696
+ btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3697
+ ffe_ctl->empty_size);
3698
+ ffe_ctl->retry_unclustered = true;
3699
+ return -EAGAIN;
3700
+ } else if (!offset) {
3701
+ return 1;
3702
+ }
3703
+ ffe_ctl->found_offset = offset;
3704
+ return 0;
3705
+}
3706
+
3707
+static int do_allocation_clustered(struct btrfs_block_group *block_group,
3708
+ struct find_free_extent_ctl *ffe_ctl,
3709
+ struct btrfs_block_group **bg_ret)
3710
+{
3711
+ int ret;
3712
+
3713
+ /* We want to try and use the cluster allocator, so lets look there */
3714
+ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3715
+ ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3716
+ if (ret >= 0 || ret == -EAGAIN)
3717
+ return ret;
3718
+ /* ret == -ENOENT case falls through */
3719
+ }
3720
+
3721
+ return find_free_extent_unclustered(block_group, ffe_ctl);
3722
+}
3723
+
3724
+static int do_allocation(struct btrfs_block_group *block_group,
3725
+ struct find_free_extent_ctl *ffe_ctl,
3726
+ struct btrfs_block_group **bg_ret)
3727
+{
3728
+ switch (ffe_ctl->policy) {
3729
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3730
+ return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3731
+ default:
3732
+ BUG();
3733
+ }
3734
+}
3735
+
3736
+static void release_block_group(struct btrfs_block_group *block_group,
3737
+ struct find_free_extent_ctl *ffe_ctl,
3738
+ int delalloc)
3739
+{
3740
+ switch (ffe_ctl->policy) {
3741
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3742
+ ffe_ctl->retry_clustered = false;
3743
+ ffe_ctl->retry_unclustered = false;
3744
+ break;
3745
+ default:
3746
+ BUG();
3747
+ }
3748
+
3749
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
3750
+ ffe_ctl->index);
3751
+ btrfs_release_block_group(block_group, delalloc);
3752
+}
3753
+
3754
+static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
3755
+ struct btrfs_key *ins)
3756
+{
3757
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3758
+
3759
+ if (!ffe_ctl->use_cluster && last_ptr) {
3760
+ spin_lock(&last_ptr->lock);
3761
+ last_ptr->window_start = ins->objectid;
3762
+ spin_unlock(&last_ptr->lock);
3763
+ }
3764
+}
3765
+
3766
+static void found_extent(struct find_free_extent_ctl *ffe_ctl,
3767
+ struct btrfs_key *ins)
3768
+{
3769
+ switch (ffe_ctl->policy) {
3770
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3771
+ found_extent_clustered(ffe_ctl, ins);
3772
+ break;
3773
+ default:
3774
+ BUG();
3775
+ }
3776
+}
3777
+
3778
+static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
3779
+{
3780
+ switch (ffe_ctl->policy) {
3781
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3782
+ /*
3783
+ * If we can't allocate a new chunk we've already looped through
3784
+ * at least once, move on to the NO_EMPTY_SIZE case.
3785
+ */
3786
+ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
3787
+ return 0;
3788
+ default:
3789
+ BUG();
3790
+ }
3791
+}
3792
+
3793
+/*
3794
+ * Return >0 means caller needs to re-search for free extent
3795
+ * Return 0 means we have the needed free extent.
3796
+ * Return <0 means we failed to locate any free extent.
3797
+ */
3798
+static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
3799
+ struct btrfs_key *ins,
3800
+ struct find_free_extent_ctl *ffe_ctl,
3801
+ bool full_search)
3802
+{
3803
+ struct btrfs_root *root = fs_info->extent_root;
3804
+ int ret;
3805
+
3806
+ if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
3807
+ ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
3808
+ ffe_ctl->orig_have_caching_bg = true;
3809
+
3810
+ if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
3811
+ ffe_ctl->have_caching_bg)
3812
+ return 1;
3813
+
3814
+ if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
3815
+ return 1;
3816
+
3817
+ if (ins->objectid) {
3818
+ found_extent(ffe_ctl, ins);
3819
+ return 0;
3820
+ }
3821
+
3822
+ /*
3823
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
3824
+ * caching kthreads as we move along
3825
+ * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3826
+ * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3827
+ * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3828
+ * again
3829
+ */
3830
+ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
3831
+ ffe_ctl->index = 0;
3832
+ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
3833
+ /*
3834
+ * We want to skip the LOOP_CACHING_WAIT step if we
3835
+ * don't have any uncached bgs and we've already done a
3836
+ * full search through.
3837
+ */
3838
+ if (ffe_ctl->orig_have_caching_bg || !full_search)
3839
+ ffe_ctl->loop = LOOP_CACHING_WAIT;
3840
+ else
3841
+ ffe_ctl->loop = LOOP_ALLOC_CHUNK;
3842
+ } else {
3843
+ ffe_ctl->loop++;
3844
+ }
3845
+
3846
+ if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
3847
+ struct btrfs_trans_handle *trans;
3848
+ int exist = 0;
3849
+
3850
+ trans = current->journal_info;
3851
+ if (trans)
3852
+ exist = 1;
3853
+ else
3854
+ trans = btrfs_join_transaction(root);
3855
+
3856
+ if (IS_ERR(trans)) {
3857
+ ret = PTR_ERR(trans);
3858
+ return ret;
3859
+ }
3860
+
3861
+ ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
3862
+ CHUNK_ALLOC_FORCE);
3863
+
3864
+ /* Do not bail out on ENOSPC since we can do more. */
3865
+ if (ret == -ENOSPC)
3866
+ ret = chunk_allocation_failed(ffe_ctl);
3867
+ else if (ret < 0)
3868
+ btrfs_abort_transaction(trans, ret);
3869
+ else
3870
+ ret = 0;
3871
+ if (!exist)
3872
+ btrfs_end_transaction(trans);
3873
+ if (ret)
3874
+ return ret;
3875
+ }
3876
+
3877
+ if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
3878
+ if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
3879
+ return -ENOSPC;
3880
+
3881
+ /*
3882
+ * Don't loop again if we already have no empty_size and
3883
+ * no empty_cluster.
3884
+ */
3885
+ if (ffe_ctl->empty_size == 0 &&
3886
+ ffe_ctl->empty_cluster == 0)
3887
+ return -ENOSPC;
3888
+ ffe_ctl->empty_size = 0;
3889
+ ffe_ctl->empty_cluster = 0;
3890
+ }
3891
+ return 1;
3892
+ }
3893
+ return -ENOSPC;
3894
+}
3895
+
3896
+static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
3897
+ struct find_free_extent_ctl *ffe_ctl,
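As a rough map of the loop state machine above (derived from the branches in find_free_extent_update_loop(); worth double-checking against the code):

        /*
         * Nothing found, first pass was a full search with no uncached bgs:
         *   LOOP_CACHING_NOWAIT -> LOOP_ALLOC_CHUNK -> LOOP_NO_EMPTY_SIZE -> -ENOSPC
         * Otherwise:
         *   LOOP_CACHING_NOWAIT -> LOOP_CACHING_WAIT -> LOOP_ALLOC_CHUNK ->
         *   LOOP_NO_EMPTY_SIZE (empty_size/empty_cluster cleared, one more pass) -> -ENOSPC
         */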
3898
+ struct btrfs_space_info *space_info,
3899
+ struct btrfs_key *ins)
3900
+{
3901
+ /*
3902
+ * If our free space is heavily fragmented we may not be able to make
3903
+ * big contiguous allocations, so instead of doing the expensive search
3904
+ * for free space, simply return ENOSPC with our max_extent_size so we
3905
+ * can go ahead and search for a more manageable chunk.
3906
+ *
3907
+ * If our max_extent_size is large enough for our allocation simply
3908
+ * disable clustering since we will likely not be able to find enough
3909
+ * space to create a cluster and induce latency trying.
3910
+ */
3911
+ if (space_info->max_extent_size) {
3912
+ spin_lock(&space_info->lock);
3913
+ if (space_info->max_extent_size &&
3914
+ ffe_ctl->num_bytes > space_info->max_extent_size) {
3915
+ ins->offset = space_info->max_extent_size;
3916
+ spin_unlock(&space_info->lock);
3917
+ return -ENOSPC;
3918
+ } else if (space_info->max_extent_size) {
3919
+ ffe_ctl->use_cluster = false;
3920
+ }
3921
+ spin_unlock(&space_info->lock);
3922
+ }
3923
+
3924
+ ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
3925
+ &ffe_ctl->empty_cluster);
3926
+ if (ffe_ctl->last_ptr) {
3927
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3928
+
3929
+ spin_lock(&last_ptr->lock);
3930
+ if (last_ptr->block_group)
3931
+ ffe_ctl->hint_byte = last_ptr->window_start;
3932
+ if (last_ptr->fragmented) {
3933
+ /*
3934
+ * We still set window_start so we can keep track of the
3935
+ * last place we found an allocation to try and save
3936
+ * some time.
3937
+ */
3938
+ ffe_ctl->hint_byte = last_ptr->window_start;
3939
+ ffe_ctl->use_cluster = false;
3940
+ }
3941
+ spin_unlock(&last_ptr->lock);
3942
+ }
3943
+
3944
+ return 0;
3945
+}
3946
+
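Illustrative numbers for the max_extent_size short-circuit above (sizes invented for this sketch):

        /*
         * space_info->max_extent_size == 1M, ffe_ctl->num_bytes == 4M:
         *   skip the search, set ins->offset to 1M and return -ENOSPC so the
         *   caller can retry with a smaller request.
         * space_info->max_extent_size == 1M, ffe_ctl->num_bytes == 256K:
         *   search normally, but with ffe_ctl->use_cluster set to false.
         */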
3947
+static int prepare_allocation(struct btrfs_fs_info *fs_info,
3948
+ struct find_free_extent_ctl *ffe_ctl,
3949
+ struct btrfs_space_info *space_info,
3950
+ struct btrfs_key *ins)
3951
+{
3952
+ switch (ffe_ctl->policy) {
3953
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3954
+ return prepare_allocation_clustered(fs_info, ffe_ctl,
3955
+ space_info, ins);
3956
+ default:
3957
+ BUG();
3958
+ }
72173959 }
72183960
72193961 /*
....@@ -7226,87 +3968,76 @@
72263968 *
72273969 * If there is no suitable free space, we will record the max size of
72283970 * the free space extent currently.
3971
+ *
3972
+ * The overall logic and call chain:
3973
+ *
3974
+ * find_free_extent()
3975
+ * |- Iterate through all block groups
3976
+ * | |- Get a valid block group
3977
+ * | |- Try to do clustered allocation in that block group
3978
+ * | |- Try to do unclustered allocation in that block group
3979
+ * | |- Check if the result is valid
3980
+ * | | |- If valid, then exit
3981
+ * | |- Jump to next block group
3982
+ * |
3983
+ * |- Push harder to find free extents
3984
+ * |- If not found, re-iterate all block groups
72293985 */
7230
-static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
3986
+static noinline int find_free_extent(struct btrfs_root *root,
72313987 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7232
- u64 hint_byte, struct btrfs_key *ins,
3988
+ u64 hint_byte_orig, struct btrfs_key *ins,
72333989 u64 flags, int delalloc)
72343990 {
3991
+ struct btrfs_fs_info *fs_info = root->fs_info;
72353992 int ret = 0;
7236
- struct btrfs_root *root = fs_info->extent_root;
7237
- struct btrfs_free_cluster *last_ptr = NULL;
7238
- struct btrfs_block_group_cache *block_group = NULL;
7239
- u64 search_start = 0;
7240
- u64 max_extent_size = 0;
7241
- u64 max_free_space = 0;
7242
- u64 empty_cluster = 0;
3993
+ int cache_block_group_error = 0;
3994
+ struct btrfs_block_group *block_group = NULL;
3995
+ struct find_free_extent_ctl ffe_ctl = {0};
72433996 struct btrfs_space_info *space_info;
7244
- int loop = 0;
7245
- int index = btrfs_bg_flags_to_raid_index(flags);
7246
- bool failed_cluster_refill = false;
7247
- bool failed_alloc = false;
7248
- bool use_cluster = true;
7249
- bool have_caching_bg = false;
7250
- bool orig_have_caching_bg = false;
72513997 bool full_search = false;
72523998
72533999 WARN_ON(num_bytes < fs_info->sectorsize);
4000
+
4001
+ ffe_ctl.num_bytes = num_bytes;
4002
+ ffe_ctl.empty_size = empty_size;
4003
+ ffe_ctl.flags = flags;
4004
+ ffe_ctl.search_start = 0;
4005
+ ffe_ctl.delalloc = delalloc;
4006
+ ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
4007
+ ffe_ctl.have_caching_bg = false;
4008
+ ffe_ctl.orig_have_caching_bg = false;
4009
+ ffe_ctl.found_offset = 0;
4010
+ ffe_ctl.hint_byte = hint_byte_orig;
4011
+ ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4012
+
4013
+ /* For clustered allocation */
4014
+ ffe_ctl.retry_clustered = false;
4015
+ ffe_ctl.retry_unclustered = false;
4016
+ ffe_ctl.last_ptr = NULL;
4017
+ ffe_ctl.use_cluster = true;
4018
+
72544019 ins->type = BTRFS_EXTENT_ITEM_KEY;
72554020 ins->objectid = 0;
72564021 ins->offset = 0;
72574022
7258
- trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
4023
+ trace_find_free_extent(root, num_bytes, empty_size, flags);
72594024
7260
- space_info = __find_space_info(fs_info, flags);
4025
+ space_info = btrfs_find_space_info(fs_info, flags);
72614026 if (!space_info) {
72624027 btrfs_err(fs_info, "No space info for %llu", flags);
72634028 return -ENOSPC;
72644029 }
72654030
7266
- /*
7267
- * If our free space is heavily fragmented we may not be able to make
7268
- * big contiguous allocations, so instead of doing the expensive search
7269
- * for free space, simply return ENOSPC with our max_extent_size so we
7270
- * can go ahead and search for a more manageable chunk.
7271
- *
7272
- * If our max_extent_size is large enough for our allocation simply
7273
- * disable clustering since we will likely not be able to find enough
7274
- * space to create a cluster and induce latency trying.
7275
- */
7276
- if (unlikely(space_info->max_extent_size)) {
7277
- spin_lock(&space_info->lock);
7278
- if (space_info->max_extent_size &&
7279
- num_bytes > space_info->max_extent_size) {
7280
- ins->offset = space_info->max_extent_size;
7281
- spin_unlock(&space_info->lock);
7282
- return -ENOSPC;
7283
- } else if (space_info->max_extent_size) {
7284
- use_cluster = false;
7285
- }
7286
- spin_unlock(&space_info->lock);
7287
- }
4031
+ ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
4032
+ if (ret < 0)
4033
+ return ret;
72884034
7289
- last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
7290
- if (last_ptr) {
7291
- spin_lock(&last_ptr->lock);
7292
- if (last_ptr->block_group)
7293
- hint_byte = last_ptr->window_start;
7294
- if (last_ptr->fragmented) {
7295
- /*
7296
- * We still set window_start so we can keep track of the
7297
- * last place we found an allocation to try and save
7298
- * some time.
7299
- */
7300
- hint_byte = last_ptr->window_start;
7301
- use_cluster = false;
7302
- }
7303
- spin_unlock(&last_ptr->lock);
7304
- }
7305
-
7306
- search_start = max(search_start, first_logical_byte(fs_info, 0));
7307
- search_start = max(search_start, hint_byte);
7308
- if (search_start == hint_byte) {
7309
- block_group = btrfs_lookup_block_group(fs_info, search_start);
4035
+ ffe_ctl.search_start = max(ffe_ctl.search_start,
4036
+ first_logical_byte(fs_info, 0));
4037
+ ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
4038
+ if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
4039
+ block_group = btrfs_lookup_block_group(fs_info,
4040
+ ffe_ctl.search_start);
73104041 /*
73114042 * we don't want to use the block group if it doesn't match our
73124043 * allocation bits, or if its not cached.
....@@ -7328,7 +4059,7 @@
73284059 btrfs_put_block_group(block_group);
73294060 up_read(&space_info->groups_sem);
73304061 } else {
7331
- index = btrfs_bg_flags_to_raid_index(
4062
+ ffe_ctl.index = btrfs_bg_flags_to_raid_index(
73324063 block_group->flags);
73334064 btrfs_lock_block_group(block_group, delalloc);
73344065 goto have_block_group;
....@@ -7338,21 +4069,21 @@
73384069 }
73394070 }
73404071 search:
7341
- have_caching_bg = false;
7342
- if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
4072
+ ffe_ctl.have_caching_bg = false;
4073
+ if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
4074
+ ffe_ctl.index == 0)
73434075 full_search = true;
73444076 down_read(&space_info->groups_sem);
7345
- list_for_each_entry(block_group, &space_info->block_groups[index],
7346
- list) {
7347
- u64 offset;
7348
- int cached;
4077
+ list_for_each_entry(block_group,
4078
+ &space_info->block_groups[ffe_ctl.index], list) {
4079
+ struct btrfs_block_group *bg_ret;
73494080
73504081 /* If the block group is read-only, we can skip it entirely. */
73514082 if (unlikely(block_group->ro))
73524083 continue;
73534084
73544085 btrfs_grab_block_group(block_group, delalloc);
7355
- search_start = block_group->key.objectid;
4086
+ ffe_ctl.search_start = block_group->start;
73564087
73574088 /*
73584089 * this can happen if we end up cycling through all the
....@@ -7361,9 +4092,8 @@
73614092 */
73624093 if (!block_group_bits(block_group, flags)) {
73634094 u64 extra = BTRFS_BLOCK_GROUP_DUP |
7364
- BTRFS_BLOCK_GROUP_RAID1 |
7365
- BTRFS_BLOCK_GROUP_RAID5 |
7366
- BTRFS_BLOCK_GROUP_RAID6 |
4095
+ BTRFS_BLOCK_GROUP_RAID1_MASK |
4096
+ BTRFS_BLOCK_GROUP_RAID56_MASK |
73674097 BTRFS_BLOCK_GROUP_RAID10;
73684098
73694099 /*
....@@ -7384,379 +4114,101 @@
73844114 }
73854115
73864116 have_block_group:
7387
- cached = block_group_cache_done(block_group);
7388
- if (unlikely(!cached)) {
7389
- have_caching_bg = true;
7390
- ret = cache_block_group(block_group, 0);
7391
- BUG_ON(ret < 0);
4117
+ ffe_ctl.cached = btrfs_block_group_done(block_group);
4118
+ if (unlikely(!ffe_ctl.cached)) {
4119
+ ffe_ctl.have_caching_bg = true;
4120
+ ret = btrfs_cache_block_group(block_group, 0);
4121
+
4122
+ /*
4123
+ * If we get ENOMEM here or something else we want to
4124
+ * try other block groups, because it may not be fatal.
4125
+ * However if we can't find anything else we need to
4126
+ * save our return here so that we return the actual
4127
+ * error that caused problems, not ENOSPC.
4128
+ */
4129
+ if (ret < 0) {
4130
+ if (!cache_block_group_error)
4131
+ cache_block_group_error = ret;
4132
+ ret = 0;
4133
+ goto loop;
4134
+ }
73924135 ret = 0;
73934136 }
73944137
73954138 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
73964139 goto loop;
73974140
7398
- /*
7399
- * Ok we want to try and use the cluster allocator, so
7400
- * lets look there
7401
- */
7402
- if (last_ptr && use_cluster) {
7403
- struct btrfs_block_group_cache *used_block_group;
7404
- unsigned long aligned_cluster;
7405
- /*
7406
- * the refill lock keeps out other
7407
- * people trying to start a new cluster
7408
- */
7409
- used_block_group = btrfs_lock_cluster(block_group,
7410
- last_ptr,
7411
- delalloc);
7412
- if (!used_block_group)
7413
- goto refill_cluster;
7414
-
7415
- if (used_block_group != block_group &&
7416
- (used_block_group->ro ||
7417
- !block_group_bits(used_block_group, flags)))
7418
- goto release_cluster;
7419
-
7420
- offset = btrfs_alloc_from_cluster(used_block_group,
7421
- last_ptr,
7422
- num_bytes,
7423
- used_block_group->key.objectid,
7424
- &max_extent_size);
7425
- if (offset) {
7426
- /* we have a block, we're done */
7427
- spin_unlock(&last_ptr->refill_lock);
7428
- trace_btrfs_reserve_extent_cluster(
7429
- used_block_group,
7430
- search_start, num_bytes);
7431
- if (used_block_group != block_group) {
7432
- btrfs_release_block_group(block_group,
7433
- delalloc);
7434
- block_group = used_block_group;
7435
- }
7436
- goto checks;
4141
+ bg_ret = NULL;
4142
+ ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
4143
+ if (ret == 0) {
4144
+ if (bg_ret && bg_ret != block_group) {
4145
+ btrfs_release_block_group(block_group, delalloc);
4146
+ block_group = bg_ret;
74374147 }
7438
-
7439
- WARN_ON(last_ptr->block_group != used_block_group);
7440
-release_cluster:
7441
- /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7442
- * set up a new clusters, so lets just skip it
7443
- * and let the allocator find whatever block
7444
- * it can find. If we reach this point, we
7445
- * will have tried the cluster allocator
7446
- * plenty of times and not have found
7447
- * anything, so we are likely way too
7448
- * fragmented for the clustering stuff to find
7449
- * anything.
7450
- *
7451
- * However, if the cluster is taken from the
7452
- * current block group, release the cluster
7453
- * first, so that we stand a better chance of
7454
- * succeeding in the unclustered
7455
- * allocation. */
7456
- if (loop >= LOOP_NO_EMPTY_SIZE &&
7457
- used_block_group != block_group) {
7458
- spin_unlock(&last_ptr->refill_lock);
7459
- btrfs_release_block_group(used_block_group,
7460
- delalloc);
7461
- goto unclustered_alloc;
7462
- }
7463
-
7464
- /*
7465
- * this cluster didn't work out, free it and
7466
- * start over
7467
- */
7468
- btrfs_return_cluster_to_free_space(NULL, last_ptr);
7469
-
7470
- if (used_block_group != block_group)
7471
- btrfs_release_block_group(used_block_group,
7472
- delalloc);
7473
-refill_cluster:
7474
- if (loop >= LOOP_NO_EMPTY_SIZE) {
7475
- spin_unlock(&last_ptr->refill_lock);
7476
- goto unclustered_alloc;
7477
- }
7478
-
7479
- aligned_cluster = max_t(unsigned long,
7480
- empty_cluster + empty_size,
7481
- block_group->full_stripe_len);
7482
-
7483
- /* allocate a cluster in this block group */
7484
- ret = btrfs_find_space_cluster(fs_info, block_group,
7485
- last_ptr, search_start,
7486
- num_bytes,
7487
- aligned_cluster);
7488
- if (ret == 0) {
7489
- /*
7490
- * now pull our allocation out of this
7491
- * cluster
7492
- */
7493
- offset = btrfs_alloc_from_cluster(block_group,
7494
- last_ptr,
7495
- num_bytes,
7496
- search_start,
7497
- &max_extent_size);
7498
- if (offset) {
7499
- /* we found one, proceed */
7500
- spin_unlock(&last_ptr->refill_lock);
7501
- trace_btrfs_reserve_extent_cluster(
7502
- block_group, search_start,
7503
- num_bytes);
7504
- goto checks;
7505
- }
7506
- } else if (!cached && loop > LOOP_CACHING_NOWAIT
7507
- && !failed_cluster_refill) {
7508
- spin_unlock(&last_ptr->refill_lock);
7509
-
7510
- failed_cluster_refill = true;
7511
- wait_block_group_cache_progress(block_group,
7512
- num_bytes + empty_cluster + empty_size);
7513
- goto have_block_group;
7514
- }
7515
-
7516
- /*
7517
- * at this point we either didn't find a cluster
7518
- * or we weren't able to allocate a block from our
7519
- * cluster. Free the cluster we've been trying
7520
- * to use, and go to the next block group
7521
- */
7522
- btrfs_return_cluster_to_free_space(NULL, last_ptr);
7523
- spin_unlock(&last_ptr->refill_lock);
7524
- goto loop;
7525
- }
7526
-
7527
-unclustered_alloc:
7528
- /*
7529
- * We are doing an unclustered alloc, set the fragmented flag so
7530
- * we don't bother trying to setup a cluster again until we get
7531
- * more space.
7532
- */
7533
- if (unlikely(last_ptr)) {
7534
- spin_lock(&last_ptr->lock);
7535
- last_ptr->fragmented = 1;
7536
- spin_unlock(&last_ptr->lock);
7537
- }
7538
- if (cached) {
7539
- struct btrfs_free_space_ctl *ctl =
7540
- block_group->free_space_ctl;
7541
-
7542
- spin_lock(&ctl->tree_lock);
7543
- if (ctl->free_space <
7544
- num_bytes + empty_cluster + empty_size) {
7545
- max_free_space = max(max_free_space,
7546
- ctl->free_space);
7547
- spin_unlock(&ctl->tree_lock);
7548
- goto loop;
7549
- }
7550
- spin_unlock(&ctl->tree_lock);
7551
- }
7552
-
7553
- offset = btrfs_find_space_for_alloc(block_group, search_start,
7554
- num_bytes, empty_size,
7555
- &max_extent_size);
7556
- /*
7557
- * If we didn't find a chunk, and we haven't failed on this
7558
- * block group before, and this block group is in the middle of
7559
- * caching and we are ok with waiting, then go ahead and wait
7560
- * for progress to be made, and set failed_alloc to true.
7561
- *
7562
- * If failed_alloc is true then we've already waited on this
7563
- * block group once and should move on to the next block group.
7564
- */
7565
- if (!offset && !failed_alloc && !cached &&
7566
- loop > LOOP_CACHING_NOWAIT) {
7567
- wait_block_group_cache_progress(block_group,
7568
- num_bytes + empty_size);
7569
- failed_alloc = true;
4148
+ } else if (ret == -EAGAIN) {
75704149 goto have_block_group;
7571
- } else if (!offset) {
4150
+ } else if (ret > 0) {
75724151 goto loop;
75734152 }
7574
-checks:
7575
- search_start = round_up(offset, fs_info->stripesize);
4153
+
4154
+ /* Checks */
4155
+ ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
4156
+ fs_info->stripesize);
75764157
75774158 /* move on to the next group */
7578
- if (search_start + num_bytes >
7579
- block_group->key.objectid + block_group->key.offset) {
7580
- btrfs_add_free_space(block_group, offset, num_bytes);
4159
+ if (ffe_ctl.search_start + num_bytes >
4160
+ block_group->start + block_group->length) {
4161
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4162
+ num_bytes);
75814163 goto loop;
75824164 }
75834165
7584
- if (offset < search_start)
7585
- btrfs_add_free_space(block_group, offset,
7586
- search_start - offset);
4166
+ if (ffe_ctl.found_offset < ffe_ctl.search_start)
4167
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4168
+ ffe_ctl.search_start - ffe_ctl.found_offset);
75874169
75884170 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
75894171 num_bytes, delalloc);
75904172 if (ret == -EAGAIN) {
7591
- btrfs_add_free_space(block_group, offset, num_bytes);
4173
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4174
+ num_bytes);
75924175 goto loop;
75934176 }
75944177 btrfs_inc_block_group_reservations(block_group);
75954178
75964179 /* we are all good, lets return */
7597
- ins->objectid = search_start;
4180
+ ins->objectid = ffe_ctl.search_start;
75984181 ins->offset = num_bytes;
75994182
7600
- trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
4183
+ trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
4184
+ num_bytes);
76014185 btrfs_release_block_group(block_group, delalloc);
76024186 break;
76034187 loop:
7604
- failed_cluster_refill = false;
7605
- failed_alloc = false;
7606
- BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
7607
- index);
7608
- btrfs_release_block_group(block_group, delalloc);
4188
+ release_block_group(block_group, &ffe_ctl, delalloc);
76094189 cond_resched();
76104190 }
76114191 up_read(&space_info->groups_sem);
76124192
7613
- if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7614
- && !orig_have_caching_bg)
7615
- orig_have_caching_bg = true;
7616
-
7617
- if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
4193
+ ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
4194
+ if (ret > 0)
76184195 goto search;
76194196
7620
- if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7621
- goto search;
7622
-
7623
- /*
7624
- * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7625
- * caching kthreads as we move along
7626
- * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7627
- * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7628
- * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7629
- * again
7630
- */
7631
- if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7632
- index = 0;
7633
- if (loop == LOOP_CACHING_NOWAIT) {
7634
- /*
7635
- * We want to skip the LOOP_CACHING_WAIT step if we
7636
- * don't have any uncached bgs and we've already done a
7637
- * full search through.
7638
- */
7639
- if (orig_have_caching_bg || !full_search)
7640
- loop = LOOP_CACHING_WAIT;
7641
- else
7642
- loop = LOOP_ALLOC_CHUNK;
7643
- } else {
7644
- loop++;
7645
- }
7646
-
7647
- if (loop == LOOP_ALLOC_CHUNK) {
7648
- struct btrfs_trans_handle *trans;
7649
- int exist = 0;
7650
-
7651
- trans = current->journal_info;
7652
- if (trans)
7653
- exist = 1;
7654
- else
7655
- trans = btrfs_join_transaction(root);
7656
-
7657
- if (IS_ERR(trans)) {
7658
- ret = PTR_ERR(trans);
7659
- goto out;
7660
- }
7661
-
7662
- ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
7663
-
7664
- /*
7665
- * If we can't allocate a new chunk we've already looped
7666
- * through at least once, move on to the NO_EMPTY_SIZE
7667
- * case.
7668
- */
7669
- if (ret == -ENOSPC)
7670
- loop = LOOP_NO_EMPTY_SIZE;
7671
-
7672
- /*
7673
- * Do not bail out on ENOSPC since we
7674
- * can do more things.
7675
- */
7676
- if (ret < 0 && ret != -ENOSPC)
7677
- btrfs_abort_transaction(trans, ret);
7678
- else
7679
- ret = 0;
7680
- if (!exist)
7681
- btrfs_end_transaction(trans);
7682
- if (ret)
7683
- goto out;
7684
- }
7685
-
7686
- if (loop == LOOP_NO_EMPTY_SIZE) {
7687
- /*
7688
- * Don't loop again if we already have no empty_size and
7689
- * no empty_cluster.
7690
- */
7691
- if (empty_size == 0 &&
7692
- empty_cluster == 0) {
7693
- ret = -ENOSPC;
7694
- goto out;
7695
- }
7696
- empty_size = 0;
7697
- empty_cluster = 0;
7698
- }
7699
-
7700
- goto search;
7701
- } else if (!ins->objectid) {
7702
- ret = -ENOSPC;
7703
- } else if (ins->objectid) {
7704
- if (!use_cluster && last_ptr) {
7705
- spin_lock(&last_ptr->lock);
7706
- last_ptr->window_start = ins->objectid;
7707
- spin_unlock(&last_ptr->lock);
7708
- }
7709
- ret = 0;
7710
- }
7711
-out:
7712
- if (ret == -ENOSPC) {
7713
- if (!max_extent_size)
7714
- max_extent_size = max_free_space;
4197
+ if (ret == -ENOSPC && !cache_block_group_error) {
4198
+ /*
4199
+ * Use ffe_ctl->total_free_space as fallback if we can't find
4200
+ * any contiguous hole.
4201
+ */
4202
+ if (!ffe_ctl.max_extent_size)
4203
+ ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
77154204 spin_lock(&space_info->lock);
7716
- space_info->max_extent_size = max_extent_size;
4205
+ space_info->max_extent_size = ffe_ctl.max_extent_size;
77174206 spin_unlock(&space_info->lock);
7718
- ins->offset = max_extent_size;
4207
+ ins->offset = ffe_ctl.max_extent_size;
4208
+ } else if (ret == -ENOSPC) {
4209
+ ret = cache_block_group_error;
77194210 }
77204211 return ret;
7721
-}
7722
-
7723
-static void dump_space_info(struct btrfs_fs_info *fs_info,
7724
- struct btrfs_space_info *info, u64 bytes,
7725
- int dump_block_groups)
7726
-{
7727
- struct btrfs_block_group_cache *cache;
7728
- int index = 0;
7729
-
7730
- spin_lock(&info->lock);
7731
- btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7732
- info->flags,
7733
- info->total_bytes - btrfs_space_info_used(info, true),
7734
- info->full ? "" : "not ");
7735
- btrfs_info(fs_info,
7736
- "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7737
- info->total_bytes, info->bytes_used, info->bytes_pinned,
7738
- info->bytes_reserved, info->bytes_may_use,
7739
- info->bytes_readonly);
7740
- spin_unlock(&info->lock);
7741
-
7742
- if (!dump_block_groups)
7743
- return;
7744
-
7745
- down_read(&info->groups_sem);
7746
-again:
7747
- list_for_each_entry(cache, &info->block_groups[index], list) {
7748
- spin_lock(&cache->lock);
7749
- btrfs_info(fs_info,
7750
- "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
7751
- cache->key.objectid, cache->key.offset,
7752
- btrfs_block_group_used(&cache->item), cache->pinned,
7753
- cache->reserved, cache->ro ? "[readonly]" : "");
7754
- btrfs_dump_free_space(cache, bytes);
7755
- spin_unlock(&cache->lock);
7756
- }
7757
- if (++index < BTRFS_NR_RAID_TYPES)
7758
- goto again;
7759
- up_read(&info->groups_sem);
77604212 }
77614213
77624214 /*
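
The hunk above replaces the open-coded allocator retry logic (the LOOP_CACHING_NOWAIT through LOOP_NO_EMPTY_SIZE escalation that used to sit at the bottom of find_free_extent) with a call to find_free_extent_update_loop(), which keeps its state in ffe_ctl. As a rough illustration of the escalation idea only, here is a hedged, self-contained sketch; the enum values, the try_alloc_pass() helper and the "effort" model are invented and are not the btrfs implementation.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stages mirroring the escalation order in the removed code. */
enum alloc_stage {
    STAGE_CACHING_NOWAIT,   /* only fully cached groups, no waiting    */
    STAGE_CACHING_WAIT,     /* also wait on groups still being cached  */
    STAGE_ALLOC_CHUNK,      /* force-allocate a new chunk, then retry  */
    STAGE_NO_EMPTY_SIZE,    /* drop the empty_size slack, last resort  */
    STAGE_GIVE_UP,
};

/* Pretend allocation pass: succeeds only once enough effort is spent. */
static bool try_alloc_pass(enum alloc_stage stage, int effort_needed)
{
    return (int)stage >= effort_needed;
}

static int allocate_with_escalation(int effort_needed)
{
    enum alloc_stage stage = STAGE_CACHING_NOWAIT;

    while (stage < STAGE_GIVE_UP) {
        if (try_alloc_pass(stage, effort_needed)) {
            printf("allocated at stage %d\n", (int)stage);
            return 0;
        }
        /* Nothing found: escalate and search again ("goto search"). */
        stage++;
    }
    return -1;              /* stands in for -ENOSPC */
}

int main(void)
{
    /* Example: a pass that only succeeds after a forced chunk allocation. */
    return allocate_with_escalation(STAGE_ALLOC_CHUNK) ? 1 : 0;
}
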
....@@ -7817,7 +4269,7 @@
78174269 flags = get_alloc_profile_by_root(root, is_data);
78184270 again:
78194271 WARN_ON(num_bytes < fs_info->sectorsize);
7820
- ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
4272
+ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
78214273 hint_byte, ins, flags, delalloc);
78224274 if (!ret && !is_data) {
78234275 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
....@@ -7834,24 +4286,23 @@
78344286 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
78354287 struct btrfs_space_info *sinfo;
78364288
7837
- sinfo = __find_space_info(fs_info, flags);
4289
+ sinfo = btrfs_find_space_info(fs_info, flags);
78384290 btrfs_err(fs_info,
78394291 "allocation failed flags %llu, wanted %llu",
78404292 flags, num_bytes);
78414293 if (sinfo)
7842
- dump_space_info(fs_info, sinfo, num_bytes, 1);
4294
+ btrfs_dump_space_info(fs_info, sinfo,
4295
+ num_bytes, 1);
78434296 }
78444297 }
78454298
78464299 return ret;
78474300 }
78484301
7849
-static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7850
- u64 start, u64 len,
7851
- int pin, int delalloc)
4302
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4303
+ u64 start, u64 len, int delalloc)
78524304 {
7853
- struct btrfs_block_group_cache *cache;
7854
- int ret = 0;
4305
+ struct btrfs_block_group *cache;
78554306
78564307 cache = btrfs_lookup_block_group(fs_info, start);
78574308 if (!cache) {
....@@ -7860,30 +4311,30 @@
78604311 return -ENOSPC;
78614312 }
78624313
7863
- if (pin)
7864
- pin_down_extent(fs_info, cache, start, len, 1);
7865
- else {
7866
- if (btrfs_test_opt(fs_info, DISCARD))
7867
- ret = btrfs_discard_extent(fs_info, start, len, NULL);
7868
- btrfs_add_free_space(cache, start, len);
7869
- btrfs_free_reserved_bytes(cache, len, delalloc);
7870
- trace_btrfs_reserved_extent_free(fs_info, start, len);
7871
- }
4314
+ btrfs_add_free_space(cache, start, len);
4315
+ btrfs_free_reserved_bytes(cache, len, delalloc);
4316
+ trace_btrfs_reserved_extent_free(fs_info, start, len);
78724317
78734318 btrfs_put_block_group(cache);
4319
+ return 0;
4320
+}
4321
+
4322
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
4323
+ u64 len)
4324
+{
4325
+ struct btrfs_block_group *cache;
4326
+ int ret = 0;
4327
+
4328
+ cache = btrfs_lookup_block_group(trans->fs_info, start);
4329
+ if (!cache) {
4330
+ btrfs_err(trans->fs_info, "unable to find block group for %llu",
4331
+ start);
4332
+ return -ENOSPC;
4333
+ }
4334
+
4335
+ ret = pin_down_extent(trans, cache, start, len, 1);
4336
+ btrfs_put_block_group(cache);
78744337 return ret;
7875
-}
7876
-
7877
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7878
- u64 start, u64 len, int delalloc)
7879
-{
7880
- return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
7881
-}
7882
-
7883
-int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
7884
- u64 start, u64 len)
7885
-{
7886
- return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
78874338 }
78884339
78894340 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
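
The hunk above retires __btrfs_free_reserved_extent(), whose pin flag selected between two unrelated behaviours, in favour of the explicit btrfs_free_reserved_extent() and btrfs_pin_reserved_extent() entry points. A minimal sketch of that "split the boolean-flag helper" refactor in general terms; the names and printf bodies below are placeholders, not btrfs code.

#include <stdio.h>

/* Before: one helper whose flag picks between two different actions. */
static void release_resource_old(int id, int pin)
{
    if (pin)
        printf("pin %d for later cleanup\n", id);
    else
        printf("return %d to the free pool\n", id);
}

/* After: two single-purpose helpers; callers state their intent directly. */
static void free_resource(int id)
{
    printf("return %d to the free pool\n", id);
}

static void pin_resource(int id)
{
    printf("pin %d for later cleanup\n", id);
}

int main(void)
{
    release_resource_old(1, 0); /* old style: the flag decides          */
    free_resource(2);           /* new style: intent is in the name     */
    pin_resource(3);
    return 0;
}
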
....@@ -7950,7 +4401,7 @@
79504401 if (ret)
79514402 return ret;
79524403
7953
- ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
4404
+ ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
79544405 if (ret) { /* -ENOENT, logic error */
79554406 btrfs_err(fs_info, "update block group failed for %llu %llu",
79564407 ins->objectid, ins->offset);
....@@ -8040,8 +4491,8 @@
80404491 if (ret)
80414492 return ret;
80424493
8043
- ret = update_block_group(trans, fs_info, extent_key.objectid,
8044
- fs_info->nodesize, 1);
4494
+ ret = btrfs_update_block_group(trans, extent_key.objectid,
4495
+ fs_info->nodesize, 1);
80454496 if (ret) { /* -ENOENT, logic error */
80464497 btrfs_err(fs_info, "update block group failed for %llu %llu",
80474498 extent_key.objectid, extent_key.offset);
....@@ -8058,20 +4509,16 @@
80584509 u64 offset, u64 ram_bytes,
80594510 struct btrfs_key *ins)
80604511 {
8061
- int ret;
4512
+ struct btrfs_ref generic_ref = { 0 };
80624513
80634514 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
80644515
8065
- btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
8066
- root->root_key.objectid, owner, offset,
8067
- BTRFS_ADD_DELAYED_EXTENT);
4516
+ btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4517
+ ins->objectid, ins->offset, 0);
4518
+ btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
4519
+ btrfs_ref_tree_mod(root->fs_info, &generic_ref);
80684520
8069
- ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
8070
- ins->offset, 0,
8071
- root->root_key.objectid, owner,
8072
- offset, ram_bytes,
8073
- BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8074
- return ret;
4521
+ return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
80754522 }
80764523
80774524 /*
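
Several hunks in this range replace long positional argument lists (bytenr, num_bytes, parent, root, owner, offset, action, ...) with a btrfs_ref structure that is filled in by btrfs_init_generic_ref()/btrfs_init_data_ref() and then handed to both the ref-verify and delayed-ref code. A rough sketch of that "parameter object" pattern follows; all of the names and fields below are invented for illustration.

#include <stdio.h>
#include <string.h>

/* Hypothetical descriptor gathering what used to be ~8 positional args. */
struct ref_desc {
    int action;                     /* e.g. add vs. drop */
    unsigned long long bytenr;
    unsigned long long num_bytes;
    unsigned long long parent;
    unsigned long long root;
    unsigned long long owner;
    unsigned long long offset;
};

static void init_generic_ref(struct ref_desc *ref, int action,
                             unsigned long long bytenr,
                             unsigned long long num_bytes,
                             unsigned long long parent)
{
    memset(ref, 0, sizeof(*ref));
    ref->action = action;
    ref->bytenr = bytenr;
    ref->num_bytes = num_bytes;
    ref->parent = parent;
}

static void init_data_ref(struct ref_desc *ref, unsigned long long root,
                          unsigned long long owner, unsigned long long offset)
{
    ref->root = root;
    ref->owner = owner;
    ref->offset = offset;
}

/* Every consumer now takes one pointer instead of its own argument order. */
static int queue_delayed_ref(const struct ref_desc *ref)
{
    printf("queue action %d for %llu+%llu\n",
           ref->action, ref->bytenr, ref->num_bytes);
    return 0;
}

int main(void)
{
    struct ref_desc ref;

    init_generic_ref(&ref, 1, 4096, 4096, 0);
    init_data_ref(&ref, 5, 257, 0);
    return queue_delayed_ref(&ref);
}
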
....@@ -8085,7 +4532,7 @@
80854532 {
80864533 struct btrfs_fs_info *fs_info = trans->fs_info;
80874534 int ret;
8088
- struct btrfs_block_group_cache *block_group;
4535
+ struct btrfs_block_group *block_group;
80894536 struct btrfs_space_info *space_info;
80904537
80914538 /*
....@@ -8113,13 +4560,16 @@
81134560
81144561 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
81154562 offset, ins, 1);
4563
+ if (ret)
4564
+ btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
81164565 btrfs_put_block_group(block_group);
81174566 return ret;
81184567 }
81194568
81204569 static struct extent_buffer *
81214570 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8122
- u64 bytenr, int level, u64 owner)
4571
+ u64 bytenr, int level, u64 owner,
4572
+ enum btrfs_lock_nesting nest)
81234573 {
81244574 struct btrfs_fs_info *fs_info = root->fs_info;
81254575 struct extent_buffer *buf;
....@@ -8141,12 +4591,12 @@
81414591 return ERR_PTR(-EUCLEAN);
81424592 }
81434593
8144
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8145
- btrfs_tree_lock(buf);
8146
- clean_tree_block(fs_info, buf);
4594
+ btrfs_set_buffer_lockdep_class(owner, buf, level);
4595
+ __btrfs_tree_lock(buf, nest);
4596
+ btrfs_clean_tree_block(buf);
81474597 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
81484598
8149
- btrfs_set_lock_blocking(buf);
4599
+ btrfs_set_lock_blocking_write(buf);
81504600 set_extent_buffer_uptodate(buf);
81514601
81524602 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
....@@ -8155,13 +4605,13 @@
81554605 btrfs_set_header_generation(buf, trans->transid);
81564606 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
81574607 btrfs_set_header_owner(buf, owner);
8158
- write_extent_buffer_fsid(buf, fs_info->fsid);
4608
+ write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
81594609 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
81604610 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
81614611 buf->log_index = root->log_transid % 2;
81624612 /*
81634613 * we allow two log transactions at a time, use different
8164
- * EXENT bit to differentiate dirty pages.
4614
+ * EXTENT bit to differentiate dirty pages.
81654615 */
81664616 if (buf->log_index == 0)
81674617 set_extent_dirty(&root->dirty_log_pages, buf->start,
....@@ -8179,68 +4629,6 @@
81794629 return buf;
81804630 }
81814631
8182
-static struct btrfs_block_rsv *
8183
-use_block_rsv(struct btrfs_trans_handle *trans,
8184
- struct btrfs_root *root, u32 blocksize)
8185
-{
8186
- struct btrfs_fs_info *fs_info = root->fs_info;
8187
- struct btrfs_block_rsv *block_rsv;
8188
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8189
- int ret;
8190
- bool global_updated = false;
8191
-
8192
- block_rsv = get_block_rsv(trans, root);
8193
-
8194
- if (unlikely(block_rsv->size == 0))
8195
- goto try_reserve;
8196
-again:
8197
- ret = block_rsv_use_bytes(block_rsv, blocksize);
8198
- if (!ret)
8199
- return block_rsv;
8200
-
8201
- if (block_rsv->failfast)
8202
- return ERR_PTR(ret);
8203
-
8204
- if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8205
- global_updated = true;
8206
- update_global_block_rsv(fs_info);
8207
- goto again;
8208
- }
8209
-
8210
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8211
- static DEFINE_RATELIMIT_STATE(_rs,
8212
- DEFAULT_RATELIMIT_INTERVAL * 10,
8213
- /*DEFAULT_RATELIMIT_BURST*/ 1);
8214
- if (__ratelimit(&_rs))
8215
- WARN(1, KERN_DEBUG
8216
- "BTRFS: block rsv returned %d\n", ret);
8217
- }
8218
-try_reserve:
8219
- ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8220
- BTRFS_RESERVE_NO_FLUSH);
8221
- if (!ret)
8222
- return block_rsv;
8223
- /*
8224
- * If we couldn't reserve metadata bytes try and use some from
8225
- * the global reserve if its space type is the same as the global
8226
- * reservation.
8227
- */
8228
- if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8229
- block_rsv->space_info == global_rsv->space_info) {
8230
- ret = block_rsv_use_bytes(global_rsv, blocksize);
8231
- if (!ret)
8232
- return global_rsv;
8233
- }
8234
- return ERR_PTR(ret);
8235
-}
8236
-
8237
-static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8238
- struct btrfs_block_rsv *block_rsv, u32 blocksize)
8239
-{
8240
- block_rsv_add_bytes(block_rsv, blocksize, 0);
8241
- block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8242
-}
8243
-
82444632 /*
82454633 * finds a free extent and does all the dirty work required for allocation
82464634 * returns the tree buffer or an ERR_PTR on error.
....@@ -8250,13 +4638,15 @@
82504638 u64 parent, u64 root_objectid,
82514639 const struct btrfs_disk_key *key,
82524640 int level, u64 hint,
8253
- u64 empty_size)
4641
+ u64 empty_size,
4642
+ enum btrfs_lock_nesting nest)
82544643 {
82554644 struct btrfs_fs_info *fs_info = root->fs_info;
82564645 struct btrfs_key ins;
82574646 struct btrfs_block_rsv *block_rsv;
82584647 struct extent_buffer *buf;
82594648 struct btrfs_delayed_extent_op *extent_op;
4649
+ struct btrfs_ref generic_ref = { 0 };
82604650 u64 flags = 0;
82614651 int ret;
82624652 u32 blocksize = fs_info->nodesize;
....@@ -8265,14 +4655,14 @@
82654655 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
82664656 if (btrfs_is_testing(fs_info)) {
82674657 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8268
- level, root_objectid);
4658
+ level, root_objectid, nest);
82694659 if (!IS_ERR(buf))
82704660 root->alloc_bytenr += blocksize;
82714661 return buf;
82724662 }
82734663 #endif
82744664
8275
- block_rsv = use_block_rsv(trans, root, blocksize);
4665
+ block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
82764666 if (IS_ERR(block_rsv))
82774667 return ERR_CAST(block_rsv);
82784668
....@@ -8282,7 +4672,7 @@
82824672 goto out_unuse;
82834673
82844674 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
8285
- root_objectid);
4675
+ root_objectid, nest);
82864676 if (IS_ERR(buf)) {
82874677 ret = PTR_ERR(buf);
82884678 goto out_free_reserved;
....@@ -8311,14 +4701,12 @@
83114701 extent_op->is_data = false;
83124702 extent_op->level = level;
83134703
8314
- btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
8315
- root_objectid, level, 0,
8316
- BTRFS_ADD_DELAYED_EXTENT);
8317
- ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
8318
- ins.offset, parent,
8319
- root_objectid, level,
8320
- BTRFS_ADD_DELAYED_EXTENT,
8321
- extent_op, NULL, NULL);
4704
+ btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4705
+ ins.objectid, ins.offset, parent);
4706
+ generic_ref.real_root = root->root_key.objectid;
4707
+ btrfs_init_tree_ref(&generic_ref, level, root_objectid);
4708
+ btrfs_ref_tree_mod(fs_info, &generic_ref);
4709
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
83224710 if (ret)
83234711 goto out_free_delayed;
83244712 }
....@@ -8327,11 +4715,12 @@
83274715 out_free_delayed:
83284716 btrfs_free_delayed_extent_op(extent_op);
83294717 out_free_buf:
4718
+ btrfs_tree_unlock(buf);
83304719 free_extent_buffer(buf);
83314720 out_free_reserved:
83324721 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
83334722 out_unuse:
8334
- unuse_block_rsv(fs_info, block_rsv, blocksize);
4723
+ btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
83354724 return ERR_PTR(ret);
83364725 }
83374726
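
The error ladder above (out_free_delayed -> out_free_buf -> out_free_reserved -> out_unuse) now also drops the tree lock taken in btrfs_init_new_buffer() before the buffer is freed. As a generic reminder of the goto-ladder idiom used here, resources released in reverse order of acquisition, a hedged standalone sketch with made-up helpers:

#include <stdio.h>
#include <stdlib.h>

/* Pretend allocator: returns NULL when asked to simulate a failure. */
static char *alloc_step(int step, int fail_at)
{
    return step == fail_at ? NULL : malloc(16);
}

/* Acquire three resources; on failure, release only what was taken,
 * in reverse order, via a single chain of labels. */
static int do_work(int fail_at)
{
    char *a, *b, *c;
    int ret = -1;

    a = alloc_step(1, fail_at);
    if (!a)
        goto out;
    b = alloc_step(2, fail_at);
    if (!b)
        goto out_free_a;
    c = alloc_step(3, fail_at);
    if (!c)
        goto out_free_b;

    ret = 0;            /* all three acquired; use them here */
    free(c);
out_free_b:
    free(b);
out_free_a:
    free(a);
out:
    return ret;
}

int main(void)
{
    printf("%d %d\n", do_work(0), do_work(2)); /* prints "0 -1" */
    return 0;
}
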
....@@ -8339,6 +4728,8 @@
83394728 u64 refs[BTRFS_MAX_LEVEL];
83404729 u64 flags[BTRFS_MAX_LEVEL];
83414730 struct btrfs_key update_progress;
4731
+ struct btrfs_key drop_progress;
4732
+ int drop_level;
83424733 int stage;
83434734 int level;
83444735 int shared_level;
....@@ -8346,6 +4737,7 @@
83464737 int keep_locks;
83474738 int reada_slot;
83484739 int reada_count;
4740
+ int restarted;
83494741 };
83504742
83514743 #define DROP_REFERENCE 1
....@@ -8490,8 +4882,7 @@
84904882 BUG_ON(ret); /* -ENOMEM */
84914883 ret = btrfs_dec_ref(trans, root, eb, 0);
84924884 BUG_ON(ret); /* -ENOMEM */
8493
- ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
8494
- eb->len, flag,
4885
+ ret = btrfs_set_disk_extent_flags(trans, eb, flag,
84954886 btrfs_header_level(eb), 0);
84964887 BUG_ON(ret); /* -ENOMEM */
84974888 wc->flags[level] |= flag;
....@@ -8506,6 +4897,33 @@
85064897 path->locks[level] = 0;
85074898 }
85084899 return 0;
4900
+}
4901
+
4902
+/*
4903
+ * This is used to verify a ref exists for this root to deal with a bug where we
4904
+ * would have a drop_progress key that hadn't been updated properly.
4905
+ */
4906
+static int check_ref_exists(struct btrfs_trans_handle *trans,
4907
+ struct btrfs_root *root, u64 bytenr, u64 parent,
4908
+ int level)
4909
+{
4910
+ struct btrfs_path *path;
4911
+ struct btrfs_extent_inline_ref *iref;
4912
+ int ret;
4913
+
4914
+ path = btrfs_alloc_path();
4915
+ if (!path)
4916
+ return -ENOMEM;
4917
+
4918
+ ret = lookup_extent_backref(trans, path, &iref, bytenr,
4919
+ root->fs_info->nodesize, parent,
4920
+ root->root_key.objectid, level, 0);
4921
+ btrfs_free_path(path);
4922
+ if (ret == -ENOENT)
4923
+ return 0;
4924
+ if (ret < 0)
4925
+ return ret;
4926
+ return 1;
85094927 }
85104928
85114929 /*
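
check_ref_exists() above folds the backref lookup result into a three-way return: -ENOENT becomes 0 ("no ref"), other negative errors propagate, and anything else means the reference is present. A hedged sketch of that return-code mapping, using a stand-in lookup instead of the real backref search:

#include <errno.h>
#include <stdio.h>

/* Stand-in for a lookup that returns 0 on hit, -ENOENT on miss,
 * or another negative errno on failure. */
static int lookup_backref_stub(unsigned long long bytenr)
{
    if (bytenr == 0)
        return -EINVAL;     /* simulated hard error */
    if (bytenr % 2)
        return -ENOENT;     /* simulated miss       */
    return 0;               /* simulated hit        */
}

/* Returns 1 if the ref exists, 0 if it does not, negative errno on error. */
static int check_ref_exists_sketch(unsigned long long bytenr)
{
    int ret = lookup_backref_stub(bytenr);

    if (ret == -ENOENT)
        return 0;
    if (ret < 0)
        return ret;
    return 1;
}

int main(void)
{
    printf("%d %d %d\n",
           check_ref_exists_sketch(2),  /* 1: exists      */
           check_ref_exists_sketch(3),  /* 0: missing     */
           check_ref_exists_sketch(0)); /* <0: hard error */
    return 0;
}
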
....@@ -8530,9 +4948,9 @@
85304948 u64 bytenr;
85314949 u64 generation;
85324950 u64 parent;
8533
- u32 blocksize;
85344951 struct btrfs_key key;
85354952 struct btrfs_key first_key;
4953
+ struct btrfs_ref ref = { 0 };
85364954 struct extent_buffer *next;
85374955 int level = wc->level;
85384956 int reada = 0;
....@@ -8555,7 +4973,6 @@
85554973 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
85564974 btrfs_node_key_to_cpu(path->nodes[level], &first_key,
85574975 path->slots[level]);
8558
- blocksize = fs_info->nodesize;
85594976
85604977 next = find_extent_buffer(fs_info, bytenr);
85614978 if (!next) {
....@@ -8568,7 +4985,7 @@
85684985 reada = 1;
85694986 }
85704987 btrfs_tree_lock(next);
8571
- btrfs_set_lock_blocking(next);
4988
+ btrfs_set_lock_blocking_write(next);
85724989
85734990 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
85744991 &wc->refs[level - 1],
....@@ -8628,7 +5045,7 @@
86285045 return -EIO;
86295046 }
86305047 btrfs_tree_lock(next);
8631
- btrfs_set_lock_blocking(next);
5048
+ btrfs_set_lock_blocking_write(next);
86325049 }
86335050
86345051 level--;
....@@ -8664,7 +5081,30 @@
86645081 parent = 0;
86655082 }
86665083
8667
- if (need_account) {
5084
+ /*
5085
+ * If we had a drop_progress we need to verify the refs are set
5086
+ * as expected. If we find our ref then we know that from here
5087
+ * on out everything should be correct, and we can clear the
5088
+ * ->restarted flag.
5089
+ */
5090
+ if (wc->restarted) {
5091
+ ret = check_ref_exists(trans, root, bytenr, parent,
5092
+ level - 1);
5093
+ if (ret < 0)
5094
+ goto out_unlock;
5095
+ if (ret == 0)
5096
+ goto no_delete;
5097
+ ret = 0;
5098
+ wc->restarted = 0;
5099
+ }
5100
+
5101
+ /*
5102
+ * Reloc tree doesn't contribute to qgroup numbers, and we have
5103
+ * already accounted them at merge time (replace_path),
5104
+ * thus we could skip expensive subtree trace here.
5105
+ */
5106
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5107
+ need_account) {
86685108 ret = btrfs_qgroup_trace_subtree(trans, next,
86695109 generation, level - 1);
86705110 if (ret) {
....@@ -8673,13 +5113,24 @@
86735113 ret);
86745114 }
86755115 }
8676
- ret = btrfs_free_extent(trans, root, bytenr, blocksize,
8677
- parent, root->root_key.objectid,
8678
- level - 1, 0);
5116
+
5117
+ /*
5118
+ * We need to update the next key in our walk control so we can
5119
+ * update the drop_progress key accordingly. We don't care if
5120
+ * find_next_key doesn't find a key because that means we're at
5121
+ * the end and are going to clean up now.
5122
+ */
5123
+ wc->drop_level = level;
5124
+ find_next_key(path, level, &wc->drop_progress);
5125
+
5126
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5127
+ fs_info->nodesize, parent);
5128
+ btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
5129
+ ret = btrfs_free_extent(trans, &ref);
86795130 if (ret)
86805131 goto out_unlock;
86815132 }
8682
-
5133
+no_delete:
86835134 *lookup_info = 1;
86845135 ret = 1;
86855136
....@@ -8734,7 +5185,7 @@
87345185 if (!path->locks[level]) {
87355186 BUG_ON(level == 0);
87365187 btrfs_tree_lock(eb);
8737
- btrfs_set_lock_blocking(eb);
5188
+ btrfs_set_lock_blocking_write(eb);
87385189 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
87395190
87405191 ret = btrfs_lookup_extent_info(trans, fs_info,
....@@ -8765,21 +5216,23 @@
87655216 else
87665217 ret = btrfs_dec_ref(trans, root, eb, 0);
87675218 BUG_ON(ret); /* -ENOMEM */
8768
- ret = btrfs_qgroup_trace_leaf_items(trans, eb);
8769
- if (ret) {
8770
- btrfs_err_rl(fs_info,
8771
- "error %d accounting leaf items. Quota is out of sync, rescan required.",
5219
+ if (is_fstree(root->root_key.objectid)) {
5220
+ ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5221
+ if (ret) {
5222
+ btrfs_err_rl(fs_info,
5223
+ "error %d accounting leaf items, quota is out of sync, rescan required",
87725224 ret);
5225
+ }
87735226 }
87745227 }
8775
- /* make block locked assertion in clean_tree_block happy */
5228
+ /* make block locked assertion in btrfs_clean_tree_block happy */
87765229 if (!path->locks[level] &&
87775230 btrfs_header_generation(eb) == trans->transid) {
87785231 btrfs_tree_lock(eb);
8779
- btrfs_set_lock_blocking(eb);
5232
+ btrfs_set_lock_blocking_write(eb);
87805233 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
87815234 }
8782
- clean_tree_block(fs_info, eb);
5235
+ btrfs_clean_tree_block(eb);
87835236 }
87845237
87855238 if (eb == root->node) {
....@@ -8887,9 +5340,7 @@
88875340 *
88885341 * If called with for_reloc == 0, may exit early with -EAGAIN
88895342 */
8890
-int btrfs_drop_snapshot(struct btrfs_root *root,
8891
- struct btrfs_block_rsv *block_rsv, int update_ref,
8892
- int for_reloc)
5343
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
88935344 {
88945345 struct btrfs_fs_info *fs_info = root->fs_info;
88955346 struct btrfs_path *path;
....@@ -8903,7 +5354,7 @@
89035354 int level;
89045355 bool root_dropped = false;
89055356
8906
- btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
5357
+ btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
89075358
89085359 path = btrfs_alloc_path();
89095360 if (!path) {
....@@ -8918,7 +5369,14 @@
89185369 goto out;
89195370 }
89205371
8921
- trans = btrfs_start_transaction(tree_root, 0);
5372
+ /*
5373
+ * Use join to avoid potential EINTR from transaction start. See
5374
+ * wait_reserve_ticket and the whole reservation callchain.
5375
+ */
5376
+ if (for_reloc)
5377
+ trans = btrfs_join_transaction(tree_root);
5378
+ else
5379
+ trans = btrfs_start_transaction(tree_root, 0);
89225380 if (IS_ERR(trans)) {
89235381 err = PTR_ERR(trans);
89245382 goto out_free;
....@@ -8928,13 +5386,19 @@
89285386 if (err)
89295387 goto out_end_trans;
89305388
8931
- if (block_rsv)
8932
- trans->block_rsv = block_rsv;
8933
-
5389
+ /*
5390
+ * This will help us catch people modifying the fs tree while we're
5391
+ * dropping it. It is unsafe to mess with the fs tree while it's being
5392
+ * dropped as we unlock the root node and parent nodes as we walk down
5393
+ * the tree, assuming nothing will change. If something does change
5394
+ * then we'll have stale information and drop references to blocks we've
5395
+ * already dropped.
5396
+ */
5397
+ set_bit(BTRFS_ROOT_DELETING, &root->state);
89345398 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
89355399 level = btrfs_header_level(root->node);
89365400 path->nodes[level] = btrfs_lock_root_node(root);
8937
- btrfs_set_lock_blocking(path->nodes[level]);
5401
+ btrfs_set_lock_blocking_write(path->nodes[level]);
89385402 path->slots[level] = 0;
89395403 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
89405404 memset(&wc->update_progress, 0,
....@@ -8964,7 +5428,7 @@
89645428 level = btrfs_header_level(root->node);
89655429 while (1) {
89665430 btrfs_tree_lock(path->nodes[level]);
8967
- btrfs_set_lock_blocking(path->nodes[level]);
5431
+ btrfs_set_lock_blocking_write(path->nodes[level]);
89685432 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
89695433
89705434 ret = btrfs_lookup_extent_info(trans, fs_info,
....@@ -8987,6 +5451,7 @@
89875451 }
89885452 }
89895453
5454
+ wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
89905455 wc->level = level;
89915456 wc->shared_level = -1;
89925457 wc->stage = DROP_REFERENCE;
....@@ -9014,12 +5479,14 @@
90145479 }
90155480
90165481 if (wc->stage == DROP_REFERENCE) {
9017
- level = wc->level;
9018
- btrfs_node_key(path->nodes[level],
9019
- &root_item->drop_progress,
9020
- path->slots[level]);
9021
- root_item->drop_level = level;
5482
+ wc->drop_level = wc->level;
5483
+ btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5484
+ &wc->drop_progress,
5485
+ path->slots[wc->drop_level]);
90225486 }
5487
+ btrfs_cpu_key_to_disk(&root_item->drop_progress,
5488
+ &wc->drop_progress);
5489
+ root_item->drop_level = wc->drop_level;
90235490
90245491 BUG_ON(wc->level == 0);
90255492 if (btrfs_should_end_transaction(trans) ||
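
Instead of writing the node key straight into root_item->drop_progress, the hunk above keeps the cursor as a CPU-order key in the walk control and converts it once with btrfs_cpu_key_to_disk() when the progress record is updated. A generic, hedged sketch of keeping an in-memory cursor in native byte order and serialising it to a fixed little-endian layout only at persist time; the struct layouts below are invented and are not the btrfs key format.

#include <stdint.h>
#include <stdio.h>

/* In-memory cursor, kept in native byte order while the walk runs. */
struct cpu_cursor {
    uint64_t objectid;
    uint8_t type;
    uint64_t offset;
};

/* On-disk form: fixed little-endian layout, written only at commit points. */
struct disk_cursor {
    uint8_t objectid[8];
    uint8_t type;
    uint8_t offset[8];
};

static void put_le64(uint8_t *dst, uint64_t v)
{
    int i;

    for (i = 0; i < 8; i++)
        dst[i] = (uint8_t)(v >> (8 * i));
}

/* Analogous in spirit to a cpu-key-to-disk-key conversion: do the byte
 * ordering once, at the moment the progress record is persisted. */
static void cursor_to_disk(struct disk_cursor *disk, const struct cpu_cursor *cpu)
{
    put_le64(disk->objectid, cpu->objectid);
    disk->type = cpu->type;
    put_le64(disk->offset, cpu->offset);
}

int main(void)
{
    struct cpu_cursor progress = { .objectid = 256, .type = 1, .offset = 0 };
    struct disk_cursor on_disk;

    cursor_to_disk(&on_disk, &progress);
    printf("persisted objectid byte0=%u\n", (unsigned)on_disk.objectid[0]);
    return 0;
}
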
....@@ -9041,13 +5508,19 @@
90415508 goto out_free;
90425509 }
90435510
9044
- trans = btrfs_start_transaction(tree_root, 0);
5511
+ /*
5512
+ * Use join to avoid potential EINTR from transaction
5513
+ * start. See wait_reserve_ticket and the whole
5514
+ * reservation callchain.
5515
+ */
5516
+ if (for_reloc)
5517
+ trans = btrfs_join_transaction(tree_root);
5518
+ else
5519
+ trans = btrfs_start_transaction(tree_root, 0);
90455520 if (IS_ERR(trans)) {
90465521 err = PTR_ERR(trans);
90475522 goto out_free;
90485523 }
9049
- if (block_rsv)
9050
- trans->block_rsv = block_rsv;
90515524 }
90525525 }
90535526 btrfs_release_path(path);
....@@ -9079,13 +5552,18 @@
90795552 }
90805553 }
90815554
9082
- if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
5555
+ /*
5556
+ * This subvolume is going to be completely dropped, and won't be
5557
+ * recorded as dirty roots, thus pertrans meta rsv will not be freed at
5558
+ * commit transaction time. So free it here manually.
5559
+ */
5560
+ btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5561
+ btrfs_qgroup_free_meta_all_pertrans(root);
5562
+
5563
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
90835564 btrfs_add_dropped_root(trans, root);
9084
- } else {
9085
- free_extent_buffer(root->node);
9086
- free_extent_buffer(root->commit_root);
9087
- btrfs_put_fs_root(root);
9088
- }
5565
+ else
5566
+ btrfs_put_root(root);
90895567 root_dropped = true;
90905568 out_end_trans:
90915569 btrfs_end_transaction_throttle(trans);
....@@ -9138,7 +5616,7 @@
91385616
91395617 btrfs_assert_tree_locked(parent);
91405618 parent_level = btrfs_header_level(parent);
9141
- extent_buffer_get(parent);
5619
+ atomic_inc(&parent->refs);
91425620 path->nodes[parent_level] = parent;
91435621 path->slots[parent_level] = btrfs_header_nritems(parent);
91445622
....@@ -9176,184 +5654,13 @@
91765654 return ret;
91775655 }
91785656
9179
-static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9180
-{
9181
- u64 num_devices;
9182
- u64 stripped;
9183
-
9184
- /*
9185
- * if restripe for this chunk_type is on pick target profile and
9186
- * return, otherwise do the usual balance
9187
- */
9188
- stripped = get_restripe_target(fs_info, flags);
9189
- if (stripped)
9190
- return extended_to_chunk(stripped);
9191
-
9192
- num_devices = fs_info->fs_devices->rw_devices;
9193
-
9194
- stripped = BTRFS_BLOCK_GROUP_RAID0 |
9195
- BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9196
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9197
-
9198
- if (num_devices == 1) {
9199
- stripped |= BTRFS_BLOCK_GROUP_DUP;
9200
- stripped = flags & ~stripped;
9201
-
9202
- /* turn raid0 into single device chunks */
9203
- if (flags & BTRFS_BLOCK_GROUP_RAID0)
9204
- return stripped;
9205
-
9206
- /* turn mirroring into duplication */
9207
- if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9208
- BTRFS_BLOCK_GROUP_RAID10))
9209
- return stripped | BTRFS_BLOCK_GROUP_DUP;
9210
- } else {
9211
- /* they already had raid on here, just return */
9212
- if (flags & stripped)
9213
- return flags;
9214
-
9215
- stripped |= BTRFS_BLOCK_GROUP_DUP;
9216
- stripped = flags & ~stripped;
9217
-
9218
- /* switch duplicated blocks with raid1 */
9219
- if (flags & BTRFS_BLOCK_GROUP_DUP)
9220
- return stripped | BTRFS_BLOCK_GROUP_RAID1;
9221
-
9222
- /* this is drive concat, leave it alone */
9223
- }
9224
-
9225
- return flags;
9226
-}
9227
-
9228
-static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9229
-{
9230
- struct btrfs_space_info *sinfo = cache->space_info;
9231
- u64 num_bytes;
9232
- u64 min_allocable_bytes;
9233
- int ret = -ENOSPC;
9234
-
9235
- /*
9236
- * We need some metadata space and system metadata space for
9237
- * allocating chunks in some corner cases until we force to set
9238
- * it to be readonly.
9239
- */
9240
- if ((sinfo->flags &
9241
- (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9242
- !force)
9243
- min_allocable_bytes = SZ_1M;
9244
- else
9245
- min_allocable_bytes = 0;
9246
-
9247
- spin_lock(&sinfo->lock);
9248
- spin_lock(&cache->lock);
9249
-
9250
- if (cache->ro) {
9251
- cache->ro++;
9252
- ret = 0;
9253
- goto out;
9254
- }
9255
-
9256
- num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9257
- cache->bytes_super - btrfs_block_group_used(&cache->item);
9258
-
9259
- if (btrfs_space_info_used(sinfo, true) + num_bytes +
9260
- min_allocable_bytes <= sinfo->total_bytes) {
9261
- sinfo->bytes_readonly += num_bytes;
9262
- cache->ro++;
9263
- list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9264
- ret = 0;
9265
- }
9266
-out:
9267
- spin_unlock(&cache->lock);
9268
- spin_unlock(&sinfo->lock);
9269
- return ret;
9270
-}
9271
-
9272
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
9273
-
9274
-{
9275
- struct btrfs_fs_info *fs_info = cache->fs_info;
9276
- struct btrfs_trans_handle *trans;
9277
- u64 alloc_flags;
9278
- int ret;
9279
-
9280
-again:
9281
- trans = btrfs_join_transaction(fs_info->extent_root);
9282
- if (IS_ERR(trans))
9283
- return PTR_ERR(trans);
9284
-
9285
- /*
9286
- * we're not allowed to set block groups readonly after the dirty
9287
- * block groups cache has started writing. If it already started,
9288
- * back off and let this transaction commit
9289
- */
9290
- mutex_lock(&fs_info->ro_block_group_mutex);
9291
- if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9292
- u64 transid = trans->transid;
9293
-
9294
- mutex_unlock(&fs_info->ro_block_group_mutex);
9295
- btrfs_end_transaction(trans);
9296
-
9297
- ret = btrfs_wait_for_commit(fs_info, transid);
9298
- if (ret)
9299
- return ret;
9300
- goto again;
9301
- }
9302
-
9303
- /*
9304
- * if we are changing raid levels, try to allocate a corresponding
9305
- * block group with the new raid level.
9306
- */
9307
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
9308
- if (alloc_flags != cache->flags) {
9309
- ret = do_chunk_alloc(trans, alloc_flags,
9310
- CHUNK_ALLOC_FORCE);
9311
- /*
9312
- * ENOSPC is allowed here, we may have enough space
9313
- * already allocated at the new raid level to
9314
- * carry on
9315
- */
9316
- if (ret == -ENOSPC)
9317
- ret = 0;
9318
- if (ret < 0)
9319
- goto out;
9320
- }
9321
-
9322
- ret = inc_block_group_ro(cache, 0);
9323
- if (!ret)
9324
- goto out;
9325
- alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9326
- ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9327
- if (ret < 0)
9328
- goto out;
9329
- ret = inc_block_group_ro(cache, 0);
9330
-out:
9331
- if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9332
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
9333
- mutex_lock(&fs_info->chunk_mutex);
9334
- check_system_chunk(trans, alloc_flags);
9335
- mutex_unlock(&fs_info->chunk_mutex);
9336
- }
9337
- mutex_unlock(&fs_info->ro_block_group_mutex);
9338
-
9339
- btrfs_end_transaction(trans);
9340
- return ret;
9341
-}
9342
-
9343
-int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
9344
-{
9345
- u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
9346
-
9347
- return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9348
-}
9349
-
93505657 /*
93515658 * helper to account the unused space of all the readonly block group in the
93525659 * space_info. takes mirrors into account.
93535660 */
93545661 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
93555662 {
9356
- struct btrfs_block_group_cache *block_group;
5663
+ struct btrfs_block_group *block_group;
93575664 u64 free_bytes = 0;
93585665 int factor;
93595666
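
The removed update_block_group_flags() above chose which profile a block group should be converted to when balancing against the available device count: striping drops to single and mirroring drops to dup on a one-device filesystem, while dup is promoted back to raid1 when more devices exist. A small bit-flag sketch of that kind of decision table; the PROF_* constants and helper below are invented and are not the btrfs block-group flag values.

#include <stdio.h>

/* Invented profile bits, standing in for block-group type flags. */
#define PROF_SINGLE 0x01u
#define PROF_DUP    0x02u
#define PROF_RAID0  0x04u
#define PROF_RAID1  0x08u
#define PROF_RAID10 0x10u

/* Pick a profile that the current device count can still satisfy. */
static unsigned int convert_profile(unsigned int flags, int num_devices)
{
    if (num_devices == 1) {
        if (flags & PROF_RAID0)
            return PROF_SINGLE;         /* striping -> single  */
        if (flags & (PROF_RAID1 | PROF_RAID10))
            return PROF_DUP;            /* mirroring -> dup    */
    } else {
        if (flags & PROF_DUP)
            return PROF_RAID1;          /* dup -> real mirror  */
    }
    return flags;                       /* leave it alone      */
}

int main(void)
{
    printf("%#x\n", convert_profile(PROF_RAID1, 1)); /* 0x2 */
    printf("%#x\n", convert_profile(PROF_DUP, 3));   /* 0x8 */
    return 0;
}
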
....@@ -9371,1412 +5678,14 @@
93715678 }
93725679
93735680 factor = btrfs_bg_type_to_factor(block_group->flags);
9374
- free_bytes += (block_group->key.offset -
9375
- btrfs_block_group_used(&block_group->item)) *
9376
- factor;
5681
+ free_bytes += (block_group->length -
5682
+ block_group->used) * factor;
93775683
93785684 spin_unlock(&block_group->lock);
93795685 }
93805686 spin_unlock(&sinfo->lock);
93815687
93825688 return free_bytes;
9383
-}
9384
-
9385
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9386
-{
9387
- struct btrfs_space_info *sinfo = cache->space_info;
9388
- u64 num_bytes;
9389
-
9390
- BUG_ON(!cache->ro);
9391
-
9392
- spin_lock(&sinfo->lock);
9393
- spin_lock(&cache->lock);
9394
- if (!--cache->ro) {
9395
- num_bytes = cache->key.offset - cache->reserved -
9396
- cache->pinned - cache->bytes_super -
9397
- btrfs_block_group_used(&cache->item);
9398
- sinfo->bytes_readonly -= num_bytes;
9399
- list_del_init(&cache->ro_list);
9400
- }
9401
- spin_unlock(&cache->lock);
9402
- spin_unlock(&sinfo->lock);
9403
-}
9404
-
9405
-/*
9406
- * checks to see if its even possible to relocate this block group.
9407
- *
9408
- * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9409
- * ok to go ahead and try.
9410
- */
9411
-int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9412
-{
9413
- struct btrfs_root *root = fs_info->extent_root;
9414
- struct btrfs_block_group_cache *block_group;
9415
- struct btrfs_space_info *space_info;
9416
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9417
- struct btrfs_device *device;
9418
- struct btrfs_trans_handle *trans;
9419
- u64 min_free;
9420
- u64 dev_min = 1;
9421
- u64 dev_nr = 0;
9422
- u64 target;
9423
- int debug;
9424
- int index;
9425
- int full = 0;
9426
- int ret = 0;
9427
-
9428
- debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9429
-
9430
- block_group = btrfs_lookup_block_group(fs_info, bytenr);
9431
-
9432
- /* odd, couldn't find the block group, leave it alone */
9433
- if (!block_group) {
9434
- if (debug)
9435
- btrfs_warn(fs_info,
9436
- "can't find block group for bytenr %llu",
9437
- bytenr);
9438
- return -1;
9439
- }
9440
-
9441
- min_free = btrfs_block_group_used(&block_group->item);
9442
-
9443
- /* no bytes used, we're good */
9444
- if (!min_free)
9445
- goto out;
9446
-
9447
- space_info = block_group->space_info;
9448
- spin_lock(&space_info->lock);
9449
-
9450
- full = space_info->full;
9451
-
9452
- /*
9453
- * if this is the last block group we have in this space, we can't
9454
- * relocate it unless we're able to allocate a new chunk below.
9455
- *
9456
- * Otherwise, we need to make sure we have room in the space to handle
9457
- * all of the extents from this block group. If we can, we're good
9458
- */
9459
- if ((space_info->total_bytes != block_group->key.offset) &&
9460
- (btrfs_space_info_used(space_info, false) + min_free <
9461
- space_info->total_bytes)) {
9462
- spin_unlock(&space_info->lock);
9463
- goto out;
9464
- }
9465
- spin_unlock(&space_info->lock);
9466
-
9467
- /*
9468
- * ok we don't have enough space, but maybe we have free space on our
9469
- * devices to allocate new chunks for relocation, so loop through our
9470
- * alloc devices and guess if we have enough space. if this block
9471
- * group is going to be restriped, run checks against the target
9472
- * profile instead of the current one.
9473
- */
9474
- ret = -1;
9475
-
9476
- /*
9477
- * index:
9478
- * 0: raid10
9479
- * 1: raid1
9480
- * 2: dup
9481
- * 3: raid0
9482
- * 4: single
9483
- */
9484
- target = get_restripe_target(fs_info, block_group->flags);
9485
- if (target) {
9486
- index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9487
- } else {
9488
- /*
9489
- * this is just a balance, so if we were marked as full
9490
- * we know there is no space for a new chunk
9491
- */
9492
- if (full) {
9493
- if (debug)
9494
- btrfs_warn(fs_info,
9495
- "no space to alloc new chunk for block group %llu",
9496
- block_group->key.objectid);
9497
- goto out;
9498
- }
9499
-
9500
- index = btrfs_bg_flags_to_raid_index(block_group->flags);
9501
- }
9502
-
9503
- if (index == BTRFS_RAID_RAID10) {
9504
- dev_min = 4;
9505
- /* Divide by 2 */
9506
- min_free >>= 1;
9507
- } else if (index == BTRFS_RAID_RAID1) {
9508
- dev_min = 2;
9509
- } else if (index == BTRFS_RAID_DUP) {
9510
- /* Multiply by 2 */
9511
- min_free <<= 1;
9512
- } else if (index == BTRFS_RAID_RAID0) {
9513
- dev_min = fs_devices->rw_devices;
9514
- min_free = div64_u64(min_free, dev_min);
9515
- }
9516
-
9517
- /* We need to do this so that we can look at pending chunks */
9518
- trans = btrfs_join_transaction(root);
9519
- if (IS_ERR(trans)) {
9520
- ret = PTR_ERR(trans);
9521
- goto out;
9522
- }
9523
-
9524
- mutex_lock(&fs_info->chunk_mutex);
9525
- list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9526
- u64 dev_offset;
9527
-
9528
- /*
9529
- * check to make sure we can actually find a chunk with enough
9530
- * space to fit our block group in.
9531
- */
9532
- if (device->total_bytes > device->bytes_used + min_free &&
9533
- !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
9534
- ret = find_free_dev_extent(trans, device, min_free,
9535
- &dev_offset, NULL);
9536
- if (!ret)
9537
- dev_nr++;
9538
-
9539
- if (dev_nr >= dev_min)
9540
- break;
9541
-
9542
- ret = -1;
9543
- }
9544
- }
9545
- if (debug && ret == -1)
9546
- btrfs_warn(fs_info,
9547
- "no space to allocate a new chunk for block group %llu",
9548
- block_group->key.objectid);
9549
- mutex_unlock(&fs_info->chunk_mutex);
9550
- btrfs_end_transaction(trans);
9551
-out:
9552
- btrfs_put_block_group(block_group);
9553
- return ret;
9554
-}
9555
-
9556
-static int find_first_block_group(struct btrfs_fs_info *fs_info,
9557
- struct btrfs_path *path,
9558
- struct btrfs_key *key)
9559
-{
9560
- struct btrfs_root *root = fs_info->extent_root;
9561
- int ret = 0;
9562
- struct btrfs_key found_key;
9563
- struct extent_buffer *leaf;
9564
- struct btrfs_block_group_item bg;
9565
- u64 flags;
9566
- int slot;
9567
-
9568
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9569
- if (ret < 0)
9570
- goto out;
9571
-
9572
- while (1) {
9573
- slot = path->slots[0];
9574
- leaf = path->nodes[0];
9575
- if (slot >= btrfs_header_nritems(leaf)) {
9576
- ret = btrfs_next_leaf(root, path);
9577
- if (ret == 0)
9578
- continue;
9579
- if (ret < 0)
9580
- goto out;
9581
- break;
9582
- }
9583
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
9584
-
9585
- if (found_key.objectid >= key->objectid &&
9586
- found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9587
- struct extent_map_tree *em_tree;
9588
- struct extent_map *em;
9589
-
9590
- em_tree = &root->fs_info->mapping_tree.map_tree;
9591
- read_lock(&em_tree->lock);
9592
- em = lookup_extent_mapping(em_tree, found_key.objectid,
9593
- found_key.offset);
9594
- read_unlock(&em_tree->lock);
9595
- if (!em) {
9596
- btrfs_err(fs_info,
9597
- "logical %llu len %llu found bg but no related chunk",
9598
- found_key.objectid, found_key.offset);
9599
- ret = -ENOENT;
9600
- } else if (em->start != found_key.objectid ||
9601
- em->len != found_key.offset) {
9602
- btrfs_err(fs_info,
9603
- "block group %llu len %llu mismatch with chunk %llu len %llu",
9604
- found_key.objectid, found_key.offset,
9605
- em->start, em->len);
9606
- ret = -EUCLEAN;
9607
- } else {
9608
- read_extent_buffer(leaf, &bg,
9609
- btrfs_item_ptr_offset(leaf, slot),
9610
- sizeof(bg));
9611
- flags = btrfs_block_group_flags(&bg) &
9612
- BTRFS_BLOCK_GROUP_TYPE_MASK;
9613
-
9614
- if (flags != (em->map_lookup->type &
9615
- BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9616
- btrfs_err(fs_info,
9617
-"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
9618
- found_key.objectid,
9619
- found_key.offset, flags,
9620
- (BTRFS_BLOCK_GROUP_TYPE_MASK &
9621
- em->map_lookup->type));
9622
- ret = -EUCLEAN;
9623
- } else {
9624
- ret = 0;
9625
- }
9626
- }
9627
- free_extent_map(em);
9628
- goto out;
9629
- }
9630
- path->slots[0]++;
9631
- }
9632
-out:
9633
- return ret;
9634
-}
9635
-
9636
-void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9637
-{
9638
- struct btrfs_block_group_cache *block_group;
9639
- u64 last = 0;
9640
-
9641
- while (1) {
9642
- struct inode *inode;
9643
-
9644
- block_group = btrfs_lookup_first_block_group(info, last);
9645
- while (block_group) {
9646
- wait_block_group_cache_done(block_group);
9647
- spin_lock(&block_group->lock);
9648
- if (block_group->iref)
9649
- break;
9650
- spin_unlock(&block_group->lock);
9651
- block_group = next_block_group(info, block_group);
9652
- }
9653
- if (!block_group) {
9654
- if (last == 0)
9655
- break;
9656
- last = 0;
9657
- continue;
9658
- }
9659
-
9660
- inode = block_group->inode;
9661
- block_group->iref = 0;
9662
- block_group->inode = NULL;
9663
- spin_unlock(&block_group->lock);
9664
- ASSERT(block_group->io_ctl.inode == NULL);
9665
- iput(inode);
9666
- last = block_group->key.objectid + block_group->key.offset;
9667
- btrfs_put_block_group(block_group);
9668
- }
9669
-}
9670
-
9671
-/*
9672
- * Must be called only after stopping all workers, since we could have block
9673
- * group caching kthreads running, and therefore they could race with us if we
9674
- * freed the block groups before stopping them.
9675
- */
9676
-int btrfs_free_block_groups(struct btrfs_fs_info *info)
9677
-{
9678
- struct btrfs_block_group_cache *block_group;
9679
- struct btrfs_space_info *space_info;
9680
- struct btrfs_caching_control *caching_ctl;
9681
- struct rb_node *n;
9682
-
9683
- down_write(&info->commit_root_sem);
9684
- while (!list_empty(&info->caching_block_groups)) {
9685
- caching_ctl = list_entry(info->caching_block_groups.next,
9686
- struct btrfs_caching_control, list);
9687
- list_del(&caching_ctl->list);
9688
- put_caching_control(caching_ctl);
9689
- }
9690
- up_write(&info->commit_root_sem);
9691
-
9692
- spin_lock(&info->unused_bgs_lock);
9693
- while (!list_empty(&info->unused_bgs)) {
9694
- block_group = list_first_entry(&info->unused_bgs,
9695
- struct btrfs_block_group_cache,
9696
- bg_list);
9697
- list_del_init(&block_group->bg_list);
9698
- btrfs_put_block_group(block_group);
9699
- }
9700
- spin_unlock(&info->unused_bgs_lock);
9701
-
9702
- spin_lock(&info->block_group_cache_lock);
9703
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9704
- block_group = rb_entry(n, struct btrfs_block_group_cache,
9705
- cache_node);
9706
- rb_erase(&block_group->cache_node,
9707
- &info->block_group_cache_tree);
9708
- RB_CLEAR_NODE(&block_group->cache_node);
9709
- spin_unlock(&info->block_group_cache_lock);
9710
-
9711
- down_write(&block_group->space_info->groups_sem);
9712
- list_del(&block_group->list);
9713
- up_write(&block_group->space_info->groups_sem);
9714
-
9715
- /*
9716
- * We haven't cached this block group, which means we could
9717
- * possibly have excluded extents on this block group.
9718
- */
9719
- if (block_group->cached == BTRFS_CACHE_NO ||
9720
- block_group->cached == BTRFS_CACHE_ERROR)
9721
- free_excluded_extents(block_group);
9722
-
9723
- btrfs_remove_free_space_cache(block_group);
9724
- ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
9725
- ASSERT(list_empty(&block_group->dirty_list));
9726
- ASSERT(list_empty(&block_group->io_list));
9727
- ASSERT(list_empty(&block_group->bg_list));
9728
- ASSERT(atomic_read(&block_group->count) == 1);
9729
- btrfs_put_block_group(block_group);
9730
-
9731
- spin_lock(&info->block_group_cache_lock);
9732
- }
9733
- spin_unlock(&info->block_group_cache_lock);
9734
-
9735
- /* now that all the block groups are freed, go through and
9736
- * free all the space_info structs. This is only called during
9737
- * the final stages of unmount, and so we know nobody is
9738
- * using them. We call synchronize_rcu() once before we start,
9739
- * just to be on the safe side.
9740
- */
9741
- synchronize_rcu();
9742
-
9743
- release_global_block_rsv(info);
9744
-
9745
- while (!list_empty(&info->space_info)) {
9746
- int i;
9747
-
9748
- space_info = list_entry(info->space_info.next,
9749
- struct btrfs_space_info,
9750
- list);
9751
-
9752
- /*
9753
- * Do not hide this behind enospc_debug, this is actually
9754
- * important and indicates a real bug if this happens.
9755
- */
9756
- if (WARN_ON(space_info->bytes_pinned > 0 ||
9757
- space_info->bytes_reserved > 0 ||
9758
- space_info->bytes_may_use > 0))
9759
- dump_space_info(info, space_info, 0, 0);
9760
- list_del(&space_info->list);
9761
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9762
- struct kobject *kobj;
9763
- kobj = space_info->block_group_kobjs[i];
9764
- space_info->block_group_kobjs[i] = NULL;
9765
- if (kobj) {
9766
- kobject_del(kobj);
9767
- kobject_put(kobj);
9768
- }
9769
- }
9770
- kobject_del(&space_info->kobj);
9771
- kobject_put(&space_info->kobj);
9772
- }
9773
- return 0;
9774
-}
9775
-
9776
-/* link_block_group will queue up kobjects to add when we're reclaim-safe */
9777
-void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
9778
-{
9779
- struct btrfs_space_info *space_info;
9780
- struct raid_kobject *rkobj;
9781
- LIST_HEAD(list);
9782
- int index;
9783
- int ret = 0;
9784
-
9785
- spin_lock(&fs_info->pending_raid_kobjs_lock);
9786
- list_splice_init(&fs_info->pending_raid_kobjs, &list);
9787
- spin_unlock(&fs_info->pending_raid_kobjs_lock);
9788
-
9789
- list_for_each_entry(rkobj, &list, list) {
9790
- space_info = __find_space_info(fs_info, rkobj->flags);
9791
- index = btrfs_bg_flags_to_raid_index(rkobj->flags);
9792
-
9793
- ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9794
- "%s", get_raid_name(index));
9795
- if (ret) {
9796
- kobject_put(&rkobj->kobj);
9797
- break;
9798
- }
9799
- }
9800
- if (ret)
9801
- btrfs_warn(fs_info,
9802
- "failed to add kobject for block cache, ignoring");
9803
-}
9804
-
9805
-static void link_block_group(struct btrfs_block_group_cache *cache)
9806
-{
9807
- struct btrfs_space_info *space_info = cache->space_info;
9808
- struct btrfs_fs_info *fs_info = cache->fs_info;
9809
- int index = btrfs_bg_flags_to_raid_index(cache->flags);
9810
- bool first = false;
9811
-
9812
- down_write(&space_info->groups_sem);
9813
- if (list_empty(&space_info->block_groups[index]))
9814
- first = true;
9815
- list_add_tail(&cache->list, &space_info->block_groups[index]);
9816
- up_write(&space_info->groups_sem);
9817
-
9818
- if (first) {
9819
- struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9820
- if (!rkobj) {
9821
- btrfs_warn(cache->fs_info,
9822
- "couldn't alloc memory for raid level kobject");
9823
- return;
9824
- }
9825
- rkobj->flags = cache->flags;
9826
- kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9827
-
9828
- spin_lock(&fs_info->pending_raid_kobjs_lock);
9829
- list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
9830
- spin_unlock(&fs_info->pending_raid_kobjs_lock);
9831
- space_info->block_group_kobjs[index] = &rkobj->kobj;
9832
- }
9833
-}
9834
-
9835
-static struct btrfs_block_group_cache *
9836
-btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
9837
- u64 start, u64 size)
9838
-{
9839
- struct btrfs_block_group_cache *cache;
9840
-
9841
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
9842
- if (!cache)
9843
- return NULL;
9844
-
9845
- cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9846
- GFP_NOFS);
9847
- if (!cache->free_space_ctl) {
9848
- kfree(cache);
9849
- return NULL;
9850
- }
9851
-
9852
- cache->key.objectid = start;
9853
- cache->key.offset = size;
9854
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9855
-
9856
- cache->fs_info = fs_info;
9857
- cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
9858
- set_free_space_tree_thresholds(cache);
9859
-
9860
- atomic_set(&cache->count, 1);
9861
- spin_lock_init(&cache->lock);
9862
- init_rwsem(&cache->data_rwsem);
9863
- INIT_LIST_HEAD(&cache->list);
9864
- INIT_LIST_HEAD(&cache->cluster_list);
9865
- INIT_LIST_HEAD(&cache->bg_list);
9866
- INIT_LIST_HEAD(&cache->ro_list);
9867
- INIT_LIST_HEAD(&cache->dirty_list);
9868
- INIT_LIST_HEAD(&cache->io_list);
9869
- btrfs_init_free_space_ctl(cache);
9870
- atomic_set(&cache->trimming, 0);
9871
- mutex_init(&cache->free_space_lock);
9872
- btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
9873
-
9874
- return cache;
9875
-}
9876
-
9877
-
9878
-/*
9879
- * Iterate all chunks and verify that each of them has the corresponding block
9880
- * group
9881
- */
9882
-static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
9883
-{
9884
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
9885
- struct extent_map *em;
9886
- struct btrfs_block_group_cache *bg;
9887
- u64 start = 0;
9888
- int ret = 0;
9889
-
9890
- while (1) {
9891
- read_lock(&map_tree->map_tree.lock);
9892
- /*
9893
- * lookup_extent_mapping will return the first extent map
9894
- * intersecting the range, so setting @len to 1 is enough to
9895
- * get the first chunk.
9896
- */
9897
- em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
9898
- read_unlock(&map_tree->map_tree.lock);
9899
- if (!em)
9900
- break;
9901
-
9902
- bg = btrfs_lookup_block_group(fs_info, em->start);
9903
- if (!bg) {
9904
- btrfs_err(fs_info,
9905
- "chunk start=%llu len=%llu doesn't have corresponding block group",
9906
- em->start, em->len);
9907
- ret = -EUCLEAN;
9908
- free_extent_map(em);
9909
- break;
9910
- }
9911
- if (bg->key.objectid != em->start ||
9912
- bg->key.offset != em->len ||
9913
- (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
9914
- (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9915
- btrfs_err(fs_info,
9916
-"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
9917
- em->start, em->len,
9918
- em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
9919
- bg->key.objectid, bg->key.offset,
9920
- bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
9921
- ret = -EUCLEAN;
9922
- free_extent_map(em);
9923
- btrfs_put_block_group(bg);
9924
- break;
9925
- }
9926
- start = em->start + em->len;
9927
- free_extent_map(em);
9928
- btrfs_put_block_group(bg);
9929
- }
9930
- return ret;
9931
-}
9932
-
9933
-int btrfs_read_block_groups(struct btrfs_fs_info *info)
9934
-{
9935
- struct btrfs_path *path;
9936
- int ret;
9937
- struct btrfs_block_group_cache *cache;
9938
- struct btrfs_space_info *space_info;
9939
- struct btrfs_key key;
9940
- struct btrfs_key found_key;
9941
- struct extent_buffer *leaf;
9942
- int need_clear = 0;
9943
- u64 cache_gen;
9944
- u64 feature;
9945
- int mixed;
9946
-
9947
- feature = btrfs_super_incompat_flags(info->super_copy);
9948
- mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
9949
-
9950
- key.objectid = 0;
9951
- key.offset = 0;
9952
- key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9953
- path = btrfs_alloc_path();
9954
- if (!path)
9955
- return -ENOMEM;
9956
- path->reada = READA_FORWARD;
9957
-
9958
- cache_gen = btrfs_super_cache_generation(info->super_copy);
9959
- if (btrfs_test_opt(info, SPACE_CACHE) &&
9960
- btrfs_super_generation(info->super_copy) != cache_gen)
9961
- need_clear = 1;
9962
- if (btrfs_test_opt(info, CLEAR_CACHE))
9963
- need_clear = 1;
9964
-
9965
- while (1) {
9966
- ret = find_first_block_group(info, path, &key);
9967
- if (ret > 0)
9968
- break;
9969
- if (ret != 0)
9970
- goto error;
9971
-
9972
- leaf = path->nodes[0];
9973
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9974
-
9975
- cache = btrfs_create_block_group_cache(info, found_key.objectid,
9976
- found_key.offset);
9977
- if (!cache) {
9978
- ret = -ENOMEM;
9979
- goto error;
9980
- }
9981
-
9982
- if (need_clear) {
9983
- /*
9984
- * When we mount with old space cache, we need to
9985
- * set BTRFS_DC_CLEAR and set dirty flag.
9986
- *
9987
- * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
9988
- * truncate the old free space cache inode and
9989
- * setup a new one.
9990
- * b) Setting 'dirty flag' makes sure that we flush
9991
- * the new space cache info onto disk.
9992
- */
9993
- if (btrfs_test_opt(info, SPACE_CACHE))
9994
- cache->disk_cache_state = BTRFS_DC_CLEAR;
9995
- }
9996
-
9997
- read_extent_buffer(leaf, &cache->item,
9998
- btrfs_item_ptr_offset(leaf, path->slots[0]),
9999
- sizeof(cache->item));
10000
- cache->flags = btrfs_block_group_flags(&cache->item);
10001
- if (!mixed &&
10002
- ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10003
- (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10004
- btrfs_err(info,
10005
-"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10006
- cache->key.objectid);
10007
- btrfs_put_block_group(cache);
10008
- ret = -EINVAL;
10009
- goto error;
10010
- }
10011
-
10012
- key.objectid = found_key.objectid + found_key.offset;
10013
- btrfs_release_path(path);
10014
-
10015
- /*
10016
- * We need to exclude the super stripes now so that the space
10017
- * info has super bytes accounted for, otherwise we'll think
10018
- * we have more space than we actually do.
10019
- */
10020
- ret = exclude_super_stripes(cache);
10021
- if (ret) {
10022
- /*
10023
- * We may have excluded something, so call this just in
10024
- * case.
10025
- */
10026
- free_excluded_extents(cache);
10027
- btrfs_put_block_group(cache);
10028
- goto error;
10029
- }
10030
-
10031
- /*
10032
- * check for two cases, either we are full, and therefore
10033
- * don't need to bother with the caching work since we won't
10034
- * find any space, or we are empty, and we can just add all
10035
- * the space in and be done with it. This saves us _alot_ of
10036
- * time, particularly in the full case.
10037
- */
10038
- if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10039
- cache->last_byte_to_unpin = (u64)-1;
10040
- cache->cached = BTRFS_CACHE_FINISHED;
10041
- free_excluded_extents(cache);
10042
- } else if (btrfs_block_group_used(&cache->item) == 0) {
10043
- cache->last_byte_to_unpin = (u64)-1;
10044
- cache->cached = BTRFS_CACHE_FINISHED;
10045
- add_new_free_space(cache, found_key.objectid,
10046
- found_key.objectid +
10047
- found_key.offset);
10048
- free_excluded_extents(cache);
10049
- }
10050
-
10051
- ret = btrfs_add_block_group_cache(info, cache);
10052
- if (ret) {
10053
- btrfs_remove_free_space_cache(cache);
10054
- btrfs_put_block_group(cache);
10055
- goto error;
10056
- }
10057
-
10058
- trace_btrfs_add_block_group(info, cache, 0);
10059
- update_space_info(info, cache->flags, found_key.offset,
10060
- btrfs_block_group_used(&cache->item),
10061
- cache->bytes_super, &space_info);
10062
-
10063
- cache->space_info = space_info;
10064
-
10065
- link_block_group(cache);
10066
-
10067
- set_avail_alloc_bits(info, cache->flags);
10068
- if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10069
- inc_block_group_ro(cache, 1);
10070
- } else if (btrfs_block_group_used(&cache->item) == 0) {
10071
- ASSERT(list_empty(&cache->bg_list));
10072
- btrfs_mark_bg_unused(cache);
10073
- }
10074
- }
10075
-
10076
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
10077
- if (!(get_alloc_profile(info, space_info->flags) &
10078
- (BTRFS_BLOCK_GROUP_RAID10 |
10079
- BTRFS_BLOCK_GROUP_RAID1 |
10080
- BTRFS_BLOCK_GROUP_RAID5 |
10081
- BTRFS_BLOCK_GROUP_RAID6 |
10082
- BTRFS_BLOCK_GROUP_DUP)))
10083
- continue;
10084
- /*
10085
- * avoid allocating from un-mirrored block groups if there are
10086
- * mirrored block groups.
10087
- */
10088
- list_for_each_entry(cache,
10089
- &space_info->block_groups[BTRFS_RAID_RAID0],
10090
- list)
10091
- inc_block_group_ro(cache, 1);
10092
- list_for_each_entry(cache,
10093
- &space_info->block_groups[BTRFS_RAID_SINGLE],
10094
- list)
10095
- inc_block_group_ro(cache, 1);
10096
- }
10097
-
10098
- btrfs_add_raid_kobjects(info);
10099
- init_global_block_rsv(info);
10100
- ret = check_chunk_block_group_mappings(info);
10101
-error:
10102
- btrfs_free_path(path);
10103
- return ret;
10104
-}
10105
-
10106
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10107
-{
10108
- struct btrfs_fs_info *fs_info = trans->fs_info;
10109
- struct btrfs_block_group_cache *block_group;
10110
- struct btrfs_root *extent_root = fs_info->extent_root;
10111
- struct btrfs_block_group_item item;
10112
- struct btrfs_key key;
10113
- int ret = 0;
10114
-
10115
- if (!trans->can_flush_pending_bgs)
10116
- return;
10117
-
10118
- while (!list_empty(&trans->new_bgs)) {
10119
- block_group = list_first_entry(&trans->new_bgs,
10120
- struct btrfs_block_group_cache,
10121
- bg_list);
10122
- if (ret)
10123
- goto next;
10124
-
10125
- spin_lock(&block_group->lock);
10126
- memcpy(&item, &block_group->item, sizeof(item));
10127
- memcpy(&key, &block_group->key, sizeof(key));
10128
- spin_unlock(&block_group->lock);
10129
-
10130
- ret = btrfs_insert_item(trans, extent_root, &key, &item,
10131
- sizeof(item));
10132
- if (ret)
10133
- btrfs_abort_transaction(trans, ret);
10134
- ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10135
- if (ret)
10136
- btrfs_abort_transaction(trans, ret);
10137
- add_block_group_free_space(trans, block_group);
10138
- /* already aborted the transaction if it failed. */
10139
-next:
10140
- list_del_init(&block_group->bg_list);
10141
- }
10142
- btrfs_trans_release_chunk_metadata(trans);
10143
-}
10144
-
10145
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10146
- u64 type, u64 chunk_offset, u64 size)
10147
-{
10148
- struct btrfs_fs_info *fs_info = trans->fs_info;
10149
- struct btrfs_block_group_cache *cache;
10150
- int ret;
10151
-
10152
- btrfs_set_log_full_commit(fs_info, trans);
10153
-
10154
- cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10155
- if (!cache)
10156
- return -ENOMEM;
10157
-
10158
- btrfs_set_block_group_used(&cache->item, bytes_used);
10159
- btrfs_set_block_group_chunk_objectid(&cache->item,
10160
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10161
- btrfs_set_block_group_flags(&cache->item, type);
10162
-
10163
- cache->flags = type;
10164
- cache->last_byte_to_unpin = (u64)-1;
10165
- cache->cached = BTRFS_CACHE_FINISHED;
10166
- cache->needs_free_space = 1;
10167
- ret = exclude_super_stripes(cache);
10168
- if (ret) {
10169
- /*
10170
- * We may have excluded something, so call this just in
10171
- * case.
10172
- */
10173
- free_excluded_extents(cache);
10174
- btrfs_put_block_group(cache);
10175
- return ret;
10176
- }
10177
-
10178
- add_new_free_space(cache, chunk_offset, chunk_offset + size);
10179
-
10180
- free_excluded_extents(cache);
10181
-
10182
-#ifdef CONFIG_BTRFS_DEBUG
10183
- if (btrfs_should_fragment_free_space(cache)) {
10184
- u64 new_bytes_used = size - bytes_used;
10185
-
10186
- bytes_used += new_bytes_used >> 1;
10187
- fragment_free_space(cache);
10188
- }
10189
-#endif
10190
- /*
10191
- * Ensure the corresponding space_info object is created and
10192
- * assigned to our block group. We want our bg to be added to the rbtree
10193
- * with its ->space_info set.
10194
- */
10195
- cache->space_info = __find_space_info(fs_info, cache->flags);
10196
- ASSERT(cache->space_info);
10197
-
10198
- ret = btrfs_add_block_group_cache(fs_info, cache);
10199
- if (ret) {
10200
- btrfs_remove_free_space_cache(cache);
10201
- btrfs_put_block_group(cache);
10202
- return ret;
10203
- }
10204
-
10205
- /*
10206
- * Now that our block group has its ->space_info set and is inserted in
10207
- * the rbtree, update the space info's counters.
10208
- */
10209
- trace_btrfs_add_block_group(fs_info, cache, 1);
10210
- update_space_info(fs_info, cache->flags, size, bytes_used,
10211
- cache->bytes_super, &cache->space_info);
10212
- update_global_block_rsv(fs_info);
10213
-
10214
- link_block_group(cache);
10215
-
10216
- list_add_tail(&cache->bg_list, &trans->new_bgs);
10217
-
10218
- set_avail_alloc_bits(fs_info, type);
10219
- return 0;
10220
-}
10221
-
10222
-static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10223
-{
10224
- u64 extra_flags = chunk_to_extended(flags) &
10225
- BTRFS_EXTENDED_PROFILE_MASK;
10226
-
10227
- write_seqlock(&fs_info->profiles_lock);
10228
- if (flags & BTRFS_BLOCK_GROUP_DATA)
10229
- fs_info->avail_data_alloc_bits &= ~extra_flags;
10230
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
10231
- fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10232
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10233
- fs_info->avail_system_alloc_bits &= ~extra_flags;
10234
- write_sequnlock(&fs_info->profiles_lock);
10235
-}
10236
-
10237
-int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10238
- u64 group_start, struct extent_map *em)
10239
-{
10240
- struct btrfs_fs_info *fs_info = trans->fs_info;
10241
- struct btrfs_root *root = fs_info->extent_root;
10242
- struct btrfs_path *path;
10243
- struct btrfs_block_group_cache *block_group;
10244
- struct btrfs_free_cluster *cluster;
10245
- struct btrfs_root *tree_root = fs_info->tree_root;
10246
- struct btrfs_key key;
10247
- struct inode *inode;
10248
- struct kobject *kobj = NULL;
10249
- int ret;
10250
- int index;
10251
- int factor;
10252
- struct btrfs_caching_control *caching_ctl = NULL;
10253
- bool remove_em;
10254
-
10255
- block_group = btrfs_lookup_block_group(fs_info, group_start);
10256
- BUG_ON(!block_group);
10257
- BUG_ON(!block_group->ro);
10258
-
10259
- trace_btrfs_remove_block_group(block_group);
10260
- /*
10261
- * Free the reserved super bytes from this block group before
10262
- * removing it.
10263
- */
10264
- free_excluded_extents(block_group);
10265
- btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10266
- block_group->key.offset);
10267
-
10268
- memcpy(&key, &block_group->key, sizeof(key));
10269
- index = btrfs_bg_flags_to_raid_index(block_group->flags);
10270
- factor = btrfs_bg_type_to_factor(block_group->flags);
10271
-
10272
- /* make sure this block group isn't part of an allocation cluster */
10273
- cluster = &fs_info->data_alloc_cluster;
10274
- spin_lock(&cluster->refill_lock);
10275
- btrfs_return_cluster_to_free_space(block_group, cluster);
10276
- spin_unlock(&cluster->refill_lock);
10277
-
10278
- /*
10279
- * make sure this block group isn't part of a metadata
10280
- * allocation cluster
10281
- */
10282
- cluster = &fs_info->meta_alloc_cluster;
10283
- spin_lock(&cluster->refill_lock);
10284
- btrfs_return_cluster_to_free_space(block_group, cluster);
10285
- spin_unlock(&cluster->refill_lock);
10286
-
10287
- path = btrfs_alloc_path();
10288
- if (!path) {
10289
- ret = -ENOMEM;
10290
- goto out;
10291
- }
10292
-
10293
- /*
10294
- * get the inode first so any iput calls done for the io_list
10295
- * aren't the final iput (no unlinks allowed now)
10296
- */
10297
- inode = lookup_free_space_inode(fs_info, block_group, path);
10298
-
10299
- mutex_lock(&trans->transaction->cache_write_mutex);
10300
- /*
10301
- * make sure our free space cache IO is done before removing the
10302
- * free space inode
10303
- */
10304
- spin_lock(&trans->transaction->dirty_bgs_lock);
10305
- if (!list_empty(&block_group->io_list)) {
10306
- list_del_init(&block_group->io_list);
10307
-
10308
- WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10309
-
10310
- spin_unlock(&trans->transaction->dirty_bgs_lock);
10311
- btrfs_wait_cache_io(trans, block_group, path);
10312
- btrfs_put_block_group(block_group);
10313
- spin_lock(&trans->transaction->dirty_bgs_lock);
10314
- }
10315
-
10316
- if (!list_empty(&block_group->dirty_list)) {
10317
- list_del_init(&block_group->dirty_list);
10318
- btrfs_put_block_group(block_group);
10319
- }
10320
- spin_unlock(&trans->transaction->dirty_bgs_lock);
10321
- mutex_unlock(&trans->transaction->cache_write_mutex);
10322
-
10323
- if (!IS_ERR(inode)) {
10324
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10325
- if (ret) {
10326
- btrfs_add_delayed_iput(inode);
10327
- goto out;
10328
- }
10329
- clear_nlink(inode);
10330
- /* One for the block groups ref */
10331
- spin_lock(&block_group->lock);
10332
- if (block_group->iref) {
10333
- block_group->iref = 0;
10334
- block_group->inode = NULL;
10335
- spin_unlock(&block_group->lock);
10336
- iput(inode);
10337
- } else {
10338
- spin_unlock(&block_group->lock);
10339
- }
10340
- /* One for our lookup ref */
10341
- btrfs_add_delayed_iput(inode);
10342
- }
10343
-
10344
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10345
- key.offset = block_group->key.objectid;
10346
- key.type = 0;
10347
-
10348
- ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10349
- if (ret < 0)
10350
- goto out;
10351
- if (ret > 0)
10352
- btrfs_release_path(path);
10353
- if (ret == 0) {
10354
- ret = btrfs_del_item(trans, tree_root, path);
10355
- if (ret)
10356
- goto out;
10357
- btrfs_release_path(path);
10358
- }
10359
-
10360
- spin_lock(&fs_info->block_group_cache_lock);
10361
- rb_erase(&block_group->cache_node,
10362
- &fs_info->block_group_cache_tree);
10363
- RB_CLEAR_NODE(&block_group->cache_node);
10364
-
10365
- /* Once for the block groups rbtree */
10366
- btrfs_put_block_group(block_group);
10367
-
10368
- if (fs_info->first_logical_byte == block_group->key.objectid)
10369
- fs_info->first_logical_byte = (u64)-1;
10370
- spin_unlock(&fs_info->block_group_cache_lock);
10371
-
10372
- down_write(&block_group->space_info->groups_sem);
10373
- /*
10374
- * we must use list_del_init so people can check to see if they
10375
- * are still on the list after taking the semaphore
10376
- */
10377
- list_del_init(&block_group->list);
10378
- if (list_empty(&block_group->space_info->block_groups[index])) {
10379
- kobj = block_group->space_info->block_group_kobjs[index];
10380
- block_group->space_info->block_group_kobjs[index] = NULL;
10381
- clear_avail_alloc_bits(fs_info, block_group->flags);
10382
- }
10383
- up_write(&block_group->space_info->groups_sem);
10384
- if (kobj) {
10385
- kobject_del(kobj);
10386
- kobject_put(kobj);
10387
- }
10388
-
10389
- if (block_group->has_caching_ctl)
10390
- caching_ctl = get_caching_control(block_group);
10391
- if (block_group->cached == BTRFS_CACHE_STARTED)
10392
- wait_block_group_cache_done(block_group);
10393
- if (block_group->has_caching_ctl) {
10394
- down_write(&fs_info->commit_root_sem);
10395
- if (!caching_ctl) {
10396
- struct btrfs_caching_control *ctl;
10397
-
10398
- list_for_each_entry(ctl,
10399
- &fs_info->caching_block_groups, list)
10400
- if (ctl->block_group == block_group) {
10401
- caching_ctl = ctl;
10402
- refcount_inc(&caching_ctl->count);
10403
- break;
10404
- }
10405
- }
10406
- if (caching_ctl)
10407
- list_del_init(&caching_ctl->list);
10408
- up_write(&fs_info->commit_root_sem);
10409
- if (caching_ctl) {
10410
- /* Once for the caching bgs list and once for us. */
10411
- put_caching_control(caching_ctl);
10412
- put_caching_control(caching_ctl);
10413
- }
10414
- }
10415
-
10416
- spin_lock(&trans->transaction->dirty_bgs_lock);
10417
- if (!list_empty(&block_group->dirty_list)) {
10418
- WARN_ON(1);
10419
- }
10420
- if (!list_empty(&block_group->io_list)) {
10421
- WARN_ON(1);
10422
- }
10423
- spin_unlock(&trans->transaction->dirty_bgs_lock);
10424
- btrfs_remove_free_space_cache(block_group);
10425
-
10426
- spin_lock(&block_group->space_info->lock);
10427
- list_del_init(&block_group->ro_list);
10428
-
10429
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10430
- WARN_ON(block_group->space_info->total_bytes
10431
- < block_group->key.offset);
10432
- WARN_ON(block_group->space_info->bytes_readonly
10433
- < block_group->key.offset);
10434
- WARN_ON(block_group->space_info->disk_total
10435
- < block_group->key.offset * factor);
10436
- }
10437
- block_group->space_info->total_bytes -= block_group->key.offset;
10438
- block_group->space_info->bytes_readonly -= block_group->key.offset;
10439
- block_group->space_info->disk_total -= block_group->key.offset * factor;
10440
-
10441
- spin_unlock(&block_group->space_info->lock);
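
/*
 * Rough sketch of the "factor" used just above when shrinking the space_info
 * counters: profiles that keep two copies of every byte (DUP, RAID1, RAID10)
 * occupy twice the block group's logical size on disk, so disk_total drops by
 * size * factor.  The flag values and mapping below are illustrative, not the
 * kernel's btrfs_bg_type_to_factor() itself.
 */
#include <stdint.h>
#include <stdio.h>

#define BG_RAID1   (1ULL << 4)
#define BG_DUP     (1ULL << 5)
#define BG_RAID10  (1ULL << 6)

static uint64_t bg_factor(uint64_t flags)
{
	if (flags & (BG_DUP | BG_RAID1 | BG_RAID10))
		return 2;	/* two copies of each byte */
	return 1;		/* single/RAID0; parity profiles not modelled here */
}

static uint64_t disk_bytes_freed(uint64_t size, uint64_t flags)
{
	return size * bg_factor(flags);
}

int main(void)
{
	/* Removing a 1 GiB RAID1 block group frees 2 GiB of raw disk space. */
	printf("raw bytes freed: %llu\n",
	       (unsigned long long)disk_bytes_freed(1ULL << 30, BG_RAID1));
	return 0;
}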
10442
-
10443
- memcpy(&key, &block_group->key, sizeof(key));
10444
-
10445
- mutex_lock(&fs_info->chunk_mutex);
10446
- if (!list_empty(&em->list)) {
10447
- /* We're in the transaction->pending_chunks list. */
10448
- free_extent_map(em);
10449
- }
10450
- spin_lock(&block_group->lock);
10451
- block_group->removed = 1;
10452
- /*
10453
- * At this point trimming can't start on this block group, because we
10454
- * removed the block group from the tree fs_info->block_group_cache_tree
10455
- * so no one can find it anymore, and even if someone already got this
10456
- * block group before we removed it from the rbtree, they have already
10457
- * incremented block_group->trimming - if they didn't, they won't find
10458
- * any free space entries because we already removed them all when we
10459
- * called btrfs_remove_free_space_cache().
10460
- *
10461
- * And we must not remove the extent map from the fs_info->mapping_tree
10462
- * to prevent the same logical address range and physical device space
10463
- * ranges from being reused for a new block group. This is because our
10464
- * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10465
- * completely transactionless, so while it is trimming a range the
10466
- * currently running transaction might finish and a new one start,
10467
- * allowing for new block groups to be created that can reuse the same
10468
- * physical device locations unless we take this special care.
10469
- *
10470
- * There may also be an implicit trim operation if the file system
10471
- * is mounted with -odiscard. The same protections must remain
10472
- * in place until the extents have been discarded completely when
10473
- * the transaction commit has completed.
10474
- */
10475
- remove_em = (atomic_read(&block_group->trimming) == 0);
10476
- /*
10477
- * Make sure a trimmer task always sees the em in the pinned_chunks list
10478
- * if it sees block_group->removed == 1 (needs to lock block_group->lock
10479
- * before checking block_group->removed).
10480
- */
10481
- if (!remove_em) {
10482
- /*
10483
- * Our em might be in trans->transaction->pending_chunks which
10484
- * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10485
- * and so is the fs_info->pinned_chunks list.
10486
- *
10487
- * So at this point we must be holding the chunk_mutex to avoid
10488
- * any races with chunk allocation (more specifically at
10489
- * volumes.c:contains_pending_extent()), to ensure it always
10490
- * sees the em, either in the pending_chunks list or in the
10491
- * pinned_chunks list.
10492
- */
10493
- list_move_tail(&em->list, &fs_info->pinned_chunks);
10494
- }
10495
- spin_unlock(&block_group->lock);
10496
-
10497
- mutex_unlock(&fs_info->chunk_mutex);
10498
-
10499
- ret = remove_block_group_free_space(trans, block_group);
10500
- if (ret)
10501
- goto out;
10502
-
10503
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10504
- if (ret > 0)
10505
- ret = -EIO;
10506
- if (ret < 0)
10507
- goto out;
10508
-
10509
- ret = btrfs_del_item(trans, root, path);
10510
- if (ret)
10511
- goto out;
10512
-
10513
- if (remove_em) {
10514
- struct extent_map_tree *em_tree;
10515
-
10516
- em_tree = &fs_info->mapping_tree.map_tree;
10517
- write_lock(&em_tree->lock);
10518
- /*
10519
- * The em might be in the pending_chunks list, so make sure the
10520
- * chunk mutex is locked, since remove_extent_mapping() will
10521
- * delete us from that list.
10522
- */
10523
- remove_extent_mapping(em_tree, em);
10524
- write_unlock(&em_tree->lock);
10525
- /* once for the tree */
10526
- free_extent_map(em);
10527
- }
10528
-
10529
-out:
10530
- /* Once for the lookup reference */
10531
- btrfs_put_block_group(block_group);
10532
- btrfs_free_path(path);
10533
- return ret;
10534
-}
10535
-
10536
-struct btrfs_trans_handle *
10537
-btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10538
- const u64 chunk_offset)
10539
-{
10540
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10541
- struct extent_map *em;
10542
- struct map_lookup *map;
10543
- unsigned int num_items;
10544
-
10545
- read_lock(&em_tree->lock);
10546
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10547
- read_unlock(&em_tree->lock);
10548
- ASSERT(em && em->start == chunk_offset);
10549
-
10550
- /*
10551
- * We need to reserve 3 + N units from the metadata space info in order
10552
- * to remove a block group (done at btrfs_remove_chunk() and at
10553
- * btrfs_remove_block_group()), which are used for:
10554
- *
10555
- * 1 unit for adding the free space inode's orphan (located in the tree
10556
- * of tree roots).
10557
- * 1 unit for deleting the block group item (located in the extent
10558
- * tree).
10559
- * 1 unit for deleting the free space item (located in tree of tree
10560
- * roots).
10561
- * N units for deleting N device extent items corresponding to each
10562
- * stripe (located in the device tree).
10563
- *
10564
- * In order to remove a block group we also need to reserve units in the
10565
- * system space info in order to update the chunk tree (update one or
10566
- * more device items and remove one chunk item), but this is done at
10567
- * btrfs_remove_chunk() through a call to check_system_chunk().
10568
- */
10569
- map = em->map_lookup;
10570
- num_items = 3 + map->num_stripes;
10571
- free_extent_map(em);
10572
-
10573
- return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10574
- num_items, 1);
10575
-}
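
/*
 * Worked example for the reservation math in the comment above, assuming a
 * chunk whose map has num_stripes == 2 (e.g. a RAID1 chunk on two devices):
 *
 *   1 (free space inode orphan) +
 *   1 (block group item)        +
 *   1 (free space item)         +
 *   2 (one device extent item per stripe)  = 5 metadata units.
 *
 * A stand-alone sketch of the same computation; map_lookup_sketch is a
 * stripped-down stand-in, not the kernel structure.
 */
#include <stdio.h>

struct map_lookup_sketch {
	int num_stripes;
};

static unsigned int remove_bg_num_items(const struct map_lookup_sketch *map)
{
	return 3 + map->num_stripes;	/* 3 fixed items + N device extents */
}

int main(void)
{
	struct map_lookup_sketch raid1 = { .num_stripes = 2 };

	printf("units to reserve: %u\n", remove_bg_num_items(&raid1)); /* 5 */
	return 0;
}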
10576
-
10577
-/*
10578
- * Process the unused_bgs list and remove any that don't have any allocated
10579
- * space inside of them.
10580
- */
10581
-void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10582
-{
10583
- struct btrfs_block_group_cache *block_group;
10584
- struct btrfs_space_info *space_info;
10585
- struct btrfs_trans_handle *trans;
10586
- int ret = 0;
10587
-
10588
- if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10589
- return;
10590
-
10591
- spin_lock(&fs_info->unused_bgs_lock);
10592
- while (!list_empty(&fs_info->unused_bgs)) {
10593
- u64 start, end;
10594
- int trimming;
10595
-
10596
- block_group = list_first_entry(&fs_info->unused_bgs,
10597
- struct btrfs_block_group_cache,
10598
- bg_list);
10599
- list_del_init(&block_group->bg_list);
10600
-
10601
- space_info = block_group->space_info;
10602
-
10603
- if (ret || btrfs_mixed_space_info(space_info)) {
10604
- btrfs_put_block_group(block_group);
10605
- continue;
10606
- }
10607
- spin_unlock(&fs_info->unused_bgs_lock);
10608
-
10609
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
10610
-
10611
- /* Don't want to race with allocators so take the groups_sem */
10612
- down_write(&space_info->groups_sem);
10613
- spin_lock(&block_group->lock);
10614
- if (block_group->reserved || block_group->pinned ||
10615
- btrfs_block_group_used(&block_group->item) ||
10616
- block_group->ro ||
10617
- list_is_singular(&block_group->list)) {
10618
- /*
10619
- * We want to bail if we made new allocations or have
10620
- * outstanding allocations in this block group. We do
10621
- * the ro check in case balance is currently acting on
10622
- * this block group.
10623
- */
10624
- trace_btrfs_skip_unused_block_group(block_group);
10625
- spin_unlock(&block_group->lock);
10626
- up_write(&space_info->groups_sem);
10627
- goto next;
10628
- }
10629
- spin_unlock(&block_group->lock);
10630
-
10631
- /* We don't want to force the issue, only flip if it's ok. */
10632
- ret = inc_block_group_ro(block_group, 0);
10633
- up_write(&space_info->groups_sem);
10634
- if (ret < 0) {
10635
- ret = 0;
10636
- goto next;
10637
- }
10638
-
10639
- /*
10640
- * Want to do this before we do anything else so we can recover
10641
- * properly if we fail to join the transaction.
10642
- */
10643
- trans = btrfs_start_trans_remove_block_group(fs_info,
10644
- block_group->key.objectid);
10645
- if (IS_ERR(trans)) {
10646
- btrfs_dec_block_group_ro(block_group);
10647
- ret = PTR_ERR(trans);
10648
- goto next;
10649
- }
10650
-
10651
- /*
10652
- * We could have pending pinned extents for this block group,
10653
- * just delete them, we don't care about them anymore.
10654
- */
10655
- start = block_group->key.objectid;
10656
- end = start + block_group->key.offset - 1;
10657
- /*
10658
- * Hold the unused_bg_unpin_mutex lock to avoid racing with
10659
- * btrfs_finish_extent_commit(). If we are at transaction N,
10660
- * another task might be running finish_extent_commit() for the
10661
- * previous transaction N - 1, and have seen a range belonging
10662
- * to the block group in freed_extents[] before we were able to
10663
- * clear the whole block group range from freed_extents[]. This
10664
- * means that task can look up the block group after we
10665
- * unpinned it from freed_extents[] and removed it, leading to
10666
- * a BUG_ON() at btrfs_unpin_extent_range().
10667
- */
10668
- mutex_lock(&fs_info->unused_bg_unpin_mutex);
10669
- ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10670
- EXTENT_DIRTY);
10671
- if (ret) {
10672
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10673
- btrfs_dec_block_group_ro(block_group);
10674
- goto end_trans;
10675
- }
10676
- ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10677
- EXTENT_DIRTY);
10678
- if (ret) {
10679
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10680
- btrfs_dec_block_group_ro(block_group);
10681
- goto end_trans;
10682
- }
10683
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10684
-
10685
- /* Reset pinned so btrfs_put_block_group doesn't complain */
10686
- spin_lock(&space_info->lock);
10687
- spin_lock(&block_group->lock);
10688
-
10689
- space_info->bytes_pinned -= block_group->pinned;
10690
- space_info->bytes_readonly += block_group->pinned;
10691
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
10692
- -block_group->pinned,
10693
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
10694
- block_group->pinned = 0;
10695
-
10696
- spin_unlock(&block_group->lock);
10697
- spin_unlock(&space_info->lock);
10698
-
10699
- /* DISCARD can flip during remount */
10700
- trimming = btrfs_test_opt(fs_info, DISCARD);
10701
-
10702
- /* Implicit trim during transaction commit. */
10703
- if (trimming)
10704
- btrfs_get_block_group_trimming(block_group);
10705
-
10706
- /*
10707
- * Btrfs_remove_chunk will abort the transaction if things go
10708
- * horribly wrong.
10709
- */
10710
- ret = btrfs_remove_chunk(trans, block_group->key.objectid);
10711
-
10712
- if (ret) {
10713
- if (trimming)
10714
- btrfs_put_block_group_trimming(block_group);
10715
- goto end_trans;
10716
- }
10717
-
10718
- /*
10719
- * If we're not mounted with -odiscard, we can just forget
10720
- * about this block group. Otherwise we'll need to wait
10721
- * until transaction commit to do the actual discard.
10722
- */
10723
- if (trimming) {
10724
- spin_lock(&fs_info->unused_bgs_lock);
10725
- /*
10726
- * A concurrent scrub might have added us to the list
10727
- * fs_info->unused_bgs, so use a list_move operation
10728
- * to add the block group to the deleted_bgs list.
10729
- */
10730
- list_move(&block_group->bg_list,
10731
- &trans->transaction->deleted_bgs);
10732
- spin_unlock(&fs_info->unused_bgs_lock);
10733
- btrfs_get_block_group(block_group);
10734
- }
10735
-end_trans:
10736
- btrfs_end_transaction(trans);
10737
-next:
10738
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10739
- btrfs_put_block_group(block_group);
10740
- spin_lock(&fs_info->unused_bgs_lock);
10741
- }
10742
- spin_unlock(&fs_info->unused_bgs_lock);
10743
-}
10744
-
10745
-int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10746
-{
10747
- struct btrfs_super_block *disk_super;
10748
- u64 features;
10749
- u64 flags;
10750
- int mixed = 0;
10751
- int ret;
10752
-
10753
- disk_super = fs_info->super_copy;
10754
- if (!btrfs_super_root(disk_super))
10755
- return -EINVAL;
10756
-
10757
- features = btrfs_super_incompat_flags(disk_super);
10758
- if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10759
- mixed = 1;
10760
-
10761
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
10762
- ret = create_space_info(fs_info, flags);
10763
- if (ret)
10764
- goto out;
10765
-
10766
- if (mixed) {
10767
- flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10768
- ret = create_space_info(fs_info, flags);
10769
- } else {
10770
- flags = BTRFS_BLOCK_GROUP_METADATA;
10771
- ret = create_space_info(fs_info, flags);
10772
- if (ret)
10773
- goto out;
10774
-
10775
- flags = BTRFS_BLOCK_GROUP_DATA;
10776
- ret = create_space_info(fs_info, flags);
10777
- }
10778
-out:
10779
- return ret;
107805689 }
107815690
107825691 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
....@@ -10805,10 +5714,9 @@
108055714 * it while performing the free space search since we have already
108065715 * held back allocations.
108075716 */
10808
-static int btrfs_trim_free_extents(struct btrfs_device *device,
10809
- u64 minlen, u64 *trimmed)
5717
+static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
108105718 {
10811
- u64 start = 0, len = 0;
5719
+ u64 start = SZ_1M, len = 0, end = 0;
108125720 int ret;
108135721
108145722 *trimmed = 0;
....@@ -10817,7 +5725,7 @@
108175725 if (!blk_queue_discard(bdev_get_queue(device->bdev)))
108185726 return 0;
108195727
10820
- /* Not writeable = nothing to do. */
5728
+ /* Not writable = nothing to do. */
108215729 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
108225730 return 0;
108235731
....@@ -10829,43 +5737,54 @@
108295737
108305738 while (1) {
108315739 struct btrfs_fs_info *fs_info = device->fs_info;
10832
- struct btrfs_transaction *trans;
108335740 u64 bytes;
108345741
108355742 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
108365743 if (ret)
108375744 break;
108385745
10839
- ret = down_read_killable(&fs_info->commit_root_sem);
10840
- if (ret) {
5746
+ find_first_clear_extent_bit(&device->alloc_state, start,
5747
+ &start, &end,
5748
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
5749
+
5750
+ /* Check if there are any CHUNK_* bits left */
5751
+ if (start > device->total_bytes) {
5752
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5753
+ btrfs_warn_in_rcu(fs_info,
5754
+"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
5755
+ start, end - start + 1,
5756
+ rcu_str_deref(device->name),
5757
+ device->total_bytes);
108415758 mutex_unlock(&fs_info->chunk_mutex);
5759
+ ret = 0;
108425760 break;
108435761 }
108445762
10845
- spin_lock(&fs_info->trans_lock);
10846
- trans = fs_info->running_transaction;
10847
- if (trans)
10848
- refcount_inc(&trans->use_count);
10849
- spin_unlock(&fs_info->trans_lock);
5763
+ /* Ensure we skip the reserved area in the first 1M */
5764
+ start = max_t(u64, start, SZ_1M);
108505765
10851
- if (!trans)
10852
- up_read(&fs_info->commit_root_sem);
5766
+ /*
5767
+ * If find_first_clear_extent_bit find a range that spans the
5768
+ * end of the device it will set end to -1, in this case it's up
5769
+ * to the caller to trim the value to the size of the device.
5770
+ */
5771
+ end = min(end, device->total_bytes - 1);
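
/*
 * Arithmetic sketch of the trim window computed above, assuming a 10 GiB
 * device where find_first_clear_extent_bit() reported a clear range running
 * off the end of the device (end == (u64)-1).  The start is floored to 1 MiB
 * to protect the reserved area, and end is clamped to the last device byte.
 * Plain C stand-ins, not kernel helpers.
 */
#include <stdint.h>
#include <stdio.h>

#define SZ_1M	(1024ULL * 1024ULL)

int main(void)
{
	uint64_t total_bytes = 10ULL * 1024 * 1024 * 1024;	/* 10 GiB device */
	uint64_t start = 0;					/* reported range start */
	uint64_t end = UINT64_MAX;				/* range spans past the device */

	start = start > SZ_1M ? start : SZ_1M;	/* skip the reserved first 1M */
	end = end < total_bytes - 1 ? end : total_bytes - 1;
	printf("trim %llu bytes at offset %llu\n",
	       (unsigned long long)(end - start + 1),	/* 10 GiB - 1 MiB */
	       (unsigned long long)start);		/* 1 MiB */
	return 0;
}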
108535772
10854
- ret = find_free_dev_extent_start(trans, device, minlen, start,
10855
- &start, &len);
10856
- if (trans) {
10857
- up_read(&fs_info->commit_root_sem);
10858
- btrfs_put_transaction(trans);
10859
- }
5773
+ len = end - start + 1;
108605774
10861
- if (ret) {
5775
+ /* We didn't find any extents */
5776
+ if (!len) {
108625777 mutex_unlock(&fs_info->chunk_mutex);
10863
- if (ret == -ENOSPC)
10864
- ret = 0;
5778
+ ret = 0;
108655779 break;
108665780 }
108675781
10868
- ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
5782
+ ret = btrfs_issue_discard(device->bdev, start, len,
5783
+ &bytes);
5784
+ if (!ret)
5785
+ set_extent_bits(&device->alloc_state, start,
5786
+ start + bytes - 1,
5787
+ CHUNK_TRIMMED);
108695788 mutex_unlock(&fs_info->chunk_mutex);
108705789
108715790 if (ret)
....@@ -10896,10 +5815,11 @@
108965815 */
108975816 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
108985817 {
10899
- struct btrfs_block_group_cache *cache = NULL;
5818
+ struct btrfs_block_group *cache = NULL;
109005819 struct btrfs_device *device;
109015820 struct list_head *devices;
109025821 u64 group_trimmed;
5822
+ u64 range_end = U64_MAX;
109035823 u64 start;
109045824 u64 end;
109055825 u64 trimmed = 0;
....@@ -10909,26 +5829,33 @@
109095829 int dev_ret = 0;
109105830 int ret = 0;
109115831
5832
+ /*
5833
+ * Check range overflow if range->len is set.
5834
+ * The default range->len is U64_MAX.
5835
+ */
5836
+ if (range->len != U64_MAX &&
5837
+ check_add_overflow(range->start, range->len, &range_end))
5838
+ return -EINVAL;
5839
+
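
/*
 * Sketch of the overflow guard above using the GCC/Clang builtin that the
 * kernel's check_add_overflow() wraps: if range->start + range->len wraps
 * around u64, the ioctl is rejected with -EINVAL instead of silently trimming
 * a bogus window.  Userspace stand-in, not the kernel code path.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool trim_range_end(uint64_t start, uint64_t len, uint64_t *end)
{
	if (len != UINT64_MAX && __builtin_add_overflow(start, len, end))
		return false;		/* overflow: caller returns -EINVAL */
	if (len == UINT64_MAX)
		*end = UINT64_MAX;	/* default: trim to the end of the fs */
	return true;
}

int main(void)
{
	uint64_t end;

	printf("%d\n", trim_range_end(1ULL << 40, 4096, &end));	/* 1: valid */
	printf("%d\n", trim_range_end(UINT64_MAX - 10, 4096, &end));	/* 0: wraps */
	return 0;
}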
109125840 cache = btrfs_lookup_first_block_group(fs_info, range->start);
10913
- for (; cache; cache = next_block_group(fs_info, cache)) {
10914
- if (cache->key.objectid >= (range->start + range->len)) {
5841
+ for (; cache; cache = btrfs_next_block_group(cache)) {
5842
+ if (cache->start >= range_end) {
109155843 btrfs_put_block_group(cache);
109165844 break;
109175845 }
109185846
10919
- start = max(range->start, cache->key.objectid);
10920
- end = min(range->start + range->len,
10921
- cache->key.objectid + cache->key.offset);
5847
+ start = max(range->start, cache->start);
5848
+ end = min(range_end, cache->start + cache->length);
109225849
109235850 if (end - start >= range->minlen) {
10924
- if (!block_group_cache_done(cache)) {
10925
- ret = cache_block_group(cache, 0);
5851
+ if (!btrfs_block_group_done(cache)) {
5852
+ ret = btrfs_cache_block_group(cache, 0);
109265853 if (ret) {
109275854 bg_failed++;
109285855 bg_ret = ret;
109295856 continue;
109305857 }
10931
- ret = wait_block_group_cache_done(cache);
5858
+ ret = btrfs_wait_block_group_cache_done(cache);
109325859 if (ret) {
109335860 bg_failed++;
109345861 bg_ret = ret;
....@@ -10957,8 +5884,10 @@
109575884 mutex_lock(&fs_info->fs_devices->device_list_mutex);
109585885 devices = &fs_info->fs_devices->devices;
109595886 list_for_each_entry(device, devices, dev_list) {
10960
- ret = btrfs_trim_free_extents(device, range->minlen,
10961
- &group_trimmed);
5887
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
5888
+ continue;
5889
+
5890
+ ret = btrfs_trim_free_extents(device, &group_trimmed);
109625891 if (ret) {
109635892 dev_failed++;
109645893 dev_ret = ret;
....@@ -10977,61 +5906,4 @@
109775906 if (bg_ret)
109785907 return bg_ret;
109795908 return dev_ret;
10980
-}
10981
-
10982
-/*
10983
- * btrfs_{start,end}_write_no_snapshotting() are similar to
10984
- * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
10985
- * data into the page cache through nocow before the subvolume is snapshotted,
10986
- * but flush the data into disk after the snapshot creation, or to prevent
10987
- * operations while snapshotting is ongoing and that cause the snapshot to be
10988
- * inconsistent (writes followed by expanding truncates for example).
10989
- */
10990
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
10991
-{
10992
- percpu_counter_dec(&root->subv_writers->counter);
10993
- cond_wake_up(&root->subv_writers->wait);
10994
-}
10995
-
10996
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
10997
-{
10998
- if (atomic_read(&root->will_be_snapshotted))
10999
- return 0;
11000
-
11001
- percpu_counter_inc(&root->subv_writers->counter);
11002
- /*
11003
- * Make sure counter is updated before we check for snapshot creation.
11004
- */
11005
- smp_mb();
11006
- if (atomic_read(&root->will_be_snapshotted)) {
11007
- btrfs_end_write_no_snapshotting(root);
11008
- return 0;
11009
- }
11010
- return 1;
11011
-}
11012
-
11013
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11014
-{
11015
- while (true) {
11016
- int ret;
11017
-
11018
- ret = btrfs_start_write_no_snapshotting(root);
11019
- if (ret)
11020
- break;
11021
- wait_var_event(&root->will_be_snapshotted,
11022
- !atomic_read(&root->will_be_snapshotted));
11023
- }
11024
-}
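
/*
 * Userspace sketch of the writer/snapshot handshake the removed helpers above
 * implement: a writer bumps a counter, then re-checks the "snapshot pending"
 * flag; the memory ordering guarantees the snapshot side either sees the
 * writer's increment or the writer sees the flag and backs off.  C11 atomics
 * stand in for the percpu counter and smp_mb(); illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_long nocow_writers;	/* stand-in for root->subv_writers */
static atomic_bool snapshot_pending;	/* stand-in for root->will_be_snapshotted */

/* Returns true if the caller may go ahead with a nocow write. */
static bool start_write_no_snapshotting(void)
{
	if (atomic_load(&snapshot_pending))
		return false;

	atomic_fetch_add(&nocow_writers, 1);
	/* seq_cst ordering here plays the role of the kernel's smp_mb(). */
	if (atomic_load(&snapshot_pending)) {
		atomic_fetch_sub(&nocow_writers, 1);
		return false;
	}
	return true;
}

static void end_write_no_snapshotting(void)
{
	atomic_fetch_sub(&nocow_writers, 1);
	/* The real code also wakes anyone waiting for the counter to drain. */
}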
11025
-
11026
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
11027
-{
11028
- struct btrfs_fs_info *fs_info = bg->fs_info;
11029
-
11030
- spin_lock(&fs_info->unused_bgs_lock);
11031
- if (list_empty(&bg->bg_list)) {
11032
- btrfs_get_block_group(bg);
11033
- trace_btrfs_add_unused_block_group(bg);
11034
- list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
11035
- }
11036
- spin_unlock(&fs_info->unused_bgs_lock);
110375909 }