2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/fs/btrfs/extent-tree.c
....@@ -16,6 +16,7 @@
1616 #include <linux/percpu_counter.h>
1717 #include <linux/lockdep.h>
1818 #include <linux/crc32c.h>
19
+#include "misc.h"
1920 #include "tree-log.h"
2021 #include "disk-io.h"
2122 #include "print-tree.h"
....@@ -24,32 +25,18 @@
2425 #include "locking.h"
2526 #include "free-space-cache.h"
2627 #include "free-space-tree.h"
27
-#include "math.h"
2828 #include "sysfs.h"
2929 #include "qgroup.h"
3030 #include "ref-verify.h"
31
+#include "space-info.h"
32
+#include "block-rsv.h"
33
+#include "delalloc-space.h"
34
+#include "block-group.h"
35
+#include "discard.h"
36
+#include "rcu-string.h"
3137
3238 #undef SCRAMBLE_DELAYED_REFS
3339
34
-/*
35
- * control flags for do_chunk_alloc's force field
36
- * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
37
- * if we really need one.
38
- *
39
- * CHUNK_ALLOC_LIMITED means to only try and allocate one
40
- * if we have very few chunks already allocated. This is
41
- * used as part of the clustering code to help make sure
42
- * we have a good pool of storage to cluster in, without
43
- * filling the FS with empty chunks
44
- *
45
- * CHUNK_ALLOC_FORCE means it must try to allocate one
46
- *
47
- */
48
-enum {
49
- CHUNK_ALLOC_NO_FORCE = 0,
50
- CHUNK_ALLOC_LIMITED = 1,
51
- CHUNK_ALLOC_FORCE = 2,
52
-};
5340
5441 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5542 struct btrfs_delayed_ref_node *node, u64 parent,
....@@ -66,712 +53,33 @@
6653 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6754 struct btrfs_delayed_ref_node *node,
6855 struct btrfs_delayed_extent_op *extent_op);
69
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
70
- int force);
7156 static int find_next_key(struct btrfs_path *path, int level,
7257 struct btrfs_key *key);
73
-static void dump_space_info(struct btrfs_fs_info *fs_info,
74
- struct btrfs_space_info *info, u64 bytes,
75
- int dump_block_groups);
76
-static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
77
- u64 num_bytes);
78
-static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
79
- struct btrfs_space_info *space_info,
80
- u64 num_bytes);
81
-static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
82
- struct btrfs_space_info *space_info,
83
- u64 num_bytes);
8458
85
-static noinline int
86
-block_group_cache_done(struct btrfs_block_group_cache *cache)
87
-{
88
- smp_mb();
89
- return cache->cached == BTRFS_CACHE_FINISHED ||
90
- cache->cached == BTRFS_CACHE_ERROR;
91
-}
92
-
93
-static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
59
+static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
9460 {
9561 return (cache->flags & bits) == bits;
9662 }
9763
98
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
99
-{
100
- atomic_inc(&cache->count);
101
-}
102
-
103
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104
-{
105
- if (atomic_dec_and_test(&cache->count)) {
106
- WARN_ON(cache->pinned > 0);
107
- WARN_ON(cache->reserved > 0);
108
-
109
- /*
110
- * If not empty, someone is still holding mutex of
111
- * full_stripe_lock, which can only be released by caller.
112
- * And it will definitely cause use-after-free when caller
113
- * tries to release full stripe lock.
114
- *
115
- * No better way to resolve, but only to warn.
116
- */
117
- WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
118
- kfree(cache->free_space_ctl);
119
- kfree(cache);
120
- }
121
-}
122
-
123
-/*
124
- * this adds the block group to the fs_info rb tree for the block group
125
- * cache
126
- */
127
-static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
128
- struct btrfs_block_group_cache *block_group)
129
-{
130
- struct rb_node **p;
131
- struct rb_node *parent = NULL;
132
- struct btrfs_block_group_cache *cache;
133
-
134
- spin_lock(&info->block_group_cache_lock);
135
- p = &info->block_group_cache_tree.rb_node;
136
-
137
- while (*p) {
138
- parent = *p;
139
- cache = rb_entry(parent, struct btrfs_block_group_cache,
140
- cache_node);
141
- if (block_group->key.objectid < cache->key.objectid) {
142
- p = &(*p)->rb_left;
143
- } else if (block_group->key.objectid > cache->key.objectid) {
144
- p = &(*p)->rb_right;
145
- } else {
146
- spin_unlock(&info->block_group_cache_lock);
147
- return -EEXIST;
148
- }
149
- }
150
-
151
- rb_link_node(&block_group->cache_node, parent, p);
152
- rb_insert_color(&block_group->cache_node,
153
- &info->block_group_cache_tree);
154
-
155
- if (info->first_logical_byte > block_group->key.objectid)
156
- info->first_logical_byte = block_group->key.objectid;
157
-
158
- spin_unlock(&info->block_group_cache_lock);
159
-
160
- return 0;
161
-}
162
-
163
-/*
164
- * This will return the block group at or after bytenr if contains is 0, else
165
- * it will return the block group that contains the bytenr
166
- */
167
-static struct btrfs_block_group_cache *
168
-block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
169
- int contains)
170
-{
171
- struct btrfs_block_group_cache *cache, *ret = NULL;
172
- struct rb_node *n;
173
- u64 end, start;
174
-
175
- spin_lock(&info->block_group_cache_lock);
176
- n = info->block_group_cache_tree.rb_node;
177
-
178
- while (n) {
179
- cache = rb_entry(n, struct btrfs_block_group_cache,
180
- cache_node);
181
- end = cache->key.objectid + cache->key.offset - 1;
182
- start = cache->key.objectid;
183
-
184
- if (bytenr < start) {
185
- if (!contains && (!ret || start < ret->key.objectid))
186
- ret = cache;
187
- n = n->rb_left;
188
- } else if (bytenr > start) {
189
- if (contains && bytenr <= end) {
190
- ret = cache;
191
- break;
192
- }
193
- n = n->rb_right;
194
- } else {
195
- ret = cache;
196
- break;
197
- }
198
- }
199
- if (ret) {
200
- btrfs_get_block_group(ret);
201
- if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
202
- info->first_logical_byte = ret->key.objectid;
203
- }
204
- spin_unlock(&info->block_group_cache_lock);
205
-
206
- return ret;
207
-}
208
-
209
-static int add_excluded_extent(struct btrfs_fs_info *fs_info,
210
- u64 start, u64 num_bytes)
64
+int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
65
+ u64 start, u64 num_bytes)
21166 {
21267 u64 end = start + num_bytes - 1;
213
- set_extent_bits(&fs_info->freed_extents[0],
214
- start, end, EXTENT_UPTODATE);
215
- set_extent_bits(&fs_info->freed_extents[1],
216
- start, end, EXTENT_UPTODATE);
68
+ set_extent_bits(&fs_info->excluded_extents, start, end,
69
+ EXTENT_UPTODATE);
21770 return 0;
21871 }
21972
220
-static void free_excluded_extents(struct btrfs_block_group_cache *cache)
73
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
22174 {
22275 struct btrfs_fs_info *fs_info = cache->fs_info;
22376 u64 start, end;
22477
225
- start = cache->key.objectid;
226
- end = start + cache->key.offset - 1;
78
+ start = cache->start;
79
+ end = start + cache->length - 1;
22780
228
- clear_extent_bits(&fs_info->freed_extents[0],
229
- start, end, EXTENT_UPTODATE);
230
- clear_extent_bits(&fs_info->freed_extents[1],
231
- start, end, EXTENT_UPTODATE);
232
-}
233
-
234
-static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
235
-{
236
- struct btrfs_fs_info *fs_info = cache->fs_info;
237
- u64 bytenr;
238
- u64 *logical;
239
- int stripe_len;
240
- int i, nr, ret;
241
-
242
- if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
243
- stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
244
- cache->bytes_super += stripe_len;
245
- ret = add_excluded_extent(fs_info, cache->key.objectid,
246
- stripe_len);
247
- if (ret)
248
- return ret;
249
- }
250
-
251
- for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
252
- bytenr = btrfs_sb_offset(i);
253
- ret = btrfs_rmap_block(fs_info, cache->key.objectid,
254
- bytenr, &logical, &nr, &stripe_len);
255
- if (ret)
256
- return ret;
257
-
258
- while (nr--) {
259
- u64 start, len;
260
-
261
- if (logical[nr] > cache->key.objectid +
262
- cache->key.offset)
263
- continue;
264
-
265
- if (logical[nr] + stripe_len <= cache->key.objectid)
266
- continue;
267
-
268
- start = logical[nr];
269
- if (start < cache->key.objectid) {
270
- start = cache->key.objectid;
271
- len = (logical[nr] + stripe_len) - start;
272
- } else {
273
- len = min_t(u64, stripe_len,
274
- cache->key.objectid +
275
- cache->key.offset - start);
276
- }
277
-
278
- cache->bytes_super += len;
279
- ret = add_excluded_extent(fs_info, start, len);
280
- if (ret) {
281
- kfree(logical);
282
- return ret;
283
- }
284
- }
285
-
286
- kfree(logical);
287
- }
288
- return 0;
289
-}
290
-
291
-static struct btrfs_caching_control *
292
-get_caching_control(struct btrfs_block_group_cache *cache)
293
-{
294
- struct btrfs_caching_control *ctl;
295
-
296
- spin_lock(&cache->lock);
297
- if (!cache->caching_ctl) {
298
- spin_unlock(&cache->lock);
299
- return NULL;
300
- }
301
-
302
- ctl = cache->caching_ctl;
303
- refcount_inc(&ctl->count);
304
- spin_unlock(&cache->lock);
305
- return ctl;
306
-}
307
-
308
-static void put_caching_control(struct btrfs_caching_control *ctl)
309
-{
310
- if (refcount_dec_and_test(&ctl->count))
311
- kfree(ctl);
312
-}
313
-
314
-#ifdef CONFIG_BTRFS_DEBUG
315
-static void fragment_free_space(struct btrfs_block_group_cache *block_group)
316
-{
317
- struct btrfs_fs_info *fs_info = block_group->fs_info;
318
- u64 start = block_group->key.objectid;
319
- u64 len = block_group->key.offset;
320
- u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
321
- fs_info->nodesize : fs_info->sectorsize;
322
- u64 step = chunk << 1;
323
-
324
- while (len > chunk) {
325
- btrfs_remove_free_space(block_group, start, chunk);
326
- start += step;
327
- if (len < step)
328
- len = 0;
329
- else
330
- len -= step;
331
- }
332
-}
333
-#endif
334
-
335
-/*
336
- * this is only called by cache_block_group, since we could have freed extents
337
- * we need to check the pinned_extents for any extents that can't be used yet
338
- * since their free space will be released as soon as the transaction commits.
339
- */
340
-u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
341
- u64 start, u64 end)
342
-{
343
- struct btrfs_fs_info *info = block_group->fs_info;
344
- u64 extent_start, extent_end, size, total_added = 0;
345
- int ret;
346
-
347
- while (start < end) {
348
- ret = find_first_extent_bit(info->pinned_extents, start,
349
- &extent_start, &extent_end,
350
- EXTENT_DIRTY | EXTENT_UPTODATE,
351
- NULL);
352
- if (ret)
353
- break;
354
-
355
- if (extent_start <= start) {
356
- start = extent_end + 1;
357
- } else if (extent_start > start && extent_start < end) {
358
- size = extent_start - start;
359
- total_added += size;
360
- ret = btrfs_add_free_space(block_group, start,
361
- size);
362
- BUG_ON(ret); /* -ENOMEM or logic error */
363
- start = extent_end + 1;
364
- } else {
365
- break;
366
- }
367
- }
368
-
369
- if (start < end) {
370
- size = end - start;
371
- total_added += size;
372
- ret = btrfs_add_free_space(block_group, start, size);
373
- BUG_ON(ret); /* -ENOMEM or logic error */
374
- }
375
-
376
- return total_added;
377
-}
378
-
379
-static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
380
-{
381
- struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
382
- struct btrfs_fs_info *fs_info = block_group->fs_info;
383
- struct btrfs_root *extent_root = fs_info->extent_root;
384
- struct btrfs_path *path;
385
- struct extent_buffer *leaf;
386
- struct btrfs_key key;
387
- u64 total_found = 0;
388
- u64 last = 0;
389
- u32 nritems;
390
- int ret;
391
- bool wakeup = true;
392
-
393
- path = btrfs_alloc_path();
394
- if (!path)
395
- return -ENOMEM;
396
-
397
- last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
398
-
399
-#ifdef CONFIG_BTRFS_DEBUG
400
- /*
401
- * If we're fragmenting we don't want to make anybody think we can
402
- * allocate from this block group until we've had a chance to fragment
403
- * the free space.
404
- */
405
- if (btrfs_should_fragment_free_space(block_group))
406
- wakeup = false;
407
-#endif
408
- /*
409
- * We don't want to deadlock with somebody trying to allocate a new
410
- * extent for the extent root while also trying to search the extent
411
- * root to add free space. So we skip locking and search the commit
412
- * root, since its read-only
413
- */
414
- path->skip_locking = 1;
415
- path->search_commit_root = 1;
416
- path->reada = READA_FORWARD;
417
-
418
- key.objectid = last;
419
- key.offset = 0;
420
- key.type = BTRFS_EXTENT_ITEM_KEY;
421
-
422
-next:
423
- ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
424
- if (ret < 0)
425
- goto out;
426
-
427
- leaf = path->nodes[0];
428
- nritems = btrfs_header_nritems(leaf);
429
-
430
- while (1) {
431
- if (btrfs_fs_closing(fs_info) > 1) {
432
- last = (u64)-1;
433
- break;
434
- }
435
-
436
- if (path->slots[0] < nritems) {
437
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
438
- } else {
439
- ret = find_next_key(path, 0, &key);
440
- if (ret)
441
- break;
442
-
443
- if (need_resched() ||
444
- rwsem_is_contended(&fs_info->commit_root_sem)) {
445
- if (wakeup)
446
- caching_ctl->progress = last;
447
- btrfs_release_path(path);
448
- up_read(&fs_info->commit_root_sem);
449
- mutex_unlock(&caching_ctl->mutex);
450
- cond_resched();
451
- mutex_lock(&caching_ctl->mutex);
452
- down_read(&fs_info->commit_root_sem);
453
- goto next;
454
- }
455
-
456
- ret = btrfs_next_leaf(extent_root, path);
457
- if (ret < 0)
458
- goto out;
459
- if (ret)
460
- break;
461
- leaf = path->nodes[0];
462
- nritems = btrfs_header_nritems(leaf);
463
- continue;
464
- }
465
-
466
- if (key.objectid < last) {
467
- key.objectid = last;
468
- key.offset = 0;
469
- key.type = BTRFS_EXTENT_ITEM_KEY;
470
-
471
- if (wakeup)
472
- caching_ctl->progress = last;
473
- btrfs_release_path(path);
474
- goto next;
475
- }
476
-
477
- if (key.objectid < block_group->key.objectid) {
478
- path->slots[0]++;
479
- continue;
480
- }
481
-
482
- if (key.objectid >= block_group->key.objectid +
483
- block_group->key.offset)
484
- break;
485
-
486
- if (key.type == BTRFS_EXTENT_ITEM_KEY ||
487
- key.type == BTRFS_METADATA_ITEM_KEY) {
488
- total_found += add_new_free_space(block_group, last,
489
- key.objectid);
490
- if (key.type == BTRFS_METADATA_ITEM_KEY)
491
- last = key.objectid +
492
- fs_info->nodesize;
493
- else
494
- last = key.objectid + key.offset;
495
-
496
- if (total_found > CACHING_CTL_WAKE_UP) {
497
- total_found = 0;
498
- if (wakeup)
499
- wake_up(&caching_ctl->wait);
500
- }
501
- }
502
- path->slots[0]++;
503
- }
504
- ret = 0;
505
-
506
- total_found += add_new_free_space(block_group, last,
507
- block_group->key.objectid +
508
- block_group->key.offset);
509
- caching_ctl->progress = (u64)-1;
510
-
511
-out:
512
- btrfs_free_path(path);
513
- return ret;
514
-}
515
-
516
-static noinline void caching_thread(struct btrfs_work *work)
517
-{
518
- struct btrfs_block_group_cache *block_group;
519
- struct btrfs_fs_info *fs_info;
520
- struct btrfs_caching_control *caching_ctl;
521
- int ret;
522
-
523
- caching_ctl = container_of(work, struct btrfs_caching_control, work);
524
- block_group = caching_ctl->block_group;
525
- fs_info = block_group->fs_info;
526
-
527
- mutex_lock(&caching_ctl->mutex);
528
- down_read(&fs_info->commit_root_sem);
529
-
530
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
531
- ret = load_free_space_tree(caching_ctl);
532
- else
533
- ret = load_extent_tree_free(caching_ctl);
534
-
535
- spin_lock(&block_group->lock);
536
- block_group->caching_ctl = NULL;
537
- block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
538
- spin_unlock(&block_group->lock);
539
-
540
-#ifdef CONFIG_BTRFS_DEBUG
541
- if (btrfs_should_fragment_free_space(block_group)) {
542
- u64 bytes_used;
543
-
544
- spin_lock(&block_group->space_info->lock);
545
- spin_lock(&block_group->lock);
546
- bytes_used = block_group->key.offset -
547
- btrfs_block_group_used(&block_group->item);
548
- block_group->space_info->bytes_used += bytes_used >> 1;
549
- spin_unlock(&block_group->lock);
550
- spin_unlock(&block_group->space_info->lock);
551
- fragment_free_space(block_group);
552
- }
553
-#endif
554
-
555
- caching_ctl->progress = (u64)-1;
556
-
557
- up_read(&fs_info->commit_root_sem);
558
- free_excluded_extents(block_group);
559
- mutex_unlock(&caching_ctl->mutex);
560
-
561
- wake_up(&caching_ctl->wait);
562
-
563
- put_caching_control(caching_ctl);
564
- btrfs_put_block_group(block_group);
565
-}
566
-
567
-static int cache_block_group(struct btrfs_block_group_cache *cache,
568
- int load_cache_only)
569
-{
570
- DEFINE_WAIT(wait);
571
- struct btrfs_fs_info *fs_info = cache->fs_info;
572
- struct btrfs_caching_control *caching_ctl;
573
- int ret = 0;
574
-
575
- caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
576
- if (!caching_ctl)
577
- return -ENOMEM;
578
-
579
- INIT_LIST_HEAD(&caching_ctl->list);
580
- mutex_init(&caching_ctl->mutex);
581
- init_waitqueue_head(&caching_ctl->wait);
582
- caching_ctl->block_group = cache;
583
- caching_ctl->progress = cache->key.objectid;
584
- refcount_set(&caching_ctl->count, 1);
585
- btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
586
- caching_thread, NULL, NULL);
587
-
588
- spin_lock(&cache->lock);
589
- /*
590
- * This should be a rare occasion, but this could happen I think in the
591
- * case where one thread starts to load the space cache info, and then
592
- * some other thread starts a transaction commit which tries to do an
593
- * allocation while the other thread is still loading the space cache
594
- * info. The previous loop should have kept us from choosing this block
595
- * group, but if we've moved to the state where we will wait on caching
596
- * block groups we need to first check if we're doing a fast load here,
597
- * so we can wait for it to finish, otherwise we could end up allocating
598
- * from a block group who's cache gets evicted for one reason or
599
- * another.
600
- */
601
- while (cache->cached == BTRFS_CACHE_FAST) {
602
- struct btrfs_caching_control *ctl;
603
-
604
- ctl = cache->caching_ctl;
605
- refcount_inc(&ctl->count);
606
- prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
607
- spin_unlock(&cache->lock);
608
-
609
- schedule();
610
-
611
- finish_wait(&ctl->wait, &wait);
612
- put_caching_control(ctl);
613
- spin_lock(&cache->lock);
614
- }
615
-
616
- if (cache->cached != BTRFS_CACHE_NO) {
617
- spin_unlock(&cache->lock);
618
- kfree(caching_ctl);
619
- return 0;
620
- }
621
- WARN_ON(cache->caching_ctl);
622
- cache->caching_ctl = caching_ctl;
623
- cache->cached = BTRFS_CACHE_FAST;
624
- spin_unlock(&cache->lock);
625
-
626
- if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
627
- mutex_lock(&caching_ctl->mutex);
628
- ret = load_free_space_cache(fs_info, cache);
629
-
630
- spin_lock(&cache->lock);
631
- if (ret == 1) {
632
- cache->caching_ctl = NULL;
633
- cache->cached = BTRFS_CACHE_FINISHED;
634
- cache->last_byte_to_unpin = (u64)-1;
635
- caching_ctl->progress = (u64)-1;
636
- } else {
637
- if (load_cache_only) {
638
- cache->caching_ctl = NULL;
639
- cache->cached = BTRFS_CACHE_NO;
640
- } else {
641
- cache->cached = BTRFS_CACHE_STARTED;
642
- cache->has_caching_ctl = 1;
643
- }
644
- }
645
- spin_unlock(&cache->lock);
646
-#ifdef CONFIG_BTRFS_DEBUG
647
- if (ret == 1 &&
648
- btrfs_should_fragment_free_space(cache)) {
649
- u64 bytes_used;
650
-
651
- spin_lock(&cache->space_info->lock);
652
- spin_lock(&cache->lock);
653
- bytes_used = cache->key.offset -
654
- btrfs_block_group_used(&cache->item);
655
- cache->space_info->bytes_used += bytes_used >> 1;
656
- spin_unlock(&cache->lock);
657
- spin_unlock(&cache->space_info->lock);
658
- fragment_free_space(cache);
659
- }
660
-#endif
661
- mutex_unlock(&caching_ctl->mutex);
662
-
663
- wake_up(&caching_ctl->wait);
664
- if (ret == 1) {
665
- put_caching_control(caching_ctl);
666
- free_excluded_extents(cache);
667
- return 0;
668
- }
669
- } else {
670
- /*
671
- * We're either using the free space tree or no caching at all.
672
- * Set cached to the appropriate value and wakeup any waiters.
673
- */
674
- spin_lock(&cache->lock);
675
- if (load_cache_only) {
676
- cache->caching_ctl = NULL;
677
- cache->cached = BTRFS_CACHE_NO;
678
- } else {
679
- cache->cached = BTRFS_CACHE_STARTED;
680
- cache->has_caching_ctl = 1;
681
- }
682
- spin_unlock(&cache->lock);
683
- wake_up(&caching_ctl->wait);
684
- }
685
-
686
- if (load_cache_only) {
687
- put_caching_control(caching_ctl);
688
- return 0;
689
- }
690
-
691
- down_write(&fs_info->commit_root_sem);
692
- refcount_inc(&caching_ctl->count);
693
- list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
694
- up_write(&fs_info->commit_root_sem);
695
-
696
- btrfs_get_block_group(cache);
697
-
698
- btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
699
-
700
- return ret;
701
-}
702
-
703
-/*
704
- * return the block group that starts at or after bytenr
705
- */
706
-static struct btrfs_block_group_cache *
707
-btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
708
-{
709
- return block_group_cache_tree_search(info, bytenr, 0);
710
-}
711
-
712
-/*
713
- * return the block group that contains the given bytenr
714
- */
715
-struct btrfs_block_group_cache *btrfs_lookup_block_group(
716
- struct btrfs_fs_info *info,
717
- u64 bytenr)
718
-{
719
- return block_group_cache_tree_search(info, bytenr, 1);
720
-}
721
-
722
-static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
723
- u64 flags)
724
-{
725
- struct list_head *head = &info->space_info;
726
- struct btrfs_space_info *found;
727
-
728
- flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
729
-
730
- rcu_read_lock();
731
- list_for_each_entry_rcu(found, head, list) {
732
- if (found->flags & flags) {
733
- rcu_read_unlock();
734
- return found;
735
- }
736
- }
737
- rcu_read_unlock();
738
- return NULL;
739
-}
740
-
741
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
742
- bool metadata, u64 root_objectid)
743
-{
744
- struct btrfs_space_info *space_info;
745
- u64 flags;
746
-
747
- if (metadata) {
748
- if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
749
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
750
- else
751
- flags = BTRFS_BLOCK_GROUP_METADATA;
752
- } else {
753
- flags = BTRFS_BLOCK_GROUP_DATA;
754
- }
755
-
756
- space_info = __find_space_info(fs_info, flags);
757
- ASSERT(space_info);
758
- percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
759
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
760
-}
761
-
762
-/*
763
- * after adding space to the filesystem, we need to clear the full flags
764
- * on all the space infos.
765
- */
766
-void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
767
-{
768
- struct list_head *head = &info->space_info;
769
- struct btrfs_space_info *found;
770
-
771
- rcu_read_lock();
772
- list_for_each_entry_rcu(found, head, list)
773
- found->full = 0;
774
- rcu_read_unlock();
81
+ clear_extent_bits(&fs_info->excluded_extents, start, end,
82
+ EXTENT_UPTODATE);
77583 }
77684
77785 /* simple helper to search for an existing data extent at a given offset */
....@@ -1037,7 +345,7 @@
1037345
1038346 /*
1039347 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1040
- * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
348
+ * is_data == BTRFS_REF_TYPE_DATA, data type is required,
1041349 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1042350 */
1043351 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
....@@ -1092,18 +400,18 @@
1092400 return BTRFS_REF_TYPE_INVALID;
1093401 }
1094402
1095
-static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
403
+u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1096404 {
1097405 u32 high_crc = ~(u32)0;
1098406 u32 low_crc = ~(u32)0;
1099407 __le64 lenum;
1100408
1101409 lenum = cpu_to_le64(root_objectid);
1102
- high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
410
+ high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1103411 lenum = cpu_to_le64(owner);
1104
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
412
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1105413 lenum = cpu_to_le64(offset);
1106
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
414
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1107415
1108416 return ((u64)high_crc << 31) ^ (u64)low_crc;
1109417 }
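
Side note on the hunk above: the 64-bit key is just two independent CRC32C accumulators, each seeded with ~0 and folded together as ((u64)high_crc << 31) ^ (u64)low_crc. The standalone userspace sketch below mirrors that arithmetic for illustration only; the local bitwise crc32c() is a stand-in for the kernel's btrfs_crc32c(), and it assumes a little-endian host in place of cpu_to_le64().

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Bitwise CRC32C (Castagnoli, reflected polynomial 0x82F63B78). No final
 * inversion here, matching how the function above seeds with ~0 and uses
 * the raw result directly.
 */
static uint32_t crc32c(uint32_t crc, const void *data, size_t len)
{
        const uint8_t *p = data;

        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1U));
        }
        return crc;
}

/* Illustrative re-implementation of the hash layout shown in the hunk. */
static uint64_t hash_extent_data_ref_demo(uint64_t root_objectid, uint64_t owner,
                                          uint64_t offset)
{
        uint32_t high_crc = ~(uint32_t)0;
        uint32_t low_crc = ~(uint32_t)0;
        uint64_t lenum;

        lenum = root_objectid;  /* kernel hashes cpu_to_le64(); assume LE host */
        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = owner;
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = offset;
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

        return ((uint64_t)high_crc << 31) ^ (uint64_t)low_crc;
}

int main(void)
{
        printf("%llx\n",
               (unsigned long long)hash_extent_data_ref_demo(5, 257, 0));
        return 0;
}
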
....@@ -1549,6 +857,11 @@
1549857 err = -ENOENT;
1550858 goto out;
1551859 } else if (WARN_ON(ret)) {
860
+ btrfs_print_leaf(path->nodes[0]);
861
+ btrfs_err(fs_info,
862
+"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
863
+ bytenr, num_bytes, parent, root_objectid, owner,
864
+ offset);
1552865 err = -EIO;
1553866 goto out;
1554867 }
....@@ -1685,7 +998,7 @@
1685998 type = extent_ref_type(parent, owner);
1686999 size = btrfs_extent_inline_ref_size(type);
16871000
1688
- btrfs_extend_item(fs_info, path, size);
1001
+ btrfs_extend_item(path, size);
16891002
16901003 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
16911004 refs = btrfs_extent_refs(leaf, ei);
....@@ -1760,7 +1073,6 @@
17601073 int *last_ref)
17611074 {
17621075 struct extent_buffer *leaf = path->nodes[0];
1763
- struct btrfs_fs_info *fs_info = leaf->fs_info;
17641076 struct btrfs_extent_item *ei;
17651077 struct btrfs_extent_data_ref *dref = NULL;
17661078 struct btrfs_shared_data_ref *sref = NULL;
....@@ -1815,7 +1127,7 @@
18151127 memmove_extent_buffer(leaf, ptr, ptr + size,
18161128 end - ptr - size);
18171129 item_size -= size;
1818
- btrfs_truncate_item(fs_info, path, item_size, 1);
1130
+ btrfs_truncate_item(path, item_size, 1);
18191131 }
18201132 btrfs_mark_buffer_dirty(leaf);
18211133 }
....@@ -1835,7 +1147,22 @@
18351147 num_bytes, parent, root_objectid,
18361148 owner, offset, 1);
18371149 if (ret == 0) {
1838
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1150
+ /*
1151
+ * We're adding refs to a tree block we already own, this
1152
+ * should not happen at all.
1153
+ */
1154
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1155
+ btrfs_crit(trans->fs_info,
1156
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
1157
+ bytenr, num_bytes, root_objectid);
1158
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
1159
+ WARN_ON(1);
1160
+ btrfs_crit(trans->fs_info,
1161
+ "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
1162
+ btrfs_print_leaf(path->nodes[0]);
1163
+ }
1164
+ return -EUCLEAN;
1165
+ }
18391166 update_inline_extent_backref(path, iref, refs_to_add,
18401167 extent_op, NULL);
18411168 } else if (ret == -ENOENT) {
....@@ -1843,24 +1170,6 @@
18431170 root_objectid, owner, offset,
18441171 refs_to_add, extent_op);
18451172 ret = 0;
1846
- }
1847
- return ret;
1848
-}
1849
-
1850
-static int insert_extent_backref(struct btrfs_trans_handle *trans,
1851
- struct btrfs_path *path,
1852
- u64 bytenr, u64 parent, u64 root_objectid,
1853
- u64 owner, u64 offset, int refs_to_add)
1854
-{
1855
- int ret;
1856
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1857
- BUG_ON(refs_to_add != 1);
1858
- ret = insert_tree_block_ref(trans, path, bytenr, parent,
1859
- root_objectid);
1860
- } else {
1861
- ret = insert_extent_data_ref(trans, path, bytenr, parent,
1862
- root_objectid, owner, offset,
1863
- refs_to_add);
18641173 }
18651174 return ret;
18661175 }
....@@ -1886,7 +1195,6 @@
18861195 return ret;
18871196 }
18881197
1889
-#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
18901198 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
18911199 u64 *discarded_bytes)
18921200 {
....@@ -1962,8 +1270,10 @@
19621270 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
19631271 u64 num_bytes, u64 *actual_bytes)
19641272 {
1965
- int ret;
1273
+ int ret = 0;
19661274 u64 discarded_bytes = 0;
1275
+ u64 end = bytenr + num_bytes;
1276
+ u64 cur = bytenr;
19671277 struct btrfs_bio *bbio = NULL;
19681278
19691279
....@@ -1972,15 +1282,23 @@
19721282 * associated to its stripes that don't go away while we are discarding.
19731283 */
19741284 btrfs_bio_counter_inc_blocked(fs_info);
1975
- /* Tell the block device(s) that the sectors can be discarded */
1976
- ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1977
- &bbio, 0);
1978
- /* Error condition is -ENOMEM */
1979
- if (!ret) {
1980
- struct btrfs_bio_stripe *stripe = bbio->stripes;
1285
+ while (cur < end) {
1286
+ struct btrfs_bio_stripe *stripe;
19811287 int i;
19821288
1289
+ num_bytes = end - cur;
1290
+ /* Tell the block device(s) that the sectors can be discarded */
1291
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
1292
+ &num_bytes, &bbio, 0);
1293
+ /*
1294
+ * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
1295
+ * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
1296
+ * thus we can't continue anyway.
1297
+ */
1298
+ if (ret < 0)
1299
+ goto out;
19831300
1301
+ stripe = bbio->stripes;
19841302 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
19851303 u64 bytes;
19861304 struct request_queue *req_q;
....@@ -2001,10 +1319,19 @@
20011319 stripe->physical,
20021320 stripe->length,
20031321 &bytes);
2004
- if (!ret)
1322
+ if (!ret) {
20051323 discarded_bytes += bytes;
2006
- else if (ret != -EOPNOTSUPP)
2007
- break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1324
+ } else if (ret != -EOPNOTSUPP) {
1325
+ /*
1326
+ * Logic errors or -ENOMEM, or -EIO, but
1327
+ * unlikely to happen.
1328
+ *
1329
+ * And since there are two loops, explicitly
1330
+ * go to out to avoid confusion.
1331
+ */
1332
+ btrfs_put_bbio(bbio);
1333
+ goto out;
1334
+ }
20081335
20091336 /*
20101337 * Just in case we get back EOPNOTSUPP for some reason,
....@@ -2014,7 +1341,9 @@
20141341 ret = 0;
20151342 }
20161343 btrfs_put_bbio(bbio);
1344
+ cur += num_bytes;
20171345 }
1346
+out:
20181347 btrfs_bio_counter_dec(fs_info);
20191348
20201349 if (actual_bytes)
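
The restructured btrfs_discard_extent() above is essentially a range-splitting loop: map the remaining logical range, let the mapping call shrink the length to the portion covered by the current chunk, discard that chunk's stripes, then advance. The sketch below models only that control flow under toy assumptions; map_logical_range() and discard_stripe() are hypothetical stand-ins for btrfs_map_block() and btrfs_issue_discard(), not real APIs.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct stripe { uint64_t physical; uint64_t length; };

/*
 * Toy stand-in for btrfs_map_block(): pretend every chunk is 64 MiB, so a
 * mapping never crosses a 64 MiB boundary, and report a single 1:1 stripe.
 */
static int map_logical_range(uint64_t logical, uint64_t *len, struct stripe *stripe)
{
        uint64_t chunk_end = (logical | (64ULL * 1024 * 1024 - 1)) + 1;

        if (*len > chunk_end - logical)
                *len = chunk_end - logical;
        stripe->physical = logical;
        stripe->length = *len;
        return 0;
}

/* Toy stand-in for btrfs_issue_discard(): pretend the discard always works. */
static int discard_stripe(const struct stripe *stripe, uint64_t *discarded)
{
        *discarded = stripe->length;
        return 0;
}

/*
 * Skeleton of the new loop: each mapping call may shrink len to one chunk's
 * portion of the range, and cur advances by exactly that amount.
 */
static int discard_logical_range(uint64_t start, uint64_t num_bytes,
                                 uint64_t *actual_bytes)
{
        uint64_t cur = start, end = start + num_bytes, total = 0;
        int ret = 0;

        while (cur < end) {
                uint64_t len = end - cur;
                uint64_t bytes = 0;
                struct stripe stripe;

                ret = map_logical_range(cur, &len, &stripe);
                if (ret < 0)
                        break;          /* len was not updated, cannot continue */

                ret = discard_stripe(&stripe, &bytes);
                if (!ret)
                        total += bytes;
                else if (ret != -EOPNOTSUPP)
                        break;          /* real error, stop */
                ret = 0;                /* unsupported discard is not fatal */

                cur += len;             /* advance past this chunk's portion */
        }
        if (actual_bytes)
                *actual_bytes = total;
        return ret;
}

int main(void)
{
        uint64_t done = 0;

        discard_logical_range(10ULL * 1024 * 1024, 200ULL * 1024 * 1024, &done);
        printf("discarded %llu bytes\n", (unsigned long long)done);
        return 0;
}
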
....@@ -2028,45 +1357,31 @@
20281357
20291358 /* Can return -ENOMEM */
20301359 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2031
- struct btrfs_root *root,
2032
- u64 bytenr, u64 num_bytes, u64 parent,
2033
- u64 root_objectid, u64 owner, u64 offset)
1360
+ struct btrfs_ref *generic_ref)
20341361 {
2035
- struct btrfs_fs_info *fs_info = root->fs_info;
2036
- int old_ref_mod, new_ref_mod;
1362
+ struct btrfs_fs_info *fs_info = trans->fs_info;
20371363 int ret;
20381364
2039
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2040
- root_objectid == BTRFS_TREE_LOG_OBJECTID);
1365
+ ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1366
+ generic_ref->action);
1367
+ BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1368
+ generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
20411369
2042
- btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2043
- owner, offset, BTRFS_ADD_DELAYED_REF);
1370
+ if (generic_ref->type == BTRFS_REF_METADATA)
1371
+ ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1372
+ else
1373
+ ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
20441374
2045
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2046
- ret = btrfs_add_delayed_tree_ref(trans, bytenr,
2047
- num_bytes, parent,
2048
- root_objectid, (int)owner,
2049
- BTRFS_ADD_DELAYED_REF, NULL,
2050
- &old_ref_mod, &new_ref_mod);
2051
- } else {
2052
- ret = btrfs_add_delayed_data_ref(trans, bytenr,
2053
- num_bytes, parent,
2054
- root_objectid, owner, offset,
2055
- 0, BTRFS_ADD_DELAYED_REF,
2056
- &old_ref_mod, &new_ref_mod);
2057
- }
2058
-
2059
- if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
2060
- bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
2061
-
2062
- add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
2063
- }
1375
+ btrfs_ref_tree_mod(fs_info, generic_ref);
20641376
20651377 return ret;
20661378 }
20671379
20681380 /*
20691381 * __btrfs_inc_extent_ref - insert backreference for a given extent
1382
+ *
1383
+ * The counterpart is in __btrfs_free_extent(), with examples and more details
1384
+ * how it works.
20701385 *
20711386 * @trans: Handle of transaction
20721387 *
....@@ -2118,7 +1433,6 @@
21181433 if (!path)
21191434 return -ENOMEM;
21201435
2121
- path->reada = READA_FORWARD;
21221436 path->leave_spinning = 1;
21231437 /* this will setup the path even if it fails to insert the back ref */
21241438 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
....@@ -2143,11 +1457,17 @@
21431457 btrfs_mark_buffer_dirty(leaf);
21441458 btrfs_release_path(path);
21451459
2146
- path->reada = READA_FORWARD;
21471460 path->leave_spinning = 1;
21481461 /* now insert the actual backref */
2149
- ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2150
- owner, offset, refs_to_add);
1462
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1463
+ BUG_ON(refs_to_add != 1);
1464
+ ret = insert_tree_block_ref(trans, path, bytenr, parent,
1465
+ root_objectid);
1466
+ } else {
1467
+ ret = insert_extent_data_ref(trans, path, bytenr, parent,
1468
+ root_objectid, owner, offset,
1469
+ refs_to_add);
1470
+ }
21511471 if (ret)
21521472 btrfs_abort_transaction(trans, ret);
21531473 out:
....@@ -2232,7 +1552,7 @@
22321552 int err = 0;
22331553 int metadata = !extent_op->is_data;
22341554
2235
- if (trans->aborted)
1555
+ if (TRANS_ABORTED(trans))
22361556 return 0;
22371557
22381558 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
....@@ -2253,7 +1573,6 @@
22531573 }
22541574
22551575 again:
2256
- path->reada = READA_FORWARD;
22571576 path->leave_spinning = 1;
22581577 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
22591578 if (ret < 0) {
....@@ -2352,10 +1671,9 @@
23521671 {
23531672 int ret = 0;
23541673
2355
- if (trans->aborted) {
1674
+ if (TRANS_ABORTED(trans)) {
23561675 if (insert_reserved)
2357
- btrfs_pin_extent(trans->fs_info, node->bytenr,
2358
- node->num_bytes, 1);
1676
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
23591677 return 0;
23601678 }
23611679
....@@ -2370,8 +1688,12 @@
23701688 else
23711689 BUG();
23721690 if (ret && insert_reserved)
2373
- btrfs_pin_extent(trans->fs_info, node->bytenr,
2374
- node->num_bytes, 1);
1691
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1692
+ if (ret < 0)
1693
+ btrfs_err(trans->fs_info,
1694
+"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1695
+ node->bytenr, node->num_bytes, node->type,
1696
+ node->action, node->ref_mod, ret);
23751697 return ret;
23761698 }
23771699
....@@ -2380,7 +1702,7 @@
23801702 {
23811703 struct btrfs_delayed_ref_node *ref;
23821704
2383
- if (RB_EMPTY_ROOT(&head->ref_tree))
1705
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
23841706 return NULL;
23851707
23861708 /*
....@@ -2393,7 +1715,7 @@
23931715 return list_first_entry(&head->ref_add_list,
23941716 struct btrfs_delayed_ref_node, add_list);
23951717
2396
- ref = rb_entry(rb_first(&head->ref_tree),
1718
+ ref = rb_entry(rb_first_cached(&head->ref_tree),
23971719 struct btrfs_delayed_ref_node, ref_node);
23981720 ASSERT(list_empty(&ref->add_list));
23991721 return ref;
....@@ -2409,23 +1731,69 @@
24091731 btrfs_delayed_ref_unlock(head);
24101732 }
24111733
2412
-static int cleanup_extent_op(struct btrfs_trans_handle *trans,
2413
- struct btrfs_delayed_ref_head *head)
1734
+static struct btrfs_delayed_extent_op *cleanup_extent_op(
1735
+ struct btrfs_delayed_ref_head *head)
24141736 {
24151737 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1738
+
1739
+ if (!extent_op)
1740
+ return NULL;
1741
+
1742
+ if (head->must_insert_reserved) {
1743
+ head->extent_op = NULL;
1744
+ btrfs_free_delayed_extent_op(extent_op);
1745
+ return NULL;
1746
+ }
1747
+ return extent_op;
1748
+}
1749
+
1750
+static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1751
+ struct btrfs_delayed_ref_head *head)
1752
+{
1753
+ struct btrfs_delayed_extent_op *extent_op;
24161754 int ret;
24171755
1756
+ extent_op = cleanup_extent_op(head);
24181757 if (!extent_op)
24191758 return 0;
24201759 head->extent_op = NULL;
2421
- if (head->must_insert_reserved) {
2422
- btrfs_free_delayed_extent_op(extent_op);
2423
- return 0;
2424
- }
24251760 spin_unlock(&head->lock);
24261761 ret = run_delayed_extent_op(trans, head, extent_op);
24271762 btrfs_free_delayed_extent_op(extent_op);
24281763 return ret ? ret : 1;
1764
+}
1765
+
1766
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1767
+ struct btrfs_delayed_ref_root *delayed_refs,
1768
+ struct btrfs_delayed_ref_head *head)
1769
+{
1770
+ int nr_items = 1; /* Dropping this ref head update. */
1771
+
1772
+ /*
1773
+ * We had csum deletions accounted for in our delayed refs rsv, we need
1774
+ * to drop the csum leaves for this update from our delayed_refs_rsv.
1775
+ */
1776
+ if (head->total_ref_mod < 0 && head->is_data) {
1777
+ spin_lock(&delayed_refs->lock);
1778
+ delayed_refs->pending_csums -= head->num_bytes;
1779
+ spin_unlock(&delayed_refs->lock);
1780
+ nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1781
+ }
1782
+
1783
+ /*
1784
+ * We were dropping refs, or had a new ref and dropped it, and thus must
1785
+ * adjust down our total_bytes_pinned, the space may or may not have
1786
+ * been pinned and so is accounted for properly in the pinned space by
1787
+ * now.
1788
+ */
1789
+ if (head->total_ref_mod < 0 ||
1790
+ (head->total_ref_mod == 0 && head->must_insert_reserved)) {
1791
+ u64 flags = btrfs_ref_head_to_space_flags(head);
1792
+
1793
+ btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
1794
+ }
1795
+
1796
+ btrfs_delayed_refs_rsv_release(fs_info, nr_items);
24291797 }
24301798
24311799 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
....@@ -2438,7 +1806,7 @@
24381806
24391807 delayed_refs = &trans->transaction->delayed_refs;
24401808
2441
- ret = cleanup_extent_op(trans, head);
1809
+ ret = run_and_cleanup_extent_op(trans, head);
24421810 if (ret < 0) {
24431811 unselect_delayed_ref_head(delayed_refs, head);
24441812 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
....@@ -2454,156 +1822,91 @@
24541822 spin_unlock(&head->lock);
24551823 spin_lock(&delayed_refs->lock);
24561824 spin_lock(&head->lock);
2457
- if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
1825
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
24581826 spin_unlock(&head->lock);
24591827 spin_unlock(&delayed_refs->lock);
24601828 return 1;
24611829 }
2462
- delayed_refs->num_heads--;
2463
- rb_erase(&head->href_node, &delayed_refs->href_root);
2464
- RB_CLEAR_NODE(&head->href_node);
1830
+ btrfs_delete_ref_head(delayed_refs, head);
24651831 spin_unlock(&head->lock);
24661832 spin_unlock(&delayed_refs->lock);
2467
- atomic_dec(&delayed_refs->num_entries);
2468
-
2469
- trace_run_delayed_ref_head(fs_info, head, 0);
2470
-
2471
- if (head->total_ref_mod < 0) {
2472
- struct btrfs_space_info *space_info;
2473
- u64 flags;
2474
-
2475
- if (head->is_data)
2476
- flags = BTRFS_BLOCK_GROUP_DATA;
2477
- else if (head->is_system)
2478
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
2479
- else
2480
- flags = BTRFS_BLOCK_GROUP_METADATA;
2481
- space_info = __find_space_info(fs_info, flags);
2482
- ASSERT(space_info);
2483
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
2484
- -head->num_bytes,
2485
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
2486
-
2487
- if (head->is_data) {
2488
- spin_lock(&delayed_refs->lock);
2489
- delayed_refs->pending_csums -= head->num_bytes;
2490
- spin_unlock(&delayed_refs->lock);
2491
- }
2492
- }
24931833
24941834 if (head->must_insert_reserved) {
2495
- btrfs_pin_extent(fs_info, head->bytenr,
2496
- head->num_bytes, 1);
1835
+ btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
24971836 if (head->is_data) {
24981837 ret = btrfs_del_csums(trans, fs_info->csum_root,
24991838 head->bytenr, head->num_bytes);
25001839 }
25011840 }
25021841
2503
- /* Also free its reserved qgroup space */
2504
- btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2505
- head->qgroup_reserved);
1842
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1843
+
1844
+ trace_run_delayed_ref_head(fs_info, head, 0);
25061845 btrfs_delayed_ref_unlock(head);
25071846 btrfs_put_delayed_ref_head(head);
25081847 return ret;
25091848 }
25101849
2511
-/*
2512
- * Returns 0 on success or if called with an already aborted transaction.
2513
- * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2514
- */
2515
-static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2516
- unsigned long nr)
1850
+static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1851
+ struct btrfs_trans_handle *trans)
1852
+{
1853
+ struct btrfs_delayed_ref_root *delayed_refs =
1854
+ &trans->transaction->delayed_refs;
1855
+ struct btrfs_delayed_ref_head *head = NULL;
1856
+ int ret;
1857
+
1858
+ spin_lock(&delayed_refs->lock);
1859
+ head = btrfs_select_ref_head(delayed_refs);
1860
+ if (!head) {
1861
+ spin_unlock(&delayed_refs->lock);
1862
+ return head;
1863
+ }
1864
+
1865
+ /*
1866
+ * Grab the lock that says we are going to process all the refs for
1867
+ * this head
1868
+ */
1869
+ ret = btrfs_delayed_ref_lock(delayed_refs, head);
1870
+ spin_unlock(&delayed_refs->lock);
1871
+
1872
+ /*
1873
+ * We may have dropped the spin lock to get the head mutex lock, and
1874
+ * that might have given someone else time to free the head. If that's
1875
+ * true, it has been removed from our list and we can move on.
1876
+ */
1877
+ if (ret == -EAGAIN)
1878
+ head = ERR_PTR(-EAGAIN);
1879
+
1880
+ return head;
1881
+}
1882
+
1883
+static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1884
+ struct btrfs_delayed_ref_head *locked_ref,
1885
+ unsigned long *run_refs)
25171886 {
25181887 struct btrfs_fs_info *fs_info = trans->fs_info;
25191888 struct btrfs_delayed_ref_root *delayed_refs;
2520
- struct btrfs_delayed_ref_node *ref;
2521
- struct btrfs_delayed_ref_head *locked_ref = NULL;
25221889 struct btrfs_delayed_extent_op *extent_op;
2523
- ktime_t start = ktime_get();
2524
- int ret;
2525
- unsigned long count = 0;
2526
- unsigned long actual_count = 0;
1890
+ struct btrfs_delayed_ref_node *ref;
25271891 int must_insert_reserved = 0;
1892
+ int ret;
25281893
25291894 delayed_refs = &trans->transaction->delayed_refs;
2530
- while (1) {
2531
- if (!locked_ref) {
2532
- if (count >= nr)
2533
- break;
25341895
2535
- spin_lock(&delayed_refs->lock);
2536
- locked_ref = btrfs_select_ref_head(trans);
2537
- if (!locked_ref) {
2538
- spin_unlock(&delayed_refs->lock);
2539
- break;
2540
- }
1896
+ lockdep_assert_held(&locked_ref->mutex);
1897
+ lockdep_assert_held(&locked_ref->lock);
25411898
2542
- /* grab the lock that says we are going to process
2543
- * all the refs for this head */
2544
- ret = btrfs_delayed_ref_lock(trans, locked_ref);
2545
- spin_unlock(&delayed_refs->lock);
2546
- /*
2547
- * we may have dropped the spin lock to get the head
2548
- * mutex lock, and that might have given someone else
2549
- * time to free the head. If that's true, it has been
2550
- * removed from our list and we can move on.
2551
- */
2552
- if (ret == -EAGAIN) {
2553
- locked_ref = NULL;
2554
- count++;
2555
- continue;
2556
- }
2557
- }
2558
-
2559
- /*
2560
- * We need to try and merge add/drops of the same ref since we
2561
- * can run into issues with relocate dropping the implicit ref
2562
- * and then it being added back again before the drop can
2563
- * finish. If we merged anything we need to re-loop so we can
2564
- * get a good ref.
2565
- * Or we can get node references of the same type that weren't
2566
- * merged when created due to bumps in the tree mod seq, and
2567
- * we need to merge them to prevent adding an inline extent
2568
- * backref before dropping it (triggering a BUG_ON at
2569
- * insert_inline_extent_backref()).
2570
- */
2571
- spin_lock(&locked_ref->lock);
2572
- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2573
-
2574
- ref = select_delayed_ref(locked_ref);
2575
-
2576
- if (ref && ref->seq &&
1899
+ while ((ref = select_delayed_ref(locked_ref))) {
1900
+ if (ref->seq &&
25771901 btrfs_check_delayed_seq(fs_info, ref->seq)) {
25781902 spin_unlock(&locked_ref->lock);
25791903 unselect_delayed_ref_head(delayed_refs, locked_ref);
2580
- locked_ref = NULL;
2581
- cond_resched();
2582
- count++;
2583
- continue;
1904
+ return -EAGAIN;
25841905 }
25851906
2586
- /*
2587
- * We're done processing refs in this ref_head, clean everything
2588
- * up and move on to the next ref_head.
2589
- */
2590
- if (!ref) {
2591
- ret = cleanup_ref_head(trans, locked_ref);
2592
- if (ret > 0 ) {
2593
- /* We dropped our lock, we need to loop. */
2594
- ret = 0;
2595
- continue;
2596
- } else if (ret) {
2597
- return ret;
2598
- }
2599
- locked_ref = NULL;
2600
- count++;
2601
- continue;
2602
- }
2603
-
2604
- actual_count++;
1907
+ (*run_refs)++;
26051908 ref->in_tree = 0;
2606
- rb_erase(&ref->ref_node, &locked_ref->ref_tree);
1909
+ rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
26071910 RB_CLEAR_NODE(&ref->ref_node);
26081911 if (!list_empty(&ref->add_list))
26091912 list_del(&ref->add_list);
....@@ -2625,8 +1928,8 @@
26251928 atomic_dec(&delayed_refs->num_entries);
26261929
26271930 /*
2628
- * Record the must-insert_reserved flag before we drop the spin
2629
- * lock.
1931
+ * Record the must_insert_reserved flag before we drop the
1932
+ * spin lock.
26301933 */
26311934 must_insert_reserved = locked_ref->must_insert_reserved;
26321935 locked_ref->must_insert_reserved = 0;
....@@ -2642,15 +1945,93 @@
26421945 if (ret) {
26431946 unselect_delayed_ref_head(delayed_refs, locked_ref);
26441947 btrfs_put_delayed_ref(ref);
2645
- btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2646
- ret);
26471948 return ret;
26481949 }
26491950
26501951 btrfs_put_delayed_ref(ref);
2651
- count++;
26521952 cond_resched();
1953
+
1954
+ spin_lock(&locked_ref->lock);
1955
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
26531956 }
1957
+
1958
+ return 0;
1959
+}
1960
+
1961
+/*
1962
+ * Returns 0 on success or if called with an already aborted transaction.
1963
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
1964
+ */
1965
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1966
+ unsigned long nr)
1967
+{
1968
+ struct btrfs_fs_info *fs_info = trans->fs_info;
1969
+ struct btrfs_delayed_ref_root *delayed_refs;
1970
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
1971
+ ktime_t start = ktime_get();
1972
+ int ret;
1973
+ unsigned long count = 0;
1974
+ unsigned long actual_count = 0;
1975
+
1976
+ delayed_refs = &trans->transaction->delayed_refs;
1977
+ do {
1978
+ if (!locked_ref) {
1979
+ locked_ref = btrfs_obtain_ref_head(trans);
1980
+ if (IS_ERR_OR_NULL(locked_ref)) {
1981
+ if (PTR_ERR(locked_ref) == -EAGAIN) {
1982
+ continue;
1983
+ } else {
1984
+ break;
1985
+ }
1986
+ }
1987
+ count++;
1988
+ }
1989
+ /*
1990
+ * We need to try and merge add/drops of the same ref since we
1991
+ * can run into issues with relocate dropping the implicit ref
1992
+ * and then it being added back again before the drop can
1993
+ * finish. If we merged anything we need to re-loop so we can
1994
+ * get a good ref.
1995
+ * Or we can get node references of the same type that weren't
1996
+ * merged when created due to bumps in the tree mod seq, and
1997
+ * we need to merge them to prevent adding an inline extent
1998
+ * backref before dropping it (triggering a BUG_ON at
1999
+ * insert_inline_extent_backref()).
2000
+ */
2001
+ spin_lock(&locked_ref->lock);
2002
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2003
+
2004
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2005
+ &actual_count);
2006
+ if (ret < 0 && ret != -EAGAIN) {
2007
+ /*
2008
+ * Error, btrfs_run_delayed_refs_for_head already
2009
+ * unlocked everything so just bail out
2010
+ */
2011
+ return ret;
2012
+ } else if (!ret) {
2013
+ /*
2014
+ * Success, perform the usual cleanup of a processed
2015
+ * head
2016
+ */
2017
+ ret = cleanup_ref_head(trans, locked_ref);
2018
+ if (ret > 0 ) {
2019
+ /* We dropped our lock, we need to loop. */
2020
+ ret = 0;
2021
+ continue;
2022
+ } else if (ret) {
2023
+ return ret;
2024
+ }
2025
+ }
2026
+
2027
+ /*
2028
+ * Either success case or btrfs_run_delayed_refs_for_head
2029
+ * returned -EAGAIN, meaning we need to select another head
2030
+ */
2031
+
2032
+ locked_ref = NULL;
2033
+ cond_resched();
2034
+ } while ((nr != -1 && count < nr) || locked_ref);
26542035
26552036 /*
26562037 * We don't want to include ref heads since we can have empty ref heads
....@@ -2716,22 +2097,6 @@
27162097 }
27172098 #endif
27182099
2719
-static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2720
-{
2721
- u64 num_bytes;
2722
-
2723
- num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2724
- sizeof(struct btrfs_extent_inline_ref));
2725
- if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2726
- num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2727
-
2728
- /*
2729
- * We don't ever fill up leaves all the way so multiply by 2 just to be
2730
- * closer to what we're really going to want to use.
2731
- */
2732
- return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2733
-}
2734
-
27352100 /*
27362101 * Takes the number of bytes to be csumm'ed and figures out how many leaves it
27372102 * would require to store the csums for that many bytes.
....@@ -2749,153 +2114,6 @@
27492114 num_csums += num_csums_per_leaf - 1;
27502115 num_csums = div64_u64(num_csums, num_csums_per_leaf);
27512116 return num_csums;
2752
-}
2753
-
2754
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2755
- struct btrfs_fs_info *fs_info)
2756
-{
2757
- struct btrfs_block_rsv *global_rsv;
2758
- u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2759
- u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2760
- unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
2761
- u64 num_bytes, num_dirty_bgs_bytes;
2762
- int ret = 0;
2763
-
2764
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
2765
- num_heads = heads_to_leaves(fs_info, num_heads);
2766
- if (num_heads > 1)
2767
- num_bytes += (num_heads - 1) * fs_info->nodesize;
2768
- num_bytes <<= 1;
2769
- num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
2770
- fs_info->nodesize;
2771
- num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
2772
- num_dirty_bgs);
2773
- global_rsv = &fs_info->global_block_rsv;
2774
-
2775
- /*
2776
- * If we can't allocate any more chunks lets make sure we have _lots_ of
2777
- * wiggle room since running delayed refs can create more delayed refs.
2778
- */
2779
- if (global_rsv->space_info->full) {
2780
- num_dirty_bgs_bytes <<= 1;
2781
- num_bytes <<= 1;
2782
- }
2783
-
2784
- spin_lock(&global_rsv->lock);
2785
- if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2786
- ret = 1;
2787
- spin_unlock(&global_rsv->lock);
2788
- return ret;
2789
-}
2790
-
2791
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2792
- struct btrfs_fs_info *fs_info)
2793
-{
2794
- u64 num_entries =
2795
- atomic_read(&trans->transaction->delayed_refs.num_entries);
2796
- u64 avg_runtime;
2797
- u64 val;
2798
-
2799
- smp_mb();
2800
- avg_runtime = fs_info->avg_delayed_ref_runtime;
2801
- val = num_entries * avg_runtime;
2802
- if (val >= NSEC_PER_SEC)
2803
- return 1;
2804
- if (val >= NSEC_PER_SEC / 2)
2805
- return 2;
2806
-
2807
- return btrfs_check_space_for_delayed_refs(trans, fs_info);
2808
-}
2809
-
2810
-struct async_delayed_refs {
2811
- struct btrfs_root *root;
2812
- u64 transid;
2813
- int count;
2814
- int error;
2815
- int sync;
2816
- struct completion wait;
2817
- struct btrfs_work work;
2818
-};
2819
-
2820
-static inline struct async_delayed_refs *
2821
-to_async_delayed_refs(struct btrfs_work *work)
2822
-{
2823
- return container_of(work, struct async_delayed_refs, work);
2824
-}
2825
-
2826
-static void delayed_ref_async_start(struct btrfs_work *work)
2827
-{
2828
- struct async_delayed_refs *async = to_async_delayed_refs(work);
2829
- struct btrfs_trans_handle *trans;
2830
- struct btrfs_fs_info *fs_info = async->root->fs_info;
2831
- int ret;
2832
-
2833
- /* if the commit is already started, we don't need to wait here */
2834
- if (btrfs_transaction_blocked(fs_info))
2835
- goto done;
2836
-
2837
- trans = btrfs_join_transaction(async->root);
2838
- if (IS_ERR(trans)) {
2839
- async->error = PTR_ERR(trans);
2840
- goto done;
2841
- }
2842
-
2843
- /*
2844
- * trans->sync means that when we call end_transaction, we won't
2845
- * wait on delayed refs
2846
- */
2847
- trans->sync = true;
2848
-
2849
- /* Don't bother flushing if we got into a different transaction */
2850
- if (trans->transid > async->transid)
2851
- goto end;
2852
-
2853
- ret = btrfs_run_delayed_refs(trans, async->count);
2854
- if (ret)
2855
- async->error = ret;
2856
-end:
2857
- ret = btrfs_end_transaction(trans);
2858
- if (ret && !async->error)
2859
- async->error = ret;
2860
-done:
2861
- if (async->sync)
2862
- complete(&async->wait);
2863
- else
2864
- kfree(async);
2865
-}
2866
-
2867
-int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
2868
- unsigned long count, u64 transid, int wait)
2869
-{
2870
- struct async_delayed_refs *async;
2871
- int ret;
2872
-
2873
- async = kmalloc(sizeof(*async), GFP_NOFS);
2874
- if (!async)
2875
- return -ENOMEM;
2876
-
2877
- async->root = fs_info->tree_root;
2878
- async->count = count;
2879
- async->error = 0;
2880
- async->transid = transid;
2881
- if (wait)
2882
- async->sync = 1;
2883
- else
2884
- async->sync = 0;
2885
- init_completion(&async->wait);
2886
-
2887
- btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2888
- delayed_ref_async_start, NULL, NULL);
2889
-
2890
- btrfs_queue_work(fs_info->extent_workers, &async->work);
2891
-
2892
- if (wait) {
2893
- wait_for_completion(&async->wait);
2894
- ret = async->error;
2895
- kfree(async);
2896
- return ret;
2897
- }
2898
- return 0;
28992117 }
29002118
29012119 /*
....@@ -2919,7 +2137,7 @@
29192137 int run_all = count == (unsigned long)-1;
29202138
29212139 /* We'll clean this up in btrfs_cleanup_transaction */
2922
- if (trans->aborted)
2140
+ if (TRANS_ABORTED(trans))
29232141 return 0;
29242142
29252143 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
....@@ -2940,11 +2158,10 @@
29402158 }
29412159
29422160 if (run_all) {
2943
- if (!list_empty(&trans->new_bgs))
2944
- btrfs_create_pending_block_groups(trans);
2161
+ btrfs_create_pending_block_groups(trans);
29452162
29462163 spin_lock(&delayed_refs->lock);
2947
- node = rb_first(&delayed_refs->href_root);
2164
+ node = rb_first_cached(&delayed_refs->href_root);
29482165 if (!node) {
29492166 spin_unlock(&delayed_refs->lock);
29502167 goto out;
....@@ -2967,8 +2184,7 @@
29672184 }
29682185
29692186 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2970
- struct btrfs_fs_info *fs_info,
2971
- u64 bytenr, u64 num_bytes, u64 flags,
2187
+ struct extent_buffer *eb, u64 flags,
29722188 int level, int is_data)
29732189 {
29742190 struct btrfs_delayed_extent_op *extent_op;
....@@ -2984,8 +2200,7 @@
29842200 extent_op->is_data = is_data ? true : false;
29852201 extent_op->level = level;
29862202
2987
- ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
2988
- num_bytes, extent_op);
2203
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
29892204 if (ret)
29902205 btrfs_free_delayed_extent_op(extent_op);
29912206 return ret;
....@@ -3043,7 +2258,8 @@
30432258 * XXX: We should replace this with a proper search function in the
30442259 * future.
30452260 */
3046
- for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
2261
+ for (node = rb_first_cached(&head->ref_tree); node;
2262
+ node = rb_next(node)) {
30472263 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
30482264 /* If it's a shared ref we know a cross reference exists */
30492265 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
....@@ -3072,7 +2288,8 @@
30722288
30732289 static noinline int check_committed_ref(struct btrfs_root *root,
30742290 struct btrfs_path *path,
3075
- u64 objectid, u64 offset, u64 bytenr)
2291
+ u64 objectid, u64 offset, u64 bytenr,
2292
+ bool strict)
30762293 {
30772294 struct btrfs_fs_info *fs_info = root->fs_info;
30782295 struct btrfs_root *extent_root = fs_info->extent_root;
....@@ -3109,16 +2326,23 @@
31092326 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
31102327 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
31112328
2329
+ /* If extent item has more than 1 inline ref then it's shared */
31122330 if (item_size != sizeof(*ei) +
31132331 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
31142332 goto out;
31152333
3116
- if (btrfs_extent_generation(leaf, ei) <=
3117
- btrfs_root_last_snapshot(&root->root_item))
2334
+ /*
2335
+ * If extent created before last snapshot => it's shared unless the
2336
+ * snapshot has been deleted. Use the heuristic if strict is false.
2337
+ */
2338
+ if (!strict &&
2339
+ (btrfs_extent_generation(leaf, ei) <=
2340
+ btrfs_root_last_snapshot(&root->root_item)))
31182341 goto out;
31192342
31202343 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
31212344
2345
+ /* If this extent has SHARED_DATA_REF then it's shared */
31222346 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
31232347 if (type != BTRFS_EXTENT_DATA_REF_KEY)
31242348 goto out;
....@@ -3138,11 +2362,10 @@
31382362 }
31392363
31402364 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3141
- u64 bytenr)
2365
+ u64 bytenr, bool strict)
31422366 {
31432367 struct btrfs_path *path;
31442368 int ret;
3145
- int ret2;
31462369
31472370 path = btrfs_alloc_path();
31482371 if (!path)
....@@ -3150,21 +2373,13 @@
31502373
31512374 do {
31522375 ret = check_committed_ref(root, path, objectid,
3153
- offset, bytenr);
2376
+ offset, bytenr, strict);
31542377 if (ret && ret != -ENOENT)
31552378 goto out;
31562379
3157
- ret2 = check_delayed_ref(root, path, objectid,
3158
- offset, bytenr);
3159
- } while (ret2 == -EAGAIN);
2380
+ ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2381
+ } while (ret == -EAGAIN);
31602382
3161
- if (ret2 && ret2 != -ENOENT) {
3162
- ret = ret2;
3163
- goto out;
3164
- }
3165
-
3166
- if (ret != -ENOENT || ret2 != -ENOENT)
3167
- ret = 0;
31682383 out:
31692384 btrfs_free_path(path);
31702385 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
....@@ -3185,13 +2400,12 @@
31852400 u32 nritems;
31862401 struct btrfs_key key;
31872402 struct btrfs_file_extent_item *fi;
2403
+ struct btrfs_ref generic_ref = { 0 };
2404
+ bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
31882405 int i;
2406
+ int action;
31892407 int level;
31902408 int ret = 0;
3191
- int (*process_func)(struct btrfs_trans_handle *,
3192
- struct btrfs_root *,
3193
- u64, u64, u64, u64, u64, u64);
3194
-
31952409
31962410 if (btrfs_is_testing(fs_info))
31972411 return 0;
....@@ -3200,18 +2414,17 @@
32002414 nritems = btrfs_header_nritems(buf);
32012415 level = btrfs_header_level(buf);
32022416
3203
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
2417
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
32042418 return 0;
3205
-
3206
- if (inc)
3207
- process_func = btrfs_inc_extent_ref;
3208
- else
3209
- process_func = btrfs_free_extent;
32102419
32112420 if (full_backref)
32122421 parent = buf->start;
32132422 else
32142423 parent = 0;
2424
+ if (inc)
2425
+ action = BTRFS_ADD_DELAYED_REF;
2426
+ else
2427
+ action = BTRFS_DROP_DELAYED_REF;
32152428
32162429 for (i = 0; i < nritems; i++) {
32172430 if (level == 0) {
....@@ -3229,16 +2442,30 @@
32292442
32302443 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
32312444 key.offset -= btrfs_file_extent_offset(buf, fi);
3232
- ret = process_func(trans, root, bytenr, num_bytes,
3233
- parent, ref_root, key.objectid,
3234
- key.offset);
2445
+ btrfs_init_generic_ref(&generic_ref, action, bytenr,
2446
+ num_bytes, parent);
2447
+ generic_ref.real_root = root->root_key.objectid;
2448
+ btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2449
+ key.offset);
2450
+ generic_ref.skip_qgroup = for_reloc;
2451
+ if (inc)
2452
+ ret = btrfs_inc_extent_ref(trans, &generic_ref);
2453
+ else
2454
+ ret = btrfs_free_extent(trans, &generic_ref);
32352455 if (ret)
32362456 goto fail;
32372457 } else {
32382458 bytenr = btrfs_node_blockptr(buf, i);
32392459 num_bytes = fs_info->nodesize;
3240
- ret = process_func(trans, root, bytenr, num_bytes,
3241
- parent, ref_root, level - 1, 0);
2460
+ btrfs_init_generic_ref(&generic_ref, action, bytenr,
2461
+ num_bytes, parent);
2462
+ generic_ref.real_root = root->root_key.objectid;
2463
+ btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
2464
+ generic_ref.skip_qgroup = for_reloc;
2465
+ if (inc)
2466
+ ret = btrfs_inc_extent_ref(trans, &generic_ref);
2467
+ else
2468
+ ret = btrfs_free_extent(trans, &generic_ref);
32422469 if (ret)
32432470 goto fail;
32442471 }
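The rewritten loop above drops the old process_func function pointer (btrfs_inc_extent_ref vs. btrfs_free_extent called with a long positional argument list) in favor of a btrfs_ref descriptor that is filled in first and then passed to whichever operation applies. Condensed from the added lines, the data-extent case follows this shape (a sketch reusing the same variables as the hunk):

	struct btrfs_ref generic_ref = { 0 };

	/* describe the extent once ... */
	btrfs_init_generic_ref(&generic_ref, action, bytenr, num_bytes, parent);
	generic_ref.real_root = root->root_key.objectid;
	btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, key.offset);
	generic_ref.skip_qgroup = for_reloc;

	/* ... then add or drop the reference with the same descriptor */
	if (inc)
		ret = btrfs_inc_extent_ref(trans, &generic_ref);
	else
		ret = btrfs_free_extent(trans, &generic_ref);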
....@@ -3260,555 +2487,9 @@
32602487 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
32612488 }
32622489
3263
-static int write_one_cache_group(struct btrfs_trans_handle *trans,
3264
- struct btrfs_fs_info *fs_info,
3265
- struct btrfs_path *path,
3266
- struct btrfs_block_group_cache *cache)
3267
-{
3268
- int ret;
3269
- struct btrfs_root *extent_root = fs_info->extent_root;
3270
- unsigned long bi;
3271
- struct extent_buffer *leaf;
3272
-
3273
- ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3274
- if (ret) {
3275
- if (ret > 0)
3276
- ret = -ENOENT;
3277
- goto fail;
3278
- }
3279
-
3280
- leaf = path->nodes[0];
3281
- bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3282
- write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3283
- btrfs_mark_buffer_dirty(leaf);
3284
-fail:
3285
- btrfs_release_path(path);
3286
- return ret;
3287
-
3288
-}
3289
-
3290
-static struct btrfs_block_group_cache *
3291
-next_block_group(struct btrfs_fs_info *fs_info,
3292
- struct btrfs_block_group_cache *cache)
3293
-{
3294
- struct rb_node *node;
3295
-
3296
- spin_lock(&fs_info->block_group_cache_lock);
3297
-
3298
- /* If our block group was removed, we need a full search. */
3299
- if (RB_EMPTY_NODE(&cache->cache_node)) {
3300
- const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3301
-
3302
- spin_unlock(&fs_info->block_group_cache_lock);
3303
- btrfs_put_block_group(cache);
3304
- cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
- return cache;
3305
- }
3306
- node = rb_next(&cache->cache_node);
3307
- btrfs_put_block_group(cache);
3308
- if (node) {
3309
- cache = rb_entry(node, struct btrfs_block_group_cache,
3310
- cache_node);
3311
- btrfs_get_block_group(cache);
3312
- } else
3313
- cache = NULL;
3314
- spin_unlock(&fs_info->block_group_cache_lock);
3315
- return cache;
3316
-}
3317
-
3318
-static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3319
- struct btrfs_trans_handle *trans,
3320
- struct btrfs_path *path)
3321
-{
3322
- struct btrfs_fs_info *fs_info = block_group->fs_info;
3323
- struct btrfs_root *root = fs_info->tree_root;
3324
- struct inode *inode = NULL;
3325
- struct extent_changeset *data_reserved = NULL;
3326
- u64 alloc_hint = 0;
3327
- int dcs = BTRFS_DC_ERROR;
3328
- u64 num_pages = 0;
3329
- int retries = 0;
3330
- int ret = 0;
3331
-
3332
- /*
3333
- * If this block group is smaller than 100 megs don't bother caching the
3334
- * block group.
3335
- */
3336
- if (block_group->key.offset < (100 * SZ_1M)) {
3337
- spin_lock(&block_group->lock);
3338
- block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3339
- spin_unlock(&block_group->lock);
3340
- return 0;
3341
- }
3342
-
3343
- if (trans->aborted)
3344
- return 0;
3345
-again:
3346
- inode = lookup_free_space_inode(fs_info, block_group, path);
3347
- if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3348
- ret = PTR_ERR(inode);
3349
- btrfs_release_path(path);
3350
- goto out;
3351
- }
3352
-
3353
- if (IS_ERR(inode)) {
3354
- BUG_ON(retries);
3355
- retries++;
3356
-
3357
- if (block_group->ro)
3358
- goto out_free;
3359
-
3360
- ret = create_free_space_inode(fs_info, trans, block_group,
3361
- path);
3362
- if (ret)
3363
- goto out_free;
3364
- goto again;
3365
- }
3366
-
3367
- /*
3368
- * We want to set the generation to 0, that way if anything goes wrong
3369
- * from here on out we know not to trust this cache when we load up next
3370
- * time.
3371
- */
3372
- BTRFS_I(inode)->generation = 0;
3373
- ret = btrfs_update_inode(trans, root, inode);
3374
- if (ret) {
3375
- /*
3376
- * So theoretically we could recover from this, simply set the
3377
- * super cache generation to 0 so we know to invalidate the
3378
- * cache, but then we'd have to keep track of the block groups
3379
- * that fail this way so we know we _have_ to reset this cache
3380
- * before the next commit or risk reading stale cache. So to
3381
- * limit our exposure to horrible edge cases lets just abort the
3382
- * transaction, this only happens in really bad situations
3383
- * anyway.
3384
- */
3385
- btrfs_abort_transaction(trans, ret);
3386
- goto out_put;
3387
- }
3388
- WARN_ON(ret);
3389
-
3390
- /* We've already setup this transaction, go ahead and exit */
3391
- if (block_group->cache_generation == trans->transid &&
3392
- i_size_read(inode)) {
3393
- dcs = BTRFS_DC_SETUP;
3394
- goto out_put;
3395
- }
3396
-
3397
- if (i_size_read(inode) > 0) {
3398
- ret = btrfs_check_trunc_cache_free_space(fs_info,
3399
- &fs_info->global_block_rsv);
3400
- if (ret)
3401
- goto out_put;
3402
-
3403
- ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3404
- if (ret)
3405
- goto out_put;
3406
- }
3407
-
3408
- spin_lock(&block_group->lock);
3409
- if (block_group->cached != BTRFS_CACHE_FINISHED ||
3410
- !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3411
- /*
3412
- * don't bother trying to write stuff out _if_
3413
- * a) we're not cached,
3414
- * b) we're with nospace_cache mount option,
3415
- * c) we're with v2 space_cache (FREE_SPACE_TREE).
3416
- */
3417
- dcs = BTRFS_DC_WRITTEN;
3418
- spin_unlock(&block_group->lock);
3419
- goto out_put;
3420
- }
3421
- spin_unlock(&block_group->lock);
3422
-
3423
- /*
3424
- * We hit an ENOSPC when setting up the cache in this transaction, just
3425
- * skip doing the setup, we've already cleared the cache so we're safe.
3426
- */
3427
- if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3428
- ret = -ENOSPC;
3429
- goto out_put;
3430
- }
3431
-
3432
- /*
3433
- * Try to preallocate enough space based on how big the block group is.
3434
- * Keep in mind this has to include any pinned space which could end up
3435
- * taking up quite a bit since it's not folded into the other space
3436
- * cache.
3437
- */
3438
- num_pages = div_u64(block_group->key.offset, SZ_256M);
3439
- if (!num_pages)
3440
- num_pages = 1;
3441
-
3442
- num_pages *= 16;
3443
- num_pages *= PAGE_SIZE;
3444
-
3445
- ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3446
- if (ret)
3447
- goto out_put;
3448
-
3449
- ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3450
- num_pages, num_pages,
3451
- &alloc_hint);
3452
- /*
3453
- * Our cache requires contiguous chunks so that we don't modify a bunch
3454
- * of metadata or split extents when writing the cache out, which means
3455
- * we can enospc if we are heavily fragmented in addition to just normal
3456
- * out of space conditions. So if we hit this just skip setting up any
3457
- * other block groups for this transaction, maybe we'll unpin enough
3458
- * space the next time around.
3459
- */
3460
- if (!ret)
3461
- dcs = BTRFS_DC_SETUP;
3462
- else if (ret == -ENOSPC)
3463
- set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3464
-
3465
-out_put:
3466
- iput(inode);
3467
-out_free:
3468
- btrfs_release_path(path);
3469
-out:
3470
- spin_lock(&block_group->lock);
3471
- if (!ret && dcs == BTRFS_DC_SETUP)
3472
- block_group->cache_generation = trans->transid;
3473
- block_group->disk_cache_state = dcs;
3474
- spin_unlock(&block_group->lock);
3475
-
3476
- extent_changeset_free(data_reserved);
3477
- return ret;
3478
-}
3479
-
3480
-int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3481
- struct btrfs_fs_info *fs_info)
3482
-{
3483
- struct btrfs_block_group_cache *cache, *tmp;
3484
- struct btrfs_transaction *cur_trans = trans->transaction;
3485
- struct btrfs_path *path;
3486
-
3487
- if (list_empty(&cur_trans->dirty_bgs) ||
3488
- !btrfs_test_opt(fs_info, SPACE_CACHE))
3489
- return 0;
3490
-
3491
- path = btrfs_alloc_path();
3492
- if (!path)
3493
- return -ENOMEM;
3494
-
3495
- /* Could add new block groups, use _safe just in case */
3496
- list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3497
- dirty_list) {
3498
- if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3499
- cache_save_setup(cache, trans, path);
3500
- }
3501
-
3502
- btrfs_free_path(path);
3503
- return 0;
3504
-}
3505
-
3506
-/*
3507
- * transaction commit does final block group cache writeback during a
3508
- * critical section where nothing is allowed to change the FS. This is
3509
- * required in order for the cache to actually match the block group,
3510
- * but can introduce a lot of latency into the commit.
3511
- *
3512
- * So, btrfs_start_dirty_block_groups is here to kick off block group
3513
- * cache IO. There's a chance we'll have to redo some of it if the
3514
- * block group changes again during the commit, but it greatly reduces
3515
- * the commit latency by getting rid of the easy block groups while
3516
- * we're still allowing others to join the commit.
3517
- */
3518
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3519
-{
3520
- struct btrfs_fs_info *fs_info = trans->fs_info;
3521
- struct btrfs_block_group_cache *cache;
3522
- struct btrfs_transaction *cur_trans = trans->transaction;
3523
- int ret = 0;
3524
- int should_put;
3525
- struct btrfs_path *path = NULL;
3526
- LIST_HEAD(dirty);
3527
- struct list_head *io = &cur_trans->io_bgs;
3528
- int num_started = 0;
3529
- int loops = 0;
3530
-
3531
- spin_lock(&cur_trans->dirty_bgs_lock);
3532
- if (list_empty(&cur_trans->dirty_bgs)) {
3533
- spin_unlock(&cur_trans->dirty_bgs_lock);
3534
- return 0;
3535
- }
3536
- list_splice_init(&cur_trans->dirty_bgs, &dirty);
3537
- spin_unlock(&cur_trans->dirty_bgs_lock);
3538
-
3539
-again:
3540
- /*
3541
- * make sure all the block groups on our dirty list actually
3542
- * exist
3543
- */
3544
- btrfs_create_pending_block_groups(trans);
3545
-
3546
- if (!path) {
3547
- path = btrfs_alloc_path();
3548
- if (!path)
3549
- return -ENOMEM;
3550
- }
3551
-
3552
- /*
3553
- * cache_write_mutex is here only to save us from balance or automatic
3554
- * removal of empty block groups deleting this block group while we are
3555
- * writing out the cache
3556
- */
3557
- mutex_lock(&trans->transaction->cache_write_mutex);
3558
- while (!list_empty(&dirty)) {
3559
- cache = list_first_entry(&dirty,
3560
- struct btrfs_block_group_cache,
3561
- dirty_list);
3562
- /*
3563
- * this can happen if something re-dirties a block
3564
- * group that is already under IO. Just wait for it to
3565
- * finish and then do it all again
3566
- */
3567
- if (!list_empty(&cache->io_list)) {
3568
- list_del_init(&cache->io_list);
3569
- btrfs_wait_cache_io(trans, cache, path);
3570
- btrfs_put_block_group(cache);
3571
- }
3572
-
3573
-
3574
- /*
3575
- * btrfs_wait_cache_io uses the cache->dirty_list to decide
3576
- * if it should update the cache_state. Don't delete
3577
- * until after we wait.
3578
- *
3579
- * Since we're not running in the commit critical section
3580
- * we need the dirty_bgs_lock to protect from update_block_group
3581
- */
3582
- spin_lock(&cur_trans->dirty_bgs_lock);
3583
- list_del_init(&cache->dirty_list);
3584
- spin_unlock(&cur_trans->dirty_bgs_lock);
3585
-
3586
- should_put = 1;
3587
-
3588
- cache_save_setup(cache, trans, path);
3589
-
3590
- if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3591
- cache->io_ctl.inode = NULL;
3592
- ret = btrfs_write_out_cache(fs_info, trans,
3593
- cache, path);
3594
- if (ret == 0 && cache->io_ctl.inode) {
3595
- num_started++;
3596
- should_put = 0;
3597
-
3598
- /*
3599
- * The cache_write_mutex is protecting the
3600
- * io_list, also refer to the definition of
3601
- * btrfs_transaction::io_bgs for more details
3602
- */
3603
- list_add_tail(&cache->io_list, io);
3604
- } else {
3605
- /*
3606
- * if we failed to write the cache, the
3607
- * generation will be bad and life goes on
3608
- */
3609
- ret = 0;
3610
- }
3611
- }
3612
- if (!ret) {
3613
- ret = write_one_cache_group(trans, fs_info,
3614
- path, cache);
3615
- /*
3616
- * Our block group might still be attached to the list
3617
- * of new block groups in the transaction handle of some
3618
- * other task (struct btrfs_trans_handle->new_bgs). This
3619
- * means its block group item isn't yet in the extent
3620
- * tree. If this happens ignore the error, as we will
3621
- * try again later in the critical section of the
3622
- * transaction commit.
3623
- */
3624
- if (ret == -ENOENT) {
3625
- ret = 0;
3626
- spin_lock(&cur_trans->dirty_bgs_lock);
3627
- if (list_empty(&cache->dirty_list)) {
3628
- list_add_tail(&cache->dirty_list,
3629
- &cur_trans->dirty_bgs);
3630
- btrfs_get_block_group(cache);
3631
- }
3632
- spin_unlock(&cur_trans->dirty_bgs_lock);
3633
- } else if (ret) {
3634
- btrfs_abort_transaction(trans, ret);
3635
- }
3636
- }
3637
-
3638
- /* if its not on the io list, we need to put the block group */
3639
- if (should_put)
3640
- btrfs_put_block_group(cache);
3641
-
3642
- if (ret)
3643
- break;
3644
-
3645
- /*
3646
- * Avoid blocking other tasks for too long. It might even save
3647
- * us from writing caches for block groups that are going to be
3648
- * removed.
3649
- */
3650
- mutex_unlock(&trans->transaction->cache_write_mutex);
3651
- mutex_lock(&trans->transaction->cache_write_mutex);
3652
- }
3653
- mutex_unlock(&trans->transaction->cache_write_mutex);
3654
-
3655
- /*
3656
- * go through delayed refs for all the stuff we've just kicked off
3657
- * and then loop back (just once)
3658
- */
3659
- ret = btrfs_run_delayed_refs(trans, 0);
3660
- if (!ret && loops == 0) {
3661
- loops++;
3662
- spin_lock(&cur_trans->dirty_bgs_lock);
3663
- list_splice_init(&cur_trans->dirty_bgs, &dirty);
3664
- /*
3665
- * dirty_bgs_lock protects us from concurrent block group
3666
- * deletes too (not just cache_write_mutex).
3667
- */
3668
- if (!list_empty(&dirty)) {
3669
- spin_unlock(&cur_trans->dirty_bgs_lock);
3670
- goto again;
3671
- }
3672
- spin_unlock(&cur_trans->dirty_bgs_lock);
3673
- } else if (ret < 0) {
3674
- btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3675
- }
3676
-
3677
- btrfs_free_path(path);
3678
- return ret;
3679
-}
3680
-
3681
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3682
- struct btrfs_fs_info *fs_info)
3683
-{
3684
- struct btrfs_block_group_cache *cache;
3685
- struct btrfs_transaction *cur_trans = trans->transaction;
3686
- int ret = 0;
3687
- int should_put;
3688
- struct btrfs_path *path;
3689
- struct list_head *io = &cur_trans->io_bgs;
3690
- int num_started = 0;
3691
-
3692
- path = btrfs_alloc_path();
3693
- if (!path)
3694
- return -ENOMEM;
3695
-
3696
- /*
3697
- * Even though we are in the critical section of the transaction commit,
3698
- * we can still have concurrent tasks adding elements to this
3699
- * transaction's list of dirty block groups. These tasks correspond to
3700
- * endio free space workers started when writeback finishes for a
3701
- * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3702
- * allocate new block groups as a result of COWing nodes of the root
3703
- * tree when updating the free space inode. The writeback for the space
3704
- * caches is triggered by an earlier call to
3705
- * btrfs_start_dirty_block_groups() and iterations of the following
3706
- * loop.
3707
- * Also we want to do the cache_save_setup first and then run the
3708
- * delayed refs to make sure we have the best chance at doing this all
3709
- * in one shot.
3710
- */
3711
- spin_lock(&cur_trans->dirty_bgs_lock);
3712
- while (!list_empty(&cur_trans->dirty_bgs)) {
3713
- cache = list_first_entry(&cur_trans->dirty_bgs,
3714
- struct btrfs_block_group_cache,
3715
- dirty_list);
3716
-
3717
- /*
3718
- * this can happen if cache_save_setup re-dirties a block
3719
- * group that is already under IO. Just wait for it to
3720
- * finish and then do it all again
3721
- */
3722
- if (!list_empty(&cache->io_list)) {
3723
- spin_unlock(&cur_trans->dirty_bgs_lock);
3724
- list_del_init(&cache->io_list);
3725
- btrfs_wait_cache_io(trans, cache, path);
3726
- btrfs_put_block_group(cache);
3727
- spin_lock(&cur_trans->dirty_bgs_lock);
3728
- }
3729
-
3730
- /*
3731
- * don't remove from the dirty list until after we've waited
3732
- * on any pending IO
3733
- */
3734
- list_del_init(&cache->dirty_list);
3735
- spin_unlock(&cur_trans->dirty_bgs_lock);
3736
- should_put = 1;
3737
-
3738
- cache_save_setup(cache, trans, path);
3739
-
3740
- if (!ret)
3741
- ret = btrfs_run_delayed_refs(trans,
3742
- (unsigned long) -1);
3743
-
3744
- if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3745
- cache->io_ctl.inode = NULL;
3746
- ret = btrfs_write_out_cache(fs_info, trans,
3747
- cache, path);
3748
- if (ret == 0 && cache->io_ctl.inode) {
3749
- num_started++;
3750
- should_put = 0;
3751
- list_add_tail(&cache->io_list, io);
3752
- } else {
3753
- /*
3754
- * if we failed to write the cache, the
3755
- * generation will be bad and life goes on
3756
- */
3757
- ret = 0;
3758
- }
3759
- }
3760
- if (!ret) {
3761
- ret = write_one_cache_group(trans, fs_info,
3762
- path, cache);
3763
- /*
3764
- * One of the free space endio workers might have
3765
- * created a new block group while updating a free space
3766
- * cache's inode (at inode.c:btrfs_finish_ordered_io())
3767
- * and hasn't released its transaction handle yet, in
3768
- * which case the new block group is still attached to
3769
- * its transaction handle and its creation has not
3770
- * finished yet (no block group item in the extent tree
3771
- * yet, etc). If this is the case, wait for all free
3772
- * space endio workers to finish and retry. This is a
3773
- * a very rare case so no need for a more efficient and
3774
- * complex approach.
3775
- */
3776
- if (ret == -ENOENT) {
3777
- wait_event(cur_trans->writer_wait,
3778
- atomic_read(&cur_trans->num_writers) == 1);
3779
- ret = write_one_cache_group(trans, fs_info,
3780
- path, cache);
3781
- }
3782
- if (ret)
3783
- btrfs_abort_transaction(trans, ret);
3784
- }
3785
-
3786
- /* if its not on the io list, we need to put the block group */
3787
- if (should_put)
3788
- btrfs_put_block_group(cache);
3789
- spin_lock(&cur_trans->dirty_bgs_lock);
3790
- }
3791
- spin_unlock(&cur_trans->dirty_bgs_lock);
3792
-
3793
- /*
3794
- * Refer to the definition of io_bgs member for details why it's safe
3795
- * to use it without any locking
3796
- */
3797
- while (!list_empty(io)) {
3798
- cache = list_first_entry(io, struct btrfs_block_group_cache,
3799
- io_list);
3800
- list_del_init(&cache->io_list);
3801
- btrfs_wait_cache_io(trans, cache, path);
3802
- btrfs_put_block_group(cache);
3803
- }
3804
-
3805
- btrfs_free_path(path);
3806
- return ret;
3807
-}
3808
-
38092490 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
38102491 {
3811
- struct btrfs_block_group_cache *block_group;
2492
+ struct btrfs_block_group *block_group;
38122493 int readonly = 0;
38132494
38142495 block_group = btrfs_lookup_block_group(fs_info, bytenr);
....@@ -3817,253 +2498,6 @@
38172498 if (block_group)
38182499 btrfs_put_block_group(block_group);
38192500 return readonly;
3820
-}
3821
-
3822
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3823
-{
3824
- struct btrfs_block_group_cache *bg;
3825
- bool ret = true;
3826
-
3827
- bg = btrfs_lookup_block_group(fs_info, bytenr);
3828
- if (!bg)
3829
- return false;
3830
-
3831
- spin_lock(&bg->lock);
3832
- if (bg->ro)
3833
- ret = false;
3834
- else
3835
- atomic_inc(&bg->nocow_writers);
3836
- spin_unlock(&bg->lock);
3837
-
3838
- /* no put on block group, done by btrfs_dec_nocow_writers */
3839
- if (!ret)
3840
- btrfs_put_block_group(bg);
3841
-
3842
- return ret;
3843
-
3844
-}
3845
-
3846
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3847
-{
3848
- struct btrfs_block_group_cache *bg;
3849
-
3850
- bg = btrfs_lookup_block_group(fs_info, bytenr);
3851
- ASSERT(bg);
3852
- if (atomic_dec_and_test(&bg->nocow_writers))
3853
- wake_up_var(&bg->nocow_writers);
3854
- /*
3855
- * Once for our lookup and once for the lookup done by a previous call
3856
- * to btrfs_inc_nocow_writers()
3857
- */
3858
- btrfs_put_block_group(bg);
3859
- btrfs_put_block_group(bg);
3860
-}
3861
-
3862
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3863
-{
3864
- wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3865
-}
3866
-
3867
-static const char *alloc_name(u64 flags)
3868
-{
3869
- switch (flags) {
3870
- case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3871
- return "mixed";
3872
- case BTRFS_BLOCK_GROUP_METADATA:
3873
- return "metadata";
3874
- case BTRFS_BLOCK_GROUP_DATA:
3875
- return "data";
3876
- case BTRFS_BLOCK_GROUP_SYSTEM:
3877
- return "system";
3878
- default:
3879
- WARN_ON(1);
3880
- return "invalid-combination";
3881
- };
3882
-}
3883
-
3884
-static int create_space_info(struct btrfs_fs_info *info, u64 flags)
3885
-{
3886
-
3887
- struct btrfs_space_info *space_info;
3888
- int i;
3889
- int ret;
3890
-
3891
- space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
3892
- if (!space_info)
3893
- return -ENOMEM;
3894
-
3895
- ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
3896
- GFP_KERNEL);
3897
- if (ret) {
3898
- kfree(space_info);
3899
- return ret;
3900
- }
3901
-
3902
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3903
- INIT_LIST_HEAD(&space_info->block_groups[i]);
3904
- init_rwsem(&space_info->groups_sem);
3905
- spin_lock_init(&space_info->lock);
3906
- space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3907
- space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3908
- init_waitqueue_head(&space_info->wait);
3909
- INIT_LIST_HEAD(&space_info->ro_bgs);
3910
- INIT_LIST_HEAD(&space_info->tickets);
3911
- INIT_LIST_HEAD(&space_info->priority_tickets);
3912
-
3913
- ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
3914
- info->space_info_kobj, "%s",
3915
- alloc_name(space_info->flags));
3916
- if (ret) {
3917
- kobject_put(&space_info->kobj);
3918
- return ret;
3919
- }
3920
-
3921
- list_add_rcu(&space_info->list, &info->space_info);
3922
- if (flags & BTRFS_BLOCK_GROUP_DATA)
3923
- info->data_sinfo = space_info;
3924
-
3925
- return ret;
3926
-}
3927
-
3928
-static void update_space_info(struct btrfs_fs_info *info, u64 flags,
3929
- u64 total_bytes, u64 bytes_used,
3930
- u64 bytes_readonly,
3931
- struct btrfs_space_info **space_info)
3932
-{
3933
- struct btrfs_space_info *found;
3934
- int factor;
3935
-
3936
- factor = btrfs_bg_type_to_factor(flags);
3937
-
3938
- found = __find_space_info(info, flags);
3939
- ASSERT(found);
3940
- spin_lock(&found->lock);
3941
- found->total_bytes += total_bytes;
3942
- found->disk_total += total_bytes * factor;
3943
- found->bytes_used += bytes_used;
3944
- found->disk_used += bytes_used * factor;
3945
- found->bytes_readonly += bytes_readonly;
3946
- if (total_bytes > 0)
3947
- found->full = 0;
3948
- space_info_add_new_bytes(info, found, total_bytes -
3949
- bytes_used - bytes_readonly);
3950
- spin_unlock(&found->lock);
3951
- *space_info = found;
3952
-}
3953
-
3954
-static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3955
-{
3956
- u64 extra_flags = chunk_to_extended(flags) &
3957
- BTRFS_EXTENDED_PROFILE_MASK;
3958
-
3959
- write_seqlock(&fs_info->profiles_lock);
3960
- if (flags & BTRFS_BLOCK_GROUP_DATA)
3961
- fs_info->avail_data_alloc_bits |= extra_flags;
3962
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
3963
- fs_info->avail_metadata_alloc_bits |= extra_flags;
3964
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3965
- fs_info->avail_system_alloc_bits |= extra_flags;
3966
- write_sequnlock(&fs_info->profiles_lock);
3967
-}
3968
-
3969
-/*
3970
- * returns target flags in extended format or 0 if restripe for this
3971
- * chunk_type is not in progress
3972
- *
3973
- * should be called with balance_lock held
3974
- */
3975
-static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3976
-{
3977
- struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3978
- u64 target = 0;
3979
-
3980
- if (!bctl)
3981
- return 0;
3982
-
3983
- if (flags & BTRFS_BLOCK_GROUP_DATA &&
3984
- bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3985
- target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3986
- } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3987
- bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3988
- target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3989
- } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3990
- bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3991
- target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3992
- }
3993
-
3994
- return target;
3995
-}
3996
-
3997
-/*
3998
- * @flags: available profiles in extended format (see ctree.h)
3999
- *
4000
- * Returns reduced profile in chunk format. If profile changing is in
4001
- * progress (either running or paused) picks the target profile (if it's
4002
- * already available), otherwise falls back to plain reducing.
4003
- */
4004
-static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
4005
-{
4006
- u64 num_devices = fs_info->fs_devices->rw_devices;
4007
- u64 target;
4008
- u64 raid_type;
4009
- u64 allowed = 0;
4010
-
4011
- /*
4012
- * see if restripe for this chunk_type is in progress, if so
4013
- * try to reduce to the target profile
4014
- */
4015
- spin_lock(&fs_info->balance_lock);
4016
- target = get_restripe_target(fs_info, flags);
4017
- if (target) {
4018
- /* pick target profile only if it's already available */
4019
- if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4020
- spin_unlock(&fs_info->balance_lock);
4021
- return extended_to_chunk(target);
4022
- }
4023
- }
4024
- spin_unlock(&fs_info->balance_lock);
4025
-
4026
- /* First, mask out the RAID levels which aren't possible */
4027
- for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4028
- if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4029
- allowed |= btrfs_raid_array[raid_type].bg_flag;
4030
- }
4031
- allowed &= flags;
4032
-
4033
- if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4034
- allowed = BTRFS_BLOCK_GROUP_RAID6;
4035
- else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4036
- allowed = BTRFS_BLOCK_GROUP_RAID5;
4037
- else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4038
- allowed = BTRFS_BLOCK_GROUP_RAID10;
4039
- else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4040
- allowed = BTRFS_BLOCK_GROUP_RAID1;
4041
- else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4042
- allowed = BTRFS_BLOCK_GROUP_RAID0;
4043
-
4044
- flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4045
-
4046
- return extended_to_chunk(flags | allowed);
4047
-}
4048
-
4049
-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4050
-{
4051
- unsigned seq;
4052
- u64 flags;
4053
-
4054
- do {
4055
- flags = orig_flags;
4056
- seq = read_seqbegin(&fs_info->profiles_lock);
4057
-
4058
- if (flags & BTRFS_BLOCK_GROUP_DATA)
4059
- flags |= fs_info->avail_data_alloc_bits;
4060
- else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4061
- flags |= fs_info->avail_system_alloc_bits;
4062
- else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4063
- flags |= fs_info->avail_metadata_alloc_bits;
4064
- } while (read_seqretry(&fs_info->profiles_lock, seq));
4065
-
4066
- return btrfs_reduce_alloc_profile(fs_info, flags);
40672501 }
40682502
40692503 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
....@@ -4079,2091 +2513,13 @@
40792513 else
40802514 flags = BTRFS_BLOCK_GROUP_METADATA;
40812515
4082
- ret = get_alloc_profile(fs_info, flags);
2516
+ ret = btrfs_get_alloc_profile(fs_info, flags);
40832517 return ret;
4084
-}
4085
-
4086
-u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4087
-{
4088
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4089
-}
4090
-
4091
-u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4092
-{
4093
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4094
-}
4095
-
4096
-u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4097
-{
4098
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4099
-}
4100
-
4101
-static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4102
- bool may_use_included)
4103
-{
4104
- ASSERT(s_info);
4105
- return s_info->bytes_used + s_info->bytes_reserved +
4106
- s_info->bytes_pinned + s_info->bytes_readonly +
4107
- (may_use_included ? s_info->bytes_may_use : 0);
4108
-}
4109
-
4110
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
4111
-{
4112
- struct btrfs_root *root = inode->root;
4113
- struct btrfs_fs_info *fs_info = root->fs_info;
4114
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
4115
- u64 used;
4116
- int ret = 0;
4117
- int need_commit = 2;
4118
- int have_pinned_space;
4119
-
4120
- /* make sure bytes are sectorsize aligned */
4121
- bytes = ALIGN(bytes, fs_info->sectorsize);
4122
-
4123
- if (btrfs_is_free_space_inode(inode)) {
4124
- need_commit = 0;
4125
- ASSERT(current->journal_info);
4126
- }
4127
-
4128
-again:
4129
- /* make sure we have enough space to handle the data first */
4130
- spin_lock(&data_sinfo->lock);
4131
- used = btrfs_space_info_used(data_sinfo, true);
4132
-
4133
- if (used + bytes > data_sinfo->total_bytes) {
4134
- struct btrfs_trans_handle *trans;
4135
-
4136
- /*
4137
- * if we don't have enough free bytes in this space then we need
4138
- * to alloc a new chunk.
4139
- */
4140
- if (!data_sinfo->full) {
4141
- u64 alloc_target;
4142
-
4143
- data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4144
- spin_unlock(&data_sinfo->lock);
4145
-
4146
- alloc_target = btrfs_data_alloc_profile(fs_info);
4147
- /*
4148
- * It is ugly that we don't call nolock join
4149
- * transaction for the free space inode case here.
4150
- * But it is safe because we only do the data space
4151
- * reservation for the free space cache in the
4152
- * transaction context, the common join transaction
4153
- * just increase the counter of the current transaction
4154
- * handler, doesn't try to acquire the trans_lock of
4155
- * the fs.
4156
- */
4157
- trans = btrfs_join_transaction(root);
4158
- if (IS_ERR(trans))
4159
- return PTR_ERR(trans);
4160
-
4161
- ret = do_chunk_alloc(trans, alloc_target,
4162
- CHUNK_ALLOC_NO_FORCE);
4163
- btrfs_end_transaction(trans);
4164
- if (ret < 0) {
4165
- if (ret != -ENOSPC)
4166
- return ret;
4167
- else {
4168
- have_pinned_space = 1;
4169
- goto commit_trans;
4170
- }
4171
- }
4172
-
4173
- goto again;
4174
- }
4175
-
4176
- /*
4177
- * If we don't have enough pinned space to deal with this
4178
- * allocation, and no removed chunk in current transaction,
4179
- * don't bother committing the transaction.
4180
- */
4181
- have_pinned_space = __percpu_counter_compare(
4182
- &data_sinfo->total_bytes_pinned,
4183
- used + bytes - data_sinfo->total_bytes,
4184
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
4185
- spin_unlock(&data_sinfo->lock);
4186
-
4187
- /* commit the current transaction and try again */
4188
-commit_trans:
4189
- if (need_commit) {
4190
- need_commit--;
4191
-
4192
- if (need_commit > 0) {
4193
- btrfs_start_delalloc_roots(fs_info, -1);
4194
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4195
- (u64)-1);
4196
- }
4197
-
4198
- trans = btrfs_join_transaction(root);
4199
- if (IS_ERR(trans))
4200
- return PTR_ERR(trans);
4201
- if (have_pinned_space >= 0 ||
4202
- test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4203
- &trans->transaction->flags) ||
4204
- need_commit > 0) {
4205
- ret = btrfs_commit_transaction(trans);
4206
- if (ret)
4207
- return ret;
4208
- /*
4209
- * The cleaner kthread might still be doing iput
4210
- * operations. Wait for it to finish so that
4211
- * more space is released.
4212
- */
4213
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
4214
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
4215
- goto again;
4216
- } else {
4217
- btrfs_end_transaction(trans);
4218
- }
4219
- }
4220
-
4221
- trace_btrfs_space_reservation(fs_info,
4222
- "space_info:enospc",
4223
- data_sinfo->flags, bytes, 1);
4224
- return -ENOSPC;
4225
- }
4226
- data_sinfo->bytes_may_use += bytes;
4227
- trace_btrfs_space_reservation(fs_info, "space_info",
4228
- data_sinfo->flags, bytes, 1);
4229
- spin_unlock(&data_sinfo->lock);
4230
-
4231
- return 0;
4232
-}
4233
-
4234
-int btrfs_check_data_free_space(struct inode *inode,
4235
- struct extent_changeset **reserved, u64 start, u64 len)
4236
-{
4237
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4238
- int ret;
4239
-
4240
- /* align the range */
4241
- len = round_up(start + len, fs_info->sectorsize) -
4242
- round_down(start, fs_info->sectorsize);
4243
- start = round_down(start, fs_info->sectorsize);
4244
-
4245
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4246
- if (ret < 0)
4247
- return ret;
4248
-
4249
- /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4250
- ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4251
- if (ret < 0)
4252
- btrfs_free_reserved_data_space_noquota(inode, start, len);
4253
- else
4254
- ret = 0;
4255
- return ret;
4256
-}
4257
-
4258
-/*
4259
- * Called if we need to clear a data reservation for this inode
4260
- * Normally in a error case.
4261
- *
4262
- * This one will *NOT* use accurate qgroup reserved space API, just for case
4263
- * which we can't sleep and is sure it won't affect qgroup reserved space.
4264
- * Like clear_bit_hook().
4265
- */
4266
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4267
- u64 len)
4268
-{
4269
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4270
- struct btrfs_space_info *data_sinfo;
4271
-
4272
- /* Make sure the range is aligned to sectorsize */
4273
- len = round_up(start + len, fs_info->sectorsize) -
4274
- round_down(start, fs_info->sectorsize);
4275
- start = round_down(start, fs_info->sectorsize);
4276
-
4277
- data_sinfo = fs_info->data_sinfo;
4278
- spin_lock(&data_sinfo->lock);
4279
- if (WARN_ON(data_sinfo->bytes_may_use < len))
4280
- data_sinfo->bytes_may_use = 0;
4281
- else
4282
- data_sinfo->bytes_may_use -= len;
4283
- trace_btrfs_space_reservation(fs_info, "space_info",
4284
- data_sinfo->flags, len, 0);
4285
- spin_unlock(&data_sinfo->lock);
4286
-}
4287
-
4288
-/*
4289
- * Called if we need to clear a data reservation for this inode
4290
- * Normally in a error case.
4291
- *
4292
- * This one will handle the per-inode data rsv map for accurate reserved
4293
- * space framework.
4294
- */
4295
-void btrfs_free_reserved_data_space(struct inode *inode,
4296
- struct extent_changeset *reserved, u64 start, u64 len)
4297
-{
4298
- struct btrfs_root *root = BTRFS_I(inode)->root;
4299
-
4300
- /* Make sure the range is aligned to sectorsize */
4301
- len = round_up(start + len, root->fs_info->sectorsize) -
4302
- round_down(start, root->fs_info->sectorsize);
4303
- start = round_down(start, root->fs_info->sectorsize);
4304
-
4305
- btrfs_free_reserved_data_space_noquota(inode, start, len);
4306
- btrfs_qgroup_free_data(inode, reserved, start, len);
4307
-}
4308
-
4309
-static void force_metadata_allocation(struct btrfs_fs_info *info)
4310
-{
4311
- struct list_head *head = &info->space_info;
4312
- struct btrfs_space_info *found;
4313
-
4314
- rcu_read_lock();
4315
- list_for_each_entry_rcu(found, head, list) {
4316
- if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4317
- found->force_alloc = CHUNK_ALLOC_FORCE;
4318
- }
4319
- rcu_read_unlock();
4320
-}
4321
-
4322
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4323
-{
4324
- return (global->size << 1);
4325
-}
4326
-
4327
-static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4328
- struct btrfs_space_info *sinfo, int force)
4329
-{
4330
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4331
- u64 bytes_used = btrfs_space_info_used(sinfo, false);
4332
- u64 thresh;
4333
-
4334
- if (force == CHUNK_ALLOC_FORCE)
4335
- return 1;
4336
-
4337
- /*
4338
- * We need to take into account the global rsv because for all intents
4339
- * and purposes it's used space. Don't worry about locking the
4340
- * global_rsv, it doesn't change except when the transaction commits.
4341
- */
4342
- if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4343
- bytes_used += calc_global_rsv_need_space(global_rsv);
4344
-
4345
- /*
4346
- * in limited mode, we want to have some free space up to
4347
- * about 1% of the FS size.
4348
- */
4349
- if (force == CHUNK_ALLOC_LIMITED) {
4350
- thresh = btrfs_super_total_bytes(fs_info->super_copy);
4351
- thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4352
-
4353
- if (sinfo->total_bytes - bytes_used < thresh)
4354
- return 1;
4355
- }
4356
-
4357
- if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4358
- return 0;
4359
- return 1;
4360
-}
4361
-
4362
-static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4363
-{
4364
- u64 num_dev;
4365
-
4366
- if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4367
- BTRFS_BLOCK_GROUP_RAID0 |
4368
- BTRFS_BLOCK_GROUP_RAID5 |
4369
- BTRFS_BLOCK_GROUP_RAID6))
4370
- num_dev = fs_info->fs_devices->rw_devices;
4371
- else if (type & BTRFS_BLOCK_GROUP_RAID1)
4372
- num_dev = 2;
4373
- else
4374
- num_dev = 1; /* DUP or single */
4375
-
4376
- return num_dev;
4377
-}
4378
-
4379
-/*
4380
- * If @is_allocation is true, reserve space in the system space info necessary
4381
- * for allocating a chunk, otherwise if it's false, reserve space necessary for
4382
- * removing a chunk.
4383
- */
4384
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4385
-{
4386
- struct btrfs_fs_info *fs_info = trans->fs_info;
4387
- struct btrfs_space_info *info;
4388
- u64 left;
4389
- u64 thresh;
4390
- int ret = 0;
4391
- u64 num_devs;
4392
-
4393
- /*
4394
- * Needed because we can end up allocating a system chunk and for an
4395
- * atomic and race free space reservation in the chunk block reserve.
4396
- */
4397
- lockdep_assert_held(&fs_info->chunk_mutex);
4398
-
4399
- info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4400
- spin_lock(&info->lock);
4401
- left = info->total_bytes - btrfs_space_info_used(info, true);
4402
- spin_unlock(&info->lock);
4403
-
4404
- num_devs = get_profile_num_devs(fs_info, type);
4405
-
4406
- /* num_devs device items to update and 1 chunk item to add or remove */
4407
- thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4408
- btrfs_calc_trans_metadata_size(fs_info, 1);
4409
-
4410
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4411
- btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4412
- left, thresh, type);
4413
- dump_space_info(fs_info, info, 0, 0);
4414
- }
4415
-
4416
- if (left < thresh) {
4417
- u64 flags = btrfs_system_alloc_profile(fs_info);
4418
-
4419
- /*
4420
- * Ignore failure to create system chunk. We might end up not
4421
- * needing it, as we might not need to COW all nodes/leafs from
4422
- * the paths we visit in the chunk tree (they were already COWed
4423
- * or created in the current transaction for example).
4424
- */
4425
- ret = btrfs_alloc_chunk(trans, flags);
4426
- }
4427
-
4428
- if (!ret) {
4429
- ret = btrfs_block_rsv_add(fs_info->chunk_root,
4430
- &fs_info->chunk_block_rsv,
4431
- thresh, BTRFS_RESERVE_NO_FLUSH);
4432
- if (!ret)
4433
- trans->chunk_bytes_reserved += thresh;
4434
- }
4435
-}
4436
-
4437
-/*
4438
- * If force is CHUNK_ALLOC_FORCE:
4439
- * - return 1 if it successfully allocates a chunk,
4440
- * - return errors including -ENOSPC otherwise.
4441
- * If force is NOT CHUNK_ALLOC_FORCE:
4442
- * - return 0 if it doesn't need to allocate a new chunk,
4443
- * - return 1 if it successfully allocates a chunk,
4444
- * - return errors including -ENOSPC otherwise.
4445
- */
4446
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4447
- int force)
4448
-{
4449
- struct btrfs_fs_info *fs_info = trans->fs_info;
4450
- struct btrfs_space_info *space_info;
4451
- bool wait_for_alloc = false;
4452
- bool should_alloc = false;
4453
- int ret = 0;
4454
-
4455
- /* Don't re-enter if we're already allocating a chunk */
4456
- if (trans->allocating_chunk)
4457
- return -ENOSPC;
4458
-
4459
- space_info = __find_space_info(fs_info, flags);
4460
- ASSERT(space_info);
4461
-
4462
- do {
4463
- spin_lock(&space_info->lock);
4464
- if (force < space_info->force_alloc)
4465
- force = space_info->force_alloc;
4466
- should_alloc = should_alloc_chunk(fs_info, space_info, force);
4467
- if (space_info->full) {
4468
- /* No more free physical space */
4469
- if (should_alloc)
4470
- ret = -ENOSPC;
4471
- else
4472
- ret = 0;
4473
- spin_unlock(&space_info->lock);
4474
- return ret;
4475
- } else if (!should_alloc) {
4476
- spin_unlock(&space_info->lock);
4477
- return 0;
4478
- } else if (space_info->chunk_alloc) {
4479
- /*
4480
- * Someone is already allocating, so we need to block
4481
- * until this someone is finished and then loop to
4482
- * recheck if we should continue with our allocation
4483
- * attempt.
4484
- */
4485
- wait_for_alloc = true;
4486
- spin_unlock(&space_info->lock);
4487
- mutex_lock(&fs_info->chunk_mutex);
4488
- mutex_unlock(&fs_info->chunk_mutex);
4489
- } else {
4490
- /* Proceed with allocation */
4491
- space_info->chunk_alloc = 1;
4492
- wait_for_alloc = false;
4493
- spin_unlock(&space_info->lock);
4494
- }
4495
-
4496
- cond_resched();
4497
- } while (wait_for_alloc);
4498
-
4499
- mutex_lock(&fs_info->chunk_mutex);
4500
- trans->allocating_chunk = true;
4501
-
4502
- /*
4503
- * If we have mixed data/metadata chunks we want to make sure we keep
4504
- * allocating mixed chunks instead of individual chunks.
4505
- */
4506
- if (btrfs_mixed_space_info(space_info))
4507
- flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4508
-
4509
- /*
4510
- * if we're doing a data chunk, go ahead and make sure that
4511
- * we keep a reasonable number of metadata chunks allocated in the
4512
- * FS as well.
4513
- */
4514
- if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4515
- fs_info->data_chunk_allocations++;
4516
- if (!(fs_info->data_chunk_allocations %
4517
- fs_info->metadata_ratio))
4518
- force_metadata_allocation(fs_info);
4519
- }
4520
-
4521
- /*
4522
- * Check if we have enough space in SYSTEM chunk because we may need
4523
- * to update devices.
4524
- */
4525
- check_system_chunk(trans, flags);
4526
-
4527
- ret = btrfs_alloc_chunk(trans, flags);
4528
- trans->allocating_chunk = false;
4529
-
4530
- spin_lock(&space_info->lock);
4531
- if (ret < 0) {
4532
- if (ret == -ENOSPC)
4533
- space_info->full = 1;
4534
- else
4535
- goto out;
4536
- } else {
4537
- ret = 1;
4538
- space_info->max_extent_size = 0;
4539
- }
4540
-
4541
- space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4542
-out:
4543
- space_info->chunk_alloc = 0;
4544
- spin_unlock(&space_info->lock);
4545
- mutex_unlock(&fs_info->chunk_mutex);
4546
- /*
4547
- * When we allocate a new chunk we reserve space in the chunk block
4548
- * reserve to make sure we can COW nodes/leafs in the chunk tree or
4549
- * add new nodes/leafs to it if we end up needing to do it when
4550
- * inserting the chunk item and updating device items as part of the
4551
- * second phase of chunk allocation, performed by
4552
- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4553
- * large number of new block groups to create in our transaction
4554
- * handle's new_bgs list to avoid exhausting the chunk block reserve
4555
- * in extreme cases - like having a single transaction create many new
4556
- * block groups when starting to write out the free space caches of all
4557
- * the block groups that were made dirty during the lifetime of the
4558
- * transaction.
4559
- */
4560
- if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4561
- btrfs_create_pending_block_groups(trans);
4562
-
4563
- return ret;
4564
-}
4565
-
4566
-static int can_overcommit(struct btrfs_fs_info *fs_info,
4567
- struct btrfs_space_info *space_info, u64 bytes,
4568
- enum btrfs_reserve_flush_enum flush,
4569
- bool system_chunk)
4570
-{
4571
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4572
- u64 profile;
4573
- u64 space_size;
4574
- u64 avail;
4575
- u64 used;
4576
- int factor;
4577
-
4578
- /* Don't overcommit when in mixed mode. */
4579
- if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4580
- return 0;
4581
-
4582
- if (system_chunk)
4583
- profile = btrfs_system_alloc_profile(fs_info);
4584
- else
4585
- profile = btrfs_metadata_alloc_profile(fs_info);
4586
-
4587
- used = btrfs_space_info_used(space_info, false);
4588
-
4589
- /*
4590
- * We only want to allow over committing if we have lots of actual space
4591
- * free, but if we don't have enough space to handle the global reserve
4592
- * space then we could end up having a real enospc problem when trying
4593
- * to allocate a chunk or some other such important allocation.
4594
- */
4595
- spin_lock(&global_rsv->lock);
4596
- space_size = calc_global_rsv_need_space(global_rsv);
4597
- spin_unlock(&global_rsv->lock);
4598
- if (used + space_size >= space_info->total_bytes)
4599
- return 0;
4600
-
4601
- used += space_info->bytes_may_use;
4602
-
4603
- avail = atomic64_read(&fs_info->free_chunk_space);
4604
-
4605
- /*
4606
- * If we have dup, raid1 or raid10 then only half of the free
4607
- * space is actually useable. For raid56, the space info used
4608
- * doesn't include the parity drive, so we don't have to
4609
- * change the math
4610
- */
4611
- factor = btrfs_bg_type_to_factor(profile);
4612
- avail = div_u64(avail, factor);
4613
-
4614
- /*
4615
- * If we aren't flushing all things, let us overcommit up to
4616
- * 1/2th of the space. If we can flush, don't let us overcommit
4617
- * too much, let it overcommit up to 1/8 of the space.
4618
- */
4619
- if (flush == BTRFS_RESERVE_FLUSH_ALL)
4620
- avail >>= 3;
4621
- else
4622
- avail >>= 1;
4623
-
4624
- if (used + bytes < space_info->total_bytes + avail)
4625
- return 1;
4626
- return 0;
4627
-}
4628
-
4629
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4630
- unsigned long nr_pages, int nr_items)
4631
-{
4632
- struct super_block *sb = fs_info->sb;
4633
-
4634
- if (down_read_trylock(&sb->s_umount)) {
4635
- writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4636
- up_read(&sb->s_umount);
4637
- } else {
4638
- /*
4639
- * We needn't worry the filesystem going from r/w to r/o though
4640
- * we don't acquire ->s_umount mutex, because the filesystem
4641
- * should guarantee the delalloc inodes list be empty after
4642
- * the filesystem is readonly(all dirty pages are written to
4643
- * the disk).
4644
- */
4645
- btrfs_start_delalloc_roots(fs_info, nr_items);
4646
- if (!current->journal_info)
4647
- btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4648
- }
4649
-}
4650
-
4651
-static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4652
- u64 to_reclaim)
4653
-{
4654
- u64 bytes;
4655
- u64 nr;
4656
-
4657
- bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4658
- nr = div64_u64(to_reclaim, bytes);
4659
- if (!nr)
4660
- nr = 1;
4661
- return nr;
4662
-}
4663
-
4664
-#define EXTENT_SIZE_PER_ITEM SZ_256K
4665
-
4666
-/*
4667
- * shrink metadata reservation for delalloc
4668
- */
4669
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4670
- u64 orig, bool wait_ordered)
4671
-{
4672
- struct btrfs_space_info *space_info;
4673
- struct btrfs_trans_handle *trans;
4674
- u64 delalloc_bytes;
4675
- u64 max_reclaim;
4676
- u64 items;
4677
- long time_left;
4678
- unsigned long nr_pages;
4679
- int loops;
4680
-
4681
- /* Calc the number of the pages we need flush for space reservation */
4682
- items = calc_reclaim_items_nr(fs_info, to_reclaim);
4683
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4684
-
4685
- trans = (struct btrfs_trans_handle *)current->journal_info;
4686
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4687
-
4688
- delalloc_bytes = percpu_counter_sum_positive(
4689
- &fs_info->delalloc_bytes);
4690
- if (delalloc_bytes == 0) {
4691
- if (trans)
4692
- return;
4693
- if (wait_ordered)
4694
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4695
- return;
4696
- }
4697
-
4698
- loops = 0;
4699
- while (delalloc_bytes && loops < 3) {
4700
- max_reclaim = min(delalloc_bytes, to_reclaim);
4701
- nr_pages = max_reclaim >> PAGE_SHIFT;
4702
- btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4703
- /*
4704
- * We need to wait for the async pages to actually start before
4705
- * we do anything.
4706
- */
4707
- max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
4708
- if (!max_reclaim)
4709
- goto skip_async;
4710
-
4711
- if (max_reclaim <= nr_pages)
4712
- max_reclaim = 0;
4713
- else
4714
- max_reclaim -= nr_pages;
4715
-
4716
- wait_event(fs_info->async_submit_wait,
4717
- atomic_read(&fs_info->async_delalloc_pages) <=
4718
- (int)max_reclaim);
4719
-skip_async:
4720
- spin_lock(&space_info->lock);
4721
- if (list_empty(&space_info->tickets) &&
4722
- list_empty(&space_info->priority_tickets)) {
4723
- spin_unlock(&space_info->lock);
4724
- break;
4725
- }
4726
- spin_unlock(&space_info->lock);
4727
-
4728
- loops++;
4729
- if (wait_ordered && !trans) {
4730
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4731
- } else {
4732
- time_left = schedule_timeout_killable(1);
4733
- if (time_left)
4734
- break;
4735
- }
4736
- delalloc_bytes = percpu_counter_sum_positive(
4737
- &fs_info->delalloc_bytes);
4738
- }
4739
-}
4740
-
4741
-struct reserve_ticket {
4742
- u64 bytes;
4743
- int error;
4744
- struct list_head list;
4745
- wait_queue_head_t wait;
4746
-};
4747
-
4748
-/**
4749
- * maybe_commit_transaction - possibly commit the transaction if its ok to
4750
- * @root - the root we're allocating for
4751
- * @bytes - the number of bytes we want to reserve
4752
- * @force - force the commit
4753
- *
4754
- * This will check to make sure that committing the transaction will actually
4755
- * get us somewhere and then commit the transaction if it does. Otherwise it
4756
- * will return -ENOSPC.
4757
- */
4758
-static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4759
- struct btrfs_space_info *space_info)
4760
-{
4761
- struct reserve_ticket *ticket = NULL;
4762
- struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4763
- struct btrfs_trans_handle *trans;
4764
- u64 bytes;
4765
-
4766
- trans = (struct btrfs_trans_handle *)current->journal_info;
4767
- if (trans)
4768
- return -EAGAIN;
4769
-
4770
- spin_lock(&space_info->lock);
4771
- if (!list_empty(&space_info->priority_tickets))
4772
- ticket = list_first_entry(&space_info->priority_tickets,
4773
- struct reserve_ticket, list);
4774
- else if (!list_empty(&space_info->tickets))
4775
- ticket = list_first_entry(&space_info->tickets,
4776
- struct reserve_ticket, list);
4777
- bytes = (ticket) ? ticket->bytes : 0;
4778
- spin_unlock(&space_info->lock);
4779
-
4780
- if (!bytes)
4781
- return 0;
4782
-
4783
- /* See if there is enough pinned space to make this reservation */
4784
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4785
- bytes,
4786
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4787
- goto commit;
4788
-
4789
- /*
4790
- * See if there is some space in the delayed insertion reservation for
4791
- * this reservation.
4792
- */
4793
- if (space_info != delayed_rsv->space_info)
4794
- return -ENOSPC;
4795
-
4796
- spin_lock(&delayed_rsv->lock);
4797
- if (delayed_rsv->size > bytes)
4798
- bytes = 0;
4799
- else
4800
- bytes -= delayed_rsv->size;
4801
- spin_unlock(&delayed_rsv->lock);
4802
-
4803
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4804
- bytes,
4805
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
4806
- return -ENOSPC;
4807
- }
4808
-
4809
-commit:
4810
- trans = btrfs_join_transaction(fs_info->extent_root);
4811
- if (IS_ERR(trans))
4812
- return -ENOSPC;
4813
-
4814
- return btrfs_commit_transaction(trans);
4815
-}
4816
-
4817
-/*
4818
- * Try to flush some data based on policy set by @state. This is only advisory
4819
- * and may fail for various reasons. The caller is supposed to examine the
4820
- * state of @space_info to detect the outcome.
4821
- */
4822
-static void flush_space(struct btrfs_fs_info *fs_info,
4823
- struct btrfs_space_info *space_info, u64 num_bytes,
4824
- int state)
4825
-{
4826
- struct btrfs_root *root = fs_info->extent_root;
4827
- struct btrfs_trans_handle *trans;
4828
- int nr;
4829
- int ret = 0;
4830
-
4831
- switch (state) {
4832
- case FLUSH_DELAYED_ITEMS_NR:
4833
- case FLUSH_DELAYED_ITEMS:
4834
- if (state == FLUSH_DELAYED_ITEMS_NR)
4835
- nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4836
- else
4837
- nr = -1;
4838
-
4839
- trans = btrfs_join_transaction(root);
4840
- if (IS_ERR(trans)) {
4841
- ret = PTR_ERR(trans);
4842
- break;
4843
- }
4844
- ret = btrfs_run_delayed_items_nr(trans, nr);
4845
- btrfs_end_transaction(trans);
4846
- break;
4847
- case FLUSH_DELALLOC:
4848
- case FLUSH_DELALLOC_WAIT:
4849
- shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4850
- state == FLUSH_DELALLOC_WAIT);
4851
- break;
4852
- case ALLOC_CHUNK:
4853
- trans = btrfs_join_transaction(root);
4854
- if (IS_ERR(trans)) {
4855
- ret = PTR_ERR(trans);
4856
- break;
4857
- }
4858
- ret = do_chunk_alloc(trans,
4859
- btrfs_metadata_alloc_profile(fs_info),
4860
- CHUNK_ALLOC_NO_FORCE);
4861
- btrfs_end_transaction(trans);
4862
- if (ret > 0 || ret == -ENOSPC)
4863
- ret = 0;
4864
- break;
4865
- case COMMIT_TRANS:
4866
- ret = may_commit_transaction(fs_info, space_info);
4867
- break;
4868
- default:
4869
- ret = -ENOSPC;
4870
- break;
4871
- }
4872
-
4873
- trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
4874
- ret);
4875
- return;
4876
-}
4877
-
4878
-static inline u64
4879
-btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4880
- struct btrfs_space_info *space_info,
4881
- bool system_chunk)
4882
-{
4883
- struct reserve_ticket *ticket;
4884
- u64 used;
4885
- u64 expected;
4886
- u64 to_reclaim = 0;
4887
-
4888
- list_for_each_entry(ticket, &space_info->tickets, list)
4889
- to_reclaim += ticket->bytes;
4890
- list_for_each_entry(ticket, &space_info->priority_tickets, list)
4891
- to_reclaim += ticket->bytes;
4892
- if (to_reclaim)
4893
- return to_reclaim;
4894
-
4895
- to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4896
- if (can_overcommit(fs_info, space_info, to_reclaim,
4897
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4898
- return 0;
4899
-
4900
- used = btrfs_space_info_used(space_info, true);
4901
-
4902
- if (can_overcommit(fs_info, space_info, SZ_1M,
4903
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4904
- expected = div_factor_fine(space_info->total_bytes, 95);
4905
- else
4906
- expected = div_factor_fine(space_info->total_bytes, 90);
4907
-
4908
- if (used > expected)
4909
- to_reclaim = used - expected;
4910
- else
4911
- to_reclaim = 0;
4912
- to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4913
- space_info->bytes_reserved);
4914
- return to_reclaim;
4915
-}
4916
-
4917
-static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
4918
- struct btrfs_space_info *space_info,
4919
- u64 used, bool system_chunk)
4920
-{
4921
- u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4922
-
4923
- /* If we're just plain full then async reclaim just slows us down. */
4924
- if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4925
- return 0;
4926
-
4927
- if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4928
- system_chunk))
4929
- return 0;
4930
-
4931
- return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4932
- !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4933
-}
4934
-
4935
-static void wake_all_tickets(struct list_head *head)
4936
-{
4937
- struct reserve_ticket *ticket;
4938
-
4939
- while (!list_empty(head)) {
4940
- ticket = list_first_entry(head, struct reserve_ticket, list);
4941
- list_del_init(&ticket->list);
4942
- ticket->error = -ENOSPC;
4943
- wake_up(&ticket->wait);
4944
- }
4945
-}
4946
-
4947
-/*
4948
- * This is for normal flushers, we can wait all goddamned day if we want to. We
4949
- * will loop and continuously try to flush as long as we are making progress.
4950
- * We count progress as clearing off tickets each time we have to loop.
4951
- */
4952
-static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4953
-{
4954
- struct btrfs_fs_info *fs_info;
4955
- struct btrfs_space_info *space_info;
4956
- u64 to_reclaim;
4957
- int flush_state;
4958
- int commit_cycles = 0;
4959
- u64 last_tickets_id;
4960
-
4961
- fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4962
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4963
-
4964
- spin_lock(&space_info->lock);
4965
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4966
- false);
4967
- if (!to_reclaim) {
4968
- space_info->flush = 0;
4969
- spin_unlock(&space_info->lock);
4970
- return;
4971
- }
4972
- last_tickets_id = space_info->tickets_id;
4973
- spin_unlock(&space_info->lock);
4974
-
4975
- flush_state = FLUSH_DELAYED_ITEMS_NR;
4976
- do {
4977
- flush_space(fs_info, space_info, to_reclaim, flush_state);
4978
- spin_lock(&space_info->lock);
4979
- if (list_empty(&space_info->tickets)) {
4980
- space_info->flush = 0;
4981
- spin_unlock(&space_info->lock);
4982
- return;
4983
- }
4984
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
4985
- space_info,
4986
- false);
4987
- if (last_tickets_id == space_info->tickets_id) {
4988
- flush_state++;
4989
- } else {
4990
- last_tickets_id = space_info->tickets_id;
4991
- flush_state = FLUSH_DELAYED_ITEMS_NR;
4992
- if (commit_cycles)
4993
- commit_cycles--;
4994
- }
4995
-
4996
- if (flush_state > COMMIT_TRANS) {
4997
- commit_cycles++;
4998
- if (commit_cycles > 2) {
4999
- wake_all_tickets(&space_info->tickets);
5000
- space_info->flush = 0;
5001
- } else {
5002
- flush_state = FLUSH_DELAYED_ITEMS_NR;
5003
- }
5004
- }
5005
- spin_unlock(&space_info->lock);
5006
- } while (flush_state <= COMMIT_TRANS);
5007
-}
5008
-
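The async reclaim worker removed above walks a fixed ladder of flush states and only gives up after several full passes that end in a transaction commit without satisfying any ticket. Below is a compact, self-contained model of that escalation; the enum ordering mirrors the states referenced by the removed code, while fake_flush() and its ticket bookkeeping are invented stand-ins.

/*
 * Toy model of the removed async-reclaim loop: walk the flush states
 * in order, restart from the top whenever a ticket was satisfied, and
 * give up after a few full commit cycles with no progress.
 */
#include <stdbool.h>
#include <stdio.h>

enum flush_state {		/* same order as the states used above */
	FLUSH_DELAYED_ITEMS_NR = 1,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

/* pretend flusher: returns true if it satisfied the oldest ticket */
static bool fake_flush(enum flush_state state, int *pending_tickets)
{
	if (state >= ALLOC_CHUNK && *pending_tickets > 0) {
		(*pending_tickets)--;
		return true;
	}
	return false;
}

int main(void)
{
	int pending = 2;
	int state = FLUSH_DELAYED_ITEMS_NR;
	int commit_cycles = 0;

	while (pending > 0) {
		bool progress = fake_flush(state, &pending);

		if (progress) {
			state = FLUSH_DELAYED_ITEMS_NR;	/* made progress: start over */
			if (commit_cycles)
				commit_cycles--;
		} else {
			state++;
		}
		if (state > COMMIT_TRANS) {
			if (++commit_cycles > 2) {
				puts("giving up: wake remaining tickets with -ENOSPC");
				break;
			}
			state = FLUSH_DELAYED_ITEMS_NR;
		}
	}
	printf("tickets left: %d\n", pending);
	return 0;
}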
5009
-void btrfs_init_async_reclaim_work(struct work_struct *work)
5010
-{
5011
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5012
-}
5013
-
5014
-static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5015
- struct btrfs_space_info *space_info,
5016
- struct reserve_ticket *ticket)
5017
-{
5018
- u64 to_reclaim;
5019
- int flush_state = FLUSH_DELAYED_ITEMS_NR;
5020
-
5021
- spin_lock(&space_info->lock);
5022
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5023
- false);
5024
- if (!to_reclaim) {
5025
- spin_unlock(&space_info->lock);
5026
- return;
5027
- }
5028
- spin_unlock(&space_info->lock);
5029
-
5030
- do {
5031
- flush_space(fs_info, space_info, to_reclaim, flush_state);
5032
- flush_state++;
5033
- spin_lock(&space_info->lock);
5034
- if (ticket->bytes == 0) {
5035
- spin_unlock(&space_info->lock);
5036
- return;
5037
- }
5038
- spin_unlock(&space_info->lock);
5039
-
5040
- /*
5041
- * Priority flushers can't wait on delalloc without
5042
- * deadlocking.
5043
- */
5044
- if (flush_state == FLUSH_DELALLOC ||
5045
- flush_state == FLUSH_DELALLOC_WAIT)
5046
- flush_state = ALLOC_CHUNK;
5047
- } while (flush_state < COMMIT_TRANS);
5048
-}
5049
-
5050
-static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5051
- struct btrfs_space_info *space_info,
5052
- struct reserve_ticket *ticket, u64 orig_bytes)
5053
-
5054
-{
5055
- DEFINE_WAIT(wait);
5056
- int ret = 0;
5057
-
5058
- spin_lock(&space_info->lock);
5059
- while (ticket->bytes > 0 && ticket->error == 0) {
5060
- ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5061
- if (ret) {
5062
- ret = -EINTR;
5063
- break;
5064
- }
5065
- spin_unlock(&space_info->lock);
5066
-
5067
- schedule();
5068
-
5069
- finish_wait(&ticket->wait, &wait);
5070
- spin_lock(&space_info->lock);
5071
- }
5072
- if (!ret)
5073
- ret = ticket->error;
5074
- if (!list_empty(&ticket->list))
5075
- list_del_init(&ticket->list);
5076
- if (ticket->bytes && ticket->bytes < orig_bytes) {
5077
- u64 num_bytes = orig_bytes - ticket->bytes;
5078
- space_info->bytes_may_use -= num_bytes;
5079
- trace_btrfs_space_reservation(fs_info, "space_info",
5080
- space_info->flags, num_bytes, 0);
5081
- }
5082
- spin_unlock(&space_info->lock);
5083
-
5084
- return ret;
5085
-}
5086
-
5087
-/**
5088
- * __reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5089
- * @root - the root we're allocating for
5090
- * @space_info - the space info we want to allocate from
5091
- * @orig_bytes - the number of bytes we want
5092
- * @flush - whether or not we can flush to make our reservation
5093
- *
5094
- * This will reserve orig_bytes number of bytes from the space info associated
5095
- * with the block_rsv. If there is not enough space it will make an attempt to
5096
- * flush out space to make room. It will do this by flushing delalloc if
5097
- * possible or committing the transaction. If flush is 0 then no attempts to
5098
- * regain reservations will be made and this will fail if there is not enough
5099
- * space already.
5100
- */
5101
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5102
- struct btrfs_space_info *space_info,
5103
- u64 orig_bytes,
5104
- enum btrfs_reserve_flush_enum flush,
5105
- bool system_chunk)
5106
-{
5107
- struct reserve_ticket ticket;
5108
- u64 used;
5109
- int ret = 0;
5110
-
5111
- ASSERT(orig_bytes);
5112
- ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5113
-
5114
- spin_lock(&space_info->lock);
5115
- ret = -ENOSPC;
5116
- used = btrfs_space_info_used(space_info, true);
5117
-
5118
- /*
5119
- * If we have enough space then hooray, make our reservation and carry
5120
- * on. If not see if we can overcommit, and if we can, hooray carry on.
5121
- * If not things get more complicated.
5122
- */
5123
- if (used + orig_bytes <= space_info->total_bytes) {
5124
- space_info->bytes_may_use += orig_bytes;
5125
- trace_btrfs_space_reservation(fs_info, "space_info",
5126
- space_info->flags, orig_bytes, 1);
5127
- ret = 0;
5128
- } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5129
- system_chunk)) {
5130
- space_info->bytes_may_use += orig_bytes;
5131
- trace_btrfs_space_reservation(fs_info, "space_info",
5132
- space_info->flags, orig_bytes, 1);
5133
- ret = 0;
5134
- }
5135
-
5136
- /*
5137
- * If we couldn't make a reservation then setup our reservation ticket
5138
- * and kick the async worker if it's not already running.
5139
- *
5140
- * If we are a priority flusher then we just need to add our ticket to
5141
- * the list and we will do our own flushing further down.
5142
- */
5143
- if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5144
- ticket.bytes = orig_bytes;
5145
- ticket.error = 0;
5146
- init_waitqueue_head(&ticket.wait);
5147
- if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5148
- list_add_tail(&ticket.list, &space_info->tickets);
5149
- if (!space_info->flush) {
5150
- space_info->flush = 1;
5151
- trace_btrfs_trigger_flush(fs_info,
5152
- space_info->flags,
5153
- orig_bytes, flush,
5154
- "enospc");
5155
- queue_work(system_unbound_wq,
5156
- &fs_info->async_reclaim_work);
5157
- }
5158
- } else {
5159
- list_add_tail(&ticket.list,
5160
- &space_info->priority_tickets);
5161
- }
5162
- } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5163
- used += orig_bytes;
5164
- /*
5165
- * We will do the space reservation dance during log replay,
5166
- * which means we won't have fs_info->fs_root set, so don't do
5167
- * the async reclaim as we will panic.
5168
- */
5169
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5170
- need_do_async_reclaim(fs_info, space_info,
5171
- used, system_chunk) &&
5172
- !work_busy(&fs_info->async_reclaim_work)) {
5173
- trace_btrfs_trigger_flush(fs_info, space_info->flags,
5174
- orig_bytes, flush, "preempt");
5175
- queue_work(system_unbound_wq,
5176
- &fs_info->async_reclaim_work);
5177
- }
5178
- }
5179
- spin_unlock(&space_info->lock);
5180
- if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5181
- return ret;
5182
-
5183
- if (flush == BTRFS_RESERVE_FLUSH_ALL)
5184
- return wait_reserve_ticket(fs_info, space_info, &ticket,
5185
- orig_bytes);
5186
-
5187
- ret = 0;
5188
- priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5189
- spin_lock(&space_info->lock);
5190
- if (ticket.bytes) {
5191
- if (ticket.bytes < orig_bytes) {
5192
- u64 num_bytes = orig_bytes - ticket.bytes;
5193
- space_info->bytes_may_use -= num_bytes;
5194
- trace_btrfs_space_reservation(fs_info, "space_info",
5195
- space_info->flags,
5196
- num_bytes, 0);
5197
-
5198
- }
5199
- list_del_init(&ticket.list);
5200
- ret = -ENOSPC;
5201
- }
5202
- spin_unlock(&space_info->lock);
5203
- ASSERT(list_empty(&ticket.list));
5204
- return ret;
5205
-}
5206
-
5207
-/**
5208
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5209
- * @root - the root we're allocating for
5210
- * @block_rsv - the block_rsv we're allocating for
5211
- * @orig_bytes - the number of bytes we want
5212
- * @flush - whether or not we can flush to make our reservation
5213
- *
5214
- * This will reserve orig_bytes number of bytes from the space info associated
5215
- * with the block_rsv. If there is not enough space it will make an attempt to
5216
- * flush out space to make room. It will do this by flushing delalloc if
5217
- * possible or committing the transaction. If flush is 0 then no attempts to
5218
- * regain reservations will be made and this will fail if there is not enough
5219
- * space already.
5220
- */
5221
-static int reserve_metadata_bytes(struct btrfs_root *root,
5222
- struct btrfs_block_rsv *block_rsv,
5223
- u64 orig_bytes,
5224
- enum btrfs_reserve_flush_enum flush)
5225
-{
5226
- struct btrfs_fs_info *fs_info = root->fs_info;
5227
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5228
- int ret;
5229
- bool system_chunk = (root == fs_info->chunk_root);
5230
-
5231
- ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5232
- orig_bytes, flush, system_chunk);
5233
- if (ret == -ENOSPC &&
5234
- unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5235
- if (block_rsv != global_rsv &&
5236
- !block_rsv_use_bytes(global_rsv, orig_bytes))
5237
- ret = 0;
5238
- }
5239
- if (ret == -ENOSPC) {
5240
- trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5241
- block_rsv->space_info->flags,
5242
- orig_bytes, 1);
5243
-
5244
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5245
- dump_space_info(fs_info, block_rsv->space_info,
5246
- orig_bytes, 0);
5247
- }
5248
- return ret;
5249
-}
5250
-
5251
-static struct btrfs_block_rsv *get_block_rsv(
5252
- const struct btrfs_trans_handle *trans,
5253
- const struct btrfs_root *root)
5254
-{
5255
- struct btrfs_fs_info *fs_info = root->fs_info;
5256
- struct btrfs_block_rsv *block_rsv = NULL;
5257
-
5258
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5259
- (root == fs_info->csum_root && trans->adding_csums) ||
5260
- (root == fs_info->uuid_root))
5261
- block_rsv = trans->block_rsv;
5262
-
5263
- if (!block_rsv)
5264
- block_rsv = root->block_rsv;
5265
-
5266
- if (!block_rsv)
5267
- block_rsv = &fs_info->empty_block_rsv;
5268
-
5269
- return block_rsv;
5270
-}
5271
-
5272
-static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5273
- u64 num_bytes)
5274
-{
5275
- int ret = -ENOSPC;
5276
- spin_lock(&block_rsv->lock);
5277
- if (block_rsv->reserved >= num_bytes) {
5278
- block_rsv->reserved -= num_bytes;
5279
- if (block_rsv->reserved < block_rsv->size)
5280
- block_rsv->full = 0;
5281
- ret = 0;
5282
- }
5283
- spin_unlock(&block_rsv->lock);
5284
- return ret;
5285
-}
5286
-
5287
-static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5288
- u64 num_bytes, int update_size)
5289
-{
5290
- spin_lock(&block_rsv->lock);
5291
- block_rsv->reserved += num_bytes;
5292
- if (update_size)
5293
- block_rsv->size += num_bytes;
5294
- else if (block_rsv->reserved >= block_rsv->size)
5295
- block_rsv->full = 1;
5296
- spin_unlock(&block_rsv->lock);
5297
-}
5298
-
5299
-int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5300
- struct btrfs_block_rsv *dest, u64 num_bytes,
5301
- int min_factor)
5302
-{
5303
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5304
- u64 min_bytes;
5305
-
5306
- if (global_rsv->space_info != dest->space_info)
5307
- return -ENOSPC;
5308
-
5309
- spin_lock(&global_rsv->lock);
5310
- min_bytes = div_factor(global_rsv->size, min_factor);
5311
- if (global_rsv->reserved < min_bytes + num_bytes) {
5312
- spin_unlock(&global_rsv->lock);
5313
- return -ENOSPC;
5314
- }
5315
- global_rsv->reserved -= num_bytes;
5316
- if (global_rsv->reserved < global_rsv->size)
5317
- global_rsv->full = 0;
5318
- spin_unlock(&global_rsv->lock);
5319
-
5320
- block_rsv_add_bytes(dest, num_bytes, 1);
5321
- return 0;
5322
-}
5323
-
5324
-/*
5325
- * This is for space we already have accounted in space_info->bytes_may_use, so
5326
- * basically when we're returning space from block_rsv's.
5327
- */
5328
-static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5329
- struct btrfs_space_info *space_info,
5330
- u64 num_bytes)
5331
-{
5332
- struct reserve_ticket *ticket;
5333
- struct list_head *head;
5334
- u64 used;
5335
- enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5336
- bool check_overcommit = false;
5337
-
5338
- spin_lock(&space_info->lock);
5339
- head = &space_info->priority_tickets;
5340
-
5341
- /*
5342
- * If we are over our limit then we need to check and see if we can
5343
- * overcommit, and if we can't then we just need to free up our space
5344
- * and not satisfy any requests.
5345
- */
5346
- used = btrfs_space_info_used(space_info, true);
5347
- if (used - num_bytes >= space_info->total_bytes)
5348
- check_overcommit = true;
5349
-again:
5350
- while (!list_empty(head) && num_bytes) {
5351
- ticket = list_first_entry(head, struct reserve_ticket,
5352
- list);
5353
- /*
5354
- * We use 0 bytes because this space is already reserved, so
5355
- * adding the ticket space would be a double count.
5356
- */
5357
- if (check_overcommit &&
5358
- !can_overcommit(fs_info, space_info, 0, flush, false))
5359
- break;
5360
- if (num_bytes >= ticket->bytes) {
5361
- list_del_init(&ticket->list);
5362
- num_bytes -= ticket->bytes;
5363
- ticket->bytes = 0;
5364
- space_info->tickets_id++;
5365
- wake_up(&ticket->wait);
5366
- } else {
5367
- ticket->bytes -= num_bytes;
5368
- num_bytes = 0;
5369
- }
5370
- }
5371
-
5372
- if (num_bytes && head == &space_info->priority_tickets) {
5373
- head = &space_info->tickets;
5374
- flush = BTRFS_RESERVE_FLUSH_ALL;
5375
- goto again;
5376
- }
5377
- space_info->bytes_may_use -= num_bytes;
5378
- trace_btrfs_space_reservation(fs_info, "space_info",
5379
- space_info->flags, num_bytes, 0);
5380
- spin_unlock(&space_info->lock);
5381
-}
5382
-
5383
-/*
5384
- * This is for newly allocated space that isn't accounted in
5385
- * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
5386
- * we use this helper.
5387
- */
5388
-static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5389
- struct btrfs_space_info *space_info,
5390
- u64 num_bytes)
5391
-{
5392
- struct reserve_ticket *ticket;
5393
- struct list_head *head = &space_info->priority_tickets;
5394
-
5395
-again:
5396
- while (!list_empty(head) && num_bytes) {
5397
- ticket = list_first_entry(head, struct reserve_ticket,
5398
- list);
5399
- if (num_bytes >= ticket->bytes) {
5400
- trace_btrfs_space_reservation(fs_info, "space_info",
5401
- space_info->flags,
5402
- ticket->bytes, 1);
5403
- list_del_init(&ticket->list);
5404
- num_bytes -= ticket->bytes;
5405
- space_info->bytes_may_use += ticket->bytes;
5406
- ticket->bytes = 0;
5407
- space_info->tickets_id++;
5408
- wake_up(&ticket->wait);
5409
- } else {
5410
- trace_btrfs_space_reservation(fs_info, "space_info",
5411
- space_info->flags,
5412
- num_bytes, 1);
5413
- space_info->bytes_may_use += num_bytes;
5414
- ticket->bytes -= num_bytes;
5415
- num_bytes = 0;
5416
- }
5417
- }
5418
-
5419
- if (num_bytes && head == &space_info->priority_tickets) {
5420
- head = &space_info->tickets;
5421
- goto again;
5422
- }
5423
-}
5424
-
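Both ticket helpers removed above (space_info_add_old_bytes() and space_info_add_new_bytes()) hand freed space to waiters in the same order: priority tickets first, then regular ones, waking each ticket once its request is fully covered. A stand-alone sketch of that hand-off follows, using plain arrays in place of the kernel's wait-queue backed lists; the types and names are illustrative only.

/*
 * Sketch of the ticket hand-off: satisfy priority tickets first, then
 * regular ones; a ticket is "woken" once its remaining bytes hit 0.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct ticket {
	uint64_t bytes;		/* bytes still missing for this reservation */
	int woken;
};

static uint64_t hand_out(struct ticket *t, size_t n, uint64_t avail)
{
	for (size_t i = 0; i < n && avail; i++) {
		if (t[i].woken)
			continue;
		if (avail >= t[i].bytes) {
			avail -= t[i].bytes;
			t[i].bytes = 0;
			t[i].woken = 1;		/* wake_up(&ticket->wait) */
		} else {
			t[i].bytes -= avail;
			avail = 0;
		}
	}
	return avail;				/* anything left over */
}

int main(void)
{
	struct ticket prio[] = { { 1 << 20, 0 } };
	struct ticket norm[] = { { 4 << 20, 0 }, { 2 << 20, 0 } };
	uint64_t avail = 3 << 20;

	avail = hand_out(prio, 1, avail);	/* priority list first */
	avail = hand_out(norm, 2, avail);
	printf("norm[0] still needs %llu bytes\n",
	       (unsigned long long)norm[0].bytes);
	return 0;
}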
5425
-static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5426
- struct btrfs_block_rsv *block_rsv,
5427
- struct btrfs_block_rsv *dest, u64 num_bytes,
5428
- u64 *qgroup_to_release_ret)
5429
-{
5430
- struct btrfs_space_info *space_info = block_rsv->space_info;
5431
- u64 qgroup_to_release = 0;
5432
- u64 ret;
5433
-
5434
- spin_lock(&block_rsv->lock);
5435
- if (num_bytes == (u64)-1) {
5436
- num_bytes = block_rsv->size;
5437
- qgroup_to_release = block_rsv->qgroup_rsv_size;
5438
- }
5439
- block_rsv->size -= num_bytes;
5440
- if (block_rsv->reserved >= block_rsv->size) {
5441
- num_bytes = block_rsv->reserved - block_rsv->size;
5442
- block_rsv->reserved = block_rsv->size;
5443
- block_rsv->full = 1;
5444
- } else {
5445
- num_bytes = 0;
5446
- }
5447
- if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5448
- qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5449
- block_rsv->qgroup_rsv_size;
5450
- block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5451
- } else {
5452
- qgroup_to_release = 0;
5453
- }
5454
- spin_unlock(&block_rsv->lock);
5455
-
5456
- ret = num_bytes;
5457
- if (num_bytes > 0) {
5458
- if (dest) {
5459
- spin_lock(&dest->lock);
5460
- if (!dest->full) {
5461
- u64 bytes_to_add;
5462
-
5463
- bytes_to_add = dest->size - dest->reserved;
5464
- bytes_to_add = min(num_bytes, bytes_to_add);
5465
- dest->reserved += bytes_to_add;
5466
- if (dest->reserved >= dest->size)
5467
- dest->full = 1;
5468
- num_bytes -= bytes_to_add;
5469
- }
5470
- spin_unlock(&dest->lock);
5471
- }
5472
- if (num_bytes)
5473
- space_info_add_old_bytes(fs_info, space_info,
5474
- num_bytes);
5475
- }
5476
- if (qgroup_to_release_ret)
5477
- *qgroup_to_release_ret = qgroup_to_release;
5478
- return ret;
5479
-}
5480
-
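The removed block_rsv_release_bytes() is mostly arithmetic: shrink the rsv's size, treat anything reserved beyond the new size as excess, top up the destination rsv (usually the global one) first, and hand only the remainder back to the space_info. A hedged sketch of that math follows, with an invented struct layout; the (u64)-1 "release everything" case and the qgroup bookkeeping are omitted.

/*
 * Arithmetic model of the removed block_rsv_release_bytes(): only
 * bytes reserved beyond the shrunk size are released; they top up the
 * destination rsv first and the rest goes back to the space_info.
 */
#include <stdint.h>
#include <stdio.h>

struct rsv {
	uint64_t size;
	uint64_t reserved;
};

/* returns the number of bytes handed back to the space_info */
static uint64_t release_bytes(struct rsv *src, struct rsv *dest,
			      uint64_t shrink_by)
{
	uint64_t excess;

	src->size -= (shrink_by < src->size) ? shrink_by : src->size;
	if (src->reserved <= src->size)
		return 0;			/* nothing to give back */

	excess = src->reserved - src->size;
	src->reserved = src->size;

	if (dest && dest->reserved < dest->size) {
		uint64_t to_add = dest->size - dest->reserved;

		if (to_add > excess)
			to_add = excess;
		dest->reserved += to_add;	/* spill into the dest rsv */
		excess -= to_add;
	}
	return excess;				/* back to space_info */
}

int main(void)
{
	struct rsv trans_rsv  = { .size = 8 << 20, .reserved = 8 << 20 };
	struct rsv global_rsv = { .size = 4 << 20, .reserved = 3 << 20 };
	uint64_t freed = release_bytes(&trans_rsv, &global_rsv, 6 << 20);

	printf("to space_info: %llu, global now %llu/%llu\n",
	       (unsigned long long)freed,
	       (unsigned long long)global_rsv.reserved,
	       (unsigned long long)global_rsv.size);
	return 0;
}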
5481
-int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5482
- struct btrfs_block_rsv *dst, u64 num_bytes,
5483
- int update_size)
5484
-{
5485
- int ret;
5486
-
5487
- ret = block_rsv_use_bytes(src, num_bytes);
5488
- if (ret)
5489
- return ret;
5490
-
5491
- block_rsv_add_bytes(dst, num_bytes, update_size);
5492
- return 0;
5493
-}
5494
-
5495
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5496
-{
5497
- memset(rsv, 0, sizeof(*rsv));
5498
- spin_lock_init(&rsv->lock);
5499
- rsv->type = type;
5500
-}
5501
-
5502
-void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5503
- struct btrfs_block_rsv *rsv,
5504
- unsigned short type)
5505
-{
5506
- btrfs_init_block_rsv(rsv, type);
5507
- rsv->space_info = __find_space_info(fs_info,
5508
- BTRFS_BLOCK_GROUP_METADATA);
5509
-}
5510
-
5511
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5512
- unsigned short type)
5513
-{
5514
- struct btrfs_block_rsv *block_rsv;
5515
-
5516
- block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5517
- if (!block_rsv)
5518
- return NULL;
5519
-
5520
- btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5521
- return block_rsv;
5522
-}
5523
-
5524
-void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5525
- struct btrfs_block_rsv *rsv)
5526
-{
5527
- if (!rsv)
5528
- return;
5529
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5530
- kfree(rsv);
5531
-}
5532
-
5533
-int btrfs_block_rsv_add(struct btrfs_root *root,
5534
- struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5535
- enum btrfs_reserve_flush_enum flush)
5536
-{
5537
- int ret;
5538
-
5539
- if (num_bytes == 0)
5540
- return 0;
5541
-
5542
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5543
- if (!ret) {
5544
- block_rsv_add_bytes(block_rsv, num_bytes, 1);
5545
- return 0;
5546
- }
5547
-
5548
- return ret;
5549
-}
5550
-
5551
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5552
-{
5553
- u64 num_bytes = 0;
5554
- int ret = -ENOSPC;
5555
-
5556
- if (!block_rsv)
5557
- return 0;
5558
-
5559
- spin_lock(&block_rsv->lock);
5560
- num_bytes = div_factor(block_rsv->size, min_factor);
5561
- if (block_rsv->reserved >= num_bytes)
5562
- ret = 0;
5563
- spin_unlock(&block_rsv->lock);
5564
-
5565
- return ret;
5566
-}
5567
-
5568
-int btrfs_block_rsv_refill(struct btrfs_root *root,
5569
- struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5570
- enum btrfs_reserve_flush_enum flush)
5571
-{
5572
- u64 num_bytes = 0;
5573
- int ret = -ENOSPC;
5574
-
5575
- if (!block_rsv)
5576
- return 0;
5577
-
5578
- spin_lock(&block_rsv->lock);
5579
- num_bytes = min_reserved;
5580
- if (block_rsv->reserved >= num_bytes)
5581
- ret = 0;
5582
- else
5583
- num_bytes -= block_rsv->reserved;
5584
- spin_unlock(&block_rsv->lock);
5585
-
5586
- if (!ret)
5587
- return 0;
5588
-
5589
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5590
- if (!ret) {
5591
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
5592
- return 0;
5593
- }
5594
-
5595
- return ret;
5596
-}
5597
-
5598
-/**
5599
- * btrfs_inode_rsv_refill - refill the inode block rsv.
5600
- * @inode - the inode we are refilling.
5601
- * @flush - the flushing restriction.
5602
- *
5603
- * Essentially the same as btrfs_block_rsv_refill, except it uses the
5604
- * block_rsv->size as the minimum size. We'll either refill the missing amount
5605
- * or return if we already have enough space. This will also handle the reserve
5606
- * tracepoint for the reserved amount.
5607
- */
5608
-static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5609
- enum btrfs_reserve_flush_enum flush)
5610
-{
5611
- struct btrfs_root *root = inode->root;
5612
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5613
- u64 num_bytes = 0;
5614
- u64 qgroup_num_bytes = 0;
5615
- int ret = -ENOSPC;
5616
-
5617
- spin_lock(&block_rsv->lock);
5618
- if (block_rsv->reserved < block_rsv->size)
5619
- num_bytes = block_rsv->size - block_rsv->reserved;
5620
- if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
5621
- qgroup_num_bytes = block_rsv->qgroup_rsv_size -
5622
- block_rsv->qgroup_rsv_reserved;
5623
- spin_unlock(&block_rsv->lock);
5624
-
5625
- if (num_bytes == 0)
5626
- return 0;
5627
-
5628
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
5629
- if (ret)
5630
- return ret;
5631
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5632
- if (!ret) {
5633
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
5634
- trace_btrfs_space_reservation(root->fs_info, "delalloc",
5635
- btrfs_ino(inode), num_bytes, 1);
5636
-
5637
- /* Don't forget to increase qgroup_rsv_reserved */
5638
- spin_lock(&block_rsv->lock);
5639
- block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
5640
- spin_unlock(&block_rsv->lock);
5641
- } else
5642
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5643
- return ret;
5644
-}
5645
-
5646
-/**
5647
- * btrfs_inode_rsv_release - release any excessive reservation.
5648
- * @inode - the inode we need to release from.
5649
- * @qgroup_free - free or convert qgroup meta.
5650
- * Unlike normal operation, qgroup meta reservation needs to know if we are
5651
- * freeing qgroup reservation or just converting it into per-trans. Normally
5652
- * @qgroup_free is true for error handling, and false for normal release.
5653
- *
5654
- * This is the same as btrfs_block_rsv_release, except that it handles the
5655
- * tracepoint for the reservation.
5656
- */
5657
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5658
-{
5659
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5660
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5661
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5662
- u64 released = 0;
5663
- u64 qgroup_to_release = 0;
5664
-
5665
- /*
5666
- * Since we statically set the block_rsv->size we just want to say we
5667
- * are releasing 0 bytes, and then we'll just get the reservation over
5668
- * the size free'd.
5669
- */
5670
- released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
5671
- &qgroup_to_release);
5672
- if (released > 0)
5673
- trace_btrfs_space_reservation(fs_info, "delalloc",
5674
- btrfs_ino(inode), released, 0);
5675
- if (qgroup_free)
5676
- btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5677
- else
5678
- btrfs_qgroup_convert_reserved_meta(inode->root,
5679
- qgroup_to_release);
5680
-}
5681
-
5682
-void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5683
- struct btrfs_block_rsv *block_rsv,
5684
- u64 num_bytes)
5685
-{
5686
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5687
-
5688
- if (global_rsv == block_rsv ||
5689
- block_rsv->space_info != global_rsv->space_info)
5690
- global_rsv = NULL;
5691
- block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
5692
-}
5693
-
5694
-static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5695
-{
5696
- struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5697
- struct btrfs_space_info *sinfo = block_rsv->space_info;
5698
- u64 num_bytes;
5699
-
5700
- /*
5701
- * The global block rsv is based on the size of the extent tree, the
5702
- * checksum tree and the root tree. If the fs is empty we want to set
5703
- * it to a minimal amount for safety.
5704
- */
5705
- num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5706
- btrfs_root_used(&fs_info->csum_root->root_item) +
5707
- btrfs_root_used(&fs_info->tree_root->root_item);
5708
- num_bytes = max_t(u64, num_bytes, SZ_16M);
5709
-
5710
- spin_lock(&sinfo->lock);
5711
- spin_lock(&block_rsv->lock);
5712
-
5713
- block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5714
-
5715
- if (block_rsv->reserved < block_rsv->size) {
5716
- num_bytes = btrfs_space_info_used(sinfo, true);
5717
- if (sinfo->total_bytes > num_bytes) {
5718
- num_bytes = sinfo->total_bytes - num_bytes;
5719
- num_bytes = min(num_bytes,
5720
- block_rsv->size - block_rsv->reserved);
5721
- block_rsv->reserved += num_bytes;
5722
- sinfo->bytes_may_use += num_bytes;
5723
- trace_btrfs_space_reservation(fs_info, "space_info",
5724
- sinfo->flags, num_bytes,
5725
- 1);
5726
- }
5727
- } else if (block_rsv->reserved > block_rsv->size) {
5728
- num_bytes = block_rsv->reserved - block_rsv->size;
5729
- sinfo->bytes_may_use -= num_bytes;
5730
- trace_btrfs_space_reservation(fs_info, "space_info",
5731
- sinfo->flags, num_bytes, 0);
5732
- block_rsv->reserved = block_rsv->size;
5733
- }
5734
-
5735
- if (block_rsv->reserved == block_rsv->size)
5736
- block_rsv->full = 1;
5737
- else
5738
- block_rsv->full = 0;
5739
-
5740
- spin_unlock(&block_rsv->lock);
5741
- spin_unlock(&sinfo->lock);
5742
-}
5743
-
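The sizing rule in the removed update_global_block_rsv() is worth spelling out: the global reserve tracks the combined usage of the extent, csum and root trees, floored at 16MiB and capped at 512MiB. The clamp below is taken from the removed code; the sample inputs are made up.

/*
 * The sizing rule from the removed update_global_block_rsv():
 * size = clamp(extent_root + csum_root + tree_root usage, 16M, 512M).
 */
#include <stdint.h>
#include <stdio.h>

#define SZ_16M  (16ULL << 20)
#define SZ_512M (512ULL << 20)

static uint64_t global_rsv_size(uint64_t extent_used, uint64_t csum_used,
				uint64_t root_used)
{
	uint64_t num_bytes = extent_used + csum_used + root_used;

	if (num_bytes < SZ_16M)
		num_bytes = SZ_16M;	/* floor for an almost-empty fs */
	if (num_bytes > SZ_512M)
		num_bytes = SZ_512M;	/* hard cap */
	return num_bytes;
}

int main(void)
{
	/* small fs -> 16M floor; 1G of tree usage -> 512M cap */
	printf("%llu\n", (unsigned long long)global_rsv_size(1 << 20, 0, 0));
	printf("%llu\n", (unsigned long long)global_rsv_size(1ULL << 30, 0, 0));
	return 0;
}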
5744
-static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5745
-{
5746
- struct btrfs_space_info *space_info;
5747
-
5748
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5749
- fs_info->chunk_block_rsv.space_info = space_info;
5750
-
5751
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5752
- fs_info->global_block_rsv.space_info = space_info;
5753
- fs_info->trans_block_rsv.space_info = space_info;
5754
- fs_info->empty_block_rsv.space_info = space_info;
5755
- fs_info->delayed_block_rsv.space_info = space_info;
5756
-
5757
- fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5758
- fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5759
- fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5760
- fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5761
- if (fs_info->quota_root)
5762
- fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5763
- fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5764
-
5765
- update_global_block_rsv(fs_info);
5766
-}
5767
-
5768
-static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5769
-{
5770
- block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5771
- (u64)-1, NULL);
5772
- WARN_ON(fs_info->trans_block_rsv.size > 0);
5773
- WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5774
- WARN_ON(fs_info->chunk_block_rsv.size > 0);
5775
- WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5776
- WARN_ON(fs_info->delayed_block_rsv.size > 0);
5777
- WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5778
-}
5779
-
5780
-
5781
-/*
5782
- * To be called after all the new block groups attached to the transaction
5783
- * handle have been created (btrfs_create_pending_block_groups()).
5784
- */
5785
-void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5786
-{
5787
- struct btrfs_fs_info *fs_info = trans->fs_info;
5788
-
5789
- if (!trans->chunk_bytes_reserved)
5790
- return;
5791
-
5792
- WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5793
-
5794
- block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5795
- trans->chunk_bytes_reserved, NULL);
5796
- trans->chunk_bytes_reserved = 0;
5797
-}
5798
-
5799
-/*
5800
- * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5801
- * root: the root of the parent directory
5802
- * rsv: block reservation
5803
- * items: the number of items that we need do reservation
5804
- * use_global_rsv: allow fallback to the global block reservation
5805
- *
5806
- * This function is used to reserve the space for snapshot/subvolume
5807
- * creation and deletion. Those operations differ from the
5808
- * common file/directory operations: they change two fs/file trees
5809
- * and the root tree, and the number of items that the qgroup reserves
5810
- * differs from the free space reservation. So we cannot use
5811
- * the space reservation mechanism in start_transaction().
5812
- */
5813
-int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5814
- struct btrfs_block_rsv *rsv, int items,
5815
- bool use_global_rsv)
5816
-{
5817
- u64 qgroup_num_bytes = 0;
5818
- u64 num_bytes;
5819
- int ret;
5820
- struct btrfs_fs_info *fs_info = root->fs_info;
5821
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5822
-
5823
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
5824
- /* One for parent inode, two for dir entries */
5825
- qgroup_num_bytes = 3 * fs_info->nodesize;
5826
- ret = btrfs_qgroup_reserve_meta_prealloc(root,
5827
- qgroup_num_bytes, true);
5828
- if (ret)
5829
- return ret;
5830
- }
5831
-
5832
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
5833
- rsv->space_info = __find_space_info(fs_info,
5834
- BTRFS_BLOCK_GROUP_METADATA);
5835
- ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5836
- BTRFS_RESERVE_FLUSH_ALL);
5837
-
5838
- if (ret == -ENOSPC && use_global_rsv)
5839
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
5840
-
5841
- if (ret && qgroup_num_bytes)
5842
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5843
-
5844
- return ret;
5845
-}
5846
-
5847
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5848
- struct btrfs_block_rsv *rsv)
5849
-{
5850
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5851
-}
5852
-
5853
-static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
5854
- struct btrfs_inode *inode)
5855
-{
5856
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5857
- u64 reserve_size = 0;
5858
- u64 qgroup_rsv_size = 0;
5859
- u64 csum_leaves;
5860
- unsigned outstanding_extents;
5861
-
5862
- lockdep_assert_held(&inode->lock);
5863
- outstanding_extents = inode->outstanding_extents;
5864
- if (outstanding_extents)
5865
- reserve_size = btrfs_calc_trans_metadata_size(fs_info,
5866
- outstanding_extents + 1);
5867
- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
5868
- inode->csum_bytes);
5869
- reserve_size += btrfs_calc_trans_metadata_size(fs_info,
5870
- csum_leaves);
5871
- /*
5872
- * For qgroup rsv, the calculation is very simple:
5873
- * account one nodesize for each outstanding extent
5874
- *
5875
- * This is overestimating in most cases.
5876
- */
5877
- qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
5878
-
5879
- spin_lock(&block_rsv->lock);
5880
- block_rsv->size = reserve_size;
5881
- block_rsv->qgroup_rsv_size = qgroup_rsv_size;
5882
- spin_unlock(&block_rsv->lock);
5883
-}
5884
-
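btrfs_calculate_inode_block_rsv_size(), removed above, sizes the inode's reserve as metadata for outstanding_extents + 1 items plus the leaves needed for the pending csums, and reserves one nodesize per outstanding extent for qgroups. The sketch below models that arithmetic; ITEM_COST stands in for btrfs_calc_trans_metadata_size(fs_info, 1), and the 2 * nodesize * 8 estimate used here is an assumption for illustration rather than the exact kernel constant.

/*
 * Model of the removed btrfs_calculate_inode_block_rsv_size():
 *   rsv  = item_cost * (outstanding_extents + 1) + item_cost * csum_leaves
 *   qgrp = outstanding_extents * nodesize
 */
#include <stdint.h>
#include <stdio.h>

#define NODESIZE	16384ULL
#define MAX_LEVEL	8ULL
#define ITEM_COST	(2 * NODESIZE * MAX_LEVEL)	/* per reserved item */

static void inode_rsv_size(uint64_t outstanding_extents, uint64_t csum_leaves,
			   uint64_t *reserve_size, uint64_t *qgroup_size)
{
	*reserve_size = 0;
	if (outstanding_extents)
		*reserve_size = ITEM_COST * (outstanding_extents + 1);
	*reserve_size += ITEM_COST * csum_leaves;
	*qgroup_size = outstanding_extents * NODESIZE;	/* deliberate overestimate */
}

int main(void)
{
	uint64_t rsv, qgroup;

	inode_rsv_size(4, 1, &rsv, &qgroup);
	printf("reserve %llu bytes, qgroup %llu bytes\n",
	       (unsigned long long)rsv, (unsigned long long)qgroup);
	return 0;
}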
5885
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
5886
-{
5887
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5888
- unsigned nr_extents;
5889
- enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5890
- int ret = 0;
5891
- bool delalloc_lock = true;
5892
-
5893
- /* If we are a free space inode we need to not flush since we will be in
5894
- * the middle of a transaction commit. We also don't need the delalloc
5895
- * mutex since we won't race with anybody. We need this mostly to make
5896
- * lockdep shut its filthy mouth.
5897
- *
5898
- * If we have a transaction open (can happen if we call truncate_block
5899
- * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5900
- */
5901
- if (btrfs_is_free_space_inode(inode)) {
5902
- flush = BTRFS_RESERVE_NO_FLUSH;
5903
- delalloc_lock = false;
5904
- } else {
5905
- if (current->journal_info)
5906
- flush = BTRFS_RESERVE_FLUSH_LIMIT;
5907
-
5908
- if (btrfs_transaction_in_commit(fs_info))
5909
- schedule_timeout(1);
5910
- }
5911
-
5912
- if (delalloc_lock)
5913
- mutex_lock(&inode->delalloc_mutex);
5914
-
5915
- num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5916
-
5917
- /* Add our new extents and calculate the new rsv size. */
5918
- spin_lock(&inode->lock);
5919
- nr_extents = count_max_extents(num_bytes);
5920
- btrfs_mod_outstanding_extents(inode, nr_extents);
5921
- inode->csum_bytes += num_bytes;
5922
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5923
- spin_unlock(&inode->lock);
5924
-
5925
- ret = btrfs_inode_rsv_refill(inode, flush);
5926
- if (unlikely(ret))
5927
- goto out_fail;
5928
-
5929
- if (delalloc_lock)
5930
- mutex_unlock(&inode->delalloc_mutex);
5931
- return 0;
5932
-
5933
-out_fail:
5934
- spin_lock(&inode->lock);
5935
- nr_extents = count_max_extents(num_bytes);
5936
- btrfs_mod_outstanding_extents(inode, -nr_extents);
5937
- inode->csum_bytes -= num_bytes;
5938
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5939
- spin_unlock(&inode->lock);
5940
-
5941
- btrfs_inode_rsv_release(inode, true);
5942
- if (delalloc_lock)
5943
- mutex_unlock(&inode->delalloc_mutex);
5944
- return ret;
5945
-}
5946
-
5947
-/**
5948
- * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5949
- * @inode: the inode to release the reservation for.
5950
- * @num_bytes: the number of bytes we are releasing.
5951
- * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
5952
- *
5953
- * This will release the metadata reservation for an inode. This can be called
5954
- * once we complete IO for a given set of bytes to release their metadata
5955
- * reservations, or on error for the same reason.
5956
- */
5957
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
5958
- bool qgroup_free)
5959
-{
5960
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5961
-
5962
- num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5963
- spin_lock(&inode->lock);
5964
- inode->csum_bytes -= num_bytes;
5965
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5966
- spin_unlock(&inode->lock);
5967
-
5968
- if (btrfs_is_testing(fs_info))
5969
- return;
5970
-
5971
- btrfs_inode_rsv_release(inode, qgroup_free);
5972
-}
5973
-
5974
-/**
5975
- * btrfs_delalloc_release_extents - release our outstanding_extents
5976
- * @inode: the inode to balance the reservation for.
5977
- * @num_bytes: the number of bytes we originally reserved with
5978
- * @qgroup_free: do we need to free qgroup meta reservation or convert them.
5979
- *
5980
- * When we reserve space we increase outstanding_extents for the extents we may
5981
- * add. Once we've set the range as delalloc or created our ordered extents we
5982
- * have outstanding_extents to track the real usage, so we use this to free our
5983
- * temporarily tracked outstanding_extents. This _must_ be used in conjunction
5984
- * with btrfs_delalloc_reserve_metadata.
5985
- */
5986
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
5987
-{
5988
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5989
- unsigned num_extents;
5990
-
5991
- spin_lock(&inode->lock);
5992
- num_extents = count_max_extents(num_bytes);
5993
- btrfs_mod_outstanding_extents(inode, -num_extents);
5994
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5995
- spin_unlock(&inode->lock);
5996
-
5997
- if (btrfs_is_testing(fs_info))
5998
- return;
5999
-
6000
- btrfs_inode_rsv_release(inode, true);
6001
-}
6002
-
6003
-/**
6004
- * btrfs_delalloc_reserve_space - reserve data and metadata space for
6005
- * delalloc
6006
- * @inode: inode we're writing to
6007
- * @start: start range we are writing to
6008
- * @len: how long the range we are writing to
6009
- * @reserved: mandatory parameter, record actually reserved qgroup ranges of
6010
- * current reservation.
6011
- *
6012
- * This will do the following things
6013
- *
6014
- * o reserve space in data space info for num bytes
6015
- * and reserve precious corresponding qgroup space
6016
- * (Done in check_data_free_space)
6017
- *
6018
- * o reserve space for metadata space, based on the number of outstanding
6019
- * extents and how many csums will be needed
6020
- * also reserve metadata space in a per root over-reserve method.
6021
- * o add to the inodes->delalloc_bytes
6022
- * o add it to the fs_info's delalloc inodes list.
6023
- * (Above 3 all done in delalloc_reserve_metadata)
6024
- *
6025
- * Return 0 for success
6026
- * Return <0 for error (-ENOSPC or -EDQUOT)
6027
- */
6028
-int btrfs_delalloc_reserve_space(struct inode *inode,
6029
- struct extent_changeset **reserved, u64 start, u64 len)
6030
-{
6031
- int ret;
6032
-
6033
- ret = btrfs_check_data_free_space(inode, reserved, start, len);
6034
- if (ret < 0)
6035
- return ret;
6036
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6037
- if (ret < 0)
6038
- btrfs_free_reserved_data_space(inode, *reserved, start, len);
6039
- return ret;
6040
-}
6041
-
6042
-/**
6043
- * btrfs_delalloc_release_space - release data and metadata space for delalloc
6044
- * @inode: inode we're releasing space for
6045
- * @start: start position of the space already reserved
6046
- * @len: the len of the space already reserved
6047
- * @release_bytes: the len of the space we consumed or didn't use
6048
- *
6049
- * This function will release the metadata space that was not used and will
6050
- * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6051
- * list if there are no delalloc bytes left.
6052
- * Also it will handle the qgroup reserved space.
6053
- */
6054
-void btrfs_delalloc_release_space(struct inode *inode,
6055
- struct extent_changeset *reserved,
6056
- u64 start, u64 len, bool qgroup_free)
6057
-{
6058
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6059
- btrfs_free_reserved_data_space(inode, reserved, start, len);
6060
-}
6061
-
6062
-static int update_block_group(struct btrfs_trans_handle *trans,
6063
- struct btrfs_fs_info *info, u64 bytenr,
6064
- u64 num_bytes, int alloc)
6065
-{
6066
- struct btrfs_block_group_cache *cache = NULL;
6067
- u64 total = num_bytes;
6068
- u64 old_val;
6069
- u64 byte_in_group;
6070
- int factor;
6071
-
6072
- /* block accounting for super block */
6073
- spin_lock(&info->delalloc_root_lock);
6074
- old_val = btrfs_super_bytes_used(info->super_copy);
6075
- if (alloc)
6076
- old_val += num_bytes;
6077
- else
6078
- old_val -= num_bytes;
6079
- btrfs_set_super_bytes_used(info->super_copy, old_val);
6080
- spin_unlock(&info->delalloc_root_lock);
6081
-
6082
- while (total) {
6083
- cache = btrfs_lookup_block_group(info, bytenr);
6084
- if (!cache)
6085
- return -ENOENT;
6086
- factor = btrfs_bg_type_to_factor(cache->flags);
6087
-
6088
- /*
6089
- * If this block group has free space cache written out, we
6090
- * need to make sure to load it if we are removing space. This
6091
- * is because we need the unpinning stage to actually add the
6092
- * space back to the block group, otherwise we will leak space.
6093
- */
6094
- if (!alloc && cache->cached == BTRFS_CACHE_NO)
6095
- cache_block_group(cache, 1);
6096
-
6097
- byte_in_group = bytenr - cache->key.objectid;
6098
- WARN_ON(byte_in_group > cache->key.offset);
6099
-
6100
- spin_lock(&cache->space_info->lock);
6101
- spin_lock(&cache->lock);
6102
-
6103
- if (btrfs_test_opt(info, SPACE_CACHE) &&
6104
- cache->disk_cache_state < BTRFS_DC_CLEAR)
6105
- cache->disk_cache_state = BTRFS_DC_CLEAR;
6106
-
6107
- old_val = btrfs_block_group_used(&cache->item);
6108
- num_bytes = min(total, cache->key.offset - byte_in_group);
6109
- if (alloc) {
6110
- old_val += num_bytes;
6111
- btrfs_set_block_group_used(&cache->item, old_val);
6112
- cache->reserved -= num_bytes;
6113
- cache->space_info->bytes_reserved -= num_bytes;
6114
- cache->space_info->bytes_used += num_bytes;
6115
- cache->space_info->disk_used += num_bytes * factor;
6116
- spin_unlock(&cache->lock);
6117
- spin_unlock(&cache->space_info->lock);
6118
- } else {
6119
- old_val -= num_bytes;
6120
- btrfs_set_block_group_used(&cache->item, old_val);
6121
- cache->pinned += num_bytes;
6122
- cache->space_info->bytes_pinned += num_bytes;
6123
- cache->space_info->bytes_used -= num_bytes;
6124
- cache->space_info->disk_used -= num_bytes * factor;
6125
- spin_unlock(&cache->lock);
6126
- spin_unlock(&cache->space_info->lock);
6127
-
6128
- trace_btrfs_space_reservation(info, "pinned",
6129
- cache->space_info->flags,
6130
- num_bytes, 1);
6131
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6132
- num_bytes,
6133
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
6134
- set_extent_dirty(info->pinned_extents,
6135
- bytenr, bytenr + num_bytes - 1,
6136
- GFP_NOFS | __GFP_NOFAIL);
6137
- }
6138
-
6139
- spin_lock(&trans->transaction->dirty_bgs_lock);
6140
- if (list_empty(&cache->dirty_list)) {
6141
- list_add_tail(&cache->dirty_list,
6142
- &trans->transaction->dirty_bgs);
6143
- trans->transaction->num_dirty_bgs++;
6144
- btrfs_get_block_group(cache);
6145
- }
6146
- spin_unlock(&trans->transaction->dirty_bgs_lock);
6147
-
6148
- /*
6149
- * No longer have used bytes in this block group, queue it for
6150
- * deletion. We do this after adding the block group to the
6151
- * dirty list to avoid races between cleaner kthread and space
6152
- * cache writeout.
6153
- */
6154
- if (!alloc && old_val == 0)
6155
- btrfs_mark_bg_unused(cache);
6156
-
6157
- btrfs_put_block_group(cache);
6158
- total -= num_bytes;
6159
- bytenr += num_bytes;
6160
- }
6161
- return 0;
61622518 }
61632519
61642520 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
61652521 {
6166
- struct btrfs_block_group_cache *cache;
2522
+ struct btrfs_block_group *cache;
61672523 u64 bytenr;
61682524
61692525 spin_lock(&fs_info->block_group_cache_lock);
....@@ -6177,20 +2533,23 @@
61772533 if (!cache)
61782534 return 0;
61792535
6180
- bytenr = cache->key.objectid;
2536
+ bytenr = cache->start;
61812537 btrfs_put_block_group(cache);
61822538
61832539 return bytenr;
61842540 }
61852541
6186
-static int pin_down_extent(struct btrfs_fs_info *fs_info,
6187
- struct btrfs_block_group_cache *cache,
2542
+static int pin_down_extent(struct btrfs_trans_handle *trans,
2543
+ struct btrfs_block_group *cache,
61882544 u64 bytenr, u64 num_bytes, int reserved)
61892545 {
2546
+ struct btrfs_fs_info *fs_info = cache->fs_info;
2547
+
61902548 spin_lock(&cache->space_info->lock);
61912549 spin_lock(&cache->lock);
61922550 cache->pinned += num_bytes;
6193
- cache->space_info->bytes_pinned += num_bytes;
2551
+ btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2552
+ num_bytes);
61942553 if (reserved) {
61952554 cache->reserved -= num_bytes;
61962555 cache->space_info->bytes_reserved -= num_bytes;
....@@ -6198,27 +2557,21 @@
61982557 spin_unlock(&cache->lock);
61992558 spin_unlock(&cache->space_info->lock);
62002559
6201
- trace_btrfs_space_reservation(fs_info, "pinned",
6202
- cache->space_info->flags, num_bytes, 1);
6203
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6204
- num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6205
- set_extent_dirty(fs_info->pinned_extents, bytenr,
2560
+ __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
2561
+ set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
62062562 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
62072563 return 0;
62082564 }
62092565
6210
-/*
6211
- * this function must be called within transaction
6212
- */
6213
-int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
2566
+int btrfs_pin_extent(struct btrfs_trans_handle *trans,
62142567 u64 bytenr, u64 num_bytes, int reserved)
62152568 {
6216
- struct btrfs_block_group_cache *cache;
2569
+ struct btrfs_block_group *cache;
62172570
6218
- cache = btrfs_lookup_block_group(fs_info, bytenr);
2571
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
62192572 BUG_ON(!cache); /* Logic error */
62202573
6221
- pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
2574
+ pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
62222575
62232576 btrfs_put_block_group(cache);
62242577 return 0;
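pin_down_extent() in its reworked form above takes the transaction handle and marks the pinned range in the transaction's own pinned_extents tree, but the counter accounting is unchanged: pinned goes up on both the block group and its space_info, and a previously reserved extent gives its reservation back at the same time. A stand-alone sketch of just those counter updates, with invented struct fields (the real code also updates total_bytes_pinned and dirties the extent range):

#include <stdint.h>
#include <stdio.h>

struct space_info { uint64_t bytes_pinned, bytes_reserved; };
struct block_group { uint64_t pinned, reserved; struct space_info *sinfo; };

/* model of the counter updates done under the two spinlocks */
static void pin_down(struct block_group *bg, uint64_t num_bytes, int reserved)
{
	bg->pinned += num_bytes;
	bg->sinfo->bytes_pinned += num_bytes;
	if (reserved) {
		bg->reserved -= num_bytes;
		bg->sinfo->bytes_reserved -= num_bytes;
	}
}

int main(void)
{
	struct space_info si = { 0, 1 << 20 };
	struct block_group bg = { 0, 1 << 20, &si };

	pin_down(&bg, 64 << 10, 1);	/* pin a reserved 64K extent */
	printf("pinned %llu, reserved %llu\n",
	       (unsigned long long)si.bytes_pinned,
	       (unsigned long long)si.bytes_reserved);
	return 0;
}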
....@@ -6227,13 +2580,15 @@
62272580 /*
62282581 * this function must be called within transaction
62292582 */
6230
-int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
2583
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
62312584 u64 bytenr, u64 num_bytes)
62322585 {
6233
- struct btrfs_block_group_cache *cache;
2586
+ struct btrfs_block_group *cache;
62342587 int ret;
62352588
6236
- cache = btrfs_lookup_block_group(fs_info, bytenr);
2589
+ btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
2590
+
2591
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
62372592 if (!cache)
62382593 return -EINVAL;
62392594
....@@ -6243,9 +2598,9 @@
62432598 * to one because the slow code to read in the free extents does check
62442599 * the pinned extents.
62452600 */
6246
- cache_block_group(cache, 1);
2601
+ btrfs_cache_block_group(cache, 1);
62472602
6248
- pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
2603
+ pin_down_extent(trans, cache, bytenr, num_bytes, 0);
62492604
62502605 /* remove us from the free space cache (if we're there at all) */
62512606 ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
....@@ -6257,25 +2612,26 @@
62572612 u64 start, u64 num_bytes)
62582613 {
62592614 int ret;
6260
- struct btrfs_block_group_cache *block_group;
2615
+ struct btrfs_block_group *block_group;
62612616 struct btrfs_caching_control *caching_ctl;
62622617
62632618 block_group = btrfs_lookup_block_group(fs_info, start);
62642619 if (!block_group)
62652620 return -EINVAL;
62662621
6267
- cache_block_group(block_group, 0);
6268
- caching_ctl = get_caching_control(block_group);
2622
+ btrfs_cache_block_group(block_group, 0);
2623
+ caching_ctl = btrfs_get_caching_control(block_group);
62692624
62702625 if (!caching_ctl) {
62712626 /* Logic error */
6272
- BUG_ON(!block_group_cache_done(block_group));
2627
+ BUG_ON(!btrfs_block_group_done(block_group));
62732628 ret = btrfs_remove_free_space(block_group, start, num_bytes);
62742629 } else {
62752630 mutex_lock(&caching_ctl->mutex);
62762631
62772632 if (start >= caching_ctl->progress) {
6278
- ret = add_excluded_extent(fs_info, start, num_bytes);
2633
+ ret = btrfs_add_excluded_extent(fs_info, start,
2634
+ num_bytes);
62792635 } else if (start + num_bytes <= caching_ctl->progress) {
62802636 ret = btrfs_remove_free_space(block_group,
62812637 start, num_bytes);
....@@ -6289,19 +2645,20 @@
62892645 num_bytes = (start + num_bytes) -
62902646 caching_ctl->progress;
62912647 start = caching_ctl->progress;
6292
- ret = add_excluded_extent(fs_info, start, num_bytes);
2648
+ ret = btrfs_add_excluded_extent(fs_info, start,
2649
+ num_bytes);
62932650 }
62942651 out_lock:
62952652 mutex_unlock(&caching_ctl->mutex);
6296
- put_caching_control(caching_ctl);
2653
+ btrfs_put_caching_control(caching_ctl);
62972654 }
62982655 btrfs_put_block_group(block_group);
62992656 return ret;
63002657 }
63012658
6302
-int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
6303
- struct extent_buffer *eb)
2659
+int btrfs_exclude_logged_extents(struct extent_buffer *eb)
63042660 {
2661
+ struct btrfs_fs_info *fs_info = eb->fs_info;
63052662 struct btrfs_file_extent_item *item;
63062663 struct btrfs_key key;
63072664 int found_type;
....@@ -6332,146 +2689,9 @@
63322689 }
63332690
63342691 static void
6335
-btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
2692
+btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
63362693 {
63372694 atomic_inc(&bg->reservations);
6338
-}
6339
-
6340
-void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6341
- const u64 start)
6342
-{
6343
- struct btrfs_block_group_cache *bg;
6344
-
6345
- bg = btrfs_lookup_block_group(fs_info, start);
6346
- ASSERT(bg);
6347
- if (atomic_dec_and_test(&bg->reservations))
6348
- wake_up_var(&bg->reservations);
6349
- btrfs_put_block_group(bg);
6350
-}
6351
-
6352
-void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6353
-{
6354
- struct btrfs_space_info *space_info = bg->space_info;
6355
-
6356
- ASSERT(bg->ro);
6357
-
6358
- if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6359
- return;
6360
-
6361
- /*
6362
- * Our block group is read only but before we set it to read only,
6363
- * some task might have had allocated an extent from it already, but it
6364
- * has not yet created a respective ordered extent (and added it to a
6365
- * root's list of ordered extents).
6366
- * Therefore wait for any task currently allocating extents, since the
6367
- * block group's reservations counter is incremented while a read lock
6368
- * on the groups' semaphore is held and decremented after releasing
6369
- * the read access on that semaphore and creating the ordered extent.
6370
- */
6371
- down_write(&space_info->groups_sem);
6372
- up_write(&space_info->groups_sem);
6373
-
6374
- wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6375
-}
6376
-
6377
-/**
6378
- * btrfs_add_reserved_bytes - update the block_group and space info counters
6379
- * @cache: The cache we are manipulating
6380
- * @ram_bytes: The number of bytes of file content, and will be same to
6381
- * @num_bytes except for the compress path.
6382
- * @num_bytes: The number of bytes in question
6383
- * @delalloc: The blocks are allocated for the delalloc write
6384
- *
6385
- * This is called by the allocator when it reserves space. If this is a
6386
- * reservation and the block group has become read only we cannot make the
6387
- * reservation and return -EAGAIN, otherwise this function always succeeds.
6388
- */
6389
-static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6390
- u64 ram_bytes, u64 num_bytes, int delalloc)
6391
-{
6392
- struct btrfs_space_info *space_info = cache->space_info;
6393
- int ret = 0;
6394
-
6395
- spin_lock(&space_info->lock);
6396
- spin_lock(&cache->lock);
6397
- if (cache->ro) {
6398
- ret = -EAGAIN;
6399
- } else {
6400
- cache->reserved += num_bytes;
6401
- space_info->bytes_reserved += num_bytes;
6402
-
6403
- trace_btrfs_space_reservation(cache->fs_info,
6404
- "space_info", space_info->flags,
6405
- ram_bytes, 0);
6406
- space_info->bytes_may_use -= ram_bytes;
6407
- if (delalloc)
6408
- cache->delalloc_bytes += num_bytes;
6409
- }
6410
- spin_unlock(&cache->lock);
6411
- spin_unlock(&space_info->lock);
6412
- return ret;
6413
-}
6414
-
6415
-/**
6416
- * btrfs_free_reserved_bytes - update the block_group and space info counters
6417
- * @cache: The cache we are manipulating
6418
- * @num_bytes: The number of bytes in question
6419
- * @delalloc: The blocks are allocated for the delalloc write
6420
- *
6421
- * This is called by somebody who is freeing space that was never actually used
6422
- * on disk. For example if you reserve some space for a new leaf in transaction
6423
- * A and before transaction A commits you free that leaf, you call this with
6424
- * reserve set to 0 in order to clear the reservation.
6425
- */
6426
-
6427
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6428
- u64 num_bytes, int delalloc)
6429
-{
6430
- struct btrfs_space_info *space_info = cache->space_info;
6431
- int ret = 0;
6432
-
6433
- spin_lock(&space_info->lock);
6434
- spin_lock(&cache->lock);
6435
- if (cache->ro)
6436
- space_info->bytes_readonly += num_bytes;
6437
- cache->reserved -= num_bytes;
6438
- space_info->bytes_reserved -= num_bytes;
6439
- space_info->max_extent_size = 0;
6440
-
6441
- if (delalloc)
6442
- cache->delalloc_bytes -= num_bytes;
6443
- spin_unlock(&cache->lock);
6444
- spin_unlock(&space_info->lock);
6445
- return ret;
6446
-}
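The two removed helpers above are mirror images of the same accounting: reserving moves bytes out of bytes_may_use and into bytes_reserved (failing with -EAGAIN on a read-only group), while freeing hands the bytes back, crediting bytes_readonly when the group has since gone read only. A simplified single-threaded model of that bookkeeping, with made-up struct names and no locking:

#include <stdint.h>
#include <stdbool.h>
#include <errno.h>

struct space_info_model {
	uint64_t bytes_may_use;
	uint64_t bytes_reserved;
	uint64_t bytes_readonly;
};

struct block_group_model {
	struct space_info_model *space_info;
	uint64_t reserved;
	bool ro;
};

static int model_add_reserved(struct block_group_model *bg,
			      uint64_t ram_bytes, uint64_t num_bytes)
{
	if (bg->ro)
		return -EAGAIN;	/* read-only groups take no new reservations */
	bg->reserved += num_bytes;
	bg->space_info->bytes_reserved += num_bytes;
	bg->space_info->bytes_may_use -= ram_bytes;
	return 0;
}

static void model_free_reserved(struct block_group_model *bg, uint64_t num_bytes)
{
	if (bg->ro)
		bg->space_info->bytes_readonly += num_bytes;
	bg->reserved -= num_bytes;
	bg->space_info->bytes_reserved -= num_bytes;
}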
6447
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6448
-{
6449
- struct btrfs_caching_control *next;
6450
- struct btrfs_caching_control *caching_ctl;
6451
- struct btrfs_block_group_cache *cache;
6452
-
6453
- down_write(&fs_info->commit_root_sem);
6454
-
6455
- list_for_each_entry_safe(caching_ctl, next,
6456
- &fs_info->caching_block_groups, list) {
6457
- cache = caching_ctl->block_group;
6458
- if (block_group_cache_done(cache)) {
6459
- cache->last_byte_to_unpin = (u64)-1;
6460
- list_del_init(&caching_ctl->list);
6461
- put_caching_control(caching_ctl);
6462
- } else {
6463
- cache->last_byte_to_unpin = caching_ctl->progress;
6464
- }
6465
- }
6466
-
6467
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6468
- fs_info->pinned_extents = &fs_info->freed_extents[1];
6469
- else
6470
- fs_info->pinned_extents = &fs_info->freed_extents[0];
6471
-
6472
- up_write(&fs_info->commit_root_sem);
6473
-
6474
- update_global_block_rsv(fs_info);
64752695 }
64762696
64772697 /*
....@@ -6507,7 +2727,7 @@
65072727 u64 start, u64 end,
65082728 const bool return_free_space)
65092729 {
6510
- struct btrfs_block_group_cache *cache = NULL;
2730
+ struct btrfs_block_group *cache = NULL;
65112731 struct btrfs_space_info *space_info;
65122732 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
65132733 struct btrfs_free_cluster *cluster = NULL;
....@@ -6519,7 +2739,7 @@
65192739 while (start <= end) {
65202740 readonly = false;
65212741 if (!cache ||
6522
- start >= cache->key.objectid + cache->key.offset) {
2742
+ start >= cache->start + cache->length) {
65232743 if (cache)
65242744 btrfs_put_block_group(cache);
65252745 total_unpinned = 0;
....@@ -6532,13 +2752,13 @@
65322752 empty_cluster <<= 1;
65332753 }
65342754
6535
- len = cache->key.objectid + cache->key.offset - start;
2755
+ len = cache->start + cache->length - start;
65362756 len = min(len, end + 1 - start);
65372757
6538
- if (start < cache->last_byte_to_unpin) {
6539
- len = min(len, cache->last_byte_to_unpin - start);
6540
- if (return_free_space)
6541
- btrfs_add_free_space(cache, start, len);
2758
+ if (start < cache->last_byte_to_unpin && return_free_space) {
2759
+ u64 add_len = min(len, cache->last_byte_to_unpin - start);
2760
+
2761
+ btrfs_add_free_space(cache, start, add_len);
65422762 }
65432763
65442764 start += len;
....@@ -6561,13 +2781,9 @@
65612781 spin_lock(&space_info->lock);
65622782 spin_lock(&cache->lock);
65632783 cache->pinned -= len;
6564
- space_info->bytes_pinned -= len;
6565
-
6566
- trace_btrfs_space_reservation(fs_info, "pinned",
6567
- space_info->flags, len, 0);
2784
+ btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
65682785 space_info->max_extent_size = 0;
6569
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
6570
- -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
2786
+ __btrfs_mod_total_bytes_pinned(space_info, -len);
65712787 if (cache->ro) {
65722788 space_info->bytes_readonly += len;
65732789 readonly = true;
....@@ -6582,21 +2798,17 @@
65822798 to_add = min(len, global_rsv->size -
65832799 global_rsv->reserved);
65842800 global_rsv->reserved += to_add;
6585
- space_info->bytes_may_use += to_add;
2801
+ btrfs_space_info_update_bytes_may_use(fs_info,
2802
+ space_info, to_add);
65862803 if (global_rsv->reserved >= global_rsv->size)
65872804 global_rsv->full = 1;
6588
- trace_btrfs_space_reservation(fs_info,
6589
- "space_info",
6590
- space_info->flags,
6591
- to_add, 1);
65922805 len -= to_add;
65932806 }
65942807 spin_unlock(&global_rsv->lock);
6595
- /* Add to any tickets we may have */
6596
- if (len)
6597
- space_info_add_new_bytes(fs_info, space_info,
6598
- len);
65992808 }
2809
+ /* Add to any tickets we may have */
2810
+ if (!readonly && return_free_space && len)
2811
+ btrfs_try_granting_tickets(fs_info, space_info);
66002812 spin_unlock(&space_info->lock);
66012813 }
66022814
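The unpin path in this hunk applies a simple split to every freed range: the bytes first top up the global block reserve, but only until it reaches its target size, and whatever is left over is what the ticketing code may grant to waiters. A stand-alone sketch of that split, using hypothetical names:

#include <stdint.h>

struct rsv_model {
	uint64_t size;		/* target size of the reserve */
	uint64_t reserved;	/* bytes currently held */
};

/* Returns the number of freed bytes that did NOT fit into the reserve. */
static uint64_t refill_global_rsv(struct rsv_model *rsv, uint64_t len)
{
	if (rsv->reserved < rsv->size) {
		uint64_t to_add = rsv->size - rsv->reserved;

		if (to_add > len)
			to_add = len;
		rsv->reserved += to_add;
		len -= to_add;
	}
	return len;	/* leftover, available for granting tickets */
}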
....@@ -6608,19 +2820,16 @@
66082820 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
66092821 {
66102822 struct btrfs_fs_info *fs_info = trans->fs_info;
6611
- struct btrfs_block_group_cache *block_group, *tmp;
2823
+ struct btrfs_block_group *block_group, *tmp;
66122824 struct list_head *deleted_bgs;
66132825 struct extent_io_tree *unpin;
66142826 u64 start;
66152827 u64 end;
66162828 int ret;
66172829
6618
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6619
- unpin = &fs_info->freed_extents[1];
6620
- else
6621
- unpin = &fs_info->freed_extents[0];
2830
+ unpin = &trans->transaction->pinned_extents;
66222831
6623
- while (!trans->aborted) {
2832
+ while (!TRANS_ABORTED(trans)) {
66242833 struct extent_state *cached_state = NULL;
66252834
66262835 mutex_lock(&fs_info->unused_bg_unpin_mutex);
....@@ -6630,8 +2839,11 @@
66302839 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
66312840 break;
66322841 }
2842
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
2843
+ clear_extent_bits(&fs_info->excluded_extents, start,
2844
+ end, EXTENT_UPTODATE);
66332845
6634
- if (btrfs_test_opt(fs_info, DISCARD))
2846
+ if (btrfs_test_opt(fs_info, DISCARD_SYNC))
66352847 ret = btrfs_discard_extent(fs_info, start,
66362848 end + 1 - start, NULL);
66372849
....@@ -6640,6 +2852,11 @@
66402852 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
66412853 free_extent_state(cached_state);
66422854 cond_resched();
2855
+ }
2856
+
2857
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2858
+ btrfs_discard_calc_delay(&fs_info->discard_ctl);
2859
+ btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
66432860 }
66442861
66452862 /*
....@@ -6652,14 +2869,14 @@
66522869 u64 trimmed = 0;
66532870
66542871 ret = -EROFS;
6655
- if (!trans->aborted)
2872
+ if (!TRANS_ABORTED(trans))
66562873 ret = btrfs_discard_extent(fs_info,
6657
- block_group->key.objectid,
6658
- block_group->key.offset,
2874
+ block_group->start,
2875
+ block_group->length,
66592876 &trimmed);
66602877
66612878 list_del_init(&block_group->bg_list);
6662
- btrfs_put_block_group_trimming(block_group);
2879
+ btrfs_unfreeze_block_group(block_group);
66632880 btrfs_put_block_group(block_group);
66642881
66652882 if (ret) {
....@@ -6673,6 +2890,65 @@
66732890 return 0;
66742891 }
66752892
2893
+/*
2894
+ * Drop one or more refs of @node.
2895
+ *
2896
+ * 1. Locate the extent refs.
2897
+ * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
2898
+ * Locate it, then reduce the refs number or remove the ref line completely.
2899
+ *
2900
+ * 2. Update the refs count in EXTENT/METADATA_ITEM
2901
+ *
2902
+ * Inline backref case:
2903
+ *
2904
+ * in extent tree we have:
2905
+ *
2906
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2907
+ * refs 2 gen 6 flags DATA
2908
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
2909
+ * extent data backref root FS_TREE objectid 257 offset 0 count 1
2910
+ *
2911
+ * This function gets called with:
2912
+ *
2913
+ * node->bytenr = 13631488
2914
+ * node->num_bytes = 1048576
2915
+ * root_objectid = FS_TREE
2916
+ * owner_objectid = 257
2917
+ * owner_offset = 0
2918
+ * refs_to_drop = 1
2919
+ *
2920
+ * Then we should get something like:
2921
+ *
2922
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2923
+ * refs 1 gen 6 flags DATA
2924
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
2925
+ *
2926
+ * Keyed backref case:
2927
+ *
2928
+ * in extent tree we have:
2929
+ *
2930
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2931
+ * refs 754 gen 6 flags DATA
2932
+ * [...]
2933
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2934
+ * extent data backref root FS_TREE objectid 866 offset 0 count 1
2935
+ *
2936
+ * This function gets called with:
2937
+ *
2938
+ * node->bytenr = 13631488
2939
+ * node->num_bytes = 1048576
2940
+ * root_objectid = FS_TREE
2941
+ * owner_objectid = 866
2942
+ * owner_offset = 0
2943
+ * refs_to_drop = 1
2944
+ *
2945
+ * Then we should get something like:
2946
+ *
2947
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2948
+ * refs 753 gen 6 flags DATA
2949
+ *
2950
+ * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2951
+ */
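Before the function itself, a condensed model of the two outcomes the worked examples above illustrate: if the extent item still has more refs than are being dropped it is only decremented (refs 2 -> 1), otherwise the last ref is going away and the item, plus any keyed backref, must be deleted (refs 1 -> 0). Toy types only, not the on-disk structures:

#include <stdint.h>
#include <stdbool.h>

struct extent_item_model {
	uint64_t refs;		/* refs field of the EXTENT/METADATA_ITEM */
};

/* Returns true when the extent item itself must be removed. */
static bool model_drop_refs(struct extent_item_model *ei, uint64_t refs_to_drop)
{
	if (ei->refs > refs_to_drop) {
		/* Only the backref goes away; the item survives. */
		ei->refs -= refs_to_drop;
		return false;
	}
	/* Last reference: delete the item and free the extent itself. */
	ei->refs = 0;
	return true;
}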
66762952 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
66772953 struct btrfs_delayed_ref_node *node, u64 parent,
66782954 u64 root_objectid, u64 owner_objectid,
....@@ -6702,11 +2978,18 @@
67022978 if (!path)
67032979 return -ENOMEM;
67042980
6705
- path->reada = READA_FORWARD;
67062981 path->leave_spinning = 1;
67072982
67082983 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6709
- BUG_ON(!is_data && refs_to_drop != 1);
2984
+
2985
+ if (!is_data && refs_to_drop != 1) {
2986
+ btrfs_crit(info,
2987
+"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
2988
+ node->bytenr, refs_to_drop);
2989
+ ret = -EINVAL;
2990
+ btrfs_abort_transaction(trans, ret);
2991
+ goto out;
2992
+ }
67102993
67112994 if (is_data)
67122995 skinny_metadata = false;
....@@ -6715,6 +2998,13 @@
67152998 parent, root_objectid, owner_objectid,
67162999 owner_offset);
67173000 if (ret == 0) {
3001
+ /*
3002
+ * Either the inline backref or the SHARED_DATA_REF/
3003
+ * SHARED_BLOCK_REF is found
3004
+ *
3005
+ * Here is a quick path to locate EXTENT/METADATA_ITEM.
3006
+ * It's possible the EXTENT/METADATA_ITEM is near current slot.
3007
+ */
67183008 extent_slot = path->slots[0];
67193009 while (extent_slot >= 0) {
67203010 btrfs_item_key_to_cpu(path->nodes[0], &key,
....@@ -6731,13 +3021,21 @@
67313021 found_extent = 1;
67323022 break;
67333023 }
3024
+
3025
+ /* Quick path didn't find the EXTENT/METADATA_ITEM */
67343026 if (path->slots[0] - extent_slot > 5)
67353027 break;
67363028 extent_slot--;
67373029 }
67383030
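The quick path above scans backwards from the slot where the backref was found, because the EXTENT/METADATA_ITEM usually sits only a few slots earlier in the same leaf. A simplified stand-alone version of that bounded scan (a plain array of keys, matching only the regular extent item type, no skinny metadata handling):

#include <stdint.h>

struct key_model {
	uint64_t objectid;
	uint8_t type;
};

#define EXTENT_ITEM_KEY	168	/* value of BTRFS_EXTENT_ITEM_KEY */
#define MAX_BACKSCAN	5

/* Returns the slot of the extent item, or -1 when the quick path fails. */
static int quick_find_extent_item(const struct key_model *keys, int start_slot,
				  uint64_t bytenr)
{
	int slot;

	for (slot = start_slot;
	     slot >= 0 && start_slot - slot <= MAX_BACKSCAN; slot--) {
		if (keys[slot].objectid == bytenr &&
		    keys[slot].type == EXTENT_ITEM_KEY)
			return slot;
	}
	return -1;	/* fall back to the slow btrfs_search_slot() path */
}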
67393031 if (!found_extent) {
6740
- BUG_ON(iref);
3032
+ if (iref) {
3033
+ btrfs_crit(info,
3034
+"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
3035
+ btrfs_abort_transaction(trans, -EUCLEAN);
3036
+ goto err_dump;
3037
+ }
3038
+ /* Must be SHARED_* item, remove the backref first */
67413039 ret = remove_extent_backref(trans, path, NULL,
67423040 refs_to_drop,
67433041 is_data, &last_ref);
....@@ -6748,6 +3046,7 @@
67483046 btrfs_release_path(path);
67493047 path->leave_spinning = 1;
67503048
3049
+ /* Slow path to locate EXTENT/METADATA_ITEM */
67513050 key.objectid = bytenr;
67523051 key.type = BTRFS_EXTENT_ITEM_KEY;
67533052 key.offset = num_bytes;
....@@ -6822,19 +3121,26 @@
68223121 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
68233122 key.type == BTRFS_EXTENT_ITEM_KEY) {
68243123 struct btrfs_tree_block_info *bi;
6825
- BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
3124
+ if (item_size < sizeof(*ei) + sizeof(*bi)) {
3125
+ btrfs_crit(info,
3126
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
3127
+ key.objectid, key.type, key.offset,
3128
+ owner_objectid, item_size,
3129
+ sizeof(*ei) + sizeof(*bi));
3130
+ btrfs_abort_transaction(trans, -EUCLEAN);
3131
+ goto err_dump;
3132
+ }
68263133 bi = (struct btrfs_tree_block_info *)(ei + 1);
68273134 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
68283135 }
68293136
68303137 refs = btrfs_extent_refs(leaf, ei);
68313138 if (refs < refs_to_drop) {
6832
- btrfs_err(info,
6833
- "trying to drop %d refs but we only have %Lu for bytenr %Lu",
3139
+ btrfs_crit(info,
3140
+ "trying to drop %d refs but we only have %llu for bytenr %llu",
68343141 refs_to_drop, refs, bytenr);
6835
- ret = -EINVAL;
6836
- btrfs_abort_transaction(trans, ret);
6837
- goto out;
3142
+ btrfs_abort_transaction(trans, -EUCLEAN);
3143
+ goto err_dump;
68383144 }
68393145 refs -= refs_to_drop;
68403146
....@@ -6846,7 +3152,12 @@
68463152 * be updated by remove_extent_backref
68473153 */
68483154 if (iref) {
6849
- BUG_ON(!found_extent);
3155
+ if (!found_extent) {
3156
+ btrfs_crit(info,
3157
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
3158
+ btrfs_abort_transaction(trans, -EUCLEAN);
3159
+ goto err_dump;
3160
+ }
68503161 } else {
68513162 btrfs_set_extent_refs(leaf, ei, refs);
68523163 btrfs_mark_buffer_dirty(leaf);
....@@ -6861,13 +3172,39 @@
68613172 }
68623173 }
68633174 } else {
3175
+ /* In this branch refs == 1 */
68643176 if (found_extent) {
6865
- BUG_ON(is_data && refs_to_drop !=
6866
- extent_data_ref_count(path, iref));
3177
+ if (is_data && refs_to_drop !=
3178
+ extent_data_ref_count(path, iref)) {
3179
+ btrfs_crit(info,
3180
+ "invalid refs_to_drop, current refs %u refs_to_drop %u",
3181
+ extent_data_ref_count(path, iref),
3182
+ refs_to_drop);
3183
+ btrfs_abort_transaction(trans, -EUCLEAN);
3184
+ goto err_dump;
3185
+ }
68673186 if (iref) {
6868
- BUG_ON(path->slots[0] != extent_slot);
3187
+ if (path->slots[0] != extent_slot) {
3188
+ btrfs_crit(info,
3189
+"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
3190
+ key.objectid, key.type,
3191
+ key.offset);
3192
+ btrfs_abort_transaction(trans, -EUCLEAN);
3193
+ goto err_dump;
3194
+ }
68693195 } else {
6870
- BUG_ON(path->slots[0] != extent_slot + 1);
3196
+ /*
3197
+ * No inline ref, we must be at SHARED_* item,
3198
+ * And it's single ref, it must be:
3199
+ * | extent_slot ||extent_slot + 1|
3200
+ * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3201
+ */
3202
+ if (path->slots[0] != extent_slot + 1) {
3203
+ btrfs_crit(info,
3204
+ "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
3205
+ btrfs_abort_transaction(trans, -EUCLEAN);
3206
+ goto err_dump;
3207
+ }
68713208 path->slots[0] = extent_slot;
68723209 num_to_del = 2;
68733210 }
....@@ -6897,7 +3234,7 @@
68973234 goto out;
68983235 }
68993236
6900
- ret = update_block_group(trans, info, bytenr, num_bytes, 0);
3237
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
69013238 if (ret) {
69023239 btrfs_abort_transaction(trans, ret);
69033240 goto out;
....@@ -6908,6 +3245,19 @@
69083245 out:
69093246 btrfs_free_path(path);
69103247 return ret;
3248
+err_dump:
3249
+ /*
3250
+ * Leaf dump can take up a lot of log buffer, so we only do full leaf
3251
+ * dump for debug build.
3252
+ */
3253
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
3254
+ btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
3255
+ path->slots[0], extent_slot);
3256
+ btrfs_print_leaf(path->nodes[0]);
3257
+ }
3258
+
3259
+ btrfs_free_path(path);
3260
+ return -EUCLEAN;
69113261 }
69123262
69133263 /*
....@@ -6930,15 +3280,11 @@
69303280 goto out_delayed_unlock;
69313281
69323282 spin_lock(&head->lock);
6933
- if (!RB_EMPTY_ROOT(&head->ref_tree))
3283
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
69343284 goto out;
69353285
6936
- if (head->extent_op) {
6937
- if (!head->must_insert_reserved)
6938
- goto out;
6939
- btrfs_free_delayed_extent_op(head->extent_op);
6940
- head->extent_op = NULL;
6941
- }
3286
+ if (cleanup_extent_op(head) != NULL)
3287
+ goto out;
69423288
69433289 /*
69443290 * waiting for the lock here would deadlock. If someone else has it
....@@ -6947,22 +3293,9 @@
69473293 if (!mutex_trylock(&head->mutex))
69483294 goto out;
69493295
6950
- /*
6951
- * at this point we have a head with no other entries. Go
6952
- * ahead and process it.
6953
- */
6954
- rb_erase(&head->href_node, &delayed_refs->href_root);
6955
- RB_CLEAR_NODE(&head->href_node);
6956
- atomic_dec(&delayed_refs->num_entries);
6957
-
6958
- /*
6959
- * we don't take a ref on the node because we're removing it from the
6960
- * tree, so we just steal the ref the tree was holding.
6961
- */
6962
- delayed_refs->num_heads--;
6963
- if (head->processing == 0)
6964
- delayed_refs->num_heads_ready--;
3296
+ btrfs_delete_ref_head(delayed_refs, head);
69653297 head->processing = 0;
3298
+
69663299 spin_unlock(&head->lock);
69673300 spin_unlock(&delayed_refs->lock);
69683301
....@@ -6970,6 +3303,7 @@
69703303 if (head->must_insert_reserved)
69713304 ret = 1;
69723305
3306
+ btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
69733307 mutex_unlock(&head->mutex);
69743308 btrfs_put_delayed_ref_head(head);
69753309 return ret;
....@@ -6987,28 +3321,22 @@
69873321 u64 parent, int last_ref)
69883322 {
69893323 struct btrfs_fs_info *fs_info = root->fs_info;
6990
- int pin = 1;
3324
+ struct btrfs_ref generic_ref = { 0 };
69913325 int ret;
69923326
6993
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6994
- int old_ref_mod, new_ref_mod;
3327
+ btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3328
+ buf->start, buf->len, parent);
3329
+ btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3330
+ root->root_key.objectid);
69953331
6996
- btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
6997
- root->root_key.objectid,
6998
- btrfs_header_level(buf), 0,
6999
- BTRFS_DROP_DELAYED_REF);
7000
- ret = btrfs_add_delayed_tree_ref(trans, buf->start,
7001
- buf->len, parent,
7002
- root->root_key.objectid,
7003
- btrfs_header_level(buf),
7004
- BTRFS_DROP_DELAYED_REF, NULL,
7005
- &old_ref_mod, &new_ref_mod);
3332
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3333
+ btrfs_ref_tree_mod(fs_info, &generic_ref);
3334
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
70063335 BUG_ON(ret); /* -ENOMEM */
7007
- pin = old_ref_mod >= 0 && new_ref_mod < 0;
70083336 }
70093337
70103338 if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7011
- struct btrfs_block_group_cache *cache;
3339
+ struct btrfs_block_group *cache;
70123340
70133341 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
70143342 ret = check_ref_cleanup(trans, buf->start);
....@@ -7016,12 +3344,10 @@
70163344 goto out;
70173345 }
70183346
7019
- pin = 0;
70203347 cache = btrfs_lookup_block_group(fs_info, buf->start);
70213348
70223349 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7023
- pin_down_extent(fs_info, cache, buf->start,
7024
- buf->len, 1);
3350
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
70253351 btrfs_put_block_group(cache);
70263352 goto out;
70273353 }
....@@ -7034,10 +3360,6 @@
70343360 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
70353361 }
70363362 out:
7037
- if (pin)
7038
- add_pinned_bytes(fs_info, buf->len, true,
7039
- root->root_key.objectid);
7040
-
70413363 if (last_ref) {
70423364 /*
70433365 * Deleting the buffer, clear the corrupt flag since it doesn't
....@@ -7048,120 +3370,56 @@
70483370 }
70493371
70503372 /* Can return -ENOMEM */
7051
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
7052
- struct btrfs_root *root,
7053
- u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7054
- u64 owner, u64 offset)
3373
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
70553374 {
7056
- struct btrfs_fs_info *fs_info = root->fs_info;
7057
- int old_ref_mod, new_ref_mod;
3375
+ struct btrfs_fs_info *fs_info = trans->fs_info;
70583376 int ret;
70593377
70603378 if (btrfs_is_testing(fs_info))
70613379 return 0;
70623380
7063
- if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
7064
- btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
7065
- root_objectid, owner, offset,
7066
- BTRFS_DROP_DELAYED_REF);
7067
-
70683381 /*
70693382 * tree log blocks never actually go into the extent allocation
70703383 * tree, just update pinning info and exit early.
70713384 */
7072
- if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7073
- WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
3385
+ if ((ref->type == BTRFS_REF_METADATA &&
3386
+ ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3387
+ (ref->type == BTRFS_REF_DATA &&
3388
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
70743389 /* unlocks the pinned mutex */
7075
- btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7076
- old_ref_mod = new_ref_mod = 0;
3390
+ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
70773391 ret = 0;
7078
- } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7079
- ret = btrfs_add_delayed_tree_ref(trans, bytenr,
7080
- num_bytes, parent,
7081
- root_objectid, (int)owner,
7082
- BTRFS_DROP_DELAYED_REF, NULL,
7083
- &old_ref_mod, &new_ref_mod);
3392
+ } else if (ref->type == BTRFS_REF_METADATA) {
3393
+ ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
70843394 } else {
7085
- ret = btrfs_add_delayed_data_ref(trans, bytenr,
7086
- num_bytes, parent,
7087
- root_objectid, owner, offset,
7088
- 0, BTRFS_DROP_DELAYED_REF,
7089
- &old_ref_mod, &new_ref_mod);
3395
+ ret = btrfs_add_delayed_data_ref(trans, ref, 0);
70903396 }
70913397
7092
- if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
7093
- bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
3398
+ if (!((ref->type == BTRFS_REF_METADATA &&
3399
+ ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3400
+ (ref->type == BTRFS_REF_DATA &&
3401
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
3402
+ btrfs_ref_tree_mod(fs_info, ref);
70943403
7095
- add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
7096
- }
7097
-
7098
- return ret;
7099
-}
7100
-
7101
-/*
7102
- * when we wait for progress in the block group caching, its because
7103
- * our allocation attempt failed at least once. So, we must sleep
7104
- * and let some progress happen before we try again.
7105
- *
7106
- * This function will sleep at least once waiting for new free space to
7107
- * show up, and then it will check the block group free space numbers
7108
- * for our min num_bytes. Another option is to have it go ahead
7109
- * and look in the rbtree for a free extent of a given size, but this
7110
- * is a good start.
7111
- *
7112
- * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7113
- * any of the information in this block group.
7114
- */
7115
-static noinline void
7116
-wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7117
- u64 num_bytes)
7118
-{
7119
- struct btrfs_caching_control *caching_ctl;
7120
-
7121
- caching_ctl = get_caching_control(cache);
7122
- if (!caching_ctl)
7123
- return;
7124
-
7125
- wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7126
- (cache->free_space_ctl->free_space >= num_bytes));
7127
-
7128
- put_caching_control(caching_ctl);
7129
-}
7130
-
7131
-static noinline int
7132
-wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7133
-{
7134
- struct btrfs_caching_control *caching_ctl;
7135
- int ret = 0;
7136
-
7137
- caching_ctl = get_caching_control(cache);
7138
- if (!caching_ctl)
7139
- return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7140
-
7141
- wait_event(caching_ctl->wait, block_group_cache_done(cache));
7142
- if (cache->cached == BTRFS_CACHE_ERROR)
7143
- ret = -EIO;
7144
- put_caching_control(caching_ctl);
71453404 return ret;
71463405 }
71473406
71483407 enum btrfs_loop_type {
7149
- LOOP_CACHING_NOWAIT = 0,
7150
- LOOP_CACHING_WAIT = 1,
7151
- LOOP_ALLOC_CHUNK = 2,
7152
- LOOP_NO_EMPTY_SIZE = 3,
3408
+ LOOP_CACHING_NOWAIT,
3409
+ LOOP_CACHING_WAIT,
3410
+ LOOP_ALLOC_CHUNK,
3411
+ LOOP_NO_EMPTY_SIZE,
71533412 };
71543413
71553414 static inline void
7156
-btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
3415
+btrfs_lock_block_group(struct btrfs_block_group *cache,
71573416 int delalloc)
71583417 {
71593418 if (delalloc)
71603419 down_read(&cache->data_rwsem);
71613420 }
71623421
7163
-static inline void
7164
-btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
3422
+static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
71653423 int delalloc)
71663424 {
71673425 btrfs_get_block_group(cache);
....@@ -7169,12 +3427,13 @@
71693427 down_read(&cache->data_rwsem);
71703428 }
71713429
7172
-static struct btrfs_block_group_cache *
7173
-btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
3430
+static struct btrfs_block_group *btrfs_lock_cluster(
3431
+ struct btrfs_block_group *block_group,
71743432 struct btrfs_free_cluster *cluster,
71753433 int delalloc)
3434
+ __acquires(&cluster->refill_lock)
71763435 {
7177
- struct btrfs_block_group_cache *used_bg = NULL;
3436
+ struct btrfs_block_group *used_bg = NULL;
71783437
71793438 spin_lock(&cluster->refill_lock);
71803439 while (1) {
....@@ -7208,12 +3467,503 @@
72083467 }
72093468
72103469 static inline void
7211
-btrfs_release_block_group(struct btrfs_block_group_cache *cache,
3470
+btrfs_release_block_group(struct btrfs_block_group *cache,
72123471 int delalloc)
72133472 {
72143473 if (delalloc)
72153474 up_read(&cache->data_rwsem);
72163475 btrfs_put_block_group(cache);
3476
+}
3477
+
3478
+enum btrfs_extent_allocation_policy {
3479
+ BTRFS_EXTENT_ALLOC_CLUSTERED,
3480
+};
3481
+
3482
+/*
3483
+ * Structure used internally for find_free_extent() function. Wraps needed
3484
+ * parameters.
3485
+ */
3486
+struct find_free_extent_ctl {
3487
+ /* Basic allocation info */
3488
+ u64 num_bytes;
3489
+ u64 empty_size;
3490
+ u64 flags;
3491
+ int delalloc;
3492
+
3493
+ /* Where to start the search inside the bg */
3494
+ u64 search_start;
3495
+
3496
+ /* For clustered allocation */
3497
+ u64 empty_cluster;
3498
+ struct btrfs_free_cluster *last_ptr;
3499
+ bool use_cluster;
3500
+
3501
+ bool have_caching_bg;
3502
+ bool orig_have_caching_bg;
3503
+
3504
+ /* RAID index, converted from flags */
3505
+ int index;
3506
+
3507
+ /*
3508
+ * Current loop number, check find_free_extent_update_loop() for details
3509
+ */
3510
+ int loop;
3511
+
3512
+ /*
3513
+ * Whether we're refilling a cluster, if true we need to re-search
3514
+ * current block group but don't try to refill the cluster again.
3515
+ */
3516
+ bool retry_clustered;
3517
+
3518
+ /*
3519
+ * Whether we're updating free space cache, if true we need to re-search
3520
+ * current block group but don't try updating free space cache again.
3521
+ */
3522
+ bool retry_unclustered;
3523
+
3524
+ /* If current block group is cached */
3525
+ int cached;
3526
+
3527
+ /* Max contiguous hole found */
3528
+ u64 max_extent_size;
3529
+
3530
+ /* Total free space from free space cache, not always contiguous */
3531
+ u64 total_free_space;
3532
+
3533
+ /* Found result */
3534
+ u64 found_offset;
3535
+
3536
+ /* Hint where to start looking for an empty space */
3537
+ u64 hint_byte;
3538
+
3539
+ /* Allocation policy */
3540
+ enum btrfs_extent_allocation_policy policy;
3541
+};
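find_free_extent() below fills this control structure field by field; with C99 designated initializers the same setup reads as follows (a sketch reusing that function's parameters, everything not listed starts at zero/false):

	struct find_free_extent_ctl ffe_ctl = {
		.num_bytes = num_bytes,
		.empty_size = empty_size,
		.flags = flags,
		.delalloc = delalloc,
		.index = btrfs_bg_flags_to_raid_index(flags),
		.hint_byte = hint_byte_orig,
		.policy = BTRFS_EXTENT_ALLOC_CLUSTERED,
		.use_cluster = true,
	};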
3542
+
3543
+
3544
+/*
3545
+ * Helper function for find_free_extent().
3546
+ *
3547
+ * Return -ENOENT to inform caller that we need fallback to unclustered mode.
3548
+ * Return -EAGAIN to inform caller that we need to re-search this block group
3549
+ * Return >0 to inform caller that we found nothing
3550
+ * Return 0 means we have found a location and set ffe_ctl->found_offset.
3551
+ */
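These return values are consumed by do_allocation_clustered() further down; a stand-alone toy dispatcher (hypothetical names) makes the contract explicit:

#include <errno.h>

enum alloc_step {
	USE_FOUND_OFFSET,	/* ret == 0: ffe_ctl->found_offset is valid */
	RETRY_SAME_BG,		/* ret == -EAGAIN: re-search this block group */
	TRY_UNCLUSTERED,	/* ret == -ENOENT: fall back to unclustered mode */
	NEXT_BG,		/* ret > 0: nothing here, move on */
	FAIL,			/* any other error */
};

static enum alloc_step dispatch_clustered_result(int ret)
{
	if (ret == 0)
		return USE_FOUND_OFFSET;
	if (ret == -EAGAIN)
		return RETRY_SAME_BG;
	if (ret == -ENOENT)
		return TRY_UNCLUSTERED;
	if (ret > 0)
		return NEXT_BG;
	return FAIL;
}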
3552
+static int find_free_extent_clustered(struct btrfs_block_group *bg,
3553
+ struct find_free_extent_ctl *ffe_ctl,
3554
+ struct btrfs_block_group **cluster_bg_ret)
3555
+{
3556
+ struct btrfs_block_group *cluster_bg;
3557
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3558
+ u64 aligned_cluster;
3559
+ u64 offset;
3560
+ int ret;
3561
+
3562
+ cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3563
+ if (!cluster_bg)
3564
+ goto refill_cluster;
3565
+ if (cluster_bg != bg && (cluster_bg->ro ||
3566
+ !block_group_bits(cluster_bg, ffe_ctl->flags)))
3567
+ goto release_cluster;
3568
+
3569
+ offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3570
+ ffe_ctl->num_bytes, cluster_bg->start,
3571
+ &ffe_ctl->max_extent_size);
3572
+ if (offset) {
3573
+ /* We have a block, we're done */
3574
+ spin_unlock(&last_ptr->refill_lock);
3575
+ trace_btrfs_reserve_extent_cluster(cluster_bg,
3576
+ ffe_ctl->search_start, ffe_ctl->num_bytes);
3577
+ *cluster_bg_ret = cluster_bg;
3578
+ ffe_ctl->found_offset = offset;
3579
+ return 0;
3580
+ }
3581
+ WARN_ON(last_ptr->block_group != cluster_bg);
3582
+
3583
+release_cluster:
3584
+ /*
3585
+ * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3586
+ * let's just skip it and let the allocator find whatever block it can
3587
+ * find. If we reach this point, we will have tried the cluster
3588
+ * allocator plenty of times and not have found anything, so we are
3589
+ * likely way too fragmented for the clustering stuff to find anything.
3590
+ *
3591
+ * However, if the cluster is taken from the current block group,
3592
+ * release the cluster first, so that we stand a better chance of
3593
+ * succeeding in the unclustered allocation.
3594
+ */
3595
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3596
+ spin_unlock(&last_ptr->refill_lock);
3597
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3598
+ return -ENOENT;
3599
+ }
3600
+
3601
+ /* This cluster didn't work out, free it and start over */
3602
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
3603
+
3604
+ if (cluster_bg != bg)
3605
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3606
+
3607
+refill_cluster:
3608
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3609
+ spin_unlock(&last_ptr->refill_lock);
3610
+ return -ENOENT;
3611
+ }
3612
+
3613
+ aligned_cluster = max_t(u64,
3614
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3615
+ bg->full_stripe_len);
3616
+ ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3617
+ ffe_ctl->num_bytes, aligned_cluster);
3618
+ if (ret == 0) {
3619
+ /* Now pull our allocation out of this cluster */
3620
+ offset = btrfs_alloc_from_cluster(bg, last_ptr,
3621
+ ffe_ctl->num_bytes, ffe_ctl->search_start,
3622
+ &ffe_ctl->max_extent_size);
3623
+ if (offset) {
3624
+ /* We found one, proceed */
3625
+ spin_unlock(&last_ptr->refill_lock);
3626
+ trace_btrfs_reserve_extent_cluster(bg,
3627
+ ffe_ctl->search_start,
3628
+ ffe_ctl->num_bytes);
3629
+ ffe_ctl->found_offset = offset;
3630
+ return 0;
3631
+ }
3632
+ } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
3633
+ !ffe_ctl->retry_clustered) {
3634
+ spin_unlock(&last_ptr->refill_lock);
3635
+
3636
+ ffe_ctl->retry_clustered = true;
3637
+ btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3638
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size);
3639
+ return -EAGAIN;
3640
+ }
3641
+ /*
3642
+ * At this point we either didn't find a cluster or we weren't able to
3643
+ * allocate a block from our cluster. Free the cluster we've been
3644
+ * trying to use, and go to the next block group.
3645
+ */
3646
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
3647
+ spin_unlock(&last_ptr->refill_lock);
3648
+ return 1;
3649
+}
3650
+
3651
+/*
3652
+ * Return >0 to inform caller that we found nothing
3653
+ * Return 0 when we found a free extent and set ffe_ctl->found_offset
3654
+ * Return -EAGAIN to inform caller that we need to re-search this block group
3655
+ */
3656
+static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3657
+ struct find_free_extent_ctl *ffe_ctl)
3658
+{
3659
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3660
+ u64 offset;
3661
+
3662
+ /*
3663
+ * We are doing an unclustered allocation, set the fragmented flag so
3664
+ * we don't bother trying to setup a cluster again until we get more
3665
+ * space.
3666
+ */
3667
+ if (unlikely(last_ptr)) {
3668
+ spin_lock(&last_ptr->lock);
3669
+ last_ptr->fragmented = 1;
3670
+ spin_unlock(&last_ptr->lock);
3671
+ }
3672
+ if (ffe_ctl->cached) {
3673
+ struct btrfs_free_space_ctl *free_space_ctl;
3674
+
3675
+ free_space_ctl = bg->free_space_ctl;
3676
+ spin_lock(&free_space_ctl->tree_lock);
3677
+ if (free_space_ctl->free_space <
3678
+ ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3679
+ ffe_ctl->empty_size) {
3680
+ ffe_ctl->total_free_space = max_t(u64,
3681
+ ffe_ctl->total_free_space,
3682
+ free_space_ctl->free_space);
3683
+ spin_unlock(&free_space_ctl->tree_lock);
3684
+ return 1;
3685
+ }
3686
+ spin_unlock(&free_space_ctl->tree_lock);
3687
+ }
3688
+
3689
+ offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3690
+ ffe_ctl->num_bytes, ffe_ctl->empty_size,
3691
+ &ffe_ctl->max_extent_size);
3692
+
3693
+ /*
3694
+ * If we didn't find a chunk, and we haven't failed on this block group
3695
+ * before, and this block group is in the middle of caching and we are
3696
+ * ok with waiting, then go ahead and wait for progress to be made, and
3697
+ * set @retry_unclustered to true.
3698
+ *
3699
+ * If @retry_unclustered is true then we've already waited on this
3700
+ * block group once and should move on to the next block group.
3701
+ */
3702
+ if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
3703
+ ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
3704
+ btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3705
+ ffe_ctl->empty_size);
3706
+ ffe_ctl->retry_unclustered = true;
3707
+ return -EAGAIN;
3708
+ } else if (!offset) {
3709
+ return 1;
3710
+ }
3711
+ ffe_ctl->found_offset = offset;
3712
+ return 0;
3713
+}
3714
+
3715
+static int do_allocation_clustered(struct btrfs_block_group *block_group,
3716
+ struct find_free_extent_ctl *ffe_ctl,
3717
+ struct btrfs_block_group **bg_ret)
3718
+{
3719
+ int ret;
3720
+
3721
+ /* We want to try and use the cluster allocator, so let's look there */
3722
+ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3723
+ ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3724
+ if (ret >= 0 || ret == -EAGAIN)
3725
+ return ret;
3726
+ /* ret == -ENOENT case falls through */
3727
+ }
3728
+
3729
+ return find_free_extent_unclustered(block_group, ffe_ctl);
3730
+}
3731
+
3732
+static int do_allocation(struct btrfs_block_group *block_group,
3733
+ struct find_free_extent_ctl *ffe_ctl,
3734
+ struct btrfs_block_group **bg_ret)
3735
+{
3736
+ switch (ffe_ctl->policy) {
3737
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3738
+ return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3739
+ default:
3740
+ BUG();
3741
+ }
3742
+}
3743
+
3744
+static void release_block_group(struct btrfs_block_group *block_group,
3745
+ struct find_free_extent_ctl *ffe_ctl,
3746
+ int delalloc)
3747
+{
3748
+ switch (ffe_ctl->policy) {
3749
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3750
+ ffe_ctl->retry_clustered = false;
3751
+ ffe_ctl->retry_unclustered = false;
3752
+ break;
3753
+ default:
3754
+ BUG();
3755
+ }
3756
+
3757
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
3758
+ ffe_ctl->index);
3759
+ btrfs_release_block_group(block_group, delalloc);
3760
+}
3761
+
3762
+static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
3763
+ struct btrfs_key *ins)
3764
+{
3765
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3766
+
3767
+ if (!ffe_ctl->use_cluster && last_ptr) {
3768
+ spin_lock(&last_ptr->lock);
3769
+ last_ptr->window_start = ins->objectid;
3770
+ spin_unlock(&last_ptr->lock);
3771
+ }
3772
+}
3773
+
3774
+static void found_extent(struct find_free_extent_ctl *ffe_ctl,
3775
+ struct btrfs_key *ins)
3776
+{
3777
+ switch (ffe_ctl->policy) {
3778
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3779
+ found_extent_clustered(ffe_ctl, ins);
3780
+ break;
3781
+ default:
3782
+ BUG();
3783
+ }
3784
+}
3785
+
3786
+static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
3787
+{
3788
+ switch (ffe_ctl->policy) {
3789
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3790
+ /*
3791
+ * If we can't allocate a new chunk we've already looped through
3792
+ * at least once, move on to the NO_EMPTY_SIZE case.
3793
+ */
3794
+ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
3795
+ return 0;
3796
+ default:
3797
+ BUG();
3798
+ }
3799
+}
3800
+
3801
+/*
3802
+ * Return >0 means caller needs to re-search for free extent
3803
+ * Return 0 means we have the needed free extent.
3804
+ * Return <0 means we failed to locate any free extent.
3805
+ */
3806
+static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
3807
+ struct btrfs_key *ins,
3808
+ struct find_free_extent_ctl *ffe_ctl,
3809
+ bool full_search)
3810
+{
3811
+ struct btrfs_root *root = fs_info->extent_root;
3812
+ int ret;
3813
+
3814
+ if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
3815
+ ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
3816
+ ffe_ctl->orig_have_caching_bg = true;
3817
+
3818
+ if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
3819
+ ffe_ctl->have_caching_bg)
3820
+ return 1;
3821
+
3822
+ if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
3823
+ return 1;
3824
+
3825
+ if (ins->objectid) {
3826
+ found_extent(ffe_ctl, ins);
3827
+ return 0;
3828
+ }
3829
+
3830
+ /*
3831
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
3832
+ * caching kthreads as we move along
3833
+ * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3834
+ * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3835
+ * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3836
+ * again
3837
+ */
3838
+ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
3839
+ ffe_ctl->index = 0;
3840
+ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
3841
+ /*
3842
+ * We want to skip the LOOP_CACHING_WAIT step if we
3843
+ * don't have any uncached bgs and we've already done a
3844
+ * full search through.
3845
+ */
3846
+ if (ffe_ctl->orig_have_caching_bg || !full_search)
3847
+ ffe_ctl->loop = LOOP_CACHING_WAIT;
3848
+ else
3849
+ ffe_ctl->loop = LOOP_ALLOC_CHUNK;
3850
+ } else {
3851
+ ffe_ctl->loop++;
3852
+ }
3853
+
3854
+ if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
3855
+ struct btrfs_trans_handle *trans;
3856
+ int exist = 0;
3857
+
3858
+ trans = current->journal_info;
3859
+ if (trans)
3860
+ exist = 1;
3861
+ else
3862
+ trans = btrfs_join_transaction(root);
3863
+
3864
+ if (IS_ERR(trans)) {
3865
+ ret = PTR_ERR(trans);
3866
+ return ret;
3867
+ }
3868
+
3869
+ ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
3870
+ CHUNK_ALLOC_FORCE);
3871
+
3872
+ /* Do not bail out on ENOSPC since we can do more. */
3873
+ if (ret == -ENOSPC)
3874
+ ret = chunk_allocation_failed(ffe_ctl);
3875
+ else if (ret < 0)
3876
+ btrfs_abort_transaction(trans, ret);
3877
+ else
3878
+ ret = 0;
3879
+ if (!exist)
3880
+ btrfs_end_transaction(trans);
3881
+ if (ret)
3882
+ return ret;
3883
+ }
3884
+
3885
+ if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
3886
+ if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
3887
+ return -ENOSPC;
3888
+
3889
+ /*
3890
+ * Don't loop again if we already have no empty_size and
3891
+ * no empty_cluster.
3892
+ */
3893
+ if (ffe_ctl->empty_size == 0 &&
3894
+ ffe_ctl->empty_cluster == 0)
3895
+ return -ENOSPC;
3896
+ ffe_ctl->empty_size = 0;
3897
+ ffe_ctl->empty_cluster = 0;
3898
+ }
3899
+ return 1;
3900
+ }
3901
+ return -ENOSPC;
3902
+}
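The stage escalation implemented above can be summarized as a small state machine: each failed full pass over the block groups moves the search to a more aggressive stage until NO_EMPTY_SIZE finally fails with ENOSPC. A toy stand-alone model (its enum mirrors btrfs_loop_type but is not the kernel definition):

enum loop_stage {
	STAGE_CACHING_NOWAIT,	/* only use already-cached block groups */
	STAGE_CACHING_WAIT,	/* wait for block group caching to progress */
	STAGE_ALLOC_CHUNK,	/* force-allocate a new chunk and retry */
	STAGE_NO_EMPTY_SIZE,	/* retry without empty_size/empty_cluster padding */
};

/* Returns the next stage to try, or -1 once every stage is exhausted. */
static int next_stage(enum loop_stage cur, int had_uncached_bg, int full_search)
{
	switch (cur) {
	case STAGE_CACHING_NOWAIT:
		/* Skip the wait stage if a full pass saw no uncached groups. */
		return (had_uncached_bg || !full_search) ?
			STAGE_CACHING_WAIT : STAGE_ALLOC_CHUNK;
	case STAGE_CACHING_WAIT:
		return STAGE_ALLOC_CHUNK;
	case STAGE_ALLOC_CHUNK:
		return STAGE_NO_EMPTY_SIZE;
	default:
		return -1;	/* out of options: -ENOSPC */
	}
}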
3903
+
3904
+static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
3905
+ struct find_free_extent_ctl *ffe_ctl,
3906
+ struct btrfs_space_info *space_info,
3907
+ struct btrfs_key *ins)
3908
+{
3909
+ /*
3910
+ * If our free space is heavily fragmented we may not be able to make
3911
+ * big contiguous allocations, so instead of doing the expensive search
3912
+ * for free space, simply return ENOSPC with our max_extent_size so we
3913
+ * can go ahead and search for a more manageable chunk.
3914
+ *
3915
+ * If our max_extent_size is large enough for our allocation simply
3916
+ * disable clustering since we will likely not be able to find enough
3917
+ * space to create a cluster and induce latency trying.
3918
+ */
3919
+ if (space_info->max_extent_size) {
3920
+ spin_lock(&space_info->lock);
3921
+ if (space_info->max_extent_size &&
3922
+ ffe_ctl->num_bytes > space_info->max_extent_size) {
3923
+ ins->offset = space_info->max_extent_size;
3924
+ spin_unlock(&space_info->lock);
3925
+ return -ENOSPC;
3926
+ } else if (space_info->max_extent_size) {
3927
+ ffe_ctl->use_cluster = false;
3928
+ }
3929
+ spin_unlock(&space_info->lock);
3930
+ }
3931
+
3932
+ ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
3933
+ &ffe_ctl->empty_cluster);
3934
+ if (ffe_ctl->last_ptr) {
3935
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3936
+
3937
+ spin_lock(&last_ptr->lock);
3938
+ if (last_ptr->block_group)
3939
+ ffe_ctl->hint_byte = last_ptr->window_start;
3940
+ if (last_ptr->fragmented) {
3941
+ /*
3942
+ * We still set window_start so we can keep track of the
3943
+ * last place we found an allocation to try and save
3944
+ * some time.
3945
+ */
3946
+ ffe_ctl->hint_byte = last_ptr->window_start;
3947
+ ffe_ctl->use_cluster = false;
3948
+ }
3949
+ spin_unlock(&last_ptr->lock);
3950
+ }
3951
+
3952
+ return 0;
3953
+}
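The fragmentation policy above makes two separate decisions from the cached max_extent_size hint: fail fast with ENOSPC when the request cannot fit in the largest known free extent, or merely disable clustering when it fits but the space is known to be fragmented. A stand-alone sketch with toy types:

#include <stdint.h>
#include <stdbool.h>
#include <errno.h>

struct alloc_policy_model {
	bool use_cluster;
	uint64_t enospc_hint;	/* reported back to the caller on ENOSPC */
};

static int apply_fragmentation_hint(struct alloc_policy_model *p,
				    uint64_t num_bytes, uint64_t max_extent_size)
{
	if (!max_extent_size)
		return 0;		/* no hint recorded yet */
	if (num_bytes > max_extent_size) {
		p->enospc_hint = max_extent_size;
		return -ENOSPC;		/* cannot fit anywhere right now */
	}
	p->use_cluster = false;		/* fits, but skip expensive clustering */
	return 0;
}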
3954
+
3955
+static int prepare_allocation(struct btrfs_fs_info *fs_info,
3956
+ struct find_free_extent_ctl *ffe_ctl,
3957
+ struct btrfs_space_info *space_info,
3958
+ struct btrfs_key *ins)
3959
+{
3960
+ switch (ffe_ctl->policy) {
3961
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3962
+ return prepare_allocation_clustered(fs_info, ffe_ctl,
3963
+ space_info, ins);
3964
+ default:
3965
+ BUG();
3966
+ }
72173967 }
72183968
72193969 /*
....@@ -7226,87 +3976,76 @@
72263976 *
72273977 * If there is no suitable free space, we will record the max size of
72283978 * the free space extent currently.
3979
+ *
3980
+ * The overall logic and call chain:
3981
+ *
3982
+ * find_free_extent()
3983
+ * |- Iterate through all block groups
3984
+ * | |- Get a valid block group
3985
+ * | |- Try to do clustered allocation in that block group
3986
+ * | |- Try to do unclustered allocation in that block group
3987
+ * | |- Check if the result is valid
3988
+ * | | |- If valid, then exit
3989
+ * | |- Jump to next block group
3990
+ * |
3991
+ * |- Push harder to find free extents
3992
+ * |- If not found, re-iterate all block groups
72293993 */
7230
-static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
3994
+static noinline int find_free_extent(struct btrfs_root *root,
72313995 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7232
- u64 hint_byte, struct btrfs_key *ins,
3996
+ u64 hint_byte_orig, struct btrfs_key *ins,
72333997 u64 flags, int delalloc)
72343998 {
3999
+ struct btrfs_fs_info *fs_info = root->fs_info;
72354000 int ret = 0;
7236
- struct btrfs_root *root = fs_info->extent_root;
7237
- struct btrfs_free_cluster *last_ptr = NULL;
7238
- struct btrfs_block_group_cache *block_group = NULL;
7239
- u64 search_start = 0;
7240
- u64 max_extent_size = 0;
7241
- u64 max_free_space = 0;
7242
- u64 empty_cluster = 0;
4001
+ int cache_block_group_error = 0;
4002
+ struct btrfs_block_group *block_group = NULL;
4003
+ struct find_free_extent_ctl ffe_ctl = {0};
72434004 struct btrfs_space_info *space_info;
7244
- int loop = 0;
7245
- int index = btrfs_bg_flags_to_raid_index(flags);
7246
- bool failed_cluster_refill = false;
7247
- bool failed_alloc = false;
7248
- bool use_cluster = true;
7249
- bool have_caching_bg = false;
7250
- bool orig_have_caching_bg = false;
72514005 bool full_search = false;
72524006
72534007 WARN_ON(num_bytes < fs_info->sectorsize);
4008
+
4009
+ ffe_ctl.num_bytes = num_bytes;
4010
+ ffe_ctl.empty_size = empty_size;
4011
+ ffe_ctl.flags = flags;
4012
+ ffe_ctl.search_start = 0;
4013
+ ffe_ctl.delalloc = delalloc;
4014
+ ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
4015
+ ffe_ctl.have_caching_bg = false;
4016
+ ffe_ctl.orig_have_caching_bg = false;
4017
+ ffe_ctl.found_offset = 0;
4018
+ ffe_ctl.hint_byte = hint_byte_orig;
4019
+ ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4020
+
4021
+ /* For clustered allocation */
4022
+ ffe_ctl.retry_clustered = false;
4023
+ ffe_ctl.retry_unclustered = false;
4024
+ ffe_ctl.last_ptr = NULL;
4025
+ ffe_ctl.use_cluster = true;
4026
+
72544027 ins->type = BTRFS_EXTENT_ITEM_KEY;
72554028 ins->objectid = 0;
72564029 ins->offset = 0;
72574030
7258
- trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
4031
+ trace_find_free_extent(root, num_bytes, empty_size, flags);
72594032
7260
- space_info = __find_space_info(fs_info, flags);
4033
+ space_info = btrfs_find_space_info(fs_info, flags);
72614034 if (!space_info) {
72624035 btrfs_err(fs_info, "No space info for %llu", flags);
72634036 return -ENOSPC;
72644037 }
72654038
7266
- /*
7267
- * If our free space is heavily fragmented we may not be able to make
7268
- * big contiguous allocations, so instead of doing the expensive search
7269
- * for free space, simply return ENOSPC with our max_extent_size so we
7270
- * can go ahead and search for a more manageable chunk.
7271
- *
7272
- * If our max_extent_size is large enough for our allocation simply
7273
- * disable clustering since we will likely not be able to find enough
7274
- * space to create a cluster and induce latency trying.
7275
- */
7276
- if (unlikely(space_info->max_extent_size)) {
7277
- spin_lock(&space_info->lock);
7278
- if (space_info->max_extent_size &&
7279
- num_bytes > space_info->max_extent_size) {
7280
- ins->offset = space_info->max_extent_size;
7281
- spin_unlock(&space_info->lock);
7282
- return -ENOSPC;
7283
- } else if (space_info->max_extent_size) {
7284
- use_cluster = false;
7285
- }
7286
- spin_unlock(&space_info->lock);
7287
- }
4039
+ ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
4040
+ if (ret < 0)
4041
+ return ret;
72884042
7289
- last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
7290
- if (last_ptr) {
7291
- spin_lock(&last_ptr->lock);
7292
- if (last_ptr->block_group)
7293
- hint_byte = last_ptr->window_start;
7294
- if (last_ptr->fragmented) {
7295
- /*
7296
- * We still set window_start so we can keep track of the
7297
- * last place we found an allocation to try and save
7298
- * some time.
7299
- */
7300
- hint_byte = last_ptr->window_start;
7301
- use_cluster = false;
7302
- }
7303
- spin_unlock(&last_ptr->lock);
7304
- }
7305
-
7306
- search_start = max(search_start, first_logical_byte(fs_info, 0));
7307
- search_start = max(search_start, hint_byte);
7308
- if (search_start == hint_byte) {
7309
- block_group = btrfs_lookup_block_group(fs_info, search_start);
4043
+ ffe_ctl.search_start = max(ffe_ctl.search_start,
4044
+ first_logical_byte(fs_info, 0));
4045
+ ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
4046
+ if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
4047
+ block_group = btrfs_lookup_block_group(fs_info,
4048
+ ffe_ctl.search_start);
73104049 /*
73114050 * we don't want to use the block group if it doesn't match our
73124051 * allocation bits, or if its not cached.
....@@ -7328,7 +4067,7 @@
73284067 btrfs_put_block_group(block_group);
73294068 up_read(&space_info->groups_sem);
73304069 } else {
7331
- index = btrfs_bg_flags_to_raid_index(
4070
+ ffe_ctl.index = btrfs_bg_flags_to_raid_index(
73324071 block_group->flags);
73334072 btrfs_lock_block_group(block_group, delalloc);
73344073 goto have_block_group;
....@@ -7338,21 +4077,21 @@
73384077 }
73394078 }
73404079 search:
7341
- have_caching_bg = false;
7342
- if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
4080
+ ffe_ctl.have_caching_bg = false;
4081
+ if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
4082
+ ffe_ctl.index == 0)
73434083 full_search = true;
73444084 down_read(&space_info->groups_sem);
7345
- list_for_each_entry(block_group, &space_info->block_groups[index],
7346
- list) {
7347
- u64 offset;
7348
- int cached;
4085
+ list_for_each_entry(block_group,
4086
+ &space_info->block_groups[ffe_ctl.index], list) {
4087
+ struct btrfs_block_group *bg_ret;
73494088
73504089 /* If the block group is read-only, we can skip it entirely. */
73514090 if (unlikely(block_group->ro))
73524091 continue;
73534092
73544093 btrfs_grab_block_group(block_group, delalloc);
7355
- search_start = block_group->key.objectid;
4094
+ ffe_ctl.search_start = block_group->start;
73564095
73574096 /*
73584097 * this can happen if we end up cycling through all the
....@@ -7361,9 +4100,8 @@
73614100 */
73624101 if (!block_group_bits(block_group, flags)) {
73634102 u64 extra = BTRFS_BLOCK_GROUP_DUP |
7364
- BTRFS_BLOCK_GROUP_RAID1 |
7365
- BTRFS_BLOCK_GROUP_RAID5 |
7366
- BTRFS_BLOCK_GROUP_RAID6 |
4103
+ BTRFS_BLOCK_GROUP_RAID1_MASK |
4104
+ BTRFS_BLOCK_GROUP_RAID56_MASK |
73674105 BTRFS_BLOCK_GROUP_RAID10;
73684106
73694107 /*
....@@ -7384,379 +4122,104 @@
73844122 }
73854123
73864124 have_block_group:
7387
- cached = block_group_cache_done(block_group);
7388
- if (unlikely(!cached)) {
7389
- have_caching_bg = true;
7390
- ret = cache_block_group(block_group, 0);
7391
- BUG_ON(ret < 0);
4125
+ ffe_ctl.cached = btrfs_block_group_done(block_group);
4126
+ if (unlikely(!ffe_ctl.cached)) {
4127
+ ffe_ctl.have_caching_bg = true;
4128
+ ret = btrfs_cache_block_group(block_group, 0);
4129
+
4130
+ /*
4131
+ * If we get ENOMEM here or something else we want to
4132
+ * try other block groups, because it may not be fatal.
4133
+ * However if we can't find anything else we need to
4134
+ * save our return here so that we return the actual
4135
+ * error that caused problems, not ENOSPC.
4136
+ */
4137
+ if (ret < 0) {
4138
+ if (!cache_block_group_error)
4139
+ cache_block_group_error = ret;
4140
+ ret = 0;
4141
+ goto loop;
4142
+ }
73924143 ret = 0;
73934144 }
73944145
7395
- if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7396
- goto loop;
7397
-
7398
- /*
7399
- * Ok we want to try and use the cluster allocator, so
7400
- * lets look there
7401
- */
7402
- if (last_ptr && use_cluster) {
7403
- struct btrfs_block_group_cache *used_block_group;
7404
- unsigned long aligned_cluster;
7405
- /*
7406
- * the refill lock keeps out other
7407
- * people trying to start a new cluster
7408
- */
7409
- used_block_group = btrfs_lock_cluster(block_group,
7410
- last_ptr,
7411
- delalloc);
7412
- if (!used_block_group)
7413
- goto refill_cluster;
7414
-
7415
- if (used_block_group != block_group &&
7416
- (used_block_group->ro ||
7417
- !block_group_bits(used_block_group, flags)))
7418
- goto release_cluster;
7419
-
7420
- offset = btrfs_alloc_from_cluster(used_block_group,
7421
- last_ptr,
7422
- num_bytes,
7423
- used_block_group->key.objectid,
7424
- &max_extent_size);
7425
- if (offset) {
7426
- /* we have a block, we're done */
7427
- spin_unlock(&last_ptr->refill_lock);
7428
- trace_btrfs_reserve_extent_cluster(
7429
- used_block_group,
7430
- search_start, num_bytes);
7431
- if (used_block_group != block_group) {
7432
- btrfs_release_block_group(block_group,
7433
- delalloc);
7434
- block_group = used_block_group;
7435
- }
7436
- goto checks;
7437
- }
7438
-
7439
- WARN_ON(last_ptr->block_group != used_block_group);
7440
-release_cluster:
7441
- /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7442
- * set up a new clusters, so lets just skip it
7443
- * and let the allocator find whatever block
7444
- * it can find. If we reach this point, we
7445
- * will have tried the cluster allocator
7446
- * plenty of times and not have found
7447
- * anything, so we are likely way too
7448
- * fragmented for the clustering stuff to find
7449
- * anything.
7450
- *
7451
- * However, if the cluster is taken from the
7452
- * current block group, release the cluster
7453
- * first, so that we stand a better chance of
7454
- * succeeding in the unclustered
7455
- * allocation. */
7456
- if (loop >= LOOP_NO_EMPTY_SIZE &&
7457
- used_block_group != block_group) {
7458
- spin_unlock(&last_ptr->refill_lock);
7459
- btrfs_release_block_group(used_block_group,
7460
- delalloc);
7461
- goto unclustered_alloc;
7462
- }
7463
-
7464
- /*
7465
- * this cluster didn't work out, free it and
7466
- * start over
7467
- */
7468
- btrfs_return_cluster_to_free_space(NULL, last_ptr);
7469
-
7470
- if (used_block_group != block_group)
7471
- btrfs_release_block_group(used_block_group,
7472
- delalloc);
7473
-refill_cluster:
7474
- if (loop >= LOOP_NO_EMPTY_SIZE) {
7475
- spin_unlock(&last_ptr->refill_lock);
7476
- goto unclustered_alloc;
7477
- }
7478
-
7479
- aligned_cluster = max_t(unsigned long,
7480
- empty_cluster + empty_size,
7481
- block_group->full_stripe_len);
7482
-
7483
- /* allocate a cluster in this block group */
7484
- ret = btrfs_find_space_cluster(fs_info, block_group,
7485
- last_ptr, search_start,
7486
- num_bytes,
7487
- aligned_cluster);
7488
- if (ret == 0) {
7489
- /*
7490
- * now pull our allocation out of this
7491
- * cluster
7492
- */
7493
- offset = btrfs_alloc_from_cluster(block_group,
7494
- last_ptr,
7495
- num_bytes,
7496
- search_start,
7497
- &max_extent_size);
7498
- if (offset) {
7499
- /* we found one, proceed */
7500
- spin_unlock(&last_ptr->refill_lock);
7501
- trace_btrfs_reserve_extent_cluster(
7502
- block_group, search_start,
7503
- num_bytes);
7504
- goto checks;
7505
- }
7506
- } else if (!cached && loop > LOOP_CACHING_NOWAIT
7507
- && !failed_cluster_refill) {
7508
- spin_unlock(&last_ptr->refill_lock);
7509
-
7510
- failed_cluster_refill = true;
7511
- wait_block_group_cache_progress(block_group,
7512
- num_bytes + empty_cluster + empty_size);
7513
- goto have_block_group;
7514
- }
7515
-
7516
- /*
7517
- * at this point we either didn't find a cluster
7518
- * or we weren't able to allocate a block from our
7519
- * cluster. Free the cluster we've been trying
7520
- * to use, and go to the next block group
7521
- */
7522
- btrfs_return_cluster_to_free_space(NULL, last_ptr);
7523
- spin_unlock(&last_ptr->refill_lock);
4146
+ if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
4147
+ if (!cache_block_group_error)
4148
+ cache_block_group_error = -EIO;
75244149 goto loop;
75254150 }
75264151
7527
-unclustered_alloc:
7528
- /*
7529
- * We are doing an unclustered alloc, set the fragmented flag so
7530
- * we don't bother trying to setup a cluster again until we get
7531
- * more space.
7532
- */
7533
- if (unlikely(last_ptr)) {
7534
- spin_lock(&last_ptr->lock);
7535
- last_ptr->fragmented = 1;
7536
- spin_unlock(&last_ptr->lock);
7537
- }
7538
- if (cached) {
7539
- struct btrfs_free_space_ctl *ctl =
7540
- block_group->free_space_ctl;
7541
-
7542
- spin_lock(&ctl->tree_lock);
7543
- if (ctl->free_space <
7544
- num_bytes + empty_cluster + empty_size) {
7545
- max_free_space = max(max_free_space,
7546
- ctl->free_space);
7547
- spin_unlock(&ctl->tree_lock);
7548
- goto loop;
4152
+ bg_ret = NULL;
4153
+ ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
4154
+ if (ret == 0) {
4155
+ if (bg_ret && bg_ret != block_group) {
4156
+ btrfs_release_block_group(block_group, delalloc);
4157
+ block_group = bg_ret;
75494158 }
7550
- spin_unlock(&ctl->tree_lock);
7551
- }
7552
-
7553
- offset = btrfs_find_space_for_alloc(block_group, search_start,
7554
- num_bytes, empty_size,
7555
- &max_extent_size);
7556
- /*
7557
- * If we didn't find a chunk, and we haven't failed on this
7558
- * block group before, and this block group is in the middle of
7559
- * caching and we are ok with waiting, then go ahead and wait
7560
- * for progress to be made, and set failed_alloc to true.
7561
- *
7562
- * If failed_alloc is true then we've already waited on this
7563
- * block group once and should move on to the next block group.
7564
- */
7565
- if (!offset && !failed_alloc && !cached &&
7566
- loop > LOOP_CACHING_NOWAIT) {
7567
- wait_block_group_cache_progress(block_group,
7568
- num_bytes + empty_size);
7569
- failed_alloc = true;
4159
+ } else if (ret == -EAGAIN) {
75704160 goto have_block_group;
7571
- } else if (!offset) {
4161
+ } else if (ret > 0) {
75724162 goto loop;
75734163 }
7574
-checks:
7575
- search_start = round_up(offset, fs_info->stripesize);
4164
+
4165
+ /* Checks */
4166
+ ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
4167
+ fs_info->stripesize);
75764168
75774169 /* move on to the next group */
7578
- if (search_start + num_bytes >
7579
- block_group->key.objectid + block_group->key.offset) {
7580
- btrfs_add_free_space(block_group, offset, num_bytes);
4170
+ if (ffe_ctl.search_start + num_bytes >
4171
+ block_group->start + block_group->length) {
4172
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4173
+ num_bytes);
75814174 goto loop;
75824175 }
75834176
7584
- if (offset < search_start)
7585
- btrfs_add_free_space(block_group, offset,
7586
- search_start - offset);
4177
+ if (ffe_ctl.found_offset < ffe_ctl.search_start)
4178
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4179
+ ffe_ctl.search_start - ffe_ctl.found_offset);
75874180
75884181 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
75894182 num_bytes, delalloc);
75904183 if (ret == -EAGAIN) {
7591
- btrfs_add_free_space(block_group, offset, num_bytes);
4184
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4185
+ num_bytes);
75924186 goto loop;
75934187 }
75944188 btrfs_inc_block_group_reservations(block_group);
75954189
75964190 /* we are all good, lets return */
7597
- ins->objectid = search_start;
4191
+ ins->objectid = ffe_ctl.search_start;
75984192 ins->offset = num_bytes;
75994193
7600
- trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
4194
+ trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
4195
+ num_bytes);
76014196 btrfs_release_block_group(block_group, delalloc);
76024197 break;
76034198 loop:
7604
- failed_cluster_refill = false;
7605
- failed_alloc = false;
7606
- BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
7607
- index);
7608
- btrfs_release_block_group(block_group, delalloc);
4199
+ release_block_group(block_group, &ffe_ctl, delalloc);
76094200 cond_resched();
76104201 }
76114202 up_read(&space_info->groups_sem);
76124203
7613
- if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7614
- && !orig_have_caching_bg)
7615
- orig_have_caching_bg = true;
7616
-
7617
- if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
4204
+ ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
4205
+ if (ret > 0)
76184206 goto search;
76194207
7620
- if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7621
- goto search;
7622
-
7623
- /*
7624
- * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7625
- * caching kthreads as we move along
7626
- * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7627
- * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7628
- * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7629
- * again
7630
- */
7631
- if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7632
- index = 0;
7633
- if (loop == LOOP_CACHING_NOWAIT) {
7634
- /*
7635
- * We want to skip the LOOP_CACHING_WAIT step if we
7636
- * don't have any uncached bgs and we've already done a
7637
- * full search through.
7638
- */
7639
- if (orig_have_caching_bg || !full_search)
7640
- loop = LOOP_CACHING_WAIT;
7641
- else
7642
- loop = LOOP_ALLOC_CHUNK;
7643
- } else {
7644
- loop++;
7645
- }
7646
-
7647
- if (loop == LOOP_ALLOC_CHUNK) {
7648
- struct btrfs_trans_handle *trans;
7649
- int exist = 0;
7650
-
7651
- trans = current->journal_info;
7652
- if (trans)
7653
- exist = 1;
7654
- else
7655
- trans = btrfs_join_transaction(root);
7656
-
7657
- if (IS_ERR(trans)) {
7658
- ret = PTR_ERR(trans);
7659
- goto out;
7660
- }
7661
-
7662
- ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
7663
-
7664
- /*
7665
- * If we can't allocate a new chunk we've already looped
7666
- * through at least once, move on to the NO_EMPTY_SIZE
7667
- * case.
7668
- */
7669
- if (ret == -ENOSPC)
7670
- loop = LOOP_NO_EMPTY_SIZE;
7671
-
7672
- /*
7673
- * Do not bail out on ENOSPC since we
7674
- * can do more things.
7675
- */
7676
- if (ret < 0 && ret != -ENOSPC)
7677
- btrfs_abort_transaction(trans, ret);
7678
- else
7679
- ret = 0;
7680
- if (!exist)
7681
- btrfs_end_transaction(trans);
7682
- if (ret)
7683
- goto out;
7684
- }
7685
-
7686
- if (loop == LOOP_NO_EMPTY_SIZE) {
7687
- /*
7688
- * Don't loop again if we already have no empty_size and
7689
- * no empty_cluster.
7690
- */
7691
- if (empty_size == 0 &&
7692
- empty_cluster == 0) {
7693
- ret = -ENOSPC;
7694
- goto out;
7695
- }
7696
- empty_size = 0;
7697
- empty_cluster = 0;
7698
- }
7699
-
7700
- goto search;
7701
- } else if (!ins->objectid) {
7702
- ret = -ENOSPC;
7703
- } else if (ins->objectid) {
7704
- if (!use_cluster && last_ptr) {
7705
- spin_lock(&last_ptr->lock);
7706
- last_ptr->window_start = ins->objectid;
7707
- spin_unlock(&last_ptr->lock);
7708
- }
7709
- ret = 0;
7710
- }
7711
-out:
7712
- if (ret == -ENOSPC) {
7713
- if (!max_extent_size)
7714
- max_extent_size = max_free_space;
4208
+ if (ret == -ENOSPC && !cache_block_group_error) {
4209
+ /*
4210
+ * Use ffe_ctl->total_free_space as fallback if we can't find
4211
+ * any contiguous hole.
4212
+ */
4213
+ if (!ffe_ctl.max_extent_size)
4214
+ ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
77154215 spin_lock(&space_info->lock);
7716
- space_info->max_extent_size = max_extent_size;
4216
+ space_info->max_extent_size = ffe_ctl.max_extent_size;
77174217 spin_unlock(&space_info->lock);
7718
- ins->offset = max_extent_size;
4218
+ ins->offset = ffe_ctl.max_extent_size;
4219
+ } else if (ret == -ENOSPC) {
4220
+ ret = cache_block_group_error;
77194221 }
77204222 return ret;
7721
-}
7722
-
7723
-static void dump_space_info(struct btrfs_fs_info *fs_info,
7724
- struct btrfs_space_info *info, u64 bytes,
7725
- int dump_block_groups)
7726
-{
7727
- struct btrfs_block_group_cache *cache;
7728
- int index = 0;
7729
-
7730
- spin_lock(&info->lock);
7731
- btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7732
- info->flags,
7733
- info->total_bytes - btrfs_space_info_used(info, true),
7734
- info->full ? "" : "not ");
7735
- btrfs_info(fs_info,
7736
- "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7737
- info->total_bytes, info->bytes_used, info->bytes_pinned,
7738
- info->bytes_reserved, info->bytes_may_use,
7739
- info->bytes_readonly);
7740
- spin_unlock(&info->lock);
7741
-
7742
- if (!dump_block_groups)
7743
- return;
7744
-
7745
- down_read(&info->groups_sem);
7746
-again:
7747
- list_for_each_entry(cache, &info->block_groups[index], list) {
7748
- spin_lock(&cache->lock);
7749
- btrfs_info(fs_info,
7750
- "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
7751
- cache->key.objectid, cache->key.offset,
7752
- btrfs_block_group_used(&cache->item), cache->pinned,
7753
- cache->reserved, cache->ro ? "[readonly]" : "");
7754
- btrfs_dump_free_space(cache, bytes);
7755
- spin_unlock(&cache->lock);
7756
- }
7757
- if (++index < BTRFS_NR_RAID_TYPES)
7758
- goto again;
7759
- up_read(&info->groups_sem);
77604223 }
77614224
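
The replacement code above routes every candidate block group through the new do_allocation() helper and lets its return value drive the search loop: 0 means space was found (possibly in a different group handed back through bg_ret), -EAGAIN means retry the same group, and any positive value means give up on this group and move on. The sketch below is only a loose userspace illustration of that return-code dispatch; try_allocate(), struct group and the sizes are invented and not part of the patch.

/* Illustrative sketch only -- try_allocate() and the groups are invented. */
#include <stdio.h>
#include <errno.h>

struct group { const char *name; long free; };

/* Same contract shape as the patch's helper: 0 = got space (offset valid),
 * -EAGAIN = retry this group, any value > 0 = move on to the next group. */
static int try_allocate(struct group *g, long want, long *offset)
{
	if (g->free < want)
		return 1;		/* nothing usable here */
	*offset = g->free - want;	/* pretend we carve from the end */
	g->free -= want;
	return 0;
}

int main(void)
{
	struct group groups[] = { { "bg-A", 16 }, { "bg-B", 128 } };
	long want = 64, offset = 0;

	for (unsigned int i = 0; i < sizeof(groups) / sizeof(groups[0]); ) {
		int ret = try_allocate(&groups[i], want, &offset);

		if (ret == -EAGAIN)
			continue;	/* retry the same group ("goto have_block_group") */
		if (ret > 0) {
			i++;		/* skip to the next group ("goto loop") */
			continue;
		}
		printf("allocated %ld bytes from %s at offset %ld\n",
		       want, groups[i].name, offset);
		return 0;
	}
	fprintf(stderr, "out of space\n");
	return 1;
}
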
77624225 /*
....@@ -7817,7 +4280,7 @@
78174280 flags = get_alloc_profile_by_root(root, is_data);
78184281 again:
78194282 WARN_ON(num_bytes < fs_info->sectorsize);
7820
- ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
4283
+ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
78214284 hint_byte, ins, flags, delalloc);
78224285 if (!ret && !is_data) {
78234286 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
....@@ -7834,24 +4297,23 @@
78344297 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
78354298 struct btrfs_space_info *sinfo;
78364299
7837
- sinfo = __find_space_info(fs_info, flags);
4300
+ sinfo = btrfs_find_space_info(fs_info, flags);
78384301 btrfs_err(fs_info,
78394302 "allocation failed flags %llu, wanted %llu",
78404303 flags, num_bytes);
78414304 if (sinfo)
7842
- dump_space_info(fs_info, sinfo, num_bytes, 1);
4305
+ btrfs_dump_space_info(fs_info, sinfo,
4306
+ num_bytes, 1);
78434307 }
78444308 }
78454309
78464310 return ret;
78474311 }
78484312
7849
-static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7850
- u64 start, u64 len,
7851
- int pin, int delalloc)
4313
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4314
+ u64 start, u64 len, int delalloc)
78524315 {
7853
- struct btrfs_block_group_cache *cache;
7854
- int ret = 0;
4316
+ struct btrfs_block_group *cache;
78554317
78564318 cache = btrfs_lookup_block_group(fs_info, start);
78574319 if (!cache) {
....@@ -7860,30 +4322,30 @@
78604322 return -ENOSPC;
78614323 }
78624324
7863
- if (pin)
7864
- pin_down_extent(fs_info, cache, start, len, 1);
7865
- else {
7866
- if (btrfs_test_opt(fs_info, DISCARD))
7867
- ret = btrfs_discard_extent(fs_info, start, len, NULL);
7868
- btrfs_add_free_space(cache, start, len);
7869
- btrfs_free_reserved_bytes(cache, len, delalloc);
7870
- trace_btrfs_reserved_extent_free(fs_info, start, len);
7871
- }
4325
+ btrfs_add_free_space(cache, start, len);
4326
+ btrfs_free_reserved_bytes(cache, len, delalloc);
4327
+ trace_btrfs_reserved_extent_free(fs_info, start, len);
78724328
78734329 btrfs_put_block_group(cache);
4330
+ return 0;
4331
+}
4332
+
4333
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
4334
+ u64 len)
4335
+{
4336
+ struct btrfs_block_group *cache;
4337
+ int ret = 0;
4338
+
4339
+ cache = btrfs_lookup_block_group(trans->fs_info, start);
4340
+ if (!cache) {
4341
+ btrfs_err(trans->fs_info, "unable to find block group for %llu",
4342
+ start);
4343
+ return -ENOSPC;
4344
+ }
4345
+
4346
+ ret = pin_down_extent(trans, cache, start, len, 1);
4347
+ btrfs_put_block_group(cache);
78744348 return ret;
7875
-}
7876
-
7877
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7878
- u64 start, u64 len, int delalloc)
7879
-{
7880
- return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
7881
-}
7882
-
7883
-int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
7884
- u64 start, u64 len)
7885
-{
7886
- return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
78874349 }
78884350
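
The hunk above replaces the old __btrfs_free_reserved_extent(), which multiplexed on an int pin flag, with two single-purpose entry points: btrfs_free_reserved_extent() returns the range to the free-space cache, while the new btrfs_pin_reserved_extent() takes a transaction handle and pins the range instead. Below is a loose sketch of that refactor shape; struct range_pool, release_to_pool() and pin_range() are invented names, not btrfs APIs.

/* Illustrative sketch only -- the pool type and helpers are invented. */
#include <stdio.h>

struct range_pool { long free_bytes; long pinned_bytes; };

/* Two single-purpose helpers instead of one function with an "int pin" flag. */
static void release_to_pool(struct range_pool *p, long len)
{
	p->free_bytes += len;		/* hand the bytes straight back */
}

static void pin_range(struct range_pool *p, long len)
{
	p->pinned_bytes += len;		/* hold the bytes until "commit" */
}

int main(void)
{
	struct range_pool pool = { 0, 0 };

	release_to_pool(&pool, 4096);
	pin_range(&pool, 8192);
	printf("free=%ld pinned=%ld\n", pool.free_bytes, pool.pinned_bytes);
	return 0;
}

Splitting on intent keeps the flag out of every call site and lets each variant take only the context it actually needs, which is why the pin variant in the patch takes trans while the free variant keeps taking fs_info.
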
78894351 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
....@@ -7950,7 +4412,7 @@
79504412 if (ret)
79514413 return ret;
79524414
7953
- ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
4415
+ ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
79544416 if (ret) { /* -ENOENT, logic error */
79554417 btrfs_err(fs_info, "update block group failed for %llu %llu",
79564418 ins->objectid, ins->offset);
....@@ -8040,8 +4502,8 @@
80404502 if (ret)
80414503 return ret;
80424504
8043
- ret = update_block_group(trans, fs_info, extent_key.objectid,
8044
- fs_info->nodesize, 1);
4505
+ ret = btrfs_update_block_group(trans, extent_key.objectid,
4506
+ fs_info->nodesize, 1);
80454507 if (ret) { /* -ENOENT, logic error */
80464508 btrfs_err(fs_info, "update block group failed for %llu %llu",
80474509 extent_key.objectid, extent_key.offset);
....@@ -8058,20 +4520,16 @@
80584520 u64 offset, u64 ram_bytes,
80594521 struct btrfs_key *ins)
80604522 {
8061
- int ret;
4523
+ struct btrfs_ref generic_ref = { 0 };
80624524
80634525 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
80644526
8065
- btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
8066
- root->root_key.objectid, owner, offset,
8067
- BTRFS_ADD_DELAYED_EXTENT);
4527
+ btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4528
+ ins->objectid, ins->offset, 0);
4529
+ btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
4530
+ btrfs_ref_tree_mod(root->fs_info, &generic_ref);
80684531
8069
- ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
8070
- ins->offset, 0,
8071
- root->root_key.objectid, owner,
8072
- offset, ram_bytes,
8073
- BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8074
- return ret;
4532
+ return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
80754533 }
80764534
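
The lines above are part of this patch's wider conversion to struct btrfs_ref: instead of threading half a dozen scalars into btrfs_add_delayed_data_ref() (and, further down, btrfs_add_delayed_tree_ref()), the reference is described once with btrfs_init_generic_ref() plus a data- or tree-specific initializer and then handed to both the ref-verify hook and the delayed-ref code. The sketch below only mimics that "fill one descriptor, pass it everywhere" shape; the struct layout and helper names are simplified stand-ins, not the real btrfs_ref.

/* Illustrative sketch only -- simplified stand-in for the btrfs_ref pattern. */
#include <stdio.h>

enum ref_action { REF_ADD, REF_DROP };
enum ref_kind   { REF_DATA, REF_TREE };

struct generic_ref {
	enum ref_action action;
	enum ref_kind   kind;
	unsigned long long bytenr, num_bytes, parent;
	unsigned long long root, owner, offset;	/* data refs */
	int level;				/* tree refs */
};

static void init_generic_ref(struct generic_ref *r, enum ref_action action,
			     unsigned long long bytenr,
			     unsigned long long num_bytes,
			     unsigned long long parent)
{
	*r = (struct generic_ref){ .action = action, .bytenr = bytenr,
				   .num_bytes = num_bytes, .parent = parent };
}

static void init_data_ref(struct generic_ref *r, unsigned long long root,
			  unsigned long long owner, unsigned long long offset)
{
	r->kind = REF_DATA;
	r->root = root;
	r->owner = owner;
	r->offset = offset;
}

/* Consumers take one descriptor instead of a long scalar argument list. */
static void queue_delayed_ref(const struct generic_ref *r)
{
	printf("queue %s %s ref: bytenr=%llu len=%llu root=%llu\n",
	       r->action == REF_ADD ? "add" : "drop",
	       r->kind == REF_DATA ? "data" : "tree",
	       r->bytenr, r->num_bytes, r->root);
}

int main(void)
{
	struct generic_ref ref = { 0 };

	init_generic_ref(&ref, REF_ADD, 13631488ULL, 4096ULL, 0);
	init_data_ref(&ref, 5, 257, 0);
	queue_delayed_ref(&ref);
	return 0;
}
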
80774535 /*
....@@ -8085,7 +4543,7 @@
80854543 {
80864544 struct btrfs_fs_info *fs_info = trans->fs_info;
80874545 int ret;
8088
- struct btrfs_block_group_cache *block_group;
4546
+ struct btrfs_block_group *block_group;
80894547 struct btrfs_space_info *space_info;
80904548
80914549 /*
....@@ -8113,13 +4571,16 @@
81134571
81144572 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
81154573 offset, ins, 1);
4574
+ if (ret)
4575
+ btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
81164576 btrfs_put_block_group(block_group);
81174577 return ret;
81184578 }
81194579
81204580 static struct extent_buffer *
81214581 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8122
- u64 bytenr, int level, u64 owner)
4582
+ u64 bytenr, int level, u64 owner,
4583
+ enum btrfs_lock_nesting nest)
81234584 {
81244585 struct btrfs_fs_info *fs_info = root->fs_info;
81254586 struct extent_buffer *buf;
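
The two added lines above make btrfs_alloc_logged_file_extent() pin the extent when alloc_reserved_file_extent() fails during log replay; pinning keeps the range away from the allocator until the transaction settles, which appears to be the intent here (the hunk itself does not spell the rationale out). A loose sketch of that "quarantine on failure instead of leaking or reusing" shape follows, with an invented pool and helpers.

/* Illustrative sketch only -- the pool and helpers are invented. */
#include <stdio.h>
#include <errno.h>

struct pool { long available; long quarantined; };

static int record_extent(int should_fail)
{
	return should_fail ? -EIO : 0;
}

static int replay_extent(struct pool *p, long len, int should_fail)
{
	int ret = record_extent(should_fail);

	if (ret)
		p->quarantined += len;	/* keep the range unusable instead of
					 * silently returning it to the pool */
	return ret;
}

int main(void)
{
	struct pool p = { .available = 1 << 20, .quarantined = 0 };

	replay_extent(&p, 4096, 1);
	printf("available=%ld quarantined=%ld\n", p.available, p.quarantined);
	return 0;
}
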
....@@ -8141,12 +4602,12 @@
81414602 return ERR_PTR(-EUCLEAN);
81424603 }
81434604
8144
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8145
- btrfs_tree_lock(buf);
8146
- clean_tree_block(fs_info, buf);
4605
+ btrfs_set_buffer_lockdep_class(owner, buf, level);
4606
+ __btrfs_tree_lock(buf, nest);
4607
+ btrfs_clean_tree_block(buf);
81474608 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
81484609
8149
- btrfs_set_lock_blocking(buf);
4610
+ btrfs_set_lock_blocking_write(buf);
81504611 set_extent_buffer_uptodate(buf);
81514612
81524613 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
....@@ -8155,13 +4616,13 @@
81554616 btrfs_set_header_generation(buf, trans->transid);
81564617 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
81574618 btrfs_set_header_owner(buf, owner);
8158
- write_extent_buffer_fsid(buf, fs_info->fsid);
4619
+ write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
81594620 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
81604621 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
81614622 buf->log_index = root->log_transid % 2;
81624623 /*
81634624 * we allow two log transactions at a time, use different
8164
- * EXENT bit to differentiate dirty pages.
4625
+ * EXTENT bit to differentiate dirty pages.
81654626 */
81664627 if (buf->log_index == 0)
81674628 set_extent_dirty(&root->dirty_log_pages, buf->start,
....@@ -8179,68 +4640,6 @@
81794640 return buf;
81804641 }
81814642
8182
-static struct btrfs_block_rsv *
8183
-use_block_rsv(struct btrfs_trans_handle *trans,
8184
- struct btrfs_root *root, u32 blocksize)
8185
-{
8186
- struct btrfs_fs_info *fs_info = root->fs_info;
8187
- struct btrfs_block_rsv *block_rsv;
8188
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8189
- int ret;
8190
- bool global_updated = false;
8191
-
8192
- block_rsv = get_block_rsv(trans, root);
8193
-
8194
- if (unlikely(block_rsv->size == 0))
8195
- goto try_reserve;
8196
-again:
8197
- ret = block_rsv_use_bytes(block_rsv, blocksize);
8198
- if (!ret)
8199
- return block_rsv;
8200
-
8201
- if (block_rsv->failfast)
8202
- return ERR_PTR(ret);
8203
-
8204
- if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8205
- global_updated = true;
8206
- update_global_block_rsv(fs_info);
8207
- goto again;
8208
- }
8209
-
8210
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8211
- static DEFINE_RATELIMIT_STATE(_rs,
8212
- DEFAULT_RATELIMIT_INTERVAL * 10,
8213
- /*DEFAULT_RATELIMIT_BURST*/ 1);
8214
- if (__ratelimit(&_rs))
8215
- WARN(1, KERN_DEBUG
8216
- "BTRFS: block rsv returned %d\n", ret);
8217
- }
8218
-try_reserve:
8219
- ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8220
- BTRFS_RESERVE_NO_FLUSH);
8221
- if (!ret)
8222
- return block_rsv;
8223
- /*
8224
- * If we couldn't reserve metadata bytes try and use some from
8225
- * the global reserve if its space type is the same as the global
8226
- * reservation.
8227
- */
8228
- if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8229
- block_rsv->space_info == global_rsv->space_info) {
8230
- ret = block_rsv_use_bytes(global_rsv, blocksize);
8231
- if (!ret)
8232
- return global_rsv;
8233
- }
8234
- return ERR_PTR(ret);
8235
-}
8236
-
8237
-static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8238
- struct btrfs_block_rsv *block_rsv, u32 blocksize)
8239
-{
8240
- block_rsv_add_bytes(block_rsv, blocksize, 0);
8241
- block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8242
-}
8243
-
82444643 /*
82454644 * finds a free extent and does all the dirty work required for allocation
82464645 * returns the tree buffer or an ERR_PTR on error.
....@@ -8250,13 +4649,15 @@
82504649 u64 parent, u64 root_objectid,
82514650 const struct btrfs_disk_key *key,
82524651 int level, u64 hint,
8253
- u64 empty_size)
4652
+ u64 empty_size,
4653
+ enum btrfs_lock_nesting nest)
82544654 {
82554655 struct btrfs_fs_info *fs_info = root->fs_info;
82564656 struct btrfs_key ins;
82574657 struct btrfs_block_rsv *block_rsv;
82584658 struct extent_buffer *buf;
82594659 struct btrfs_delayed_extent_op *extent_op;
4660
+ struct btrfs_ref generic_ref = { 0 };
82604661 u64 flags = 0;
82614662 int ret;
82624663 u32 blocksize = fs_info->nodesize;
....@@ -8265,14 +4666,14 @@
82654666 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
82664667 if (btrfs_is_testing(fs_info)) {
82674668 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8268
- level, root_objectid);
4669
+ level, root_objectid, nest);
82694670 if (!IS_ERR(buf))
82704671 root->alloc_bytenr += blocksize;
82714672 return buf;
82724673 }
82734674 #endif
82744675
8275
- block_rsv = use_block_rsv(trans, root, blocksize);
4676
+ block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
82764677 if (IS_ERR(block_rsv))
82774678 return ERR_CAST(block_rsv);
82784679
....@@ -8282,7 +4683,7 @@
82824683 goto out_unuse;
82834684
82844685 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
8285
- root_objectid);
4686
+ root_objectid, nest);
82864687 if (IS_ERR(buf)) {
82874688 ret = PTR_ERR(buf);
82884689 goto out_free_reserved;
....@@ -8311,14 +4712,12 @@
83114712 extent_op->is_data = false;
83124713 extent_op->level = level;
83134714
8314
- btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
8315
- root_objectid, level, 0,
8316
- BTRFS_ADD_DELAYED_EXTENT);
8317
- ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
8318
- ins.offset, parent,
8319
- root_objectid, level,
8320
- BTRFS_ADD_DELAYED_EXTENT,
8321
- extent_op, NULL, NULL);
4715
+ btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4716
+ ins.objectid, ins.offset, parent);
4717
+ generic_ref.real_root = root->root_key.objectid;
4718
+ btrfs_init_tree_ref(&generic_ref, level, root_objectid);
4719
+ btrfs_ref_tree_mod(fs_info, &generic_ref);
4720
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
83224721 if (ret)
83234722 goto out_free_delayed;
83244723 }
....@@ -8327,11 +4726,12 @@
83274726 out_free_delayed:
83284727 btrfs_free_delayed_extent_op(extent_op);
83294728 out_free_buf:
4729
+ btrfs_tree_unlock(buf);
83304730 free_extent_buffer(buf);
83314731 out_free_reserved:
83324732 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
83334733 out_unuse:
8334
- unuse_block_rsv(fs_info, block_rsv, blocksize);
4734
+ btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
83354735 return ERR_PTR(ret);
83364736 }
83374737
....@@ -8339,6 +4739,8 @@
83394739 u64 refs[BTRFS_MAX_LEVEL];
83404740 u64 flags[BTRFS_MAX_LEVEL];
83414741 struct btrfs_key update_progress;
4742
+ struct btrfs_key drop_progress;
4743
+ int drop_level;
83424744 int stage;
83434745 int level;
83444746 int shared_level;
....@@ -8346,6 +4748,7 @@
83464748 int keep_locks;
83474749 int reada_slot;
83484750 int reada_count;
4751
+ int restarted;
83494752 };
83504753
83514754 #define DROP_REFERENCE 1
....@@ -8490,8 +4893,7 @@
84904893 BUG_ON(ret); /* -ENOMEM */
84914894 ret = btrfs_dec_ref(trans, root, eb, 0);
84924895 BUG_ON(ret); /* -ENOMEM */
8493
- ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
8494
- eb->len, flag,
4896
+ ret = btrfs_set_disk_extent_flags(trans, eb, flag,
84954897 btrfs_header_level(eb), 0);
84964898 BUG_ON(ret); /* -ENOMEM */
84974899 wc->flags[level] |= flag;
....@@ -8506,6 +4908,33 @@
85064908 path->locks[level] = 0;
85074909 }
85084910 return 0;
4911
+}
4912
+
4913
+/*
4914
+ * This is used to verify a ref exists for this root to deal with a bug where we
4915
+ * would have a drop_progress key that hadn't been updated properly.
4916
+ */
4917
+static int check_ref_exists(struct btrfs_trans_handle *trans,
4918
+ struct btrfs_root *root, u64 bytenr, u64 parent,
4919
+ int level)
4920
+{
4921
+ struct btrfs_path *path;
4922
+ struct btrfs_extent_inline_ref *iref;
4923
+ int ret;
4924
+
4925
+ path = btrfs_alloc_path();
4926
+ if (!path)
4927
+ return -ENOMEM;
4928
+
4929
+ ret = lookup_extent_backref(trans, path, &iref, bytenr,
4930
+ root->fs_info->nodesize, parent,
4931
+ root->root_key.objectid, level, 0);
4932
+ btrfs_free_path(path);
4933
+ if (ret == -ENOENT)
4934
+ return 0;
4935
+ if (ret < 0)
4936
+ return ret;
4937
+ return 1;
85094938 }
85104939
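
check_ref_exists() above backs the new wc->restarted handling used a few hunks below: when a snapshot drop resumes from a possibly stale drop_progress key, the walker first confirms its own backref for the child block still exists, skips the block if it does not, and clears ->restarted once a lookup succeeds. A loose sketch of that "verify before re-deleting" idea follows, using an invented refcount array in place of the extent tree.

/* Illustrative sketch only -- an array stands in for the extent backrefs. */
#include <stdio.h>
#include <stdbool.h>

#define NR_BLOCKS 8

static int refs[NR_BLOCKS] = { 1, 1, 0, 1, 1, 1, 1, 1 }; /* block 2 already dropped */

static bool ref_exists(int block) { return refs[block] > 0; }

static void drop_from(int start, bool restarted)
{
	for (int b = start; b < NR_BLOCKS; b++) {
		if (restarted) {
			/* A stale cursor may point at work that already
			 * happened; skip anything whose ref is gone instead
			 * of dropping it twice. */
			if (!ref_exists(b)) {
				printf("block %d already dropped, skipping\n", b);
				continue;
			}
			restarted = false;	/* from here on the cursor is trusted */
		}
		refs[b]--;
		printf("dropped block %d\n", b);
	}
}

int main(void)
{
	drop_from(2, true);	/* pretend we resumed at a stale position */
	return 0;
}
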
85114940 /*
....@@ -8530,9 +4959,9 @@
85304959 u64 bytenr;
85314960 u64 generation;
85324961 u64 parent;
8533
- u32 blocksize;
85344962 struct btrfs_key key;
85354963 struct btrfs_key first_key;
4964
+ struct btrfs_ref ref = { 0 };
85364965 struct extent_buffer *next;
85374966 int level = wc->level;
85384967 int reada = 0;
....@@ -8555,7 +4984,6 @@
85554984 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
85564985 btrfs_node_key_to_cpu(path->nodes[level], &first_key,
85574986 path->slots[level]);
8558
- blocksize = fs_info->nodesize;
85594987
85604988 next = find_extent_buffer(fs_info, bytenr);
85614989 if (!next) {
....@@ -8568,7 +4996,7 @@
85684996 reada = 1;
85694997 }
85704998 btrfs_tree_lock(next);
8571
- btrfs_set_lock_blocking(next);
4999
+ btrfs_set_lock_blocking_write(next);
85725000
85735001 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
85745002 &wc->refs[level - 1],
....@@ -8628,7 +5056,7 @@
86285056 return -EIO;
86295057 }
86305058 btrfs_tree_lock(next);
8631
- btrfs_set_lock_blocking(next);
5059
+ btrfs_set_lock_blocking_write(next);
86325060 }
86335061
86345062 level--;
....@@ -8664,7 +5092,30 @@
86645092 parent = 0;
86655093 }
86665094
8667
- if (need_account) {
5095
+ /*
5096
+ * If we had a drop_progress we need to verify the refs are set
5097
+ * as expected. If we find our ref then we know that from here
5098
+ * on out everything should be correct, and we can clear the
5099
+ * ->restarted flag.
5100
+ */
5101
+ if (wc->restarted) {
5102
+ ret = check_ref_exists(trans, root, bytenr, parent,
5103
+ level - 1);
5104
+ if (ret < 0)
5105
+ goto out_unlock;
5106
+ if (ret == 0)
5107
+ goto no_delete;
5108
+ ret = 0;
5109
+ wc->restarted = 0;
5110
+ }
5111
+
5112
+ /*
5113
+ * Reloc tree doesn't contribute to qgroup numbers, and we have
5114
+ * already accounted them at merge time (replace_path),
5115
+ * thus we could skip expensive subtree trace here.
5116
+ */
5117
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5118
+ need_account) {
86685119 ret = btrfs_qgroup_trace_subtree(trans, next,
86695120 generation, level - 1);
86705121 if (ret) {
....@@ -8673,13 +5124,24 @@
86735124 ret);
86745125 }
86755126 }
8676
- ret = btrfs_free_extent(trans, root, bytenr, blocksize,
8677
- parent, root->root_key.objectid,
8678
- level - 1, 0);
5127
+
5128
+ /*
5129
+ * We need to update the next key in our walk control so we can
5130
+ * update the drop_progress key accordingly. We don't care if
5131
+ * find_next_key doesn't find a key because that means we're at
5132
+ * the end and are going to clean up now.
5133
+ */
5134
+ wc->drop_level = level;
5135
+ find_next_key(path, level, &wc->drop_progress);
5136
+
5137
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5138
+ fs_info->nodesize, parent);
5139
+ btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
5140
+ ret = btrfs_free_extent(trans, &ref);
86795141 if (ret)
86805142 goto out_unlock;
86815143 }
8682
-
5144
+no_delete:
86835145 *lookup_info = 1;
86845146 ret = 1;
86855147
....@@ -8734,7 +5196,7 @@
87345196 if (!path->locks[level]) {
87355197 BUG_ON(level == 0);
87365198 btrfs_tree_lock(eb);
8737
- btrfs_set_lock_blocking(eb);
5199
+ btrfs_set_lock_blocking_write(eb);
87385200 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
87395201
87405202 ret = btrfs_lookup_extent_info(trans, fs_info,
....@@ -8765,21 +5227,23 @@
87655227 else
87665228 ret = btrfs_dec_ref(trans, root, eb, 0);
87675229 BUG_ON(ret); /* -ENOMEM */
8768
- ret = btrfs_qgroup_trace_leaf_items(trans, eb);
8769
- if (ret) {
8770
- btrfs_err_rl(fs_info,
8771
- "error %d accounting leaf items. Quota is out of sync, rescan required.",
5230
+ if (is_fstree(root->root_key.objectid)) {
5231
+ ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5232
+ if (ret) {
5233
+ btrfs_err_rl(fs_info,
5234
+ "error %d accounting leaf items, quota is out of sync, rescan required",
87725235 ret);
5236
+ }
87735237 }
87745238 }
8775
- /* make block locked assertion in clean_tree_block happy */
5239
+ /* make block locked assertion in btrfs_clean_tree_block happy */
87765240 if (!path->locks[level] &&
87775241 btrfs_header_generation(eb) == trans->transid) {
87785242 btrfs_tree_lock(eb);
8779
- btrfs_set_lock_blocking(eb);
5243
+ btrfs_set_lock_blocking_write(eb);
87805244 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
87815245 }
8782
- clean_tree_block(fs_info, eb);
5246
+ btrfs_clean_tree_block(eb);
87835247 }
87845248
87855249 if (eb == root->node) {
....@@ -8887,9 +5351,7 @@
88875351 *
88885352 * If called with for_reloc == 0, may exit early with -EAGAIN
88895353 */
8890
-int btrfs_drop_snapshot(struct btrfs_root *root,
8891
- struct btrfs_block_rsv *block_rsv, int update_ref,
8892
- int for_reloc)
5354
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
88935355 {
88945356 struct btrfs_fs_info *fs_info = root->fs_info;
88955357 struct btrfs_path *path;
....@@ -8903,7 +5365,7 @@
89035365 int level;
89045366 bool root_dropped = false;
89055367
8906
- btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
5368
+ btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
89075369
89085370 path = btrfs_alloc_path();
89095371 if (!path) {
....@@ -8918,7 +5380,14 @@
89185380 goto out;
89195381 }
89205382
8921
- trans = btrfs_start_transaction(tree_root, 0);
5383
+ /*
5384
+ * Use join to avoid potential EINTR from transaction start. See
5385
+ * wait_reserve_ticket and the whole reservation callchain.
5386
+ */
5387
+ if (for_reloc)
5388
+ trans = btrfs_join_transaction(tree_root);
5389
+ else
5390
+ trans = btrfs_start_transaction(tree_root, 0);
89225391 if (IS_ERR(trans)) {
89235392 err = PTR_ERR(trans);
89245393 goto out_free;
....@@ -8928,13 +5397,19 @@
89285397 if (err)
89295398 goto out_end_trans;
89305399
8931
- if (block_rsv)
8932
- trans->block_rsv = block_rsv;
8933
-
5400
+ /*
5401
+ * This will help us catch people modifying the fs tree while we're
5402
+ * dropping it. It is unsafe to mess with the fs tree while it's being
5403
+ * dropped as we unlock the root node and parent nodes as we walk down
5404
+ * the tree, assuming nothing will change. If something does change
5405
+ * then we'll have stale information and drop references to blocks we've
5406
+ * already dropped.
5407
+ */
5408
+ set_bit(BTRFS_ROOT_DELETING, &root->state);
89345409 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
89355410 level = btrfs_header_level(root->node);
89365411 path->nodes[level] = btrfs_lock_root_node(root);
8937
- btrfs_set_lock_blocking(path->nodes[level]);
5412
+ btrfs_set_lock_blocking_write(path->nodes[level]);
89385413 path->slots[level] = 0;
89395414 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
89405415 memset(&wc->update_progress, 0,
....@@ -8964,7 +5439,7 @@
89645439 level = btrfs_header_level(root->node);
89655440 while (1) {
89665441 btrfs_tree_lock(path->nodes[level]);
8967
- btrfs_set_lock_blocking(path->nodes[level]);
5442
+ btrfs_set_lock_blocking_write(path->nodes[level]);
89685443 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
89695444
89705445 ret = btrfs_lookup_extent_info(trans, fs_info,
....@@ -8987,6 +5462,7 @@
89875462 }
89885463 }
89895464
5465
+ wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
89905466 wc->level = level;
89915467 wc->shared_level = -1;
89925468 wc->stage = DROP_REFERENCE;
....@@ -9014,12 +5490,14 @@
90145490 }
90155491
90165492 if (wc->stage == DROP_REFERENCE) {
9017
- level = wc->level;
9018
- btrfs_node_key(path->nodes[level],
9019
- &root_item->drop_progress,
9020
- path->slots[level]);
9021
- root_item->drop_level = level;
5493
+ wc->drop_level = wc->level;
5494
+ btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5495
+ &wc->drop_progress,
5496
+ path->slots[wc->drop_level]);
90225497 }
5498
+ btrfs_cpu_key_to_disk(&root_item->drop_progress,
5499
+ &wc->drop_progress);
5500
+ root_item->drop_level = wc->drop_level;
90235501
90245502 BUG_ON(wc->level == 0);
90255503 if (btrfs_should_end_transaction(trans) ||
....@@ -9041,13 +5519,19 @@
90415519 goto out_free;
90425520 }
90435521
9044
- trans = btrfs_start_transaction(tree_root, 0);
5522
+ /*
5523
+ * Use join to avoid potential EINTR from transaction
5524
+ * start. See wait_reserve_ticket and the whole
5525
+ * reservation callchain.
5526
+ */
5527
+ if (for_reloc)
5528
+ trans = btrfs_join_transaction(tree_root);
5529
+ else
5530
+ trans = btrfs_start_transaction(tree_root, 0);
90455531 if (IS_ERR(trans)) {
90465532 err = PTR_ERR(trans);
90475533 goto out_free;
90485534 }
9049
- if (block_rsv)
9050
- trans->block_rsv = block_rsv;
90515535 }
90525536 }
90535537 btrfs_release_path(path);
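
Taken together, the drop_progress changes keep the resume cursor as a CPU key in walk_control (wc->drop_progress and wc->drop_level), convert it to a disk key only when it is copied into the root item, and refresh it every time the loop above decides to end and rejoin the transaction, so an interrupted drop can continue from roughly where it stopped. The sketch below shows that checkpoint-per-batch shape; save_checkpoint() and the batch size are invented stand-ins for updating the root item inside a transaction.

/* Illustrative sketch only -- save_checkpoint() stands in for persisting
 * the drop_progress key into the root item inside a transaction. */
#include <stdio.h>

#define TOTAL_ITEMS	10
#define BATCH_SIZE	3

static int checkpoint = 0;	/* pretend this survives a crash/remount */

static void save_checkpoint(int next)
{
	checkpoint = next;
	printf("checkpoint persisted at item %d\n", next);
}

static void drop_all(void)
{
	int done_in_batch = 0;

	for (int i = checkpoint; i < TOTAL_ITEMS; i++) {
		printf("dropping item %d\n", i);
		if (++done_in_batch == BATCH_SIZE) {
			/* End the "transaction": record where to resume first. */
			save_checkpoint(i + 1);
			done_in_batch = 0;
		}
	}
	save_checkpoint(TOTAL_ITEMS);	/* all done */
}

int main(void)
{
	drop_all();
	return 0;
}
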
....@@ -9079,13 +5563,18 @@
90795563 }
90805564 }
90815565
9082
- if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
5566
+ /*
5567
+ * This subvolume is going to be completely dropped, and won't be
5568
+ * recorded as dirty roots, thus pertrans meta rsv will not be freed at
5569
+ * commit transaction time. So free it here manually.
5570
+ */
5571
+ btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5572
+ btrfs_qgroup_free_meta_all_pertrans(root);
5573
+
5574
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
90835575 btrfs_add_dropped_root(trans, root);
9084
- } else {
9085
- free_extent_buffer(root->node);
9086
- free_extent_buffer(root->commit_root);
9087
- btrfs_put_fs_root(root);
9088
- }
5576
+ else
5577
+ btrfs_put_root(root);
90895578 root_dropped = true;
90905579 out_end_trans:
90915580 btrfs_end_transaction_throttle(trans);
....@@ -9138,7 +5627,7 @@
91385627
91395628 btrfs_assert_tree_locked(parent);
91405629 parent_level = btrfs_header_level(parent);
9141
- extent_buffer_get(parent);
5630
+ atomic_inc(&parent->refs);
91425631 path->nodes[parent_level] = parent;
91435632 path->slots[parent_level] = btrfs_header_nritems(parent);
91445633
....@@ -9176,184 +5665,13 @@
91765665 return ret;
91775666 }
91785667
9179
-static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9180
-{
9181
- u64 num_devices;
9182
- u64 stripped;
9183
-
9184
- /*
9185
- * if restripe for this chunk_type is on pick target profile and
9186
- * return, otherwise do the usual balance
9187
- */
9188
- stripped = get_restripe_target(fs_info, flags);
9189
- if (stripped)
9190
- return extended_to_chunk(stripped);
9191
-
9192
- num_devices = fs_info->fs_devices->rw_devices;
9193
-
9194
- stripped = BTRFS_BLOCK_GROUP_RAID0 |
9195
- BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9196
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9197
-
9198
- if (num_devices == 1) {
9199
- stripped |= BTRFS_BLOCK_GROUP_DUP;
9200
- stripped = flags & ~stripped;
9201
-
9202
- /* turn raid0 into single device chunks */
9203
- if (flags & BTRFS_BLOCK_GROUP_RAID0)
9204
- return stripped;
9205
-
9206
- /* turn mirroring into duplication */
9207
- if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9208
- BTRFS_BLOCK_GROUP_RAID10))
9209
- return stripped | BTRFS_BLOCK_GROUP_DUP;
9210
- } else {
9211
- /* they already had raid on here, just return */
9212
- if (flags & stripped)
9213
- return flags;
9214
-
9215
- stripped |= BTRFS_BLOCK_GROUP_DUP;
9216
- stripped = flags & ~stripped;
9217
-
9218
- /* switch duplicated blocks with raid1 */
9219
- if (flags & BTRFS_BLOCK_GROUP_DUP)
9220
- return stripped | BTRFS_BLOCK_GROUP_RAID1;
9221
-
9222
- /* this is drive concat, leave it alone */
9223
- }
9224
-
9225
- return flags;
9226
-}
9227
-
9228
-static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9229
-{
9230
- struct btrfs_space_info *sinfo = cache->space_info;
9231
- u64 num_bytes;
9232
- u64 min_allocable_bytes;
9233
- int ret = -ENOSPC;
9234
-
9235
- /*
9236
- * We need some metadata space and system metadata space for
9237
- * allocating chunks in some corner cases until we force to set
9238
- * it to be readonly.
9239
- */
9240
- if ((sinfo->flags &
9241
- (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9242
- !force)
9243
- min_allocable_bytes = SZ_1M;
9244
- else
9245
- min_allocable_bytes = 0;
9246
-
9247
- spin_lock(&sinfo->lock);
9248
- spin_lock(&cache->lock);
9249
-
9250
- if (cache->ro) {
9251
- cache->ro++;
9252
- ret = 0;
9253
- goto out;
9254
- }
9255
-
9256
- num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9257
- cache->bytes_super - btrfs_block_group_used(&cache->item);
9258
-
9259
- if (btrfs_space_info_used(sinfo, true) + num_bytes +
9260
- min_allocable_bytes <= sinfo->total_bytes) {
9261
- sinfo->bytes_readonly += num_bytes;
9262
- cache->ro++;
9263
- list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9264
- ret = 0;
9265
- }
9266
-out:
9267
- spin_unlock(&cache->lock);
9268
- spin_unlock(&sinfo->lock);
9269
- return ret;
9270
-}
9271
-
9272
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
9273
-
9274
-{
9275
- struct btrfs_fs_info *fs_info = cache->fs_info;
9276
- struct btrfs_trans_handle *trans;
9277
- u64 alloc_flags;
9278
- int ret;
9279
-
9280
-again:
9281
- trans = btrfs_join_transaction(fs_info->extent_root);
9282
- if (IS_ERR(trans))
9283
- return PTR_ERR(trans);
9284
-
9285
- /*
9286
- * we're not allowed to set block groups readonly after the dirty
9287
- * block groups cache has started writing. If it already started,
9288
- * back off and let this transaction commit
9289
- */
9290
- mutex_lock(&fs_info->ro_block_group_mutex);
9291
- if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9292
- u64 transid = trans->transid;
9293
-
9294
- mutex_unlock(&fs_info->ro_block_group_mutex);
9295
- btrfs_end_transaction(trans);
9296
-
9297
- ret = btrfs_wait_for_commit(fs_info, transid);
9298
- if (ret)
9299
- return ret;
9300
- goto again;
9301
- }
9302
-
9303
- /*
9304
- * if we are changing raid levels, try to allocate a corresponding
9305
- * block group with the new raid level.
9306
- */
9307
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
9308
- if (alloc_flags != cache->flags) {
9309
- ret = do_chunk_alloc(trans, alloc_flags,
9310
- CHUNK_ALLOC_FORCE);
9311
- /*
9312
- * ENOSPC is allowed here, we may have enough space
9313
- * already allocated at the new raid level to
9314
- * carry on
9315
- */
9316
- if (ret == -ENOSPC)
9317
- ret = 0;
9318
- if (ret < 0)
9319
- goto out;
9320
- }
9321
-
9322
- ret = inc_block_group_ro(cache, 0);
9323
- if (!ret)
9324
- goto out;
9325
- alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9326
- ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9327
- if (ret < 0)
9328
- goto out;
9329
- ret = inc_block_group_ro(cache, 0);
9330
-out:
9331
- if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9332
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
9333
- mutex_lock(&fs_info->chunk_mutex);
9334
- check_system_chunk(trans, alloc_flags);
9335
- mutex_unlock(&fs_info->chunk_mutex);
9336
- }
9337
- mutex_unlock(&fs_info->ro_block_group_mutex);
9338
-
9339
- btrfs_end_transaction(trans);
9340
- return ret;
9341
-}
9342
-
9343
-int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
9344
-{
9345
- u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
9346
-
9347
- return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9348
-}
9349
-
93505668 /*
93515669 * helper to account the unused space of all the readonly block group in the
93525670 * space_info. takes mirrors into account.
93535671 */
93545672 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
93555673 {
9356
- struct btrfs_block_group_cache *block_group;
5674
+ struct btrfs_block_group *block_group;
93575675 u64 free_bytes = 0;
93585676 int factor;
93595677
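
btrfs_account_ro_block_groups_free_space() continues in the next hunk: for each read-only block group it adds (length - used) scaled by btrfs_bg_type_to_factor(), so profiles that keep two copies count their unused space once per copy. A small worked example of that arithmetic follows; the group sizes and factors below are made up.

/* Illustrative sketch only -- the groups and factors below are made up. */
#include <stdio.h>

struct ro_group { unsigned long long length, used; int factor; };

int main(void)
{
	/* factor 1 ~ single/raid0-style, factor 2 ~ raid1/dup-style mirroring */
	struct ro_group groups[] = {
		{ .length = 1024ULL << 20, .used = 256ULL << 20, .factor = 1 },
		{ .length = 1024ULL << 20, .used = 512ULL << 20, .factor = 2 },
	};
	unsigned long long free_bytes = 0;

	for (unsigned int i = 0; i < sizeof(groups) / sizeof(groups[0]); i++)
		free_bytes += (groups[i].length - groups[i].used) * groups[i].factor;

	/* (1024-256)*1 + (1024-512)*2 = 768 + 1024 = 1792 MiB */
	printf("ro free bytes = %llu MiB\n", free_bytes >> 20);
	return 0;
}
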
....@@ -9371,1412 +5689,14 @@
93715689 }
93725690
93735691 factor = btrfs_bg_type_to_factor(block_group->flags);
9374
- free_bytes += (block_group->key.offset -
9375
- btrfs_block_group_used(&block_group->item)) *
9376
- factor;
5692
+ free_bytes += (block_group->length -
5693
+ block_group->used) * factor;
93775694
93785695 spin_unlock(&block_group->lock);
93795696 }
93805697 spin_unlock(&sinfo->lock);
93815698
93825699 return free_bytes;
9383
-}
9384
-
9385
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9386
-{
9387
- struct btrfs_space_info *sinfo = cache->space_info;
9388
- u64 num_bytes;
9389
-
9390
- BUG_ON(!cache->ro);
9391
-
9392
- spin_lock(&sinfo->lock);
9393
- spin_lock(&cache->lock);
9394
- if (!--cache->ro) {
9395
- num_bytes = cache->key.offset - cache->reserved -
9396
- cache->pinned - cache->bytes_super -
9397
- btrfs_block_group_used(&cache->item);
9398
- sinfo->bytes_readonly -= num_bytes;
9399
- list_del_init(&cache->ro_list);
9400
- }
9401
- spin_unlock(&cache->lock);
9402
- spin_unlock(&sinfo->lock);
9403
-}
9404
-
9405
-/*
9406
- * checks to see if its even possible to relocate this block group.
9407
- *
9408
- * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9409
- * ok to go ahead and try.
9410
- */
9411
-int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9412
-{
9413
- struct btrfs_root *root = fs_info->extent_root;
9414
- struct btrfs_block_group_cache *block_group;
9415
- struct btrfs_space_info *space_info;
9416
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9417
- struct btrfs_device *device;
9418
- struct btrfs_trans_handle *trans;
9419
- u64 min_free;
9420
- u64 dev_min = 1;
9421
- u64 dev_nr = 0;
9422
- u64 target;
9423
- int debug;
9424
- int index;
9425
- int full = 0;
9426
- int ret = 0;
9427
-
9428
- debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9429
-
9430
- block_group = btrfs_lookup_block_group(fs_info, bytenr);
9431
-
9432
- /* odd, couldn't find the block group, leave it alone */
9433
- if (!block_group) {
9434
- if (debug)
9435
- btrfs_warn(fs_info,
9436
- "can't find block group for bytenr %llu",
9437
- bytenr);
9438
- return -1;
9439
- }
9440
-
9441
- min_free = btrfs_block_group_used(&block_group->item);
9442
-
9443
- /* no bytes used, we're good */
9444
- if (!min_free)
9445
- goto out;
9446
-
9447
- space_info = block_group->space_info;
9448
- spin_lock(&space_info->lock);
9449
-
9450
- full = space_info->full;
9451
-
9452
- /*
9453
- * if this is the last block group we have in this space, we can't
9454
- * relocate it unless we're able to allocate a new chunk below.
9455
- *
9456
- * Otherwise, we need to make sure we have room in the space to handle
9457
- * all of the extents from this block group. If we can, we're good
9458
- */
9459
- if ((space_info->total_bytes != block_group->key.offset) &&
9460
- (btrfs_space_info_used(space_info, false) + min_free <
9461
- space_info->total_bytes)) {
9462
- spin_unlock(&space_info->lock);
9463
- goto out;
9464
- }
9465
- spin_unlock(&space_info->lock);
9466
-
9467
- /*
9468
- * ok we don't have enough space, but maybe we have free space on our
9469
- * devices to allocate new chunks for relocation, so loop through our
9470
- * alloc devices and guess if we have enough space. if this block
9471
- * group is going to be restriped, run checks against the target
9472
- * profile instead of the current one.
9473
- */
9474
- ret = -1;
9475
-
9476
- /*
9477
- * index:
9478
- * 0: raid10
9479
- * 1: raid1
9480
- * 2: dup
9481
- * 3: raid0
9482
- * 4: single
9483
- */
9484
- target = get_restripe_target(fs_info, block_group->flags);
9485
- if (target) {
9486
- index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9487
- } else {
9488
- /*
9489
- * this is just a balance, so if we were marked as full
9490
- * we know there is no space for a new chunk
9491
- */
9492
- if (full) {
9493
- if (debug)
9494
- btrfs_warn(fs_info,
9495
- "no space to alloc new chunk for block group %llu",
9496
- block_group->key.objectid);
9497
- goto out;
9498
- }
9499
-
9500
- index = btrfs_bg_flags_to_raid_index(block_group->flags);
9501
- }
9502
-
9503
- if (index == BTRFS_RAID_RAID10) {
9504
- dev_min = 4;
9505
- /* Divide by 2 */
9506
- min_free >>= 1;
9507
- } else if (index == BTRFS_RAID_RAID1) {
9508
- dev_min = 2;
9509
- } else if (index == BTRFS_RAID_DUP) {
9510
- /* Multiply by 2 */
9511
- min_free <<= 1;
9512
- } else if (index == BTRFS_RAID_RAID0) {
9513
- dev_min = fs_devices->rw_devices;
9514
- min_free = div64_u64(min_free, dev_min);
9515
- }
9516
-
9517
- /* We need to do this so that we can look at pending chunks */
9518
- trans = btrfs_join_transaction(root);
9519
- if (IS_ERR(trans)) {
9520
- ret = PTR_ERR(trans);
9521
- goto out;
9522
- }
9523
-
9524
- mutex_lock(&fs_info->chunk_mutex);
9525
- list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9526
- u64 dev_offset;
9527
-
9528
- /*
9529
- * check to make sure we can actually find a chunk with enough
9530
- * space to fit our block group in.
9531
- */
9532
- if (device->total_bytes > device->bytes_used + min_free &&
9533
- !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
9534
- ret = find_free_dev_extent(trans, device, min_free,
9535
- &dev_offset, NULL);
9536
- if (!ret)
9537
- dev_nr++;
9538
-
9539
- if (dev_nr >= dev_min)
9540
- break;
9541
-
9542
- ret = -1;
9543
- }
9544
- }
9545
- if (debug && ret == -1)
9546
- btrfs_warn(fs_info,
9547
- "no space to allocate a new chunk for block group %llu",
9548
- block_group->key.objectid);
9549
- mutex_unlock(&fs_info->chunk_mutex);
9550
- btrfs_end_transaction(trans);
9551
-out:
9552
- btrfs_put_block_group(block_group);
9553
- return ret;
9554
-}
9555
-
9556
-static int find_first_block_group(struct btrfs_fs_info *fs_info,
9557
- struct btrfs_path *path,
9558
- struct btrfs_key *key)
9559
-{
9560
- struct btrfs_root *root = fs_info->extent_root;
9561
- int ret = 0;
9562
- struct btrfs_key found_key;
9563
- struct extent_buffer *leaf;
9564
- struct btrfs_block_group_item bg;
9565
- u64 flags;
9566
- int slot;
9567
-
9568
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9569
- if (ret < 0)
9570
- goto out;
9571
-
9572
- while (1) {
9573
- slot = path->slots[0];
9574
- leaf = path->nodes[0];
9575
- if (slot >= btrfs_header_nritems(leaf)) {
9576
- ret = btrfs_next_leaf(root, path);
9577
- if (ret == 0)
9578
- continue;
9579
- if (ret < 0)
9580
- goto out;
9581
- break;
9582
- }
9583
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
9584
-
9585
- if (found_key.objectid >= key->objectid &&
9586
- found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9587
- struct extent_map_tree *em_tree;
9588
- struct extent_map *em;
9589
-
9590
- em_tree = &root->fs_info->mapping_tree.map_tree;
9591
- read_lock(&em_tree->lock);
9592
- em = lookup_extent_mapping(em_tree, found_key.objectid,
9593
- found_key.offset);
9594
- read_unlock(&em_tree->lock);
9595
- if (!em) {
9596
- btrfs_err(fs_info,
9597
- "logical %llu len %llu found bg but no related chunk",
9598
- found_key.objectid, found_key.offset);
9599
- ret = -ENOENT;
9600
- } else if (em->start != found_key.objectid ||
9601
- em->len != found_key.offset) {
9602
- btrfs_err(fs_info,
9603
- "block group %llu len %llu mismatch with chunk %llu len %llu",
9604
- found_key.objectid, found_key.offset,
9605
- em->start, em->len);
9606
- ret = -EUCLEAN;
9607
- } else {
9608
- read_extent_buffer(leaf, &bg,
9609
- btrfs_item_ptr_offset(leaf, slot),
9610
- sizeof(bg));
9611
- flags = btrfs_block_group_flags(&bg) &
9612
- BTRFS_BLOCK_GROUP_TYPE_MASK;
9613
-
9614
- if (flags != (em->map_lookup->type &
9615
- BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9616
- btrfs_err(fs_info,
9617
-"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
9618
- found_key.objectid,
9619
- found_key.offset, flags,
9620
- (BTRFS_BLOCK_GROUP_TYPE_MASK &
9621
- em->map_lookup->type));
9622
- ret = -EUCLEAN;
9623
- } else {
9624
- ret = 0;
9625
- }
9626
- }
9627
- free_extent_map(em);
9628
- goto out;
9629
- }
9630
- path->slots[0]++;
9631
- }
9632
-out:
9633
- return ret;
9634
-}
9635
-
9636
-void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9637
-{
9638
- struct btrfs_block_group_cache *block_group;
9639
- u64 last = 0;
9640
-
9641
- while (1) {
9642
- struct inode *inode;
9643
-
9644
- block_group = btrfs_lookup_first_block_group(info, last);
9645
- while (block_group) {
9646
- wait_block_group_cache_done(block_group);
9647
- spin_lock(&block_group->lock);
9648
- if (block_group->iref)
9649
- break;
9650
- spin_unlock(&block_group->lock);
9651
- block_group = next_block_group(info, block_group);
9652
- }
9653
- if (!block_group) {
9654
- if (last == 0)
9655
- break;
9656
- last = 0;
9657
- continue;
9658
- }
9659
-
9660
- inode = block_group->inode;
9661
- block_group->iref = 0;
9662
- block_group->inode = NULL;
9663
- spin_unlock(&block_group->lock);
9664
- ASSERT(block_group->io_ctl.inode == NULL);
9665
- iput(inode);
9666
- last = block_group->key.objectid + block_group->key.offset;
9667
- btrfs_put_block_group(block_group);
9668
- }
9669
-}
9670
-
9671
-/*
9672
- * Must be called only after stopping all workers, since we could have block
9673
- * group caching kthreads running, and therefore they could race with us if we
9674
- * freed the block groups before stopping them.
9675
- */
9676
-int btrfs_free_block_groups(struct btrfs_fs_info *info)
9677
-{
9678
- struct btrfs_block_group_cache *block_group;
9679
- struct btrfs_space_info *space_info;
9680
- struct btrfs_caching_control *caching_ctl;
9681
- struct rb_node *n;
9682
-
9683
- down_write(&info->commit_root_sem);
9684
- while (!list_empty(&info->caching_block_groups)) {
9685
- caching_ctl = list_entry(info->caching_block_groups.next,
9686
- struct btrfs_caching_control, list);
9687
- list_del(&caching_ctl->list);
9688
- put_caching_control(caching_ctl);
9689
- }
9690
- up_write(&info->commit_root_sem);
9691
-
9692
- spin_lock(&info->unused_bgs_lock);
9693
- while (!list_empty(&info->unused_bgs)) {
9694
- block_group = list_first_entry(&info->unused_bgs,
9695
- struct btrfs_block_group_cache,
9696
- bg_list);
9697
- list_del_init(&block_group->bg_list);
9698
- btrfs_put_block_group(block_group);
9699
- }
9700
- spin_unlock(&info->unused_bgs_lock);
9701
-
9702
- spin_lock(&info->block_group_cache_lock);
9703
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9704
- block_group = rb_entry(n, struct btrfs_block_group_cache,
9705
- cache_node);
9706
- rb_erase(&block_group->cache_node,
9707
- &info->block_group_cache_tree);
9708
- RB_CLEAR_NODE(&block_group->cache_node);
9709
- spin_unlock(&info->block_group_cache_lock);
9710
-
9711
- down_write(&block_group->space_info->groups_sem);
9712
- list_del(&block_group->list);
9713
- up_write(&block_group->space_info->groups_sem);
9714
-
9715
- /*
9716
- * We haven't cached this block group, which means we could
9717
- * possibly have excluded extents on this block group.
9718
- */
9719
- if (block_group->cached == BTRFS_CACHE_NO ||
9720
- block_group->cached == BTRFS_CACHE_ERROR)
9721
- free_excluded_extents(block_group);
9722
-
9723
- btrfs_remove_free_space_cache(block_group);
9724
- ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
9725
- ASSERT(list_empty(&block_group->dirty_list));
9726
- ASSERT(list_empty(&block_group->io_list));
9727
- ASSERT(list_empty(&block_group->bg_list));
9728
- ASSERT(atomic_read(&block_group->count) == 1);
9729
- btrfs_put_block_group(block_group);
9730
-
9731
- spin_lock(&info->block_group_cache_lock);
9732
- }
9733
- spin_unlock(&info->block_group_cache_lock);
9734
-
9735
- /* now that all the block groups are freed, go through and
9736
- * free all the space_info structs. This is only called during
9737
- * the final stages of unmount, and so we know nobody is
9738
- * using them. We call synchronize_rcu() once before we start,
9739
- * just to be on the safe side.
9740
- */
9741
- synchronize_rcu();
9742
-
9743
- release_global_block_rsv(info);
9744
-
9745
- while (!list_empty(&info->space_info)) {
9746
- int i;
9747
-
9748
- space_info = list_entry(info->space_info.next,
9749
- struct btrfs_space_info,
9750
- list);
9751
-
9752
- /*
9753
- * Do not hide this behind enospc_debug, this is actually
9754
- * important and indicates a real bug if this happens.
9755
- */
9756
- if (WARN_ON(space_info->bytes_pinned > 0 ||
9757
- space_info->bytes_reserved > 0 ||
9758
- space_info->bytes_may_use > 0))
9759
- dump_space_info(info, space_info, 0, 0);
9760
- list_del(&space_info->list);
9761
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9762
- struct kobject *kobj;
9763
- kobj = space_info->block_group_kobjs[i];
9764
- space_info->block_group_kobjs[i] = NULL;
9765
- if (kobj) {
9766
- kobject_del(kobj);
9767
- kobject_put(kobj);
9768
- }
9769
- }
9770
- kobject_del(&space_info->kobj);
9771
- kobject_put(&space_info->kobj);
9772
- }
9773
- return 0;
9774
-}
9775
-
9776
-/* link_block_group will queue up kobjects to add when we're reclaim-safe */
9777
-void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
9778
-{
9779
- struct btrfs_space_info *space_info;
9780
- struct raid_kobject *rkobj;
9781
- LIST_HEAD(list);
9782
- int index;
9783
- int ret = 0;
9784
-
9785
- spin_lock(&fs_info->pending_raid_kobjs_lock);
9786
- list_splice_init(&fs_info->pending_raid_kobjs, &list);
9787
- spin_unlock(&fs_info->pending_raid_kobjs_lock);
9788
-
9789
- list_for_each_entry(rkobj, &list, list) {
9790
- space_info = __find_space_info(fs_info, rkobj->flags);
9791
- index = btrfs_bg_flags_to_raid_index(rkobj->flags);
9792
-
9793
- ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9794
- "%s", get_raid_name(index));
9795
- if (ret) {
9796
- kobject_put(&rkobj->kobj);
9797
- break;
9798
- }
9799
- }
9800
- if (ret)
9801
- btrfs_warn(fs_info,
9802
- "failed to add kobject for block cache, ignoring");
9803
-}
9804
-
9805
-static void link_block_group(struct btrfs_block_group_cache *cache)
9806
-{
9807
- struct btrfs_space_info *space_info = cache->space_info;
9808
- struct btrfs_fs_info *fs_info = cache->fs_info;
9809
- int index = btrfs_bg_flags_to_raid_index(cache->flags);
9810
- bool first = false;
9811
-
9812
- down_write(&space_info->groups_sem);
9813
- if (list_empty(&space_info->block_groups[index]))
9814
- first = true;
9815
- list_add_tail(&cache->list, &space_info->block_groups[index]);
9816
- up_write(&space_info->groups_sem);
9817
-
9818
- if (first) {
9819
- struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9820
- if (!rkobj) {
9821
- btrfs_warn(cache->fs_info,
9822
- "couldn't alloc memory for raid level kobject");
9823
- return;
9824
- }
9825
- rkobj->flags = cache->flags;
9826
- kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9827
-
9828
- spin_lock(&fs_info->pending_raid_kobjs_lock);
9829
- list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
9830
- spin_unlock(&fs_info->pending_raid_kobjs_lock);
9831
- space_info->block_group_kobjs[index] = &rkobj->kobj;
9832
- }
9833
-}
9834
-
9835
-static struct btrfs_block_group_cache *
9836
-btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
9837
- u64 start, u64 size)
9838
-{
9839
- struct btrfs_block_group_cache *cache;
9840
-
9841
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
9842
- if (!cache)
9843
- return NULL;
9844
-
9845
- cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9846
- GFP_NOFS);
9847
- if (!cache->free_space_ctl) {
9848
- kfree(cache);
9849
- return NULL;
9850
- }
9851
-
9852
- cache->key.objectid = start;
9853
- cache->key.offset = size;
9854
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9855
-
9856
- cache->fs_info = fs_info;
9857
- cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
9858
- set_free_space_tree_thresholds(cache);
9859
-
9860
- atomic_set(&cache->count, 1);
9861
- spin_lock_init(&cache->lock);
9862
- init_rwsem(&cache->data_rwsem);
9863
- INIT_LIST_HEAD(&cache->list);
9864
- INIT_LIST_HEAD(&cache->cluster_list);
9865
- INIT_LIST_HEAD(&cache->bg_list);
9866
- INIT_LIST_HEAD(&cache->ro_list);
9867
- INIT_LIST_HEAD(&cache->dirty_list);
9868
- INIT_LIST_HEAD(&cache->io_list);
9869
- btrfs_init_free_space_ctl(cache);
9870
- atomic_set(&cache->trimming, 0);
9871
- mutex_init(&cache->free_space_lock);
9872
- btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
9873
-
9874
- return cache;
9875
-}
9876
-
9877
-
9878
-/*
9879
- * Iterate all chunks and verify that each of them has the corresponding block
9880
- * group
9881
- */
9882
-static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
9883
-{
9884
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
9885
- struct extent_map *em;
9886
- struct btrfs_block_group_cache *bg;
9887
- u64 start = 0;
9888
- int ret = 0;
9889
-
9890
- while (1) {
9891
- read_lock(&map_tree->map_tree.lock);
9892
- /*
9893
- * lookup_extent_mapping will return the first extent map
9894
- * intersecting the range, so setting @len to 1 is enough to
9895
- * get the first chunk.
9896
- */
9897
- em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
9898
- read_unlock(&map_tree->map_tree.lock);
9899
- if (!em)
9900
- break;
9901
-
9902
- bg = btrfs_lookup_block_group(fs_info, em->start);
9903
- if (!bg) {
9904
- btrfs_err(fs_info,
9905
- "chunk start=%llu len=%llu doesn't have corresponding block group",
9906
- em->start, em->len);
9907
- ret = -EUCLEAN;
9908
- free_extent_map(em);
9909
- break;
9910
- }
9911
- if (bg->key.objectid != em->start ||
9912
- bg->key.offset != em->len ||
9913
- (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
9914
- (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9915
- btrfs_err(fs_info,
9916
-"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
9917
- em->start, em->len,
9918
- em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
9919
- bg->key.objectid, bg->key.offset,
9920
- bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
9921
- ret = -EUCLEAN;
9922
- free_extent_map(em);
9923
- btrfs_put_block_group(bg);
9924
- break;
9925
- }
9926
- start = em->start + em->len;
9927
- free_extent_map(em);
9928
- btrfs_put_block_group(bg);
9929
- }
9930
- return ret;
9931
-}
9932
-
9933
-int btrfs_read_block_groups(struct btrfs_fs_info *info)
9934
-{
9935
- struct btrfs_path *path;
9936
- int ret;
9937
- struct btrfs_block_group_cache *cache;
9938
- struct btrfs_space_info *space_info;
9939
- struct btrfs_key key;
9940
- struct btrfs_key found_key;
9941
- struct extent_buffer *leaf;
9942
- int need_clear = 0;
9943
- u64 cache_gen;
9944
- u64 feature;
9945
- int mixed;
9946
-
9947
- feature = btrfs_super_incompat_flags(info->super_copy);
9948
- mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
9949
-
9950
- key.objectid = 0;
9951
- key.offset = 0;
9952
- key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9953
- path = btrfs_alloc_path();
9954
- if (!path)
9955
- return -ENOMEM;
9956
- path->reada = READA_FORWARD;
9957
-
9958
- cache_gen = btrfs_super_cache_generation(info->super_copy);
9959
- if (btrfs_test_opt(info, SPACE_CACHE) &&
9960
- btrfs_super_generation(info->super_copy) != cache_gen)
9961
- need_clear = 1;
9962
- if (btrfs_test_opt(info, CLEAR_CACHE))
9963
- need_clear = 1;
9964
-
9965
- while (1) {
9966
- ret = find_first_block_group(info, path, &key);
9967
- if (ret > 0)
9968
- break;
9969
- if (ret != 0)
9970
- goto error;
9971
-
9972
- leaf = path->nodes[0];
9973
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9974
-
9975
- cache = btrfs_create_block_group_cache(info, found_key.objectid,
9976
- found_key.offset);
9977
- if (!cache) {
9978
- ret = -ENOMEM;
9979
- goto error;
9980
- }
9981
-
9982
- if (need_clear) {
9983
- /*
9984
- * When we mount with old space cache, we need to
9985
- * set BTRFS_DC_CLEAR and set dirty flag.
9986
- *
9987
- * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
9988
- * truncate the old free space cache inode and
9989
- * setup a new one.
9990
- * b) Setting 'dirty flag' makes sure that we flush
9991
- * the new space cache info onto disk.
9992
- */
9993
- if (btrfs_test_opt(info, SPACE_CACHE))
9994
- cache->disk_cache_state = BTRFS_DC_CLEAR;
9995
- }
9996
-
9997
- read_extent_buffer(leaf, &cache->item,
9998
- btrfs_item_ptr_offset(leaf, path->slots[0]),
9999
- sizeof(cache->item));
10000
- cache->flags = btrfs_block_group_flags(&cache->item);
10001
- if (!mixed &&
10002
- ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10003
- (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10004
- btrfs_err(info,
10005
-"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10006
- cache->key.objectid);
10007
- btrfs_put_block_group(cache);
10008
- ret = -EINVAL;
10009
- goto error;
10010
- }
10011
-
10012
- key.objectid = found_key.objectid + found_key.offset;
10013
- btrfs_release_path(path);
10014
-
10015
- /*
10016
- * We need to exclude the super stripes now so that the space
10017
- * info has super bytes accounted for, otherwise we'll think
10018
- * we have more space than we actually do.
10019
- */
10020
- ret = exclude_super_stripes(cache);
10021
- if (ret) {
10022
- /*
10023
- * We may have excluded something, so call this just in
10024
- * case.
10025
- */
10026
- free_excluded_extents(cache);
10027
- btrfs_put_block_group(cache);
10028
- goto error;
10029
- }
10030
-
10031
- /*
10032
- * check for two cases, either we are full, and therefore
10033
- * don't need to bother with the caching work since we won't
10034
- * find any space, or we are empty, and we can just add all
10035
- * the space in and be done with it. This saves us _alot_ of
10036
- * time, particularly in the full case.
10037
- */
10038
- if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10039
- cache->last_byte_to_unpin = (u64)-1;
10040
- cache->cached = BTRFS_CACHE_FINISHED;
10041
- free_excluded_extents(cache);
10042
- } else if (btrfs_block_group_used(&cache->item) == 0) {
10043
- cache->last_byte_to_unpin = (u64)-1;
10044
- cache->cached = BTRFS_CACHE_FINISHED;
10045
- add_new_free_space(cache, found_key.objectid,
10046
- found_key.objectid +
10047
- found_key.offset);
10048
- free_excluded_extents(cache);
10049
- }
10050
-
10051
- ret = btrfs_add_block_group_cache(info, cache);
10052
- if (ret) {
10053
- btrfs_remove_free_space_cache(cache);
10054
- btrfs_put_block_group(cache);
10055
- goto error;
10056
- }
10057
-
10058
- trace_btrfs_add_block_group(info, cache, 0);
10059
- update_space_info(info, cache->flags, found_key.offset,
10060
- btrfs_block_group_used(&cache->item),
10061
- cache->bytes_super, &space_info);
10062
-
10063
- cache->space_info = space_info;
10064
-
10065
- link_block_group(cache);
10066
-
10067
- set_avail_alloc_bits(info, cache->flags);
10068
- if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10069
- inc_block_group_ro(cache, 1);
10070
- } else if (btrfs_block_group_used(&cache->item) == 0) {
10071
- ASSERT(list_empty(&cache->bg_list));
10072
- btrfs_mark_bg_unused(cache);
10073
- }
10074
- }
10075
-
10076
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
10077
- if (!(get_alloc_profile(info, space_info->flags) &
10078
- (BTRFS_BLOCK_GROUP_RAID10 |
10079
- BTRFS_BLOCK_GROUP_RAID1 |
10080
- BTRFS_BLOCK_GROUP_RAID5 |
10081
- BTRFS_BLOCK_GROUP_RAID6 |
10082
- BTRFS_BLOCK_GROUP_DUP)))
10083
- continue;
10084
- /*
10085
- * avoid allocating from un-mirrored block group if there are
10086
- * mirrored block groups.
10087
- */
10088
- list_for_each_entry(cache,
10089
- &space_info->block_groups[BTRFS_RAID_RAID0],
10090
- list)
10091
- inc_block_group_ro(cache, 1);
10092
- list_for_each_entry(cache,
10093
- &space_info->block_groups[BTRFS_RAID_SINGLE],
10094
- list)
10095
- inc_block_group_ro(cache, 1);
10096
- }
10097
-
10098
- btrfs_add_raid_kobjects(info);
10099
- init_global_block_rsv(info);
10100
- ret = check_chunk_block_group_mappings(info);
10101
-error:
10102
- btrfs_free_path(path);
10103
- return ret;
10104
-}
10105
-
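btrfs_read_block_groups() above skips the caching worker for the two trivial cases called out in its comment: a completely full group has nothing left to find, and a completely empty one just contributes its whole range as free space. A minimal user-space sketch of that decision, assuming invented types (bg_sketch, add_free_range()) that merely stand in for the kernel structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel structures involved. */
struct bg_sketch {
	uint64_t start;   /* found_key.objectid                      */
	uint64_t length;  /* found_key.offset                        */
	uint64_t used;    /* btrfs_block_group_used(&cache->item)    */
	bool     cached;  /* stands in for BTRFS_CACHE_FINISHED      */
};

static void add_free_range(uint64_t start, uint64_t end)
{
	printf("free space: [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)end);
}

/* Mirror of the full/empty shortcut: both extremes skip the caching thread. */
static void cache_if_trivial(struct bg_sketch *bg)
{
	if (bg->used == bg->length) {
		bg->cached = true;              /* full: nothing to find      */
	} else if (bg->used == 0) {
		bg->cached = true;              /* empty: everything is free  */
		add_free_range(bg->start, bg->start + bg->length);
	}
}

int main(void)
{
	struct bg_sketch full  = { 0, 1 << 30, 1 << 30, false };
	struct bg_sketch empty = { 1 << 30, 1 << 30, 0, false };

	cache_if_trivial(&full);
	cache_if_trivial(&empty);
	printf("full cached=%d empty cached=%d\n", full.cached, empty.cached);
	return 0;
}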
10106
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10107
-{
10108
- struct btrfs_fs_info *fs_info = trans->fs_info;
10109
- struct btrfs_block_group_cache *block_group;
10110
- struct btrfs_root *extent_root = fs_info->extent_root;
10111
- struct btrfs_block_group_item item;
10112
- struct btrfs_key key;
10113
- int ret = 0;
10114
-
10115
- if (!trans->can_flush_pending_bgs)
10116
- return;
10117
-
10118
- while (!list_empty(&trans->new_bgs)) {
10119
- block_group = list_first_entry(&trans->new_bgs,
10120
- struct btrfs_block_group_cache,
10121
- bg_list);
10122
- if (ret)
10123
- goto next;
10124
-
10125
- spin_lock(&block_group->lock);
10126
- memcpy(&item, &block_group->item, sizeof(item));
10127
- memcpy(&key, &block_group->key, sizeof(key));
10128
- spin_unlock(&block_group->lock);
10129
-
10130
- ret = btrfs_insert_item(trans, extent_root, &key, &item,
10131
- sizeof(item));
10132
- if (ret)
10133
- btrfs_abort_transaction(trans, ret);
10134
- ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10135
- if (ret)
10136
- btrfs_abort_transaction(trans, ret);
10137
- add_block_group_free_space(trans, block_group);
10138
- /* already aborted the transaction if it failed. */
10139
-next:
10140
- list_del_init(&block_group->bg_list);
10141
- }
10142
- btrfs_trans_release_chunk_metadata(trans);
10143
-}
10144
-
10145
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10146
- u64 type, u64 chunk_offset, u64 size)
10147
-{
10148
- struct btrfs_fs_info *fs_info = trans->fs_info;
10149
- struct btrfs_block_group_cache *cache;
10150
- int ret;
10151
-
10152
- btrfs_set_log_full_commit(fs_info, trans);
10153
-
10154
- cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10155
- if (!cache)
10156
- return -ENOMEM;
10157
-
10158
- btrfs_set_block_group_used(&cache->item, bytes_used);
10159
- btrfs_set_block_group_chunk_objectid(&cache->item,
10160
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10161
- btrfs_set_block_group_flags(&cache->item, type);
10162
-
10163
- cache->flags = type;
10164
- cache->last_byte_to_unpin = (u64)-1;
10165
- cache->cached = BTRFS_CACHE_FINISHED;
10166
- cache->needs_free_space = 1;
10167
- ret = exclude_super_stripes(cache);
10168
- if (ret) {
10169
- /*
10170
- * We may have excluded something, so call this just in
10171
- * case.
10172
- */
10173
- free_excluded_extents(cache);
10174
- btrfs_put_block_group(cache);
10175
- return ret;
10176
- }
10177
-
10178
- add_new_free_space(cache, chunk_offset, chunk_offset + size);
10179
-
10180
- free_excluded_extents(cache);
10181
-
10182
-#ifdef CONFIG_BTRFS_DEBUG
10183
- if (btrfs_should_fragment_free_space(cache)) {
10184
- u64 new_bytes_used = size - bytes_used;
10185
-
10186
- bytes_used += new_bytes_used >> 1;
10187
- fragment_free_space(cache);
10188
- }
10189
-#endif
10190
- /*
10191
- * Ensure the corresponding space_info object is created and
10192
- * assigned to our block group. We want our bg to be added to the rbtree
10193
- * with its ->space_info set.
10194
- */
10195
- cache->space_info = __find_space_info(fs_info, cache->flags);
10196
- ASSERT(cache->space_info);
10197
-
10198
- ret = btrfs_add_block_group_cache(fs_info, cache);
10199
- if (ret) {
10200
- btrfs_remove_free_space_cache(cache);
10201
- btrfs_put_block_group(cache);
10202
- return ret;
10203
- }
10204
-
10205
- /*
10206
- * Now that our block group has its ->space_info set and is inserted in
10207
- * the rbtree, update the space info's counters.
10208
- */
10209
- trace_btrfs_add_block_group(fs_info, cache, 1);
10210
- update_space_info(fs_info, cache->flags, size, bytes_used,
10211
- cache->bytes_super, &cache->space_info);
10212
- update_global_block_rsv(fs_info);
10213
-
10214
- link_block_group(cache);
10215
-
10216
- list_add_tail(&cache->bg_list, &trans->new_bgs);
10217
-
10218
- set_avail_alloc_bits(fs_info, type);
10219
- return 0;
10220
-}
10221
-
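In btrfs_make_block_group() above, the CONFIG_BTRFS_DEBUG branch accounts half of the new group's remaining free space as used before calling fragment_free_space(), so allocator tests run against a fragmented block group. A small stand-alone sketch of just that arithmetic, with assumed example sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t size = 1024ULL * 1024 * 1024;      /* chunk size             */
	uint64_t bytes_used = 64ULL * 1024 * 1024;  /* initially used bytes   */

	/* Same adjustment as the debug fragmentation path: count half of
	 * the remaining free space as used so the allocator has to cope
	 * with a fragmented block group during testing. */
	uint64_t new_bytes_used = size - bytes_used;

	bytes_used += new_bytes_used >> 1;
	printf("accounted as used: %llu of %llu bytes\n",
	       (unsigned long long)bytes_used, (unsigned long long)size);
	return 0;
}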
10222
-static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10223
-{
10224
- u64 extra_flags = chunk_to_extended(flags) &
10225
- BTRFS_EXTENDED_PROFILE_MASK;
10226
-
10227
- write_seqlock(&fs_info->profiles_lock);
10228
- if (flags & BTRFS_BLOCK_GROUP_DATA)
10229
- fs_info->avail_data_alloc_bits &= ~extra_flags;
10230
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
10231
- fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10232
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10233
- fs_info->avail_system_alloc_bits &= ~extra_flags;
10234
- write_sequnlock(&fs_info->profiles_lock);
10235
-}
10236
-
10237
-int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10238
- u64 group_start, struct extent_map *em)
10239
-{
10240
- struct btrfs_fs_info *fs_info = trans->fs_info;
10241
- struct btrfs_root *root = fs_info->extent_root;
10242
- struct btrfs_path *path;
10243
- struct btrfs_block_group_cache *block_group;
10244
- struct btrfs_free_cluster *cluster;
10245
- struct btrfs_root *tree_root = fs_info->tree_root;
10246
- struct btrfs_key key;
10247
- struct inode *inode;
10248
- struct kobject *kobj = NULL;
10249
- int ret;
10250
- int index;
10251
- int factor;
10252
- struct btrfs_caching_control *caching_ctl = NULL;
10253
- bool remove_em;
10254
-
10255
- block_group = btrfs_lookup_block_group(fs_info, group_start);
10256
- BUG_ON(!block_group);
10257
- BUG_ON(!block_group->ro);
10258
-
10259
- trace_btrfs_remove_block_group(block_group);
10260
- /*
10261
- * Free the reserved super bytes from this block group before
10262
- * removing it.
10263
- */
10264
- free_excluded_extents(block_group);
10265
- btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10266
- block_group->key.offset);
10267
-
10268
- memcpy(&key, &block_group->key, sizeof(key));
10269
- index = btrfs_bg_flags_to_raid_index(block_group->flags);
10270
- factor = btrfs_bg_type_to_factor(block_group->flags);
10271
-
10272
- /* make sure this block group isn't part of an allocation cluster */
10273
- cluster = &fs_info->data_alloc_cluster;
10274
- spin_lock(&cluster->refill_lock);
10275
- btrfs_return_cluster_to_free_space(block_group, cluster);
10276
- spin_unlock(&cluster->refill_lock);
10277
-
10278
- /*
10279
- * make sure this block group isn't part of a metadata
10280
- * allocation cluster
10281
- */
10282
- cluster = &fs_info->meta_alloc_cluster;
10283
- spin_lock(&cluster->refill_lock);
10284
- btrfs_return_cluster_to_free_space(block_group, cluster);
10285
- spin_unlock(&cluster->refill_lock);
10286
-
10287
- path = btrfs_alloc_path();
10288
- if (!path) {
10289
- ret = -ENOMEM;
10290
- goto out;
10291
- }
10292
-
10293
- /*
10294
- * get the inode first so any iput calls done for the io_list
10295
- * aren't the final iput (no unlinks allowed now)
10296
- */
10297
- inode = lookup_free_space_inode(fs_info, block_group, path);
10298
-
10299
- mutex_lock(&trans->transaction->cache_write_mutex);
10300
- /*
10301
- * make sure our free space cache IO is done before removing the
10302
- * free space inode
10303
- */
10304
- spin_lock(&trans->transaction->dirty_bgs_lock);
10305
- if (!list_empty(&block_group->io_list)) {
10306
- list_del_init(&block_group->io_list);
10307
-
10308
- WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10309
-
10310
- spin_unlock(&trans->transaction->dirty_bgs_lock);
10311
- btrfs_wait_cache_io(trans, block_group, path);
10312
- btrfs_put_block_group(block_group);
10313
- spin_lock(&trans->transaction->dirty_bgs_lock);
10314
- }
10315
-
10316
- if (!list_empty(&block_group->dirty_list)) {
10317
- list_del_init(&block_group->dirty_list);
10318
- btrfs_put_block_group(block_group);
10319
- }
10320
- spin_unlock(&trans->transaction->dirty_bgs_lock);
10321
- mutex_unlock(&trans->transaction->cache_write_mutex);
10322
-
10323
- if (!IS_ERR(inode)) {
10324
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10325
- if (ret) {
10326
- btrfs_add_delayed_iput(inode);
10327
- goto out;
10328
- }
10329
- clear_nlink(inode);
10330
- /* One for the block groups ref */
10331
- spin_lock(&block_group->lock);
10332
- if (block_group->iref) {
10333
- block_group->iref = 0;
10334
- block_group->inode = NULL;
10335
- spin_unlock(&block_group->lock);
10336
- iput(inode);
10337
- } else {
10338
- spin_unlock(&block_group->lock);
10339
- }
10340
- /* One for our lookup ref */
10341
- btrfs_add_delayed_iput(inode);
10342
- }
10343
-
10344
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10345
- key.offset = block_group->key.objectid;
10346
- key.type = 0;
10347
-
10348
- ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10349
- if (ret < 0)
10350
- goto out;
10351
- if (ret > 0)
10352
- btrfs_release_path(path);
10353
- if (ret == 0) {
10354
- ret = btrfs_del_item(trans, tree_root, path);
10355
- if (ret)
10356
- goto out;
10357
- btrfs_release_path(path);
10358
- }
10359
-
10360
- spin_lock(&fs_info->block_group_cache_lock);
10361
- rb_erase(&block_group->cache_node,
10362
- &fs_info->block_group_cache_tree);
10363
- RB_CLEAR_NODE(&block_group->cache_node);
10364
-
10365
- /* Once for the block groups rbtree */
10366
- btrfs_put_block_group(block_group);
10367
-
10368
- if (fs_info->first_logical_byte == block_group->key.objectid)
10369
- fs_info->first_logical_byte = (u64)-1;
10370
- spin_unlock(&fs_info->block_group_cache_lock);
10371
-
10372
- down_write(&block_group->space_info->groups_sem);
10373
- /*
10374
- * we must use list_del_init so people can check to see if they
10375
- * are still on the list after taking the semaphore
10376
- */
10377
- list_del_init(&block_group->list);
10378
- if (list_empty(&block_group->space_info->block_groups[index])) {
10379
- kobj = block_group->space_info->block_group_kobjs[index];
10380
- block_group->space_info->block_group_kobjs[index] = NULL;
10381
- clear_avail_alloc_bits(fs_info, block_group->flags);
10382
- }
10383
- up_write(&block_group->space_info->groups_sem);
10384
- if (kobj) {
10385
- kobject_del(kobj);
10386
- kobject_put(kobj);
10387
- }
10388
-
10389
- if (block_group->has_caching_ctl)
10390
- caching_ctl = get_caching_control(block_group);
10391
- if (block_group->cached == BTRFS_CACHE_STARTED)
10392
- wait_block_group_cache_done(block_group);
10393
- if (block_group->has_caching_ctl) {
10394
- down_write(&fs_info->commit_root_sem);
10395
- if (!caching_ctl) {
10396
- struct btrfs_caching_control *ctl;
10397
-
10398
- list_for_each_entry(ctl,
10399
- &fs_info->caching_block_groups, list)
10400
- if (ctl->block_group == block_group) {
10401
- caching_ctl = ctl;
10402
- refcount_inc(&caching_ctl->count);
10403
- break;
10404
- }
10405
- }
10406
- if (caching_ctl)
10407
- list_del_init(&caching_ctl->list);
10408
- up_write(&fs_info->commit_root_sem);
10409
- if (caching_ctl) {
10410
- /* Once for the caching bgs list and once for us. */
10411
- put_caching_control(caching_ctl);
10412
- put_caching_control(caching_ctl);
10413
- }
10414
- }
10415
-
10416
- spin_lock(&trans->transaction->dirty_bgs_lock);
10417
- if (!list_empty(&block_group->dirty_list)) {
10418
- WARN_ON(1);
10419
- }
10420
- if (!list_empty(&block_group->io_list)) {
10421
- WARN_ON(1);
10422
- }
10423
- spin_unlock(&trans->transaction->dirty_bgs_lock);
10424
- btrfs_remove_free_space_cache(block_group);
10425
-
10426
- spin_lock(&block_group->space_info->lock);
10427
- list_del_init(&block_group->ro_list);
10428
-
10429
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10430
- WARN_ON(block_group->space_info->total_bytes
10431
- < block_group->key.offset);
10432
- WARN_ON(block_group->space_info->bytes_readonly
10433
- < block_group->key.offset);
10434
- WARN_ON(block_group->space_info->disk_total
10435
- < block_group->key.offset * factor);
10436
- }
10437
- block_group->space_info->total_bytes -= block_group->key.offset;
10438
- block_group->space_info->bytes_readonly -= block_group->key.offset;
10439
- block_group->space_info->disk_total -= block_group->key.offset * factor;
10440
-
10441
- spin_unlock(&block_group->space_info->lock);
10442
-
10443
- memcpy(&key, &block_group->key, sizeof(key));
10444
-
10445
- mutex_lock(&fs_info->chunk_mutex);
10446
- if (!list_empty(&em->list)) {
10447
- /* We're in the transaction->pending_chunks list. */
10448
- free_extent_map(em);
10449
- }
10450
- spin_lock(&block_group->lock);
10451
- block_group->removed = 1;
10452
- /*
10453
- * At this point trimming can't start on this block group, because we
10454
- * removed the block group from the tree fs_info->block_group_cache_tree
10455
- * so no one can find it anymore and even if someone already got this
10456
- * block group before we removed it from the rbtree, they have already
10457
- * incremented block_group->trimming - if they didn't, they won't find
10458
- * any free space entries because we already removed them all when we
10459
- * called btrfs_remove_free_space_cache().
10460
- *
10461
- * And we must not remove the extent map from the fs_info->mapping_tree
10462
- * to prevent the same logical address range and physical device space
10463
- * ranges from being reused for a new block group. This is because our
10464
- * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10465
- * completely transactionless, so while it is trimming a range the
10466
- * currently running transaction might finish and a new one start,
10467
- * allowing for new block groups to be created that can reuse the same
10468
- * physical device locations unless we take this special care.
10469
- *
10470
- * There may also be an implicit trim operation if the file system
10471
- * is mounted with -odiscard. The same protections must remain
10472
- * in place until the extents have been discarded completely when
10473
- * the transaction commit has completed.
10474
- */
10475
- remove_em = (atomic_read(&block_group->trimming) == 0);
10476
- /*
10477
- * Make sure a trimmer task always sees the em in the pinned_chunks list
10478
- * if it sees block_group->removed == 1 (needs to lock block_group->lock
10479
- * before checking block_group->removed).
10480
- */
10481
- if (!remove_em) {
10482
- /*
10483
- * Our em might be in trans->transaction->pending_chunks which
10484
- * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10485
- * and so is the fs_info->pinned_chunks list.
10486
- *
10487
- * So at this point we must be holding the chunk_mutex to avoid
10488
- * any races with chunk allocation (more specifically at
10489
- * volumes.c:contains_pending_extent()), to ensure it always
10490
- * sees the em, either in the pending_chunks list or in the
10491
- * pinned_chunks list.
10492
- */
10493
- list_move_tail(&em->list, &fs_info->pinned_chunks);
10494
- }
10495
- spin_unlock(&block_group->lock);
10496
-
10497
- mutex_unlock(&fs_info->chunk_mutex);
10498
-
10499
- ret = remove_block_group_free_space(trans, block_group);
10500
- if (ret)
10501
- goto out;
10502
-
10503
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10504
- if (ret > 0)
10505
- ret = -EIO;
10506
- if (ret < 0)
10507
- goto out;
10508
-
10509
- ret = btrfs_del_item(trans, root, path);
10510
- if (ret)
10511
- goto out;
10512
-
10513
- if (remove_em) {
10514
- struct extent_map_tree *em_tree;
10515
-
10516
- em_tree = &fs_info->mapping_tree.map_tree;
10517
- write_lock(&em_tree->lock);
10518
- /*
10519
- * The em might be in the pending_chunks list, so make sure the
10520
- * chunk mutex is locked, since remove_extent_mapping() will
10521
- * delete us from that list.
10522
- */
10523
- remove_extent_mapping(em_tree, em);
10524
- write_unlock(&em_tree->lock);
10525
- /* once for the tree */
10526
- free_extent_map(em);
10527
- }
10528
-
10529
-out:
10530
- /* Once for the lookup reference */
10531
- btrfs_put_block_group(block_group);
10532
- btrfs_free_path(path);
10533
- return ret;
10534
-}
10535
-
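The removal path above only frees the extent map when no trimmer still references the block group; otherwise the map stays pinned so the same logical and physical range cannot be handed out again before the discard finishes. A reduced user-space sketch of that decision using C11 atomics; the structure and helper names are invented, and the pinned list is collapsed to a single flag:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct bg_removal_sketch {
	atomic_int trimming;     /* like block_group->trimming          */
	bool       removed;      /* like block_group->removed           */
	bool       em_pinned;    /* stand-in for the pinned_chunks list */
};

/* Returns true when the caller may free the extent map right away. */
static bool remove_block_group_sketch(struct bg_removal_sketch *bg)
{
	bool remove_em;

	bg->removed = true;
	/* Only drop the mapping if no trim is using this block group;
	 * otherwise keep it pinned so the range cannot be reused while a
	 * transactionless discard is still walking it. */
	remove_em = (atomic_load(&bg->trimming) == 0);
	if (!remove_em)
		bg->em_pinned = true;
	return remove_em;
}

int main(void)
{
	struct bg_removal_sketch idle, busy;

	atomic_init(&idle.trimming, 0);
	idle.removed = idle.em_pinned = false;
	atomic_init(&busy.trimming, 1);          /* a trimmer holds a reference */
	busy.removed = busy.em_pinned = false;

	printf("idle: remove_em=%d\n", remove_block_group_sketch(&idle));
	printf("busy: remove_em=%d pinned=%d\n",
	       remove_block_group_sketch(&busy), busy.em_pinned);
	return 0;
}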
10536
-struct btrfs_trans_handle *
10537
-btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10538
- const u64 chunk_offset)
10539
-{
10540
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10541
- struct extent_map *em;
10542
- struct map_lookup *map;
10543
- unsigned int num_items;
10544
-
10545
- read_lock(&em_tree->lock);
10546
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10547
- read_unlock(&em_tree->lock);
10548
- ASSERT(em && em->start == chunk_offset);
10549
-
10550
- /*
10551
- * We need to reserve 3 + N units from the metadata space info in order
10552
- * to remove a block group (done at btrfs_remove_chunk() and at
10553
- * btrfs_remove_block_group()), which are used for:
10554
- *
10555
- * 1 unit for adding the free space inode's orphan (located in the tree
10556
- * of tree roots).
10557
- * 1 unit for deleting the block group item (located in the extent
10558
- * tree).
10559
- * 1 unit for deleting the free space item (located in tree of tree
10560
- * roots).
10561
- * N units for deleting N device extent items corresponding to each
10562
- * stripe (located in the device tree).
10563
- *
10564
- * In order to remove a block group we also need to reserve units in the
10565
- * system space info in order to update the chunk tree (update one or
10566
- * more device items and remove one chunk item), but this is done at
10567
- * btrfs_remove_chunk() through a call to check_system_chunk().
10568
- */
10569
- map = em->map_lookup;
10570
- num_items = 3 + map->num_stripes;
10571
- free_extent_map(em);
10572
-
10573
- return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10574
- num_items, 1);
10575
-}
10576
-
10577
-/*
10578
- * Process the unused_bgs list and remove any that don't have any allocated
10579
- * space inside of them.
10580
- */
10581
-void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10582
-{
10583
- struct btrfs_block_group_cache *block_group;
10584
- struct btrfs_space_info *space_info;
10585
- struct btrfs_trans_handle *trans;
10586
- int ret = 0;
10587
-
10588
- if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10589
- return;
10590
-
10591
- spin_lock(&fs_info->unused_bgs_lock);
10592
- while (!list_empty(&fs_info->unused_bgs)) {
10593
- u64 start, end;
10594
- int trimming;
10595
-
10596
- block_group = list_first_entry(&fs_info->unused_bgs,
10597
- struct btrfs_block_group_cache,
10598
- bg_list);
10599
- list_del_init(&block_group->bg_list);
10600
-
10601
- space_info = block_group->space_info;
10602
-
10603
- if (ret || btrfs_mixed_space_info(space_info)) {
10604
- btrfs_put_block_group(block_group);
10605
- continue;
10606
- }
10607
- spin_unlock(&fs_info->unused_bgs_lock);
10608
-
10609
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
10610
-
10611
- /* Don't want to race with allocators so take the groups_sem */
10612
- down_write(&space_info->groups_sem);
10613
- spin_lock(&block_group->lock);
10614
- if (block_group->reserved || block_group->pinned ||
10615
- btrfs_block_group_used(&block_group->item) ||
10616
- block_group->ro ||
10617
- list_is_singular(&block_group->list)) {
10618
- /*
10619
- * We want to bail if we made new allocations or have
10620
- * outstanding allocations in this block group. We do
10621
- * the ro check in case balance is currently acting on
10622
- * this block group.
10623
- */
10624
- trace_btrfs_skip_unused_block_group(block_group);
10625
- spin_unlock(&block_group->lock);
10626
- up_write(&space_info->groups_sem);
10627
- goto next;
10628
- }
10629
- spin_unlock(&block_group->lock);
10630
-
10631
- /* We don't want to force the issue, only flip if it's ok. */
10632
- ret = inc_block_group_ro(block_group, 0);
10633
- up_write(&space_info->groups_sem);
10634
- if (ret < 0) {
10635
- ret = 0;
10636
- goto next;
10637
- }
10638
-
10639
- /*
10640
- * Want to do this before we do anything else so we can recover
10641
- * properly if we fail to join the transaction.
10642
- */
10643
- trans = btrfs_start_trans_remove_block_group(fs_info,
10644
- block_group->key.objectid);
10645
- if (IS_ERR(trans)) {
10646
- btrfs_dec_block_group_ro(block_group);
10647
- ret = PTR_ERR(trans);
10648
- goto next;
10649
- }
10650
-
10651
- /*
10652
- * We could have pending pinned extents for this block group,
10653
- * just delete them, we don't care about them anymore.
10654
- */
10655
- start = block_group->key.objectid;
10656
- end = start + block_group->key.offset - 1;
10657
- /*
10658
- * Hold the unused_bg_unpin_mutex lock to avoid racing with
10659
- * btrfs_finish_extent_commit(). If we are at transaction N,
10660
- * another task might be running finish_extent_commit() for the
10661
- * previous transaction N - 1, and have seen a range belonging
10662
- * to the block group in freed_extents[] before we were able to
10663
- * clear the whole block group range from freed_extents[]. This
10664
- * means that task can look up the block group after we
10665
- * unpinned it from freed_extents[] and removed it, leading to
10666
- * a BUG_ON() at btrfs_unpin_extent_range().
10667
- */
10668
- mutex_lock(&fs_info->unused_bg_unpin_mutex);
10669
- ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10670
- EXTENT_DIRTY);
10671
- if (ret) {
10672
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10673
- btrfs_dec_block_group_ro(block_group);
10674
- goto end_trans;
10675
- }
10676
- ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10677
- EXTENT_DIRTY);
10678
- if (ret) {
10679
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10680
- btrfs_dec_block_group_ro(block_group);
10681
- goto end_trans;
10682
- }
10683
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10684
-
10685
- /* Reset pinned so btrfs_put_block_group doesn't complain */
10686
- spin_lock(&space_info->lock);
10687
- spin_lock(&block_group->lock);
10688
-
10689
- space_info->bytes_pinned -= block_group->pinned;
10690
- space_info->bytes_readonly += block_group->pinned;
10691
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
10692
- -block_group->pinned,
10693
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
10694
- block_group->pinned = 0;
10695
-
10696
- spin_unlock(&block_group->lock);
10697
- spin_unlock(&space_info->lock);
10698
-
10699
- /* DISCARD can flip during remount */
10700
- trimming = btrfs_test_opt(fs_info, DISCARD);
10701
-
10702
- /* Implicit trim during transaction commit. */
10703
- if (trimming)
10704
- btrfs_get_block_group_trimming(block_group);
10705
-
10706
- /*
10707
- * btrfs_remove_chunk() will abort the transaction if things go
10708
- * horribly wrong.
10709
- */
10710
- ret = btrfs_remove_chunk(trans, block_group->key.objectid);
10711
-
10712
- if (ret) {
10713
- if (trimming)
10714
- btrfs_put_block_group_trimming(block_group);
10715
- goto end_trans;
10716
- }
10717
-
10718
- /*
10719
- * If we're not mounted with -odiscard, we can just forget
10720
- * about this block group. Otherwise we'll need to wait
10721
- * until transaction commit to do the actual discard.
10722
- */
10723
- if (trimming) {
10724
- spin_lock(&fs_info->unused_bgs_lock);
10725
- /*
10726
- * A concurrent scrub might have added us to the list
10727
- * fs_info->unused_bgs, so use a list_move operation
10728
- * to add the block group to the deleted_bgs list.
10729
- */
10730
- list_move(&block_group->bg_list,
10731
- &trans->transaction->deleted_bgs);
10732
- spin_unlock(&fs_info->unused_bgs_lock);
10733
- btrfs_get_block_group(block_group);
10734
- }
10735
-end_trans:
10736
- btrfs_end_transaction(trans);
10737
-next:
10738
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10739
- btrfs_put_block_group(block_group);
10740
- spin_lock(&fs_info->unused_bgs_lock);
10741
- }
10742
- spin_unlock(&fs_info->unused_bgs_lock);
10743
-}
10744
-
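The skip test near the top of btrfs_delete_unused_bgs() above keeps any block group that shows activity (reserved or pinned bytes, used space, read-only state) or that is the last group of its profile. A hedged user-space sketch of that predicate with invented field names; list_is_singular() is modelled as a simple count:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct unused_bg_sketch {
	uint64_t reserved;
	uint64_t pinned;
	uint64_t used;
	bool     read_only;
	unsigned int groups_in_raid_list;  /* stand-in for list_is_singular() */
};

/* Mirrors the conditions under which the unused-bg scan skips a
 * candidate: any sign of activity, or it being the only block group of
 * its profile, keeps it around. */
static bool should_skip_unused_bg(const struct unused_bg_sketch *bg)
{
	return bg->reserved || bg->pinned || bg->used || bg->read_only ||
	       bg->groups_in_raid_list == 1;
}

int main(void)
{
	struct unused_bg_sketch idle = { 0, 0, 0, false, 3 };
	struct unused_bg_sketch last = { 0, 0, 0, false, 1 };

	printf("idle: skip=%d\n", should_skip_unused_bg(&idle));
	printf("last of profile: skip=%d\n", should_skip_unused_bg(&last));
	return 0;
}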
10745
-int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10746
-{
10747
- struct btrfs_super_block *disk_super;
10748
- u64 features;
10749
- u64 flags;
10750
- int mixed = 0;
10751
- int ret;
10752
-
10753
- disk_super = fs_info->super_copy;
10754
- if (!btrfs_super_root(disk_super))
10755
- return -EINVAL;
10756
-
10757
- features = btrfs_super_incompat_flags(disk_super);
10758
- if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10759
- mixed = 1;
10760
-
10761
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
10762
- ret = create_space_info(fs_info, flags);
10763
- if (ret)
10764
- goto out;
10765
-
10766
- if (mixed) {
10767
- flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10768
- ret = create_space_info(fs_info, flags);
10769
- } else {
10770
- flags = BTRFS_BLOCK_GROUP_METADATA;
10771
- ret = create_space_info(fs_info, flags);
10772
- if (ret)
10773
- goto out;
10774
-
10775
- flags = BTRFS_BLOCK_GROUP_DATA;
10776
- ret = create_space_info(fs_info, flags);
10777
- }
10778
-out:
10779
- return ret;
107805700 }
107815701
107825702 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
....@@ -10805,10 +5725,9 @@
108055725 * it while performing the free space search since we have already
108065726 * held back allocations.
108075727 */
10808
-static int btrfs_trim_free_extents(struct btrfs_device *device,
10809
- u64 minlen, u64 *trimmed)
5728
+static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
108105729 {
10811
- u64 start = 0, len = 0;
5730
+ u64 start = SZ_1M, len = 0, end = 0;
108125731 int ret;
108135732
108145733 *trimmed = 0;
....@@ -10817,7 +5736,7 @@
108175736 if (!blk_queue_discard(bdev_get_queue(device->bdev)))
108185737 return 0;
108195738
10820
- /* Not writeable = nothing to do. */
5739
+ /* Not writable = nothing to do. */
108215740 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
108225741 return 0;
108235742
....@@ -10829,43 +5748,54 @@
108295748
108305749 while (1) {
108315750 struct btrfs_fs_info *fs_info = device->fs_info;
10832
- struct btrfs_transaction *trans;
108335751 u64 bytes;
108345752
108355753 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
108365754 if (ret)
108375755 break;
108385756
10839
- ret = down_read_killable(&fs_info->commit_root_sem);
10840
- if (ret) {
5757
+ find_first_clear_extent_bit(&device->alloc_state, start,
5758
+ &start, &end,
5759
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
5760
+
5761
+ /* Check if there are any CHUNK_* bits left */
5762
+ if (start > device->total_bytes) {
5763
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5764
+ btrfs_warn_in_rcu(fs_info,
5765
+"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
5766
+ start, end - start + 1,
5767
+ rcu_str_deref(device->name),
5768
+ device->total_bytes);
108415769 mutex_unlock(&fs_info->chunk_mutex);
5770
+ ret = 0;
108425771 break;
108435772 }
108445773
10845
- spin_lock(&fs_info->trans_lock);
10846
- trans = fs_info->running_transaction;
10847
- if (trans)
10848
- refcount_inc(&trans->use_count);
10849
- spin_unlock(&fs_info->trans_lock);
5774
+ /* Ensure we skip the reserved area in the first 1M */
5775
+ start = max_t(u64, start, SZ_1M);
108505776
10851
- if (!trans)
10852
- up_read(&fs_info->commit_root_sem);
5777
+ /*
5778
+ * If find_first_clear_extent_bit finds a range that spans the
5779
+ * end of the device it will set end to -1, in this case it's up
5780
+ * to the caller to trim the value to the size of the device.
5781
+ */
5782
+ end = min(end, device->total_bytes - 1);
108535783
10854
- ret = find_free_dev_extent_start(trans, device, minlen, start,
10855
- &start, &len);
10856
- if (trans) {
10857
- up_read(&fs_info->commit_root_sem);
10858
- btrfs_put_transaction(trans);
10859
- }
5784
+ len = end - start + 1;
108605785
10861
- if (ret) {
5786
+ /* We didn't find any extents */
5787
+ if (!len) {
108625788 mutex_unlock(&fs_info->chunk_mutex);
10863
- if (ret == -ENOSPC)
10864
- ret = 0;
5789
+ ret = 0;
108655790 break;
108665791 }
108675792
10868
- ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
5793
+ ret = btrfs_issue_discard(device->bdev, start, len,
5794
+ &bytes);
5795
+ if (!ret)
5796
+ set_extent_bits(&device->alloc_state, start,
5797
+ start + bytes - 1,
5798
+ CHUNK_TRIMMED);
108695799 mutex_unlock(&fs_info->chunk_mutex);
108705800
108715801 if (ret)
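The reworked btrfs_trim_free_extents() in the hunk above derives each discard range from find_first_clear_extent_bit(): bail out once the start passes the device size, skip the reserved first 1MiB, clamp the end (which may come back as -1) to the device, and stop when nothing is left. The clamping is easy to show in isolation; the sketch below uses invented names and plain integers instead of the extent-bit machinery, and adds an explicit empty-range check:

#include <stdint.h>
#include <stdio.h>

#define SZ_1M (1024ULL * 1024)

/* Clamp a candidate [start, end] range roughly the way the new trim
 * loop does and return the resulting length (0 means nothing to discard). */
static uint64_t clamp_trim_range(uint64_t *start, uint64_t end,
				 uint64_t total_bytes)
{
	if (*start > total_bytes)            /* no unallocated space left      */
		return 0;
	if (*start < SZ_1M)                  /* skip the reserved first 1MiB   */
		*start = SZ_1M;
	if (end > total_bytes - 1)           /* end may come back as (u64)-1   */
		end = total_bytes - 1;
	if (end < *start)                    /* degenerate range               */
		return 0;
	return end - *start + 1;
}

int main(void)
{
	uint64_t total = 8ULL * 1024 * 1024 * 1024;  /* 8 GiB device */
	uint64_t start = 0;
	uint64_t len = clamp_trim_range(&start, UINT64_MAX, total);

	printf("trim [%llu, +%llu)\n",
	       (unsigned long long)start, (unsigned long long)len);
	return 0;
}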
....@@ -10896,10 +5826,11 @@
108965826 */
108975827 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
108985828 {
10899
- struct btrfs_block_group_cache *cache = NULL;
5829
+ struct btrfs_block_group *cache = NULL;
109005830 struct btrfs_device *device;
109015831 struct list_head *devices;
109025832 u64 group_trimmed;
5833
+ u64 range_end = U64_MAX;
109035834 u64 start;
109045835 u64 end;
109055836 u64 trimmed = 0;
....@@ -10909,26 +5840,33 @@
109095840 int dev_ret = 0;
109105841 int ret = 0;
109115842
5843
+ /*
5844
+ * Check range overflow if range->len is set.
5845
+ * The default range->len is U64_MAX.
5846
+ */
5847
+ if (range->len != U64_MAX &&
5848
+ check_add_overflow(range->start, range->len, &range_end))
5849
+ return -EINVAL;
5850
+
109125851 cache = btrfs_lookup_first_block_group(fs_info, range->start);
10913
- for (; cache; cache = next_block_group(fs_info, cache)) {
10914
- if (cache->key.objectid >= (range->start + range->len)) {
5852
+ for (; cache; cache = btrfs_next_block_group(cache)) {
5853
+ if (cache->start >= range_end) {
109155854 btrfs_put_block_group(cache);
109165855 break;
109175856 }
109185857
10919
- start = max(range->start, cache->key.objectid);
10920
- end = min(range->start + range->len,
10921
- cache->key.objectid + cache->key.offset);
5858
+ start = max(range->start, cache->start);
5859
+ end = min(range_end, cache->start + cache->length);
109225860
109235861 if (end - start >= range->minlen) {
10924
- if (!block_group_cache_done(cache)) {
10925
- ret = cache_block_group(cache, 0);
5862
+ if (!btrfs_block_group_done(cache)) {
5863
+ ret = btrfs_cache_block_group(cache, 0);
109265864 if (ret) {
109275865 bg_failed++;
109285866 bg_ret = ret;
109295867 continue;
109305868 }
10931
- ret = wait_block_group_cache_done(cache);
5869
+ ret = btrfs_wait_block_group_cache_done(cache);
109325870 if (ret) {
109335871 bg_failed++;
109345872 bg_ret = ret;
....@@ -10957,8 +5895,10 @@
109575895 mutex_lock(&fs_info->fs_devices->device_list_mutex);
109585896 devices = &fs_info->fs_devices->devices;
109595897 list_for_each_entry(device, devices, dev_list) {
10960
- ret = btrfs_trim_free_extents(device, range->minlen,
10961
- &group_trimmed);
5898
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
5899
+ continue;
5900
+
5901
+ ret = btrfs_trim_free_extents(device, &group_trimmed);
109625902 if (ret) {
109635903 dev_failed++;
109645904 dev_ret = ret;
....@@ -10977,61 +5917,4 @@
109775917 if (bg_ret)
109785918 return bg_ret;
109795919 return dev_ret;
10980
-}
10981
-
10982
-/*
10983
- * btrfs_{start,end}_write_no_snapshotting() are similar to
10984
- * mnt_{want,drop}_write(); they are used to prevent some tasks from writing
10985
- * data into the page cache through nocow before the subvolume is snapshotted,
10986
- * but to flush the data to disk after the snapshot creation, or to prevent
10987
- * operations while snapshotting is ongoing and that cause the snapshot to be
10988
- * inconsistent (writes followed by expanding truncates for example).
10989
- */
10990
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
10991
-{
10992
- percpu_counter_dec(&root->subv_writers->counter);
10993
- cond_wake_up(&root->subv_writers->wait);
10994
-}
10995
-
10996
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
10997
-{
10998
- if (atomic_read(&root->will_be_snapshotted))
10999
- return 0;
11000
-
11001
- percpu_counter_inc(&root->subv_writers->counter);
11002
- /*
11003
- * Make sure counter is updated before we check for snapshot creation.
11004
- */
11005
- smp_mb();
11006
- if (atomic_read(&root->will_be_snapshotted)) {
11007
- btrfs_end_write_no_snapshotting(root);
11008
- return 0;
11009
- }
11010
- return 1;
11011
-}
11012
-
11013
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11014
-{
11015
- while (true) {
11016
- int ret;
11017
-
11018
- ret = btrfs_start_write_no_snapshotting(root);
11019
- if (ret)
11020
- break;
11021
- wait_var_event(&root->will_be_snapshotted,
11022
- !atomic_read(&root->will_be_snapshotted));
11023
- }
11024
-}
11025
-
11026
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
11027
-{
11028
- struct btrfs_fs_info *fs_info = bg->fs_info;
11029
-
11030
- spin_lock(&fs_info->unused_bgs_lock);
11031
- if (list_empty(&bg->bg_list)) {
11032
- btrfs_get_block_group(bg);
11033
- trace_btrfs_add_unused_block_group(bg);
11034
- list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
11035
- }
11036
- spin_unlock(&fs_info->unused_bgs_lock);
110375920 }