forked from ~ljy/RK356X_SDK_RELEASE

hc · 2023-12-08 · commit 01573e231f18eb2d99162747186f59511f56b64d
kernel/fs/btrfs/extent-tree.c
@@ -16,6 +16,7 @@
 #include <linux/percpu_counter.h>
 #include <linux/lockdep.h>
 #include <linux/crc32c.h>
+#include "misc.h"
 #include "tree-log.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -24,32 +25,18 @@
 #include "locking.h"
 #include "free-space-cache.h"
 #include "free-space-tree.h"
-#include "math.h"
 #include "sysfs.h"
 #include "qgroup.h"
 #include "ref-verify.h"
+#include "space-info.h"
+#include "block-rsv.h"
+#include "delalloc-space.h"
+#include "block-group.h"
+#include "discard.h"
+#include "rcu-string.h"

 #undef SCRAMBLE_DELAYED_REFS

-/*
- * control flags for do_chunk_alloc's force field
- * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
- * if we really need one.
- *
- * CHUNK_ALLOC_LIMITED means to only try and allocate one
- * if we have very few chunks already allocated. This is
- * used as part of the clustering code to help make sure
- * we have a good pool of storage to cluster in, without
- * filling the FS with empty chunks
- *
- * CHUNK_ALLOC_FORCE means it must try to allocate one
- *
- */
-enum {
- CHUNK_ALLOC_NO_FORCE = 0,
- CHUNK_ALLOC_LIMITED = 1,
- CHUNK_ALLOC_FORCE = 2,
-};

 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 struct btrfs_delayed_ref_node *node, u64 parent,
@@ -66,712 +53,33 @@
6653 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6754 struct btrfs_delayed_ref_node *node,
6855 struct btrfs_delayed_extent_op *extent_op);
69
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
70
- int force);
7156 static int find_next_key(struct btrfs_path *path, int level,
7257 struct btrfs_key *key);
73
-static void dump_space_info(struct btrfs_fs_info *fs_info,
74
- struct btrfs_space_info *info, u64 bytes,
75
- int dump_block_groups);
76
-static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
77
- u64 num_bytes);
78
-static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
79
- struct btrfs_space_info *space_info,
80
- u64 num_bytes);
81
-static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
82
- struct btrfs_space_info *space_info,
83
- u64 num_bytes);
8458
85
-static noinline int
86
-block_group_cache_done(struct btrfs_block_group_cache *cache)
87
-{
88
- smp_mb();
89
- return cache->cached == BTRFS_CACHE_FINISHED ||
90
- cache->cached == BTRFS_CACHE_ERROR;
91
-}
92
-
93
-static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
59
+static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
9460 {
9561 return (cache->flags & bits) == bits;
9662 }
9763
98
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
99
-{
100
- atomic_inc(&cache->count);
101
-}
102
-
103
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104
-{
105
- if (atomic_dec_and_test(&cache->count)) {
106
- WARN_ON(cache->pinned > 0);
107
- WARN_ON(cache->reserved > 0);
108
-
109
- /*
110
- * If not empty, someone is still holding mutex of
111
- * full_stripe_lock, which can only be released by caller.
112
- * And it will definitely cause use-after-free when caller
113
- * tries to release full stripe lock.
114
- *
115
- * No better way to resolve, but only to warn.
116
- */
117
- WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
118
- kfree(cache->free_space_ctl);
119
- kfree(cache);
120
- }
121
-}
122
-
123
-/*
124
- * this adds the block group to the fs_info rb tree for the block group
125
- * cache
126
- */
127
-static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
128
- struct btrfs_block_group_cache *block_group)
129
-{
130
- struct rb_node **p;
131
- struct rb_node *parent = NULL;
132
- struct btrfs_block_group_cache *cache;
133
-
134
- spin_lock(&info->block_group_cache_lock);
135
- p = &info->block_group_cache_tree.rb_node;
136
-
137
- while (*p) {
138
- parent = *p;
139
- cache = rb_entry(parent, struct btrfs_block_group_cache,
140
- cache_node);
141
- if (block_group->key.objectid < cache->key.objectid) {
142
- p = &(*p)->rb_left;
143
- } else if (block_group->key.objectid > cache->key.objectid) {
144
- p = &(*p)->rb_right;
145
- } else {
146
- spin_unlock(&info->block_group_cache_lock);
147
- return -EEXIST;
148
- }
149
- }
150
-
151
- rb_link_node(&block_group->cache_node, parent, p);
152
- rb_insert_color(&block_group->cache_node,
153
- &info->block_group_cache_tree);
154
-
155
- if (info->first_logical_byte > block_group->key.objectid)
156
- info->first_logical_byte = block_group->key.objectid;
157
-
158
- spin_unlock(&info->block_group_cache_lock);
159
-
160
- return 0;
161
-}
162
-
163
-/*
164
- * This will return the block group at or after bytenr if contains is 0, else
165
- * it will return the block group that contains the bytenr
166
- */
167
-static struct btrfs_block_group_cache *
168
-block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
169
- int contains)
170
-{
171
- struct btrfs_block_group_cache *cache, *ret = NULL;
172
- struct rb_node *n;
173
- u64 end, start;
174
-
175
- spin_lock(&info->block_group_cache_lock);
176
- n = info->block_group_cache_tree.rb_node;
177
-
178
- while (n) {
179
- cache = rb_entry(n, struct btrfs_block_group_cache,
180
- cache_node);
181
- end = cache->key.objectid + cache->key.offset - 1;
182
- start = cache->key.objectid;
183
-
184
- if (bytenr < start) {
185
- if (!contains && (!ret || start < ret->key.objectid))
186
- ret = cache;
187
- n = n->rb_left;
188
- } else if (bytenr > start) {
189
- if (contains && bytenr <= end) {
190
- ret = cache;
191
- break;
192
- }
193
- n = n->rb_right;
194
- } else {
195
- ret = cache;
196
- break;
197
- }
198
- }
199
- if (ret) {
200
- btrfs_get_block_group(ret);
201
- if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
202
- info->first_logical_byte = ret->key.objectid;
203
- }
204
- spin_unlock(&info->block_group_cache_lock);
205
-
206
- return ret;
207
-}
208
-
209
-static int add_excluded_extent(struct btrfs_fs_info *fs_info,
210
- u64 start, u64 num_bytes)
64
+int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
65
+ u64 start, u64 num_bytes)
21166 {
21267 u64 end = start + num_bytes - 1;
213
- set_extent_bits(&fs_info->freed_extents[0],
214
- start, end, EXTENT_UPTODATE);
215
- set_extent_bits(&fs_info->freed_extents[1],
216
- start, end, EXTENT_UPTODATE);
68
+ set_extent_bits(&fs_info->excluded_extents, start, end,
69
+ EXTENT_UPTODATE);
21770 return 0;
21871 }
21972
220
-static void free_excluded_extents(struct btrfs_block_group_cache *cache)
73
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
22174 {
22275 struct btrfs_fs_info *fs_info = cache->fs_info;
22376 u64 start, end;
22477
225
- start = cache->key.objectid;
226
- end = start + cache->key.offset - 1;
78
+ start = cache->start;
79
+ end = start + cache->length - 1;
22780
228
- clear_extent_bits(&fs_info->freed_extents[0],
229
- start, end, EXTENT_UPTODATE);
230
- clear_extent_bits(&fs_info->freed_extents[1],
231
- start, end, EXTENT_UPTODATE);
232
-}
233
-
234
-static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
235
-{
236
- struct btrfs_fs_info *fs_info = cache->fs_info;
237
- u64 bytenr;
238
- u64 *logical;
239
- int stripe_len;
240
- int i, nr, ret;
241
-
242
- if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
243
- stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
244
- cache->bytes_super += stripe_len;
245
- ret = add_excluded_extent(fs_info, cache->key.objectid,
246
- stripe_len);
247
- if (ret)
248
- return ret;
249
- }
250
-
251
- for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
252
- bytenr = btrfs_sb_offset(i);
253
- ret = btrfs_rmap_block(fs_info, cache->key.objectid,
254
- bytenr, &logical, &nr, &stripe_len);
255
- if (ret)
256
- return ret;
257
-
258
- while (nr--) {
259
- u64 start, len;
260
-
261
- if (logical[nr] > cache->key.objectid +
262
- cache->key.offset)
263
- continue;
264
-
265
- if (logical[nr] + stripe_len <= cache->key.objectid)
266
- continue;
267
-
268
- start = logical[nr];
269
- if (start < cache->key.objectid) {
270
- start = cache->key.objectid;
271
- len = (logical[nr] + stripe_len) - start;
272
- } else {
273
- len = min_t(u64, stripe_len,
274
- cache->key.objectid +
275
- cache->key.offset - start);
276
- }
277
-
278
- cache->bytes_super += len;
279
- ret = add_excluded_extent(fs_info, start, len);
280
- if (ret) {
281
- kfree(logical);
282
- return ret;
283
- }
284
- }
285
-
286
- kfree(logical);
287
- }
288
- return 0;
289
-}
290
-
291
-static struct btrfs_caching_control *
292
-get_caching_control(struct btrfs_block_group_cache *cache)
293
-{
294
- struct btrfs_caching_control *ctl;
295
-
296
- spin_lock(&cache->lock);
297
- if (!cache->caching_ctl) {
298
- spin_unlock(&cache->lock);
299
- return NULL;
300
- }
301
-
302
- ctl = cache->caching_ctl;
303
- refcount_inc(&ctl->count);
304
- spin_unlock(&cache->lock);
305
- return ctl;
306
-}
307
-
308
-static void put_caching_control(struct btrfs_caching_control *ctl)
309
-{
310
- if (refcount_dec_and_test(&ctl->count))
311
- kfree(ctl);
312
-}
313
-
314
-#ifdef CONFIG_BTRFS_DEBUG
315
-static void fragment_free_space(struct btrfs_block_group_cache *block_group)
316
-{
317
- struct btrfs_fs_info *fs_info = block_group->fs_info;
318
- u64 start = block_group->key.objectid;
319
- u64 len = block_group->key.offset;
320
- u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
321
- fs_info->nodesize : fs_info->sectorsize;
322
- u64 step = chunk << 1;
323
-
324
- while (len > chunk) {
325
- btrfs_remove_free_space(block_group, start, chunk);
326
- start += step;
327
- if (len < step)
328
- len = 0;
329
- else
330
- len -= step;
331
- }
332
-}
333
-#endif
334
-
335
-/*
336
- * this is only called by cache_block_group, since we could have freed extents
337
- * we need to check the pinned_extents for any extents that can't be used yet
338
- * since their free space will be released as soon as the transaction commits.
339
- */
340
-u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
341
- u64 start, u64 end)
342
-{
343
- struct btrfs_fs_info *info = block_group->fs_info;
344
- u64 extent_start, extent_end, size, total_added = 0;
345
- int ret;
346
-
347
- while (start < end) {
348
- ret = find_first_extent_bit(info->pinned_extents, start,
349
- &extent_start, &extent_end,
350
- EXTENT_DIRTY | EXTENT_UPTODATE,
351
- NULL);
352
- if (ret)
353
- break;
354
-
355
- if (extent_start <= start) {
356
- start = extent_end + 1;
357
- } else if (extent_start > start && extent_start < end) {
358
- size = extent_start - start;
359
- total_added += size;
360
- ret = btrfs_add_free_space(block_group, start,
361
- size);
362
- BUG_ON(ret); /* -ENOMEM or logic error */
363
- start = extent_end + 1;
364
- } else {
365
- break;
366
- }
367
- }
368
-
369
- if (start < end) {
370
- size = end - start;
371
- total_added += size;
372
- ret = btrfs_add_free_space(block_group, start, size);
373
- BUG_ON(ret); /* -ENOMEM or logic error */
374
- }
375
-
376
- return total_added;
377
-}
378
-
379
-static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
380
-{
381
- struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
382
- struct btrfs_fs_info *fs_info = block_group->fs_info;
383
- struct btrfs_root *extent_root = fs_info->extent_root;
384
- struct btrfs_path *path;
385
- struct extent_buffer *leaf;
386
- struct btrfs_key key;
387
- u64 total_found = 0;
388
- u64 last = 0;
389
- u32 nritems;
390
- int ret;
391
- bool wakeup = true;
392
-
393
- path = btrfs_alloc_path();
394
- if (!path)
395
- return -ENOMEM;
396
-
397
- last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
398
-
399
-#ifdef CONFIG_BTRFS_DEBUG
400
- /*
401
- * If we're fragmenting we don't want to make anybody think we can
402
- * allocate from this block group until we've had a chance to fragment
403
- * the free space.
404
- */
405
- if (btrfs_should_fragment_free_space(block_group))
406
- wakeup = false;
407
-#endif
408
- /*
409
- * We don't want to deadlock with somebody trying to allocate a new
410
- * extent for the extent root while also trying to search the extent
411
- * root to add free space. So we skip locking and search the commit
412
- * root, since its read-only
413
- */
414
- path->skip_locking = 1;
415
- path->search_commit_root = 1;
416
- path->reada = READA_FORWARD;
417
-
418
- key.objectid = last;
419
- key.offset = 0;
420
- key.type = BTRFS_EXTENT_ITEM_KEY;
421
-
422
-next:
423
- ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
424
- if (ret < 0)
425
- goto out;
426
-
427
- leaf = path->nodes[0];
428
- nritems = btrfs_header_nritems(leaf);
429
-
430
- while (1) {
431
- if (btrfs_fs_closing(fs_info) > 1) {
432
- last = (u64)-1;
433
- break;
434
- }
435
-
436
- if (path->slots[0] < nritems) {
437
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
438
- } else {
439
- ret = find_next_key(path, 0, &key);
440
- if (ret)
441
- break;
442
-
443
- if (need_resched() ||
444
- rwsem_is_contended(&fs_info->commit_root_sem)) {
445
- if (wakeup)
446
- caching_ctl->progress = last;
447
- btrfs_release_path(path);
448
- up_read(&fs_info->commit_root_sem);
449
- mutex_unlock(&caching_ctl->mutex);
450
- cond_resched();
451
- mutex_lock(&caching_ctl->mutex);
452
- down_read(&fs_info->commit_root_sem);
453
- goto next;
454
- }
455
-
456
- ret = btrfs_next_leaf(extent_root, path);
457
- if (ret < 0)
458
- goto out;
459
- if (ret)
460
- break;
461
- leaf = path->nodes[0];
462
- nritems = btrfs_header_nritems(leaf);
463
- continue;
464
- }
465
-
466
- if (key.objectid < last) {
467
- key.objectid = last;
468
- key.offset = 0;
469
- key.type = BTRFS_EXTENT_ITEM_KEY;
470
-
471
- if (wakeup)
472
- caching_ctl->progress = last;
473
- btrfs_release_path(path);
474
- goto next;
475
- }
476
-
477
- if (key.objectid < block_group->key.objectid) {
478
- path->slots[0]++;
479
- continue;
480
- }
481
-
482
- if (key.objectid >= block_group->key.objectid +
483
- block_group->key.offset)
484
- break;
485
-
486
- if (key.type == BTRFS_EXTENT_ITEM_KEY ||
487
- key.type == BTRFS_METADATA_ITEM_KEY) {
488
- total_found += add_new_free_space(block_group, last,
489
- key.objectid);
490
- if (key.type == BTRFS_METADATA_ITEM_KEY)
491
- last = key.objectid +
492
- fs_info->nodesize;
493
- else
494
- last = key.objectid + key.offset;
495
-
496
- if (total_found > CACHING_CTL_WAKE_UP) {
497
- total_found = 0;
498
- if (wakeup)
499
- wake_up(&caching_ctl->wait);
500
- }
501
- }
502
- path->slots[0]++;
503
- }
504
- ret = 0;
505
-
506
- total_found += add_new_free_space(block_group, last,
507
- block_group->key.objectid +
508
- block_group->key.offset);
509
- caching_ctl->progress = (u64)-1;
510
-
511
-out:
512
- btrfs_free_path(path);
513
- return ret;
514
-}
515
-
516
-static noinline void caching_thread(struct btrfs_work *work)
517
-{
518
- struct btrfs_block_group_cache *block_group;
519
- struct btrfs_fs_info *fs_info;
520
- struct btrfs_caching_control *caching_ctl;
521
- int ret;
522
-
523
- caching_ctl = container_of(work, struct btrfs_caching_control, work);
524
- block_group = caching_ctl->block_group;
525
- fs_info = block_group->fs_info;
526
-
527
- mutex_lock(&caching_ctl->mutex);
528
- down_read(&fs_info->commit_root_sem);
529
-
530
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
531
- ret = load_free_space_tree(caching_ctl);
532
- else
533
- ret = load_extent_tree_free(caching_ctl);
534
-
535
- spin_lock(&block_group->lock);
536
- block_group->caching_ctl = NULL;
537
- block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
538
- spin_unlock(&block_group->lock);
539
-
540
-#ifdef CONFIG_BTRFS_DEBUG
541
- if (btrfs_should_fragment_free_space(block_group)) {
542
- u64 bytes_used;
543
-
544
- spin_lock(&block_group->space_info->lock);
545
- spin_lock(&block_group->lock);
546
- bytes_used = block_group->key.offset -
547
- btrfs_block_group_used(&block_group->item);
548
- block_group->space_info->bytes_used += bytes_used >> 1;
549
- spin_unlock(&block_group->lock);
550
- spin_unlock(&block_group->space_info->lock);
551
- fragment_free_space(block_group);
552
- }
553
-#endif
554
-
555
- caching_ctl->progress = (u64)-1;
556
-
557
- up_read(&fs_info->commit_root_sem);
558
- free_excluded_extents(block_group);
559
- mutex_unlock(&caching_ctl->mutex);
560
-
561
- wake_up(&caching_ctl->wait);
562
-
563
- put_caching_control(caching_ctl);
564
- btrfs_put_block_group(block_group);
565
-}
566
-
567
-static int cache_block_group(struct btrfs_block_group_cache *cache,
568
- int load_cache_only)
569
-{
570
- DEFINE_WAIT(wait);
571
- struct btrfs_fs_info *fs_info = cache->fs_info;
572
- struct btrfs_caching_control *caching_ctl;
573
- int ret = 0;
574
-
575
- caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
576
- if (!caching_ctl)
577
- return -ENOMEM;
578
-
579
- INIT_LIST_HEAD(&caching_ctl->list);
580
- mutex_init(&caching_ctl->mutex);
581
- init_waitqueue_head(&caching_ctl->wait);
582
- caching_ctl->block_group = cache;
583
- caching_ctl->progress = cache->key.objectid;
584
- refcount_set(&caching_ctl->count, 1);
585
- btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
586
- caching_thread, NULL, NULL);
587
-
588
- spin_lock(&cache->lock);
589
- /*
590
- * This should be a rare occasion, but this could happen I think in the
591
- * case where one thread starts to load the space cache info, and then
592
- * some other thread starts a transaction commit which tries to do an
593
- * allocation while the other thread is still loading the space cache
594
- * info. The previous loop should have kept us from choosing this block
595
- * group, but if we've moved to the state where we will wait on caching
596
- * block groups we need to first check if we're doing a fast load here,
597
- * so we can wait for it to finish, otherwise we could end up allocating
598
- * from a block group who's cache gets evicted for one reason or
599
- * another.
600
- */
601
- while (cache->cached == BTRFS_CACHE_FAST) {
602
- struct btrfs_caching_control *ctl;
603
-
604
- ctl = cache->caching_ctl;
605
- refcount_inc(&ctl->count);
606
- prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
607
- spin_unlock(&cache->lock);
608
-
609
- schedule();
610
-
611
- finish_wait(&ctl->wait, &wait);
612
- put_caching_control(ctl);
613
- spin_lock(&cache->lock);
614
- }
615
-
616
- if (cache->cached != BTRFS_CACHE_NO) {
617
- spin_unlock(&cache->lock);
618
- kfree(caching_ctl);
619
- return 0;
620
- }
621
- WARN_ON(cache->caching_ctl);
622
- cache->caching_ctl = caching_ctl;
623
- cache->cached = BTRFS_CACHE_FAST;
624
- spin_unlock(&cache->lock);
625
-
626
- if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
627
- mutex_lock(&caching_ctl->mutex);
628
- ret = load_free_space_cache(fs_info, cache);
629
-
630
- spin_lock(&cache->lock);
631
- if (ret == 1) {
632
- cache->caching_ctl = NULL;
633
- cache->cached = BTRFS_CACHE_FINISHED;
634
- cache->last_byte_to_unpin = (u64)-1;
635
- caching_ctl->progress = (u64)-1;
636
- } else {
637
- if (load_cache_only) {
638
- cache->caching_ctl = NULL;
639
- cache->cached = BTRFS_CACHE_NO;
640
- } else {
641
- cache->cached = BTRFS_CACHE_STARTED;
642
- cache->has_caching_ctl = 1;
643
- }
644
- }
645
- spin_unlock(&cache->lock);
646
-#ifdef CONFIG_BTRFS_DEBUG
647
- if (ret == 1 &&
648
- btrfs_should_fragment_free_space(cache)) {
649
- u64 bytes_used;
650
-
651
- spin_lock(&cache->space_info->lock);
652
- spin_lock(&cache->lock);
653
- bytes_used = cache->key.offset -
654
- btrfs_block_group_used(&cache->item);
655
- cache->space_info->bytes_used += bytes_used >> 1;
656
- spin_unlock(&cache->lock);
657
- spin_unlock(&cache->space_info->lock);
658
- fragment_free_space(cache);
659
- }
660
-#endif
661
- mutex_unlock(&caching_ctl->mutex);
662
-
663
- wake_up(&caching_ctl->wait);
664
- if (ret == 1) {
665
- put_caching_control(caching_ctl);
666
- free_excluded_extents(cache);
667
- return 0;
668
- }
669
- } else {
670
- /*
671
- * We're either using the free space tree or no caching at all.
672
- * Set cached to the appropriate value and wakeup any waiters.
673
- */
674
- spin_lock(&cache->lock);
675
- if (load_cache_only) {
676
- cache->caching_ctl = NULL;
677
- cache->cached = BTRFS_CACHE_NO;
678
- } else {
679
- cache->cached = BTRFS_CACHE_STARTED;
680
- cache->has_caching_ctl = 1;
681
- }
682
- spin_unlock(&cache->lock);
683
- wake_up(&caching_ctl->wait);
684
- }
685
-
686
- if (load_cache_only) {
687
- put_caching_control(caching_ctl);
688
- return 0;
689
- }
690
-
691
- down_write(&fs_info->commit_root_sem);
692
- refcount_inc(&caching_ctl->count);
693
- list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
694
- up_write(&fs_info->commit_root_sem);
695
-
696
- btrfs_get_block_group(cache);
697
-
698
- btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
699
-
700
- return ret;
701
-}
702
-
703
-/*
704
- * return the block group that starts at or after bytenr
705
- */
706
-static struct btrfs_block_group_cache *
707
-btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
708
-{
709
- return block_group_cache_tree_search(info, bytenr, 0);
710
-}
711
-
712
-/*
713
- * return the block group that contains the given bytenr
714
- */
715
-struct btrfs_block_group_cache *btrfs_lookup_block_group(
716
- struct btrfs_fs_info *info,
717
- u64 bytenr)
718
-{
719
- return block_group_cache_tree_search(info, bytenr, 1);
720
-}
721
-
722
-static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
723
- u64 flags)
724
-{
725
- struct list_head *head = &info->space_info;
726
- struct btrfs_space_info *found;
727
-
728
- flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
729
-
730
- rcu_read_lock();
731
- list_for_each_entry_rcu(found, head, list) {
732
- if (found->flags & flags) {
733
- rcu_read_unlock();
734
- return found;
735
- }
736
- }
737
- rcu_read_unlock();
738
- return NULL;
739
-}
740
-
741
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
742
- bool metadata, u64 root_objectid)
743
-{
744
- struct btrfs_space_info *space_info;
745
- u64 flags;
746
-
747
- if (metadata) {
748
- if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
749
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
750
- else
751
- flags = BTRFS_BLOCK_GROUP_METADATA;
752
- } else {
753
- flags = BTRFS_BLOCK_GROUP_DATA;
754
- }
755
-
756
- space_info = __find_space_info(fs_info, flags);
757
- ASSERT(space_info);
758
- percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
759
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
760
-}
761
-
762
-/*
763
- * after adding space to the filesystem, we need to clear the full flags
764
- * on all the space infos.
765
- */
766
-void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
767
-{
768
- struct list_head *head = &info->space_info;
769
- struct btrfs_space_info *found;
770
-
771
- rcu_read_lock();
772
- list_for_each_entry_rcu(found, head, list)
773
- found->full = 0;
774
- rcu_read_unlock();
81
+ clear_extent_bits(&fs_info->excluded_extents, start, end,
82
+ EXTENT_UPTODATE);
77583 }
77684
77785 /* simple helper to search for an existing data extent at a given offset */
@@ -1037,7 +345,7 @@
1037345
1038346 /*
1039347 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1040
- * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
348
+ * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
1041349 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1042350 */
1043351 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
@@ -1092,18 +400,18 @@
1092400 return BTRFS_REF_TYPE_INVALID;
1093401 }
1094402
1095
-static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
403
+u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1096404 {
1097405 u32 high_crc = ~(u32)0;
1098406 u32 low_crc = ~(u32)0;
1099407 __le64 lenum;
1100408
1101409 lenum = cpu_to_le64(root_objectid);
1102
- high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
410
+ high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1103411 lenum = cpu_to_le64(owner);
1104
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
412
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1105413 lenum = cpu_to_le64(offset);
1106
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
414
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1107415
1108416 return ((u64)high_crc << 31) ^ (u64)low_crc;
1109417 }
@@ -1685,7 +993,7 @@
1685993 type = extent_ref_type(parent, owner);
1686994 size = btrfs_extent_inline_ref_size(type);
1687995
1688
- btrfs_extend_item(fs_info, path, size);
996
+ btrfs_extend_item(path, size);
1689997
1690998 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1691999 refs = btrfs_extent_refs(leaf, ei);
@@ -1760,7 +1068,6 @@
17601068 int *last_ref)
17611069 {
17621070 struct extent_buffer *leaf = path->nodes[0];
1763
- struct btrfs_fs_info *fs_info = leaf->fs_info;
17641071 struct btrfs_extent_item *ei;
17651072 struct btrfs_extent_data_ref *dref = NULL;
17661073 struct btrfs_shared_data_ref *sref = NULL;
@@ -1815,7 +1122,7 @@
18151122 memmove_extent_buffer(leaf, ptr, ptr + size,
18161123 end - ptr - size);
18171124 item_size -= size;
1818
- btrfs_truncate_item(fs_info, path, item_size, 1);
1125
+ btrfs_truncate_item(path, item_size, 1);
18191126 }
18201127 btrfs_mark_buffer_dirty(leaf);
18211128 }
@@ -1835,7 +1142,22 @@
18351142 num_bytes, parent, root_objectid,
18361143 owner, offset, 1);
18371144 if (ret == 0) {
1838
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1145
+ /*
1146
+ * We're adding refs to a tree block we already own, this
1147
+ * should not happen at all.
1148
+ */
1149
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1150
+ btrfs_crit(trans->fs_info,
1151
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
1152
+ bytenr, num_bytes, root_objectid);
1153
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
1154
+ WARN_ON(1);
1155
+ btrfs_crit(trans->fs_info,
1156
+ "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
1157
+ btrfs_print_leaf(path->nodes[0]);
1158
+ }
1159
+ return -EUCLEAN;
1160
+ }
18391161 update_inline_extent_backref(path, iref, refs_to_add,
18401162 extent_op, NULL);
18411163 } else if (ret == -ENOENT) {
@@ -1843,24 +1165,6 @@
18431165 root_objectid, owner, offset,
18441166 refs_to_add, extent_op);
18451167 ret = 0;
1846
- }
1847
- return ret;
1848
-}
1849
-
1850
-static int insert_extent_backref(struct btrfs_trans_handle *trans,
1851
- struct btrfs_path *path,
1852
- u64 bytenr, u64 parent, u64 root_objectid,
1853
- u64 owner, u64 offset, int refs_to_add)
1854
-{
1855
- int ret;
1856
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1857
- BUG_ON(refs_to_add != 1);
1858
- ret = insert_tree_block_ref(trans, path, bytenr, parent,
1859
- root_objectid);
1860
- } else {
1861
- ret = insert_extent_data_ref(trans, path, bytenr, parent,
1862
- root_objectid, owner, offset,
1863
- refs_to_add);
18641168 }
18651169 return ret;
18661170 }
@@ -1886,7 +1190,6 @@
18861190 return ret;
18871191 }
18881192
1889
-#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
18901193 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
18911194 u64 *discarded_bytes)
18921195 {
@@ -1962,8 +1265,10 @@
19621265 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
19631266 u64 num_bytes, u64 *actual_bytes)
19641267 {
1965
- int ret;
1268
+ int ret = 0;
19661269 u64 discarded_bytes = 0;
1270
+ u64 end = bytenr + num_bytes;
1271
+ u64 cur = bytenr;
19671272 struct btrfs_bio *bbio = NULL;
19681273
19691274
@@ -1972,15 +1277,23 @@
19721277 * associated to its stripes that don't go away while we are discarding.
19731278 */
19741279 btrfs_bio_counter_inc_blocked(fs_info);
1975
- /* Tell the block device(s) that the sectors can be discarded */
1976
- ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1977
- &bbio, 0);
1978
- /* Error condition is -ENOMEM */
1979
- if (!ret) {
1980
- struct btrfs_bio_stripe *stripe = bbio->stripes;
1280
+ while (cur < end) {
1281
+ struct btrfs_bio_stripe *stripe;
19811282 int i;
19821283
1284
+ num_bytes = end - cur;
1285
+ /* Tell the block device(s) that the sectors can be discarded */
1286
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
1287
+ &num_bytes, &bbio, 0);
1288
+ /*
1289
+ * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
1290
+ * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
1291
+ * thus we can't continue anyway.
1292
+ */
1293
+ if (ret < 0)
1294
+ goto out;
19831295
1296
+ stripe = bbio->stripes;
19841297 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
19851298 u64 bytes;
19861299 struct request_queue *req_q;
@@ -2001,10 +1314,19 @@
20011314 stripe->physical,
20021315 stripe->length,
20031316 &bytes);
2004
- if (!ret)
1317
+ if (!ret) {
20051318 discarded_bytes += bytes;
2006
- else if (ret != -EOPNOTSUPP)
2007
- break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1319
+ } else if (ret != -EOPNOTSUPP) {
1320
+ /*
1321
+ * Logic errors or -ENOMEM, or -EIO, but
1322
+ * unlikely to happen.
1323
+ *
1324
+ * And since there are two loops, explicitly
1325
+ * go to out to avoid confusion.
1326
+ */
1327
+ btrfs_put_bbio(bbio);
1328
+ goto out;
1329
+ }
20081330
20091331 /*
20101332 * Just in case we get back EOPNOTSUPP for some reason,
@@ -2014,7 +1336,9 @@
20141336 ret = 0;
20151337 }
20161338 btrfs_put_bbio(bbio);
1339
+ cur += num_bytes;
20171340 }
1341
+out:
20181342 btrfs_bio_counter_dec(fs_info);
20191343
20201344 if (actual_bytes)
@@ -2028,45 +1352,31 @@
20281352
20291353 /* Can return -ENOMEM */
20301354 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2031
- struct btrfs_root *root,
2032
- u64 bytenr, u64 num_bytes, u64 parent,
2033
- u64 root_objectid, u64 owner, u64 offset)
1355
+ struct btrfs_ref *generic_ref)
20341356 {
2035
- struct btrfs_fs_info *fs_info = root->fs_info;
2036
- int old_ref_mod, new_ref_mod;
1357
+ struct btrfs_fs_info *fs_info = trans->fs_info;
20371358 int ret;
20381359
2039
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2040
- root_objectid == BTRFS_TREE_LOG_OBJECTID);
1360
+ ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1361
+ generic_ref->action);
1362
+ BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1363
+ generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
20411364
2042
- btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2043
- owner, offset, BTRFS_ADD_DELAYED_REF);
1365
+ if (generic_ref->type == BTRFS_REF_METADATA)
1366
+ ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1367
+ else
1368
+ ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
20441369
2045
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2046
- ret = btrfs_add_delayed_tree_ref(trans, bytenr,
2047
- num_bytes, parent,
2048
- root_objectid, (int)owner,
2049
- BTRFS_ADD_DELAYED_REF, NULL,
2050
- &old_ref_mod, &new_ref_mod);
2051
- } else {
2052
- ret = btrfs_add_delayed_data_ref(trans, bytenr,
2053
- num_bytes, parent,
2054
- root_objectid, owner, offset,
2055
- 0, BTRFS_ADD_DELAYED_REF,
2056
- &old_ref_mod, &new_ref_mod);
2057
- }
2058
-
2059
- if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
2060
- bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
2061
-
2062
- add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
2063
- }
1370
+ btrfs_ref_tree_mod(fs_info, generic_ref);
20641371
20651372 return ret;
20661373 }
20671374
20681375 /*
20691376 * __btrfs_inc_extent_ref - insert backreference for a given extent
1377
+ *
1378
+ * The counterpart is in __btrfs_free_extent(), with examples and more details
1379
+ * how it works.
20701380 *
20711381 * @trans: Handle of transaction
20721382 *
@@ -2118,7 +1428,6 @@
21181428 if (!path)
21191429 return -ENOMEM;
21201430
2121
- path->reada = READA_FORWARD;
21221431 path->leave_spinning = 1;
21231432 /* this will setup the path even if it fails to insert the back ref */
21241433 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
@@ -2143,11 +1452,17 @@
21431452 btrfs_mark_buffer_dirty(leaf);
21441453 btrfs_release_path(path);
21451454
2146
- path->reada = READA_FORWARD;
21471455 path->leave_spinning = 1;
21481456 /* now insert the actual backref */
2149
- ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2150
- owner, offset, refs_to_add);
1457
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1458
+ BUG_ON(refs_to_add != 1);
1459
+ ret = insert_tree_block_ref(trans, path, bytenr, parent,
1460
+ root_objectid);
1461
+ } else {
1462
+ ret = insert_extent_data_ref(trans, path, bytenr, parent,
1463
+ root_objectid, owner, offset,
1464
+ refs_to_add);
1465
+ }
21511466 if (ret)
21521467 btrfs_abort_transaction(trans, ret);
21531468 out:
@@ -2232,7 +1547,7 @@
22321547 int err = 0;
22331548 int metadata = !extent_op->is_data;
22341549
2235
- if (trans->aborted)
1550
+ if (TRANS_ABORTED(trans))
22361551 return 0;
22371552
22381553 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
@@ -2253,7 +1568,6 @@
22531568 }
22541569
22551570 again:
2256
- path->reada = READA_FORWARD;
22571571 path->leave_spinning = 1;
22581572 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
22591573 if (ret < 0) {
@@ -2352,10 +1666,9 @@
23521666 {
23531667 int ret = 0;
23541668
2355
- if (trans->aborted) {
1669
+ if (TRANS_ABORTED(trans)) {
23561670 if (insert_reserved)
2357
- btrfs_pin_extent(trans->fs_info, node->bytenr,
2358
- node->num_bytes, 1);
1671
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
23591672 return 0;
23601673 }
23611674
....@@ -2370,8 +1683,7 @@
23701683 else
23711684 BUG();
23721685 if (ret && insert_reserved)
2373
- btrfs_pin_extent(trans->fs_info, node->bytenr,
2374
- node->num_bytes, 1);
1686
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
23751687 return ret;
23761688 }
23771689
@@ -2380,7 +1692,7 @@
23801692 {
23811693 struct btrfs_delayed_ref_node *ref;
23821694
2383
- if (RB_EMPTY_ROOT(&head->ref_tree))
1695
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
23841696 return NULL;
23851697
23861698 /*
@@ -2393,7 +1705,7 @@
23931705 return list_first_entry(&head->ref_add_list,
23941706 struct btrfs_delayed_ref_node, add_list);
23951707
2396
- ref = rb_entry(rb_first(&head->ref_tree),
1708
+ ref = rb_entry(rb_first_cached(&head->ref_tree),
23971709 struct btrfs_delayed_ref_node, ref_node);
23981710 ASSERT(list_empty(&ref->add_list));
23991711 return ref;
@@ -2409,23 +1721,69 @@
24091721 btrfs_delayed_ref_unlock(head);
24101722 }
24111723
2412
-static int cleanup_extent_op(struct btrfs_trans_handle *trans,
2413
- struct btrfs_delayed_ref_head *head)
1724
+static struct btrfs_delayed_extent_op *cleanup_extent_op(
1725
+ struct btrfs_delayed_ref_head *head)
24141726 {
24151727 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1728
+
1729
+ if (!extent_op)
1730
+ return NULL;
1731
+
1732
+ if (head->must_insert_reserved) {
1733
+ head->extent_op = NULL;
1734
+ btrfs_free_delayed_extent_op(extent_op);
1735
+ return NULL;
1736
+ }
1737
+ return extent_op;
1738
+}
1739
+
1740
+static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1741
+ struct btrfs_delayed_ref_head *head)
1742
+{
1743
+ struct btrfs_delayed_extent_op *extent_op;
24161744 int ret;
24171745
1746
+ extent_op = cleanup_extent_op(head);
24181747 if (!extent_op)
24191748 return 0;
24201749 head->extent_op = NULL;
2421
- if (head->must_insert_reserved) {
2422
- btrfs_free_delayed_extent_op(extent_op);
2423
- return 0;
2424
- }
24251750 spin_unlock(&head->lock);
24261751 ret = run_delayed_extent_op(trans, head, extent_op);
24271752 btrfs_free_delayed_extent_op(extent_op);
24281753 return ret ? ret : 1;
1754
+}
1755
+
1756
+void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1757
+ struct btrfs_delayed_ref_root *delayed_refs,
1758
+ struct btrfs_delayed_ref_head *head)
1759
+{
1760
+ int nr_items = 1; /* Dropping this ref head update. */
1761
+
1762
+ /*
1763
+ * We had csum deletions accounted for in our delayed refs rsv, we need
1764
+ * to drop the csum leaves for this update from our delayed_refs_rsv.
1765
+ */
1766
+ if (head->total_ref_mod < 0 && head->is_data) {
1767
+ spin_lock(&delayed_refs->lock);
1768
+ delayed_refs->pending_csums -= head->num_bytes;
1769
+ spin_unlock(&delayed_refs->lock);
1770
+ nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1771
+ }
1772
+
1773
+ /*
1774
+ * We were dropping refs, or had a new ref and dropped it, and thus must
1775
+ * adjust down our total_bytes_pinned, the space may or may not have
1776
+ * been pinned and so is accounted for properly in the pinned space by
1777
+ * now.
1778
+ */
1779
+ if (head->total_ref_mod < 0 ||
1780
+ (head->total_ref_mod == 0 && head->must_insert_reserved)) {
1781
+ u64 flags = btrfs_ref_head_to_space_flags(head);
1782
+
1783
+ btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
1784
+ }
1785
+
1786
+ btrfs_delayed_refs_rsv_release(fs_info, nr_items);
24291787 }
24301788
24311789 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
@@ -2438,7 +1796,7 @@
24381796
24391797 delayed_refs = &trans->transaction->delayed_refs;
24401798
2441
- ret = cleanup_extent_op(trans, head);
1799
+ ret = run_and_cleanup_extent_op(trans, head);
24421800 if (ret < 0) {
24431801 unselect_delayed_ref_head(delayed_refs, head);
24441802 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
@@ -2454,156 +1812,91 @@
24541812 spin_unlock(&head->lock);
24551813 spin_lock(&delayed_refs->lock);
24561814 spin_lock(&head->lock);
2457
- if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
1815
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
24581816 spin_unlock(&head->lock);
24591817 spin_unlock(&delayed_refs->lock);
24601818 return 1;
24611819 }
2462
- delayed_refs->num_heads--;
2463
- rb_erase(&head->href_node, &delayed_refs->href_root);
2464
- RB_CLEAR_NODE(&head->href_node);
1820
+ btrfs_delete_ref_head(delayed_refs, head);
24651821 spin_unlock(&head->lock);
24661822 spin_unlock(&delayed_refs->lock);
2467
- atomic_dec(&delayed_refs->num_entries);
2468
-
2469
- trace_run_delayed_ref_head(fs_info, head, 0);
2470
-
2471
- if (head->total_ref_mod < 0) {
2472
- struct btrfs_space_info *space_info;
2473
- u64 flags;
2474
-
2475
- if (head->is_data)
2476
- flags = BTRFS_BLOCK_GROUP_DATA;
2477
- else if (head->is_system)
2478
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
2479
- else
2480
- flags = BTRFS_BLOCK_GROUP_METADATA;
2481
- space_info = __find_space_info(fs_info, flags);
2482
- ASSERT(space_info);
2483
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
2484
- -head->num_bytes,
2485
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
2486
-
2487
- if (head->is_data) {
2488
- spin_lock(&delayed_refs->lock);
2489
- delayed_refs->pending_csums -= head->num_bytes;
2490
- spin_unlock(&delayed_refs->lock);
2491
- }
2492
- }
24931823
24941824 if (head->must_insert_reserved) {
2495
- btrfs_pin_extent(fs_info, head->bytenr,
2496
- head->num_bytes, 1);
1825
+ btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
24971826 if (head->is_data) {
24981827 ret = btrfs_del_csums(trans, fs_info->csum_root,
24991828 head->bytenr, head->num_bytes);
25001829 }
25011830 }
25021831
2503
- /* Also free its reserved qgroup space */
2504
- btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2505
- head->qgroup_reserved);
1832
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1833
+
1834
+ trace_run_delayed_ref_head(fs_info, head, 0);
25061835 btrfs_delayed_ref_unlock(head);
25071836 btrfs_put_delayed_ref_head(head);
25081837 return ret;
25091838 }
25101839
2511
-/*
2512
- * Returns 0 on success or if called with an already aborted transaction.
2513
- * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2514
- */
2515
-static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2516
- unsigned long nr)
1840
+static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1841
+ struct btrfs_trans_handle *trans)
1842
+{
1843
+ struct btrfs_delayed_ref_root *delayed_refs =
1844
+ &trans->transaction->delayed_refs;
1845
+ struct btrfs_delayed_ref_head *head = NULL;
1846
+ int ret;
1847
+
1848
+ spin_lock(&delayed_refs->lock);
1849
+ head = btrfs_select_ref_head(delayed_refs);
1850
+ if (!head) {
1851
+ spin_unlock(&delayed_refs->lock);
1852
+ return head;
1853
+ }
1854
+
1855
+ /*
1856
+ * Grab the lock that says we are going to process all the refs for
1857
+ * this head
1858
+ */
1859
+ ret = btrfs_delayed_ref_lock(delayed_refs, head);
1860
+ spin_unlock(&delayed_refs->lock);
1861
+
1862
+ /*
1863
+ * We may have dropped the spin lock to get the head mutex lock, and
1864
+ * that might have given someone else time to free the head. If that's
1865
+ * true, it has been removed from our list and we can move on.
1866
+ */
1867
+ if (ret == -EAGAIN)
1868
+ head = ERR_PTR(-EAGAIN);
1869
+
1870
+ return head;
1871
+}
1872
+
1873
+static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1874
+ struct btrfs_delayed_ref_head *locked_ref,
1875
+ unsigned long *run_refs)
25171876 {
25181877 struct btrfs_fs_info *fs_info = trans->fs_info;
25191878 struct btrfs_delayed_ref_root *delayed_refs;
2520
- struct btrfs_delayed_ref_node *ref;
2521
- struct btrfs_delayed_ref_head *locked_ref = NULL;
25221879 struct btrfs_delayed_extent_op *extent_op;
2523
- ktime_t start = ktime_get();
2524
- int ret;
2525
- unsigned long count = 0;
2526
- unsigned long actual_count = 0;
1880
+ struct btrfs_delayed_ref_node *ref;
25271881 int must_insert_reserved = 0;
1882
+ int ret;
25281883
25291884 delayed_refs = &trans->transaction->delayed_refs;
2530
- while (1) {
2531
- if (!locked_ref) {
2532
- if (count >= nr)
2533
- break;
25341885
2535
- spin_lock(&delayed_refs->lock);
2536
- locked_ref = btrfs_select_ref_head(trans);
2537
- if (!locked_ref) {
2538
- spin_unlock(&delayed_refs->lock);
2539
- break;
2540
- }
1886
+ lockdep_assert_held(&locked_ref->mutex);
1887
+ lockdep_assert_held(&locked_ref->lock);
25411888
2542
- /* grab the lock that says we are going to process
2543
- * all the refs for this head */
2544
- ret = btrfs_delayed_ref_lock(trans, locked_ref);
2545
- spin_unlock(&delayed_refs->lock);
2546
- /*
2547
- * we may have dropped the spin lock to get the head
2548
- * mutex lock, and that might have given someone else
2549
- * time to free the head. If that's true, it has been
2550
- * removed from our list and we can move on.
2551
- */
2552
- if (ret == -EAGAIN) {
2553
- locked_ref = NULL;
2554
- count++;
2555
- continue;
2556
- }
2557
- }
2558
-
2559
- /*
2560
- * We need to try and merge add/drops of the same ref since we
2561
- * can run into issues with relocate dropping the implicit ref
2562
- * and then it being added back again before the drop can
2563
- * finish. If we merged anything we need to re-loop so we can
2564
- * get a good ref.
2565
- * Or we can get node references of the same type that weren't
2566
- * merged when created due to bumps in the tree mod seq, and
2567
- * we need to merge them to prevent adding an inline extent
2568
- * backref before dropping it (triggering a BUG_ON at
2569
- * insert_inline_extent_backref()).
2570
- */
2571
- spin_lock(&locked_ref->lock);
2572
- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2573
-
2574
- ref = select_delayed_ref(locked_ref);
2575
-
2576
- if (ref && ref->seq &&
1889
+ while ((ref = select_delayed_ref(locked_ref))) {
1890
+ if (ref->seq &&
25771891 btrfs_check_delayed_seq(fs_info, ref->seq)) {
25781892 spin_unlock(&locked_ref->lock);
25791893 unselect_delayed_ref_head(delayed_refs, locked_ref);
2580
- locked_ref = NULL;
2581
- cond_resched();
2582
- count++;
2583
- continue;
1894
+ return -EAGAIN;
25841895 }
25851896
2586
- /*
2587
- * We're done processing refs in this ref_head, clean everything
2588
- * up and move on to the next ref_head.
2589
- */
2590
- if (!ref) {
2591
- ret = cleanup_ref_head(trans, locked_ref);
2592
- if (ret > 0 ) {
2593
- /* We dropped our lock, we need to loop. */
2594
- ret = 0;
2595
- continue;
2596
- } else if (ret) {
2597
- return ret;
2598
- }
2599
- locked_ref = NULL;
2600
- count++;
2601
- continue;
2602
- }
2603
-
2604
- actual_count++;
1897
+ (*run_refs)++;
26051898 ref->in_tree = 0;
2606
- rb_erase(&ref->ref_node, &locked_ref->ref_tree);
1899
+ rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
26071900 RB_CLEAR_NODE(&ref->ref_node);
26081901 if (!list_empty(&ref->add_list))
26091902 list_del(&ref->add_list);
@@ -2625,8 +1918,8 @@
26251918 atomic_dec(&delayed_refs->num_entries);
26261919
26271920 /*
2628
- * Record the must-insert_reserved flag before we drop the spin
2629
- * lock.
1921
+ * Record the must_insert_reserved flag before we drop the
1922
+ * spin lock.
26301923 */
26311924 must_insert_reserved = locked_ref->must_insert_reserved;
26321925 locked_ref->must_insert_reserved = 0;
@@ -2648,9 +1941,89 @@
26481941 }
26491942
26501943 btrfs_put_delayed_ref(ref);
2651
- count++;
26521944 cond_resched();
1945
+
1946
+ spin_lock(&locked_ref->lock);
1947
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
26531948 }
1949
+
1950
+ return 0;
1951
+}
1952
+
1953
+/*
1954
+ * Returns 0 on success or if called with an already aborted transaction.
1955
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
1956
+ */
1957
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1958
+ unsigned long nr)
1959
+{
1960
+ struct btrfs_fs_info *fs_info = trans->fs_info;
1961
+ struct btrfs_delayed_ref_root *delayed_refs;
1962
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
1963
+ ktime_t start = ktime_get();
1964
+ int ret;
1965
+ unsigned long count = 0;
1966
+ unsigned long actual_count = 0;
1967
+
1968
+ delayed_refs = &trans->transaction->delayed_refs;
1969
+ do {
1970
+ if (!locked_ref) {
1971
+ locked_ref = btrfs_obtain_ref_head(trans);
1972
+ if (IS_ERR_OR_NULL(locked_ref)) {
1973
+ if (PTR_ERR(locked_ref) == -EAGAIN) {
1974
+ continue;
1975
+ } else {
1976
+ break;
1977
+ }
1978
+ }
1979
+ count++;
1980
+ }
1981
+ /*
1982
+ * We need to try and merge add/drops of the same ref since we
1983
+ * can run into issues with relocate dropping the implicit ref
1984
+ * and then it being added back again before the drop can
1985
+ * finish. If we merged anything we need to re-loop so we can
1986
+ * get a good ref.
1987
+ * Or we can get node references of the same type that weren't
1988
+ * merged when created due to bumps in the tree mod seq, and
1989
+ * we need to merge them to prevent adding an inline extent
1990
+ * backref before dropping it (triggering a BUG_ON at
1991
+ * insert_inline_extent_backref()).
1992
+ */
1993
+ spin_lock(&locked_ref->lock);
1994
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
1995
+
1996
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
1997
+ &actual_count);
1998
+ if (ret < 0 && ret != -EAGAIN) {
1999
+ /*
2000
+ * Error, btrfs_run_delayed_refs_for_head already
2001
+ * unlocked everything so just bail out
2002
+ */
2003
+ return ret;
2004
+ } else if (!ret) {
2005
+ /*
2006
+ * Success, perform the usual cleanup of a processed
2007
+ * head
2008
+ */
2009
+ ret = cleanup_ref_head(trans, locked_ref);
2010
+ if (ret > 0 ) {
2011
+ /* We dropped our lock, we need to loop. */
2012
+ ret = 0;
2013
+ continue;
2014
+ } else if (ret) {
2015
+ return ret;
2016
+ }
2017
+ }
2018
+
2019
+ /*
2020
+ * Either success case or btrfs_run_delayed_refs_for_head
2021
+ * returned -EAGAIN, meaning we need to select another head
2022
+ */
2023
+
2024
+ locked_ref = NULL;
2025
+ cond_resched();
2026
+ } while ((nr != -1 && count < nr) || locked_ref);
26542027
26552028 /*
26562029 * We don't want to include ref heads since we can have empty ref heads
@@ -2716,22 +2089,6 @@
27162089 }
27172090 #endif
27182091
2719
-static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2720
-{
2721
- u64 num_bytes;
2722
-
2723
- num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2724
- sizeof(struct btrfs_extent_inline_ref));
2725
- if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2726
- num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2727
-
2728
- /*
2729
- * We don't ever fill up leaves all the way so multiply by 2 just to be
2730
- * closer to what we're really going to want to use.
2731
- */
2732
- return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2733
-}
2734
-
27352092 /*
27362093 * Takes the number of bytes to be csumm'ed and figures out how many leaves it
27372094 * would require to store the csums for that many bytes.
@@ -2749,153 +2106,6 @@
27492106 num_csums += num_csums_per_leaf - 1;
27502107 num_csums = div64_u64(num_csums, num_csums_per_leaf);
27512108 return num_csums;
2752
-}
2753
-
2754
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2755
- struct btrfs_fs_info *fs_info)
2756
-{
2757
- struct btrfs_block_rsv *global_rsv;
2758
- u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2759
- u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2760
- unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
2761
- u64 num_bytes, num_dirty_bgs_bytes;
2762
- int ret = 0;
2763
-
2764
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
2765
- num_heads = heads_to_leaves(fs_info, num_heads);
2766
- if (num_heads > 1)
2767
- num_bytes += (num_heads - 1) * fs_info->nodesize;
2768
- num_bytes <<= 1;
2769
- num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
2770
- fs_info->nodesize;
2771
- num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
2772
- num_dirty_bgs);
2773
- global_rsv = &fs_info->global_block_rsv;
2774
-
2775
- /*
2776
- * If we can't allocate any more chunks lets make sure we have _lots_ of
2777
- * wiggle room since running delayed refs can create more delayed refs.
2778
- */
2779
- if (global_rsv->space_info->full) {
2780
- num_dirty_bgs_bytes <<= 1;
2781
- num_bytes <<= 1;
2782
- }
2783
-
2784
- spin_lock(&global_rsv->lock);
2785
- if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2786
- ret = 1;
2787
- spin_unlock(&global_rsv->lock);
2788
- return ret;
2789
-}
2790
-
2791
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2792
- struct btrfs_fs_info *fs_info)
2793
-{
2794
- u64 num_entries =
2795
- atomic_read(&trans->transaction->delayed_refs.num_entries);
2796
- u64 avg_runtime;
2797
- u64 val;
2798
-
2799
- smp_mb();
2800
- avg_runtime = fs_info->avg_delayed_ref_runtime;
2801
- val = num_entries * avg_runtime;
2802
- if (val >= NSEC_PER_SEC)
2803
- return 1;
2804
- if (val >= NSEC_PER_SEC / 2)
2805
- return 2;
2806
-
2807
- return btrfs_check_space_for_delayed_refs(trans, fs_info);
2808
-}
2809
-
2810
-struct async_delayed_refs {
2811
- struct btrfs_root *root;
2812
- u64 transid;
2813
- int count;
2814
- int error;
2815
- int sync;
2816
- struct completion wait;
2817
- struct btrfs_work work;
2818
-};
2819
-
2820
-static inline struct async_delayed_refs *
2821
-to_async_delayed_refs(struct btrfs_work *work)
2822
-{
2823
- return container_of(work, struct async_delayed_refs, work);
2824
-}
2825
-
2826
-static void delayed_ref_async_start(struct btrfs_work *work)
2827
-{
2828
- struct async_delayed_refs *async = to_async_delayed_refs(work);
2829
- struct btrfs_trans_handle *trans;
2830
- struct btrfs_fs_info *fs_info = async->root->fs_info;
2831
- int ret;
2832
-
2833
- /* if the commit is already started, we don't need to wait here */
2834
- if (btrfs_transaction_blocked(fs_info))
2835
- goto done;
2836
-
2837
- trans = btrfs_join_transaction(async->root);
2838
- if (IS_ERR(trans)) {
2839
- async->error = PTR_ERR(trans);
2840
- goto done;
2841
- }
2842
-
2843
- /*
2844
- * trans->sync means that when we call end_transaction, we won't
2845
- * wait on delayed refs
2846
- */
2847
- trans->sync = true;
2848
-
2849
- /* Don't bother flushing if we got into a different transaction */
2850
- if (trans->transid > async->transid)
2851
- goto end;
2852
-
2853
- ret = btrfs_run_delayed_refs(trans, async->count);
2854
- if (ret)
2855
- async->error = ret;
2856
-end:
2857
- ret = btrfs_end_transaction(trans);
2858
- if (ret && !async->error)
2859
- async->error = ret;
2860
-done:
2861
- if (async->sync)
2862
- complete(&async->wait);
2863
- else
2864
- kfree(async);
2865
-}
2866
-
2867
-int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
2868
- unsigned long count, u64 transid, int wait)
2869
-{
2870
- struct async_delayed_refs *async;
2871
- int ret;
2872
-
2873
- async = kmalloc(sizeof(*async), GFP_NOFS);
2874
- if (!async)
2875
- return -ENOMEM;
2876
-
2877
- async->root = fs_info->tree_root;
2878
- async->count = count;
2879
- async->error = 0;
2880
- async->transid = transid;
2881
- if (wait)
2882
- async->sync = 1;
2883
- else
2884
- async->sync = 0;
2885
- init_completion(&async->wait);
2886
-
2887
- btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2888
- delayed_ref_async_start, NULL, NULL);
2889
-
2890
- btrfs_queue_work(fs_info->extent_workers, &async->work);
2891
-
2892
- if (wait) {
2893
- wait_for_completion(&async->wait);
2894
- ret = async->error;
2895
- kfree(async);
2896
- return ret;
2897
- }
2898
- return 0;
28992109 }
29002110
29012111 /*
@@ -2919,7 +2129,7 @@
29192129 int run_all = count == (unsigned long)-1;
29202130
29212131 /* We'll clean this up in btrfs_cleanup_transaction */
2922
- if (trans->aborted)
2132
+ if (TRANS_ABORTED(trans))
29232133 return 0;
29242134
29252135 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
@@ -2940,11 +2150,10 @@
29402150 }
29412151
29422152 if (run_all) {
2943
- if (!list_empty(&trans->new_bgs))
2944
- btrfs_create_pending_block_groups(trans);
2153
+ btrfs_create_pending_block_groups(trans);
29452154
29462155 spin_lock(&delayed_refs->lock);
2947
- node = rb_first(&delayed_refs->href_root);
2156
+ node = rb_first_cached(&delayed_refs->href_root);
29482157 if (!node) {
29492158 spin_unlock(&delayed_refs->lock);
29502159 goto out;
@@ -2967,8 +2176,7 @@
29672176 }
29682177
29692178 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2970
- struct btrfs_fs_info *fs_info,
2971
- u64 bytenr, u64 num_bytes, u64 flags,
2179
+ struct extent_buffer *eb, u64 flags,
29722180 int level, int is_data)
29732181 {
29742182 struct btrfs_delayed_extent_op *extent_op;
@@ -2984,8 +2192,7 @@
29842192 extent_op->is_data = is_data ? true : false;
29852193 extent_op->level = level;
29862194
2987
- ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
2988
- num_bytes, extent_op);
2195
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
29892196 if (ret)
29902197 btrfs_free_delayed_extent_op(extent_op);
29912198 return ret;
@@ -3043,7 +2250,8 @@
30432250 * XXX: We should replace this with a proper search function in the
30442251 * future.
30452252 */
3046
- for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
2253
+ for (node = rb_first_cached(&head->ref_tree); node;
2254
+ node = rb_next(node)) {
30472255 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
30482256 /* If it's a shared ref we know a cross reference exists */
30492257 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -3072,7 +2280,8 @@
30722280
30732281 static noinline int check_committed_ref(struct btrfs_root *root,
30742282 struct btrfs_path *path,
3075
- u64 objectid, u64 offset, u64 bytenr)
2283
+ u64 objectid, u64 offset, u64 bytenr,
2284
+ bool strict)
30762285 {
30772286 struct btrfs_fs_info *fs_info = root->fs_info;
30782287 struct btrfs_root *extent_root = fs_info->extent_root;
@@ -3109,16 +2318,23 @@
31092318 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
31102319 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
31112320
2321
+ /* If extent item has more than 1 inline ref then it's shared */
31122322 if (item_size != sizeof(*ei) +
31132323 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
31142324 goto out;
31152325
3116
- if (btrfs_extent_generation(leaf, ei) <=
3117
- btrfs_root_last_snapshot(&root->root_item))
2326
+ /*
2327
+ * If extent created before last snapshot => it's shared unless the
2328
+ * snapshot has been deleted. Use the heuristic if strict is false.
2329
+ */
2330
+ if (!strict &&
2331
+ (btrfs_extent_generation(leaf, ei) <=
2332
+ btrfs_root_last_snapshot(&root->root_item)))
31182333 goto out;
31192334
31202335 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
31212336
2337
+ /* If this extent has SHARED_DATA_REF then it's shared */
31222338 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
31232339 if (type != BTRFS_EXTENT_DATA_REF_KEY)
31242340 goto out;
....@@ -3138,11 +2354,10 @@
31382354 }
31392355
31402356 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3141
- u64 bytenr)
2357
+ u64 bytenr, bool strict)
31422358 {
31432359 struct btrfs_path *path;
31442360 int ret;
3145
- int ret2;
31462361
31472362 path = btrfs_alloc_path();
31482363 if (!path)
....@@ -3150,21 +2365,13 @@
31502365
31512366 do {
31522367 ret = check_committed_ref(root, path, objectid,
3153
- offset, bytenr);
2368
+ offset, bytenr, strict);
31542369 if (ret && ret != -ENOENT)
31552370 goto out;
31562371
3157
- ret2 = check_delayed_ref(root, path, objectid,
3158
- offset, bytenr);
3159
- } while (ret2 == -EAGAIN);
2372
+ ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2373
+ } while (ret == -EAGAIN);
31602374
3161
- if (ret2 && ret2 != -ENOENT) {
3162
- ret = ret2;
3163
- goto out;
3164
- }
3165
-
3166
- if (ret != -ENOENT || ret2 != -ENOENT)
3167
- ret = 0;
31682375 out:
31692376 btrfs_free_path(path);
31702377 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
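After this rework a single ret carries the verdict: a positive return indicates a cross reference (the extent may be shared), 0 means none was found, and negative values are errors. The new strict flag turns off the generation-vs-last-snapshot shortcut for callers that cannot tolerate a stale "not shared" answer. A minimal caller sketch under those assumptions (variable names are illustrative):

	/* decide whether a NOCOW overwrite of disk_bytenr is safe */
	ret = btrfs_cross_ref_exist(root, ino, file_offset, disk_bytenr, strict);
	if (ret < 0)
		return ret;	/* lookup failed */
	if (ret > 0)
		goto force_cow;	/* extent may be shared, must COW */
	/* ret == 0: no other reference seen, NOCOW is allowed */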
....@@ -3185,13 +2392,12 @@
31852392 u32 nritems;
31862393 struct btrfs_key key;
31872394 struct btrfs_file_extent_item *fi;
2395
+ struct btrfs_ref generic_ref = { 0 };
2396
+ bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
31882397 int i;
2398
+ int action;
31892399 int level;
31902400 int ret = 0;
3191
- int (*process_func)(struct btrfs_trans_handle *,
3192
- struct btrfs_root *,
3193
- u64, u64, u64, u64, u64, u64);
3194
-
31952401
31962402 if (btrfs_is_testing(fs_info))
31972403 return 0;
....@@ -3200,18 +2406,17 @@
32002406 nritems = btrfs_header_nritems(buf);
32012407 level = btrfs_header_level(buf);
32022408
3203
- if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
2409
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
32042410 return 0;
3205
-
3206
- if (inc)
3207
- process_func = btrfs_inc_extent_ref;
3208
- else
3209
- process_func = btrfs_free_extent;
32102411
32112412 if (full_backref)
32122413 parent = buf->start;
32132414 else
32142415 parent = 0;
2416
+ if (inc)
2417
+ action = BTRFS_ADD_DELAYED_REF;
2418
+ else
2419
+ action = BTRFS_DROP_DELAYED_REF;
32152420
32162421 for (i = 0; i < nritems; i++) {
32172422 if (level == 0) {
....@@ -3229,16 +2434,30 @@
32292434
32302435 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
32312436 key.offset -= btrfs_file_extent_offset(buf, fi);
3232
- ret = process_func(trans, root, bytenr, num_bytes,
3233
- parent, ref_root, key.objectid,
3234
- key.offset);
2437
+ btrfs_init_generic_ref(&generic_ref, action, bytenr,
2438
+ num_bytes, parent);
2439
+ generic_ref.real_root = root->root_key.objectid;
2440
+ btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2441
+ key.offset);
2442
+ generic_ref.skip_qgroup = for_reloc;
2443
+ if (inc)
2444
+ ret = btrfs_inc_extent_ref(trans, &generic_ref);
2445
+ else
2446
+ ret = btrfs_free_extent(trans, &generic_ref);
32352447 if (ret)
32362448 goto fail;
32372449 } else {
32382450 bytenr = btrfs_node_blockptr(buf, i);
32392451 num_bytes = fs_info->nodesize;
3240
- ret = process_func(trans, root, bytenr, num_bytes,
3241
- parent, ref_root, level - 1, 0);
2452
+ btrfs_init_generic_ref(&generic_ref, action, bytenr,
2453
+ num_bytes, parent);
2454
+ generic_ref.real_root = root->root_key.objectid;
2455
+ btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
2456
+ generic_ref.skip_qgroup = for_reloc;
2457
+ if (inc)
2458
+ ret = btrfs_inc_extent_ref(trans, &generic_ref);
2459
+ else
2460
+ ret = btrfs_free_extent(trans, &generic_ref);
32422461 if (ret)
32432462 goto fail;
32442463 }
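The conversion above replaces the process_func indirection with a btrfs_ref description that the delayed-ref code consumes. Shown in isolation, the same init pattern used in this hunk for a single tree-block reference (a minimal sketch built only from the calls visible in the diff):

	struct btrfs_ref ref = { 0 };

	/* describe the reference: action, block start, length, parent (0 = keyed ref) */
	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, eb->start, eb->len, 0);
	/* it refers to a tree block at this level, owned by this root */
	btrfs_init_tree_ref(&ref, btrfs_header_level(eb), root->root_key.objectid);
	ref.skip_qgroup = for_reloc;

	ret = btrfs_inc_extent_ref(trans, &ref);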
....@@ -3260,555 +2479,9 @@
32602479 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
32612480 }
32622481
3263
-static int write_one_cache_group(struct btrfs_trans_handle *trans,
3264
- struct btrfs_fs_info *fs_info,
3265
- struct btrfs_path *path,
3266
- struct btrfs_block_group_cache *cache)
3267
-{
3268
- int ret;
3269
- struct btrfs_root *extent_root = fs_info->extent_root;
3270
- unsigned long bi;
3271
- struct extent_buffer *leaf;
3272
-
3273
- ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3274
- if (ret) {
3275
- if (ret > 0)
3276
- ret = -ENOENT;
3277
- goto fail;
3278
- }
3279
-
3280
- leaf = path->nodes[0];
3281
- bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3282
- write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3283
- btrfs_mark_buffer_dirty(leaf);
3284
-fail:
3285
- btrfs_release_path(path);
3286
- return ret;
3287
-
3288
-}
3289
-
3290
-static struct btrfs_block_group_cache *
3291
-next_block_group(struct btrfs_fs_info *fs_info,
3292
- struct btrfs_block_group_cache *cache)
3293
-{
3294
- struct rb_node *node;
3295
-
3296
- spin_lock(&fs_info->block_group_cache_lock);
3297
-
3298
- /* If our block group was removed, we need a full search. */
3299
- if (RB_EMPTY_NODE(&cache->cache_node)) {
3300
- const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3301
-
3302
- spin_unlock(&fs_info->block_group_cache_lock);
3303
- btrfs_put_block_group(cache);
3304
- cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
3305
- }
3306
- node = rb_next(&cache->cache_node);
3307
- btrfs_put_block_group(cache);
3308
- if (node) {
3309
- cache = rb_entry(node, struct btrfs_block_group_cache,
3310
- cache_node);
3311
- btrfs_get_block_group(cache);
3312
- } else
3313
- cache = NULL;
3314
- spin_unlock(&fs_info->block_group_cache_lock);
3315
- return cache;
3316
-}
3317
-
3318
-static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3319
- struct btrfs_trans_handle *trans,
3320
- struct btrfs_path *path)
3321
-{
3322
- struct btrfs_fs_info *fs_info = block_group->fs_info;
3323
- struct btrfs_root *root = fs_info->tree_root;
3324
- struct inode *inode = NULL;
3325
- struct extent_changeset *data_reserved = NULL;
3326
- u64 alloc_hint = 0;
3327
- int dcs = BTRFS_DC_ERROR;
3328
- u64 num_pages = 0;
3329
- int retries = 0;
3330
- int ret = 0;
3331
-
3332
- /*
3333
- * If this block group is smaller than 100 megs don't bother caching the
3334
- * block group.
3335
- */
3336
- if (block_group->key.offset < (100 * SZ_1M)) {
3337
- spin_lock(&block_group->lock);
3338
- block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3339
- spin_unlock(&block_group->lock);
3340
- return 0;
3341
- }
3342
-
3343
- if (trans->aborted)
3344
- return 0;
3345
-again:
3346
- inode = lookup_free_space_inode(fs_info, block_group, path);
3347
- if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3348
- ret = PTR_ERR(inode);
3349
- btrfs_release_path(path);
3350
- goto out;
3351
- }
3352
-
3353
- if (IS_ERR(inode)) {
3354
- BUG_ON(retries);
3355
- retries++;
3356
-
3357
- if (block_group->ro)
3358
- goto out_free;
3359
-
3360
- ret = create_free_space_inode(fs_info, trans, block_group,
3361
- path);
3362
- if (ret)
3363
- goto out_free;
3364
- goto again;
3365
- }
3366
-
3367
- /*
3368
- * We want to set the generation to 0, that way if anything goes wrong
3369
- * from here on out we know not to trust this cache when we load up next
3370
- * time.
3371
- */
3372
- BTRFS_I(inode)->generation = 0;
3373
- ret = btrfs_update_inode(trans, root, inode);
3374
- if (ret) {
3375
- /*
3376
- * So theoretically we could recover from this, simply set the
3377
- * super cache generation to 0 so we know to invalidate the
3378
- * cache, but then we'd have to keep track of the block groups
3379
- * that fail this way so we know we _have_ to reset this cache
3380
- * before the next commit or risk reading stale cache. So to
3381
- * limit our exposure to horrible edge cases lets just abort the
3382
- * transaction, this only happens in really bad situations
3383
- * anyway.
3384
- */
3385
- btrfs_abort_transaction(trans, ret);
3386
- goto out_put;
3387
- }
3388
- WARN_ON(ret);
3389
-
3390
- /* We've already setup this transaction, go ahead and exit */
3391
- if (block_group->cache_generation == trans->transid &&
3392
- i_size_read(inode)) {
3393
- dcs = BTRFS_DC_SETUP;
3394
- goto out_put;
3395
- }
3396
-
3397
- if (i_size_read(inode) > 0) {
3398
- ret = btrfs_check_trunc_cache_free_space(fs_info,
3399
- &fs_info->global_block_rsv);
3400
- if (ret)
3401
- goto out_put;
3402
-
3403
- ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3404
- if (ret)
3405
- goto out_put;
3406
- }
3407
-
3408
- spin_lock(&block_group->lock);
3409
- if (block_group->cached != BTRFS_CACHE_FINISHED ||
3410
- !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3411
- /*
3412
- * don't bother trying to write stuff out _if_
3413
- * a) we're not cached,
3414
- * b) we're with nospace_cache mount option,
3415
- * c) we're with v2 space_cache (FREE_SPACE_TREE).
3416
- */
3417
- dcs = BTRFS_DC_WRITTEN;
3418
- spin_unlock(&block_group->lock);
3419
- goto out_put;
3420
- }
3421
- spin_unlock(&block_group->lock);
3422
-
3423
- /*
3424
- * We hit an ENOSPC when setting up the cache in this transaction, just
3425
- * skip doing the setup, we've already cleared the cache so we're safe.
3426
- */
3427
- if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3428
- ret = -ENOSPC;
3429
- goto out_put;
3430
- }
3431
-
3432
- /*
3433
- * Try to preallocate enough space based on how big the block group is.
3434
- * Keep in mind this has to include any pinned space which could end up
3435
- * taking up quite a bit since it's not folded into the other space
3436
- * cache.
3437
- */
3438
- num_pages = div_u64(block_group->key.offset, SZ_256M);
3439
- if (!num_pages)
3440
- num_pages = 1;
3441
-
3442
- num_pages *= 16;
3443
- num_pages *= PAGE_SIZE;
3444
-
3445
- ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3446
- if (ret)
3447
- goto out_put;
3448
-
3449
- ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3450
- num_pages, num_pages,
3451
- &alloc_hint);
3452
- /*
3453
- * Our cache requires contiguous chunks so that we don't modify a bunch
3454
- * of metadata or split extents when writing the cache out, which means
3455
- * we can enospc if we are heavily fragmented in addition to just normal
3456
- * out of space conditions. So if we hit this just skip setting up any
3457
- * other block groups for this transaction, maybe we'll unpin enough
3458
- * space the next time around.
3459
- */
3460
- if (!ret)
3461
- dcs = BTRFS_DC_SETUP;
3462
- else if (ret == -ENOSPC)
3463
- set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3464
-
3465
-out_put:
3466
- iput(inode);
3467
-out_free:
3468
- btrfs_release_path(path);
3469
-out:
3470
- spin_lock(&block_group->lock);
3471
- if (!ret && dcs == BTRFS_DC_SETUP)
3472
- block_group->cache_generation = trans->transid;
3473
- block_group->disk_cache_state = dcs;
3474
- spin_unlock(&block_group->lock);
3475
-
3476
- extent_changeset_free(data_reserved);
3477
- return ret;
3478
-}
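The preallocation sizing in the function above scales with the block group size. A worked example for a 1GiB block group with 4KiB pages:

	/*
	 * num_pages = div_u64(SZ_1G, SZ_256M)  ->  4
	 * num_pages *= 16                      ->  64
	 * num_pages *= PAGE_SIZE (4096)        ->  262144 bytes
	 *
	 * i.e. 256KiB of data space is reserved and preallocated for that
	 * block group's v1 free space cache inode.
	 */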
3479
-
3480
-int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3481
- struct btrfs_fs_info *fs_info)
3482
-{
3483
- struct btrfs_block_group_cache *cache, *tmp;
3484
- struct btrfs_transaction *cur_trans = trans->transaction;
3485
- struct btrfs_path *path;
3486
-
3487
- if (list_empty(&cur_trans->dirty_bgs) ||
3488
- !btrfs_test_opt(fs_info, SPACE_CACHE))
3489
- return 0;
3490
-
3491
- path = btrfs_alloc_path();
3492
- if (!path)
3493
- return -ENOMEM;
3494
-
3495
- /* Could add new block groups, use _safe just in case */
3496
- list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3497
- dirty_list) {
3498
- if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3499
- cache_save_setup(cache, trans, path);
3500
- }
3501
-
3502
- btrfs_free_path(path);
3503
- return 0;
3504
-}
3505
-
3506
-/*
3507
- * transaction commit does final block group cache writeback during a
3508
- * critical section where nothing is allowed to change the FS. This is
3509
- * required in order for the cache to actually match the block group,
3510
- * but can introduce a lot of latency into the commit.
3511
- *
3512
- * So, btrfs_start_dirty_block_groups is here to kick off block group
3513
- * cache IO. There's a chance we'll have to redo some of it if the
3514
- * block group changes again during the commit, but it greatly reduces
3515
- * the commit latency by getting rid of the easy block groups while
3516
- * we're still allowing others to join the commit.
3517
- */
3518
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3519
-{
3520
- struct btrfs_fs_info *fs_info = trans->fs_info;
3521
- struct btrfs_block_group_cache *cache;
3522
- struct btrfs_transaction *cur_trans = trans->transaction;
3523
- int ret = 0;
3524
- int should_put;
3525
- struct btrfs_path *path = NULL;
3526
- LIST_HEAD(dirty);
3527
- struct list_head *io = &cur_trans->io_bgs;
3528
- int num_started = 0;
3529
- int loops = 0;
3530
-
3531
- spin_lock(&cur_trans->dirty_bgs_lock);
3532
- if (list_empty(&cur_trans->dirty_bgs)) {
3533
- spin_unlock(&cur_trans->dirty_bgs_lock);
3534
- return 0;
3535
- }
3536
- list_splice_init(&cur_trans->dirty_bgs, &dirty);
3537
- spin_unlock(&cur_trans->dirty_bgs_lock);
3538
-
3539
-again:
3540
- /*
3541
- * make sure all the block groups on our dirty list actually
3542
- * exist
3543
- */
3544
- btrfs_create_pending_block_groups(trans);
3545
-
3546
- if (!path) {
3547
- path = btrfs_alloc_path();
3548
- if (!path)
3549
- return -ENOMEM;
3550
- }
3551
-
3552
- /*
3553
- * cache_write_mutex is here only to save us from balance or automatic
3554
- * removal of empty block groups deleting this block group while we are
3555
- * writing out the cache
3556
- */
3557
- mutex_lock(&trans->transaction->cache_write_mutex);
3558
- while (!list_empty(&dirty)) {
3559
- cache = list_first_entry(&dirty,
3560
- struct btrfs_block_group_cache,
3561
- dirty_list);
3562
- /*
3563
- * this can happen if something re-dirties a block
3564
- * group that is already under IO. Just wait for it to
3565
- * finish and then do it all again
3566
- */
3567
- if (!list_empty(&cache->io_list)) {
3568
- list_del_init(&cache->io_list);
3569
- btrfs_wait_cache_io(trans, cache, path);
3570
- btrfs_put_block_group(cache);
3571
- }
3572
-
3573
-
3574
- /*
3575
- * btrfs_wait_cache_io uses the cache->dirty_list to decide
3576
- * if it should update the cache_state. Don't delete
3577
- * until after we wait.
3578
- *
3579
- * Since we're not running in the commit critical section
3580
- * we need the dirty_bgs_lock to protect from update_block_group
3581
- */
3582
- spin_lock(&cur_trans->dirty_bgs_lock);
3583
- list_del_init(&cache->dirty_list);
3584
- spin_unlock(&cur_trans->dirty_bgs_lock);
3585
-
3586
- should_put = 1;
3587
-
3588
- cache_save_setup(cache, trans, path);
3589
-
3590
- if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3591
- cache->io_ctl.inode = NULL;
3592
- ret = btrfs_write_out_cache(fs_info, trans,
3593
- cache, path);
3594
- if (ret == 0 && cache->io_ctl.inode) {
3595
- num_started++;
3596
- should_put = 0;
3597
-
3598
- /*
3599
- * The cache_write_mutex is protecting the
3600
- * io_list, also refer to the definition of
3601
- * btrfs_transaction::io_bgs for more details
3602
- */
3603
- list_add_tail(&cache->io_list, io);
3604
- } else {
3605
- /*
3606
- * if we failed to write the cache, the
3607
- * generation will be bad and life goes on
3608
- */
3609
- ret = 0;
3610
- }
3611
- }
3612
- if (!ret) {
3613
- ret = write_one_cache_group(trans, fs_info,
3614
- path, cache);
3615
- /*
3616
- * Our block group might still be attached to the list
3617
- * of new block groups in the transaction handle of some
3618
- * other task (struct btrfs_trans_handle->new_bgs). This
3619
- * means its block group item isn't yet in the extent
3620
- * tree. If this happens ignore the error, as we will
3621
- * try again later in the critical section of the
3622
- * transaction commit.
3623
- */
3624
- if (ret == -ENOENT) {
3625
- ret = 0;
3626
- spin_lock(&cur_trans->dirty_bgs_lock);
3627
- if (list_empty(&cache->dirty_list)) {
3628
- list_add_tail(&cache->dirty_list,
3629
- &cur_trans->dirty_bgs);
3630
- btrfs_get_block_group(cache);
3631
- }
3632
- spin_unlock(&cur_trans->dirty_bgs_lock);
3633
- } else if (ret) {
3634
- btrfs_abort_transaction(trans, ret);
3635
- }
3636
- }
3637
-
3638
- /* if its not on the io list, we need to put the block group */
3639
- if (should_put)
3640
- btrfs_put_block_group(cache);
3641
-
3642
- if (ret)
3643
- break;
3644
-
3645
- /*
3646
- * Avoid blocking other tasks for too long. It might even save
3647
- * us from writing caches for block groups that are going to be
3648
- * removed.
3649
- */
3650
- mutex_unlock(&trans->transaction->cache_write_mutex);
3651
- mutex_lock(&trans->transaction->cache_write_mutex);
3652
- }
3653
- mutex_unlock(&trans->transaction->cache_write_mutex);
3654
-
3655
- /*
3656
- * go through delayed refs for all the stuff we've just kicked off
3657
- * and then loop back (just once)
3658
- */
3659
- ret = btrfs_run_delayed_refs(trans, 0);
3660
- if (!ret && loops == 0) {
3661
- loops++;
3662
- spin_lock(&cur_trans->dirty_bgs_lock);
3663
- list_splice_init(&cur_trans->dirty_bgs, &dirty);
3664
- /*
3665
- * dirty_bgs_lock protects us from concurrent block group
3666
- * deletes too (not just cache_write_mutex).
3667
- */
3668
- if (!list_empty(&dirty)) {
3669
- spin_unlock(&cur_trans->dirty_bgs_lock);
3670
- goto again;
3671
- }
3672
- spin_unlock(&cur_trans->dirty_bgs_lock);
3673
- } else if (ret < 0) {
3674
- btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3675
- }
3676
-
3677
- btrfs_free_path(path);
3678
- return ret;
3679
-}
3680
-
3681
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3682
- struct btrfs_fs_info *fs_info)
3683
-{
3684
- struct btrfs_block_group_cache *cache;
3685
- struct btrfs_transaction *cur_trans = trans->transaction;
3686
- int ret = 0;
3687
- int should_put;
3688
- struct btrfs_path *path;
3689
- struct list_head *io = &cur_trans->io_bgs;
3690
- int num_started = 0;
3691
-
3692
- path = btrfs_alloc_path();
3693
- if (!path)
3694
- return -ENOMEM;
3695
-
3696
- /*
3697
- * Even though we are in the critical section of the transaction commit,
3698
- * we can still have concurrent tasks adding elements to this
3699
- * transaction's list of dirty block groups. These tasks correspond to
3700
- * endio free space workers started when writeback finishes for a
3701
- * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3702
- * allocate new block groups as a result of COWing nodes of the root
3703
- * tree when updating the free space inode. The writeback for the space
3704
- * caches is triggered by an earlier call to
3705
- * btrfs_start_dirty_block_groups() and iterations of the following
3706
- * loop.
3707
- * Also we want to do the cache_save_setup first and then run the
3708
- * delayed refs to make sure we have the best chance at doing this all
3709
- * in one shot.
3710
- */
3711
- spin_lock(&cur_trans->dirty_bgs_lock);
3712
- while (!list_empty(&cur_trans->dirty_bgs)) {
3713
- cache = list_first_entry(&cur_trans->dirty_bgs,
3714
- struct btrfs_block_group_cache,
3715
- dirty_list);
3716
-
3717
- /*
3718
- * this can happen if cache_save_setup re-dirties a block
3719
- * group that is already under IO. Just wait for it to
3720
- * finish and then do it all again
3721
- */
3722
- if (!list_empty(&cache->io_list)) {
3723
- spin_unlock(&cur_trans->dirty_bgs_lock);
3724
- list_del_init(&cache->io_list);
3725
- btrfs_wait_cache_io(trans, cache, path);
3726
- btrfs_put_block_group(cache);
3727
- spin_lock(&cur_trans->dirty_bgs_lock);
3728
- }
3729
-
3730
- /*
3731
- * don't remove from the dirty list until after we've waited
3732
- * on any pending IO
3733
- */
3734
- list_del_init(&cache->dirty_list);
3735
- spin_unlock(&cur_trans->dirty_bgs_lock);
3736
- should_put = 1;
3737
-
3738
- cache_save_setup(cache, trans, path);
3739
-
3740
- if (!ret)
3741
- ret = btrfs_run_delayed_refs(trans,
3742
- (unsigned long) -1);
3743
-
3744
- if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3745
- cache->io_ctl.inode = NULL;
3746
- ret = btrfs_write_out_cache(fs_info, trans,
3747
- cache, path);
3748
- if (ret == 0 && cache->io_ctl.inode) {
3749
- num_started++;
3750
- should_put = 0;
3751
- list_add_tail(&cache->io_list, io);
3752
- } else {
3753
- /*
3754
- * if we failed to write the cache, the
3755
- * generation will be bad and life goes on
3756
- */
3757
- ret = 0;
3758
- }
3759
- }
3760
- if (!ret) {
3761
- ret = write_one_cache_group(trans, fs_info,
3762
- path, cache);
3763
- /*
3764
- * One of the free space endio workers might have
3765
- * created a new block group while updating a free space
3766
- * cache's inode (at inode.c:btrfs_finish_ordered_io())
3767
- * and hasn't released its transaction handle yet, in
3768
- * which case the new block group is still attached to
3769
- * its transaction handle and its creation has not
3770
- * finished yet (no block group item in the extent tree
3771
- * yet, etc). If this is the case, wait for all free
3772
- * space endio workers to finish and retry. This is a
3773
- * a very rare case so no need for a more efficient and
3774
- * complex approach.
3775
- */
3776
- if (ret == -ENOENT) {
3777
- wait_event(cur_trans->writer_wait,
3778
- atomic_read(&cur_trans->num_writers) == 1);
3779
- ret = write_one_cache_group(trans, fs_info,
3780
- path, cache);
3781
- }
3782
- if (ret)
3783
- btrfs_abort_transaction(trans, ret);
3784
- }
3785
-
3786
- /* if its not on the io list, we need to put the block group */
3787
- if (should_put)
3788
- btrfs_put_block_group(cache);
3789
- spin_lock(&cur_trans->dirty_bgs_lock);
3790
- }
3791
- spin_unlock(&cur_trans->dirty_bgs_lock);
3792
-
3793
- /*
3794
- * Refer to the definition of io_bgs member for details why it's safe
3795
- * to use it without any locking
3796
- */
3797
- while (!list_empty(io)) {
3798
- cache = list_first_entry(io, struct btrfs_block_group_cache,
3799
- io_list);
3800
- list_del_init(&cache->io_list);
3801
- btrfs_wait_cache_io(trans, cache, path);
3802
- btrfs_put_block_group(cache);
3803
- }
3804
-
3805
- btrfs_free_path(path);
3806
- return ret;
3807
-}
3808
-
38092482 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
38102483 {
3811
- struct btrfs_block_group_cache *block_group;
2484
+ struct btrfs_block_group *block_group;
38122485 int readonly = 0;
38132486
38142487 block_group = btrfs_lookup_block_group(fs_info, bytenr);
....@@ -3817,253 +2490,6 @@
38172490 if (block_group)
38182491 btrfs_put_block_group(block_group);
38192492 return readonly;
3820
-}
3821
-
3822
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3823
-{
3824
- struct btrfs_block_group_cache *bg;
3825
- bool ret = true;
3826
-
3827
- bg = btrfs_lookup_block_group(fs_info, bytenr);
3828
- if (!bg)
3829
- return false;
3830
-
3831
- spin_lock(&bg->lock);
3832
- if (bg->ro)
3833
- ret = false;
3834
- else
3835
- atomic_inc(&bg->nocow_writers);
3836
- spin_unlock(&bg->lock);
3837
-
3838
- /* no put on block group, done by btrfs_dec_nocow_writers */
3839
- if (!ret)
3840
- btrfs_put_block_group(bg);
3841
-
3842
- return ret;
3843
-
3844
-}
3845
-
3846
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3847
-{
3848
- struct btrfs_block_group_cache *bg;
3849
-
3850
- bg = btrfs_lookup_block_group(fs_info, bytenr);
3851
- ASSERT(bg);
3852
- if (atomic_dec_and_test(&bg->nocow_writers))
3853
- wake_up_var(&bg->nocow_writers);
3854
- /*
3855
- * Once for our lookup and once for the lookup done by a previous call
3856
- * to btrfs_inc_nocow_writers()
3857
- */
3858
- btrfs_put_block_group(bg);
3859
- btrfs_put_block_group(bg);
3860
-}
3861
-
3862
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3863
-{
3864
- wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3865
-}
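The inc/dec/wait trio above keeps a block group from being flipped read-only underneath an in-flight NOCOW writer. The intended pairing, sketched from the comments in these helpers (the write itself is a placeholder; the surrounding write-path details are assumptions):

	if (btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
		/* block group is writable and now pinned; do the NOCOW write */
		do_the_nocow_write();	/* placeholder for the real write path */
		btrfs_dec_nocow_writers(fs_info, disk_bytenr);
	} else {
		/* block group is read-only (or being removed): fall back to COW */
	}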
3866
-
3867
-static const char *alloc_name(u64 flags)
3868
-{
3869
- switch (flags) {
3870
- case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3871
- return "mixed";
3872
- case BTRFS_BLOCK_GROUP_METADATA:
3873
- return "metadata";
3874
- case BTRFS_BLOCK_GROUP_DATA:
3875
- return "data";
3876
- case BTRFS_BLOCK_GROUP_SYSTEM:
3877
- return "system";
3878
- default:
3879
- WARN_ON(1);
3880
- return "invalid-combination";
3881
- };
3882
-}
3883
-
3884
-static int create_space_info(struct btrfs_fs_info *info, u64 flags)
3885
-{
3886
-
3887
- struct btrfs_space_info *space_info;
3888
- int i;
3889
- int ret;
3890
-
3891
- space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
3892
- if (!space_info)
3893
- return -ENOMEM;
3894
-
3895
- ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
3896
- GFP_KERNEL);
3897
- if (ret) {
3898
- kfree(space_info);
3899
- return ret;
3900
- }
3901
-
3902
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3903
- INIT_LIST_HEAD(&space_info->block_groups[i]);
3904
- init_rwsem(&space_info->groups_sem);
3905
- spin_lock_init(&space_info->lock);
3906
- space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3907
- space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3908
- init_waitqueue_head(&space_info->wait);
3909
- INIT_LIST_HEAD(&space_info->ro_bgs);
3910
- INIT_LIST_HEAD(&space_info->tickets);
3911
- INIT_LIST_HEAD(&space_info->priority_tickets);
3912
-
3913
- ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
3914
- info->space_info_kobj, "%s",
3915
- alloc_name(space_info->flags));
3916
- if (ret) {
3917
- kobject_put(&space_info->kobj);
3918
- return ret;
3919
- }
3920
-
3921
- list_add_rcu(&space_info->list, &info->space_info);
3922
- if (flags & BTRFS_BLOCK_GROUP_DATA)
3923
- info->data_sinfo = space_info;
3924
-
3925
- return ret;
3926
-}
3927
-
3928
-static void update_space_info(struct btrfs_fs_info *info, u64 flags,
3929
- u64 total_bytes, u64 bytes_used,
3930
- u64 bytes_readonly,
3931
- struct btrfs_space_info **space_info)
3932
-{
3933
- struct btrfs_space_info *found;
3934
- int factor;
3935
-
3936
- factor = btrfs_bg_type_to_factor(flags);
3937
-
3938
- found = __find_space_info(info, flags);
3939
- ASSERT(found);
3940
- spin_lock(&found->lock);
3941
- found->total_bytes += total_bytes;
3942
- found->disk_total += total_bytes * factor;
3943
- found->bytes_used += bytes_used;
3944
- found->disk_used += bytes_used * factor;
3945
- found->bytes_readonly += bytes_readonly;
3946
- if (total_bytes > 0)
3947
- found->full = 0;
3948
- space_info_add_new_bytes(info, found, total_bytes -
3949
- bytes_used - bytes_readonly);
3950
- spin_unlock(&found->lock);
3951
- *space_info = found;
3952
-}
3953
-
3954
-static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3955
-{
3956
- u64 extra_flags = chunk_to_extended(flags) &
3957
- BTRFS_EXTENDED_PROFILE_MASK;
3958
-
3959
- write_seqlock(&fs_info->profiles_lock);
3960
- if (flags & BTRFS_BLOCK_GROUP_DATA)
3961
- fs_info->avail_data_alloc_bits |= extra_flags;
3962
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
3963
- fs_info->avail_metadata_alloc_bits |= extra_flags;
3964
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3965
- fs_info->avail_system_alloc_bits |= extra_flags;
3966
- write_sequnlock(&fs_info->profiles_lock);
3967
-}
3968
-
3969
-/*
3970
- * returns target flags in extended format or 0 if restripe for this
3971
- * chunk_type is not in progress
3972
- *
3973
- * should be called with balance_lock held
3974
- */
3975
-static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3976
-{
3977
- struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3978
- u64 target = 0;
3979
-
3980
- if (!bctl)
3981
- return 0;
3982
-
3983
- if (flags & BTRFS_BLOCK_GROUP_DATA &&
3984
- bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3985
- target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3986
- } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3987
- bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3988
- target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3989
- } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3990
- bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3991
- target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3992
- }
3993
-
3994
- return target;
3995
-}
3996
-
3997
-/*
3998
- * @flags: available profiles in extended format (see ctree.h)
3999
- *
4000
- * Returns reduced profile in chunk format. If profile changing is in
4001
- * progress (either running or paused) picks the target profile (if it's
4002
- * already available), otherwise falls back to plain reducing.
4003
- */
4004
-static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
4005
-{
4006
- u64 num_devices = fs_info->fs_devices->rw_devices;
4007
- u64 target;
4008
- u64 raid_type;
4009
- u64 allowed = 0;
4010
-
4011
- /*
4012
- * see if restripe for this chunk_type is in progress, if so
4013
- * try to reduce to the target profile
4014
- */
4015
- spin_lock(&fs_info->balance_lock);
4016
- target = get_restripe_target(fs_info, flags);
4017
- if (target) {
4018
- /* pick target profile only if it's already available */
4019
- if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4020
- spin_unlock(&fs_info->balance_lock);
4021
- return extended_to_chunk(target);
4022
- }
4023
- }
4024
- spin_unlock(&fs_info->balance_lock);
4025
-
4026
- /* First, mask out the RAID levels which aren't possible */
4027
- for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4028
- if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4029
- allowed |= btrfs_raid_array[raid_type].bg_flag;
4030
- }
4031
- allowed &= flags;
4032
-
4033
- if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4034
- allowed = BTRFS_BLOCK_GROUP_RAID6;
4035
- else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4036
- allowed = BTRFS_BLOCK_GROUP_RAID5;
4037
- else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4038
- allowed = BTRFS_BLOCK_GROUP_RAID10;
4039
- else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4040
- allowed = BTRFS_BLOCK_GROUP_RAID1;
4041
- else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4042
- allowed = BTRFS_BLOCK_GROUP_RAID0;
4043
-
4044
- flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4045
-
4046
- return extended_to_chunk(flags | allowed);
4047
-}
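A worked reduction for the function above, with two rw devices and an available-profile mask that contains both RAID1 and RAID0:

	/*
	 * Example: flags = DATA | RAID1 | RAID0, rw_devices = 2
	 *   devs_min mask keeps both RAID1 and RAID0        -> allowed = RAID1 | RAID0
	 *   priority RAID6 > RAID5 > RAID10 > RAID1 > RAID0 -> allowed = RAID1
	 *   result: extended_to_chunk(DATA | RAID1)
	 */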
4048
-
4049
-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4050
-{
4051
- unsigned seq;
4052
- u64 flags;
4053
-
4054
- do {
4055
- flags = orig_flags;
4056
- seq = read_seqbegin(&fs_info->profiles_lock);
4057
-
4058
- if (flags & BTRFS_BLOCK_GROUP_DATA)
4059
- flags |= fs_info->avail_data_alloc_bits;
4060
- else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4061
- flags |= fs_info->avail_system_alloc_bits;
4062
- else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4063
- flags |= fs_info->avail_metadata_alloc_bits;
4064
- } while (read_seqretry(&fs_info->profiles_lock, seq));
4065
-
4066
- return btrfs_reduce_alloc_profile(fs_info, flags);
40672493 }
40682494
40692495 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
....@@ -4079,2091 +2505,13 @@
40792505 else
40802506 flags = BTRFS_BLOCK_GROUP_METADATA;
40812507
4082
- ret = get_alloc_profile(fs_info, flags);
2508
+ ret = btrfs_get_alloc_profile(fs_info, flags);
40832509 return ret;
4084
-}
4085
-
4086
-u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4087
-{
4088
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4089
-}
4090
-
4091
-u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4092
-{
4093
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4094
-}
4095
-
4096
-u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4097
-{
4098
- return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4099
-}
4100
-
4101
-static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4102
- bool may_use_included)
4103
-{
4104
- ASSERT(s_info);
4105
- return s_info->bytes_used + s_info->bytes_reserved +
4106
- s_info->bytes_pinned + s_info->bytes_readonly +
4107
- (may_use_included ? s_info->bytes_may_use : 0);
4108
-}
4109
-
4110
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
4111
-{
4112
- struct btrfs_root *root = inode->root;
4113
- struct btrfs_fs_info *fs_info = root->fs_info;
4114
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
4115
- u64 used;
4116
- int ret = 0;
4117
- int need_commit = 2;
4118
- int have_pinned_space;
4119
-
4120
- /* make sure bytes are sectorsize aligned */
4121
- bytes = ALIGN(bytes, fs_info->sectorsize);
4122
-
4123
- if (btrfs_is_free_space_inode(inode)) {
4124
- need_commit = 0;
4125
- ASSERT(current->journal_info);
4126
- }
4127
-
4128
-again:
4129
- /* make sure we have enough space to handle the data first */
4130
- spin_lock(&data_sinfo->lock);
4131
- used = btrfs_space_info_used(data_sinfo, true);
4132
-
4133
- if (used + bytes > data_sinfo->total_bytes) {
4134
- struct btrfs_trans_handle *trans;
4135
-
4136
- /*
4137
- * if we don't have enough free bytes in this space then we need
4138
- * to alloc a new chunk.
4139
- */
4140
- if (!data_sinfo->full) {
4141
- u64 alloc_target;
4142
-
4143
- data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4144
- spin_unlock(&data_sinfo->lock);
4145
-
4146
- alloc_target = btrfs_data_alloc_profile(fs_info);
4147
- /*
4148
- * It is ugly that we don't call nolock join
4149
- * transaction for the free space inode case here.
4150
- * But it is safe because we only do the data space
4151
- * reservation for the free space cache in the
4152
- * transaction context, the common join transaction
4153
- * just increase the counter of the current transaction
4154
- * handler, doesn't try to acquire the trans_lock of
4155
- * the fs.
4156
- */
4157
- trans = btrfs_join_transaction(root);
4158
- if (IS_ERR(trans))
4159
- return PTR_ERR(trans);
4160
-
4161
- ret = do_chunk_alloc(trans, alloc_target,
4162
- CHUNK_ALLOC_NO_FORCE);
4163
- btrfs_end_transaction(trans);
4164
- if (ret < 0) {
4165
- if (ret != -ENOSPC)
4166
- return ret;
4167
- else {
4168
- have_pinned_space = 1;
4169
- goto commit_trans;
4170
- }
4171
- }
4172
-
4173
- goto again;
4174
- }
4175
-
4176
- /*
4177
- * If we don't have enough pinned space to deal with this
4178
- * allocation, and no removed chunk in current transaction,
4179
- * don't bother committing the transaction.
4180
- */
4181
- have_pinned_space = __percpu_counter_compare(
4182
- &data_sinfo->total_bytes_pinned,
4183
- used + bytes - data_sinfo->total_bytes,
4184
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
4185
- spin_unlock(&data_sinfo->lock);
4186
-
4187
- /* commit the current transaction and try again */
4188
-commit_trans:
4189
- if (need_commit) {
4190
- need_commit--;
4191
-
4192
- if (need_commit > 0) {
4193
- btrfs_start_delalloc_roots(fs_info, -1);
4194
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4195
- (u64)-1);
4196
- }
4197
-
4198
- trans = btrfs_join_transaction(root);
4199
- if (IS_ERR(trans))
4200
- return PTR_ERR(trans);
4201
- if (have_pinned_space >= 0 ||
4202
- test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4203
- &trans->transaction->flags) ||
4204
- need_commit > 0) {
4205
- ret = btrfs_commit_transaction(trans);
4206
- if (ret)
4207
- return ret;
4208
- /*
4209
- * The cleaner kthread might still be doing iput
4210
- * operations. Wait for it to finish so that
4211
- * more space is released.
4212
- */
4213
- mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
4214
- mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
4215
- goto again;
4216
- } else {
4217
- btrfs_end_transaction(trans);
4218
- }
4219
- }
4220
-
4221
- trace_btrfs_space_reservation(fs_info,
4222
- "space_info:enospc",
4223
- data_sinfo->flags, bytes, 1);
4224
- return -ENOSPC;
4225
- }
4226
- data_sinfo->bytes_may_use += bytes;
4227
- trace_btrfs_space_reservation(fs_info, "space_info",
4228
- data_sinfo->flags, bytes, 1);
4229
- spin_unlock(&data_sinfo->lock);
4230
-
4231
- return 0;
4232
-}
4233
-
4234
-int btrfs_check_data_free_space(struct inode *inode,
4235
- struct extent_changeset **reserved, u64 start, u64 len)
4236
-{
4237
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4238
- int ret;
4239
-
4240
- /* align the range */
4241
- len = round_up(start + len, fs_info->sectorsize) -
4242
- round_down(start, fs_info->sectorsize);
4243
- start = round_down(start, fs_info->sectorsize);
4244
-
4245
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4246
- if (ret < 0)
4247
- return ret;
4248
-
4249
- /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4250
- ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4251
- if (ret < 0)
4252
- btrfs_free_reserved_data_space_noquota(inode, start, len);
4253
- else
4254
- ret = 0;
4255
- return ret;
4256
-}
4257
-
4258
-/*
4259
- * Called if we need to clear a data reservation for this inode
4260
- * Normally in a error case.
4261
- *
4262
- * This one will *NOT* use accurate qgroup reserved space API, just for case
4263
- * which we can't sleep and is sure it won't affect qgroup reserved space.
4264
- * Like clear_bit_hook().
4265
- */
4266
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4267
- u64 len)
4268
-{
4269
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4270
- struct btrfs_space_info *data_sinfo;
4271
-
4272
- /* Make sure the range is aligned to sectorsize */
4273
- len = round_up(start + len, fs_info->sectorsize) -
4274
- round_down(start, fs_info->sectorsize);
4275
- start = round_down(start, fs_info->sectorsize);
4276
-
4277
- data_sinfo = fs_info->data_sinfo;
4278
- spin_lock(&data_sinfo->lock);
4279
- if (WARN_ON(data_sinfo->bytes_may_use < len))
4280
- data_sinfo->bytes_may_use = 0;
4281
- else
4282
- data_sinfo->bytes_may_use -= len;
4283
- trace_btrfs_space_reservation(fs_info, "space_info",
4284
- data_sinfo->flags, len, 0);
4285
- spin_unlock(&data_sinfo->lock);
4286
-}
4287
-
4288
-/*
4289
- * Called if we need to clear a data reservation for this inode
4290
- * Normally in a error case.
4291
- *
4292
- * This one will handle the per-inode data rsv map for accurate reserved
4293
- * space framework.
4294
- */
4295
-void btrfs_free_reserved_data_space(struct inode *inode,
4296
- struct extent_changeset *reserved, u64 start, u64 len)
4297
-{
4298
- struct btrfs_root *root = BTRFS_I(inode)->root;
4299
-
4300
- /* Make sure the range is aligned to sectorsize */
4301
- len = round_up(start + len, root->fs_info->sectorsize) -
4302
- round_down(start, root->fs_info->sectorsize);
4303
- start = round_down(start, root->fs_info->sectorsize);
4304
-
4305
- btrfs_free_reserved_data_space_noquota(inode, start, len);
4306
- btrfs_qgroup_free_data(inode, reserved, start, len);
4307
-}
4308
-
4309
-static void force_metadata_allocation(struct btrfs_fs_info *info)
4310
-{
4311
- struct list_head *head = &info->space_info;
4312
- struct btrfs_space_info *found;
4313
-
4314
- rcu_read_lock();
4315
- list_for_each_entry_rcu(found, head, list) {
4316
- if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4317
- found->force_alloc = CHUNK_ALLOC_FORCE;
4318
- }
4319
- rcu_read_unlock();
4320
-}
4321
-
4322
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4323
-{
4324
- return (global->size << 1);
4325
-}
4326
-
4327
-static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4328
- struct btrfs_space_info *sinfo, int force)
4329
-{
4330
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4331
- u64 bytes_used = btrfs_space_info_used(sinfo, false);
4332
- u64 thresh;
4333
-
4334
- if (force == CHUNK_ALLOC_FORCE)
4335
- return 1;
4336
-
4337
- /*
4338
- * We need to take into account the global rsv because for all intents
4339
- * and purposes it's used space. Don't worry about locking the
4340
- * global_rsv, it doesn't change except when the transaction commits.
4341
- */
4342
- if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4343
- bytes_used += calc_global_rsv_need_space(global_rsv);
4344
-
4345
- /*
4346
- * in limited mode, we want to have some free space up to
4347
- * about 1% of the FS size.
4348
- */
4349
- if (force == CHUNK_ALLOC_LIMITED) {
4350
- thresh = btrfs_super_total_bytes(fs_info->super_copy);
4351
- thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4352
-
4353
- if (sinfo->total_bytes - bytes_used < thresh)
4354
- return 1;
4355
- }
4356
-
4357
- if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4358
- return 0;
4359
- return 1;
4360
-}
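Concrete numbers for the thresholds above, assuming a 1TiB filesystem:

	/*
	 * CHUNK_ALLOC_LIMITED: thresh = max(SZ_64M, 1% of 1TiB) ~= 10.2GiB,
	 *   so a chunk is allocated while this space_info still covers less
	 *   than ~10.2GiB of free space (total_bytes - bytes_used < thresh).
	 *
	 * CHUNK_ALLOC_NO_FORCE: allocation only proceeds once
	 *   bytes_used + 2MiB >= 80% of total_bytes (div_factor(total, 8)).
	 */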
4361
-
4362
-static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4363
-{
4364
- u64 num_dev;
4365
-
4366
- if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4367
- BTRFS_BLOCK_GROUP_RAID0 |
4368
- BTRFS_BLOCK_GROUP_RAID5 |
4369
- BTRFS_BLOCK_GROUP_RAID6))
4370
- num_dev = fs_info->fs_devices->rw_devices;
4371
- else if (type & BTRFS_BLOCK_GROUP_RAID1)
4372
- num_dev = 2;
4373
- else
4374
- num_dev = 1; /* DUP or single */
4375
-
4376
- return num_dev;
4377
-}
4378
-
4379
-/*
4380
- * If @is_allocation is true, reserve space in the system space info necessary
4381
- * for allocating a chunk, otherwise if it's false, reserve space necessary for
4382
- * removing a chunk.
4383
- */
4384
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4385
-{
4386
- struct btrfs_fs_info *fs_info = trans->fs_info;
4387
- struct btrfs_space_info *info;
4388
- u64 left;
4389
- u64 thresh;
4390
- int ret = 0;
4391
- u64 num_devs;
4392
-
4393
- /*
4394
- * Needed because we can end up allocating a system chunk and for an
4395
- * atomic and race free space reservation in the chunk block reserve.
4396
- */
4397
- lockdep_assert_held(&fs_info->chunk_mutex);
4398
-
4399
- info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4400
- spin_lock(&info->lock);
4401
- left = info->total_bytes - btrfs_space_info_used(info, true);
4402
- spin_unlock(&info->lock);
4403
-
4404
- num_devs = get_profile_num_devs(fs_info, type);
4405
-
4406
- /* num_devs device items to update and 1 chunk item to add or remove */
4407
- thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4408
- btrfs_calc_trans_metadata_size(fs_info, 1);
4409
-
4410
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4411
- btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4412
- left, thresh, type);
4413
- dump_space_info(fs_info, info, 0, 0);
4414
- }
4415
-
4416
- if (left < thresh) {
4417
- u64 flags = btrfs_system_alloc_profile(fs_info);
4418
-
4419
- /*
4420
- * Ignore failure to create system chunk. We might end up not
4421
- * needing it, as we might not need to COW all nodes/leafs from
4422
- * the paths we visit in the chunk tree (they were already COWed
4423
- * or created in the current transaction for example).
4424
- */
4425
- ret = btrfs_alloc_chunk(trans, flags);
4426
- }
4427
-
4428
- if (!ret) {
4429
- ret = btrfs_block_rsv_add(fs_info->chunk_root,
4430
- &fs_info->chunk_block_rsv,
4431
- thresh, BTRFS_RESERVE_NO_FLUSH);
4432
- if (!ret)
4433
- trans->chunk_bytes_reserved += thresh;
4434
- }
4435
-}
4436
-
4437
-/*
4438
- * If force is CHUNK_ALLOC_FORCE:
4439
- * - return 1 if it successfully allocates a chunk,
4440
- * - return errors including -ENOSPC otherwise.
4441
- * If force is NOT CHUNK_ALLOC_FORCE:
4442
- * - return 0 if it doesn't need to allocate a new chunk,
4443
- * - return 1 if it successfully allocates a chunk,
4444
- * - return errors including -ENOSPC otherwise.
4445
- */
4446
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4447
- int force)
4448
-{
4449
- struct btrfs_fs_info *fs_info = trans->fs_info;
4450
- struct btrfs_space_info *space_info;
4451
- bool wait_for_alloc = false;
4452
- bool should_alloc = false;
4453
- int ret = 0;
4454
-
4455
- /* Don't re-enter if we're already allocating a chunk */
4456
- if (trans->allocating_chunk)
4457
- return -ENOSPC;
4458
-
4459
- space_info = __find_space_info(fs_info, flags);
4460
- ASSERT(space_info);
4461
-
4462
- do {
4463
- spin_lock(&space_info->lock);
4464
- if (force < space_info->force_alloc)
4465
- force = space_info->force_alloc;
4466
- should_alloc = should_alloc_chunk(fs_info, space_info, force);
4467
- if (space_info->full) {
4468
- /* No more free physical space */
4469
- if (should_alloc)
4470
- ret = -ENOSPC;
4471
- else
4472
- ret = 0;
4473
- spin_unlock(&space_info->lock);
4474
- return ret;
4475
- } else if (!should_alloc) {
4476
- spin_unlock(&space_info->lock);
4477
- return 0;
4478
- } else if (space_info->chunk_alloc) {
4479
- /*
4480
- * Someone is already allocating, so we need to block
4481
- * until this someone is finished and then loop to
4482
- * recheck if we should continue with our allocation
4483
- * attempt.
4484
- */
4485
- wait_for_alloc = true;
4486
- spin_unlock(&space_info->lock);
4487
- mutex_lock(&fs_info->chunk_mutex);
4488
- mutex_unlock(&fs_info->chunk_mutex);
4489
- } else {
4490
- /* Proceed with allocation */
4491
- space_info->chunk_alloc = 1;
4492
- wait_for_alloc = false;
4493
- spin_unlock(&space_info->lock);
4494
- }
4495
-
4496
- cond_resched();
4497
- } while (wait_for_alloc);
4498
-
4499
- mutex_lock(&fs_info->chunk_mutex);
4500
- trans->allocating_chunk = true;
4501
-
4502
- /*
4503
- * If we have mixed data/metadata chunks we want to make sure we keep
4504
- * allocating mixed chunks instead of individual chunks.
4505
- */
4506
- if (btrfs_mixed_space_info(space_info))
4507
- flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4508
-
4509
- /*
4510
- * if we're doing a data chunk, go ahead and make sure that
4511
- * we keep a reasonable number of metadata chunks allocated in the
4512
- * FS as well.
4513
- */
4514
- if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4515
- fs_info->data_chunk_allocations++;
4516
- if (!(fs_info->data_chunk_allocations %
4517
- fs_info->metadata_ratio))
4518
- force_metadata_allocation(fs_info);
4519
- }
4520
-
4521
- /*
4522
- * Check if we have enough space in SYSTEM chunk because we may need
4523
- * to update devices.
4524
- */
4525
- check_system_chunk(trans, flags);
4526
-
4527
- ret = btrfs_alloc_chunk(trans, flags);
4528
- trans->allocating_chunk = false;
4529
-
4530
- spin_lock(&space_info->lock);
4531
- if (ret < 0) {
4532
- if (ret == -ENOSPC)
4533
- space_info->full = 1;
4534
- else
4535
- goto out;
4536
- } else {
4537
- ret = 1;
4538
- space_info->max_extent_size = 0;
4539
- }
4540
-
4541
- space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4542
-out:
4543
- space_info->chunk_alloc = 0;
4544
- spin_unlock(&space_info->lock);
4545
- mutex_unlock(&fs_info->chunk_mutex);
4546
- /*
4547
- * When we allocate a new chunk we reserve space in the chunk block
4548
- * reserve to make sure we can COW nodes/leafs in the chunk tree or
4549
- * add new nodes/leafs to it if we end up needing to do it when
4550
- * inserting the chunk item and updating device items as part of the
4551
- * second phase of chunk allocation, performed by
4552
- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4553
- * large number of new block groups to create in our transaction
4554
- * handle's new_bgs list to avoid exhausting the chunk block reserve
4555
- * in extreme cases - like having a single transaction create many new
4556
- * block groups when starting to write out the free space caches of all
4557
- * the block groups that were made dirty during the lifetime of the
4558
- * transaction.
4559
- */
4560
- if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4561
- btrfs_create_pending_block_groups(trans);
4562
-
4563
- return ret;
4564
-}
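The return convention documented above is what the callers in this file key off. A condensed sketch of the pattern used by the data reservation path earlier in this diff (error handling trimmed, labels as in that path):

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = do_chunk_alloc(trans, btrfs_data_alloc_profile(fs_info),
			     CHUNK_ALLOC_NO_FORCE);
	btrfs_end_transaction(trans);

	if (ret >= 0)
		goto again;		/* chunk allocated (1) or not needed (0): retry */
	else if (ret == -ENOSPC)
		goto commit_trans;	/* no room for a new chunk: reclaim pinned space */
	else
		return ret;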
4565
-
4566
-static int can_overcommit(struct btrfs_fs_info *fs_info,
4567
- struct btrfs_space_info *space_info, u64 bytes,
4568
- enum btrfs_reserve_flush_enum flush,
4569
- bool system_chunk)
4570
-{
4571
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4572
- u64 profile;
4573
- u64 space_size;
4574
- u64 avail;
4575
- u64 used;
4576
- int factor;
4577
-
4578
- /* Don't overcommit when in mixed mode. */
4579
- if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4580
- return 0;
4581
-
4582
- if (system_chunk)
4583
- profile = btrfs_system_alloc_profile(fs_info);
4584
- else
4585
- profile = btrfs_metadata_alloc_profile(fs_info);
4586
-
4587
- used = btrfs_space_info_used(space_info, false);
4588
-
4589
- /*
4590
- * We only want to allow over committing if we have lots of actual space
4591
- * free, but if we don't have enough space to handle the global reserve
4592
- * space then we could end up having a real enospc problem when trying
4593
- * to allocate a chunk or some other such important allocation.
4594
- */
4595
- spin_lock(&global_rsv->lock);
4596
- space_size = calc_global_rsv_need_space(global_rsv);
4597
- spin_unlock(&global_rsv->lock);
4598
- if (used + space_size >= space_info->total_bytes)
4599
- return 0;
4600
-
4601
- used += space_info->bytes_may_use;
4602
-
4603
- avail = atomic64_read(&fs_info->free_chunk_space);
4604
-
4605
- /*
4606
- * If we have dup, raid1 or raid10 then only half of the free
4607
- * space is actually useable. For raid56, the space info used
4608
- * doesn't include the parity drive, so we don't have to
4609
- * change the math
4610
- */
4611
- factor = btrfs_bg_type_to_factor(profile);
4612
- avail = div_u64(avail, factor);
4613
-
4614
- /*
4615
- * If we aren't flushing all things, let us overcommit up to
4616
- * 1/2th of the space. If we can flush, don't let us overcommit
4617
- * too much, let it overcommit up to 1/8 of the space.
4618
- */
4619
- if (flush == BTRFS_RESERVE_FLUSH_ALL)
4620
- avail >>= 3;
4621
- else
4622
- avail >>= 1;
4623
-
4624
- if (used + bytes < space_info->total_bytes + avail)
4625
- return 1;
4626
- return 0;
4627
-}
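To make the overcommit math above concrete (the numbers are illustrative):

	/*
	 * metadata profile RAID1 -> factor 2, free_chunk_space = 8GiB:
	 *   avail = 8GiB / 2 = 4GiB
	 *   BTRFS_RESERVE_FLUSH_ALL:  avail >>= 3  -> 512MiB
	 *   other flush modes:        avail >>= 1  -> 2GiB
	 * The reservation is allowed while
	 *   used + bytes < space_info->total_bytes + avail.
	 */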
4628
-
4629
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4630
- unsigned long nr_pages, int nr_items)
4631
-{
4632
- struct super_block *sb = fs_info->sb;
4633
-
4634
- if (down_read_trylock(&sb->s_umount)) {
4635
- writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4636
- up_read(&sb->s_umount);
4637
- } else {
4638
- /*
4639
- * We needn't worry the filesystem going from r/w to r/o though
4640
- * we don't acquire ->s_umount mutex, because the filesystem
4641
- * should guarantee the delalloc inodes list be empty after
4642
- * the filesystem is readonly(all dirty pages are written to
4643
- * the disk).
4644
- */
4645
- btrfs_start_delalloc_roots(fs_info, nr_items);
4646
- if (!current->journal_info)
4647
- btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4648
- }
4649
-}
4650
-
4651
-static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4652
- u64 to_reclaim)
4653
-{
4654
- u64 bytes;
4655
- u64 nr;
4656
-
4657
- bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4658
- nr = div64_u64(to_reclaim, bytes);
4659
- if (!nr)
4660
- nr = 1;
4661
- return nr;
4662
-}
4663
-
4664
-#define EXTENT_SIZE_PER_ITEM SZ_256K
4665
-
4666
-/*
4667
- * shrink metadata reservation for delalloc
4668
- */
4669
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4670
- u64 orig, bool wait_ordered)
4671
-{
4672
- struct btrfs_space_info *space_info;
4673
- struct btrfs_trans_handle *trans;
4674
- u64 delalloc_bytes;
4675
- u64 max_reclaim;
4676
- u64 items;
4677
- long time_left;
4678
- unsigned long nr_pages;
4679
- int loops;
4680
-
4681
- /* Calc the number of the pages we need flush for space reservation */
4682
- items = calc_reclaim_items_nr(fs_info, to_reclaim);
4683
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4684
-
4685
- trans = (struct btrfs_trans_handle *)current->journal_info;
4686
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4687
-
4688
- delalloc_bytes = percpu_counter_sum_positive(
4689
- &fs_info->delalloc_bytes);
4690
- if (delalloc_bytes == 0) {
4691
- if (trans)
4692
- return;
4693
- if (wait_ordered)
4694
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4695
- return;
4696
- }
4697
-
4698
- loops = 0;
4699
- while (delalloc_bytes && loops < 3) {
4700
- max_reclaim = min(delalloc_bytes, to_reclaim);
4701
- nr_pages = max_reclaim >> PAGE_SHIFT;
4702
- btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4703
- /*
4704
- * We need to wait for the async pages to actually start before
4705
- * we do anything.
4706
- */
4707
- max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
4708
- if (!max_reclaim)
4709
- goto skip_async;
4710
-
4711
- if (max_reclaim <= nr_pages)
4712
- max_reclaim = 0;
4713
- else
4714
- max_reclaim -= nr_pages;
4715
-
4716
- wait_event(fs_info->async_submit_wait,
4717
- atomic_read(&fs_info->async_delalloc_pages) <=
4718
- (int)max_reclaim);
4719
-skip_async:
4720
- spin_lock(&space_info->lock);
4721
- if (list_empty(&space_info->tickets) &&
4722
- list_empty(&space_info->priority_tickets)) {
4723
- spin_unlock(&space_info->lock);
4724
- break;
4725
- }
4726
- spin_unlock(&space_info->lock);
4727
-
4728
- loops++;
4729
- if (wait_ordered && !trans) {
4730
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4731
- } else {
4732
- time_left = schedule_timeout_killable(1);
4733
- if (time_left)
4734
- break;
4735
- }
4736
- delalloc_bytes = percpu_counter_sum_positive(
4737
- &fs_info->delalloc_bytes);
4738
- }
4739
-}
4740
-
4741
-struct reserve_ticket {
4742
- u64 bytes;
4743
- int error;
4744
- struct list_head list;
4745
- wait_queue_head_t wait;
4746
-};
4747
-
4748
-/**
4749
- * maybe_commit_transaction - possibly commit the transaction if its ok to
4750
- * @root - the root we're allocating for
4751
- * @bytes - the number of bytes we want to reserve
4752
- * @force - force the commit
4753
- *
4754
- * This will check to make sure that committing the transaction will actually
4755
- * get us somewhere and then commit the transaction if it does. Otherwise it
4756
- * will return -ENOSPC.
4757
- */
4758
-static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4759
- struct btrfs_space_info *space_info)
4760
-{
4761
- struct reserve_ticket *ticket = NULL;
4762
- struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4763
- struct btrfs_trans_handle *trans;
4764
- u64 bytes;
4765
-
4766
- trans = (struct btrfs_trans_handle *)current->journal_info;
4767
- if (trans)
4768
- return -EAGAIN;
4769
-
4770
- spin_lock(&space_info->lock);
4771
- if (!list_empty(&space_info->priority_tickets))
4772
- ticket = list_first_entry(&space_info->priority_tickets,
4773
- struct reserve_ticket, list);
4774
- else if (!list_empty(&space_info->tickets))
4775
- ticket = list_first_entry(&space_info->tickets,
4776
- struct reserve_ticket, list);
4777
- bytes = (ticket) ? ticket->bytes : 0;
4778
- spin_unlock(&space_info->lock);
4779
-
4780
- if (!bytes)
4781
- return 0;
4782
-
4783
- /* See if there is enough pinned space to make this reservation */
4784
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4785
- bytes,
4786
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4787
- goto commit;
4788
-
4789
- /*
4790
- * See if there is some space in the delayed insertion reservation for
4791
- * this reservation.
4792
- */
4793
- if (space_info != delayed_rsv->space_info)
4794
- return -ENOSPC;
4795
-
4796
- spin_lock(&delayed_rsv->lock);
4797
- if (delayed_rsv->size > bytes)
4798
- bytes = 0;
4799
- else
4800
- bytes -= delayed_rsv->size;
4801
- spin_unlock(&delayed_rsv->lock);
4802
-
4803
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4804
- bytes,
4805
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
4806
- return -ENOSPC;
4807
- }
4808
-
4809
-commit:
4810
- trans = btrfs_join_transaction(fs_info->extent_root);
4811
- if (IS_ERR(trans))
4812
- return -ENOSPC;
4813
-
4814
- return btrfs_commit_transaction(trans);
4815
-}
4816
-
4817
-/*
4818
- * Try to flush some data based on policy set by @state. This is only advisory
4819
- * and may fail for various reasons. The caller is supposed to examine the
4820
- * state of @space_info to detect the outcome.
4821
- */
4822
-static void flush_space(struct btrfs_fs_info *fs_info,
4823
- struct btrfs_space_info *space_info, u64 num_bytes,
4824
- int state)
4825
-{
4826
- struct btrfs_root *root = fs_info->extent_root;
4827
- struct btrfs_trans_handle *trans;
4828
- int nr;
4829
- int ret = 0;
4830
-
4831
- switch (state) {
4832
- case FLUSH_DELAYED_ITEMS_NR:
4833
- case FLUSH_DELAYED_ITEMS:
4834
- if (state == FLUSH_DELAYED_ITEMS_NR)
4835
- nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4836
- else
4837
- nr = -1;
4838
-
4839
- trans = btrfs_join_transaction(root);
4840
- if (IS_ERR(trans)) {
4841
- ret = PTR_ERR(trans);
4842
- break;
4843
- }
4844
- ret = btrfs_run_delayed_items_nr(trans, nr);
4845
- btrfs_end_transaction(trans);
4846
- break;
4847
- case FLUSH_DELALLOC:
4848
- case FLUSH_DELALLOC_WAIT:
4849
- shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4850
- state == FLUSH_DELALLOC_WAIT);
4851
- break;
4852
- case ALLOC_CHUNK:
4853
- trans = btrfs_join_transaction(root);
4854
- if (IS_ERR(trans)) {
4855
- ret = PTR_ERR(trans);
4856
- break;
4857
- }
4858
- ret = do_chunk_alloc(trans,
4859
- btrfs_metadata_alloc_profile(fs_info),
4860
- CHUNK_ALLOC_NO_FORCE);
4861
- btrfs_end_transaction(trans);
4862
- if (ret > 0 || ret == -ENOSPC)
4863
- ret = 0;
4864
- break;
4865
- case COMMIT_TRANS:
4866
- ret = may_commit_transaction(fs_info, space_info);
4867
- break;
4868
- default:
4869
- ret = -ENOSPC;
4870
- break;
4871
- }
4872
-
4873
- trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
4874
- ret);
4875
- return;
4876
-}
4877
-
4878
-static inline u64
4879
-btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4880
- struct btrfs_space_info *space_info,
4881
- bool system_chunk)
4882
-{
4883
- struct reserve_ticket *ticket;
4884
- u64 used;
4885
- u64 expected;
4886
- u64 to_reclaim = 0;
4887
-
4888
- list_for_each_entry(ticket, &space_info->tickets, list)
4889
- to_reclaim += ticket->bytes;
4890
- list_for_each_entry(ticket, &space_info->priority_tickets, list)
4891
- to_reclaim += ticket->bytes;
4892
- if (to_reclaim)
4893
- return to_reclaim;
4894
-
4895
- to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4896
- if (can_overcommit(fs_info, space_info, to_reclaim,
4897
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4898
- return 0;
4899
-
4900
- used = btrfs_space_info_used(space_info, true);
4901
-
4902
- if (can_overcommit(fs_info, space_info, SZ_1M,
4903
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4904
- expected = div_factor_fine(space_info->total_bytes, 95);
4905
- else
4906
- expected = div_factor_fine(space_info->total_bytes, 90);
4907
-
4908
- if (used > expected)
4909
- to_reclaim = used - expected;
4910
- else
4911
- to_reclaim = 0;
4912
- to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4913
- space_info->bytes_reserved);
4914
- return to_reclaim;
4915
-}
4916
-
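
With no tickets queued, the removed btrfs_calc_reclaim_metadata_size() falls back to reclaiming whatever usage exceeds roughly 90-95% of the space_info, clamped to what can actually be reclaimed (bytes_may_use + bytes_reserved). A stand-alone sketch of that arithmetic; div_factor_fine() mirrors the kernel helper (num * factor / 100) and the numbers in main() are invented:

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as the kernel's div_factor_fine(): num * factor / 100. */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
        return num * factor / 100;
}

static uint64_t calc_reclaim_target(uint64_t total, uint64_t used,
                                    uint64_t may_use, uint64_t reserved,
                                    int can_still_overcommit)
{
        uint64_t expected = div_factor_fine(total,
                                            can_still_overcommit ? 95 : 90);
        uint64_t to_reclaim = used > expected ? used - expected : 0;

        /* only bytes_may_use and bytes_reserved can actually be reclaimed */
        if (to_reclaim > may_use + reserved)
                to_reclaim = may_use + reserved;
        return to_reclaim;
}

int main(void)
{
        uint64_t total = 10ULL << 30;           /* 10 GiB of metadata space */
        uint64_t used = 9800ULL << 20;          /* roughly 9.6 GiB in use */

        printf("%llu bytes to reclaim\n", (unsigned long long)
               calc_reclaim_target(total, used, 1ULL << 30, 512ULL << 20, 1));
        return 0;
}
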
4917
-static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
4918
- struct btrfs_space_info *space_info,
4919
- u64 used, bool system_chunk)
4920
-{
4921
- u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4922
-
4923
- /* If we're just plain full then async reclaim just slows us down. */
4924
- if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4925
- return 0;
4926
-
4927
- if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4928
- system_chunk))
4929
- return 0;
4930
-
4931
- return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4932
- !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4933
-}
4934
-
4935
-static void wake_all_tickets(struct list_head *head)
4936
-{
4937
- struct reserve_ticket *ticket;
4938
-
4939
- while (!list_empty(head)) {
4940
- ticket = list_first_entry(head, struct reserve_ticket, list);
4941
- list_del_init(&ticket->list);
4942
- ticket->error = -ENOSPC;
4943
- wake_up(&ticket->wait);
4944
- }
4945
-}
4946
-
4947
-/*
4948
- * This is for normal flushers, we can wait all goddamned day if we want to. We
4949
- * will loop and continuously try to flush as long as we are making progress.
4950
- * We count progress as clearing off tickets each time we have to loop.
4951
- */
4952
-static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4953
-{
4954
- struct btrfs_fs_info *fs_info;
4955
- struct btrfs_space_info *space_info;
4956
- u64 to_reclaim;
4957
- int flush_state;
4958
- int commit_cycles = 0;
4959
- u64 last_tickets_id;
4960
-
4961
- fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4962
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4963
-
4964
- spin_lock(&space_info->lock);
4965
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4966
- false);
4967
- if (!to_reclaim) {
4968
- space_info->flush = 0;
4969
- spin_unlock(&space_info->lock);
4970
- return;
4971
- }
4972
- last_tickets_id = space_info->tickets_id;
4973
- spin_unlock(&space_info->lock);
4974
-
4975
- flush_state = FLUSH_DELAYED_ITEMS_NR;
4976
- do {
4977
- flush_space(fs_info, space_info, to_reclaim, flush_state);
4978
- spin_lock(&space_info->lock);
4979
- if (list_empty(&space_info->tickets)) {
4980
- space_info->flush = 0;
4981
- spin_unlock(&space_info->lock);
4982
- return;
4983
- }
4984
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
4985
- space_info,
4986
- false);
4987
- if (last_tickets_id == space_info->tickets_id) {
4988
- flush_state++;
4989
- } else {
4990
- last_tickets_id = space_info->tickets_id;
4991
- flush_state = FLUSH_DELAYED_ITEMS_NR;
4992
- if (commit_cycles)
4993
- commit_cycles--;
4994
- }
4995
-
4996
- if (flush_state > COMMIT_TRANS) {
4997
- commit_cycles++;
4998
- if (commit_cycles > 2) {
4999
- wake_all_tickets(&space_info->tickets);
5000
- space_info->flush = 0;
5001
- } else {
5002
- flush_state = FLUSH_DELAYED_ITEMS_NR;
5003
- }
5004
- }
5005
- spin_unlock(&space_info->lock);
5006
- } while (flush_state <= COMMIT_TRANS);
5007
-}
5008
-
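
The async reclaim worker above climbs a fixed ladder of flush states, restarting from the cheapest state whenever a ticket is satisfied, and gives up (failing the remaining tickets with -ENOSPC) once it has gone past COMMIT_TRANS three times without progress. A toy model of just that state progression; the enum lists the states handled by the removed flush_space(), and no real flushing happens here:

#include <stdio.h>

enum flush_state {
        FLUSH_DELAYED_ITEMS_NR = 1,
        FLUSH_DELAYED_ITEMS,
        FLUSH_DELALLOC,
        FLUSH_DELALLOC_WAIT,
        ALLOC_CHUNK,
        COMMIT_TRANS,
};

int main(void)
{
        int state = FLUSH_DELAYED_ITEMS_NR;
        int commit_cycles = 0;

        /* Pretend no flush ever satisfies a ticket: climb the ladder until
         * the commit_cycles > 2 cutoff gives up, as the worker above does. */
        do {
                printf("flushing with state %d (cycle %d)\n",
                       state, commit_cycles);
                state++;
                if (state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
                                printf("giving up, waking tickets with ENOSPC\n");
                                break;
                        }
                        state = FLUSH_DELAYED_ITEMS_NR;
                }
        } while (state <= COMMIT_TRANS);
        return 0;
}
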
5009
-void btrfs_init_async_reclaim_work(struct work_struct *work)
5010
-{
5011
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5012
-}
5013
-
5014
-static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5015
- struct btrfs_space_info *space_info,
5016
- struct reserve_ticket *ticket)
5017
-{
5018
- u64 to_reclaim;
5019
- int flush_state = FLUSH_DELAYED_ITEMS_NR;
5020
-
5021
- spin_lock(&space_info->lock);
5022
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5023
- false);
5024
- if (!to_reclaim) {
5025
- spin_unlock(&space_info->lock);
5026
- return;
5027
- }
5028
- spin_unlock(&space_info->lock);
5029
-
5030
- do {
5031
- flush_space(fs_info, space_info, to_reclaim, flush_state);
5032
- flush_state++;
5033
- spin_lock(&space_info->lock);
5034
- if (ticket->bytes == 0) {
5035
- spin_unlock(&space_info->lock);
5036
- return;
5037
- }
5038
- spin_unlock(&space_info->lock);
5039
-
5040
- /*
5041
- * Priority flushers can't wait on delalloc without
5042
- * deadlocking.
5043
- */
5044
- if (flush_state == FLUSH_DELALLOC ||
5045
- flush_state == FLUSH_DELALLOC_WAIT)
5046
- flush_state = ALLOC_CHUNK;
5047
- } while (flush_state < COMMIT_TRANS);
5048
-}
5049
-
5050
-static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5051
- struct btrfs_space_info *space_info,
5052
- struct reserve_ticket *ticket, u64 orig_bytes)
5053
-
5054
-{
5055
- DEFINE_WAIT(wait);
5056
- int ret = 0;
5057
-
5058
- spin_lock(&space_info->lock);
5059
- while (ticket->bytes > 0 && ticket->error == 0) {
5060
- ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5061
- if (ret) {
5062
- ret = -EINTR;
5063
- break;
5064
- }
5065
- spin_unlock(&space_info->lock);
5066
-
5067
- schedule();
5068
-
5069
- finish_wait(&ticket->wait, &wait);
5070
- spin_lock(&space_info->lock);
5071
- }
5072
- if (!ret)
5073
- ret = ticket->error;
5074
- if (!list_empty(&ticket->list))
5075
- list_del_init(&ticket->list);
5076
- if (ticket->bytes && ticket->bytes < orig_bytes) {
5077
- u64 num_bytes = orig_bytes - ticket->bytes;
5078
- space_info->bytes_may_use -= num_bytes;
5079
- trace_btrfs_space_reservation(fs_info, "space_info",
5080
- space_info->flags, num_bytes, 0);
5081
- }
5082
- spin_unlock(&space_info->lock);
5083
-
5084
- return ret;
5085
-}
5086
-
5087
-/**
5088
- * __reserve_metadata_bytes - try to reserve bytes from the space_info's space
5089
- * @root - the root we're allocating for
5090
- * @space_info - the space info we want to allocate from
5091
- * @orig_bytes - the number of bytes we want
5092
- * @flush - whether or not we can flush to make our reservation
5093
- *
5094
- * This will reserve orig_bytes number of bytes from the space info associated
5095
- * with the block_rsv. If there is not enough space it will make an attempt to
5096
- * flush out space to make room. It will do this by flushing delalloc if
5097
- * possible or committing the transaction. If flush is 0 then no attempts to
5098
- * regain reservations will be made and this will fail if there is not enough
5099
- * space already.
5100
- */
5101
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5102
- struct btrfs_space_info *space_info,
5103
- u64 orig_bytes,
5104
- enum btrfs_reserve_flush_enum flush,
5105
- bool system_chunk)
5106
-{
5107
- struct reserve_ticket ticket;
5108
- u64 used;
5109
- int ret = 0;
5110
-
5111
- ASSERT(orig_bytes);
5112
- ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5113
-
5114
- spin_lock(&space_info->lock);
5115
- ret = -ENOSPC;
5116
- used = btrfs_space_info_used(space_info, true);
5117
-
5118
- /*
5119
- * If we have enough space then hooray, make our reservation and carry
5120
- * on. If not see if we can overcommit, and if we can, hooray carry on.
5121
- * If not things get more complicated.
5122
- */
5123
- if (used + orig_bytes <= space_info->total_bytes) {
5124
- space_info->bytes_may_use += orig_bytes;
5125
- trace_btrfs_space_reservation(fs_info, "space_info",
5126
- space_info->flags, orig_bytes, 1);
5127
- ret = 0;
5128
- } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5129
- system_chunk)) {
5130
- space_info->bytes_may_use += orig_bytes;
5131
- trace_btrfs_space_reservation(fs_info, "space_info",
5132
- space_info->flags, orig_bytes, 1);
5133
- ret = 0;
5134
- }
5135
-
5136
- /*
5137
- * If we couldn't make a reservation then setup our reservation ticket
5138
- * and kick the async worker if it's not already running.
5139
- *
5140
- * If we are a priority flusher then we just need to add our ticket to
5141
- * the list and we will do our own flushing further down.
5142
- */
5143
- if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5144
- ticket.bytes = orig_bytes;
5145
- ticket.error = 0;
5146
- init_waitqueue_head(&ticket.wait);
5147
- if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5148
- list_add_tail(&ticket.list, &space_info->tickets);
5149
- if (!space_info->flush) {
5150
- space_info->flush = 1;
5151
- trace_btrfs_trigger_flush(fs_info,
5152
- space_info->flags,
5153
- orig_bytes, flush,
5154
- "enospc");
5155
- queue_work(system_unbound_wq,
5156
- &fs_info->async_reclaim_work);
5157
- }
5158
- } else {
5159
- list_add_tail(&ticket.list,
5160
- &space_info->priority_tickets);
5161
- }
5162
- } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5163
- used += orig_bytes;
5164
- /*
5165
- * We will do the space reservation dance during log replay,
5166
- * which means we won't have fs_info->fs_root set, so don't do
5167
- * the async reclaim as we will panic.
5168
- */
5169
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5170
- need_do_async_reclaim(fs_info, space_info,
5171
- used, system_chunk) &&
5172
- !work_busy(&fs_info->async_reclaim_work)) {
5173
- trace_btrfs_trigger_flush(fs_info, space_info->flags,
5174
- orig_bytes, flush, "preempt");
5175
- queue_work(system_unbound_wq,
5176
- &fs_info->async_reclaim_work);
5177
- }
5178
- }
5179
- spin_unlock(&space_info->lock);
5180
- if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5181
- return ret;
5182
-
5183
- if (flush == BTRFS_RESERVE_FLUSH_ALL)
5184
- return wait_reserve_ticket(fs_info, space_info, &ticket,
5185
- orig_bytes);
5186
-
5187
- ret = 0;
5188
- priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5189
- spin_lock(&space_info->lock);
5190
- if (ticket.bytes) {
5191
- if (ticket.bytes < orig_bytes) {
5192
- u64 num_bytes = orig_bytes - ticket.bytes;
5193
- space_info->bytes_may_use -= num_bytes;
5194
- trace_btrfs_space_reservation(fs_info, "space_info",
5195
- space_info->flags,
5196
- num_bytes, 0);
5197
-
5198
- }
5199
- list_del_init(&ticket.list);
5200
- ret = -ENOSPC;
5201
- }
5202
- spin_unlock(&space_info->lock);
5203
- ASSERT(list_empty(&ticket.list));
5204
- return ret;
5205
-}
5206
-
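
The reservation fast path above is plain accounting: if used + orig_bytes still fits in total_bytes (or overcommit allows it), bump bytes_may_use and return; otherwise queue a ticket and flush. A user-space model of that fast path with a simplified space_info struct (overcommit and ticketing are left out):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Modeled space_info accounting; a stand-in, not the kernel struct. */
struct space_info_model {
        uint64_t total_bytes;
        uint64_t bytes_used;
        uint64_t bytes_reserved;
        uint64_t bytes_pinned;
        uint64_t bytes_readonly;
        uint64_t bytes_may_use;
};

static uint64_t space_info_used(const struct space_info_model *s)
{
        return s->bytes_used + s->bytes_reserved + s->bytes_pinned +
               s->bytes_readonly + s->bytes_may_use;
}

/* Returns true if the reservation fits outright; a real caller would
 * otherwise try overcommit and then queue a reserve_ticket. */
static bool try_reserve(struct space_info_model *s, uint64_t bytes)
{
        if (space_info_used(s) + bytes <= s->total_bytes) {
                s->bytes_may_use += bytes;
                return true;
        }
        return false;
}

int main(void)
{
        struct space_info_model s = { .total_bytes = 1ULL << 30,
                                      .bytes_used = 700ULL << 20 };

        printf("reserved: %d\n", try_reserve(&s, 200ULL << 20)); /* fits */
        printf("reserved: %d\n", try_reserve(&s, 200ULL << 20)); /* must flush */
        return 0;
}
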
5207
-/**
5208
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5209
- * @root - the root we're allocating for
5210
- * @block_rsv - the block_rsv we're allocating for
5211
- * @orig_bytes - the number of bytes we want
5212
- * @flush - whether or not we can flush to make our reservation
5213
- *
5214
- * This will reserve orig_bytes number of bytes from the space info associated
5215
- * with the block_rsv. If there is not enough space it will make an attempt to
5216
- * flush out space to make room. It will do this by flushing delalloc if
5217
- * possible or committing the transaction. If flush is 0 then no attempts to
5218
- * regain reservations will be made and this will fail if there is not enough
5219
- * space already.
5220
- */
5221
-static int reserve_metadata_bytes(struct btrfs_root *root,
5222
- struct btrfs_block_rsv *block_rsv,
5223
- u64 orig_bytes,
5224
- enum btrfs_reserve_flush_enum flush)
5225
-{
5226
- struct btrfs_fs_info *fs_info = root->fs_info;
5227
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5228
- int ret;
5229
- bool system_chunk = (root == fs_info->chunk_root);
5230
-
5231
- ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5232
- orig_bytes, flush, system_chunk);
5233
- if (ret == -ENOSPC &&
5234
- unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5235
- if (block_rsv != global_rsv &&
5236
- !block_rsv_use_bytes(global_rsv, orig_bytes))
5237
- ret = 0;
5238
- }
5239
- if (ret == -ENOSPC) {
5240
- trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5241
- block_rsv->space_info->flags,
5242
- orig_bytes, 1);
5243
-
5244
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5245
- dump_space_info(fs_info, block_rsv->space_info,
5246
- orig_bytes, 0);
5247
- }
5248
- return ret;
5249
-}
5250
-
5251
-static struct btrfs_block_rsv *get_block_rsv(
5252
- const struct btrfs_trans_handle *trans,
5253
- const struct btrfs_root *root)
5254
-{
5255
- struct btrfs_fs_info *fs_info = root->fs_info;
5256
- struct btrfs_block_rsv *block_rsv = NULL;
5257
-
5258
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5259
- (root == fs_info->csum_root && trans->adding_csums) ||
5260
- (root == fs_info->uuid_root))
5261
- block_rsv = trans->block_rsv;
5262
-
5263
- if (!block_rsv)
5264
- block_rsv = root->block_rsv;
5265
-
5266
- if (!block_rsv)
5267
- block_rsv = &fs_info->empty_block_rsv;
5268
-
5269
- return block_rsv;
5270
-}
5271
-
5272
-static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5273
- u64 num_bytes)
5274
-{
5275
- int ret = -ENOSPC;
5276
- spin_lock(&block_rsv->lock);
5277
- if (block_rsv->reserved >= num_bytes) {
5278
- block_rsv->reserved -= num_bytes;
5279
- if (block_rsv->reserved < block_rsv->size)
5280
- block_rsv->full = 0;
5281
- ret = 0;
5282
- }
5283
- spin_unlock(&block_rsv->lock);
5284
- return ret;
5285
-}
5286
-
5287
-static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5288
- u64 num_bytes, int update_size)
5289
-{
5290
- spin_lock(&block_rsv->lock);
5291
- block_rsv->reserved += num_bytes;
5292
- if (update_size)
5293
- block_rsv->size += num_bytes;
5294
- else if (block_rsv->reserved >= block_rsv->size)
5295
- block_rsv->full = 1;
5296
- spin_unlock(&block_rsv->lock);
5297
-}
5298
-
5299
-int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5300
- struct btrfs_block_rsv *dest, u64 num_bytes,
5301
- int min_factor)
5302
-{
5303
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5304
- u64 min_bytes;
5305
-
5306
- if (global_rsv->space_info != dest->space_info)
5307
- return -ENOSPC;
5308
-
5309
- spin_lock(&global_rsv->lock);
5310
- min_bytes = div_factor(global_rsv->size, min_factor);
5311
- if (global_rsv->reserved < min_bytes + num_bytes) {
5312
- spin_unlock(&global_rsv->lock);
5313
- return -ENOSPC;
5314
- }
5315
- global_rsv->reserved -= num_bytes;
5316
- if (global_rsv->reserved < global_rsv->size)
5317
- global_rsv->full = 0;
5318
- spin_unlock(&global_rsv->lock);
5319
-
5320
- block_rsv_add_bytes(dest, num_bytes, 1);
5321
- return 0;
5322
-}
5323
-
5324
-/*
5325
- * This is for space we already have accounted in space_info->bytes_may_use, so
5326
- * basically when we're returning space from block_rsv's.
5327
- */
5328
-static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5329
- struct btrfs_space_info *space_info,
5330
- u64 num_bytes)
5331
-{
5332
- struct reserve_ticket *ticket;
5333
- struct list_head *head;
5334
- u64 used;
5335
- enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5336
- bool check_overcommit = false;
5337
-
5338
- spin_lock(&space_info->lock);
5339
- head = &space_info->priority_tickets;
5340
-
5341
- /*
5342
- * If we are over our limit then we need to check and see if we can
5343
- * overcommit, and if we can't then we just need to free up our space
5344
- * and not satisfy any requests.
5345
- */
5346
- used = btrfs_space_info_used(space_info, true);
5347
- if (used - num_bytes >= space_info->total_bytes)
5348
- check_overcommit = true;
5349
-again:
5350
- while (!list_empty(head) && num_bytes) {
5351
- ticket = list_first_entry(head, struct reserve_ticket,
5352
- list);
5353
- /*
5354
- * We use 0 bytes because this space is already reserved, so
5355
- * adding the ticket space would be a double count.
5356
- */
5357
- if (check_overcommit &&
5358
- !can_overcommit(fs_info, space_info, 0, flush, false))
5359
- break;
5360
- if (num_bytes >= ticket->bytes) {
5361
- list_del_init(&ticket->list);
5362
- num_bytes -= ticket->bytes;
5363
- ticket->bytes = 0;
5364
- space_info->tickets_id++;
5365
- wake_up(&ticket->wait);
5366
- } else {
5367
- ticket->bytes -= num_bytes;
5368
- num_bytes = 0;
5369
- }
5370
- }
5371
-
5372
- if (num_bytes && head == &space_info->priority_tickets) {
5373
- head = &space_info->tickets;
5374
- flush = BTRFS_RESERVE_FLUSH_ALL;
5375
- goto again;
5376
- }
5377
- space_info->bytes_may_use -= num_bytes;
5378
- trace_btrfs_space_reservation(fs_info, "space_info",
5379
- space_info->flags, num_bytes, 0);
5380
- spin_unlock(&space_info->lock);
5381
-}
5382
-
5383
-/*
5384
- * This is for newly allocated space that isn't accounted in
5385
- * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
5386
- * we use this helper.
5387
- */
5388
-static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5389
- struct btrfs_space_info *space_info,
5390
- u64 num_bytes)
5391
-{
5392
- struct reserve_ticket *ticket;
5393
- struct list_head *head = &space_info->priority_tickets;
5394
-
5395
-again:
5396
- while (!list_empty(head) && num_bytes) {
5397
- ticket = list_first_entry(head, struct reserve_ticket,
5398
- list);
5399
- if (num_bytes >= ticket->bytes) {
5400
- trace_btrfs_space_reservation(fs_info, "space_info",
5401
- space_info->flags,
5402
- ticket->bytes, 1);
5403
- list_del_init(&ticket->list);
5404
- num_bytes -= ticket->bytes;
5405
- space_info->bytes_may_use += ticket->bytes;
5406
- ticket->bytes = 0;
5407
- space_info->tickets_id++;
5408
- wake_up(&ticket->wait);
5409
- } else {
5410
- trace_btrfs_space_reservation(fs_info, "space_info",
5411
- space_info->flags,
5412
- num_bytes, 1);
5413
- space_info->bytes_may_use += num_bytes;
5414
- ticket->bytes -= num_bytes;
5415
- num_bytes = 0;
5416
- }
5417
- }
5418
-
5419
- if (num_bytes && head == &space_info->priority_tickets) {
5420
- head = &space_info->tickets;
5421
- goto again;
5422
- }
5423
-}
5424
-
5425
-static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5426
- struct btrfs_block_rsv *block_rsv,
5427
- struct btrfs_block_rsv *dest, u64 num_bytes,
5428
- u64 *qgroup_to_release_ret)
5429
-{
5430
- struct btrfs_space_info *space_info = block_rsv->space_info;
5431
- u64 qgroup_to_release = 0;
5432
- u64 ret;
5433
-
5434
- spin_lock(&block_rsv->lock);
5435
- if (num_bytes == (u64)-1) {
5436
- num_bytes = block_rsv->size;
5437
- qgroup_to_release = block_rsv->qgroup_rsv_size;
5438
- }
5439
- block_rsv->size -= num_bytes;
5440
- if (block_rsv->reserved >= block_rsv->size) {
5441
- num_bytes = block_rsv->reserved - block_rsv->size;
5442
- block_rsv->reserved = block_rsv->size;
5443
- block_rsv->full = 1;
5444
- } else {
5445
- num_bytes = 0;
5446
- }
5447
- if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5448
- qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5449
- block_rsv->qgroup_rsv_size;
5450
- block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5451
- } else {
5452
- qgroup_to_release = 0;
5453
- }
5454
- spin_unlock(&block_rsv->lock);
5455
-
5456
- ret = num_bytes;
5457
- if (num_bytes > 0) {
5458
- if (dest) {
5459
- spin_lock(&dest->lock);
5460
- if (!dest->full) {
5461
- u64 bytes_to_add;
5462
-
5463
- bytes_to_add = dest->size - dest->reserved;
5464
- bytes_to_add = min(num_bytes, bytes_to_add);
5465
- dest->reserved += bytes_to_add;
5466
- if (dest->reserved >= dest->size)
5467
- dest->full = 1;
5468
- num_bytes -= bytes_to_add;
5469
- }
5470
- spin_unlock(&dest->lock);
5471
- }
5472
- if (num_bytes)
5473
- space_info_add_old_bytes(fs_info, space_info,
5474
- num_bytes);
5475
- }
5476
- if (qgroup_to_release_ret)
5477
- *qgroup_to_release_ret = qgroup_to_release;
5478
- return ret;
5479
-}
5480
-
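
block_rsv_release_bytes(), removed above, first trims the rsv's target size and then returns any reserved bytes above the new size, either to a destination rsv or back to the space_info. The size/reserved trimming is the core of it; a small sketch without the locking, qgroup and space_info parts:

#include <stdint.h>
#include <stdio.h>

/* Minimal model of a block reservation: a target size and what is reserved. */
struct block_rsv_model {
        uint64_t size;
        uint64_t reserved;
        int full;
};

/* Mirrors the size/reserved trimming of the removed block_rsv_release_bytes(). */
static uint64_t rsv_shrink(struct block_rsv_model *rsv, uint64_t num_bytes)
{
        uint64_t excess = 0;

        if (num_bytes == (uint64_t)-1)
                num_bytes = rsv->size;
        rsv->size -= num_bytes;
        if (rsv->reserved >= rsv->size) {
                excess = rsv->reserved - rsv->size;
                rsv->reserved = rsv->size;
                rsv->full = 1;
        }
        return excess;  /* bytes handed back to the global rsv / space_info */
}

int main(void)
{
        struct block_rsv_model rsv = { .size = 8 << 20, .reserved = 6 << 20 };

        printf("excess %llu\n", (unsigned long long)rsv_shrink(&rsv, 4 << 20));
        return 0;
}
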
5481
-int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5482
- struct btrfs_block_rsv *dst, u64 num_bytes,
5483
- int update_size)
5484
-{
5485
- int ret;
5486
-
5487
- ret = block_rsv_use_bytes(src, num_bytes);
5488
- if (ret)
5489
- return ret;
5490
-
5491
- block_rsv_add_bytes(dst, num_bytes, update_size);
5492
- return 0;
5493
-}
5494
-
5495
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5496
-{
5497
- memset(rsv, 0, sizeof(*rsv));
5498
- spin_lock_init(&rsv->lock);
5499
- rsv->type = type;
5500
-}
5501
-
5502
-void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5503
- struct btrfs_block_rsv *rsv,
5504
- unsigned short type)
5505
-{
5506
- btrfs_init_block_rsv(rsv, type);
5507
- rsv->space_info = __find_space_info(fs_info,
5508
- BTRFS_BLOCK_GROUP_METADATA);
5509
-}
5510
-
5511
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5512
- unsigned short type)
5513
-{
5514
- struct btrfs_block_rsv *block_rsv;
5515
-
5516
- block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5517
- if (!block_rsv)
5518
- return NULL;
5519
-
5520
- btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5521
- return block_rsv;
5522
-}
5523
-
5524
-void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5525
- struct btrfs_block_rsv *rsv)
5526
-{
5527
- if (!rsv)
5528
- return;
5529
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5530
- kfree(rsv);
5531
-}
5532
-
5533
-int btrfs_block_rsv_add(struct btrfs_root *root,
5534
- struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5535
- enum btrfs_reserve_flush_enum flush)
5536
-{
5537
- int ret;
5538
-
5539
- if (num_bytes == 0)
5540
- return 0;
5541
-
5542
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5543
- if (!ret) {
5544
- block_rsv_add_bytes(block_rsv, num_bytes, 1);
5545
- return 0;
5546
- }
5547
-
5548
- return ret;
5549
-}
5550
-
5551
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5552
-{
5553
- u64 num_bytes = 0;
5554
- int ret = -ENOSPC;
5555
-
5556
- if (!block_rsv)
5557
- return 0;
5558
-
5559
- spin_lock(&block_rsv->lock);
5560
- num_bytes = div_factor(block_rsv->size, min_factor);
5561
- if (block_rsv->reserved >= num_bytes)
5562
- ret = 0;
5563
- spin_unlock(&block_rsv->lock);
5564
-
5565
- return ret;
5566
-}
5567
-
5568
-int btrfs_block_rsv_refill(struct btrfs_root *root,
5569
- struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5570
- enum btrfs_reserve_flush_enum flush)
5571
-{
5572
- u64 num_bytes = 0;
5573
- int ret = -ENOSPC;
5574
-
5575
- if (!block_rsv)
5576
- return 0;
5577
-
5578
- spin_lock(&block_rsv->lock);
5579
- num_bytes = min_reserved;
5580
- if (block_rsv->reserved >= num_bytes)
5581
- ret = 0;
5582
- else
5583
- num_bytes -= block_rsv->reserved;
5584
- spin_unlock(&block_rsv->lock);
5585
-
5586
- if (!ret)
5587
- return 0;
5588
-
5589
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5590
- if (!ret) {
5591
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
5592
- return 0;
5593
- }
5594
-
5595
- return ret;
5596
-}
5597
-
5598
-/**
5599
- * btrfs_inode_rsv_refill - refill the inode block rsv.
5600
- * @inode - the inode we are refilling.
5601
- * @flush - the flushing restriction.
5602
- *
5603
- * Essentially the same as btrfs_block_rsv_refill, except it uses the
5604
- * block_rsv->size as the minimum size. We'll either refill the missing amount
5605
- * or return if we already have enough space. This will also handle the reserve
5606
- * tracepoint for the reserved amount.
5607
- */
5608
-static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5609
- enum btrfs_reserve_flush_enum flush)
5610
-{
5611
- struct btrfs_root *root = inode->root;
5612
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5613
- u64 num_bytes = 0;
5614
- u64 qgroup_num_bytes = 0;
5615
- int ret = -ENOSPC;
5616
-
5617
- spin_lock(&block_rsv->lock);
5618
- if (block_rsv->reserved < block_rsv->size)
5619
- num_bytes = block_rsv->size - block_rsv->reserved;
5620
- if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
5621
- qgroup_num_bytes = block_rsv->qgroup_rsv_size -
5622
- block_rsv->qgroup_rsv_reserved;
5623
- spin_unlock(&block_rsv->lock);
5624
-
5625
- if (num_bytes == 0)
5626
- return 0;
5627
-
5628
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
5629
- if (ret)
5630
- return ret;
5631
- ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5632
- if (!ret) {
5633
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
5634
- trace_btrfs_space_reservation(root->fs_info, "delalloc",
5635
- btrfs_ino(inode), num_bytes, 1);
5636
-
5637
- /* Don't forget to increase qgroup_rsv_reserved */
5638
- spin_lock(&block_rsv->lock);
5639
- block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
5640
- spin_unlock(&block_rsv->lock);
5641
- } else
5642
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5643
- return ret;
5644
-}
5645
-
5646
-/**
5647
- * btrfs_inode_rsv_release - release any excessive reservation.
5648
- * @inode - the inode we need to release from.
5649
- * @qgroup_free - free or convert qgroup meta.
5650
- * Unlike normal operation, qgroup meta reservation needs to know if we are
5651
- * freeing qgroup reservation or just converting it into per-trans. Normally
5652
- * @qgroup_free is true for error handling, and false for normal release.
5653
- *
5654
- * This is the same as btrfs_block_rsv_release, except that it handles the
5655
- * tracepoint for the reservation.
5656
- */
5657
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5658
-{
5659
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5660
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5661
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5662
- u64 released = 0;
5663
- u64 qgroup_to_release = 0;
5664
-
5665
- /*
5666
- * Since we statically set the block_rsv->size we just want to say we
5667
- * are releasing 0 bytes, and then we'll just get the reservation over
5668
- * the size free'd.
5669
- */
5670
- released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
5671
- &qgroup_to_release);
5672
- if (released > 0)
5673
- trace_btrfs_space_reservation(fs_info, "delalloc",
5674
- btrfs_ino(inode), released, 0);
5675
- if (qgroup_free)
5676
- btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5677
- else
5678
- btrfs_qgroup_convert_reserved_meta(inode->root,
5679
- qgroup_to_release);
5680
-}
5681
-
5682
-void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5683
- struct btrfs_block_rsv *block_rsv,
5684
- u64 num_bytes)
5685
-{
5686
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5687
-
5688
- if (global_rsv == block_rsv ||
5689
- block_rsv->space_info != global_rsv->space_info)
5690
- global_rsv = NULL;
5691
- block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
5692
-}
5693
-
5694
-static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5695
-{
5696
- struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5697
- struct btrfs_space_info *sinfo = block_rsv->space_info;
5698
- u64 num_bytes;
5699
-
5700
- /*
5701
- * The global block rsv is based on the size of the extent tree, the
5702
- * checksum tree and the root tree. If the fs is empty we want to set
5703
- * it to a minimal amount for safety.
5704
- */
5705
- num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5706
- btrfs_root_used(&fs_info->csum_root->root_item) +
5707
- btrfs_root_used(&fs_info->tree_root->root_item);
5708
- num_bytes = max_t(u64, num_bytes, SZ_16M);
5709
-
5710
- spin_lock(&sinfo->lock);
5711
- spin_lock(&block_rsv->lock);
5712
-
5713
- block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5714
-
5715
- if (block_rsv->reserved < block_rsv->size) {
5716
- num_bytes = btrfs_space_info_used(sinfo, true);
5717
- if (sinfo->total_bytes > num_bytes) {
5718
- num_bytes = sinfo->total_bytes - num_bytes;
5719
- num_bytes = min(num_bytes,
5720
- block_rsv->size - block_rsv->reserved);
5721
- block_rsv->reserved += num_bytes;
5722
- sinfo->bytes_may_use += num_bytes;
5723
- trace_btrfs_space_reservation(fs_info, "space_info",
5724
- sinfo->flags, num_bytes,
5725
- 1);
5726
- }
5727
- } else if (block_rsv->reserved > block_rsv->size) {
5728
- num_bytes = block_rsv->reserved - block_rsv->size;
5729
- sinfo->bytes_may_use -= num_bytes;
5730
- trace_btrfs_space_reservation(fs_info, "space_info",
5731
- sinfo->flags, num_bytes, 0);
5732
- block_rsv->reserved = block_rsv->size;
5733
- }
5734
-
5735
- if (block_rsv->reserved == block_rsv->size)
5736
- block_rsv->full = 1;
5737
- else
5738
- block_rsv->full = 0;
5739
-
5740
- spin_unlock(&block_rsv->lock);
5741
- spin_unlock(&sinfo->lock);
5742
-}
5743
-
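
The removed update_global_block_rsv() sizes the global reserve from the extent, csum and root tree usage, clamped between 16 MiB and 512 MiB, and then tops up its reserved bytes from whatever is still free in the metadata space_info. A tiny sketch of the clamp; the tree sizes in main() are invented:

#include <stdint.h>
#include <stdio.h>

#define SZ_16M  (16ULL * 1024 * 1024)
#define SZ_512M (512ULL * 1024 * 1024)

/* Clamp used by the removed update_global_block_rsv(). */
static uint64_t global_rsv_size(uint64_t extent_root_used,
                                uint64_t csum_root_used,
                                uint64_t tree_root_used)
{
        uint64_t num_bytes = extent_root_used + csum_root_used +
                             tree_root_used;

        if (num_bytes < SZ_16M)
                num_bytes = SZ_16M;
        if (num_bytes > SZ_512M)
                num_bytes = SZ_512M;
        return num_bytes;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)
               global_rsv_size(3ULL << 20, 1ULL << 20, 1ULL << 20));
        printf("%llu\n", (unsigned long long)
               global_rsv_size(600ULL << 20, 80ULL << 20, 4ULL << 20));
        return 0;
}
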
5744
-static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5745
-{
5746
- struct btrfs_space_info *space_info;
5747
-
5748
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5749
- fs_info->chunk_block_rsv.space_info = space_info;
5750
-
5751
- space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5752
- fs_info->global_block_rsv.space_info = space_info;
5753
- fs_info->trans_block_rsv.space_info = space_info;
5754
- fs_info->empty_block_rsv.space_info = space_info;
5755
- fs_info->delayed_block_rsv.space_info = space_info;
5756
-
5757
- fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5758
- fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5759
- fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5760
- fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5761
- if (fs_info->quota_root)
5762
- fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5763
- fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5764
-
5765
- update_global_block_rsv(fs_info);
5766
-}
5767
-
5768
-static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5769
-{
5770
- block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5771
- (u64)-1, NULL);
5772
- WARN_ON(fs_info->trans_block_rsv.size > 0);
5773
- WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5774
- WARN_ON(fs_info->chunk_block_rsv.size > 0);
5775
- WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5776
- WARN_ON(fs_info->delayed_block_rsv.size > 0);
5777
- WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5778
-}
5779
-
5780
-
5781
-/*
5782
- * To be called after all the new block groups attached to the transaction
5783
- * handle have been created (btrfs_create_pending_block_groups()).
5784
- */
5785
-void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5786
-{
5787
- struct btrfs_fs_info *fs_info = trans->fs_info;
5788
-
5789
- if (!trans->chunk_bytes_reserved)
5790
- return;
5791
-
5792
- WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5793
-
5794
- block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5795
- trans->chunk_bytes_reserved, NULL);
5796
- trans->chunk_bytes_reserved = 0;
5797
-}
5798
-
5799
-/*
5800
- * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5801
- * root: the root of the parent directory
5802
- * rsv: block reservation
5803
- * items: the number of items that we need to reserve
5804
- * use_global_rsv: allow fallback to the global block reservation
5805
- *
5806
- * This function is used to reserve the space for snapshot/subvolume
5807
- * creation and deletion. Those operations are different from the
5808
- * common file/directory operations: they change two fs/file trees
5809
- * and the root tree, and the number of items that the qgroup reserves is
5810
- * different from the free space reservation. So we cannot use
5811
- * the space reservation mechanism in start_transaction().
5812
- */
5813
-int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5814
- struct btrfs_block_rsv *rsv, int items,
5815
- bool use_global_rsv)
5816
-{
5817
- u64 qgroup_num_bytes = 0;
5818
- u64 num_bytes;
5819
- int ret;
5820
- struct btrfs_fs_info *fs_info = root->fs_info;
5821
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5822
-
5823
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
5824
- /* One for parent inode, two for dir entries */
5825
- qgroup_num_bytes = 3 * fs_info->nodesize;
5826
- ret = btrfs_qgroup_reserve_meta_prealloc(root,
5827
- qgroup_num_bytes, true);
5828
- if (ret)
5829
- return ret;
5830
- }
5831
-
5832
- num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
5833
- rsv->space_info = __find_space_info(fs_info,
5834
- BTRFS_BLOCK_GROUP_METADATA);
5835
- ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5836
- BTRFS_RESERVE_FLUSH_ALL);
5837
-
5838
- if (ret == -ENOSPC && use_global_rsv)
5839
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
5840
-
5841
- if (ret && qgroup_num_bytes)
5842
- btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5843
-
5844
- return ret;
5845
-}
5846
-
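
Reserving for a snapshot or subvolume above amounts to one qgroup prealloc of 3 * nodesize (the parent inode plus two dir entries) plus the normal per-item metadata reservation. A sketch of the byte math only; calc_trans_metadata_size() below assumes the usual "nodesize * 2 * BTRFS_MAX_LEVEL per item" definition of btrfs_calc_trans_metadata_size(), which is worth double-checking against ctree.h for this kernel:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8

/* Assumed per-item metadata cost: a full tree path, CoWed, in two trees. */
static uint64_t calc_trans_metadata_size(uint32_t nodesize, unsigned items)
{
        return (uint64_t)nodesize * 2 * BTRFS_MAX_LEVEL * items;
}

int main(void)
{
        uint32_t nodesize = 16384;      /* default nodesize */
        unsigned items = 8;             /* say, items for a snapshot creation */

        /* one for the parent inode, two for the dir entries */
        uint64_t qgroup_bytes = 3ULL * nodesize;
        uint64_t meta_bytes = calc_trans_metadata_size(nodesize, items);

        printf("qgroup prealloc: %llu bytes, metadata rsv: %llu bytes\n",
               (unsigned long long)qgroup_bytes,
               (unsigned long long)meta_bytes);
        return 0;
}
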
5847
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5848
- struct btrfs_block_rsv *rsv)
5849
-{
5850
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5851
-}
5852
-
5853
-static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
5854
- struct btrfs_inode *inode)
5855
-{
5856
- struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5857
- u64 reserve_size = 0;
5858
- u64 qgroup_rsv_size = 0;
5859
- u64 csum_leaves;
5860
- unsigned outstanding_extents;
5861
-
5862
- lockdep_assert_held(&inode->lock);
5863
- outstanding_extents = inode->outstanding_extents;
5864
- if (outstanding_extents)
5865
- reserve_size = btrfs_calc_trans_metadata_size(fs_info,
5866
- outstanding_extents + 1);
5867
- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
5868
- inode->csum_bytes);
5869
- reserve_size += btrfs_calc_trans_metadata_size(fs_info,
5870
- csum_leaves);
5871
- /*
5872
- * For qgroup rsv, the calculation is very simple:
5873
- * account one nodesize for each outstanding extent
5874
- *
5875
- * This is overestimating in most cases.
5876
- */
5877
- qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
5878
-
5879
- spin_lock(&block_rsv->lock);
5880
- block_rsv->size = reserve_size;
5881
- block_rsv->qgroup_rsv_size = qgroup_rsv_size;
5882
- spin_unlock(&block_rsv->lock);
5883
-}
5884
-
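
The inode rsv size computed above is (outstanding_extents + 1) items of metadata plus enough leaves to hold the checksums for csum_bytes, while the qgroup side is simply one nodesize per outstanding extent. A user-space sketch of the shape of that calculation; csum_leaves_for() is a simplified stand-in for btrfs_csum_bytes_to_leaves(), not the kernel formula, and the per-item cost carries the same assumption as the previous sketch:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8

static uint64_t calc_trans_metadata_size(uint32_t nodesize, uint64_t items)
{
        /* assumed per-item cost: a CoWed path in two trees */
        return (uint64_t)nodesize * 2 * BTRFS_MAX_LEVEL * items;
}

/* Simplified: how many leaves the checksums for `bytes` might need. */
static uint64_t csum_leaves_for(uint64_t bytes, uint64_t csums_per_leaf,
                                uint32_t sectorsize)
{
        uint64_t csums = (bytes + sectorsize - 1) / sectorsize;

        return (csums + csums_per_leaf - 1) / csums_per_leaf;
}

int main(void)
{
        uint32_t nodesize = 16384, sectorsize = 4096;
        uint64_t outstanding_extents = 4, csum_bytes = 8ULL << 20;
        uint64_t csums_per_leaf = 2000; /* rough, depends on leaf layout */

        uint64_t reserve = calc_trans_metadata_size(nodesize,
                                                    outstanding_extents + 1);
        reserve += calc_trans_metadata_size(nodesize,
                        csum_leaves_for(csum_bytes, csums_per_leaf,
                                        sectorsize));
        uint64_t qgroup = outstanding_extents * nodesize;

        printf("rsv size %llu, qgroup rsv %llu\n",
               (unsigned long long)reserve, (unsigned long long)qgroup);
        return 0;
}
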
5885
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
5886
-{
5887
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5888
- unsigned nr_extents;
5889
- enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5890
- int ret = 0;
5891
- bool delalloc_lock = true;
5892
-
5893
- /* If we are a free space inode we need to not flush since we will be in
5894
- * the middle of a transaction commit. We also don't need the delalloc
5895
- * mutex since we won't race with anybody. We need this mostly to make
5896
- * lockdep shut its filthy mouth.
5897
- *
5898
- * If we have a transaction open (can happen if we call truncate_block
5899
- * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
5900
- */
5901
- if (btrfs_is_free_space_inode(inode)) {
5902
- flush = BTRFS_RESERVE_NO_FLUSH;
5903
- delalloc_lock = false;
5904
- } else {
5905
- if (current->journal_info)
5906
- flush = BTRFS_RESERVE_FLUSH_LIMIT;
5907
-
5908
- if (btrfs_transaction_in_commit(fs_info))
5909
- schedule_timeout(1);
5910
- }
5911
-
5912
- if (delalloc_lock)
5913
- mutex_lock(&inode->delalloc_mutex);
5914
-
5915
- num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5916
-
5917
- /* Add our new extents and calculate the new rsv size. */
5918
- spin_lock(&inode->lock);
5919
- nr_extents = count_max_extents(num_bytes);
5920
- btrfs_mod_outstanding_extents(inode, nr_extents);
5921
- inode->csum_bytes += num_bytes;
5922
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5923
- spin_unlock(&inode->lock);
5924
-
5925
- ret = btrfs_inode_rsv_refill(inode, flush);
5926
- if (unlikely(ret))
5927
- goto out_fail;
5928
-
5929
- if (delalloc_lock)
5930
- mutex_unlock(&inode->delalloc_mutex);
5931
- return 0;
5932
-
5933
-out_fail:
5934
- spin_lock(&inode->lock);
5935
- nr_extents = count_max_extents(num_bytes);
5936
- btrfs_mod_outstanding_extents(inode, -nr_extents);
5937
- inode->csum_bytes -= num_bytes;
5938
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5939
- spin_unlock(&inode->lock);
5940
-
5941
- btrfs_inode_rsv_release(inode, true);
5942
- if (delalloc_lock)
5943
- mutex_unlock(&inode->delalloc_mutex);
5944
- return ret;
5945
-}
5946
-
5947
-/**
5948
- * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5949
- * @inode: the inode to release the reservation for.
5950
- * @num_bytes: the number of bytes we are releasing.
5951
- * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
5952
- *
5953
- * This will release the metadata reservation for an inode. This can be called
5954
- * once we complete IO for a given set of bytes to release their metadata
5955
- * reservations, or on error for the same reason.
5956
- */
5957
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
5958
- bool qgroup_free)
5959
-{
5960
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5961
-
5962
- num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
5963
- spin_lock(&inode->lock);
5964
- inode->csum_bytes -= num_bytes;
5965
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5966
- spin_unlock(&inode->lock);
5967
-
5968
- if (btrfs_is_testing(fs_info))
5969
- return;
5970
-
5971
- btrfs_inode_rsv_release(inode, qgroup_free);
5972
-}
5973
-
5974
-/**
5975
- * btrfs_delalloc_release_extents - release our outstanding_extents
5976
- * @inode: the inode to balance the reservation for.
5977
- * @num_bytes: the number of bytes we originally reserved with
5978
- * @qgroup_free: do we need to free qgroup meta reservation or convert them.
5979
- *
5980
- * When we reserve space we increase outstanding_extents for the extents we may
5981
- * add. Once we've set the range as delalloc or created our ordered extents we
5982
- * have outstanding_extents to track the real usage, so we use this to free our
5983
- * temporarily tracked outstanding_extents. This _must_ be used in conjunction
5984
- * with btrfs_delalloc_reserve_metadata.
5985
- */
5986
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
5987
-{
5988
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
5989
- unsigned num_extents;
5990
-
5991
- spin_lock(&inode->lock);
5992
- num_extents = count_max_extents(num_bytes);
5993
- btrfs_mod_outstanding_extents(inode, -num_extents);
5994
- btrfs_calculate_inode_block_rsv_size(fs_info, inode);
5995
- spin_unlock(&inode->lock);
5996
-
5997
- if (btrfs_is_testing(fs_info))
5998
- return;
5999
-
6000
- btrfs_inode_rsv_release(inode, true);
6001
-}
6002
-
6003
-/**
6004
- * btrfs_delalloc_reserve_space - reserve data and metadata space for
6005
- * delalloc
6006
- * @inode: inode we're writing to
6007
- * @start: start range we are writing to
6008
- * @len: the length of the range we are writing to
6009
- * @reserved: mandatory parameter, records the qgroup ranges actually
6010
- * reserved by the current reservation.
6011
- *
6012
- * This will do the following things
6013
- *
6014
- * o reserve space in data space info for num bytes
6015
- * and reserve precious corresponding qgroup space
6016
- * (Done in check_data_free_space)
6017
- *
6018
- * o reserve space for metadata space, based on the number of outstanding
6019
- * extents and how many csums will be needed
6020
- * also reserve metadata space in a per root over-reserve method.
6021
- * o add to the inodes->delalloc_bytes
6022
- * o add it to the fs_info's delalloc inodes list.
6023
- * (Above 3 all done in delalloc_reserve_metadata)
6024
- *
6025
- * Return 0 for success
6026
- * Return <0 for error (-ENOSPC or -EDQUOT)
6027
- */
6028
-int btrfs_delalloc_reserve_space(struct inode *inode,
6029
- struct extent_changeset **reserved, u64 start, u64 len)
6030
-{
6031
- int ret;
6032
-
6033
- ret = btrfs_check_data_free_space(inode, reserved, start, len);
6034
- if (ret < 0)
6035
- return ret;
6036
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6037
- if (ret < 0)
6038
- btrfs_free_reserved_data_space(inode, *reserved, start, len);
6039
- return ret;
6040
-}
6041
-
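
The removed wrapper simply chains the two reservations, data first and then metadata, and unwinds the data reservation if the metadata side fails; the ordering and the unwind are the whole point. A small sketch of that error-handling pattern; reserve_data(), release_data() and reserve_metadata() are placeholders, not btrfs APIs:

#include <stdio.h>

/* Stand-ins for the data and metadata reservation steps. */
static int reserve_data(unsigned long long len)   { (void)len; return 0; }
static int release_data(unsigned long long len)   { (void)len; return 0; }
static int reserve_metadata(unsigned long long len)
{
        (void)len;
        return -28;     /* pretend -ENOSPC to exercise the unwind path */
}

static int delalloc_reserve_space(unsigned long long len)
{
        int ret = reserve_data(len);

        if (ret < 0)
                return ret;
        ret = reserve_metadata(len);
        if (ret < 0)
                release_data(len);      /* unwind the data reservation */
        return ret;
}

int main(void)
{
        printf("reserve: %d\n", delalloc_reserve_space(1 << 20));
        return 0;
}
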
6042
-/**
6043
- * btrfs_delalloc_release_space - release data and metadata space for delalloc
6044
- * @inode: inode we're releasing space for
6045
- * @start: start position of the space already reserved
6046
- * @len: the len of the space already reserved
6047
- * @release_bytes: the len of the space we consumed or didn't use
6048
- *
6049
- * This function will release the metadata space that was not used and will
6050
- * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6051
- * list if there are no delalloc bytes left.
6052
- * Also it will handle the qgroup reserved space.
6053
- */
6054
-void btrfs_delalloc_release_space(struct inode *inode,
6055
- struct extent_changeset *reserved,
6056
- u64 start, u64 len, bool qgroup_free)
6057
-{
6058
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6059
- btrfs_free_reserved_data_space(inode, reserved, start, len);
6060
-}
6061
-
6062
-static int update_block_group(struct btrfs_trans_handle *trans,
6063
- struct btrfs_fs_info *info, u64 bytenr,
6064
- u64 num_bytes, int alloc)
6065
-{
6066
- struct btrfs_block_group_cache *cache = NULL;
6067
- u64 total = num_bytes;
6068
- u64 old_val;
6069
- u64 byte_in_group;
6070
- int factor;
6071
-
6072
- /* block accounting for super block */
6073
- spin_lock(&info->delalloc_root_lock);
6074
- old_val = btrfs_super_bytes_used(info->super_copy);
6075
- if (alloc)
6076
- old_val += num_bytes;
6077
- else
6078
- old_val -= num_bytes;
6079
- btrfs_set_super_bytes_used(info->super_copy, old_val);
6080
- spin_unlock(&info->delalloc_root_lock);
6081
-
6082
- while (total) {
6083
- cache = btrfs_lookup_block_group(info, bytenr);
6084
- if (!cache)
6085
- return -ENOENT;
6086
- factor = btrfs_bg_type_to_factor(cache->flags);
6087
-
6088
- /*
6089
- * If this block group has free space cache written out, we
6090
- * need to make sure to load it if we are removing space. This
6091
- * is because we need the unpinning stage to actually add the
6092
- * space back to the block group, otherwise we will leak space.
6093
- */
6094
- if (!alloc && cache->cached == BTRFS_CACHE_NO)
6095
- cache_block_group(cache, 1);
6096
-
6097
- byte_in_group = bytenr - cache->key.objectid;
6098
- WARN_ON(byte_in_group > cache->key.offset);
6099
-
6100
- spin_lock(&cache->space_info->lock);
6101
- spin_lock(&cache->lock);
6102
-
6103
- if (btrfs_test_opt(info, SPACE_CACHE) &&
6104
- cache->disk_cache_state < BTRFS_DC_CLEAR)
6105
- cache->disk_cache_state = BTRFS_DC_CLEAR;
6106
-
6107
- old_val = btrfs_block_group_used(&cache->item);
6108
- num_bytes = min(total, cache->key.offset - byte_in_group);
6109
- if (alloc) {
6110
- old_val += num_bytes;
6111
- btrfs_set_block_group_used(&cache->item, old_val);
6112
- cache->reserved -= num_bytes;
6113
- cache->space_info->bytes_reserved -= num_bytes;
6114
- cache->space_info->bytes_used += num_bytes;
6115
- cache->space_info->disk_used += num_bytes * factor;
6116
- spin_unlock(&cache->lock);
6117
- spin_unlock(&cache->space_info->lock);
6118
- } else {
6119
- old_val -= num_bytes;
6120
- btrfs_set_block_group_used(&cache->item, old_val);
6121
- cache->pinned += num_bytes;
6122
- cache->space_info->bytes_pinned += num_bytes;
6123
- cache->space_info->bytes_used -= num_bytes;
6124
- cache->space_info->disk_used -= num_bytes * factor;
6125
- spin_unlock(&cache->lock);
6126
- spin_unlock(&cache->space_info->lock);
6127
-
6128
- trace_btrfs_space_reservation(info, "pinned",
6129
- cache->space_info->flags,
6130
- num_bytes, 1);
6131
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6132
- num_bytes,
6133
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
6134
- set_extent_dirty(info->pinned_extents,
6135
- bytenr, bytenr + num_bytes - 1,
6136
- GFP_NOFS | __GFP_NOFAIL);
6137
- }
6138
-
6139
- spin_lock(&trans->transaction->dirty_bgs_lock);
6140
- if (list_empty(&cache->dirty_list)) {
6141
- list_add_tail(&cache->dirty_list,
6142
- &trans->transaction->dirty_bgs);
6143
- trans->transaction->num_dirty_bgs++;
6144
- btrfs_get_block_group(cache);
6145
- }
6146
- spin_unlock(&trans->transaction->dirty_bgs_lock);
6147
-
6148
- /*
6149
- * No longer have used bytes in this block group, queue it for
6150
- * deletion. We do this after adding the block group to the
6151
- * dirty list to avoid races between cleaner kthread and space
6152
- * cache writeout.
6153
- */
6154
- if (!alloc && old_val == 0)
6155
- btrfs_mark_bg_unused(cache);
6156
-
6157
- btrfs_put_block_group(cache);
6158
- total -= num_bytes;
6159
- bytenr += num_bytes;
6160
- }
6161
- return 0;
61622510 }
61632511
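
update_block_group(), deleted above, adjusts the superblock's bytes_used and then, per affected block group, moves num_bytes from reserved to used on allocation or from used to pinned on free, scaling disk usage by the RAID factor. A sketch of just that bookkeeping; the struct is a stand-in, and factor plays the role of btrfs_bg_type_to_factor() (2 for mirrored profiles, 1 otherwise):

#include <stdint.h>
#include <stdio.h>

struct bg_model {
        uint64_t used;
        uint64_t reserved;
        uint64_t pinned;
        uint64_t disk_used;
};

static void update_block_group_model(struct bg_model *bg, uint64_t num_bytes,
                                     int alloc, int factor)
{
        if (alloc) {
                /* reserved space becomes used space */
                bg->used += num_bytes;
                bg->reserved -= num_bytes;
        } else {
                /* freed space stays pinned until the transaction commits */
                bg->used -= num_bytes;
                bg->pinned += num_bytes;
        }
        bg->disk_used = bg->used * factor;
}

int main(void)
{
        struct bg_model bg = { .used = 64ULL << 20, .reserved = 16ULL << 20 };

        update_block_group_model(&bg, 4ULL << 20, 1, 2); /* allocate 4 MiB */
        update_block_group_model(&bg, 8ULL << 20, 0, 2); /* free 8 MiB */
        printf("used %llu reserved %llu pinned %llu disk_used %llu\n",
               (unsigned long long)bg.used, (unsigned long long)bg.reserved,
               (unsigned long long)bg.pinned, (unsigned long long)bg.disk_used);
        return 0;
}
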
61642512 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
61652513 {
6166
- struct btrfs_block_group_cache *cache;
2514
+ struct btrfs_block_group *cache;
61672515 u64 bytenr;
61682516
61692517 spin_lock(&fs_info->block_group_cache_lock);
@@ -6177,20 +2525,23 @@
61772525 if (!cache)
61782526 return 0;
61792527
6180
- bytenr = cache->key.objectid;
2528
+ bytenr = cache->start;
61812529 btrfs_put_block_group(cache);
61822530
61832531 return bytenr;
61842532 }
61852533
6186
-static int pin_down_extent(struct btrfs_fs_info *fs_info,
6187
- struct btrfs_block_group_cache *cache,
2534
+static int pin_down_extent(struct btrfs_trans_handle *trans,
2535
+ struct btrfs_block_group *cache,
61882536 u64 bytenr, u64 num_bytes, int reserved)
61892537 {
2538
+ struct btrfs_fs_info *fs_info = cache->fs_info;
2539
+
61902540 spin_lock(&cache->space_info->lock);
61912541 spin_lock(&cache->lock);
61922542 cache->pinned += num_bytes;
6193
- cache->space_info->bytes_pinned += num_bytes;
2543
+ btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2544
+ num_bytes);
61942545 if (reserved) {
61952546 cache->reserved -= num_bytes;
61962547 cache->space_info->bytes_reserved -= num_bytes;
@@ -6198,27 +2549,21 @@
61982549 spin_unlock(&cache->lock);
61992550 spin_unlock(&cache->space_info->lock);
62002551
6201
- trace_btrfs_space_reservation(fs_info, "pinned",
6202
- cache->space_info->flags, num_bytes, 1);
6203
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6204
- num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6205
- set_extent_dirty(fs_info->pinned_extents, bytenr,
2552
+ __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
2553
+ set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
62062554 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
62072555 return 0;
62082556 }
62092557
6210
-/*
6211
- * this function must be called within transaction
6212
- */
6213
-int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
2558
+int btrfs_pin_extent(struct btrfs_trans_handle *trans,
62142559 u64 bytenr, u64 num_bytes, int reserved)
62152560 {
6216
- struct btrfs_block_group_cache *cache;
2561
+ struct btrfs_block_group *cache;
62172562
6218
- cache = btrfs_lookup_block_group(fs_info, bytenr);
2563
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
62192564 BUG_ON(!cache); /* Logic error */
62202565
6221
- pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
2566
+ pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
62222567
62232568 btrfs_put_block_group(cache);
62242569 return 0;
@@ -6227,13 +2572,15 @@
62272572 /*
62282573 * this function must be called within transaction
62292574 */
6230
-int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
2575
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
62312576 u64 bytenr, u64 num_bytes)
62322577 {
6233
- struct btrfs_block_group_cache *cache;
2578
+ struct btrfs_block_group *cache;
62342579 int ret;
62352580
6236
- cache = btrfs_lookup_block_group(fs_info, bytenr);
2581
+ btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
2582
+
2583
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
62372584 if (!cache)
62382585 return -EINVAL;
62392586
@@ -6243,9 +2590,9 @@
62432590 * to one because the slow code to read in the free extents does check
62442591 * the pinned extents.
62452592 */
6246
- cache_block_group(cache, 1);
2593
+ btrfs_cache_block_group(cache, 1);
62472594
6248
- pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
2595
+ pin_down_extent(trans, cache, bytenr, num_bytes, 0);
62492596
62502597 /* remove us from the free space cache (if we're there at all) */
62512598 ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
@@ -6257,25 +2604,26 @@
62572604 u64 start, u64 num_bytes)
62582605 {
62592606 int ret;
6260
- struct btrfs_block_group_cache *block_group;
2607
+ struct btrfs_block_group *block_group;
62612608 struct btrfs_caching_control *caching_ctl;
62622609
62632610 block_group = btrfs_lookup_block_group(fs_info, start);
62642611 if (!block_group)
62652612 return -EINVAL;
62662613
6267
- cache_block_group(block_group, 0);
6268
- caching_ctl = get_caching_control(block_group);
2614
+ btrfs_cache_block_group(block_group, 0);
2615
+ caching_ctl = btrfs_get_caching_control(block_group);
62692616
62702617 if (!caching_ctl) {
62712618 /* Logic error */
6272
- BUG_ON(!block_group_cache_done(block_group));
2619
+ BUG_ON(!btrfs_block_group_done(block_group));
62732620 ret = btrfs_remove_free_space(block_group, start, num_bytes);
62742621 } else {
62752622 mutex_lock(&caching_ctl->mutex);
62762623
62772624 if (start >= caching_ctl->progress) {
6278
- ret = add_excluded_extent(fs_info, start, num_bytes);
2625
+ ret = btrfs_add_excluded_extent(fs_info, start,
2626
+ num_bytes);
62792627 } else if (start + num_bytes <= caching_ctl->progress) {
62802628 ret = btrfs_remove_free_space(block_group,
62812629 start, num_bytes);
@@ -6289,19 +2637,20 @@
62892637 num_bytes = (start + num_bytes) -
62902638 caching_ctl->progress;
62912639 start = caching_ctl->progress;
6292
- ret = add_excluded_extent(fs_info, start, num_bytes);
2640
+ ret = btrfs_add_excluded_extent(fs_info, start,
2641
+ num_bytes);
62932642 }
62942643 out_lock:
62952644 mutex_unlock(&caching_ctl->mutex);
6296
- put_caching_control(caching_ctl);
2645
+ btrfs_put_caching_control(caching_ctl);
62972646 }
62982647 btrfs_put_block_group(block_group);
62992648 return ret;
63002649 }
63012650
6302
-int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
6303
- struct extent_buffer *eb)
2651
+int btrfs_exclude_logged_extents(struct extent_buffer *eb)
63042652 {
2653
+ struct btrfs_fs_info *fs_info = eb->fs_info;
63052654 struct btrfs_file_extent_item *item;
63062655 struct btrfs_key key;
63072656 int found_type;
@@ -6332,146 +2681,9 @@
63322681 }
63332682
63342683 static void
6335
-btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
2684
+btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
63362685 {
63372686 atomic_inc(&bg->reservations);
6338
-}
6339
-
6340
-void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6341
- const u64 start)
6342
-{
6343
- struct btrfs_block_group_cache *bg;
6344
-
6345
- bg = btrfs_lookup_block_group(fs_info, start);
6346
- ASSERT(bg);
6347
- if (atomic_dec_and_test(&bg->reservations))
6348
- wake_up_var(&bg->reservations);
6349
- btrfs_put_block_group(bg);
6350
-}
6351
-
6352
-void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6353
-{
6354
- struct btrfs_space_info *space_info = bg->space_info;
6355
-
6356
- ASSERT(bg->ro);
6357
-
6358
- if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6359
- return;
6360
-
6361
- /*
6362
- * Our block group is read only but before we set it to read only,
6363
- * some task might have allocated an extent from it already, but it
6364
- * has not yet created a respective ordered extent (and added it to a
6365
- * root's list of ordered extents).
6366
- * Therefore wait for any task currently allocating extents, since the
6367
- * block group's reservations counter is incremented while a read lock
6368
- * on the groups' semaphore is held and decremented after releasing
6369
- * the read access on that semaphore and creating the ordered extent.
6370
- */
6371
- down_write(&space_info->groups_sem);
6372
- up_write(&space_info->groups_sem);
6373
-
6374
- wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6375
-}
6376
-
6377
-/**
6378
- * btrfs_add_reserved_bytes - update the block_group and space info counters
6379
- * @cache: The cache we are manipulating
6380
- * @ram_bytes: The number of bytes of file content, and will be the same as
6381
- * @num_bytes except for the compress path.
6382
- * @num_bytes: The number of bytes in question
6383
- * @delalloc: The blocks are allocated for the delalloc write
6384
- *
6385
- * This is called by the allocator when it reserves space. If this is a
6386
- * reservation and the block group has become read only we cannot make the
6387
- * reservation and return -EAGAIN, otherwise this function always succeeds.
6388
- */
6389
-static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6390
- u64 ram_bytes, u64 num_bytes, int delalloc)
6391
-{
6392
- struct btrfs_space_info *space_info = cache->space_info;
6393
- int ret = 0;
6394
-
6395
- spin_lock(&space_info->lock);
6396
- spin_lock(&cache->lock);
6397
- if (cache->ro) {
6398
- ret = -EAGAIN;
6399
- } else {
6400
- cache->reserved += num_bytes;
6401
- space_info->bytes_reserved += num_bytes;
6402
-
6403
- trace_btrfs_space_reservation(cache->fs_info,
6404
- "space_info", space_info->flags,
6405
- ram_bytes, 0);
6406
- space_info->bytes_may_use -= ram_bytes;
6407
- if (delalloc)
6408
- cache->delalloc_bytes += num_bytes;
6409
- }
6410
- spin_unlock(&cache->lock);
6411
- spin_unlock(&space_info->lock);
6412
- return ret;
6413
-}
6414
-
6415
-/**
6416
- * btrfs_free_reserved_bytes - update the block_group and space info counters
6417
- * @cache: The cache we are manipulating
6418
- * @num_bytes: The number of bytes in question
6419
- * @delalloc: The blocks are allocated for the delalloc write
6420
- *
6421
- * This is called by somebody who is freeing space that was never actually used
6422
- * on disk. For example if you reserve some space for a new leaf in transaction
6423
- * A and before transaction A commits you free that leaf, you call this with
6424
- * reserve set to 0 in order to clear the reservation.
6425
- */
6426
-
6427
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6428
- u64 num_bytes, int delalloc)
6429
-{
6430
- struct btrfs_space_info *space_info = cache->space_info;
6431
- int ret = 0;
6432
-
6433
- spin_lock(&space_info->lock);
6434
- spin_lock(&cache->lock);
6435
- if (cache->ro)
6436
- space_info->bytes_readonly += num_bytes;
6437
- cache->reserved -= num_bytes;
6438
- space_info->bytes_reserved -= num_bytes;
6439
- space_info->max_extent_size = 0;
6440
-
6441
- if (delalloc)
6442
- cache->delalloc_bytes -= num_bytes;
6443
- spin_unlock(&cache->lock);
6444
- spin_unlock(&space_info->lock);
6445
- return ret;
6446
-}
6447
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6448
-{
6449
- struct btrfs_caching_control *next;
6450
- struct btrfs_caching_control *caching_ctl;
6451
- struct btrfs_block_group_cache *cache;
6452
-
6453
- down_write(&fs_info->commit_root_sem);
6454
-
6455
- list_for_each_entry_safe(caching_ctl, next,
6456
- &fs_info->caching_block_groups, list) {
6457
- cache = caching_ctl->block_group;
6458
- if (block_group_cache_done(cache)) {
6459
- cache->last_byte_to_unpin = (u64)-1;
6460
- list_del_init(&caching_ctl->list);
6461
- put_caching_control(caching_ctl);
6462
- } else {
6463
- cache->last_byte_to_unpin = caching_ctl->progress;
6464
- }
6465
- }
6466
-
6467
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6468
- fs_info->pinned_extents = &fs_info->freed_extents[1];
6469
- else
6470
- fs_info->pinned_extents = &fs_info->freed_extents[0];
6471
-
6472
- up_write(&fs_info->commit_root_sem);
6473
-
6474
- update_global_block_rsv(fs_info);
64752687 }
64762688
64772689 /*
....@@ -6507,7 +2719,7 @@
65072719 u64 start, u64 end,
65082720 const bool return_free_space)
65092721 {
6510
- struct btrfs_block_group_cache *cache = NULL;
2722
+ struct btrfs_block_group *cache = NULL;
65112723 struct btrfs_space_info *space_info;
65122724 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
65132725 struct btrfs_free_cluster *cluster = NULL;
....@@ -6519,7 +2731,7 @@
65192731 while (start <= end) {
65202732 readonly = false;
65212733 if (!cache ||
6522
- start >= cache->key.objectid + cache->key.offset) {
2734
+ start >= cache->start + cache->length) {
65232735 if (cache)
65242736 btrfs_put_block_group(cache);
65252737 total_unpinned = 0;
....@@ -6532,13 +2744,13 @@
65322744 empty_cluster <<= 1;
65332745 }
65342746
6535
- len = cache->key.objectid + cache->key.offset - start;
2747
+ len = cache->start + cache->length - start;
65362748 len = min(len, end + 1 - start);
65372749
6538
- if (start < cache->last_byte_to_unpin) {
6539
- len = min(len, cache->last_byte_to_unpin - start);
6540
- if (return_free_space)
6541
- btrfs_add_free_space(cache, start, len);
2750
+ if (start < cache->last_byte_to_unpin && return_free_space) {
2751
+ u64 add_len = min(len, cache->last_byte_to_unpin - start);
2752
+
2753
+ btrfs_add_free_space(cache, start, add_len);
65422754 }
65432755
65442756 start += len;
....@@ -6561,13 +2773,9 @@
65612773 spin_lock(&space_info->lock);
65622774 spin_lock(&cache->lock);
65632775 cache->pinned -= len;
6564
- space_info->bytes_pinned -= len;
6565
-
6566
- trace_btrfs_space_reservation(fs_info, "pinned",
6567
- space_info->flags, len, 0);
2776
+ btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
65682777 space_info->max_extent_size = 0;
6569
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
6570
- -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
2778
+ __btrfs_mod_total_bytes_pinned(space_info, -len);
65712779 if (cache->ro) {
65722780 space_info->bytes_readonly += len;
65732781 readonly = true;
....@@ -6582,21 +2790,17 @@
65822790 to_add = min(len, global_rsv->size -
65832791 global_rsv->reserved);
65842792 global_rsv->reserved += to_add;
6585
- space_info->bytes_may_use += to_add;
2793
+ btrfs_space_info_update_bytes_may_use(fs_info,
2794
+ space_info, to_add);
65862795 if (global_rsv->reserved >= global_rsv->size)
65872796 global_rsv->full = 1;
6588
- trace_btrfs_space_reservation(fs_info,
6589
- "space_info",
6590
- space_info->flags,
6591
- to_add, 1);
65922797 len -= to_add;
65932798 }
65942799 spin_unlock(&global_rsv->lock);
6595
- /* Add to any tickets we may have */
6596
- if (len)
6597
- space_info_add_new_bytes(fs_info, space_info,
6598
- len);
65992800 }
2801
+ /* Add to any tickets we may have */
2802
+ if (!readonly && return_free_space && len)
2803
+ btrfs_try_granting_tickets(fs_info, space_info);
66002804 spin_unlock(&space_info->lock);
66012805 }
66022806
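To make the global reserve top-up above concrete, here is the same arithmetic with illustrative numbers (the sizes are invented for this sketch; the calls are the ones visible in the hunk):

        /* say global_rsv->size = 512M, global_rsv->reserved = 500M, len = 100M */
        to_add = min(len, global_rsv->size - global_rsv->reserved);    /* 12M */
        global_rsv->reserved += to_add;          /* 512M, so global_rsv->full = 1 */
        btrfs_space_info_update_bytes_may_use(fs_info, space_info, to_add);
        len -= to_add;                           /* 88M */
        /* the remaining 88M is what btrfs_try_granting_tickets() can hand out */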
....@@ -6608,19 +2812,16 @@
66082812 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
66092813 {
66102814 struct btrfs_fs_info *fs_info = trans->fs_info;
6611
- struct btrfs_block_group_cache *block_group, *tmp;
2815
+ struct btrfs_block_group *block_group, *tmp;
66122816 struct list_head *deleted_bgs;
66132817 struct extent_io_tree *unpin;
66142818 u64 start;
66152819 u64 end;
66162820 int ret;
66172821
6618
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6619
- unpin = &fs_info->freed_extents[1];
6620
- else
6621
- unpin = &fs_info->freed_extents[0];
2822
+ unpin = &trans->transaction->pinned_extents;
66222823
6623
- while (!trans->aborted) {
2824
+ while (!TRANS_ABORTED(trans)) {
66242825 struct extent_state *cached_state = NULL;
66252826
66262827 mutex_lock(&fs_info->unused_bg_unpin_mutex);
....@@ -6630,8 +2831,11 @@
66302831 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
66312832 break;
66322833 }
2834
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
2835
+ clear_extent_bits(&fs_info->excluded_extents, start,
2836
+ end, EXTENT_UPTODATE);
66332837
6634
- if (btrfs_test_opt(fs_info, DISCARD))
2838
+ if (btrfs_test_opt(fs_info, DISCARD_SYNC))
66352839 ret = btrfs_discard_extent(fs_info, start,
66362840 end + 1 - start, NULL);
66372841
....@@ -6640,6 +2844,11 @@
66402844 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
66412845 free_extent_state(cached_state);
66422846 cond_resched();
2847
+ }
2848
+
2849
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2850
+ btrfs_discard_calc_delay(&fs_info->discard_ctl);
2851
+ btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
66432852 }
66442853
66452854 /*
....@@ -6652,14 +2861,14 @@
66522861 u64 trimmed = 0;
66532862
66542863 ret = -EROFS;
6655
- if (!trans->aborted)
2864
+ if (!TRANS_ABORTED(trans))
66562865 ret = btrfs_discard_extent(fs_info,
6657
- block_group->key.objectid,
6658
- block_group->key.offset,
2866
+ block_group->start,
2867
+ block_group->length,
66592868 &trimmed);
66602869
66612870 list_del_init(&block_group->bg_list);
6662
- btrfs_put_block_group_trimming(block_group);
2871
+ btrfs_unfreeze_block_group(block_group);
66632872 btrfs_put_block_group(block_group);
66642873
66652874 if (ret) {
....@@ -6673,6 +2882,65 @@
66732882 return 0;
66742883 }
66752884
2885
+/*
2886
+ * Drop one or more refs of @node.
2887
+ *
2888
+ * 1. Locate the extent refs.
2889
+ * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
2890
+ * Locate it, then reduce the refs number or remove the ref line completely.
2891
+ *
2892
+ * 2. Update the refs count in EXTENT/METADATA_ITEM
2893
+ *
2894
+ * Inline backref case:
2895
+ *
2896
+ * in extent tree we have:
2897
+ *
2898
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2899
+ * refs 2 gen 6 flags DATA
2900
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
2901
+ * extent data backref root FS_TREE objectid 257 offset 0 count 1
2902
+ *
2903
+ * This function gets called with:
2904
+ *
2905
+ * node->bytenr = 13631488
2906
+ * node->num_bytes = 1048576
2907
+ * root_objectid = FS_TREE
2908
+ * owner_objectid = 257
2909
+ * owner_offset = 0
2910
+ * refs_to_drop = 1
2911
+ *
2912
+ * Then we should get something like:
2913
+ *
2914
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2915
+ * refs 1 gen 6 flags DATA
2916
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
2917
+ *
2918
+ * Keyed backref case:
2919
+ *
2920
+ * in extent tree we have:
2921
+ *
2922
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2923
+ * refs 754 gen 6 flags DATA
2924
+ * [...]
2925
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2926
+ * extent data backref root FS_TREE objectid 866 offset 0 count 1
2927
+ *
2928
+ * This function gets called with:
2929
+ *
2930
+ * node->bytenr = 13631488
2931
+ * node->num_bytes = 1048576
2932
+ * root_objectid = FS_TREE
2933
+ * owner_objectid = 866
2934
+ * owner_offset = 0
2935
+ * refs_to_drop = 1
2936
+ *
2937
+ * Then we should get something like:
2938
+ *
2939
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2940
+ * refs 753 gen 6 flags DATA
2941
+ *
2942
+ * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2943
+ */
66762944 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
66772945 struct btrfs_delayed_ref_node *node, u64 parent,
66782946 u64 root_objectid, u64 owner_objectid,
....@@ -6702,11 +2970,18 @@
67022970 if (!path)
67032971 return -ENOMEM;
67042972
6705
- path->reada = READA_FORWARD;
67062973 path->leave_spinning = 1;
67072974
67082975 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6709
- BUG_ON(!is_data && refs_to_drop != 1);
2976
+
2977
+ if (!is_data && refs_to_drop != 1) {
2978
+ btrfs_crit(info,
2979
+"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
2980
+ node->bytenr, refs_to_drop);
2981
+ ret = -EINVAL;
2982
+ btrfs_abort_transaction(trans, ret);
2983
+ goto out;
2984
+ }
67102985
67112986 if (is_data)
67122987 skinny_metadata = false;
....@@ -6715,6 +2990,13 @@
67152990 parent, root_objectid, owner_objectid,
67162991 owner_offset);
67172992 if (ret == 0) {
2993
+ /*
2994
+ * Either the inline backref or the SHARED_DATA_REF/
2995
+ * SHARED_BLOCK_REF is found
2996
+ *
2997
+ * Here is a quick path to locate EXTENT/METADATA_ITEM.
2998
+ * It's possible the EXTENT/METADATA_ITEM is near current slot.
2999
+ */
67183000 extent_slot = path->slots[0];
67193001 while (extent_slot >= 0) {
67203002 btrfs_item_key_to_cpu(path->nodes[0], &key,
....@@ -6731,13 +3013,21 @@
67313013 found_extent = 1;
67323014 break;
67333015 }
3016
+
3017
+ /* Quick path didn't find the EXTENT/METADATA_ITEM */
67343018 if (path->slots[0] - extent_slot > 5)
67353019 break;
67363020 extent_slot--;
67373021 }
67383022
67393023 if (!found_extent) {
6740
- BUG_ON(iref);
3024
+ if (iref) {
3025
+ btrfs_crit(info,
3026
+"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
3027
+ btrfs_abort_transaction(trans, -EUCLEAN);
3028
+ goto err_dump;
3029
+ }
3030
+ /* Must be SHARED_* item, remove the backref first */
67413031 ret = remove_extent_backref(trans, path, NULL,
67423032 refs_to_drop,
67433033 is_data, &last_ref);
....@@ -6748,6 +3038,7 @@
67483038 btrfs_release_path(path);
67493039 path->leave_spinning = 1;
67503040
3041
+ /* Slow path to locate EXTENT/METADATA_ITEM */
67513042 key.objectid = bytenr;
67523043 key.type = BTRFS_EXTENT_ITEM_KEY;
67533044 key.offset = num_bytes;
....@@ -6822,19 +3113,26 @@
68223113 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
68233114 key.type == BTRFS_EXTENT_ITEM_KEY) {
68243115 struct btrfs_tree_block_info *bi;
6825
- BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
3116
+ if (item_size < sizeof(*ei) + sizeof(*bi)) {
3117
+ btrfs_crit(info,
3118
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
3119
+ key.objectid, key.type, key.offset,
3120
+ owner_objectid, item_size,
3121
+ sizeof(*ei) + sizeof(*bi));
3122
+ btrfs_abort_transaction(trans, -EUCLEAN);
3123
+ goto err_dump;
3124
+ }
68263125 bi = (struct btrfs_tree_block_info *)(ei + 1);
68273126 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
68283127 }
68293128
68303129 refs = btrfs_extent_refs(leaf, ei);
68313130 if (refs < refs_to_drop) {
6832
- btrfs_err(info,
6833
- "trying to drop %d refs but we only have %Lu for bytenr %Lu",
3131
+ btrfs_crit(info,
3132
+ "trying to drop %d refs but we only have %llu for bytenr %llu",
68343133 refs_to_drop, refs, bytenr);
6835
- ret = -EINVAL;
6836
- btrfs_abort_transaction(trans, ret);
6837
- goto out;
3134
+ btrfs_abort_transaction(trans, -EUCLEAN);
3135
+ goto err_dump;
68383136 }
68393137 refs -= refs_to_drop;
68403138
....@@ -6846,7 +3144,12 @@
68463144 * be updated by remove_extent_backref
68473145 */
68483146 if (iref) {
6849
- BUG_ON(!found_extent);
3147
+ if (!found_extent) {
3148
+ btrfs_crit(info,
3149
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
3150
+ btrfs_abort_transaction(trans, -EUCLEAN);
3151
+ goto err_dump;
3152
+ }
68503153 } else {
68513154 btrfs_set_extent_refs(leaf, ei, refs);
68523155 btrfs_mark_buffer_dirty(leaf);
....@@ -6861,13 +3164,39 @@
68613164 }
68623165 }
68633166 } else {
3167
+ /* In this branch refs == 1 */
68643168 if (found_extent) {
6865
- BUG_ON(is_data && refs_to_drop !=
6866
- extent_data_ref_count(path, iref));
3169
+ if (is_data && refs_to_drop !=
3170
+ extent_data_ref_count(path, iref)) {
3171
+ btrfs_crit(info,
3172
+ "invalid refs_to_drop, current refs %u refs_to_drop %u",
3173
+ extent_data_ref_count(path, iref),
3174
+ refs_to_drop);
3175
+ btrfs_abort_transaction(trans, -EUCLEAN);
3176
+ goto err_dump;
3177
+ }
68673178 if (iref) {
6868
- BUG_ON(path->slots[0] != extent_slot);
3179
+ if (path->slots[0] != extent_slot) {
3180
+ btrfs_crit(info,
3181
+"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
3182
+ key.objectid, key.type,
3183
+ key.offset);
3184
+ btrfs_abort_transaction(trans, -EUCLEAN);
3185
+ goto err_dump;
3186
+ }
68693187 } else {
6870
- BUG_ON(path->slots[0] != extent_slot + 1);
3188
+ /*
3189
+ * No inline ref, we must be at a SHARED_* item,
3190
+ * and it's a single ref, it must be:
3191
+ * | extent_slot ||extent_slot + 1|
3192
+ * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3193
+ */
3194
+ if (path->slots[0] != extent_slot + 1) {
3195
+ btrfs_crit(info,
3196
+ "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
3197
+ btrfs_abort_transaction(trans, -EUCLEAN);
3198
+ goto err_dump;
3199
+ }
68713200 path->slots[0] = extent_slot;
68723201 num_to_del = 2;
68733202 }
....@@ -6897,7 +3226,7 @@
68973226 goto out;
68983227 }
68993228
6900
- ret = update_block_group(trans, info, bytenr, num_bytes, 0);
3229
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
69013230 if (ret) {
69023231 btrfs_abort_transaction(trans, ret);
69033232 goto out;
....@@ -6908,6 +3237,19 @@
69083237 out:
69093238 btrfs_free_path(path);
69103239 return ret;
3240
+err_dump:
3241
+ /*
3242
+ * Leaf dump can take up a lot of log buffer, so we only do full leaf
3243
+ * dump for debug build.
3244
+ */
3245
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
3246
+ btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
3247
+ path->slots[0], extent_slot);
3248
+ btrfs_print_leaf(path->nodes[0]);
3249
+ }
3250
+
3251
+ btrfs_free_path(path);
3252
+ return -EUCLEAN;
69113253 }
69123254
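Taken together, the hunks above convert the BUG_ON() sanity checks in __btrfs_free_extent() into recoverable errors. Most of the converted checks follow the same shape; this instance is copied from the hunks above:

        if (!found_extent) {
                btrfs_crit(info,
        "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
                btrfs_abort_transaction(trans, -EUCLEAN);
                goto err_dump;  /* dumps the leaf under CONFIG_BTRFS_DEBUG, returns -EUCLEAN */
        }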
69133255 /*
....@@ -6930,15 +3272,11 @@
69303272 goto out_delayed_unlock;
69313273
69323274 spin_lock(&head->lock);
6933
- if (!RB_EMPTY_ROOT(&head->ref_tree))
3275
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
69343276 goto out;
69353277
6936
- if (head->extent_op) {
6937
- if (!head->must_insert_reserved)
6938
- goto out;
6939
- btrfs_free_delayed_extent_op(head->extent_op);
6940
- head->extent_op = NULL;
6941
- }
3278
+ if (cleanup_extent_op(head) != NULL)
3279
+ goto out;
69423280
69433281 /*
69443282 * waiting for the lock here would deadlock. If someone else has it
....@@ -6947,22 +3285,9 @@
69473285 if (!mutex_trylock(&head->mutex))
69483286 goto out;
69493287
6950
- /*
6951
- * at this point we have a head with no other entries. Go
6952
- * ahead and process it.
6953
- */
6954
- rb_erase(&head->href_node, &delayed_refs->href_root);
6955
- RB_CLEAR_NODE(&head->href_node);
6956
- atomic_dec(&delayed_refs->num_entries);
6957
-
6958
- /*
6959
- * we don't take a ref on the node because we're removing it from the
6960
- * tree, so we just steal the ref the tree was holding.
6961
- */
6962
- delayed_refs->num_heads--;
6963
- if (head->processing == 0)
6964
- delayed_refs->num_heads_ready--;
3288
+ btrfs_delete_ref_head(delayed_refs, head);
69653289 head->processing = 0;
3290
+
69663291 spin_unlock(&head->lock);
69673292 spin_unlock(&delayed_refs->lock);
69683293
....@@ -6970,6 +3295,7 @@
69703295 if (head->must_insert_reserved)
69713296 ret = 1;
69723297
3298
+ btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
69733299 mutex_unlock(&head->mutex);
69743300 btrfs_put_delayed_ref_head(head);
69753301 return ret;
....@@ -6987,28 +3313,22 @@
69873313 u64 parent, int last_ref)
69883314 {
69893315 struct btrfs_fs_info *fs_info = root->fs_info;
6990
- int pin = 1;
3316
+ struct btrfs_ref generic_ref = { 0 };
69913317 int ret;
69923318
6993
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6994
- int old_ref_mod, new_ref_mod;
3319
+ btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3320
+ buf->start, buf->len, parent);
3321
+ btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3322
+ root->root_key.objectid);
69953323
6996
- btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
6997
- root->root_key.objectid,
6998
- btrfs_header_level(buf), 0,
6999
- BTRFS_DROP_DELAYED_REF);
7000
- ret = btrfs_add_delayed_tree_ref(trans, buf->start,
7001
- buf->len, parent,
7002
- root->root_key.objectid,
7003
- btrfs_header_level(buf),
7004
- BTRFS_DROP_DELAYED_REF, NULL,
7005
- &old_ref_mod, &new_ref_mod);
3324
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3325
+ btrfs_ref_tree_mod(fs_info, &generic_ref);
3326
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
70063327 BUG_ON(ret); /* -ENOMEM */
7007
- pin = old_ref_mod >= 0 && new_ref_mod < 0;
70083328 }
70093329
70103330 if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7011
- struct btrfs_block_group_cache *cache;
3331
+ struct btrfs_block_group *cache;
70123332
70133333 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
70143334 ret = check_ref_cleanup(trans, buf->start);
....@@ -7016,12 +3336,10 @@
70163336 goto out;
70173337 }
70183338
7019
- pin = 0;
70203339 cache = btrfs_lookup_block_group(fs_info, buf->start);
70213340
70223341 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7023
- pin_down_extent(fs_info, cache, buf->start,
7024
- buf->len, 1);
3342
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
70253343 btrfs_put_block_group(cache);
70263344 goto out;
70273345 }
....@@ -7034,10 +3352,6 @@
70343352 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
70353353 }
70363354 out:
7037
- if (pin)
7038
- add_pinned_bytes(fs_info, buf->len, true,
7039
- root->root_key.objectid);
7040
-
70413355 if (last_ref) {
70423356 /*
70433357 * Deleting the buffer, clear the corrupt flag since it doesn't
....@@ -7048,120 +3362,56 @@
70483362 }
70493363
70503364 /* Can return -ENOMEM */
7051
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
7052
- struct btrfs_root *root,
7053
- u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7054
- u64 owner, u64 offset)
3365
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
70553366 {
7056
- struct btrfs_fs_info *fs_info = root->fs_info;
7057
- int old_ref_mod, new_ref_mod;
3367
+ struct btrfs_fs_info *fs_info = trans->fs_info;
70583368 int ret;
70593369
70603370 if (btrfs_is_testing(fs_info))
70613371 return 0;
70623372
7063
- if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
7064
- btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
7065
- root_objectid, owner, offset,
7066
- BTRFS_DROP_DELAYED_REF);
7067
-
70683373 /*
70693374 * tree log blocks never actually go into the extent allocation
70703375 * tree, just update pinning info and exit early.
70713376 */
7072
- if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7073
- WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
3377
+ if ((ref->type == BTRFS_REF_METADATA &&
3378
+ ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3379
+ (ref->type == BTRFS_REF_DATA &&
3380
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
70743381 /* unlocks the pinned mutex */
7075
- btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7076
- old_ref_mod = new_ref_mod = 0;
3382
+ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
70773383 ret = 0;
7078
- } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7079
- ret = btrfs_add_delayed_tree_ref(trans, bytenr,
7080
- num_bytes, parent,
7081
- root_objectid, (int)owner,
7082
- BTRFS_DROP_DELAYED_REF, NULL,
7083
- &old_ref_mod, &new_ref_mod);
3384
+ } else if (ref->type == BTRFS_REF_METADATA) {
3385
+ ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
70843386 } else {
7085
- ret = btrfs_add_delayed_data_ref(trans, bytenr,
7086
- num_bytes, parent,
7087
- root_objectid, owner, offset,
7088
- 0, BTRFS_DROP_DELAYED_REF,
7089
- &old_ref_mod, &new_ref_mod);
3387
+ ret = btrfs_add_delayed_data_ref(trans, ref, 0);
70903388 }
70913389
7092
- if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
7093
- bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
3390
+ if (!((ref->type == BTRFS_REF_METADATA &&
3391
+ ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3392
+ (ref->type == BTRFS_REF_DATA &&
3393
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
3394
+ btrfs_ref_tree_mod(fs_info, ref);
70943395
7095
- add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
7096
- }
7097
-
7098
- return ret;
7099
-}
7100
-
7101
-/*
7102
- * when we wait for progress in the block group caching, its because
7103
- * our allocation attempt failed at least once. So, we must sleep
7104
- * and let some progress happen before we try again.
7105
- *
7106
- * This function will sleep at least once waiting for new free space to
7107
- * show up, and then it will check the block group free space numbers
7108
- * for our min num_bytes. Another option is to have it go ahead
7109
- * and look in the rbtree for a free extent of a given size, but this
7110
- * is a good start.
7111
- *
7112
- * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7113
- * any of the information in this block group.
7114
- */
7115
-static noinline void
7116
-wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7117
- u64 num_bytes)
7118
-{
7119
- struct btrfs_caching_control *caching_ctl;
7120
-
7121
- caching_ctl = get_caching_control(cache);
7122
- if (!caching_ctl)
7123
- return;
7124
-
7125
- wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7126
- (cache->free_space_ctl->free_space >= num_bytes));
7127
-
7128
- put_caching_control(caching_ctl);
7129
-}
7130
-
7131
-static noinline int
7132
-wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7133
-{
7134
- struct btrfs_caching_control *caching_ctl;
7135
- int ret = 0;
7136
-
7137
- caching_ctl = get_caching_control(cache);
7138
- if (!caching_ctl)
7139
- return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7140
-
7141
- wait_event(caching_ctl->wait, block_group_cache_done(cache));
7142
- if (cache->cached == BTRFS_CACHE_ERROR)
7143
- ret = -EIO;
7144
- put_caching_control(caching_ctl);
71453396 return ret;
71463397 }
71473398
71483399 enum btrfs_loop_type {
7149
- LOOP_CACHING_NOWAIT = 0,
7150
- LOOP_CACHING_WAIT = 1,
7151
- LOOP_ALLOC_CHUNK = 2,
7152
- LOOP_NO_EMPTY_SIZE = 3,
3400
+ LOOP_CACHING_NOWAIT,
3401
+ LOOP_CACHING_WAIT,
3402
+ LOOP_ALLOC_CHUNK,
3403
+ LOOP_NO_EMPTY_SIZE,
71533404 };
71543405
71553406 static inline void
7156
-btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
3407
+btrfs_lock_block_group(struct btrfs_block_group *cache,
71573408 int delalloc)
71583409 {
71593410 if (delalloc)
71603411 down_read(&cache->data_rwsem);
71613412 }
71623413
7163
-static inline void
7164
-btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
3414
+static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
71653415 int delalloc)
71663416 {
71673417 btrfs_get_block_group(cache);
....@@ -7169,12 +3419,13 @@
71693419 down_read(&cache->data_rwsem);
71703420 }
71713421
7172
-static struct btrfs_block_group_cache *
7173
-btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
3422
+static struct btrfs_block_group *btrfs_lock_cluster(
3423
+ struct btrfs_block_group *block_group,
71743424 struct btrfs_free_cluster *cluster,
71753425 int delalloc)
3426
+ __acquires(&cluster->refill_lock)
71763427 {
7177
- struct btrfs_block_group_cache *used_bg = NULL;
3428
+ struct btrfs_block_group *used_bg = NULL;
71783429
71793430 spin_lock(&cluster->refill_lock);
71803431 while (1) {
....@@ -7208,12 +3459,503 @@
72083459 }
72093460
72103461 static inline void
7211
-btrfs_release_block_group(struct btrfs_block_group_cache *cache,
3462
+btrfs_release_block_group(struct btrfs_block_group *cache,
72123463 int delalloc)
72133464 {
72143465 if (delalloc)
72153466 up_read(&cache->data_rwsem);
72163467 btrfs_put_block_group(cache);
3468
+}
3469
+
3470
+enum btrfs_extent_allocation_policy {
3471
+ BTRFS_EXTENT_ALLOC_CLUSTERED,
3472
+};
3473
+
3474
+/*
3475
+ * Structure used internally for find_free_extent() function. Wraps needed
3476
+ * parameters.
3477
+ */
3478
+struct find_free_extent_ctl {
3479
+ /* Basic allocation info */
3480
+ u64 num_bytes;
3481
+ u64 empty_size;
3482
+ u64 flags;
3483
+ int delalloc;
3484
+
3485
+ /* Where to start the search inside the bg */
3486
+ u64 search_start;
3487
+
3488
+ /* For clustered allocation */
3489
+ u64 empty_cluster;
3490
+ struct btrfs_free_cluster *last_ptr;
3491
+ bool use_cluster;
3492
+
3493
+ bool have_caching_bg;
3494
+ bool orig_have_caching_bg;
3495
+
3496
+ /* RAID index, converted from flags */
3497
+ int index;
3498
+
3499
+ /*
3500
+ * Current loop number, check find_free_extent_update_loop() for details
3501
+ */
3502
+ int loop;
3503
+
3504
+ /*
3505
+ * Whether we're refilling a cluster, if true we need to re-search
3506
+ * current block group but don't try to refill the cluster again.
3507
+ */
3508
+ bool retry_clustered;
3509
+
3510
+ /*
3511
+ * Whether we're updating free space cache, if true we need to re-search
3512
+ * current block group but don't try updating free space cache again.
3513
+ */
3514
+ bool retry_unclustered;
3515
+
3516
+ /* If current block group is cached */
3517
+ int cached;
3518
+
3519
+ /* Max contiguous hole found */
3520
+ u64 max_extent_size;
3521
+
3522
+ /* Total free space from free space cache, not always contiguous */
3523
+ u64 total_free_space;
3524
+
3525
+ /* Found result */
3526
+ u64 found_offset;
3527
+
3528
+ /* Hint where to start looking for an empty space */
3529
+ u64 hint_byte;
3530
+
3531
+ /* Allocation policy */
3532
+ enum btrfs_extent_allocation_policy policy;
3533
+};
3534
+
3535
+
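For orientation, the find_free_extent() hunk further down fills this control structure in before the search loop starts; condensed from that hunk:

        struct find_free_extent_ctl ffe_ctl = {0};

        ffe_ctl.num_bytes = num_bytes;
        ffe_ctl.empty_size = empty_size;
        ffe_ctl.flags = flags;
        ffe_ctl.delalloc = delalloc;
        ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
        ffe_ctl.hint_byte = hint_byte_orig;
        ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
        /* clustered allocation state */
        ffe_ctl.last_ptr = NULL;
        ffe_ctl.use_cluster = true;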
3536
+/*
3537
+ * Helper function for find_free_extent().
3538
+ *
3539
+ * Return -ENOENT to inform caller that we need to fall back to unclustered mode.
3540
+ * Return -EAGAIN to inform caller that we need to re-search this block group
3541
+ * Return >0 to inform caller that we find nothing
3542
+ * Return 0 means we have found a location and set ffe_ctl->found_offset.
3543
+ */
3544
+static int find_free_extent_clustered(struct btrfs_block_group *bg,
3545
+ struct find_free_extent_ctl *ffe_ctl,
3546
+ struct btrfs_block_group **cluster_bg_ret)
3547
+{
3548
+ struct btrfs_block_group *cluster_bg;
3549
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3550
+ u64 aligned_cluster;
3551
+ u64 offset;
3552
+ int ret;
3553
+
3554
+ cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3555
+ if (!cluster_bg)
3556
+ goto refill_cluster;
3557
+ if (cluster_bg != bg && (cluster_bg->ro ||
3558
+ !block_group_bits(cluster_bg, ffe_ctl->flags)))
3559
+ goto release_cluster;
3560
+
3561
+ offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3562
+ ffe_ctl->num_bytes, cluster_bg->start,
3563
+ &ffe_ctl->max_extent_size);
3564
+ if (offset) {
3565
+ /* We have a block, we're done */
3566
+ spin_unlock(&last_ptr->refill_lock);
3567
+ trace_btrfs_reserve_extent_cluster(cluster_bg,
3568
+ ffe_ctl->search_start, ffe_ctl->num_bytes);
3569
+ *cluster_bg_ret = cluster_bg;
3570
+ ffe_ctl->found_offset = offset;
3571
+ return 0;
3572
+ }
3573
+ WARN_ON(last_ptr->block_group != cluster_bg);
3574
+
3575
+release_cluster:
3576
+ /*
3577
+ * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3578
+ * lets just skip it and let the allocator find whatever block it can
3579
+ * find. If we reach this point, we will have tried the cluster
3580
+ * allocator plenty of times and not have found anything, so we are
3581
+ * likely way too fragmented for the clustering stuff to find anything.
3582
+ *
3583
+ * However, if the cluster is taken from the current block group,
3584
+ * release the cluster first, so that we stand a better chance of
3585
+ * succeeding in the unclustered allocation.
3586
+ */
3587
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3588
+ spin_unlock(&last_ptr->refill_lock);
3589
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3590
+ return -ENOENT;
3591
+ }
3592
+
3593
+ /* This cluster didn't work out, free it and start over */
3594
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
3595
+
3596
+ if (cluster_bg != bg)
3597
+ btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3598
+
3599
+refill_cluster:
3600
+ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3601
+ spin_unlock(&last_ptr->refill_lock);
3602
+ return -ENOENT;
3603
+ }
3604
+
3605
+ aligned_cluster = max_t(u64,
3606
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3607
+ bg->full_stripe_len);
3608
+ ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3609
+ ffe_ctl->num_bytes, aligned_cluster);
3610
+ if (ret == 0) {
3611
+ /* Now pull our allocation out of this cluster */
3612
+ offset = btrfs_alloc_from_cluster(bg, last_ptr,
3613
+ ffe_ctl->num_bytes, ffe_ctl->search_start,
3614
+ &ffe_ctl->max_extent_size);
3615
+ if (offset) {
3616
+ /* We found one, proceed */
3617
+ spin_unlock(&last_ptr->refill_lock);
3618
+ trace_btrfs_reserve_extent_cluster(bg,
3619
+ ffe_ctl->search_start,
3620
+ ffe_ctl->num_bytes);
3621
+ ffe_ctl->found_offset = offset;
3622
+ return 0;
3623
+ }
3624
+ } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
3625
+ !ffe_ctl->retry_clustered) {
3626
+ spin_unlock(&last_ptr->refill_lock);
3627
+
3628
+ ffe_ctl->retry_clustered = true;
3629
+ btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3630
+ ffe_ctl->empty_cluster + ffe_ctl->empty_size);
3631
+ return -EAGAIN;
3632
+ }
3633
+ /*
3634
+ * At this point we either didn't find a cluster or we weren't able to
3635
+ * allocate a block from our cluster. Free the cluster we've been
3636
+ * trying to use, and go to the next block group.
3637
+ */
3638
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
3639
+ spin_unlock(&last_ptr->refill_lock);
3640
+ return 1;
3641
+}
3642
+
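The -ENOENT return above is what pushes the caller onto the unclustered path; condensed from the do_allocation_clustered() hunk below:

        if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
                ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
                if (ret >= 0 || ret == -EAGAIN)
                        return ret;
                /* ret == -ENOENT case falls through */
        }
        return find_free_extent_unclustered(block_group, ffe_ctl);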
3643
+/*
3644
+ * Return >0 to inform caller that we find nothing
3645
+ * Return 0 when we found a free extent and set ffe_ctl->found_offset
3646
+ * Return -EAGAIN to inform caller that we need to re-search this block group
3647
+ */
3648
+static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3649
+ struct find_free_extent_ctl *ffe_ctl)
3650
+{
3651
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3652
+ u64 offset;
3653
+
3654
+ /*
3655
+ * We are doing an unclustered allocation, set the fragmented flag so
3656
+ * we don't bother trying to setup a cluster again until we get more
3657
+ * space.
3658
+ */
3659
+ if (unlikely(last_ptr)) {
3660
+ spin_lock(&last_ptr->lock);
3661
+ last_ptr->fragmented = 1;
3662
+ spin_unlock(&last_ptr->lock);
3663
+ }
3664
+ if (ffe_ctl->cached) {
3665
+ struct btrfs_free_space_ctl *free_space_ctl;
3666
+
3667
+ free_space_ctl = bg->free_space_ctl;
3668
+ spin_lock(&free_space_ctl->tree_lock);
3669
+ if (free_space_ctl->free_space <
3670
+ ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3671
+ ffe_ctl->empty_size) {
3672
+ ffe_ctl->total_free_space = max_t(u64,
3673
+ ffe_ctl->total_free_space,
3674
+ free_space_ctl->free_space);
3675
+ spin_unlock(&free_space_ctl->tree_lock);
3676
+ return 1;
3677
+ }
3678
+ spin_unlock(&free_space_ctl->tree_lock);
3679
+ }
3680
+
3681
+ offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3682
+ ffe_ctl->num_bytes, ffe_ctl->empty_size,
3683
+ &ffe_ctl->max_extent_size);
3684
+
3685
+ /*
3686
+ * If we didn't find a chunk, and we haven't failed on this block group
3687
+ * before, and this block group is in the middle of caching and we are
3688
+ * ok with waiting, then go ahead and wait for progress to be made, and
3689
+ * set @retry_unclustered to true.
3690
+ *
3691
+ * If @retry_unclustered is true then we've already waited on this
3692
+ * block group once and should move on to the next block group.
3693
+ */
3694
+ if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
3695
+ ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
3696
+ btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3697
+ ffe_ctl->empty_size);
3698
+ ffe_ctl->retry_unclustered = true;
3699
+ return -EAGAIN;
3700
+ } else if (!offset) {
3701
+ return 1;
3702
+ }
3703
+ ffe_ctl->found_offset = offset;
3704
+ return 0;
3705
+}
3706
+
3707
+static int do_allocation_clustered(struct btrfs_block_group *block_group,
3708
+ struct find_free_extent_ctl *ffe_ctl,
3709
+ struct btrfs_block_group **bg_ret)
3710
+{
3711
+ int ret;
3712
+
3713
+ /* We want to try and use the cluster allocator, so lets look there */
3714
+ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3715
+ ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3716
+ if (ret >= 0 || ret == -EAGAIN)
3717
+ return ret;
3718
+ /* ret == -ENOENT case falls through */
3719
+ }
3720
+
3721
+ return find_free_extent_unclustered(block_group, ffe_ctl);
3722
+}
3723
+
3724
+static int do_allocation(struct btrfs_block_group *block_group,
3725
+ struct find_free_extent_ctl *ffe_ctl,
3726
+ struct btrfs_block_group **bg_ret)
3727
+{
3728
+ switch (ffe_ctl->policy) {
3729
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3730
+ return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3731
+ default:
3732
+ BUG();
3733
+ }
3734
+}
3735
+
3736
+static void release_block_group(struct btrfs_block_group *block_group,
3737
+ struct find_free_extent_ctl *ffe_ctl,
3738
+ int delalloc)
3739
+{
3740
+ switch (ffe_ctl->policy) {
3741
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3742
+ ffe_ctl->retry_clustered = false;
3743
+ ffe_ctl->retry_unclustered = false;
3744
+ break;
3745
+ default:
3746
+ BUG();
3747
+ }
3748
+
3749
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
3750
+ ffe_ctl->index);
3751
+ btrfs_release_block_group(block_group, delalloc);
3752
+}
3753
+
3754
+static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
3755
+ struct btrfs_key *ins)
3756
+{
3757
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3758
+
3759
+ if (!ffe_ctl->use_cluster && last_ptr) {
3760
+ spin_lock(&last_ptr->lock);
3761
+ last_ptr->window_start = ins->objectid;
3762
+ spin_unlock(&last_ptr->lock);
3763
+ }
3764
+}
3765
+
3766
+static void found_extent(struct find_free_extent_ctl *ffe_ctl,
3767
+ struct btrfs_key *ins)
3768
+{
3769
+ switch (ffe_ctl->policy) {
3770
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3771
+ found_extent_clustered(ffe_ctl, ins);
3772
+ break;
3773
+ default:
3774
+ BUG();
3775
+ }
3776
+}
3777
+
3778
+static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
3779
+{
3780
+ switch (ffe_ctl->policy) {
3781
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3782
+ /*
3783
+ * If we can't allocate a new chunk we've already looped through
3784
+ * at least once, move on to the NO_EMPTY_SIZE case.
3785
+ */
3786
+ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
3787
+ return 0;
3788
+ default:
3789
+ BUG();
3790
+ }
3791
+}
3792
+
3793
+/*
3794
+ * Return >0 means caller needs to re-search for free extent
3795
+ * Return 0 means we have the needed free extent.
3796
+ * Return <0 means we failed to locate any free extent.
3797
+ */
3798
+static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
3799
+ struct btrfs_key *ins,
3800
+ struct find_free_extent_ctl *ffe_ctl,
3801
+ bool full_search)
3802
+{
3803
+ struct btrfs_root *root = fs_info->extent_root;
3804
+ int ret;
3805
+
3806
+ if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
3807
+ ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
3808
+ ffe_ctl->orig_have_caching_bg = true;
3809
+
3810
+ if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
3811
+ ffe_ctl->have_caching_bg)
3812
+ return 1;
3813
+
3814
+ if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
3815
+ return 1;
3816
+
3817
+ if (ins->objectid) {
3818
+ found_extent(ffe_ctl, ins);
3819
+ return 0;
3820
+ }
3821
+
3822
+ /*
3823
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
3824
+ * caching kthreads as we move along
3825
+ * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3826
+ * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3827
+ * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3828
+ * again
3829
+ */
3830
+ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
3831
+ ffe_ctl->index = 0;
3832
+ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
3833
+ /*
3834
+ * We want to skip the LOOP_CACHING_WAIT step if we
3835
+ * don't have any uncached bgs and we've already done a
3836
+ * full search through.
3837
+ */
3838
+ if (ffe_ctl->orig_have_caching_bg || !full_search)
3839
+ ffe_ctl->loop = LOOP_CACHING_WAIT;
3840
+ else
3841
+ ffe_ctl->loop = LOOP_ALLOC_CHUNK;
3842
+ } else {
3843
+ ffe_ctl->loop++;
3844
+ }
3845
+
3846
+ if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
3847
+ struct btrfs_trans_handle *trans;
3848
+ int exist = 0;
3849
+
3850
+ trans = current->journal_info;
3851
+ if (trans)
3852
+ exist = 1;
3853
+ else
3854
+ trans = btrfs_join_transaction(root);
3855
+
3856
+ if (IS_ERR(trans)) {
3857
+ ret = PTR_ERR(trans);
3858
+ return ret;
3859
+ }
3860
+
3861
+ ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
3862
+ CHUNK_ALLOC_FORCE);
3863
+
3864
+ /* Do not bail out on ENOSPC since we can do more. */
3865
+ if (ret == -ENOSPC)
3866
+ ret = chunk_allocation_failed(ffe_ctl);
3867
+ else if (ret < 0)
3868
+ btrfs_abort_transaction(trans, ret);
3869
+ else
3870
+ ret = 0;
3871
+ if (!exist)
3872
+ btrfs_end_transaction(trans);
3873
+ if (ret)
3874
+ return ret;
3875
+ }
3876
+
3877
+ if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
3878
+ if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
3879
+ return -ENOSPC;
3880
+
3881
+ /*
3882
+ * Don't loop again if we already have no empty_size and
3883
+ * no empty_cluster.
3884
+ */
3885
+ if (ffe_ctl->empty_size == 0 &&
3886
+ ffe_ctl->empty_cluster == 0)
3887
+ return -ENOSPC;
3888
+ ffe_ctl->empty_size = 0;
3889
+ ffe_ctl->empty_cluster = 0;
3890
+ }
3891
+ return 1;
3892
+ }
3893
+ return -ENOSPC;
3894
+}
3895
+
3896
+static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
3897
+ struct find_free_extent_ctl *ffe_ctl,
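As a rough map of the loop state machine above (derived from the branches in find_free_extent_update_loop(); worth double-checking against the code):

        /*
         * Nothing found, first pass was a full search with no uncached bgs:
         *   LOOP_CACHING_NOWAIT -> LOOP_ALLOC_CHUNK -> LOOP_NO_EMPTY_SIZE -> -ENOSPC
         * Otherwise:
         *   LOOP_CACHING_NOWAIT -> LOOP_CACHING_WAIT -> LOOP_ALLOC_CHUNK ->
         *   LOOP_NO_EMPTY_SIZE (empty_size/empty_cluster cleared, one more pass) -> -ENOSPC
         */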
3898
+ struct btrfs_space_info *space_info,
3899
+ struct btrfs_key *ins)
3900
+{
3901
+ /*
3902
+ * If our free space is heavily fragmented we may not be able to make
3903
+ * big contiguous allocations, so instead of doing the expensive search
3904
+ * for free space, simply return ENOSPC with our max_extent_size so we
3905
+ * can go ahead and search for a more manageable chunk.
3906
+ *
3907
+ * If our max_extent_size is large enough for our allocation simply
3908
+ * disable clustering since we will likely not be able to find enough
3909
+ * space to create a cluster and induce latency trying.
3910
+ */
3911
+ if (space_info->max_extent_size) {
3912
+ spin_lock(&space_info->lock);
3913
+ if (space_info->max_extent_size &&
3914
+ ffe_ctl->num_bytes > space_info->max_extent_size) {
3915
+ ins->offset = space_info->max_extent_size;
3916
+ spin_unlock(&space_info->lock);
3917
+ return -ENOSPC;
3918
+ } else if (space_info->max_extent_size) {
3919
+ ffe_ctl->use_cluster = false;
3920
+ }
3921
+ spin_unlock(&space_info->lock);
3922
+ }
3923
+
3924
+ ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
3925
+ &ffe_ctl->empty_cluster);
3926
+ if (ffe_ctl->last_ptr) {
3927
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3928
+
3929
+ spin_lock(&last_ptr->lock);
3930
+ if (last_ptr->block_group)
3931
+ ffe_ctl->hint_byte = last_ptr->window_start;
3932
+ if (last_ptr->fragmented) {
3933
+ /*
3934
+ * We still set window_start so we can keep track of the
3935
+ * last place we found an allocation to try and save
3936
+ * some time.
3937
+ */
3938
+ ffe_ctl->hint_byte = last_ptr->window_start;
3939
+ ffe_ctl->use_cluster = false;
3940
+ }
3941
+ spin_unlock(&last_ptr->lock);
3942
+ }
3943
+
3944
+ return 0;
3945
+}
3946
+
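Illustrative numbers for the max_extent_size short-circuit above (sizes invented for this sketch):

        /*
         * space_info->max_extent_size == 1M, ffe_ctl->num_bytes == 4M:
         *   skip the search, set ins->offset to 1M and return -ENOSPC so the
         *   caller can retry with a smaller request.
         * space_info->max_extent_size == 1M, ffe_ctl->num_bytes == 256K:
         *   search normally, but with ffe_ctl->use_cluster set to false.
         */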
3947
+static int prepare_allocation(struct btrfs_fs_info *fs_info,
3948
+ struct find_free_extent_ctl *ffe_ctl,
3949
+ struct btrfs_space_info *space_info,
3950
+ struct btrfs_key *ins)
3951
+{
3952
+ switch (ffe_ctl->policy) {
3953
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
3954
+ return prepare_allocation_clustered(fs_info, ffe_ctl,
3955
+ space_info, ins);
3956
+ default:
3957
+ BUG();
3958
+ }
72173959 }
72183960
72193961 /*
....@@ -7226,87 +3968,76 @@
72263968 *
72273969 * If there is no suitable free space, we will record the max size of
72283970 * the free space extent currently.
3971
+ *
3972
+ * The overall logic and call chain:
3973
+ *
3974
+ * find_free_extent()
3975
+ * |- Iterate through all block groups
3976
+ * | |- Get a valid block group
3977
+ * | |- Try to do clustered allocation in that block group
3978
+ * | |- Try to do unclustered allocation in that block group
3979
+ * | |- Check if the result is valid
3980
+ * | | |- If valid, then exit
3981
+ * | |- Jump to next block group
3982
+ * |
3983
+ * |- Push harder to find free extents
3984
+ * |- If not found, re-iterate all block groups
72293985 */
7230
-static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
3986
+static noinline int find_free_extent(struct btrfs_root *root,
72313987 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7232
- u64 hint_byte, struct btrfs_key *ins,
3988
+ u64 hint_byte_orig, struct btrfs_key *ins,
72333989 u64 flags, int delalloc)
72343990 {
3991
+ struct btrfs_fs_info *fs_info = root->fs_info;
72353992 int ret = 0;
7236
- struct btrfs_root *root = fs_info->extent_root;
7237
- struct btrfs_free_cluster *last_ptr = NULL;
7238
- struct btrfs_block_group_cache *block_group = NULL;
7239
- u64 search_start = 0;
7240
- u64 max_extent_size = 0;
7241
- u64 max_free_space = 0;
7242
- u64 empty_cluster = 0;
3993
+ int cache_block_group_error = 0;
3994
+ struct btrfs_block_group *block_group = NULL;
3995
+ struct find_free_extent_ctl ffe_ctl = {0};
72433996 struct btrfs_space_info *space_info;
7244
- int loop = 0;
7245
- int index = btrfs_bg_flags_to_raid_index(flags);
7246
- bool failed_cluster_refill = false;
7247
- bool failed_alloc = false;
7248
- bool use_cluster = true;
7249
- bool have_caching_bg = false;
7250
- bool orig_have_caching_bg = false;
72513997 bool full_search = false;
72523998
72533999 WARN_ON(num_bytes < fs_info->sectorsize);
4000
+
4001
+ ffe_ctl.num_bytes = num_bytes;
4002
+ ffe_ctl.empty_size = empty_size;
4003
+ ffe_ctl.flags = flags;
4004
+ ffe_ctl.search_start = 0;
4005
+ ffe_ctl.delalloc = delalloc;
4006
+ ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
4007
+ ffe_ctl.have_caching_bg = false;
4008
+ ffe_ctl.orig_have_caching_bg = false;
4009
+ ffe_ctl.found_offset = 0;
4010
+ ffe_ctl.hint_byte = hint_byte_orig;
4011
+ ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4012
+
4013
+ /* For clustered allocation */
4014
+ ffe_ctl.retry_clustered = false;
4015
+ ffe_ctl.retry_unclustered = false;
4016
+ ffe_ctl.last_ptr = NULL;
4017
+ ffe_ctl.use_cluster = true;
4018
+
72544019 ins->type = BTRFS_EXTENT_ITEM_KEY;
72554020 ins->objectid = 0;
72564021 ins->offset = 0;
72574022
7258
- trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
4023
+ trace_find_free_extent(root, num_bytes, empty_size, flags);
72594024
7260
- space_info = __find_space_info(fs_info, flags);
4025
+ space_info = btrfs_find_space_info(fs_info, flags);
72614026 if (!space_info) {
72624027 btrfs_err(fs_info, "No space info for %llu", flags);
72634028 return -ENOSPC;
72644029 }
72654030
7266
- /*
7267
- * If our free space is heavily fragmented we may not be able to make
7268
- * big contiguous allocations, so instead of doing the expensive search
7269
- * for free space, simply return ENOSPC with our max_extent_size so we
7270
- * can go ahead and search for a more manageable chunk.
7271
- *
7272
- * If our max_extent_size is large enough for our allocation simply
7273
- * disable clustering since we will likely not be able to find enough
7274
- * space to create a cluster and induce latency trying.
7275
- */
7276
- if (unlikely(space_info->max_extent_size)) {
7277
- spin_lock(&space_info->lock);
7278
- if (space_info->max_extent_size &&
7279
- num_bytes > space_info->max_extent_size) {
7280
- ins->offset = space_info->max_extent_size;
7281
- spin_unlock(&space_info->lock);
7282
- return -ENOSPC;
7283
- } else if (space_info->max_extent_size) {
7284
- use_cluster = false;
7285
- }
7286
- spin_unlock(&space_info->lock);
7287
- }
4031
+ ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
4032
+ if (ret < 0)
4033
+ return ret;
72884034
7289
- last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
7290
- if (last_ptr) {
7291
- spin_lock(&last_ptr->lock);
7292
- if (last_ptr->block_group)
7293
- hint_byte = last_ptr->window_start;
7294
- if (last_ptr->fragmented) {
7295
- /*
7296
- * We still set window_start so we can keep track of the
7297
- * last place we found an allocation to try and save
7298
- * some time.
7299
- */
7300
- hint_byte = last_ptr->window_start;
7301
- use_cluster = false;
7302
- }
7303
- spin_unlock(&last_ptr->lock);
7304
- }
7305
-
7306
- search_start = max(search_start, first_logical_byte(fs_info, 0));
7307
- search_start = max(search_start, hint_byte);
7308
- if (search_start == hint_byte) {
7309
- block_group = btrfs_lookup_block_group(fs_info, search_start);
4035
+ ffe_ctl.search_start = max(ffe_ctl.search_start,
4036
+ first_logical_byte(fs_info, 0));
4037
+ ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
4038
+ if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
4039
+ block_group = btrfs_lookup_block_group(fs_info,
4040
+ ffe_ctl.search_start);
73104041 /*
73114042 * we don't want to use the block group if it doesn't match our
73124043 * allocation bits, or if its not cached.
....@@ -7328,7 +4059,7 @@
73284059 btrfs_put_block_group(block_group);
73294060 up_read(&space_info->groups_sem);
73304061 } else {
7331
- index = btrfs_bg_flags_to_raid_index(
4062
+ ffe_ctl.index = btrfs_bg_flags_to_raid_index(
73324063 block_group->flags);
73334064 btrfs_lock_block_group(block_group, delalloc);
73344065 goto have_block_group;
....@@ -7338,21 +4069,21 @@
73384069 }
73394070 }
73404071 search:
7341
- have_caching_bg = false;
7342
- if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
4072
+ ffe_ctl.have_caching_bg = false;
4073
+ if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
4074
+ ffe_ctl.index == 0)
73434075 full_search = true;
73444076 down_read(&space_info->groups_sem);
7345
- list_for_each_entry(block_group, &space_info->block_groups[index],
7346
- list) {
7347
- u64 offset;
7348
- int cached;
4077
+ list_for_each_entry(block_group,
4078
+ &space_info->block_groups[ffe_ctl.index], list) {
4079
+ struct btrfs_block_group *bg_ret;
73494080
73504081 /* If the block group is read-only, we can skip it entirely. */
73514082 if (unlikely(block_group->ro))
73524083 continue;
73534084
73544085 btrfs_grab_block_group(block_group, delalloc);
7355
- search_start = block_group->key.objectid;
4086
+ ffe_ctl.search_start = block_group->start;
73564087
73574088 /*
73584089 * this can happen if we end up cycling through all the
....@@ -7361,9 +4092,8 @@
73614092 */
73624093 if (!block_group_bits(block_group, flags)) {
73634094 u64 extra = BTRFS_BLOCK_GROUP_DUP |
7364
- BTRFS_BLOCK_GROUP_RAID1 |
7365
- BTRFS_BLOCK_GROUP_RAID5 |
7366
- BTRFS_BLOCK_GROUP_RAID6 |
4095
+ BTRFS_BLOCK_GROUP_RAID1_MASK |
4096
+ BTRFS_BLOCK_GROUP_RAID56_MASK |
73674097 BTRFS_BLOCK_GROUP_RAID10;
73684098
73694099 /*
....@@ -7384,379 +4114,101 @@
73844114 }
73854115
73864116 have_block_group:
7387
- cached = block_group_cache_done(block_group);
7388
- if (unlikely(!cached)) {
7389
- have_caching_bg = true;
7390
- ret = cache_block_group(block_group, 0);
7391
- BUG_ON(ret < 0);
4117
+ ffe_ctl.cached = btrfs_block_group_done(block_group);
4118
+ if (unlikely(!ffe_ctl.cached)) {
4119
+ ffe_ctl.have_caching_bg = true;
4120
+ ret = btrfs_cache_block_group(block_group, 0);
4121
+
4122
+ /*
4123
+ * If we get ENOMEM here or something else we want to
4124
+ * try other block groups, because it may not be fatal.
4125
+ * However if we can't find anything else we need to
4126
+ * save our return here so that we return the actual
4127
+ * error that caused problems, not ENOSPC.
4128
+ */
4129
+ if (ret < 0) {
4130
+ if (!cache_block_group_error)
4131
+ cache_block_group_error = ret;
4132
+ ret = 0;
4133
+ goto loop;
4134
+ }
73924135 ret = 0;
73934136 }
73944137
73954138 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
73964139 goto loop;
73974140
7398
- /*
7399
- * Ok we want to try and use the cluster allocator, so
7400
- * lets look there
7401
- */
7402
- if (last_ptr && use_cluster) {
7403
- struct btrfs_block_group_cache *used_block_group;
7404
- unsigned long aligned_cluster;
7405
- /*
7406
- * the refill lock keeps out other
7407
- * people trying to start a new cluster
7408
- */
7409
- used_block_group = btrfs_lock_cluster(block_group,
7410
- last_ptr,
7411
- delalloc);
7412
- if (!used_block_group)
7413
- goto refill_cluster;
7414
-
7415
- if (used_block_group != block_group &&
7416
- (used_block_group->ro ||
7417
- !block_group_bits(used_block_group, flags)))
7418
- goto release_cluster;
7419
-
7420
- offset = btrfs_alloc_from_cluster(used_block_group,
7421
- last_ptr,
7422
- num_bytes,
7423
- used_block_group->key.objectid,
7424
- &max_extent_size);
7425
- if (offset) {
7426
- /* we have a block, we're done */
7427
- spin_unlock(&last_ptr->refill_lock);
7428
- trace_btrfs_reserve_extent_cluster(
7429
- used_block_group,
7430
- search_start, num_bytes);
7431
- if (used_block_group != block_group) {
7432
- btrfs_release_block_group(block_group,
7433
- delalloc);
7434
- block_group = used_block_group;
7435
- }
7436
- goto checks;
4141
+ bg_ret = NULL;
4142
+ ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
4143
+ if (ret == 0) {
4144
+ if (bg_ret && bg_ret != block_group) {
4145
+ btrfs_release_block_group(block_group, delalloc);
4146
+ block_group = bg_ret;
74374147 }
7438
-
7439
- WARN_ON(last_ptr->block_group != used_block_group);
7440
-release_cluster:
7441
- /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7442
- * set up a new clusters, so lets just skip it
7443
- * and let the allocator find whatever block
7444
- * it can find. If we reach this point, we
7445
- * will have tried the cluster allocator
7446
- * plenty of times and not have found
7447
- * anything, so we are likely way too
7448
- * fragmented for the clustering stuff to find
7449
- * anything.
7450
- *
7451
- * However, if the cluster is taken from the
7452
- * current block group, release the cluster
7453
- * first, so that we stand a better chance of
7454
- * succeeding in the unclustered
7455
- * allocation. */
7456
- if (loop >= LOOP_NO_EMPTY_SIZE &&
7457
- used_block_group != block_group) {
7458
- spin_unlock(&last_ptr->refill_lock);
7459
- btrfs_release_block_group(used_block_group,
7460
- delalloc);
7461
- goto unclustered_alloc;
7462
- }
7463
-
7464
- /*
7465
- * this cluster didn't work out, free it and
7466
- * start over
7467
- */
7468
- btrfs_return_cluster_to_free_space(NULL, last_ptr);
7469
-
7470
- if (used_block_group != block_group)
7471
- btrfs_release_block_group(used_block_group,
7472
- delalloc);
7473
-refill_cluster:
7474
- if (loop >= LOOP_NO_EMPTY_SIZE) {
7475
- spin_unlock(&last_ptr->refill_lock);
7476
- goto unclustered_alloc;
7477
- }
7478
-
7479
- aligned_cluster = max_t(unsigned long,
7480
- empty_cluster + empty_size,
7481
- block_group->full_stripe_len);
7482
-
7483
- /* allocate a cluster in this block group */
7484
- ret = btrfs_find_space_cluster(fs_info, block_group,
7485
- last_ptr, search_start,
7486
- num_bytes,
7487
- aligned_cluster);
7488
- if (ret == 0) {
7489
- /*
7490
- * now pull our allocation out of this
7491
- * cluster
7492
- */
7493
- offset = btrfs_alloc_from_cluster(block_group,
7494
- last_ptr,
7495
- num_bytes,
7496
- search_start,
7497
- &max_extent_size);
7498
- if (offset) {
7499
- /* we found one, proceed */
7500
- spin_unlock(&last_ptr->refill_lock);
7501
- trace_btrfs_reserve_extent_cluster(
7502
- block_group, search_start,
7503
- num_bytes);
7504
- goto checks;
7505
- }
7506
- } else if (!cached && loop > LOOP_CACHING_NOWAIT
7507
- && !failed_cluster_refill) {
7508
- spin_unlock(&last_ptr->refill_lock);
7509
-
7510
- failed_cluster_refill = true;
7511
- wait_block_group_cache_progress(block_group,
7512
- num_bytes + empty_cluster + empty_size);
7513
- goto have_block_group;
7514
- }
7515
-
7516
- /*
7517
- * at this point we either didn't find a cluster
7518
- * or we weren't able to allocate a block from our
7519
- * cluster. Free the cluster we've been trying
7520
- * to use, and go to the next block group
7521
- */
7522
- btrfs_return_cluster_to_free_space(NULL, last_ptr);
7523
- spin_unlock(&last_ptr->refill_lock);
7524
- goto loop;
7525
- }
7526
-
7527
-unclustered_alloc:
7528
- /*
7529
- * We are doing an unclustered alloc, set the fragmented flag so
7530
- * we don't bother trying to setup a cluster again until we get
7531
- * more space.
7532
- */
7533
- if (unlikely(last_ptr)) {
7534
- spin_lock(&last_ptr->lock);
7535
- last_ptr->fragmented = 1;
7536
- spin_unlock(&last_ptr->lock);
7537
- }
7538
- if (cached) {
7539
- struct btrfs_free_space_ctl *ctl =
7540
- block_group->free_space_ctl;
7541
-
7542
- spin_lock(&ctl->tree_lock);
7543
- if (ctl->free_space <
7544
- num_bytes + empty_cluster + empty_size) {
7545
- max_free_space = max(max_free_space,
7546
- ctl->free_space);
7547
- spin_unlock(&ctl->tree_lock);
7548
- goto loop;
7549
- }
7550
- spin_unlock(&ctl->tree_lock);
7551
- }
7552
-
7553
- offset = btrfs_find_space_for_alloc(block_group, search_start,
7554
- num_bytes, empty_size,
7555
- &max_extent_size);
7556
- /*
7557
- * If we didn't find a chunk, and we haven't failed on this
7558
- * block group before, and this block group is in the middle of
7559
- * caching and we are ok with waiting, then go ahead and wait
7560
- * for progress to be made, and set failed_alloc to true.
7561
- *
7562
- * If failed_alloc is true then we've already waited on this
7563
- * block group once and should move on to the next block group.
7564
- */
7565
- if (!offset && !failed_alloc && !cached &&
7566
- loop > LOOP_CACHING_NOWAIT) {
7567
- wait_block_group_cache_progress(block_group,
7568
- num_bytes + empty_size);
7569
- failed_alloc = true;
4148
+ } else if (ret == -EAGAIN) {
75704149 goto have_block_group;
7571
- } else if (!offset) {
4150
+ } else if (ret > 0) {
75724151 goto loop;
75734152 }
7574
-checks:
7575
- search_start = round_up(offset, fs_info->stripesize);
4153
+
4154
+ /* Checks */
4155
+ ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
4156
+ fs_info->stripesize);
75764157
75774158 /* move on to the next group */
7578
- if (search_start + num_bytes >
7579
- block_group->key.objectid + block_group->key.offset) {
7580
- btrfs_add_free_space(block_group, offset, num_bytes);
4159
+ if (ffe_ctl.search_start + num_bytes >
4160
+ block_group->start + block_group->length) {
4161
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4162
+ num_bytes);
75814163 goto loop;
75824164 }
75834165
7584
- if (offset < search_start)
7585
- btrfs_add_free_space(block_group, offset,
7586
- search_start - offset);
4166
+ if (ffe_ctl.found_offset < ffe_ctl.search_start)
4167
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4168
+ ffe_ctl.search_start - ffe_ctl.found_offset);
75874169
75884170 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
75894171 num_bytes, delalloc);
75904172 if (ret == -EAGAIN) {
7591
- btrfs_add_free_space(block_group, offset, num_bytes);
4173
+ btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4174
+ num_bytes);
75924175 goto loop;
75934176 }
75944177 btrfs_inc_block_group_reservations(block_group);
75954178
75964179 /* we are all good, lets return */
7597
- ins->objectid = search_start;
4180
+ ins->objectid = ffe_ctl.search_start;
75984181 ins->offset = num_bytes;
75994182
7600
- trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
4183
+ trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
4184
+ num_bytes);
76014185 btrfs_release_block_group(block_group, delalloc);
76024186 break;
76034187 loop:
7604
- failed_cluster_refill = false;
7605
- failed_alloc = false;
7606
- BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
7607
- index);
7608
- btrfs_release_block_group(block_group, delalloc);
4188
+ release_block_group(block_group, &ffe_ctl, delalloc);
76094189 cond_resched();
76104190 }
76114191 up_read(&space_info->groups_sem);
76124192
7613
- if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7614
- && !orig_have_caching_bg)
7615
- orig_have_caching_bg = true;
7616
-
7617
- if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
4193
+ ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
4194
+ if (ret > 0)
76184195 goto search;
76194196
7620
- if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7621
- goto search;
7622
-
7623
- /*
7624
- * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7625
- * caching kthreads as we move along
7626
- * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7627
- * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7628
- * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7629
- * again
7630
- */
7631
- if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7632
- index = 0;
7633
- if (loop == LOOP_CACHING_NOWAIT) {
7634
- /*
7635
- * We want to skip the LOOP_CACHING_WAIT step if we
7636
- * don't have any uncached bgs and we've already done a
7637
- * full search through.
7638
- */
7639
- if (orig_have_caching_bg || !full_search)
7640
- loop = LOOP_CACHING_WAIT;
7641
- else
7642
- loop = LOOP_ALLOC_CHUNK;
7643
- } else {
7644
- loop++;
7645
- }
7646
-
7647
- if (loop == LOOP_ALLOC_CHUNK) {
7648
- struct btrfs_trans_handle *trans;
7649
- int exist = 0;
7650
-
7651
- trans = current->journal_info;
7652
- if (trans)
7653
- exist = 1;
7654
- else
7655
- trans = btrfs_join_transaction(root);
7656
-
7657
- if (IS_ERR(trans)) {
7658
- ret = PTR_ERR(trans);
7659
- goto out;
7660
- }
7661
-
7662
- ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
7663
-
7664
- /*
7665
- * If we can't allocate a new chunk we've already looped
7666
- * through at least once, move on to the NO_EMPTY_SIZE
7667
- * case.
7668
- */
7669
- if (ret == -ENOSPC)
7670
- loop = LOOP_NO_EMPTY_SIZE;
7671
-
7672
- /*
7673
- * Do not bail out on ENOSPC since we
7674
- * can do more things.
7675
- */
7676
- if (ret < 0 && ret != -ENOSPC)
7677
- btrfs_abort_transaction(trans, ret);
7678
- else
7679
- ret = 0;
7680
- if (!exist)
7681
- btrfs_end_transaction(trans);
7682
- if (ret)
7683
- goto out;
7684
- }
7685
-
7686
- if (loop == LOOP_NO_EMPTY_SIZE) {
7687
- /*
7688
- * Don't loop again if we already have no empty_size and
7689
- * no empty_cluster.
7690
- */
7691
- if (empty_size == 0 &&
7692
- empty_cluster == 0) {
7693
- ret = -ENOSPC;
7694
- goto out;
7695
- }
7696
- empty_size = 0;
7697
- empty_cluster = 0;
7698
- }
7699
-
7700
- goto search;
7701
- } else if (!ins->objectid) {
7702
- ret = -ENOSPC;
7703
- } else if (ins->objectid) {
7704
- if (!use_cluster && last_ptr) {
7705
- spin_lock(&last_ptr->lock);
7706
- last_ptr->window_start = ins->objectid;
7707
- spin_unlock(&last_ptr->lock);
7708
- }
7709
- ret = 0;
7710
- }
7711
-out:
7712
- if (ret == -ENOSPC) {
7713
- if (!max_extent_size)
7714
- max_extent_size = max_free_space;
4197
+ if (ret == -ENOSPC && !cache_block_group_error) {
4198
+ /*
4199
+ * Use ffe_ctl->total_free_space as fallback if we can't find
4200
+ * any contiguous hole.
4201
+ */
4202
+ if (!ffe_ctl.max_extent_size)
4203
+ ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
77154204 spin_lock(&space_info->lock);
7716
- space_info->max_extent_size = max_extent_size;
4205
+ space_info->max_extent_size = ffe_ctl.max_extent_size;
77174206 spin_unlock(&space_info->lock);
7718
- ins->offset = max_extent_size;
4207
+ ins->offset = ffe_ctl.max_extent_size;
4208
+ } else if (ret == -ENOSPC) {
4209
+ ret = cache_block_group_error;
77194210 }
77204211 return ret;
7721
-}
7722
-
7723
-static void dump_space_info(struct btrfs_fs_info *fs_info,
7724
- struct btrfs_space_info *info, u64 bytes,
7725
- int dump_block_groups)
7726
-{
7727
- struct btrfs_block_group_cache *cache;
7728
- int index = 0;
7729
-
7730
- spin_lock(&info->lock);
7731
- btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7732
- info->flags,
7733
- info->total_bytes - btrfs_space_info_used(info, true),
7734
- info->full ? "" : "not ");
7735
- btrfs_info(fs_info,
7736
- "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7737
- info->total_bytes, info->bytes_used, info->bytes_pinned,
7738
- info->bytes_reserved, info->bytes_may_use,
7739
- info->bytes_readonly);
7740
- spin_unlock(&info->lock);
7741
-
7742
- if (!dump_block_groups)
7743
- return;
7744
-
7745
- down_read(&info->groups_sem);
7746
-again:
7747
- list_for_each_entry(cache, &info->block_groups[index], list) {
7748
- spin_lock(&cache->lock);
7749
- btrfs_info(fs_info,
7750
- "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
7751
- cache->key.objectid, cache->key.offset,
7752
- btrfs_block_group_used(&cache->item), cache->pinned,
7753
- cache->reserved, cache->ro ? "[readonly]" : "");
7754
- btrfs_dump_free_space(cache, bytes);
7755
- spin_unlock(&cache->lock);
7756
- }
7757
- if (++index < BTRFS_NR_RAID_TYPES)
7758
- goto again;
7759
- up_read(&info->groups_sem);
77604212 }
77614213
77624214 /*
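
The hunk above replaces the open-coded allocator retry logic (the LOOP_CACHING_NOWAIT through LOOP_NO_EMPTY_SIZE escalation that used to sit at the bottom of find_free_extent) with a call to find_free_extent_update_loop(), which keeps its state in ffe_ctl. As a rough illustration of the escalation idea only, here is a hedged, self-contained sketch; the enum values, the try_alloc_pass() helper and the "effort" model are invented and are not the btrfs implementation.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stages mirroring the escalation order in the removed code. */
enum alloc_stage {
    STAGE_CACHING_NOWAIT,   /* only fully cached groups, no waiting    */
    STAGE_CACHING_WAIT,     /* also wait on groups still being cached  */
    STAGE_ALLOC_CHUNK,      /* force-allocate a new chunk, then retry  */
    STAGE_NO_EMPTY_SIZE,    /* drop the empty_size slack, last resort  */
    STAGE_GIVE_UP,
};

/* Pretend allocation pass: succeeds only once enough effort is spent. */
static bool try_alloc_pass(enum alloc_stage stage, int effort_needed)
{
    return (int)stage >= effort_needed;
}

static int allocate_with_escalation(int effort_needed)
{
    enum alloc_stage stage = STAGE_CACHING_NOWAIT;

    while (stage < STAGE_GIVE_UP) {
        if (try_alloc_pass(stage, effort_needed)) {
            printf("allocated at stage %d\n", (int)stage);
            return 0;
        }
        /* Nothing found: escalate and search again ("goto search"). */
        stage++;
    }
    return -1;              /* stands in for -ENOSPC */
}

int main(void)
{
    /* Example: a pass that only succeeds after a forced chunk allocation. */
    return allocate_with_escalation(STAGE_ALLOC_CHUNK) ? 1 : 0;
}
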
....@@ -7817,7 +4269,7 @@
78174269 flags = get_alloc_profile_by_root(root, is_data);
78184270 again:
78194271 WARN_ON(num_bytes < fs_info->sectorsize);
7820
- ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
4272
+ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
78214273 hint_byte, ins, flags, delalloc);
78224274 if (!ret && !is_data) {
78234275 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
....@@ -7834,24 +4286,23 @@
78344286 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
78354287 struct btrfs_space_info *sinfo;
78364288
7837
- sinfo = __find_space_info(fs_info, flags);
4289
+ sinfo = btrfs_find_space_info(fs_info, flags);
78384290 btrfs_err(fs_info,
78394291 "allocation failed flags %llu, wanted %llu",
78404292 flags, num_bytes);
78414293 if (sinfo)
7842
- dump_space_info(fs_info, sinfo, num_bytes, 1);
4294
+ btrfs_dump_space_info(fs_info, sinfo,
4295
+ num_bytes, 1);
78434296 }
78444297 }
78454298
78464299 return ret;
78474300 }
78484301
7849
-static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7850
- u64 start, u64 len,
7851
- int pin, int delalloc)
4302
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4303
+ u64 start, u64 len, int delalloc)
78524304 {
7853
- struct btrfs_block_group_cache *cache;
7854
- int ret = 0;
4305
+ struct btrfs_block_group *cache;
78554306
78564307 cache = btrfs_lookup_block_group(fs_info, start);
78574308 if (!cache) {
....@@ -7860,30 +4311,30 @@
78604311 return -ENOSPC;
78614312 }
78624313
7863
- if (pin)
7864
- pin_down_extent(fs_info, cache, start, len, 1);
7865
- else {
7866
- if (btrfs_test_opt(fs_info, DISCARD))
7867
- ret = btrfs_discard_extent(fs_info, start, len, NULL);
7868
- btrfs_add_free_space(cache, start, len);
7869
- btrfs_free_reserved_bytes(cache, len, delalloc);
7870
- trace_btrfs_reserved_extent_free(fs_info, start, len);
7871
- }
4314
+ btrfs_add_free_space(cache, start, len);
4315
+ btrfs_free_reserved_bytes(cache, len, delalloc);
4316
+ trace_btrfs_reserved_extent_free(fs_info, start, len);
78724317
78734318 btrfs_put_block_group(cache);
4319
+ return 0;
4320
+}
4321
+
4322
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
4323
+ u64 len)
4324
+{
4325
+ struct btrfs_block_group *cache;
4326
+ int ret = 0;
4327
+
4328
+ cache = btrfs_lookup_block_group(trans->fs_info, start);
4329
+ if (!cache) {
4330
+ btrfs_err(trans->fs_info, "unable to find block group for %llu",
4331
+ start);
4332
+ return -ENOSPC;
4333
+ }
4334
+
4335
+ ret = pin_down_extent(trans, cache, start, len, 1);
4336
+ btrfs_put_block_group(cache);
78744337 return ret;
7875
-}
7876
-
7877
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
7878
- u64 start, u64 len, int delalloc)
7879
-{
7880
- return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
7881
-}
7882
-
7883
-int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
7884
- u64 start, u64 len)
7885
-{
7886
- return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
78874338 }
78884339
78894340 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
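
The hunk above retires __btrfs_free_reserved_extent(), whose pin flag selected between two unrelated behaviours, in favour of the explicit btrfs_free_reserved_extent() and btrfs_pin_reserved_extent() entry points. A minimal sketch of that "split the boolean-flag helper" refactor in general terms; the names and printf bodies below are placeholders, not btrfs code.

#include <stdio.h>

/* Before: one helper whose flag picks between two different actions. */
static void release_resource_old(int id, int pin)
{
    if (pin)
        printf("pin %d for later cleanup\n", id);
    else
        printf("return %d to the free pool\n", id);
}

/* After: two single-purpose helpers; callers state their intent directly. */
static void free_resource(int id)
{
    printf("return %d to the free pool\n", id);
}

static void pin_resource(int id)
{
    printf("pin %d for later cleanup\n", id);
}

int main(void)
{
    release_resource_old(1, 0); /* old style: the flag decides          */
    free_resource(2);           /* new style: intent is in the name     */
    pin_resource(3);
    return 0;
}
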
....@@ -7950,7 +4401,7 @@
79504401 if (ret)
79514402 return ret;
79524403
7953
- ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
4404
+ ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
79544405 if (ret) { /* -ENOENT, logic error */
79554406 btrfs_err(fs_info, "update block group failed for %llu %llu",
79564407 ins->objectid, ins->offset);
....@@ -8040,8 +4491,8 @@
80404491 if (ret)
80414492 return ret;
80424493
8043
- ret = update_block_group(trans, fs_info, extent_key.objectid,
8044
- fs_info->nodesize, 1);
4494
+ ret = btrfs_update_block_group(trans, extent_key.objectid,
4495
+ fs_info->nodesize, 1);
80454496 if (ret) { /* -ENOENT, logic error */
80464497 btrfs_err(fs_info, "update block group failed for %llu %llu",
80474498 extent_key.objectid, extent_key.offset);
....@@ -8058,20 +4509,16 @@
80584509 u64 offset, u64 ram_bytes,
80594510 struct btrfs_key *ins)
80604511 {
8061
- int ret;
4512
+ struct btrfs_ref generic_ref = { 0 };
80624513
80634514 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
80644515
8065
- btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
8066
- root->root_key.objectid, owner, offset,
8067
- BTRFS_ADD_DELAYED_EXTENT);
4516
+ btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4517
+ ins->objectid, ins->offset, 0);
4518
+ btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
4519
+ btrfs_ref_tree_mod(root->fs_info, &generic_ref);
80684520
8069
- ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
8070
- ins->offset, 0,
8071
- root->root_key.objectid, owner,
8072
- offset, ram_bytes,
8073
- BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8074
- return ret;
4521
+ return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
80754522 }
80764523
80774524 /*
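
Several hunks in this range replace long positional argument lists (bytenr, num_bytes, parent, root, owner, offset, action, ...) with a btrfs_ref structure that is filled in by btrfs_init_generic_ref()/btrfs_init_data_ref() and then handed to both the ref-verify and delayed-ref code. A rough sketch of that "parameter object" pattern follows; all of the names and fields below are invented for illustration.

#include <stdio.h>
#include <string.h>

/* Hypothetical descriptor gathering what used to be ~8 positional args. */
struct ref_desc {
    int action;                     /* e.g. add vs. drop */
    unsigned long long bytenr;
    unsigned long long num_bytes;
    unsigned long long parent;
    unsigned long long root;
    unsigned long long owner;
    unsigned long long offset;
};

static void init_generic_ref(struct ref_desc *ref, int action,
                             unsigned long long bytenr,
                             unsigned long long num_bytes,
                             unsigned long long parent)
{
    memset(ref, 0, sizeof(*ref));
    ref->action = action;
    ref->bytenr = bytenr;
    ref->num_bytes = num_bytes;
    ref->parent = parent;
}

static void init_data_ref(struct ref_desc *ref, unsigned long long root,
                          unsigned long long owner, unsigned long long offset)
{
    ref->root = root;
    ref->owner = owner;
    ref->offset = offset;
}

/* Every consumer now takes one pointer instead of its own argument order. */
static int queue_delayed_ref(const struct ref_desc *ref)
{
    printf("queue action %d for %llu+%llu\n",
           ref->action, ref->bytenr, ref->num_bytes);
    return 0;
}

int main(void)
{
    struct ref_desc ref;

    init_generic_ref(&ref, 1, 4096, 4096, 0);
    init_data_ref(&ref, 5, 257, 0);
    return queue_delayed_ref(&ref);
}
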
....@@ -8085,7 +4532,7 @@
80854532 {
80864533 struct btrfs_fs_info *fs_info = trans->fs_info;
80874534 int ret;
8088
- struct btrfs_block_group_cache *block_group;
4535
+ struct btrfs_block_group *block_group;
80894536 struct btrfs_space_info *space_info;
80904537
80914538 /*
....@@ -8113,13 +4560,16 @@
81134560
81144561 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
81154562 offset, ins, 1);
4563
+ if (ret)
4564
+ btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
81164565 btrfs_put_block_group(block_group);
81174566 return ret;
81184567 }
81194568
81204569 static struct extent_buffer *
81214570 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8122
- u64 bytenr, int level, u64 owner)
4571
+ u64 bytenr, int level, u64 owner,
4572
+ enum btrfs_lock_nesting nest)
81234573 {
81244574 struct btrfs_fs_info *fs_info = root->fs_info;
81254575 struct extent_buffer *buf;
....@@ -8141,12 +4591,12 @@
81414591 return ERR_PTR(-EUCLEAN);
81424592 }
81434593
8144
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8145
- btrfs_tree_lock(buf);
8146
- clean_tree_block(fs_info, buf);
4594
+ btrfs_set_buffer_lockdep_class(owner, buf, level);
4595
+ __btrfs_tree_lock(buf, nest);
4596
+ btrfs_clean_tree_block(buf);
81474597 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
81484598
8149
- btrfs_set_lock_blocking(buf);
4599
+ btrfs_set_lock_blocking_write(buf);
81504600 set_extent_buffer_uptodate(buf);
81514601
81524602 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
....@@ -8155,13 +4605,13 @@
81554605 btrfs_set_header_generation(buf, trans->transid);
81564606 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
81574607 btrfs_set_header_owner(buf, owner);
8158
- write_extent_buffer_fsid(buf, fs_info->fsid);
4608
+ write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
81594609 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
81604610 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
81614611 buf->log_index = root->log_transid % 2;
81624612 /*
81634613 * we allow two log transactions at a time, use different
8164
- * EXENT bit to differentiate dirty pages.
4614
+ * EXTENT bit to differentiate dirty pages.
81654615 */
81664616 if (buf->log_index == 0)
81674617 set_extent_dirty(&root->dirty_log_pages, buf->start,
....@@ -8179,68 +4629,6 @@
81794629 return buf;
81804630 }
81814631
8182
-static struct btrfs_block_rsv *
8183
-use_block_rsv(struct btrfs_trans_handle *trans,
8184
- struct btrfs_root *root, u32 blocksize)
8185
-{
8186
- struct btrfs_fs_info *fs_info = root->fs_info;
8187
- struct btrfs_block_rsv *block_rsv;
8188
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8189
- int ret;
8190
- bool global_updated = false;
8191
-
8192
- block_rsv = get_block_rsv(trans, root);
8193
-
8194
- if (unlikely(block_rsv->size == 0))
8195
- goto try_reserve;
8196
-again:
8197
- ret = block_rsv_use_bytes(block_rsv, blocksize);
8198
- if (!ret)
8199
- return block_rsv;
8200
-
8201
- if (block_rsv->failfast)
8202
- return ERR_PTR(ret);
8203
-
8204
- if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8205
- global_updated = true;
8206
- update_global_block_rsv(fs_info);
8207
- goto again;
8208
- }
8209
-
8210
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8211
- static DEFINE_RATELIMIT_STATE(_rs,
8212
- DEFAULT_RATELIMIT_INTERVAL * 10,
8213
- /*DEFAULT_RATELIMIT_BURST*/ 1);
8214
- if (__ratelimit(&_rs))
8215
- WARN(1, KERN_DEBUG
8216
- "BTRFS: block rsv returned %d\n", ret);
8217
- }
8218
-try_reserve:
8219
- ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8220
- BTRFS_RESERVE_NO_FLUSH);
8221
- if (!ret)
8222
- return block_rsv;
8223
- /*
8224
- * If we couldn't reserve metadata bytes try and use some from
8225
- * the global reserve if its space type is the same as the global
8226
- * reservation.
8227
- */
8228
- if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8229
- block_rsv->space_info == global_rsv->space_info) {
8230
- ret = block_rsv_use_bytes(global_rsv, blocksize);
8231
- if (!ret)
8232
- return global_rsv;
8233
- }
8234
- return ERR_PTR(ret);
8235
-}
8236
-
8237
-static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8238
- struct btrfs_block_rsv *block_rsv, u32 blocksize)
8239
-{
8240
- block_rsv_add_bytes(block_rsv, blocksize, 0);
8241
- block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8242
-}
8243
-
82444632 /*
82454633 * finds a free extent and does all the dirty work required for allocation
82464634 * returns the tree buffer or an ERR_PTR on error.
....@@ -8250,13 +4638,15 @@
82504638 u64 parent, u64 root_objectid,
82514639 const struct btrfs_disk_key *key,
82524640 int level, u64 hint,
8253
- u64 empty_size)
4641
+ u64 empty_size,
4642
+ enum btrfs_lock_nesting nest)
82544643 {
82554644 struct btrfs_fs_info *fs_info = root->fs_info;
82564645 struct btrfs_key ins;
82574646 struct btrfs_block_rsv *block_rsv;
82584647 struct extent_buffer *buf;
82594648 struct btrfs_delayed_extent_op *extent_op;
4649
+ struct btrfs_ref generic_ref = { 0 };
82604650 u64 flags = 0;
82614651 int ret;
82624652 u32 blocksize = fs_info->nodesize;
....@@ -8265,14 +4655,14 @@
82654655 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
82664656 if (btrfs_is_testing(fs_info)) {
82674657 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8268
- level, root_objectid);
4658
+ level, root_objectid, nest);
82694659 if (!IS_ERR(buf))
82704660 root->alloc_bytenr += blocksize;
82714661 return buf;
82724662 }
82734663 #endif
82744664
8275
- block_rsv = use_block_rsv(trans, root, blocksize);
4665
+ block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
82764666 if (IS_ERR(block_rsv))
82774667 return ERR_CAST(block_rsv);
82784668
....@@ -8282,7 +4672,7 @@
82824672 goto out_unuse;
82834673
82844674 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
8285
- root_objectid);
4675
+ root_objectid, nest);
82864676 if (IS_ERR(buf)) {
82874677 ret = PTR_ERR(buf);
82884678 goto out_free_reserved;
....@@ -8311,14 +4701,12 @@
83114701 extent_op->is_data = false;
83124702 extent_op->level = level;
83134703
8314
- btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
8315
- root_objectid, level, 0,
8316
- BTRFS_ADD_DELAYED_EXTENT);
8317
- ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
8318
- ins.offset, parent,
8319
- root_objectid, level,
8320
- BTRFS_ADD_DELAYED_EXTENT,
8321
- extent_op, NULL, NULL);
4704
+ btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4705
+ ins.objectid, ins.offset, parent);
4706
+ generic_ref.real_root = root->root_key.objectid;
4707
+ btrfs_init_tree_ref(&generic_ref, level, root_objectid);
4708
+ btrfs_ref_tree_mod(fs_info, &generic_ref);
4709
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
83224710 if (ret)
83234711 goto out_free_delayed;
83244712 }
....@@ -8327,11 +4715,12 @@
83274715 out_free_delayed:
83284716 btrfs_free_delayed_extent_op(extent_op);
83294717 out_free_buf:
4718
+ btrfs_tree_unlock(buf);
83304719 free_extent_buffer(buf);
83314720 out_free_reserved:
83324721 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
83334722 out_unuse:
8334
- unuse_block_rsv(fs_info, block_rsv, blocksize);
4723
+ btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
83354724 return ERR_PTR(ret);
83364725 }
83374726
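
The error ladder above (out_free_delayed -> out_free_buf -> out_free_reserved -> out_unuse) now also drops the tree lock taken in btrfs_init_new_buffer() before the buffer is freed. As a generic reminder of the goto-ladder idiom used here, resources released in reverse order of acquisition, a hedged standalone sketch with made-up helpers:

#include <stdio.h>
#include <stdlib.h>

/* Pretend allocator: returns NULL when asked to simulate a failure. */
static char *alloc_step(int step, int fail_at)
{
    return step == fail_at ? NULL : malloc(16);
}

/* Acquire three resources; on failure, release only what was taken,
 * in reverse order, via a single chain of labels. */
static int do_work(int fail_at)
{
    char *a, *b, *c;
    int ret = -1;

    a = alloc_step(1, fail_at);
    if (!a)
        goto out;
    b = alloc_step(2, fail_at);
    if (!b)
        goto out_free_a;
    c = alloc_step(3, fail_at);
    if (!c)
        goto out_free_b;

    ret = 0;            /* all three acquired; use them here */
    free(c);
out_free_b:
    free(b);
out_free_a:
    free(a);
out:
    return ret;
}

int main(void)
{
    printf("%d %d\n", do_work(0), do_work(2)); /* prints "0 -1" */
    return 0;
}
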
....@@ -8339,6 +4728,8 @@
83394728 u64 refs[BTRFS_MAX_LEVEL];
83404729 u64 flags[BTRFS_MAX_LEVEL];
83414730 struct btrfs_key update_progress;
4731
+ struct btrfs_key drop_progress;
4732
+ int drop_level;
83424733 int stage;
83434734 int level;
83444735 int shared_level;
....@@ -8346,6 +4737,7 @@
83464737 int keep_locks;
83474738 int reada_slot;
83484739 int reada_count;
4740
+ int restarted;
83494741 };
83504742
83514743 #define DROP_REFERENCE 1
....@@ -8490,8 +4882,7 @@
84904882 BUG_ON(ret); /* -ENOMEM */
84914883 ret = btrfs_dec_ref(trans, root, eb, 0);
84924884 BUG_ON(ret); /* -ENOMEM */
8493
- ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
8494
- eb->len, flag,
4885
+ ret = btrfs_set_disk_extent_flags(trans, eb, flag,
84954886 btrfs_header_level(eb), 0);
84964887 BUG_ON(ret); /* -ENOMEM */
84974888 wc->flags[level] |= flag;
....@@ -8506,6 +4897,33 @@
85064897 path->locks[level] = 0;
85074898 }
85084899 return 0;
4900
+}
4901
+
4902
+/*
4903
+ * This is used to verify a ref exists for this root to deal with a bug where we
4904
+ * would have a drop_progress key that hadn't been updated properly.
4905
+ */
4906
+static int check_ref_exists(struct btrfs_trans_handle *trans,
4907
+ struct btrfs_root *root, u64 bytenr, u64 parent,
4908
+ int level)
4909
+{
4910
+ struct btrfs_path *path;
4911
+ struct btrfs_extent_inline_ref *iref;
4912
+ int ret;
4913
+
4914
+ path = btrfs_alloc_path();
4915
+ if (!path)
4916
+ return -ENOMEM;
4917
+
4918
+ ret = lookup_extent_backref(trans, path, &iref, bytenr,
4919
+ root->fs_info->nodesize, parent,
4920
+ root->root_key.objectid, level, 0);
4921
+ btrfs_free_path(path);
4922
+ if (ret == -ENOENT)
4923
+ return 0;
4924
+ if (ret < 0)
4925
+ return ret;
4926
+ return 1;
85094927 }
85104928
85114929 /*
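
check_ref_exists() above folds the backref lookup result into a three-way return: -ENOENT becomes 0 ("no ref"), other negative errors propagate, and anything else means the reference is present. A hedged sketch of that return-code mapping, using a stand-in lookup instead of the real backref search:

#include <errno.h>
#include <stdio.h>

/* Stand-in for a lookup that returns 0 on hit, -ENOENT on miss,
 * or another negative errno on failure. */
static int lookup_backref_stub(unsigned long long bytenr)
{
    if (bytenr == 0)
        return -EINVAL;     /* simulated hard error */
    if (bytenr % 2)
        return -ENOENT;     /* simulated miss       */
    return 0;               /* simulated hit        */
}

/* Returns 1 if the ref exists, 0 if it does not, negative errno on error. */
static int check_ref_exists_sketch(unsigned long long bytenr)
{
    int ret = lookup_backref_stub(bytenr);

    if (ret == -ENOENT)
        return 0;
    if (ret < 0)
        return ret;
    return 1;
}

int main(void)
{
    printf("%d %d %d\n",
           check_ref_exists_sketch(2),  /* 1: exists      */
           check_ref_exists_sketch(3),  /* 0: missing     */
           check_ref_exists_sketch(0)); /* <0: hard error */
    return 0;
}
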
....@@ -8530,9 +4948,9 @@
85304948 u64 bytenr;
85314949 u64 generation;
85324950 u64 parent;
8533
- u32 blocksize;
85344951 struct btrfs_key key;
85354952 struct btrfs_key first_key;
4953
+ struct btrfs_ref ref = { 0 };
85364954 struct extent_buffer *next;
85374955 int level = wc->level;
85384956 int reada = 0;
....@@ -8555,7 +4973,6 @@
85554973 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
85564974 btrfs_node_key_to_cpu(path->nodes[level], &first_key,
85574975 path->slots[level]);
8558
- blocksize = fs_info->nodesize;
85594976
85604977 next = find_extent_buffer(fs_info, bytenr);
85614978 if (!next) {
....@@ -8568,7 +4985,7 @@
85684985 reada = 1;
85694986 }
85704987 btrfs_tree_lock(next);
8571
- btrfs_set_lock_blocking(next);
4988
+ btrfs_set_lock_blocking_write(next);
85724989
85734990 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
85744991 &wc->refs[level - 1],
....@@ -8628,7 +5045,7 @@
86285045 return -EIO;
86295046 }
86305047 btrfs_tree_lock(next);
8631
- btrfs_set_lock_blocking(next);
5048
+ btrfs_set_lock_blocking_write(next);
86325049 }
86335050
86345051 level--;
....@@ -8664,7 +5081,30 @@
86645081 parent = 0;
86655082 }
86665083
8667
- if (need_account) {
5084
+ /*
5085
+ * If we had a drop_progress we need to verify the refs are set
5086
+ * as expected. If we find our ref then we know that from here
5087
+ * on out everything should be correct, and we can clear the
5088
+ * ->restarted flag.
5089
+ */
5090
+ if (wc->restarted) {
5091
+ ret = check_ref_exists(trans, root, bytenr, parent,
5092
+ level - 1);
5093
+ if (ret < 0)
5094
+ goto out_unlock;
5095
+ if (ret == 0)
5096
+ goto no_delete;
5097
+ ret = 0;
5098
+ wc->restarted = 0;
5099
+ }
5100
+
5101
+ /*
5102
+ * Reloc tree doesn't contribute to qgroup numbers, and we have
5103
+ * already accounted them at merge time (replace_path),
5104
+ * thus we could skip expensive subtree trace here.
5105
+ */
5106
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5107
+ need_account) {
86685108 ret = btrfs_qgroup_trace_subtree(trans, next,
86695109 generation, level - 1);
86705110 if (ret) {
....@@ -8673,13 +5113,24 @@
86735113 ret);
86745114 }
86755115 }
8676
- ret = btrfs_free_extent(trans, root, bytenr, blocksize,
8677
- parent, root->root_key.objectid,
8678
- level - 1, 0);
5116
+
5117
+ /*
5118
+ * We need to update the next key in our walk control so we can
5119
+ * update the drop_progress key accordingly. We don't care if
5120
+ * find_next_key doesn't find a key because that means we're at
5121
+ * the end and are going to clean up now.
5122
+ */
5123
+ wc->drop_level = level;
5124
+ find_next_key(path, level, &wc->drop_progress);
5125
+
5126
+ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5127
+ fs_info->nodesize, parent);
5128
+ btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
5129
+ ret = btrfs_free_extent(trans, &ref);
86795130 if (ret)
86805131 goto out_unlock;
86815132 }
8682
-
5133
+no_delete:
86835134 *lookup_info = 1;
86845135 ret = 1;
86855136
....@@ -8734,7 +5185,7 @@
87345185 if (!path->locks[level]) {
87355186 BUG_ON(level == 0);
87365187 btrfs_tree_lock(eb);
8737
- btrfs_set_lock_blocking(eb);
5188
+ btrfs_set_lock_blocking_write(eb);
87385189 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
87395190
87405191 ret = btrfs_lookup_extent_info(trans, fs_info,
....@@ -8765,21 +5216,23 @@
87655216 else
87665217 ret = btrfs_dec_ref(trans, root, eb, 0);
87675218 BUG_ON(ret); /* -ENOMEM */
8768
- ret = btrfs_qgroup_trace_leaf_items(trans, eb);
8769
- if (ret) {
8770
- btrfs_err_rl(fs_info,
8771
- "error %d accounting leaf items. Quota is out of sync, rescan required.",
5219
+ if (is_fstree(root->root_key.objectid)) {
5220
+ ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5221
+ if (ret) {
5222
+ btrfs_err_rl(fs_info,
5223
+ "error %d accounting leaf items, quota is out of sync, rescan required",
87725224 ret);
5225
+ }
87735226 }
87745227 }
8775
- /* make block locked assertion in clean_tree_block happy */
5228
+ /* make block locked assertion in btrfs_clean_tree_block happy */
87765229 if (!path->locks[level] &&
87775230 btrfs_header_generation(eb) == trans->transid) {
87785231 btrfs_tree_lock(eb);
8779
- btrfs_set_lock_blocking(eb);
5232
+ btrfs_set_lock_blocking_write(eb);
87805233 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
87815234 }
8782
- clean_tree_block(fs_info, eb);
5235
+ btrfs_clean_tree_block(eb);
87835236 }
87845237
87855238 if (eb == root->node) {
....@@ -8887,9 +5340,7 @@
88875340 *
88885341 * If called with for_reloc == 0, may exit early with -EAGAIN
88895342 */
8890
-int btrfs_drop_snapshot(struct btrfs_root *root,
8891
- struct btrfs_block_rsv *block_rsv, int update_ref,
8892
- int for_reloc)
5343
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
88935344 {
88945345 struct btrfs_fs_info *fs_info = root->fs_info;
88955346 struct btrfs_path *path;
....@@ -8903,7 +5354,7 @@
89035354 int level;
89045355 bool root_dropped = false;
89055356
8906
- btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
5357
+ btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
89075358
89085359 path = btrfs_alloc_path();
89095360 if (!path) {
....@@ -8918,7 +5369,14 @@
89185369 goto out;
89195370 }
89205371
8921
- trans = btrfs_start_transaction(tree_root, 0);
5372
+ /*
5373
+ * Use join to avoid potential EINTR from transaction start. See
5374
+ * wait_reserve_ticket and the whole reservation callchain.
5375
+ */
5376
+ if (for_reloc)
5377
+ trans = btrfs_join_transaction(tree_root);
5378
+ else
5379
+ trans = btrfs_start_transaction(tree_root, 0);
89225380 if (IS_ERR(trans)) {
89235381 err = PTR_ERR(trans);
89245382 goto out_free;
....@@ -8928,13 +5386,19 @@
89285386 if (err)
89295387 goto out_end_trans;
89305388
8931
- if (block_rsv)
8932
- trans->block_rsv = block_rsv;
8933
-
5389
+ /*
5390
+ * This will help us catch people modifying the fs tree while we're
5391
+ * dropping it. It is unsafe to mess with the fs tree while it's being
5392
+ * dropped as we unlock the root node and parent nodes as we walk down
5393
+ * the tree, assuming nothing will change. If something does change
5394
+ * then we'll have stale information and drop references to blocks we've
5395
+ * already dropped.
5396
+ */
5397
+ set_bit(BTRFS_ROOT_DELETING, &root->state);
89345398 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
89355399 level = btrfs_header_level(root->node);
89365400 path->nodes[level] = btrfs_lock_root_node(root);
8937
- btrfs_set_lock_blocking(path->nodes[level]);
5401
+ btrfs_set_lock_blocking_write(path->nodes[level]);
89385402 path->slots[level] = 0;
89395403 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
89405404 memset(&wc->update_progress, 0,
....@@ -8964,7 +5428,7 @@
89645428 level = btrfs_header_level(root->node);
89655429 while (1) {
89665430 btrfs_tree_lock(path->nodes[level]);
8967
- btrfs_set_lock_blocking(path->nodes[level]);
5431
+ btrfs_set_lock_blocking_write(path->nodes[level]);
89685432 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
89695433
89705434 ret = btrfs_lookup_extent_info(trans, fs_info,
....@@ -8987,6 +5451,7 @@
89875451 }
89885452 }
89895453
5454
+ wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
89905455 wc->level = level;
89915456 wc->shared_level = -1;
89925457 wc->stage = DROP_REFERENCE;
....@@ -9014,12 +5479,14 @@
90145479 }
90155480
90165481 if (wc->stage == DROP_REFERENCE) {
9017
- level = wc->level;
9018
- btrfs_node_key(path->nodes[level],
9019
- &root_item->drop_progress,
9020
- path->slots[level]);
9021
- root_item->drop_level = level;
5482
+ wc->drop_level = wc->level;
5483
+ btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5484
+ &wc->drop_progress,
5485
+ path->slots[wc->drop_level]);
90225486 }
5487
+ btrfs_cpu_key_to_disk(&root_item->drop_progress,
5488
+ &wc->drop_progress);
5489
+ root_item->drop_level = wc->drop_level;
90235490
90245491 BUG_ON(wc->level == 0);
90255492 if (btrfs_should_end_transaction(trans) ||
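
Instead of writing the node key straight into root_item->drop_progress, the hunk above keeps the cursor as a CPU-order key in the walk control and converts it once with btrfs_cpu_key_to_disk() when the progress record is updated. A generic, hedged sketch of keeping an in-memory cursor in native byte order and serialising it to a fixed little-endian layout only at persist time; the struct layouts below are invented and are not the btrfs key format.

#include <stdint.h>
#include <stdio.h>

/* In-memory cursor, kept in native byte order while the walk runs. */
struct cpu_cursor {
    uint64_t objectid;
    uint8_t type;
    uint64_t offset;
};

/* On-disk form: fixed little-endian layout, written only at commit points. */
struct disk_cursor {
    uint8_t objectid[8];
    uint8_t type;
    uint8_t offset[8];
};

static void put_le64(uint8_t *dst, uint64_t v)
{
    int i;

    for (i = 0; i < 8; i++)
        dst[i] = (uint8_t)(v >> (8 * i));
}

/* Analogous in spirit to a cpu-key-to-disk-key conversion: do the byte
 * ordering once, at the moment the progress record is persisted. */
static void cursor_to_disk(struct disk_cursor *disk, const struct cpu_cursor *cpu)
{
    put_le64(disk->objectid, cpu->objectid);
    disk->type = cpu->type;
    put_le64(disk->offset, cpu->offset);
}

int main(void)
{
    struct cpu_cursor progress = { .objectid = 256, .type = 1, .offset = 0 };
    struct disk_cursor on_disk;

    cursor_to_disk(&on_disk, &progress);
    printf("persisted objectid byte0=%u\n", (unsigned)on_disk.objectid[0]);
    return 0;
}
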
....@@ -9041,13 +5508,19 @@
90415508 goto out_free;
90425509 }
90435510
9044
- trans = btrfs_start_transaction(tree_root, 0);
5511
+ /*
5512
+ * Use join to avoid potential EINTR from transaction
5513
+ * start. See wait_reserve_ticket and the whole
5514
+ * reservation callchain.
5515
+ */
5516
+ if (for_reloc)
5517
+ trans = btrfs_join_transaction(tree_root);
5518
+ else
5519
+ trans = btrfs_start_transaction(tree_root, 0);
90455520 if (IS_ERR(trans)) {
90465521 err = PTR_ERR(trans);
90475522 goto out_free;
90485523 }
9049
- if (block_rsv)
9050
- trans->block_rsv = block_rsv;
90515524 }
90525525 }
90535526 btrfs_release_path(path);
....@@ -9079,13 +5552,18 @@
90795552 }
90805553 }
90815554
9082
- if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
5555
+ /*
5556
+ * This subvolume is going to be completely dropped, and won't be
5557
+ * recorded as dirty roots, thus pertrans meta rsv will not be freed at
5558
+ * commit transaction time. So free it here manually.
5559
+ */
5560
+ btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5561
+ btrfs_qgroup_free_meta_all_pertrans(root);
5562
+
5563
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
90835564 btrfs_add_dropped_root(trans, root);
9084
- } else {
9085
- free_extent_buffer(root->node);
9086
- free_extent_buffer(root->commit_root);
9087
- btrfs_put_fs_root(root);
9088
- }
5565
+ else
5566
+ btrfs_put_root(root);
90895567 root_dropped = true;
90905568 out_end_trans:
90915569 btrfs_end_transaction_throttle(trans);
....@@ -9138,7 +5616,7 @@
91385616
91395617 btrfs_assert_tree_locked(parent);
91405618 parent_level = btrfs_header_level(parent);
9141
- extent_buffer_get(parent);
5619
+ atomic_inc(&parent->refs);
91425620 path->nodes[parent_level] = parent;
91435621 path->slots[parent_level] = btrfs_header_nritems(parent);
91445622
....@@ -9176,184 +5654,13 @@
91765654 return ret;
91775655 }
91785656
9179
-static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9180
-{
9181
- u64 num_devices;
9182
- u64 stripped;
9183
-
9184
- /*
9185
- * if restripe for this chunk_type is on pick target profile and
9186
- * return, otherwise do the usual balance
9187
- */
9188
- stripped = get_restripe_target(fs_info, flags);
9189
- if (stripped)
9190
- return extended_to_chunk(stripped);
9191
-
9192
- num_devices = fs_info->fs_devices->rw_devices;
9193
-
9194
- stripped = BTRFS_BLOCK_GROUP_RAID0 |
9195
- BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9196
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9197
-
9198
- if (num_devices == 1) {
9199
- stripped |= BTRFS_BLOCK_GROUP_DUP;
9200
- stripped = flags & ~stripped;
9201
-
9202
- /* turn raid0 into single device chunks */
9203
- if (flags & BTRFS_BLOCK_GROUP_RAID0)
9204
- return stripped;
9205
-
9206
- /* turn mirroring into duplication */
9207
- if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9208
- BTRFS_BLOCK_GROUP_RAID10))
9209
- return stripped | BTRFS_BLOCK_GROUP_DUP;
9210
- } else {
9211
- /* they already had raid on here, just return */
9212
- if (flags & stripped)
9213
- return flags;
9214
-
9215
- stripped |= BTRFS_BLOCK_GROUP_DUP;
9216
- stripped = flags & ~stripped;
9217
-
9218
- /* switch duplicated blocks with raid1 */
9219
- if (flags & BTRFS_BLOCK_GROUP_DUP)
9220
- return stripped | BTRFS_BLOCK_GROUP_RAID1;
9221
-
9222
- /* this is drive concat, leave it alone */
9223
- }
9224
-
9225
- return flags;
9226
-}
9227
-
9228
-static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9229
-{
9230
- struct btrfs_space_info *sinfo = cache->space_info;
9231
- u64 num_bytes;
9232
- u64 min_allocable_bytes;
9233
- int ret = -ENOSPC;
9234
-
9235
- /*
9236
- * We need some metadata space and system metadata space for
9237
- * allocating chunks in some corner cases until we force to set
9238
- * it to be readonly.
9239
- */
9240
- if ((sinfo->flags &
9241
- (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9242
- !force)
9243
- min_allocable_bytes = SZ_1M;
9244
- else
9245
- min_allocable_bytes = 0;
9246
-
9247
- spin_lock(&sinfo->lock);
9248
- spin_lock(&cache->lock);
9249
-
9250
- if (cache->ro) {
9251
- cache->ro++;
9252
- ret = 0;
9253
- goto out;
9254
- }
9255
-
9256
- num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9257
- cache->bytes_super - btrfs_block_group_used(&cache->item);
9258
-
9259
- if (btrfs_space_info_used(sinfo, true) + num_bytes +
9260
- min_allocable_bytes <= sinfo->total_bytes) {
9261
- sinfo->bytes_readonly += num_bytes;
9262
- cache->ro++;
9263
- list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9264
- ret = 0;
9265
- }
9266
-out:
9267
- spin_unlock(&cache->lock);
9268
- spin_unlock(&sinfo->lock);
9269
- return ret;
9270
-}
9271
-
9272
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
9273
-
9274
-{
9275
- struct btrfs_fs_info *fs_info = cache->fs_info;
9276
- struct btrfs_trans_handle *trans;
9277
- u64 alloc_flags;
9278
- int ret;
9279
-
9280
-again:
9281
- trans = btrfs_join_transaction(fs_info->extent_root);
9282
- if (IS_ERR(trans))
9283
- return PTR_ERR(trans);
9284
-
9285
- /*
9286
- * we're not allowed to set block groups readonly after the dirty
9287
- * block groups cache has started writing. If it already started,
9288
- * back off and let this transaction commit
9289
- */
9290
- mutex_lock(&fs_info->ro_block_group_mutex);
9291
- if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9292
- u64 transid = trans->transid;
9293
-
9294
- mutex_unlock(&fs_info->ro_block_group_mutex);
9295
- btrfs_end_transaction(trans);
9296
-
9297
- ret = btrfs_wait_for_commit(fs_info, transid);
9298
- if (ret)
9299
- return ret;
9300
- goto again;
9301
- }
9302
-
9303
- /*
9304
- * if we are changing raid levels, try to allocate a corresponding
9305
- * block group with the new raid level.
9306
- */
9307
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
9308
- if (alloc_flags != cache->flags) {
9309
- ret = do_chunk_alloc(trans, alloc_flags,
9310
- CHUNK_ALLOC_FORCE);
9311
- /*
9312
- * ENOSPC is allowed here, we may have enough space
9313
- * already allocated at the new raid level to
9314
- * carry on
9315
- */
9316
- if (ret == -ENOSPC)
9317
- ret = 0;
9318
- if (ret < 0)
9319
- goto out;
9320
- }
9321
-
9322
- ret = inc_block_group_ro(cache, 0);
9323
- if (!ret)
9324
- goto out;
9325
- alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9326
- ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9327
- if (ret < 0)
9328
- goto out;
9329
- ret = inc_block_group_ro(cache, 0);
9330
-out:
9331
- if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9332
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
9333
- mutex_lock(&fs_info->chunk_mutex);
9334
- check_system_chunk(trans, alloc_flags);
9335
- mutex_unlock(&fs_info->chunk_mutex);
9336
- }
9337
- mutex_unlock(&fs_info->ro_block_group_mutex);
9338
-
9339
- btrfs_end_transaction(trans);
9340
- return ret;
9341
-}
9342
-
9343
-int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
9344
-{
9345
- u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
9346
-
9347
- return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9348
-}
9349
-
93505657 /*
93515658 * helper to account the unused space of all the readonly block group in the
93525659 * space_info. takes mirrors into account.
93535660 */
93545661 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
93555662 {
9356
- struct btrfs_block_group_cache *block_group;
5663
+ struct btrfs_block_group *block_group;
93575664 u64 free_bytes = 0;
93585665 int factor;
93595666
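
The removed update_block_group_flags() above chose which profile a block group should be converted to when balancing against the available device count: striping drops to single and mirroring drops to dup on a one-device filesystem, while dup is promoted back to raid1 when more devices exist. A small bit-flag sketch of that kind of decision table; the PROF_* constants and helper below are invented and are not the btrfs block-group flag values.

#include <stdio.h>

/* Invented profile bits, standing in for block-group type flags. */
#define PROF_SINGLE 0x01u
#define PROF_DUP    0x02u
#define PROF_RAID0  0x04u
#define PROF_RAID1  0x08u
#define PROF_RAID10 0x10u

/* Pick a profile that the current device count can still satisfy. */
static unsigned int convert_profile(unsigned int flags, int num_devices)
{
    if (num_devices == 1) {
        if (flags & PROF_RAID0)
            return PROF_SINGLE;         /* striping -> single  */
        if (flags & (PROF_RAID1 | PROF_RAID10))
            return PROF_DUP;            /* mirroring -> dup    */
    } else {
        if (flags & PROF_DUP)
            return PROF_RAID1;          /* dup -> real mirror  */
    }
    return flags;                       /* leave it alone      */
}

int main(void)
{
    printf("%#x\n", convert_profile(PROF_RAID1, 1)); /* 0x2 */
    printf("%#x\n", convert_profile(PROF_DUP, 3));   /* 0x8 */
    return 0;
}
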
....@@ -9371,1412 +5678,14 @@
93715678 }
93725679
93735680 factor = btrfs_bg_type_to_factor(block_group->flags);
9374
- free_bytes += (block_group->key.offset -
9375
- btrfs_block_group_used(&block_group->item)) *
9376
- factor;
5681
+ free_bytes += (block_group->length -
5682
+ block_group->used) * factor;
93775683
93785684 spin_unlock(&block_group->lock);
93795685 }
93805686 spin_unlock(&sinfo->lock);
93815687
93825688 return free_bytes;
9383
-}
9384
-
9385
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9386
-{
9387
- struct btrfs_space_info *sinfo = cache->space_info;
9388
- u64 num_bytes;
9389
-
9390
- BUG_ON(!cache->ro);
9391
-
9392
- spin_lock(&sinfo->lock);
9393
- spin_lock(&cache->lock);
9394
- if (!--cache->ro) {
9395
- num_bytes = cache->key.offset - cache->reserved -
9396
- cache->pinned - cache->bytes_super -
9397
- btrfs_block_group_used(&cache->item);
9398
- sinfo->bytes_readonly -= num_bytes;
9399
- list_del_init(&cache->ro_list);
9400
- }
9401
- spin_unlock(&cache->lock);
9402
- spin_unlock(&sinfo->lock);
9403
-}
9404
-
9405
-/*
9406
- * checks to see if its even possible to relocate this block group.
9407
- *
9408
- * @return - -1 if it's not a good idea to relocate this block group, 0 if its
9409
- * ok to go ahead and try.
9410
- */
9411
-int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9412
-{
9413
- struct btrfs_root *root = fs_info->extent_root;
9414
- struct btrfs_block_group_cache *block_group;
9415
- struct btrfs_space_info *space_info;
9416
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9417
- struct btrfs_device *device;
9418
- struct btrfs_trans_handle *trans;
9419
- u64 min_free;
9420
- u64 dev_min = 1;
9421
- u64 dev_nr = 0;
9422
- u64 target;
9423
- int debug;
9424
- int index;
9425
- int full = 0;
9426
- int ret = 0;
9427
-
9428
- debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9429
-
9430
- block_group = btrfs_lookup_block_group(fs_info, bytenr);
9431
-
9432
- /* odd, couldn't find the block group, leave it alone */
9433
- if (!block_group) {
9434
- if (debug)
9435
- btrfs_warn(fs_info,
9436
- "can't find block group for bytenr %llu",
9437
- bytenr);
9438
- return -1;
9439
- }
9440
-
9441
- min_free = btrfs_block_group_used(&block_group->item);
9442
-
9443
- /* no bytes used, we're good */
9444
- if (!min_free)
9445
- goto out;
9446
-
9447
- space_info = block_group->space_info;
9448
- spin_lock(&space_info->lock);
9449
-
9450
- full = space_info->full;
9451
-
9452
- /*
9453
- * if this is the last block group we have in this space, we can't
9454
- * relocate it unless we're able to allocate a new chunk below.
9455
- *
9456
- * Otherwise, we need to make sure we have room in the space to handle
9457
- * all of the extents from this block group. If we can, we're good
9458
- */
9459
- if ((space_info->total_bytes != block_group->key.offset) &&
9460
- (btrfs_space_info_used(space_info, false) + min_free <
9461
- space_info->total_bytes)) {
9462
- spin_unlock(&space_info->lock);
9463
- goto out;
9464
- }
9465
- spin_unlock(&space_info->lock);
9466
-
9467
- /*
9468
- * ok we don't have enough space, but maybe we have free space on our
9469
- * devices to allocate new chunks for relocation, so loop through our
9470
- * alloc devices and guess if we have enough space. if this block
9471
- * group is going to be restriped, run checks against the target
9472
- * profile instead of the current one.
9473
- */
9474
- ret = -1;
9475
-
9476
- /*
9477
- * index:
9478
- * 0: raid10
9479
- * 1: raid1
9480
- * 2: dup
9481
- * 3: raid0
9482
- * 4: single
9483
- */
9484
- target = get_restripe_target(fs_info, block_group->flags);
9485
- if (target) {
9486
- index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9487
- } else {
9488
- /*
9489
- * this is just a balance, so if we were marked as full
9490
- * we know there is no space for a new chunk
9491
- */
9492
- if (full) {
9493
- if (debug)
9494
- btrfs_warn(fs_info,
9495
- "no space to alloc new chunk for block group %llu",
9496
- block_group->key.objectid);
9497
- goto out;
9498
- }
9499
-
9500
- index = btrfs_bg_flags_to_raid_index(block_group->flags);
9501
- }
9502
-
9503
- if (index == BTRFS_RAID_RAID10) {
9504
- dev_min = 4;
9505
- /* Divide by 2 */
9506
- min_free >>= 1;
9507
- } else if (index == BTRFS_RAID_RAID1) {
9508
- dev_min = 2;
9509
- } else if (index == BTRFS_RAID_DUP) {
9510
- /* Multiply by 2 */
9511
- min_free <<= 1;
9512
- } else if (index == BTRFS_RAID_RAID0) {
9513
- dev_min = fs_devices->rw_devices;
9514
- min_free = div64_u64(min_free, dev_min);
9515
- }
9516
-
9517
- /* We need to do this so that we can look at pending chunks */
9518
- trans = btrfs_join_transaction(root);
9519
- if (IS_ERR(trans)) {
9520
- ret = PTR_ERR(trans);
9521
- goto out;
9522
- }
9523
-
9524
- mutex_lock(&fs_info->chunk_mutex);
9525
- list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9526
- u64 dev_offset;
9527
-
9528
- /*
9529
- * check to make sure we can actually find a chunk with enough
9530
- * space to fit our block group in.
9531
- */
9532
- if (device->total_bytes > device->bytes_used + min_free &&
9533
- !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
9534
- ret = find_free_dev_extent(trans, device, min_free,
9535
- &dev_offset, NULL);
9536
- if (!ret)
9537
- dev_nr++;
9538
-
9539
- if (dev_nr >= dev_min)
9540
- break;
9541
-
9542
- ret = -1;
9543
- }
9544
- }
9545
- if (debug && ret == -1)
9546
- btrfs_warn(fs_info,
9547
- "no space to allocate a new chunk for block group %llu",
9548
- block_group->key.objectid);
9549
- mutex_unlock(&fs_info->chunk_mutex);
9550
- btrfs_end_transaction(trans);
9551
-out:
9552
- btrfs_put_block_group(block_group);
9553
- return ret;
9554
-}
9555
-
9556
-static int find_first_block_group(struct btrfs_fs_info *fs_info,
9557
- struct btrfs_path *path,
9558
- struct btrfs_key *key)
9559
-{
9560
- struct btrfs_root *root = fs_info->extent_root;
9561
- int ret = 0;
9562
- struct btrfs_key found_key;
9563
- struct extent_buffer *leaf;
9564
- struct btrfs_block_group_item bg;
9565
- u64 flags;
9566
- int slot;
9567
-
9568
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9569
- if (ret < 0)
9570
- goto out;
9571
-
9572
- while (1) {
9573
- slot = path->slots[0];
9574
- leaf = path->nodes[0];
9575
- if (slot >= btrfs_header_nritems(leaf)) {
9576
- ret = btrfs_next_leaf(root, path);
9577
- if (ret == 0)
9578
- continue;
9579
- if (ret < 0)
9580
- goto out;
9581
- break;
9582
- }
9583
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
9584
-
9585
- if (found_key.objectid >= key->objectid &&
9586
- found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9587
- struct extent_map_tree *em_tree;
9588
- struct extent_map *em;
9589
-
9590
- em_tree = &root->fs_info->mapping_tree.map_tree;
9591
- read_lock(&em_tree->lock);
9592
- em = lookup_extent_mapping(em_tree, found_key.objectid,
9593
- found_key.offset);
9594
- read_unlock(&em_tree->lock);
9595
- if (!em) {
9596
- btrfs_err(fs_info,
9597
- "logical %llu len %llu found bg but no related chunk",
9598
- found_key.objectid, found_key.offset);
9599
- ret = -ENOENT;
9600
- } else if (em->start != found_key.objectid ||
9601
- em->len != found_key.offset) {
9602
- btrfs_err(fs_info,
9603
- "block group %llu len %llu mismatch with chunk %llu len %llu",
9604
- found_key.objectid, found_key.offset,
9605
- em->start, em->len);
9606
- ret = -EUCLEAN;
9607
- } else {
9608
- read_extent_buffer(leaf, &bg,
9609
- btrfs_item_ptr_offset(leaf, slot),
9610
- sizeof(bg));
9611
- flags = btrfs_block_group_flags(&bg) &
9612
- BTRFS_BLOCK_GROUP_TYPE_MASK;
9613
-
9614
- if (flags != (em->map_lookup->type &
9615
- BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9616
- btrfs_err(fs_info,
9617
-"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
9618
- found_key.objectid,
9619
- found_key.offset, flags,
9620
- (BTRFS_BLOCK_GROUP_TYPE_MASK &
9621
- em->map_lookup->type));
9622
- ret = -EUCLEAN;
9623
- } else {
9624
- ret = 0;
9625
- }
9626
- }
9627
- free_extent_map(em);
9628
- goto out;
9629
- }
9630
- path->slots[0]++;
9631
- }
9632
-out:
9633
- return ret;
9634
-}
9635
-
9636
-void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9637
-{
9638
- struct btrfs_block_group_cache *block_group;
9639
- u64 last = 0;
9640
-
9641
- while (1) {
9642
- struct inode *inode;
9643
-
9644
- block_group = btrfs_lookup_first_block_group(info, last);
9645
- while (block_group) {
9646
- wait_block_group_cache_done(block_group);
9647
- spin_lock(&block_group->lock);
9648
- if (block_group->iref)
9649
- break;
9650
- spin_unlock(&block_group->lock);
9651
- block_group = next_block_group(info, block_group);
9652
- }
9653
- if (!block_group) {
9654
- if (last == 0)
9655
- break;
9656
- last = 0;
9657
- continue;
9658
- }
9659
-
9660
- inode = block_group->inode;
9661
- block_group->iref = 0;
9662
- block_group->inode = NULL;
9663
- spin_unlock(&block_group->lock);
9664
- ASSERT(block_group->io_ctl.inode == NULL);
9665
- iput(inode);
9666
- last = block_group->key.objectid + block_group->key.offset;
9667
- btrfs_put_block_group(block_group);
9668
- }
9669
-}
9670
-
9671
-/*
9672
- * Must be called only after stopping all workers, since we could have block
9673
- * group caching kthreads running, and therefore they could race with us if we
9674
- * freed the block groups before stopping them.
9675
- */
9676
-int btrfs_free_block_groups(struct btrfs_fs_info *info)
9677
-{
9678
- struct btrfs_block_group_cache *block_group;
9679
- struct btrfs_space_info *space_info;
9680
- struct btrfs_caching_control *caching_ctl;
9681
- struct rb_node *n;
9682
-
9683
- down_write(&info->commit_root_sem);
9684
- while (!list_empty(&info->caching_block_groups)) {
9685
- caching_ctl = list_entry(info->caching_block_groups.next,
9686
- struct btrfs_caching_control, list);
9687
- list_del(&caching_ctl->list);
9688
- put_caching_control(caching_ctl);
9689
- }
9690
- up_write(&info->commit_root_sem);
9691
-
9692
- spin_lock(&info->unused_bgs_lock);
9693
- while (!list_empty(&info->unused_bgs)) {
9694
- block_group = list_first_entry(&info->unused_bgs,
9695
- struct btrfs_block_group_cache,
9696
- bg_list);
9697
- list_del_init(&block_group->bg_list);
9698
- btrfs_put_block_group(block_group);
9699
- }
9700
- spin_unlock(&info->unused_bgs_lock);
9701
-
9702
- spin_lock(&info->block_group_cache_lock);
9703
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9704
- block_group = rb_entry(n, struct btrfs_block_group_cache,
9705
- cache_node);
9706
- rb_erase(&block_group->cache_node,
9707
- &info->block_group_cache_tree);
9708
- RB_CLEAR_NODE(&block_group->cache_node);
9709
- spin_unlock(&info->block_group_cache_lock);
9710
-
9711
- down_write(&block_group->space_info->groups_sem);
9712
- list_del(&block_group->list);
9713
- up_write(&block_group->space_info->groups_sem);
9714
-
9715
- /*
9716
- * We haven't cached this block group, which means we could
9717
- * possibly have excluded extents on this block group.
9718
- */
9719
- if (block_group->cached == BTRFS_CACHE_NO ||
9720
- block_group->cached == BTRFS_CACHE_ERROR)
9721
- free_excluded_extents(block_group);
9722
-
9723
- btrfs_remove_free_space_cache(block_group);
9724
- ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
9725
- ASSERT(list_empty(&block_group->dirty_list));
9726
- ASSERT(list_empty(&block_group->io_list));
9727
- ASSERT(list_empty(&block_group->bg_list));
9728
- ASSERT(atomic_read(&block_group->count) == 1);
9729
- btrfs_put_block_group(block_group);
9730
-
9731
- spin_lock(&info->block_group_cache_lock);
9732
- }
9733
- spin_unlock(&info->block_group_cache_lock);
9734
-
9735
- /* now that all the block groups are freed, go through and
9736
- * free all the space_info structs. This is only called during
9737
- * the final stages of unmount, and so we know nobody is
9738
- * using them. We call synchronize_rcu() once before we start,
9739
- * just to be on the safe side.
9740
- */
9741
- synchronize_rcu();
9742
-
9743
- release_global_block_rsv(info);
9744
-
9745
- while (!list_empty(&info->space_info)) {
9746
- int i;
9747
-
9748
- space_info = list_entry(info->space_info.next,
9749
- struct btrfs_space_info,
9750
- list);
9751
-
9752
- /*
9753
- * Do not hide this behind enospc_debug, this is actually
9754
- * important and indicates a real bug if this happens.
9755
- */
9756
- if (WARN_ON(space_info->bytes_pinned > 0 ||
9757
- space_info->bytes_reserved > 0 ||
9758
- space_info->bytes_may_use > 0))
9759
- dump_space_info(info, space_info, 0, 0);
9760
- list_del(&space_info->list);
9761
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9762
- struct kobject *kobj;
9763
- kobj = space_info->block_group_kobjs[i];
9764
- space_info->block_group_kobjs[i] = NULL;
9765
- if (kobj) {
9766
- kobject_del(kobj);
9767
- kobject_put(kobj);
9768
- }
9769
- }
9770
- kobject_del(&space_info->kobj);
9771
- kobject_put(&space_info->kobj);
9772
- }
9773
- return 0;
9774
-}
9775
-
9776
-/* link_block_group will queue up kobjects to add when we're reclaim-safe */
9777
-void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
9778
-{
9779
- struct btrfs_space_info *space_info;
9780
- struct raid_kobject *rkobj;
9781
- LIST_HEAD(list);
9782
- int index;
9783
- int ret = 0;
9784
-
9785
- spin_lock(&fs_info->pending_raid_kobjs_lock);
9786
- list_splice_init(&fs_info->pending_raid_kobjs, &list);
9787
- spin_unlock(&fs_info->pending_raid_kobjs_lock);
9788
-
9789
- list_for_each_entry(rkobj, &list, list) {
9790
- space_info = __find_space_info(fs_info, rkobj->flags);
9791
- index = btrfs_bg_flags_to_raid_index(rkobj->flags);
9792
-
9793
- ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9794
- "%s", get_raid_name(index));
9795
- if (ret) {
9796
- kobject_put(&rkobj->kobj);
9797
- break;
9798
- }
9799
- }
9800
- if (ret)
9801
- btrfs_warn(fs_info,
9802
- "failed to add kobject for block cache, ignoring");
9803
-}
9804
-
9805
-static void link_block_group(struct btrfs_block_group_cache *cache)
9806
-{
9807
- struct btrfs_space_info *space_info = cache->space_info;
9808
- struct btrfs_fs_info *fs_info = cache->fs_info;
9809
- int index = btrfs_bg_flags_to_raid_index(cache->flags);
9810
- bool first = false;
9811
-
9812
- down_write(&space_info->groups_sem);
9813
- if (list_empty(&space_info->block_groups[index]))
9814
- first = true;
9815
- list_add_tail(&cache->list, &space_info->block_groups[index]);
9816
- up_write(&space_info->groups_sem);
9817
-
9818
- if (first) {
9819
- struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9820
- if (!rkobj) {
9821
- btrfs_warn(cache->fs_info,
9822
- "couldn't alloc memory for raid level kobject");
9823
- return;
9824
- }
9825
- rkobj->flags = cache->flags;
9826
- kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9827
-
9828
- spin_lock(&fs_info->pending_raid_kobjs_lock);
9829
- list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
9830
- spin_unlock(&fs_info->pending_raid_kobjs_lock);
9831
- space_info->block_group_kobjs[index] = &rkobj->kobj;
9832
- }
9833
-}
9834
-
9835
-static struct btrfs_block_group_cache *
9836
-btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
9837
- u64 start, u64 size)
9838
-{
9839
- struct btrfs_block_group_cache *cache;
9840
-
9841
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
9842
- if (!cache)
9843
- return NULL;
9844
-
9845
- cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9846
- GFP_NOFS);
9847
- if (!cache->free_space_ctl) {
9848
- kfree(cache);
9849
- return NULL;
9850
- }
9851
-
9852
- cache->key.objectid = start;
9853
- cache->key.offset = size;
9854
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9855
-
9856
- cache->fs_info = fs_info;
9857
- cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
9858
- set_free_space_tree_thresholds(cache);
9859
-
9860
- atomic_set(&cache->count, 1);
9861
- spin_lock_init(&cache->lock);
9862
- init_rwsem(&cache->data_rwsem);
9863
- INIT_LIST_HEAD(&cache->list);
9864
- INIT_LIST_HEAD(&cache->cluster_list);
9865
- INIT_LIST_HEAD(&cache->bg_list);
9866
- INIT_LIST_HEAD(&cache->ro_list);
9867
- INIT_LIST_HEAD(&cache->dirty_list);
9868
- INIT_LIST_HEAD(&cache->io_list);
9869
- btrfs_init_free_space_ctl(cache);
9870
- atomic_set(&cache->trimming, 0);
9871
- mutex_init(&cache->free_space_lock);
9872
- btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
9873
-
9874
- return cache;
9875
-}
9876
-
9877
-
9878
-/*
9879
- * Iterate all chunks and verify that each of them has the corresponding block
9880
- * group
9881
- */
9882
-static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
9883
-{
9884
- struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
9885
- struct extent_map *em;
9886
- struct btrfs_block_group_cache *bg;
9887
- u64 start = 0;
9888
- int ret = 0;
9889
-
9890
- while (1) {
9891
- read_lock(&map_tree->map_tree.lock);
9892
- /*
9893
- * lookup_extent_mapping will return the first extent map
9894
- * intersecting the range, so setting @len to 1 is enough to
9895
- * get the first chunk.
9896
- */
9897
- em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
9898
- read_unlock(&map_tree->map_tree.lock);
9899
- if (!em)
9900
- break;
9901
-
9902
- bg = btrfs_lookup_block_group(fs_info, em->start);
9903
- if (!bg) {
9904
- btrfs_err(fs_info,
9905
- "chunk start=%llu len=%llu doesn't have corresponding block group",
9906
- em->start, em->len);
9907
- ret = -EUCLEAN;
9908
- free_extent_map(em);
9909
- break;
9910
- }
9911
- if (bg->key.objectid != em->start ||
9912
- bg->key.offset != em->len ||
9913
- (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
9914
- (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9915
- btrfs_err(fs_info,
9916
-"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
9917
- em->start, em->len,
9918
- em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
9919
- bg->key.objectid, bg->key.offset,
9920
- bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
9921
- ret = -EUCLEAN;
9922
- free_extent_map(em);
9923
- btrfs_put_block_group(bg);
9924
- break;
9925
- }
9926
- start = em->start + em->len;
9927
- free_extent_map(em);
9928
- btrfs_put_block_group(bg);
9929
- }
9930
- return ret;
9931
-}
9932
-
9933
-int btrfs_read_block_groups(struct btrfs_fs_info *info)
9934
-{
9935
- struct btrfs_path *path;
9936
- int ret;
9937
- struct btrfs_block_group_cache *cache;
9938
- struct btrfs_space_info *space_info;
9939
- struct btrfs_key key;
9940
- struct btrfs_key found_key;
9941
- struct extent_buffer *leaf;
9942
- int need_clear = 0;
9943
- u64 cache_gen;
9944
- u64 feature;
9945
- int mixed;
9946
-
9947
- feature = btrfs_super_incompat_flags(info->super_copy);
9948
- mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
9949
-
9950
- key.objectid = 0;
9951
- key.offset = 0;
9952
- key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9953
- path = btrfs_alloc_path();
9954
- if (!path)
9955
- return -ENOMEM;
9956
- path->reada = READA_FORWARD;
9957
-
9958
- cache_gen = btrfs_super_cache_generation(info->super_copy);
9959
- if (btrfs_test_opt(info, SPACE_CACHE) &&
9960
- btrfs_super_generation(info->super_copy) != cache_gen)
9961
- need_clear = 1;
9962
- if (btrfs_test_opt(info, CLEAR_CACHE))
9963
- need_clear = 1;
9964
-
9965
- while (1) {
9966
- ret = find_first_block_group(info, path, &key);
9967
- if (ret > 0)
9968
- break;
9969
- if (ret != 0)
9970
- goto error;
9971
-
9972
- leaf = path->nodes[0];
9973
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9974
-
9975
- cache = btrfs_create_block_group_cache(info, found_key.objectid,
9976
- found_key.offset);
9977
- if (!cache) {
9978
- ret = -ENOMEM;
9979
- goto error;
9980
- }
9981
-
9982
- if (need_clear) {
9983
- /*
9984
- * When we mount with old space cache, we need to
9985
- * set BTRFS_DC_CLEAR and set dirty flag.
9986
- *
9987
- * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
9988
- * truncate the old free space cache inode and
9989
- * setup a new one.
9990
- * b) Setting 'dirty flag' makes sure that we flush
9991
- * the new space cache info onto disk.
9992
- */
9993
- if (btrfs_test_opt(info, SPACE_CACHE))
9994
- cache->disk_cache_state = BTRFS_DC_CLEAR;
9995
- }
9996
-
9997
- read_extent_buffer(leaf, &cache->item,
9998
- btrfs_item_ptr_offset(leaf, path->slots[0]),
9999
- sizeof(cache->item));
10000
- cache->flags = btrfs_block_group_flags(&cache->item);
10001
- if (!mixed &&
10002
- ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10003
- (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10004
- btrfs_err(info,
10005
-"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10006
- cache->key.objectid);
10007
- btrfs_put_block_group(cache);
10008
- ret = -EINVAL;
10009
- goto error;
10010
- }
10011
-
10012
- key.objectid = found_key.objectid + found_key.offset;
10013
- btrfs_release_path(path);
10014
-
10015
- /*
10016
- * We need to exclude the super stripes now so that the space
10017
- * info has super bytes accounted for, otherwise we'll think
10018
- * we have more space than we actually do.
10019
- */
10020
- ret = exclude_super_stripes(cache);
10021
- if (ret) {
10022
- /*
10023
- * We may have excluded something, so call this just in
10024
- * case.
10025
- */
10026
- free_excluded_extents(cache);
10027
- btrfs_put_block_group(cache);
10028
- goto error;
10029
- }
10030
-
10031
- /*
10032
- * check for two cases, either we are full, and therefore
10033
- * don't need to bother with the caching work since we won't
10034
- * find any space, or we are empty, and we can just add all
10035
- * the space in and be done with it. This saves us _alot_ of
10036
- * time, particularly in the full case.
10037
- */
10038
- if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10039
- cache->last_byte_to_unpin = (u64)-1;
10040
- cache->cached = BTRFS_CACHE_FINISHED;
10041
- free_excluded_extents(cache);
10042
- } else if (btrfs_block_group_used(&cache->item) == 0) {
10043
- cache->last_byte_to_unpin = (u64)-1;
10044
- cache->cached = BTRFS_CACHE_FINISHED;
10045
- add_new_free_space(cache, found_key.objectid,
10046
- found_key.objectid +
10047
- found_key.offset);
10048
- free_excluded_extents(cache);
10049
- }
10050
-
10051
- ret = btrfs_add_block_group_cache(info, cache);
10052
- if (ret) {
10053
- btrfs_remove_free_space_cache(cache);
10054
- btrfs_put_block_group(cache);
10055
- goto error;
10056
- }
10057
-
10058
- trace_btrfs_add_block_group(info, cache, 0);
10059
- update_space_info(info, cache->flags, found_key.offset,
10060
- btrfs_block_group_used(&cache->item),
10061
- cache->bytes_super, &space_info);
10062
-
10063
- cache->space_info = space_info;
10064
-
10065
- link_block_group(cache);
10066
-
10067
- set_avail_alloc_bits(info, cache->flags);
10068
- if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10069
- inc_block_group_ro(cache, 1);
10070
- } else if (btrfs_block_group_used(&cache->item) == 0) {
10071
- ASSERT(list_empty(&cache->bg_list));
10072
- btrfs_mark_bg_unused(cache);
10073
- }
10074
- }
10075
-
10076
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
10077
- if (!(get_alloc_profile(info, space_info->flags) &
10078
- (BTRFS_BLOCK_GROUP_RAID10 |
10079
- BTRFS_BLOCK_GROUP_RAID1 |
10080
- BTRFS_BLOCK_GROUP_RAID5 |
10081
- BTRFS_BLOCK_GROUP_RAID6 |
10082
- BTRFS_BLOCK_GROUP_DUP)))
10083
- continue;
10084
- /*
10085
- * avoid allocating from un-mirrored block groups if there are
10086
- * mirrored block groups.
10087
- */
10088
- list_for_each_entry(cache,
10089
- &space_info->block_groups[BTRFS_RAID_RAID0],
10090
- list)
10091
- inc_block_group_ro(cache, 1);
10092
- list_for_each_entry(cache,
10093
- &space_info->block_groups[BTRFS_RAID_SINGLE],
10094
- list)
10095
- inc_block_group_ro(cache, 1);
10096
- }
10097
-
10098
- btrfs_add_raid_kobjects(info);
10099
- init_global_block_rsv(info);
10100
- ret = check_chunk_block_group_mappings(info);
10101
-error:
10102
- btrfs_free_path(path);
10103
- return ret;
10104
-}
10105
-
10106
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10107
-{
10108
- struct btrfs_fs_info *fs_info = trans->fs_info;
10109
- struct btrfs_block_group_cache *block_group;
10110
- struct btrfs_root *extent_root = fs_info->extent_root;
10111
- struct btrfs_block_group_item item;
10112
- struct btrfs_key key;
10113
- int ret = 0;
10114
-
10115
- if (!trans->can_flush_pending_bgs)
10116
- return;
10117
-
10118
- while (!list_empty(&trans->new_bgs)) {
10119
- block_group = list_first_entry(&trans->new_bgs,
10120
- struct btrfs_block_group_cache,
10121
- bg_list);
10122
- if (ret)
10123
- goto next;
10124
-
10125
- spin_lock(&block_group->lock);
10126
- memcpy(&item, &block_group->item, sizeof(item));
10127
- memcpy(&key, &block_group->key, sizeof(key));
10128
- spin_unlock(&block_group->lock);
10129
-
10130
- ret = btrfs_insert_item(trans, extent_root, &key, &item,
10131
- sizeof(item));
10132
- if (ret)
10133
- btrfs_abort_transaction(trans, ret);
10134
- ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10135
- if (ret)
10136
- btrfs_abort_transaction(trans, ret);
10137
- add_block_group_free_space(trans, block_group);
10138
- /* already aborted the transaction if it failed. */
10139
-next:
10140
- list_del_init(&block_group->bg_list);
10141
- }
10142
- btrfs_trans_release_chunk_metadata(trans);
10143
-}
10144
-
10145
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10146
- u64 type, u64 chunk_offset, u64 size)
10147
-{
10148
- struct btrfs_fs_info *fs_info = trans->fs_info;
10149
- struct btrfs_block_group_cache *cache;
10150
- int ret;
10151
-
10152
- btrfs_set_log_full_commit(fs_info, trans);
10153
-
10154
- cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10155
- if (!cache)
10156
- return -ENOMEM;
10157
-
10158
- btrfs_set_block_group_used(&cache->item, bytes_used);
10159
- btrfs_set_block_group_chunk_objectid(&cache->item,
10160
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10161
- btrfs_set_block_group_flags(&cache->item, type);
10162
-
10163
- cache->flags = type;
10164
- cache->last_byte_to_unpin = (u64)-1;
10165
- cache->cached = BTRFS_CACHE_FINISHED;
10166
- cache->needs_free_space = 1;
10167
- ret = exclude_super_stripes(cache);
10168
- if (ret) {
10169
- /*
10170
- * We may have excluded something, so call this just in
10171
- * case.
10172
- */
10173
- free_excluded_extents(cache);
10174
- btrfs_put_block_group(cache);
10175
- return ret;
10176
- }
10177
-
10178
- add_new_free_space(cache, chunk_offset, chunk_offset + size);
10179
-
10180
- free_excluded_extents(cache);
10181
-
10182
-#ifdef CONFIG_BTRFS_DEBUG
10183
- if (btrfs_should_fragment_free_space(cache)) {
10184
- u64 new_bytes_used = size - bytes_used;
10185
-
10186
- bytes_used += new_bytes_used >> 1;
10187
- fragment_free_space(cache);
10188
- }
10189
-#endif
10190
- /*
10191
- * Ensure the corresponding space_info object is created and
10192
- * assigned to our block group. We want our bg to be added to the rbtree
10193
- * with its ->space_info set.
10194
- */
10195
- cache->space_info = __find_space_info(fs_info, cache->flags);
10196
- ASSERT(cache->space_info);
10197
-
10198
- ret = btrfs_add_block_group_cache(fs_info, cache);
10199
- if (ret) {
10200
- btrfs_remove_free_space_cache(cache);
10201
- btrfs_put_block_group(cache);
10202
- return ret;
10203
- }
10204
-
10205
- /*
10206
- * Now that our block group has its ->space_info set and is inserted in
10207
- * the rbtree, update the space info's counters.
10208
- */
10209
- trace_btrfs_add_block_group(fs_info, cache, 1);
10210
- update_space_info(fs_info, cache->flags, size, bytes_used,
10211
- cache->bytes_super, &cache->space_info);
10212
- update_global_block_rsv(fs_info);
10213
-
10214
- link_block_group(cache);
10215
-
10216
- list_add_tail(&cache->bg_list, &trans->new_bgs);
10217
-
10218
- set_avail_alloc_bits(fs_info, type);
10219
- return 0;
10220
-}
10221
-
10222
-static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10223
-{
10224
- u64 extra_flags = chunk_to_extended(flags) &
10225
- BTRFS_EXTENDED_PROFILE_MASK;
10226
-
10227
- write_seqlock(&fs_info->profiles_lock);
10228
- if (flags & BTRFS_BLOCK_GROUP_DATA)
10229
- fs_info->avail_data_alloc_bits &= ~extra_flags;
10230
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
10231
- fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10232
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10233
- fs_info->avail_system_alloc_bits &= ~extra_flags;
10234
- write_sequnlock(&fs_info->profiles_lock);
10235
-}
10236
-
10237
-int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10238
- u64 group_start, struct extent_map *em)
10239
-{
10240
- struct btrfs_fs_info *fs_info = trans->fs_info;
10241
- struct btrfs_root *root = fs_info->extent_root;
10242
- struct btrfs_path *path;
10243
- struct btrfs_block_group_cache *block_group;
10244
- struct btrfs_free_cluster *cluster;
10245
- struct btrfs_root *tree_root = fs_info->tree_root;
10246
- struct btrfs_key key;
10247
- struct inode *inode;
10248
- struct kobject *kobj = NULL;
10249
- int ret;
10250
- int index;
10251
- int factor;
10252
- struct btrfs_caching_control *caching_ctl = NULL;
10253
- bool remove_em;
10254
-
10255
- block_group = btrfs_lookup_block_group(fs_info, group_start);
10256
- BUG_ON(!block_group);
10257
- BUG_ON(!block_group->ro);
10258
-
10259
- trace_btrfs_remove_block_group(block_group);
10260
- /*
10261
- * Free the reserved super bytes from this block group before
10262
- * removing it.
10263
- */
10264
- free_excluded_extents(block_group);
10265
- btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10266
- block_group->key.offset);
10267
-
10268
- memcpy(&key, &block_group->key, sizeof(key));
10269
- index = btrfs_bg_flags_to_raid_index(block_group->flags);
10270
- factor = btrfs_bg_type_to_factor(block_group->flags);
10271
-
10272
- /* make sure this block group isn't part of an allocation cluster */
10273
- cluster = &fs_info->data_alloc_cluster;
10274
- spin_lock(&cluster->refill_lock);
10275
- btrfs_return_cluster_to_free_space(block_group, cluster);
10276
- spin_unlock(&cluster->refill_lock);
10277
-
10278
- /*
10279
- * make sure this block group isn't part of a metadata
10280
- * allocation cluster
10281
- */
10282
- cluster = &fs_info->meta_alloc_cluster;
10283
- spin_lock(&cluster->refill_lock);
10284
- btrfs_return_cluster_to_free_space(block_group, cluster);
10285
- spin_unlock(&cluster->refill_lock);
10286
-
10287
- path = btrfs_alloc_path();
10288
- if (!path) {
10289
- ret = -ENOMEM;
10290
- goto out;
10291
- }
10292
-
10293
- /*
10294
- * get the inode first so any iput calls done for the io_list
10295
- * aren't the final iput (no unlinks allowed now)
10296
- */
10297
- inode = lookup_free_space_inode(fs_info, block_group, path);
10298
-
10299
- mutex_lock(&trans->transaction->cache_write_mutex);
10300
- /*
10301
- * make sure our free space cache IO is done before removing the
10302
- * free space inode
10303
- */
10304
- spin_lock(&trans->transaction->dirty_bgs_lock);
10305
- if (!list_empty(&block_group->io_list)) {
10306
- list_del_init(&block_group->io_list);
10307
-
10308
- WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10309
-
10310
- spin_unlock(&trans->transaction->dirty_bgs_lock);
10311
- btrfs_wait_cache_io(trans, block_group, path);
10312
- btrfs_put_block_group(block_group);
10313
- spin_lock(&trans->transaction->dirty_bgs_lock);
10314
- }
10315
-
10316
- if (!list_empty(&block_group->dirty_list)) {
10317
- list_del_init(&block_group->dirty_list);
10318
- btrfs_put_block_group(block_group);
10319
- }
10320
- spin_unlock(&trans->transaction->dirty_bgs_lock);
10321
- mutex_unlock(&trans->transaction->cache_write_mutex);
10322
-
10323
- if (!IS_ERR(inode)) {
10324
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10325
- if (ret) {
10326
- btrfs_add_delayed_iput(inode);
10327
- goto out;
10328
- }
10329
- clear_nlink(inode);
10330
- /* One for the block groups ref */
10331
- spin_lock(&block_group->lock);
10332
- if (block_group->iref) {
10333
- block_group->iref = 0;
10334
- block_group->inode = NULL;
10335
- spin_unlock(&block_group->lock);
10336
- iput(inode);
10337
- } else {
10338
- spin_unlock(&block_group->lock);
10339
- }
10340
- /* One for our lookup ref */
10341
- btrfs_add_delayed_iput(inode);
10342
- }
10343
-
10344
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10345
- key.offset = block_group->key.objectid;
10346
- key.type = 0;
10347
-
10348
- ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10349
- if (ret < 0)
10350
- goto out;
10351
- if (ret > 0)
10352
- btrfs_release_path(path);
10353
- if (ret == 0) {
10354
- ret = btrfs_del_item(trans, tree_root, path);
10355
- if (ret)
10356
- goto out;
10357
- btrfs_release_path(path);
10358
- }
10359
-
10360
- spin_lock(&fs_info->block_group_cache_lock);
10361
- rb_erase(&block_group->cache_node,
10362
- &fs_info->block_group_cache_tree);
10363
- RB_CLEAR_NODE(&block_group->cache_node);
10364
-
10365
- /* Once for the block groups rbtree */
10366
- btrfs_put_block_group(block_group);
10367
-
10368
- if (fs_info->first_logical_byte == block_group->key.objectid)
10369
- fs_info->first_logical_byte = (u64)-1;
10370
- spin_unlock(&fs_info->block_group_cache_lock);
10371
-
10372
- down_write(&block_group->space_info->groups_sem);
10373
- /*
10374
- * we must use list_del_init so people can check to see if they
10375
- * are still on the list after taking the semaphore
10376
- */
10377
- list_del_init(&block_group->list);
10378
- if (list_empty(&block_group->space_info->block_groups[index])) {
10379
- kobj = block_group->space_info->block_group_kobjs[index];
10380
- block_group->space_info->block_group_kobjs[index] = NULL;
10381
- clear_avail_alloc_bits(fs_info, block_group->flags);
10382
- }
10383
- up_write(&block_group->space_info->groups_sem);
10384
- if (kobj) {
10385
- kobject_del(kobj);
10386
- kobject_put(kobj);
10387
- }
10388
-
10389
- if (block_group->has_caching_ctl)
10390
- caching_ctl = get_caching_control(block_group);
10391
- if (block_group->cached == BTRFS_CACHE_STARTED)
10392
- wait_block_group_cache_done(block_group);
10393
- if (block_group->has_caching_ctl) {
10394
- down_write(&fs_info->commit_root_sem);
10395
- if (!caching_ctl) {
10396
- struct btrfs_caching_control *ctl;
10397
-
10398
- list_for_each_entry(ctl,
10399
- &fs_info->caching_block_groups, list)
10400
- if (ctl->block_group == block_group) {
10401
- caching_ctl = ctl;
10402
- refcount_inc(&caching_ctl->count);
10403
- break;
10404
- }
10405
- }
10406
- if (caching_ctl)
10407
- list_del_init(&caching_ctl->list);
10408
- up_write(&fs_info->commit_root_sem);
10409
- if (caching_ctl) {
10410
- /* Once for the caching bgs list and once for us. */
10411
- put_caching_control(caching_ctl);
10412
- put_caching_control(caching_ctl);
10413
- }
10414
- }
10415
-
10416
- spin_lock(&trans->transaction->dirty_bgs_lock);
10417
- if (!list_empty(&block_group->dirty_list)) {
10418
- WARN_ON(1);
10419
- }
10420
- if (!list_empty(&block_group->io_list)) {
10421
- WARN_ON(1);
10422
- }
10423
- spin_unlock(&trans->transaction->dirty_bgs_lock);
10424
- btrfs_remove_free_space_cache(block_group);
10425
-
10426
- spin_lock(&block_group->space_info->lock);
10427
- list_del_init(&block_group->ro_list);
10428
-
10429
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10430
- WARN_ON(block_group->space_info->total_bytes
10431
- < block_group->key.offset);
10432
- WARN_ON(block_group->space_info->bytes_readonly
10433
- < block_group->key.offset);
10434
- WARN_ON(block_group->space_info->disk_total
10435
- < block_group->key.offset * factor);
10436
- }
10437
- block_group->space_info->total_bytes -= block_group->key.offset;
10438
- block_group->space_info->bytes_readonly -= block_group->key.offset;
10439
- block_group->space_info->disk_total -= block_group->key.offset * factor;
10440
-
10441
- spin_unlock(&block_group->space_info->lock);
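
/*
 * Rough sketch of the "factor" used just above when shrinking the space_info
 * counters: profiles that keep two copies of every byte (DUP, RAID1, RAID10)
 * occupy twice the block group's logical size on disk, so disk_total drops by
 * size * factor.  The flag values and mapping below are illustrative, not the
 * kernel's btrfs_bg_type_to_factor() itself.
 */
#include <stdint.h>
#include <stdio.h>

#define BG_RAID1   (1ULL << 4)
#define BG_DUP     (1ULL << 5)
#define BG_RAID10  (1ULL << 6)

static uint64_t bg_factor(uint64_t flags)
{
	if (flags & (BG_DUP | BG_RAID1 | BG_RAID10))
		return 2;	/* two copies of each byte */
	return 1;		/* single/RAID0; parity profiles not modelled here */
}

static uint64_t disk_bytes_freed(uint64_t size, uint64_t flags)
{
	return size * bg_factor(flags);
}

int main(void)
{
	/* Removing a 1 GiB RAID1 block group frees 2 GiB of raw disk space. */
	printf("raw bytes freed: %llu\n",
	       (unsigned long long)disk_bytes_freed(1ULL << 30, BG_RAID1));
	return 0;
}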
10442
-
10443
- memcpy(&key, &block_group->key, sizeof(key));
10444
-
10445
- mutex_lock(&fs_info->chunk_mutex);
10446
- if (!list_empty(&em->list)) {
10447
- /* We're in the transaction->pending_chunks list. */
10448
- free_extent_map(em);
10449
- }
10450
- spin_lock(&block_group->lock);
10451
- block_group->removed = 1;
10452
- /*
10453
- * At this point trimming can't start on this block group, because we
10454
- * removed the block group from the tree fs_info->block_group_cache_tree
10455
- * so no one can find it anymore, and even if someone already got this
10456
- * block group before we removed it from the rbtree, they have already
10457
- * incremented block_group->trimming - if they didn't, they won't find
10458
- * any free space entries because we already removed them all when we
10459
- * called btrfs_remove_free_space_cache().
10460
- *
10461
- * And we must not remove the extent map from the fs_info->mapping_tree
10462
- * to prevent the same logical address range and physical device space
10463
- * ranges from being reused for a new block group. This is because our
10464
- * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10465
- * completely transactionless, so while it is trimming a range the
10466
- * currently running transaction might finish and a new one start,
10467
- * allowing for new block groups to be created that can reuse the same
10468
- * physical device locations unless we take this special care.
10469
- *
10470
- * There may also be an implicit trim operation if the file system
10471
- * is mounted with -odiscard. The same protections must remain
10472
- * in place until the extents have been discarded completely when
10473
- * the transaction commit has completed.
10474
- */
10475
- remove_em = (atomic_read(&block_group->trimming) == 0);
10476
- /*
10477
- * Make sure a trimmer task always sees the em in the pinned_chunks list
10478
- * if it sees block_group->removed == 1 (needs to lock block_group->lock
10479
- * before checking block_group->removed).
10480
- */
10481
- if (!remove_em) {
10482
- /*
10483
- * Our em might be in trans->transaction->pending_chunks which
10484
- * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10485
- * and so is the fs_info->pinned_chunks list.
10486
- *
10487
- * So at this point we must be holding the chunk_mutex to avoid
10488
- * any races with chunk allocation (more specifically at
10489
- * volumes.c:contains_pending_extent()), to ensure it always
10490
- * sees the em, either in the pending_chunks list or in the
10491
- * pinned_chunks list.
10492
- */
10493
- list_move_tail(&em->list, &fs_info->pinned_chunks);
10494
- }
10495
- spin_unlock(&block_group->lock);
10496
-
10497
- mutex_unlock(&fs_info->chunk_mutex);
10498
-
10499
- ret = remove_block_group_free_space(trans, block_group);
10500
- if (ret)
10501
- goto out;
10502
-
10503
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10504
- if (ret > 0)
10505
- ret = -EIO;
10506
- if (ret < 0)
10507
- goto out;
10508
-
10509
- ret = btrfs_del_item(trans, root, path);
10510
- if (ret)
10511
- goto out;
10512
-
10513
- if (remove_em) {
10514
- struct extent_map_tree *em_tree;
10515
-
10516
- em_tree = &fs_info->mapping_tree.map_tree;
10517
- write_lock(&em_tree->lock);
10518
- /*
10519
- * The em might be in the pending_chunks list, so make sure the
10520
- * chunk mutex is locked, since remove_extent_mapping() will
10521
- * delete us from that list.
10522
- */
10523
- remove_extent_mapping(em_tree, em);
10524
- write_unlock(&em_tree->lock);
10525
- /* once for the tree */
10526
- free_extent_map(em);
10527
- }
10528
-
10529
-out:
10530
- /* Once for the lookup reference */
10531
- btrfs_put_block_group(block_group);
10532
- btrfs_free_path(path);
10533
- return ret;
10534
-}
10535
-
10536
-struct btrfs_trans_handle *
10537
-btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10538
- const u64 chunk_offset)
10539
-{
10540
- struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10541
- struct extent_map *em;
10542
- struct map_lookup *map;
10543
- unsigned int num_items;
10544
-
10545
- read_lock(&em_tree->lock);
10546
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10547
- read_unlock(&em_tree->lock);
10548
- ASSERT(em && em->start == chunk_offset);
10549
-
10550
- /*
10551
- * We need to reserve 3 + N units from the metadata space info in order
10552
- * to remove a block group (done at btrfs_remove_chunk() and at
10553
- * btrfs_remove_block_group()), which are used for:
10554
- *
10555
- * 1 unit for adding the free space inode's orphan (located in the tree
10556
- * of tree roots).
10557
- * 1 unit for deleting the block group item (located in the extent
10558
- * tree).
10559
- * 1 unit for deleting the free space item (located in tree of tree
10560
- * roots).
10561
- * N units for deleting N device extent items corresponding to each
10562
- * stripe (located in the device tree).
10563
- *
10564
- * In order to remove a block group we also need to reserve units in the
10565
- * system space info in order to update the chunk tree (update one or
10566
- * more device items and remove one chunk item), but this is done at
10567
- * btrfs_remove_chunk() through a call to check_system_chunk().
10568
- */
10569
- map = em->map_lookup;
10570
- num_items = 3 + map->num_stripes;
10571
- free_extent_map(em);
10572
-
10573
- return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10574
- num_items, 1);
10575
-}
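
/*
 * Worked example for the reservation math in the comment above, assuming a
 * chunk whose map has num_stripes == 2 (e.g. a RAID1 chunk on two devices):
 *
 *   1 (free space inode orphan) +
 *   1 (block group item)        +
 *   1 (free space item)         +
 *   2 (one device extent item per stripe)  = 5 metadata units.
 *
 * A stand-alone sketch of the same computation; map_lookup_sketch is a
 * stripped-down stand-in, not the kernel structure.
 */
#include <stdio.h>

struct map_lookup_sketch {
	int num_stripes;
};

static unsigned int remove_bg_num_items(const struct map_lookup_sketch *map)
{
	return 3 + map->num_stripes;	/* 3 fixed items + N device extents */
}

int main(void)
{
	struct map_lookup_sketch raid1 = { .num_stripes = 2 };

	printf("units to reserve: %u\n", remove_bg_num_items(&raid1)); /* 5 */
	return 0;
}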
10576
-
10577
-/*
10578
- * Process the unused_bgs list and remove any that don't have any allocated
10579
- * space inside of them.
10580
- */
10581
-void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10582
-{
10583
- struct btrfs_block_group_cache *block_group;
10584
- struct btrfs_space_info *space_info;
10585
- struct btrfs_trans_handle *trans;
10586
- int ret = 0;
10587
-
10588
- if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10589
- return;
10590
-
10591
- spin_lock(&fs_info->unused_bgs_lock);
10592
- while (!list_empty(&fs_info->unused_bgs)) {
10593
- u64 start, end;
10594
- int trimming;
10595
-
10596
- block_group = list_first_entry(&fs_info->unused_bgs,
10597
- struct btrfs_block_group_cache,
10598
- bg_list);
10599
- list_del_init(&block_group->bg_list);
10600
-
10601
- space_info = block_group->space_info;
10602
-
10603
- if (ret || btrfs_mixed_space_info(space_info)) {
10604
- btrfs_put_block_group(block_group);
10605
- continue;
10606
- }
10607
- spin_unlock(&fs_info->unused_bgs_lock);
10608
-
10609
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
10610
-
10611
- /* Don't want to race with allocators so take the groups_sem */
10612
- down_write(&space_info->groups_sem);
10613
- spin_lock(&block_group->lock);
10614
- if (block_group->reserved || block_group->pinned ||
10615
- btrfs_block_group_used(&block_group->item) ||
10616
- block_group->ro ||
10617
- list_is_singular(&block_group->list)) {
10618
- /*
10619
- * We want to bail if we made new allocations or have
10620
- * outstanding allocations in this block group. We do
10621
- * the ro check in case balance is currently acting on
10622
- * this block group.
10623
- */
10624
- trace_btrfs_skip_unused_block_group(block_group);
10625
- spin_unlock(&block_group->lock);
10626
- up_write(&space_info->groups_sem);
10627
- goto next;
10628
- }
10629
- spin_unlock(&block_group->lock);
10630
-
10631
- /* We don't want to force the issue, only flip if it's ok. */
10632
- ret = inc_block_group_ro(block_group, 0);
10633
- up_write(&space_info->groups_sem);
10634
- if (ret < 0) {
10635
- ret = 0;
10636
- goto next;
10637
- }
10638
-
10639
- /*
10640
- * Want to do this before we do anything else so we can recover
10641
- * properly if we fail to join the transaction.
10642
- */
10643
- trans = btrfs_start_trans_remove_block_group(fs_info,
10644
- block_group->key.objectid);
10645
- if (IS_ERR(trans)) {
10646
- btrfs_dec_block_group_ro(block_group);
10647
- ret = PTR_ERR(trans);
10648
- goto next;
10649
- }
10650
-
10651
- /*
10652
- * We could have pending pinned extents for this block group,
10653
- * just delete them, we don't care about them anymore.
10654
- */
10655
- start = block_group->key.objectid;
10656
- end = start + block_group->key.offset - 1;
10657
- /*
10658
- * Hold the unused_bg_unpin_mutex lock to avoid racing with
10659
- * btrfs_finish_extent_commit(). If we are at transaction N,
10660
- * another task might be running finish_extent_commit() for the
10661
- * previous transaction N - 1, and have seen a range belonging
10662
- * to the block group in freed_extents[] before we were able to
10663
- * clear the whole block group range from freed_extents[]. This
10664
- * means that task can look up the block group after we
10665
- * unpinned it from freed_extents[] and removed it, leading to
10666
- * a BUG_ON() at btrfs_unpin_extent_range().
10667
- */
10668
- mutex_lock(&fs_info->unused_bg_unpin_mutex);
10669
- ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10670
- EXTENT_DIRTY);
10671
- if (ret) {
10672
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10673
- btrfs_dec_block_group_ro(block_group);
10674
- goto end_trans;
10675
- }
10676
- ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10677
- EXTENT_DIRTY);
10678
- if (ret) {
10679
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10680
- btrfs_dec_block_group_ro(block_group);
10681
- goto end_trans;
10682
- }
10683
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10684
-
10685
- /* Reset pinned so btrfs_put_block_group doesn't complain */
10686
- spin_lock(&space_info->lock);
10687
- spin_lock(&block_group->lock);
10688
-
10689
- space_info->bytes_pinned -= block_group->pinned;
10690
- space_info->bytes_readonly += block_group->pinned;
10691
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
10692
- -block_group->pinned,
10693
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
10694
- block_group->pinned = 0;
10695
-
10696
- spin_unlock(&block_group->lock);
10697
- spin_unlock(&space_info->lock);
10698
-
10699
- /* DISCARD can flip during remount */
10700
- trimming = btrfs_test_opt(fs_info, DISCARD);
10701
-
10702
- /* Implicit trim during transaction commit. */
10703
- if (trimming)
10704
- btrfs_get_block_group_trimming(block_group);
10705
-
10706
- /*
10707
- * Btrfs_remove_chunk will abort the transaction if things go
10708
- * horribly wrong.
10709
- */
10710
- ret = btrfs_remove_chunk(trans, block_group->key.objectid);
10711
-
10712
- if (ret) {
10713
- if (trimming)
10714
- btrfs_put_block_group_trimming(block_group);
10715
- goto end_trans;
10716
- }
10717
-
10718
- /*
10719
- * If we're not mounted with -odiscard, we can just forget
10720
- * about this block group. Otherwise we'll need to wait
10721
- * until transaction commit to do the actual discard.
10722
- */
10723
- if (trimming) {
10724
- spin_lock(&fs_info->unused_bgs_lock);
10725
- /*
10726
- * A concurrent scrub might have added us to the list
10727
- * fs_info->unused_bgs, so use a list_move operation
10728
- * to add the block group to the deleted_bgs list.
10729
- */
10730
- list_move(&block_group->bg_list,
10731
- &trans->transaction->deleted_bgs);
10732
- spin_unlock(&fs_info->unused_bgs_lock);
10733
- btrfs_get_block_group(block_group);
10734
- }
10735
-end_trans:
10736
- btrfs_end_transaction(trans);
10737
-next:
10738
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10739
- btrfs_put_block_group(block_group);
10740
- spin_lock(&fs_info->unused_bgs_lock);
10741
- }
10742
- spin_unlock(&fs_info->unused_bgs_lock);
10743
-}
10744
-
10745
-int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10746
-{
10747
- struct btrfs_super_block *disk_super;
10748
- u64 features;
10749
- u64 flags;
10750
- int mixed = 0;
10751
- int ret;
10752
-
10753
- disk_super = fs_info->super_copy;
10754
- if (!btrfs_super_root(disk_super))
10755
- return -EINVAL;
10756
-
10757
- features = btrfs_super_incompat_flags(disk_super);
10758
- if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10759
- mixed = 1;
10760
-
10761
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
10762
- ret = create_space_info(fs_info, flags);
10763
- if (ret)
10764
- goto out;
10765
-
10766
- if (mixed) {
10767
- flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10768
- ret = create_space_info(fs_info, flags);
10769
- } else {
10770
- flags = BTRFS_BLOCK_GROUP_METADATA;
10771
- ret = create_space_info(fs_info, flags);
10772
- if (ret)
10773
- goto out;
10774
-
10775
- flags = BTRFS_BLOCK_GROUP_DATA;
10776
- ret = create_space_info(fs_info, flags);
10777
- }
10778
-out:
10779
- return ret;
107805689 }
107815690
107825691 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
....@@ -10805,10 +5714,9 @@
108055714 * it while performing the free space search since we have already
108065715 * held back allocations.
108075716 */
10808
-static int btrfs_trim_free_extents(struct btrfs_device *device,
10809
- u64 minlen, u64 *trimmed)
5717
+static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
108105718 {
10811
- u64 start = 0, len = 0;
5719
+ u64 start = SZ_1M, len = 0, end = 0;
108125720 int ret;
108135721
108145722 *trimmed = 0;
....@@ -10817,7 +5725,7 @@
108175725 if (!blk_queue_discard(bdev_get_queue(device->bdev)))
108185726 return 0;
108195727
10820
- /* Not writeable = nothing to do. */
5728
+ /* Not writable = nothing to do. */
108215729 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
108225730 return 0;
108235731
....@@ -10829,43 +5737,54 @@
108295737
108305738 while (1) {
108315739 struct btrfs_fs_info *fs_info = device->fs_info;
10832
- struct btrfs_transaction *trans;
108335740 u64 bytes;
108345741
108355742 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
108365743 if (ret)
108375744 break;
108385745
10839
- ret = down_read_killable(&fs_info->commit_root_sem);
10840
- if (ret) {
5746
+ find_first_clear_extent_bit(&device->alloc_state, start,
5747
+ &start, &end,
5748
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
5749
+
5750
+ /* Check if there are any CHUNK_* bits left */
5751
+ if (start > device->total_bytes) {
5752
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5753
+ btrfs_warn_in_rcu(fs_info,
5754
+"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
5755
+ start, end - start + 1,
5756
+ rcu_str_deref(device->name),
5757
+ device->total_bytes);
108415758 mutex_unlock(&fs_info->chunk_mutex);
5759
+ ret = 0;
108425760 break;
108435761 }
108445762
10845
- spin_lock(&fs_info->trans_lock);
10846
- trans = fs_info->running_transaction;
10847
- if (trans)
10848
- refcount_inc(&trans->use_count);
10849
- spin_unlock(&fs_info->trans_lock);
5763
+ /* Ensure we skip the reserved area in the first 1M */
5764
+ start = max_t(u64, start, SZ_1M);
108505765
10851
- if (!trans)
10852
- up_read(&fs_info->commit_root_sem);
5766
+ /*
5767
+ * If find_first_clear_extent_bit find a range that spans the
5768
+ * end of the device it will set end to -1, in this case it's up
5769
+ * to the caller to trim the value to the size of the device.
5770
+ */
5771
+ end = min(end, device->total_bytes - 1);
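
/*
 * Arithmetic sketch of the trim window computed above, assuming a 10 GiB
 * device where find_first_clear_extent_bit() reported a clear range running
 * off the end of the device (end == (u64)-1).  The start is floored to 1 MiB
 * to protect the reserved area, and end is clamped to the last device byte.
 * Plain C stand-ins, not kernel helpers.
 */
#include <stdint.h>
#include <stdio.h>

#define SZ_1M	(1024ULL * 1024ULL)

int main(void)
{
	uint64_t total_bytes = 10ULL * 1024 * 1024 * 1024;	/* 10 GiB device */
	uint64_t start = 0;					/* reported range start */
	uint64_t end = UINT64_MAX;				/* range spans past the device */

	start = start > SZ_1M ? start : SZ_1M;	/* skip the reserved first 1M */
	end = end < total_bytes - 1 ? end : total_bytes - 1;
	printf("trim %llu bytes at offset %llu\n",
	       (unsigned long long)(end - start + 1),	/* 10 GiB - 1 MiB */
	       (unsigned long long)start);		/* 1 MiB */
	return 0;
}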
108535772
10854
- ret = find_free_dev_extent_start(trans, device, minlen, start,
10855
- &start, &len);
10856
- if (trans) {
10857
- up_read(&fs_info->commit_root_sem);
10858
- btrfs_put_transaction(trans);
10859
- }
5773
+ len = end - start + 1;
108605774
10861
- if (ret) {
5775
+ /* We didn't find any extents */
5776
+ if (!len) {
108625777 mutex_unlock(&fs_info->chunk_mutex);
10863
- if (ret == -ENOSPC)
10864
- ret = 0;
5778
+ ret = 0;
108655779 break;
108665780 }
108675781
10868
- ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
5782
+ ret = btrfs_issue_discard(device->bdev, start, len,
5783
+ &bytes);
5784
+ if (!ret)
5785
+ set_extent_bits(&device->alloc_state, start,
5786
+ start + bytes - 1,
5787
+ CHUNK_TRIMMED);
108695788 mutex_unlock(&fs_info->chunk_mutex);
108705789
108715790 if (ret)
....@@ -10896,10 +5815,11 @@
108965815 */
108975816 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
108985817 {
10899
- struct btrfs_block_group_cache *cache = NULL;
5818
+ struct btrfs_block_group *cache = NULL;
109005819 struct btrfs_device *device;
109015820 struct list_head *devices;
109025821 u64 group_trimmed;
5822
+ u64 range_end = U64_MAX;
109035823 u64 start;
109045824 u64 end;
109055825 u64 trimmed = 0;
....@@ -10909,26 +5829,33 @@
109095829 int dev_ret = 0;
109105830 int ret = 0;
109115831
5832
+ /*
5833
+ * Check range overflow if range->len is set.
5834
+ * The default range->len is U64_MAX.
5835
+ */
5836
+ if (range->len != U64_MAX &&
5837
+ check_add_overflow(range->start, range->len, &range_end))
5838
+ return -EINVAL;
5839
+
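
/*
 * Sketch of the overflow guard above using the GCC/Clang builtin that the
 * kernel's check_add_overflow() wraps: if range->start + range->len wraps
 * around u64, the ioctl is rejected with -EINVAL instead of silently trimming
 * a bogus window.  Userspace stand-in, not the kernel code path.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool trim_range_end(uint64_t start, uint64_t len, uint64_t *end)
{
	if (len != UINT64_MAX && __builtin_add_overflow(start, len, end))
		return false;		/* overflow: caller returns -EINVAL */
	if (len == UINT64_MAX)
		*end = UINT64_MAX;	/* default: trim to the end of the fs */
	return true;
}

int main(void)
{
	uint64_t end;

	printf("%d\n", trim_range_end(1ULL << 40, 4096, &end));	/* 1: valid */
	printf("%d\n", trim_range_end(UINT64_MAX - 10, 4096, &end));	/* 0: wraps */
	return 0;
}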
109125840 cache = btrfs_lookup_first_block_group(fs_info, range->start);
10913
- for (; cache; cache = next_block_group(fs_info, cache)) {
10914
- if (cache->key.objectid >= (range->start + range->len)) {
5841
+ for (; cache; cache = btrfs_next_block_group(cache)) {
5842
+ if (cache->start >= range_end) {
109155843 btrfs_put_block_group(cache);
109165844 break;
109175845 }
109185846
10919
- start = max(range->start, cache->key.objectid);
10920
- end = min(range->start + range->len,
10921
- cache->key.objectid + cache->key.offset);
5847
+ start = max(range->start, cache->start);
5848
+ end = min(range_end, cache->start + cache->length);
109225849
109235850 if (end - start >= range->minlen) {
10924
- if (!block_group_cache_done(cache)) {
10925
- ret = cache_block_group(cache, 0);
5851
+ if (!btrfs_block_group_done(cache)) {
5852
+ ret = btrfs_cache_block_group(cache, 0);
109265853 if (ret) {
109275854 bg_failed++;
109285855 bg_ret = ret;
109295856 continue;
109305857 }
10931
- ret = wait_block_group_cache_done(cache);
5858
+ ret = btrfs_wait_block_group_cache_done(cache);
109325859 if (ret) {
109335860 bg_failed++;
109345861 bg_ret = ret;
....@@ -10957,8 +5884,10 @@
109575884 mutex_lock(&fs_info->fs_devices->device_list_mutex);
109585885 devices = &fs_info->fs_devices->devices;
109595886 list_for_each_entry(device, devices, dev_list) {
10960
- ret = btrfs_trim_free_extents(device, range->minlen,
10961
- &group_trimmed);
5887
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
5888
+ continue;
5889
+
5890
+ ret = btrfs_trim_free_extents(device, &group_trimmed);
109625891 if (ret) {
109635892 dev_failed++;
109645893 dev_ret = ret;
....@@ -10977,61 +5906,4 @@
109775906 if (bg_ret)
109785907 return bg_ret;
109795908 return dev_ret;
10980
-}
10981
-
10982
-/*
10983
- * btrfs_{start,end}_write_no_snapshotting() are similar to
10984
- * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
10985
- * data into the page cache through nocow before the subvolume is snapshotted,
10986
- * but flush the data into disk after the snapshot creation, or to prevent
10987
- * operations while snapshotting is ongoing and that cause the snapshot to be
10988
- * inconsistent (writes followed by expanding truncates for example).
10989
- */
10990
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
10991
-{
10992
- percpu_counter_dec(&root->subv_writers->counter);
10993
- cond_wake_up(&root->subv_writers->wait);
10994
-}
10995
-
10996
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
10997
-{
10998
- if (atomic_read(&root->will_be_snapshotted))
10999
- return 0;
11000
-
11001
- percpu_counter_inc(&root->subv_writers->counter);
11002
- /*
11003
- * Make sure counter is updated before we check for snapshot creation.
11004
- */
11005
- smp_mb();
11006
- if (atomic_read(&root->will_be_snapshotted)) {
11007
- btrfs_end_write_no_snapshotting(root);
11008
- return 0;
11009
- }
11010
- return 1;
11011
-}
11012
-
11013
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11014
-{
11015
- while (true) {
11016
- int ret;
11017
-
11018
- ret = btrfs_start_write_no_snapshotting(root);
11019
- if (ret)
11020
- break;
11021
- wait_var_event(&root->will_be_snapshotted,
11022
- !atomic_read(&root->will_be_snapshotted));
11023
- }
11024
-}
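
/*
 * Userspace sketch of the writer/snapshot handshake the removed helpers above
 * implement: a writer bumps a counter, then re-checks the "snapshot pending"
 * flag; the memory ordering guarantees the snapshot side either sees the
 * writer's increment or the writer sees the flag and backs off.  C11 atomics
 * stand in for the percpu counter and smp_mb(); illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_long nocow_writers;	/* stand-in for root->subv_writers */
static atomic_bool snapshot_pending;	/* stand-in for root->will_be_snapshotted */

/* Returns true if the caller may go ahead with a nocow write. */
static bool start_write_no_snapshotting(void)
{
	if (atomic_load(&snapshot_pending))
		return false;

	atomic_fetch_add(&nocow_writers, 1);
	/* seq_cst ordering here plays the role of the kernel's smp_mb(). */
	if (atomic_load(&snapshot_pending)) {
		atomic_fetch_sub(&nocow_writers, 1);
		return false;
	}
	return true;
}

static void end_write_no_snapshotting(void)
{
	atomic_fetch_sub(&nocow_writers, 1);
	/* The real code also wakes anyone waiting for the counter to drain. */
}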
11025
-
11026
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
11027
-{
11028
- struct btrfs_fs_info *fs_info = bg->fs_info;
11029
-
11030
- spin_lock(&fs_info->unused_bgs_lock);
11031
- if (list_empty(&bg->bg_list)) {
11032
- btrfs_get_block_group(bg);
11033
- trace_btrfs_add_unused_block_group(bg);
11034
- list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
11035
- }
11036
- spin_unlock(&fs_info->unused_bgs_lock);
110375909 }