forked from ~ljy/RK356X_SDK_RELEASE

hc, 2023-12-08, commit 01573e231f18eb2d99162747186f59511f56b64d
kernel/fs/btrfs/qgroup.c
@@ -11,7 +11,7 @@
 #include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/btrfs.h>
-#include <linux/sizes.h>
+#include <linux/sched/mm.h>

 #include "ctree.h"
 #include "transaction.h"
@@ -21,7 +21,8 @@
 #include "backref.h"
 #include "extent_io.h"
 #include "qgroup.h"
-
+#include "block-group.h"
+#include "sysfs.h"

 /* TODO XXX FIXME
  *  - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -30,7 +31,7 @@
  *  - sync
  *  - copy also limits on subvol creation
  *  - limit
- *  - caches fuer ulists
+ *  - caches for ulists
  *  - performance benchmarks
  *  - check all ioctl parameters
  */
@@ -220,7 +221,8 @@
         return qgroup;
 }

-static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
+static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
+                            struct btrfs_qgroup *qgroup)
 {
         struct btrfs_qgroup_list *list;

@@ -240,7 +242,6 @@
                 list_del(&list->next_member);
                 kfree(list);
         }
-        kfree(qgroup);
 }

 /* must be called with qgroup_lock held */
@@ -252,7 +253,7 @@
                 return -ENOENT;

         rb_erase(&qgroup->node, &fs_info->qgroup_tree);
-        __del_qgroup_rb(qgroup);
+        __del_qgroup_rb(fs_info, qgroup);
         return 0;
 }

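Note: __del_qgroup_rb() now only unlinks the relation lists; the kfree() moves to the
callers. The reason is visible further down in this diff: callers must first remove the
qgroup's sysfs entry, which can sleep and needs the object alive. The caller-side pattern,
taken from the btrfs_free_qgroup_config() hunk below:

        while ((n = rb_first(&fs_info->qgroup_tree))) {
                qgroup = rb_entry(n, struct btrfs_qgroup, node);
                rb_erase(n, &fs_info->qgroup_tree);
                __del_qgroup_rb(fs_info, qgroup);            /* unlink relations only */
                btrfs_sysfs_del_one_qgroup(fs_info, qgroup); /* may sleep; needs object */
                kfree(qgroup);                               /* caller frees last */
        }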
@@ -351,6 +352,9 @@
                 goto out;
         }

+        ret = btrfs_sysfs_add_qgroups(fs_info);
+        if (ret < 0)
+                goto out;
         /* default this to quota off, in case no status key is found */
         fs_info->qgroup_flags = 0;

@@ -412,6 +416,10 @@
                                 goto out;
                         }
                 }
+                ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+                if (ret < 0)
+                        goto out;
+
                 switch (found_key.type) {
                 case BTRFS_QGROUP_INFO_KEY: {
                         struct btrfs_qgroup_info_item *ptr;
@@ -500,9 +508,48 @@
                 ulist_free(fs_info->qgroup_ulist);
                 fs_info->qgroup_ulist = NULL;
                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+                btrfs_sysfs_del_qgroups(fs_info);
         }

         return ret < 0 ? ret : 0;
+}
+
+/*
+ * Called in close_ctree() when quota is still enabled.  This verifies we don't
+ * leak some reserved space.
+ *
+ * Return false if no reserved space is left.
+ * Return true if some reserved space is leaked.
+ */
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
+{
+        struct rb_node *node;
+        bool ret = false;
+
+        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+                return ret;
+        /*
+         * Since we're unmounting, there is no race and no need to grab qgroup
+         * lock.  And here we don't go post-order to provide a more user
+         * friendly sorted result.
+         */
+        for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
+                struct btrfs_qgroup *qgroup;
+                int i;
+
+                qgroup = rb_entry(node, struct btrfs_qgroup, node);
+                for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
+                        if (qgroup->rsv.values[i]) {
+                                ret = true;
+                                btrfs_warn(fs_info,
+                "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
+                                           btrfs_qgroup_level(qgroup->qgroupid),
+                                           btrfs_qgroup_subvolid(qgroup->qgroupid),
+                                           i, qgroup->rsv.values[i]);
+                        }
+                }
+        }
+        return ret;
 }

 /*
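Note: btrfs_check_quota_leak() has no caller inside this file. Assuming the upstream
wiring (a companion fs/btrfs/disk-io.c hunk that is not part of this diff), close_ctree()
is expected to use it roughly like this, turning a leak into a loud warning on debug
builds:

        /* Sketch, assumed from upstream; not shown in this diff: */
        if (btrfs_check_quota_leak(fs_info)) {
                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
                btrfs_err(fs_info, "qgroup reserved space leaked");
        }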
@@ -519,15 +566,18 @@
         while ((n = rb_first(&fs_info->qgroup_tree))) {
                 qgroup = rb_entry(n, struct btrfs_qgroup, node);
                 rb_erase(n, &fs_info->qgroup_tree);
-                __del_qgroup_rb(qgroup);
+                __del_qgroup_rb(fs_info, qgroup);
+                btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+                kfree(qgroup);
         }
         /*
-         * we call btrfs_free_qgroup_config() when umounting
+         * We call btrfs_free_qgroup_config() when unmounting
          * filesystem and disabling quota, so we set qgroup_ulist
          * to be null here to avoid double free.
         */
         ulist_free(fs_info->qgroup_ulist);
         fs_info->qgroup_ulist = NULL;
+        btrfs_sysfs_del_qgroups(fs_info);
 }

 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
@@ -887,12 +937,46 @@
         struct btrfs_key found_key;
         struct btrfs_qgroup *qgroup = NULL;
         struct btrfs_trans_handle *trans = NULL;
+        struct ulist *ulist = NULL;
         int ret = 0;
         int slot;
+
+        /*
+         * We need to have subvol_sem write locked, to prevent races between
+         * concurrent tasks trying to enable quotas, because we will unlock
+         * and relock qgroup_ioctl_lock before setting fs_info->quota_root
+         * and before setting BTRFS_FS_QUOTA_ENABLED.
+         */
+        lockdep_assert_held_write(&fs_info->subvol_sem);

         mutex_lock(&fs_info->qgroup_ioctl_lock);
         if (fs_info->quota_root)
                 goto out;
+
+        ulist = ulist_alloc(GFP_KERNEL);
+        if (!ulist) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        ret = btrfs_sysfs_add_qgroups(fs_info);
+        if (ret < 0)
+                goto out;
+
+        /*
+         * Unlock qgroup_ioctl_lock before starting the transaction. This is to
+         * avoid lock acquisition inversion problems (reported by lockdep) between
+         * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
+         * start a transaction.
+         * After we started the transaction lock qgroup_ioctl_lock again and
+         * check if someone else created the quota root in the meanwhile. If so,
+         * just return success and release the transaction handle.
+         *
+         * Also we don't need to worry about someone else calling
+         * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
+         * that function returns 0 (success) when the sysfs entries already exist.
+         */
+        mutex_unlock(&fs_info->qgroup_ioctl_lock);

         /*
          * 1 for quota root item
@@ -903,24 +987,24 @@
          * would be a lot of overkill.
          */
         trans = btrfs_start_transaction(tree_root, 2);
+
+        mutex_lock(&fs_info->qgroup_ioctl_lock);
         if (IS_ERR(trans)) {
                 ret = PTR_ERR(trans);
                 trans = NULL;
                 goto out;
         }

-        fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
-        if (!fs_info->qgroup_ulist) {
-                ret = -ENOMEM;
-                btrfs_abort_transaction(trans, ret);
+        if (fs_info->quota_root)
                 goto out;
-        }
+
+        fs_info->qgroup_ulist = ulist;
+        ulist = NULL;

         /*
          * initially create the quota tree
          */
-        quota_root = btrfs_create_tree(trans, fs_info,
-                                       BTRFS_QUOTA_TREE_OBJECTID);
+        quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
         if (IS_ERR(quota_root)) {
                 ret = PTR_ERR(quota_root);
                 btrfs_abort_transaction(trans, ret);
@@ -976,6 +1060,10 @@
                 btrfs_item_key_to_cpu(leaf, &found_key, slot);

                 if (found_key.type == BTRFS_ROOT_REF_KEY) {
+
+                        /* Release locks on tree_root before we access quota_root */
+                        btrfs_release_path(path);
+
                         ret = add_qgroup_item(trans, quota_root,
                                               found_key.offset);
                         if (ret) {
@@ -988,6 +1076,25 @@
                                 ret = PTR_ERR(qgroup);
                                 btrfs_abort_transaction(trans, ret);
                                 goto out_free_path;
+                        }
+                        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+                        if (ret < 0) {
+                                btrfs_abort_transaction(trans, ret);
+                                goto out_free_path;
+                        }
+                        ret = btrfs_search_slot_for_read(tree_root, &found_key,
+                                                         path, 1, 0);
+                        if (ret < 0) {
+                                btrfs_abort_transaction(trans, ret);
+                                goto out_free_path;
+                        }
+                        if (ret > 0) {
+                                /*
+                                 * Shouldn't happen, but in case it does we
+                                 * don't need to do the btrfs_next_item, just
+                                 * continue.
+                                 */
+                                continue;
                         }
                 }
                 ret = btrfs_next_item(tree_root, path);
@@ -1013,9 +1120,25 @@
                 btrfs_abort_transaction(trans, ret);
                 goto out_free_path;
         }
+        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+        if (ret < 0) {
+                btrfs_abort_transaction(trans, ret);
+                goto out_free_path;
+        }

+        mutex_unlock(&fs_info->qgroup_ioctl_lock);
+        /*
+         * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
+         * a deadlock with tasks concurrently doing other qgroup operations, such
+         * adding/removing qgroups or adding/deleting qgroup relations for example,
+         * because all qgroup operations first start or join a transaction and then
+         * lock the qgroup_ioctl_lock mutex.
+         * We are safe from a concurrent task trying to enable quotas, by calling
+         * this function, since we are serialized by fs_info->subvol_sem.
+         */
         ret = btrfs_commit_transaction(trans);
         trans = NULL;
+        mutex_lock(&fs_info->qgroup_ioctl_lock);
         if (ret)
                 goto out_free_path;

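Note: after this patch the enable path repeatedly drops qgroup_ioctl_lock around any
operation that can take the vfs freeze semaphores. A condensed view of the resulting
locking shape (annotation, not part of the patch):

        lockdep_assert_held_write(&fs_info->subvol_sem); /* caller-side exclusion */

        mutex_lock(&fs_info->qgroup_ioctl_lock);         /* fast checks + allocs  */
        mutex_unlock(&fs_info->qgroup_ioctl_lock);       /* drop before trans     */

        trans = btrfs_start_transaction(tree_root, 2);   /* may take freeze sems  */
        mutex_lock(&fs_info->qgroup_ioctl_lock);         /* re-check quota_root   */
        /* ... build quota tree ... */
        mutex_unlock(&fs_info->qgroup_ioctl_lock);       /* drop before commit    */
        ret = btrfs_commit_transaction(trans);
        mutex_lock(&fs_info->qgroup_ioctl_lock);

Since the mutex is dropped twice, only subvol_sem (held for write by the caller) keeps
two concurrent enables from racing; that is what the new lockdep assertion documents.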
@@ -1035,24 +1158,40 @@
                 fs_info->qgroup_rescan_running = true;
                 btrfs_queue_work(fs_info->qgroup_rescan_workers,
                                  &fs_info->qgroup_rescan_work);
+        } else {
+                /*
+                 * We have set both BTRFS_FS_QUOTA_ENABLED and
+                 * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
+                 * -EINPROGRESS. That can happen because someone started the
+                 * rescan worker by calling quota rescan ioctl before we
+                 * attempted to initialize the rescan worker. Failure due to
+                 * quotas disabled in the meanwhile is not possible, because
+                 * we are holding a write lock on fs_info->subvol_sem, which
+                 * is also acquired when disabling quotas.
+                 * Ignore such error, and any other error would need to undo
+                 * everything we did in the transaction we just committed.
+                 */
+                ASSERT(ret == -EINPROGRESS);
+                ret = 0;
         }

 out_free_path:
         btrfs_free_path(path);
 out_free_root:
-        if (ret) {
-                free_extent_buffer(quota_root->node);
-                free_extent_buffer(quota_root->commit_root);
-                kfree(quota_root);
-        }
+        if (ret)
+                btrfs_put_root(quota_root);
 out:
         if (ret) {
                 ulist_free(fs_info->qgroup_ulist);
                 fs_info->qgroup_ulist = NULL;
-                if (trans)
-                        btrfs_end_transaction(trans);
+                btrfs_sysfs_del_qgroups(fs_info);
         }
         mutex_unlock(&fs_info->qgroup_ioctl_lock);
+        if (ret && trans)
+                btrfs_end_transaction(trans);
+        else if (trans)
+                ret = btrfs_end_transaction(trans);
+        ulist_free(ulist);
         return ret;
 }

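Note: two things change in the exit path. First, the open-coded extent-buffer freeing is
replaced by btrfs_put_root(), reflecting that roots are reference counted in this kernel
(the same substitution appears in the disable path below). Second, the transaction is now
ended only after the mutex is dropped, using a pattern that keeps the first error but
still propagates a failing btrfs_end_transaction() on the success path:

        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        if (ret && trans)
                btrfs_end_transaction(trans);       /* error: keep original ret  */
        else if (trans)
                ret = btrfs_end_transaction(trans); /* success: propagate status */
        ulist_free(ulist);                          /* NULL if ownership moved   */
        return ret;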
@@ -1062,24 +1201,55 @@
         struct btrfs_trans_handle *trans = NULL;
         int ret = 0;

+        /*
+         * We need to have subvol_sem write locked, to prevent races between
+         * concurrent tasks trying to disable quotas, because we will unlock
+         * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+         */
+        lockdep_assert_held_write(&fs_info->subvol_sem);
+
         mutex_lock(&fs_info->qgroup_ioctl_lock);
         if (!fs_info->quota_root)
                 goto out;
+
+        /*
+         * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+         * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+         * to lock that mutex while holding a transaction handle and the rescan
+         * worker needs to commit a transaction.
+         */
+        mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+        /*
+         * Request qgroup rescan worker to complete and wait for it. This wait
+         * must be done before transaction start for quota disable since it may
+         * deadlock with transaction by the qgroup rescan worker.
+         */
+        clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+        btrfs_qgroup_wait_for_completion(fs_info, false);

         /*
          * 1 For the root item
          *
          * We should also reserve enough items for the quota tree deletion in
          * btrfs_clean_quota_tree but this is not done.
+         *
+         * Also, we must always start a transaction without holding the mutex
+         * qgroup_ioctl_lock, see btrfs_quota_enable().
          */
         trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+        mutex_lock(&fs_info->qgroup_ioctl_lock);
         if (IS_ERR(trans)) {
                 ret = PTR_ERR(trans);
+                trans = NULL;
+                set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
                 goto out;
         }

-        clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
-        btrfs_qgroup_wait_for_completion(fs_info, false);
+        if (!fs_info->quota_root)
+                goto out;
+
         spin_lock(&fs_info->qgroup_lock);
         quota_root = fs_info->quota_root;
         fs_info->quota_root = NULL;
@@ -1091,30 +1261,31 @@
         ret = btrfs_clean_quota_tree(trans, quota_root);
         if (ret) {
                 btrfs_abort_transaction(trans, ret);
-                goto end_trans;
+                goto out;
         }

         ret = btrfs_del_root(trans, &quota_root->root_key);
         if (ret) {
                 btrfs_abort_transaction(trans, ret);
-                goto end_trans;
+                goto out;
         }

         list_del(&quota_root->dirty_list);

         btrfs_tree_lock(quota_root->node);
-        clean_tree_block(fs_info, quota_root->node);
+        btrfs_clean_tree_block(quota_root->node);
         btrfs_tree_unlock(quota_root->node);
         btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);

-        free_extent_buffer(quota_root->node);
-        free_extent_buffer(quota_root->commit_root);
-        kfree(quota_root);
+        btrfs_put_root(quota_root);

-end_trans:
-        ret = btrfs_end_transaction(trans);
 out:
         mutex_unlock(&fs_info->qgroup_ioctl_lock);
+        if (ret && trans)
+                btrfs_end_transaction(trans);
+        else if (trans)
+                ret = btrfs_end_transaction(trans);
+
         return ret;
 }

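Note: the disable path now stops the rescan worker before opening a transaction, because
the worker itself commits transactions (and rescan_should_stop(), further down, learns to
watch BTRFS_FS_QUOTA_ENABLED). The ordering that matters, condensed (annotation):

        mutex_unlock(&fs_info->qgroup_ioctl_lock);          /* avoid ABBA w/ worker */
        clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); /* worker sees "stop"   */
        btrfs_qgroup_wait_for_completion(fs_info, false);   /* wait, no trans held  */

        trans = btrfs_start_transaction(fs_info->tree_root, 1);
        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (IS_ERR(trans)) {
                set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); /* roll back flag */
                goto out;
        }

Rolling the flag back on transaction failure is essential; otherwise quotas would appear
disabled while the quota root still exists on disk.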
@@ -1129,7 +1300,7 @@
  * The easy accounting, we're updating qgroup relationship whose child qgroup
  * only has exclusive extents.
  *
- * In this case, all exclsuive extents will also be exlusive for parent, so
+ * In this case, all exclusive extents will also be exclusive for parent, so
  * excl/rfer just get added/removed.
  *
  * So is qgroup reservation space, which should also be added/removed to
@@ -1246,25 +1417,27 @@
                       u64 dst)
 {
         struct btrfs_fs_info *fs_info = trans->fs_info;
-        struct btrfs_root *quota_root;
         struct btrfs_qgroup *parent;
         struct btrfs_qgroup *member;
         struct btrfs_qgroup_list *list;
         struct ulist *tmp;
+        unsigned int nofs_flag;
         int ret = 0;

         /* Check the level of src and dst first */
         if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
                 return -EINVAL;

+        /* We hold a transaction handle open, must do a NOFS allocation. */
+        nofs_flag = memalloc_nofs_save();
         tmp = ulist_alloc(GFP_KERNEL);
+        memalloc_nofs_restore(nofs_flag);
         if (!tmp)
                 return -ENOMEM;

         mutex_lock(&fs_info->qgroup_ioctl_lock);
-        quota_root = fs_info->quota_root;
-        if (!quota_root) {
-                ret = -EINVAL;
+        if (!fs_info->quota_root) {
+                ret = -ENOTCONN;
                 goto out;
         }
         member = find_qgroup_rb(fs_info, src);
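Note: the memalloc_nofs_save()/memalloc_nofs_restore() pair is why <linux/sched/mm.h>
was added at the top of this diff. Inside the saved scope, GFP_KERNEL allocations are
implicitly degraded to GFP_NOFS, so the allocator cannot recurse into the filesystem
while the caller holds a transaction handle. The pattern in isolation:

        #include <linux/sched/mm.h>

        unsigned int nofs_flag;

        nofs_flag = memalloc_nofs_save();     /* open NOFS scope             */
        tmp = ulist_alloc(GFP_KERNEL);        /* behaves as GFP_NOFS here    */
        memalloc_nofs_restore(nofs_flag);     /* close scope as soon as done */

This scoped form is preferred over passing GFP_NOFS directly because callees keep their
normal GFP_KERNEL signatures.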
@@ -1310,48 +1483,62 @@
                       u64 dst)
 {
         struct btrfs_fs_info *fs_info = trans->fs_info;
-        struct btrfs_root *quota_root;
         struct btrfs_qgroup *parent;
         struct btrfs_qgroup *member;
         struct btrfs_qgroup_list *list;
         struct ulist *tmp;
+        bool found = false;
+        unsigned int nofs_flag;
         int ret = 0;
-        int err;
+        int ret2;

+        /* We hold a transaction handle open, must do a NOFS allocation. */
+        nofs_flag = memalloc_nofs_save();
         tmp = ulist_alloc(GFP_KERNEL);
+        memalloc_nofs_restore(nofs_flag);
         if (!tmp)
                 return -ENOMEM;

-        quota_root = fs_info->quota_root;
-        if (!quota_root) {
-                ret = -EINVAL;
+        if (!fs_info->quota_root) {
+                ret = -ENOTCONN;
                 goto out;
         }

         member = find_qgroup_rb(fs_info, src);
         parent = find_qgroup_rb(fs_info, dst);
-        if (!member || !parent) {
-                ret = -EINVAL;
-                goto out;
-        }
+        /*
+         * The parent/member pair doesn't exist, then try to delete the dead
+         * relation items only.
+         */
+        if (!member || !parent)
+                goto delete_item;

         /* check if such qgroup relation exist firstly */
         list_for_each_entry(list, &member->groups, next_group) {
-                if (list->group == parent)
-                        goto exist;
+                if (list->group == parent) {
+                        found = true;
+                        break;
+                }
         }
-        ret = -ENOENT;
-        goto out;
-exist:
-        ret = del_qgroup_relation_item(trans, src, dst);
-        err = del_qgroup_relation_item(trans, dst, src);
-        if (err && !ret)
-                ret = err;

-        spin_lock(&fs_info->qgroup_lock);
-        del_relation_rb(fs_info, src, dst);
-        ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
-        spin_unlock(&fs_info->qgroup_lock);
+delete_item:
+        ret = del_qgroup_relation_item(trans, src, dst);
+        if (ret < 0 && ret != -ENOENT)
+                goto out;
+        ret2 = del_qgroup_relation_item(trans, dst, src);
+        if (ret2 < 0 && ret2 != -ENOENT)
+                goto out;
+
+        /* At least one deletion succeeded, return 0 */
+        if (!ret || !ret2)
+                ret = 0;
+
+        if (found) {
+                spin_lock(&fs_info->qgroup_lock);
+                del_relation_rb(fs_info, src, dst);
+                ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+                spin_unlock(&fs_info->qgroup_lock);
+        }
 out:
         ulist_free(tmp);
         return ret;
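Note: besides making relation deletion tolerant of half-dead relations (either direction
of the on-disk item pair may already be missing), this series of hunks changes the
"quotas not enabled" error from -EINVAL to -ENOTCONN across all qgroup ioctl entry
points. Userspace that distinguishes the two should check both after this kernel. A
hedged userspace sketch, assuming the uapi names from linux/btrfs.h:

        #include <errno.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <linux/btrfs.h>   /* BTRFS_IOC_QGROUP_CREATE, assumed uapi header */

        struct btrfs_ioctl_qgroup_create_args args = {
                .create   = 1,
                .qgroupid = 257,   /* hypothetical qgroup 0/257 */
        };

        if (ioctl(fd, BTRFS_IOC_QGROUP_CREATE, &args) < 0) {
                /* older kernels returned EINVAL here */
                if (errno == ENOTCONN || errno == EINVAL)
                        fprintf(stderr, "quotas not enabled\n");
        }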
@@ -1378,11 +1565,11 @@
         int ret = 0;

         mutex_lock(&fs_info->qgroup_ioctl_lock);
-        quota_root = fs_info->quota_root;
-        if (!quota_root) {
-                ret = -EINVAL;
+        if (!fs_info->quota_root) {
+                ret = -ENOTCONN;
                 goto out;
         }
+        quota_root = fs_info->quota_root;
         qgroup = find_qgroup_rb(fs_info, qgroupid);
         if (qgroup) {
                 ret = -EEXIST;
@@ -1397,8 +1584,11 @@
         qgroup = add_qgroup_rb(fs_info, qgroupid);
         spin_unlock(&fs_info->qgroup_lock);

-        if (IS_ERR(qgroup))
+        if (IS_ERR(qgroup)) {
                 ret = PTR_ERR(qgroup);
+                goto out;
+        }
+        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
 out:
         mutex_unlock(&fs_info->qgroup_ioctl_lock);
         return ret;
@@ -1407,15 +1597,13 @@
 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
 {
         struct btrfs_fs_info *fs_info = trans->fs_info;
-        struct btrfs_root *quota_root;
         struct btrfs_qgroup *qgroup;
         struct btrfs_qgroup_list *list;
         int ret = 0;

         mutex_lock(&fs_info->qgroup_ioctl_lock);
-        quota_root = fs_info->quota_root;
-        if (!quota_root) {
-                ret = -EINVAL;
+        if (!fs_info->quota_root) {
+                ret = -ENOTCONN;
                 goto out;
         }

@@ -1423,13 +1611,14 @@
         if (!qgroup) {
                 ret = -ENOENT;
                 goto out;
-        } else {
-                /* check if there are no children of this qgroup */
-                if (!list_empty(&qgroup->members)) {
-                        ret = -EBUSY;
-                        goto out;
-                }
         }
+
+        /* Check if there are no children of this qgroup */
+        if (!list_empty(&qgroup->members)) {
+                ret = -EBUSY;
+                goto out;
+        }
+
         ret = del_qgroup_item(trans, qgroupid);
         if (ret && ret != -ENOENT)
                 goto out;
@@ -1446,6 +1635,14 @@
         spin_lock(&fs_info->qgroup_lock);
         del_qgroup_rb(fs_info, qgroupid);
         spin_unlock(&fs_info->qgroup_lock);
+
+        /*
+         * Remove the qgroup from sysfs now without holding the qgroup_lock
+         * spinlock, since the sysfs_remove_group() function needs to take
+         * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
+         */
+        btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+        kfree(qgroup);
 out:
         mutex_unlock(&fs_info->qgroup_ioctl_lock);
         return ret;
@@ -1455,7 +1652,6 @@
                        struct btrfs_qgroup_limit *limit)
 {
         struct btrfs_fs_info *fs_info = trans->fs_info;
-        struct btrfs_root *quota_root;
         struct btrfs_qgroup *qgroup;
         int ret = 0;
         /* Sometimes we would want to clear the limit on this qgroup.
@@ -1465,9 +1661,8 @@
         const u64 CLEAR_VALUE = -1;

         mutex_lock(&fs_info->qgroup_ioctl_lock);
-        quota_root = fs_info->quota_root;
-        if (!quota_root) {
-                ret = -EINVAL;
+        if (!fs_info->quota_root) {
+                ret = -ENOTCONN;
                 goto out;
         }

@@ -1546,12 +1741,18 @@
                 parent_node = *p;
                 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
                                  node);
-                if (bytenr < entry->bytenr)
+                if (bytenr < entry->bytenr) {
                         p = &(*p)->rb_left;
-                else if (bytenr > entry->bytenr)
+                } else if (bytenr > entry->bytenr) {
                         p = &(*p)->rb_right;
-                else
+                } else {
+                        if (record->data_rsv && !entry->data_rsv) {
+                                entry->data_rsv = record->data_rsv;
+                                entry->data_rsv_refroot =
+                                        record->data_rsv_refroot;
+                        }
                         return 1;
+                }
         }

         rb_link_node(&record->node, parent_node, p);
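Note: when a duplicate extent record (same bytenr) is inserted, the existing rbtree node
now inherits the newcomer's reserved-data bookkeeping instead of silently dropping it;
the matching kzalloc() change in the next hunk guarantees these fields start zeroed. For
orientation, the fields involved (declared in qgroup.h upstream; field order here is
approximate):

        struct btrfs_qgroup_extent_record {
                struct rb_node node;
                u64 bytenr;
                u64 num_bytes;
                u64 data_rsv;          /* reserved data bytes to free when accounted */
                u64 data_rsv_refroot;  /* root the reservation was charged against   */
                struct ulist *old_roots;
        };

The accounting loop further down frees data_rsv via btrfs_qgroup_free_refroot(), so
losing it on a duplicate insert would leak reserved space.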
@@ -1597,7 +1798,7 @@
         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
             || bytenr == 0 || num_bytes == 0)
                 return 0;
-        record = kmalloc(sizeof(*record), gfp_flag);
+        record = kzalloc(sizeof(*record), gfp_flag);
         if (!record)
                 return -ENOMEM;

@@ -1719,6 +1920,357 @@
         return 0;
 }

+/*
+ * Helper function to trace a subtree tree block swap.
+ *
+ * The swap will happen in highest tree block, but there may be a lot of
+ * tree blocks involved.
+ *
+ * For example:
+ *  OO = Old tree blocks
+ *  NN = New tree blocks allocated during balance
+ *
+ *           File tree (257)                  Reloc tree for 257
+ * L2              OO                                NN
+ *               /    \                            /    \
+ * L1          OO      OO (a)                    OO      NN (a)
+ *            / \     / \                       / \     / \
+ * L0       OO   OO OO   OO                   OO   OO NN   NN
+ *                  (b)  (c)                          (b)  (c)
+ *
+ * When calling qgroup_trace_extent_swap(), we will pass:
+ * @src_eb = OO(a)
+ * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
+ * @dst_level = 0
+ * @root_level = 1
+ *
+ * In that case, qgroup_trace_extent_swap() will search from OO(a) to
+ * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
+ *
+ * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
+ *
+ * 1) Tree search from @src_eb
+ *    It should acts as a simplified btrfs_search_slot().
+ *    The key for search can be extracted from @dst_path->nodes[dst_level]
+ *    (first key).
+ *
+ * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
+ *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
+ *    They should be marked during previous (@dst_level = 1) iteration.
+ *
+ * 3) Mark file extents in leaves dirty
+ *    We don't have good way to pick out new file extents only.
+ *    So we still follow the old method by scanning all file extents in
+ *    the leave.
+ *
+ * This function can free us from keeping two paths, thus later we only need
+ * to care about how to iterate all new tree blocks in reloc tree.
+ */
+static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
+                                    struct extent_buffer *src_eb,
+                                    struct btrfs_path *dst_path,
+                                    int dst_level, int root_level,
+                                    bool trace_leaf)
+{
+        struct btrfs_key key;
+        struct btrfs_path *src_path;
+        struct btrfs_fs_info *fs_info = trans->fs_info;
+        u32 nodesize = fs_info->nodesize;
+        int cur_level = root_level;
+        int ret;
+
+        BUG_ON(dst_level > root_level);
+        /* Level mismatch */
+        if (btrfs_header_level(src_eb) != root_level)
+                return -EINVAL;
+
+        src_path = btrfs_alloc_path();
+        if (!src_path) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        if (dst_level)
+                btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+        else
+                btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+
+        /* For src_path */
+        atomic_inc(&src_eb->refs);
+        src_path->nodes[root_level] = src_eb;
+        src_path->slots[root_level] = dst_path->slots[root_level];
+        src_path->locks[root_level] = 0;
+
+        /* A simplified version of btrfs_search_slot() */
+        while (cur_level >= dst_level) {
+                struct btrfs_key src_key;
+                struct btrfs_key dst_key;
+
+                if (src_path->nodes[cur_level] == NULL) {
+                        struct btrfs_key first_key;
+                        struct extent_buffer *eb;
+                        int parent_slot;
+                        u64 child_gen;
+                        u64 child_bytenr;
+
+                        eb = src_path->nodes[cur_level + 1];
+                        parent_slot = src_path->slots[cur_level + 1];
+                        child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+                        child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+                        btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+                        eb = read_tree_block(fs_info, child_bytenr, child_gen,
+                                             cur_level, &first_key);
+                        if (IS_ERR(eb)) {
+                                ret = PTR_ERR(eb);
+                                goto out;
+                        } else if (!extent_buffer_uptodate(eb)) {
+                                free_extent_buffer(eb);
+                                ret = -EIO;
+                                goto out;
+                        }
+
+                        src_path->nodes[cur_level] = eb;
+
+                        btrfs_tree_read_lock(eb);
+                        btrfs_set_lock_blocking_read(eb);
+                        src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+                }
+
+                src_path->slots[cur_level] = dst_path->slots[cur_level];
+                if (cur_level) {
+                        btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
+                                        &dst_key, dst_path->slots[cur_level]);
+                        btrfs_node_key_to_cpu(src_path->nodes[cur_level],
+                                        &src_key, src_path->slots[cur_level]);
+                } else {
+                        btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
+                                        &dst_key, dst_path->slots[cur_level]);
+                        btrfs_item_key_to_cpu(src_path->nodes[cur_level],
+                                        &src_key, src_path->slots[cur_level]);
+                }
+                /* Content mismatch, something went wrong */
+                if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
+                        ret = -ENOENT;
+                        goto out;
+                }
+                cur_level--;
+        }
+
+        /*
+         * Now both @dst_path and @src_path have been populated, record the tree
+         * blocks for qgroup accounting.
+         */
+        ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
+                                        nodesize, GFP_NOFS);
+        if (ret < 0)
+                goto out;
+        ret = btrfs_qgroup_trace_extent(trans,
+                                        dst_path->nodes[dst_level]->start,
+                                        nodesize, GFP_NOFS);
+        if (ret < 0)
+                goto out;
+
+        /* Record leaf file extents */
+        if (dst_level == 0 && trace_leaf) {
+                ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
+                if (ret < 0)
+                        goto out;
+                ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
+        }
+out:
+        btrfs_free_path(src_path);
+        return ret;
+}
+
+/*
+ * Helper function to do recursive generation-aware depth-first search, to
+ * locate all new tree blocks in a subtree of reloc tree.
+ *
+ * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
+ *         reloc tree
+ * L2         NN (a)
+ *          /    \
+ * L1    OO      NN (b)
+ *      /  \    /  \
+ * L0  OO  OO OO  NN
+ *                (c) (d)
+ * If we pass:
+ * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
+ * @cur_level = 1
+ * @root_level = 1
+ *
+ * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace
+ * above tree blocks along with their counter parts in file tree.
+ * While during search, old tree blocks OO(c) will be skipped as tree block swap
+ * won't affect OO(c).
+ */
+static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
+                                           struct extent_buffer *src_eb,
+                                           struct btrfs_path *dst_path,
+                                           int cur_level, int root_level,
+                                           u64 last_snapshot, bool trace_leaf)
+{
+        struct btrfs_fs_info *fs_info = trans->fs_info;
+        struct extent_buffer *eb;
+        bool need_cleanup = false;
+        int ret = 0;
+        int i;
+
+        /* Level sanity check */
+        if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
+            root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
+            root_level < cur_level) {
+                btrfs_err_rl(fs_info,
+                        "%s: bad levels, cur_level=%d root_level=%d",
+                        __func__, cur_level, root_level);
+                return -EUCLEAN;
+        }
+
+        /* Read the tree block if needed */
+        if (dst_path->nodes[cur_level] == NULL) {
+                struct btrfs_key first_key;
+                int parent_slot;
+                u64 child_gen;
+                u64 child_bytenr;
+
+                /*
+                 * dst_path->nodes[root_level] must be initialized before
+                 * calling this function.
+                 */
+                if (cur_level == root_level) {
+                        btrfs_err_rl(fs_info,
+        "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
+                                __func__, root_level, root_level, cur_level);
+                        return -EUCLEAN;
+                }
+
+                /*
+                 * We need to get child blockptr/gen from parent before we can
+                 * read it.
+                 */
+                eb = dst_path->nodes[cur_level + 1];
+                parent_slot = dst_path->slots[cur_level + 1];
+                child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+                child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+                btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+                /* This node is old, no need to trace */
+                if (child_gen < last_snapshot)
+                        goto out;
+
+                eb = read_tree_block(fs_info, child_bytenr, child_gen,
+                                     cur_level, &first_key);
+                if (IS_ERR(eb)) {
+                        ret = PTR_ERR(eb);
+                        goto out;
+                } else if (!extent_buffer_uptodate(eb)) {
+                        free_extent_buffer(eb);
+                        ret = -EIO;
+                        goto out;
+                }
+
+                dst_path->nodes[cur_level] = eb;
+                dst_path->slots[cur_level] = 0;
+
+                btrfs_tree_read_lock(eb);
+                btrfs_set_lock_blocking_read(eb);
+                dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+                need_cleanup = true;
+        }
+
+        /* Now record this tree block and its counter part for qgroups */
+        ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
+                                       root_level, trace_leaf);
+        if (ret < 0)
+                goto cleanup;
+
+        eb = dst_path->nodes[cur_level];
+
+        if (cur_level > 0) {
+                /* Iterate all child tree blocks */
+                for (i = 0; i < btrfs_header_nritems(eb); i++) {
+                        /* Skip old tree blocks as they won't be swapped */
+                        if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
+                                continue;
+                        dst_path->slots[cur_level] = i;
+
+                        /* Recursive call (at most 7 times) */
+                        ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
+                                        dst_path, cur_level - 1, root_level,
+                                        last_snapshot, trace_leaf);
+                        if (ret < 0)
+                                goto cleanup;
+                }
+        }
+
+cleanup:
+        if (need_cleanup) {
+                /* Clean up */
+                btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
+                                     dst_path->locks[cur_level]);
+                free_extent_buffer(dst_path->nodes[cur_level]);
+                dst_path->nodes[cur_level] = NULL;
+                dst_path->slots[cur_level] = 0;
+                dst_path->locks[cur_level] = 0;
+        }
+out:
+        return ret;
+}
+
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+                                struct extent_buffer *src_eb,
+                                struct extent_buffer *dst_eb,
+                                u64 last_snapshot, bool trace_leaf)
+{
+        struct btrfs_fs_info *fs_info = trans->fs_info;
+        struct btrfs_path *dst_path = NULL;
+        int level;
+        int ret;
+
+        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+                return 0;
+
+        /* Wrong parameter order */
+        if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
+                btrfs_err_rl(fs_info,
+                "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
+                             btrfs_header_generation(src_eb),
+                             btrfs_header_generation(dst_eb));
+                return -EUCLEAN;
+        }
+
+        if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+                ret = -EIO;
+                goto out;
+        }
+
+        level = btrfs_header_level(dst_eb);
+        dst_path = btrfs_alloc_path();
+        if (!dst_path) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        /* For dst_path */
+        atomic_inc(&dst_eb->refs);
+        dst_path->nodes[level] = dst_eb;
+        dst_path->slots[level] = 0;
+        dst_path->locks[level] = 0;
+
+        /* Do the generation aware breadth-first search */
+        ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
+                                              level, last_snapshot, trace_leaf);
+        if (ret < 0)
+                goto out;
+        ret = 0;
+
+out:
+        btrfs_free_path(dst_path);
+        if (ret < 0)
+                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+        return ret;
+}
+
 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
                                struct extent_buffer *root_eb,
                                u64 root_gen, int root_level)
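Note: qgroup_trace_subtree_swap() is static and has no caller in this hunk; in the full
patch it is driven from the relocation code (upstream: btrfs_qgroup_trace_subtree_after_cow(),
names assumed, not shown here). The call shape follows directly from the signature and
the generation check above, where dst must be the newer block:

        /* Sketch: trace every tree block swapped since the last snapshot. */
        ret = qgroup_trace_subtree_swap(trans,
                                        subvol_parent_eb, /* src: file tree block  */
                                        reloc_parent_eb,  /* dst: newer reloc block */
                                        last_snapshot,    /* gen cutoff for "new"   */
                                        trace_leaf);

Recursion depth is bounded by tree height, hence the "at most 7 times" comment: cur_level
walks from root_level down to 0 and BTRFS_MAX_LEVEL is 8.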
@@ -1759,7 +2311,7 @@
          * walk back up the tree (adjusting slot pointers as we go)
          * and restart the search process.
          */
-        extent_buffer_get(root_eb); /* For path */
+        atomic_inc(&root_eb->refs); /* For path */
         path->nodes[root_level] = root_eb;
         path->slots[root_level] = 0;
         path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
@@ -1797,7 +2349,7 @@
                         path->slots[level] = 0;

                         btrfs_tree_read_lock(eb);
-                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+                        btrfs_set_lock_blocking_read(eb);
                         path->locks[level] = BTRFS_READ_LOCK_BLOCKING;

                         ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
@@ -1894,7 +2446,7 @@
  * Update qgroup rfer/excl counters.
  * Rfer update is easy, codes can explain themselves.
  *
- * Excl update is tricky, the update is split into 2 part.
+ * Excl update is tricky, the update is split into 2 parts.
  * Part 1: Possible exclusive <-> sharing detect:
  *        |       A       |      !A       |
  *  -------------------------------------
@@ -2143,6 +2695,7 @@
         struct btrfs_delayed_ref_root *delayed_refs;
         struct ulist *new_roots = NULL;
         struct rb_node *node;
+        u64 num_dirty_extents = 0;
         u64 qgroup_to_skip;
         int ret = 0;

@@ -2152,6 +2705,7 @@
                 record = rb_entry(node, struct btrfs_qgroup_extent_record,
                                   node);

+                num_dirty_extents++;
                 trace_btrfs_qgroup_account_extents(fs_info, record);

                 if (!ret) {
@@ -2168,6 +2722,11 @@
                                 goto cleanup;
                 }

+                /* Free the reserved data space */
+                btrfs_qgroup_free_refroot(fs_info,
+                                record->data_rsv_refroot,
+                                record->data_rsv,
+                                BTRFS_QGROUP_RSV_DATA);
                 /*
                  * Use SEQ_LAST as time_seq to do special search, which
                  * doesn't lock tree or delayed_refs and search current
@@ -2197,6 +2756,8 @@
                 kfree(record);

         }
+        trace_qgroup_num_dirty_extents(fs_info, trans->transid,
+                                       num_dirty_extents);
         return ret;
 }

@@ -2206,10 +2767,9 @@
 int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
 {
         struct btrfs_fs_info *fs_info = trans->fs_info;
-        struct btrfs_root *quota_root = fs_info->quota_root;
         int ret = 0;

-        if (!quota_root)
+        if (!fs_info->quota_root)
                 return ret;

         spin_lock(&fs_info->qgroup_lock);
@@ -2353,14 +2913,7 @@
                 dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
                 dstgroup->rsv_excl = inherit->lim.rsv_excl;

-                ret = update_qgroup_limit_item(trans, dstgroup);
-                if (ret) {
-                        fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
-                        btrfs_info(fs_info,
-                                   "unable to update quota limit for %llu",
-                                   dstgroup->qgroupid);
-                        goto unlock;
-                }
+                qgroup_dirty(fs_info, dstgroup);
         }

         if (srcid) {
@@ -2455,6 +3008,8 @@

 unlock:
         spin_unlock(&fs_info->qgroup_lock);
+        if (!ret)
+                ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
 out:
         if (!committing)
                 mutex_unlock(&fs_info->qgroup_ioctl_lock);
@@ -2463,20 +3018,8 @@
         return ret;
 }

-/*
- * Two limits to commit transaction in advance.
- *
- * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
- * For SIZE, it will be in byte unit as threshold.
- */
-#define QGROUP_FREE_RATIO               32
-#define QGROUP_FREE_SIZE                SZ_32M
-static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
-                                const struct btrfs_qgroup *qg, u64 num_bytes)
+static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
 {
-        u64 free;
-        u64 threshold;
-
         if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
             qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
                 return false;
@@ -2485,39 +3028,12 @@
             qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
                 return false;

-        /*
-         * Even if we passed the check, it's better to check if reservation
-         * for meta_pertrans is pushing us near limit.
-         * If there is too much pertrans reservation or it's near the limit,
-         * let's try commit transaction to free some, using transaction_kthread
-         */
-        if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
-                              BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
-                if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
-                        free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
-                        threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
-                                          QGROUP_FREE_SIZE);
-                } else {
-                        free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
-                        threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
-                                          QGROUP_FREE_SIZE);
-                }
-
-                /*
-                 * Use transaction_kthread to commit transaction, so we no
-                 * longer need to bother nested transaction nor lock context.
-                 */
-                if (free < threshold)
-                        btrfs_commit_transaction_locksafe(fs_info);
-        }
-
         return true;
 }

 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
                           enum btrfs_qgroup_rsv_type type)
 {
-        struct btrfs_root *quota_root;
         struct btrfs_qgroup *qgroup;
         struct btrfs_fs_info *fs_info = root->fs_info;
         u64 ref_root = root->root_key.objectid;
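Note: the "commit early when near the limit" heuristic (QGROUP_FREE_RATIO / SZ_32M, the
only user of <linux/sizes.h>, which is why that include was dropped at the top of this
diff) is removed in favor of the explicit try_flush_qgroup() retry introduced later in
this patch. The limit check becomes a pure predicate with no side effects, the exact
result of applying the hunk above:

        static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
        {
                if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
                    qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
                        return false;
                if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
                    qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
                        return false;
                return true;
        }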
@@ -2536,8 +3052,7 @@
                 enforce = false;

         spin_lock(&fs_info->qgroup_lock);
-        quota_root = fs_info->quota_root;
-        if (!quota_root)
+        if (!fs_info->quota_root)
                 goto out;

         qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -2560,7 +3075,7 @@

                 qg = unode_aux_to_qgroup(unode);

-                if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
+                if (enforce && !qgroup_check_limits(qg, num_bytes)) {
                         ret = -EDQUOT;
                         goto out;
                 }
@@ -2583,7 +3098,6 @@

                 qg = unode_aux_to_qgroup(unode);

-                trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
                 qgroup_rsv_add(fs_info, qg, num_bytes, type);
         }

@@ -2605,7 +3119,6 @@
                                u64 ref_root, u64 num_bytes,
                                enum btrfs_qgroup_rsv_type type)
 {
-        struct btrfs_root *quota_root;
         struct btrfs_qgroup *qgroup;
         struct ulist_node *unode;
         struct ulist_iterator uiter;
@@ -2623,8 +3136,7 @@
         }
         spin_lock(&fs_info->qgroup_lock);

-        quota_root = fs_info->quota_root;
-        if (!quota_root)
+        if (!fs_info->quota_root)
                 goto out;

         qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -2650,7 +3162,6 @@

                 qg = unode_aux_to_qgroup(unode);

-                trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
                 qgroup_rsv_release(fs_info, qg, num_bytes, type);

                 list_for_each_entry(glist, &qg->groups, next_group) {
@@ -2734,9 +3245,6 @@
                 mutex_unlock(&fs_info->qgroup_rescan_lock);
                 goto out;
         }
-        extent_buffer_get(scratch_leaf);
-        btrfs_tree_read_lock(scratch_leaf);
-        btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
         slot = path->slots[0];
         btrfs_release_path(path);
         mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2762,10 +3270,8 @@
                         goto out;
         }
 out:
-        if (scratch_leaf) {
-                btrfs_tree_read_unlock_blocking(scratch_leaf);
+        if (scratch_leaf)
                 free_extent_buffer(scratch_leaf);
-        }

         if (done && !ret) {
                 ret = 1;
@@ -2777,7 +3283,8 @@
 static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
 {
         return btrfs_fs_closing(fs_info) ||
-                test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+                test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
+                !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 }

 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -2807,11 +3314,9 @@
                         err = PTR_ERR(trans);
                         break;
                 }
-                if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
-                        err = -EINTR;
-                } else {
-                        err = qgroup_rescan_leaf(trans, path);
-                }
+
+                err = qgroup_rescan_leaf(trans, path);
+
                 if (err > 0)
                         btrfs_commit_transaction(trans);
                 else
@@ -2825,7 +3330,7 @@
         if (err > 0 &&
             fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
-        } else if (err < 0) {
+        } else if (err < 0 || stopped) {
                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
         }
         mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2902,7 +3407,6 @@
         }

         mutex_lock(&fs_info->qgroup_rescan_lock);
-        spin_lock(&fs_info->qgroup_lock);

         if (init_flags) {
                 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
@@ -2914,10 +3418,12 @@
                         btrfs_warn(fs_info,
                                    "qgroup rescan init failed, qgroup is not enabled");
                         ret = -EINVAL;
+                } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+                        /* Quota disable is in progress */
+                        ret = -EBUSY;
                 }

                 if (ret) {
-                        spin_unlock(&fs_info->qgroup_lock);
                         mutex_unlock(&fs_info->qgroup_rescan_lock);
                         return ret;
                 }
@@ -2928,14 +3434,9 @@
                 sizeof(fs_info->qgroup_rescan_progress));
         fs_info->qgroup_rescan_progress.objectid = progress_objectid;
         init_completion(&fs_info->qgroup_rescan_completion);
-
-        spin_unlock(&fs_info->qgroup_lock);
         mutex_unlock(&fs_info->qgroup_rescan_lock);

-        memset(&fs_info->qgroup_rescan_work, 0,
-               sizeof(fs_info->qgroup_rescan_work));
         btrfs_init_work(&fs_info->qgroup_rescan_work,
-                        btrfs_qgroup_rescan_helper,
                         btrfs_qgroup_rescan_worker, NULL, NULL);
         return 0;
 }
@@ -3009,9 +3510,7 @@
         int ret = 0;

         mutex_lock(&fs_info->qgroup_rescan_lock);
-        spin_lock(&fs_info->qgroup_lock);
         running = fs_info->qgroup_rescan_running;
-        spin_unlock(&fs_info->qgroup_lock);
         mutex_unlock(&fs_info->qgroup_rescan_lock);

         if (!running)
@@ -3042,40 +3541,169 @@
         }
 }

+#define rbtree_iterate_from_safe(node, next, start)                            \
+       for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
+
+static int qgroup_unreserve_range(struct btrfs_inode *inode,
+                                  struct extent_changeset *reserved, u64 start,
+                                  u64 len)
+{
+        struct rb_node *node;
+        struct rb_node *next;
+        struct ulist_node *entry;
+        int ret = 0;
+
+        node = reserved->range_changed.root.rb_node;
+        if (!node)
+                return 0;
+        while (node) {
+                entry = rb_entry(node, struct ulist_node, rb_node);
+                if (entry->val < start)
+                        node = node->rb_right;
+                else
+                        node = node->rb_left;
+        }
+
+        if (entry->val > start && rb_prev(&entry->rb_node))
+                entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
+                                 rb_node);
+
+        rbtree_iterate_from_safe(node, next, &entry->rb_node) {
+                u64 entry_start;
+                u64 entry_end;
+                u64 entry_len;
+                int clear_ret;
+
+                entry = rb_entry(node, struct ulist_node, rb_node);
+                entry_start = entry->val;
+                entry_end = entry->aux;
+                entry_len = entry_end - entry_start + 1;
+
+                if (entry_start >= start + len)
+                        break;
+                if (entry_start + entry_len <= start)
+                        continue;
+                /*
+                 * Now the entry is in [start, start + len), revert the
+                 * EXTENT_QGROUP_RESERVED bit.
+                 */
+                clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
+                                              entry_end, EXTENT_QGROUP_RESERVED);
+                if (!ret && clear_ret < 0)
+                        ret = clear_ret;
+
+                ulist_del(&reserved->range_changed, entry->val, entry->aux);
+                if (likely(reserved->bytes_changed >= entry_len)) {
+                        reserved->bytes_changed -= entry_len;
+                } else {
+                        WARN_ON(1);
+                        reserved->bytes_changed = 0;
+                }
+        }
+
+        return ret;
+}
+
 /*
- * Reserve qgroup space for range [start, start + len).
+ * Try to free some space for qgroup.
  *
- * This function will either reserve space from related qgroups or doing
- * nothing if the range is already reserved.
+ * For qgroup, there are only 3 ways to free qgroup space:
+ * - Flush nodatacow write
+ *   Any nodatacow write will free its reserved data space at run_delalloc_range().
+ *   In theory, we should only flush nodatacow inodes, but it's not yet
+ *   possible, so we need to flush the whole root.
  *
- * Return 0 for successful reserve
- * Return <0 for error (including -EQUOT)
+ * - Wait for ordered extents
+ *   When ordered extents are finished, their reserved metadata is finally
+ *   converted to per_trans status, which can be freed by later commit
+ *   transaction.
  *
- * NOTE: this function may sleep for memory allocation.
- *       if btrfs_qgroup_reserve_data() is called multiple times with
- *       same @reserved, caller must ensure when error happens it's OK
- *       to free *ALL* reserved space.
+ * - Commit transaction
+ *   This would free the meta_per_trans space.
+ *   In theory this shouldn't provide much space, but any more qgroup space
+ *   is needed.
  */
-int btrfs_qgroup_reserve_data(struct inode *inode,
+static int try_flush_qgroup(struct btrfs_root *root)
+{
+        struct btrfs_trans_handle *trans;
+        int ret;
+        bool can_commit = true;
+
+        /*
+         * If current process holds a transaction, we shouldn't flush, as we
+         * assume all space reservation happens before a transaction handle is
+         * held.
+         *
+         * But there are cases like btrfs_delayed_item_reserve_metadata() where
+         * we try to reserve space with one transction handle already held.
+         * In that case we can't commit transaction, but at least try to end it
+         * and hope the started data writes can free some space.
+         */
+        if (current->journal_info &&
+            current->journal_info != BTRFS_SEND_TRANS_STUB)
+                can_commit = false;
+
+        /*
+         * We don't want to run flush again and again, so if there is a running
+         * one, we won't try to start a new flush, but exit directly.
+         */
+        if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
+                /*
+                 * We are already holding a transaction, thus we can block other
+                 * threads from flushing.  So exit right now. This increases
+                 * the chance of EDQUOT for heavy load and near limit cases.
+                 * But we can argue that if we're already near limit, EDQUOT is
+                 * unavoidable anyway.
+                 */
+                if (!can_commit)
+                        return 0;
+
+                wait_event(root->qgroup_flush_wait,
+                        !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
+                return 0;
+        }
+
+        ret = btrfs_start_delalloc_snapshot(root);
+        if (ret < 0)
+                goto out;
+        btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+
+        trans = btrfs_join_transaction(root);
+        if (IS_ERR(trans)) {
+                ret = PTR_ERR(trans);
+                goto out;
+        }
+
+        if (can_commit)
+                ret = btrfs_commit_transaction(trans);
+        else
+                ret = btrfs_end_transaction(trans);
+out:
+        clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
+        wake_up(&root->qgroup_flush_wait);
+        return ret;
+}
+
+static int qgroup_reserve_data(struct btrfs_inode *inode,
                         struct extent_changeset **reserved_ret, u64 start,
                         u64 len)
 {
-        struct btrfs_root *root = BTRFS_I(inode)->root;
-        struct ulist_node *unode;
-        struct ulist_iterator uiter;
+        struct btrfs_root *root = inode->root;
         struct extent_changeset *reserved;
+        bool new_reserved = false;
         u64 orig_reserved;
         u64 to_reserve;
         int ret;

         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
-            !is_fstree(root->objectid) || len == 0)
+            !is_fstree(root->root_key.objectid) || len == 0)
                 return 0;

         /* @reserved parameter is mandatory for qgroup */
         if (WARN_ON(!reserved_ret))
                 return -EINVAL;
         if (!*reserved_ret) {
+                new_reserved = true;
                 *reserved_ret = extent_changeset_alloc();
                 if (!*reserved_ret)
                         return -ENOMEM;
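Note: try_flush_qgroup() is a single-flusher gate: test_and_set_bit() elects one flushing
task per root, and latecomers that are allowed to sleep just wait on the winner. The bit
and the waitqueue it pairs with are new members added outside qgroup.c in the full patch
(names per upstream; this is an assumption, the hunks are not shown here):

        /* In the btrfs_root state bits (sketch, assumed): */
        enum {
                /* ... */
                BTRFS_ROOT_QGROUP_FLUSHING,   /* set while try_flush_qgroup() runs */
        };

        struct btrfs_root {
                /* ... */
                wait_queue_head_t qgroup_flush_wait; /* init_waitqueue_head() at setup */
        };

Every exit path must clear the bit and call wake_up(), which is why the function funnels
through a single out: label.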
@@ -3083,15 +3711,15 @@
         reserved = *reserved_ret;
         /* Record already reserved space */
         orig_reserved = reserved->bytes_changed;
-        ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+        ret = set_record_extent_bits(&inode->io_tree, start,
                         start + len -1, EXTENT_QGROUP_RESERVED, reserved);

         /* Newly reserved space */
         to_reserve = reserved->bytes_changed - orig_reserved;
-        trace_btrfs_qgroup_reserve_data(inode, start, len,
+        trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
                                         to_reserve, QGROUP_RESERVE);
         if (ret < 0)
-                goto cleanup;
+                goto out;
         ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
         if (ret < 0)
                 goto cleanup;
@@ -3099,23 +3727,49 @@
         return ret;

 cleanup:
-        /* cleanup *ALL* already reserved ranges */
-        ULIST_ITER_INIT(&uiter);
-        while ((unode = ulist_next(&reserved->range_changed, &uiter)))
-                clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
-                                 unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
-        /* Also free data bytes of already reserved one */
-        btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
-                                  orig_reserved, BTRFS_QGROUP_RSV_DATA);
-        extent_changeset_release(reserved);
+        qgroup_unreserve_range(inode, reserved, start, len);
+out:
+        if (new_reserved) {
+                extent_changeset_release(reserved);
+                kfree(reserved);
+                *reserved_ret = NULL;
+        }
         return ret;
 }

+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or do nothing
+ * if the range is already reserved.
+ *
+ * Return 0 for successful reservation
+ * Return <0 for error (including -EQUOT)
+ *
+ * NOTE: This function may sleep for memory allocation, dirty page flushing and
+ *       commit transaction.  So caller should not hold any dirty page locked.
+ */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+                        struct extent_changeset **reserved_ret, u64 start,
+                        u64 len)
+{
+        int ret;
+
+        ret = qgroup_reserve_data(inode, reserved_ret, start, len);
+        if (ret <= 0 && ret != -EDQUOT)
+                return ret;
+
+        ret = try_flush_qgroup(inode->root);
+        if (ret < 0)
+                return ret;
+        return qgroup_reserve_data(inode, reserved_ret, start, len);
+}
+
 /* Free ranges specified by @reserved, normally in error path */
-static int qgroup_free_reserved_data(struct inode *inode,
+static int qgroup_free_reserved_data(struct btrfs_inode *inode,
                 struct extent_changeset *reserved, u64 start, u64 len)
 {
-        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_root *root = inode->root;
         struct ulist_node *unode;
         struct ulist_iterator uiter;
         struct extent_changeset changeset;
....@@ -3151,14 +3805,14 @@
31513805 * EXTENT_QGROUP_RESERVED, we won't double free.
31523806 * So not need to rush.
31533807 */
3154
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
3155
- free_start, free_start + free_len - 1,
3808
+ ret = clear_record_extent_bits(&inode->io_tree, free_start,
3809
+ free_start + free_len - 1,
31563810 EXTENT_QGROUP_RESERVED, &changeset);
31573811 if (ret < 0)
31583812 goto out;
31593813 freed += changeset.bytes_changed;
31603814 }
3161
- btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed,
3815
+ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
31623816 BTRFS_QGROUP_RSV_DATA);
31633817 ret = freed;
31643818 out:
....@@ -3166,7 +3820,7 @@
31663820 return ret;
31673821 }
31683822
3169
-static int __btrfs_qgroup_release_data(struct inode *inode,
3823
+static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
31703824 struct extent_changeset *reserved, u64 start, u64 len,
31713825 int free)
31723826 {
....@@ -3174,8 +3828,7 @@
31743828 int trace_op = QGROUP_RELEASE;
31753829 int ret;
31763830
3177
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
3178
- &BTRFS_I(inode)->root->fs_info->flags))
3831
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
31793832 return 0;
31803833
31813834 /* In release case, we shouldn't have @reserved */
....@@ -3183,18 +3836,18 @@
31833836 if (free && reserved)
31843837 return qgroup_free_reserved_data(inode, reserved, start, len);
31853838 extent_changeset_init(&changeset);
3186
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
3187
- start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
3839
+ ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
3840
+ EXTENT_QGROUP_RESERVED, &changeset);
31883841 if (ret < 0)
31893842 goto out;
31903843
31913844 if (free)
31923845 trace_op = QGROUP_FREE;
3193
- trace_btrfs_qgroup_release_data(inode, start, len,
3846
+ trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
31943847 changeset.bytes_changed, trace_op);
31953848 if (free)
3196
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
3197
- BTRFS_I(inode)->root->objectid,
3849
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
3850
+ inode->root->root_key.objectid,
31983851 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
31993852 ret = changeset.bytes_changed;
32003853 out:
....@@ -3214,7 +3867,7 @@
32143867 *
32153868 * NOTE: This function may sleep for memory allocation.
32163869 */
3217
-int btrfs_qgroup_free_data(struct inode *inode,
3870
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
32183871 struct extent_changeset *reserved, u64 start, u64 len)
32193872 {
32203873 return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
....@@ -3235,7 +3888,7 @@
32353888 *
32363889 * NOTE: This function may sleep for memory allocation.
32373890 */
3238
-int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
3891
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
32393892 {
32403893 return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
32413894 }
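
The two entry points above differ only in the last argument they pass to __btrfs_qgroup_release_data(): free gives the quota charge back (error path), release drops the in-flight marker but keeps the usage accounted (the data made it to disk). A self-contained toy model of that distinction follows; reserve()/free_rsv()/release() and the two counters are made-up stand-ins, not the btrfs API.

    #include <assert.h>
    #include <stdio.h>

    static long used;     /* space already accounted against the quota */
    static long inflight; /* reserved for writes still in flight */

    static void reserve(long n)  { inflight += n; }
    static void free_rsv(long n) { inflight -= n; }            /* write failed */
    static void release(long n)  { inflight -= n; used += n; } /* write done */

    int main(void)
    {
            reserve(4096);
            release(4096);   /* success: usage stays charged to the qgroup */
            reserve(4096);
            free_rsv(4096);  /* error path: the charge is fully undone */
            assert(inflight == 0 && used == 4096);
            printf("used=%ld inflight=%ld\n", used, inflight);
            return 0;
    }
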
....@@ -3280,14 +3933,14 @@
32803933 return num_bytes;
32813934 }
32823935
3283
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3284
- enum btrfs_qgroup_rsv_type type, bool enforce)
3936
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3937
+ enum btrfs_qgroup_rsv_type type, bool enforce)
32853938 {
32863939 struct btrfs_fs_info *fs_info = root->fs_info;
32873940 int ret;
32883941
32893942 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3290
- !is_fstree(root->objectid) || num_bytes == 0)
3943
+ !is_fstree(root->root_key.objectid) || num_bytes == 0)
32913944 return 0;
32923945
32933946 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
....@@ -3307,18 +3960,33 @@
33073960 return ret;
33083961 }
33093962
3963
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3964
+ enum btrfs_qgroup_rsv_type type, bool enforce)
3965
+{
3966
+ int ret;
3967
+
3968
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
3969
+ if (ret <= 0 && ret != -EDQUOT)
3970
+ return ret;
3971
+
3972
+ ret = try_flush_qgroup(root);
3973
+ if (ret < 0)
3974
+ return ret;
3975
+ return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
3976
+}
3977
+
33103978 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
33113979 {
33123980 struct btrfs_fs_info *fs_info = root->fs_info;
33133981
33143982 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3315
- !is_fstree(root->objectid))
3983
+ !is_fstree(root->root_key.objectid))
33163984 return;
33173985
33183986 /* TODO: Update trace point to handle such free */
33193987 trace_qgroup_meta_free_all_pertrans(root);
33203988 /* Special value -1 means to free all reserved space */
3321
- btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
3989
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
33223990 BTRFS_QGROUP_RSV_META_PERTRANS);
33233991 }
33243992
....@@ -3328,7 +3996,7 @@
33283996 struct btrfs_fs_info *fs_info = root->fs_info;
33293997
33303998 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3331
- !is_fstree(root->objectid))
3999
+ !is_fstree(root->root_key.objectid))
33324000 return;
33334001
33344002 /*
....@@ -3339,13 +4007,13 @@
33394007 num_bytes = sub_root_meta_rsv(root, num_bytes, type);
33404008 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
33414009 trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
3342
- btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type);
4010
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
4011
+ num_bytes, type);
33434012 }
33444013
33454014 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
33464015 int num_bytes)
33474016 {
3348
- struct btrfs_root *quota_root = fs_info->quota_root;
33494017 struct btrfs_qgroup *qgroup;
33504018 struct ulist_node *unode;
33514019 struct ulist_iterator uiter;
....@@ -3353,7 +4021,7 @@
33534021
33544022 if (num_bytes == 0)
33554023 return;
3356
- if (!quota_root)
4024
+ if (!fs_info->quota_root)
33574025 return;
33584026
33594027 spin_lock(&fs_info->qgroup_lock);
....@@ -3393,20 +4061,20 @@
33934061 struct btrfs_fs_info *fs_info = root->fs_info;
33944062
33954063 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3396
- !is_fstree(root->objectid))
4064
+ !is_fstree(root->root_key.objectid))
33974065 return;
33984066 /* Same as btrfs_qgroup_free_meta_prealloc() */
33994067 num_bytes = sub_root_meta_rsv(root, num_bytes,
34004068 BTRFS_QGROUP_RSV_META_PREALLOC);
34014069 trace_qgroup_meta_convert(root, num_bytes);
3402
- qgroup_convert_meta(fs_info, root->objectid, num_bytes);
4070
+ qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
34034071 }
34044072
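
btrfs_qgroup_convert_reserved_meta() above moves an already-reserved amount between buckets rather than freeing it: the PREALLOC charge becomes a PERTRANS charge and the qgroup total is unchanged. A runnable toy version of that bookkeeping, where only the bucket names mirror the BTRFS_QGROUP_RSV_* types and everything else is made up:

    #include <stdio.h>

    enum { RSV_META_PREALLOC, RSV_META_PERTRANS, RSV_LAST };

    static long rsv[RSV_LAST];

    static void convert_meta(long bytes)
    {
            rsv[RSV_META_PREALLOC] -= bytes;  /* drop the prealloc charge */
            rsv[RSV_META_PERTRANS] += bytes;  /* re-charge it as pertrans */
    }

    int main(void)
    {
            rsv[RSV_META_PREALLOC] = 16384;   /* reserved before the modification */
            convert_meta(16384);              /* metadata joined the transaction */
            printf("prealloc=%ld pertrans=%ld\n",
                   rsv[RSV_META_PREALLOC], rsv[RSV_META_PERTRANS]);
            return 0;
    }
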
34054073 /*
34064074 * Check for qgroup reserved space leaks, normally at inode
34074075 * destroy time
34084076 */
3409
-void btrfs_qgroup_check_reserved_leak(struct inode *inode)
4077
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
34104078 {
34114079 struct extent_changeset changeset;
34124080 struct ulist_node *unode;
....@@ -3414,21 +4082,278 @@
34144082 int ret;
34154083
34164084 extent_changeset_init(&changeset);
3417
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
4085
+ ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
34184086 EXTENT_QGROUP_RESERVED, &changeset);
34194087
34204088 WARN_ON(ret < 0);
34214089 if (WARN_ON(changeset.bytes_changed)) {
34224090 ULIST_ITER_INIT(&iter);
34234091 while ((unode = ulist_next(&changeset.range_changed, &iter))) {
3424
- btrfs_warn(BTRFS_I(inode)->root->fs_info,
3425
- "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
3426
- inode->i_ino, unode->val, unode->aux);
4092
+ btrfs_warn(inode->root->fs_info,
4093
+ "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
4094
+ btrfs_ino(inode), unode->val, unode->aux);
34274095 }
3428
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
3429
- BTRFS_I(inode)->root->objectid,
4096
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
4097
+ inode->root->root_key.objectid,
34304098 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
34314099
34324100 }
34334101 extent_changeset_release(&changeset);
34344102 }
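
btrfs_qgroup_check_reserved_leak() clears every EXTENT_QGROUP_RESERVED range still set at inode teardown, warns about each one, and returns the total to the qgroup so nothing stays charged. A toy teardown check in the same spirit; the leftovers[] array is a stand-in for the io_tree bits:

    #include <stdio.h>

    struct range { unsigned long long start, end; };

    /* pretend leftover reserved ranges found at inode destroy time */
    static struct range leftovers[] = { { 0, 4095 }, { 8192, 12287 } };

    int main(void)
    {
            unsigned long long freed = 0;
            size_t i;

            for (i = 0; i < sizeof(leftovers) / sizeof(leftovers[0]); i++) {
                    fprintf(stderr,
                            "leaking qgroup reserved space, start: %llu, end: %llu\n",
                            leftovers[i].start, leftovers[i].end);
                    freed += leftovers[i].end - leftovers[i].start + 1;
            }
            /* the kernel then hands 'freed' back via btrfs_qgroup_free_refroot() */
            printf("returned %llu bytes to the qgroup\n", freed);
            return 0;
    }
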
4103
+
4104
+void btrfs_qgroup_init_swapped_blocks(
4105
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks)
4106
+{
4107
+ int i;
4108
+
4109
+ spin_lock_init(&swapped_blocks->lock);
4110
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
4111
+ swapped_blocks->blocks[i] = RB_ROOT;
4112
+ swapped_blocks->swapped = false;
4113
+}
4114
+
4115
+/*
4116
+ * Delete all swapped block records of @root.
4117
+ * Every record here means we skipped a full subtree scan for qgroup.
4118
+ *
4119
+ * Called when committing a transaction.
4120
+ */
4121
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
4122
+{
4123
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks;
4124
+ int i;
4125
+
4126
+ swapped_blocks = &root->swapped_blocks;
4127
+
4128
+ spin_lock(&swapped_blocks->lock);
4129
+ if (!swapped_blocks->swapped)
4130
+ goto out;
4131
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4132
+ struct rb_root *cur_root = &swapped_blocks->blocks[i];
4133
+ struct btrfs_qgroup_swapped_block *entry;
4134
+ struct btrfs_qgroup_swapped_block *next;
4135
+
4136
+ rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
4137
+ node)
4138
+ kfree(entry);
4139
+ swapped_blocks->blocks[i] = RB_ROOT;
4140
+ }
4141
+ swapped_blocks->swapped = false;
4142
+out:
4143
+ spin_unlock(&swapped_blocks->lock);
4144
+}
4145
+
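
btrfs_qgroup_clean_swapped_blocks() tears each rb-tree down with rbtree_postorder_for_each_entry_safe(), which hands out every node only after both of its subtrees, so kfree() never invalidates a node the walk still needs. The same post-order idea on a plain binary tree, as a standalone userspace sketch:

    #include <stdlib.h>

    struct node {
            struct node *left, *right;
    };

    static void free_postorder(struct node *n)
    {
            if (!n)
                    return;
            free_postorder(n->left);   /* both subtrees first... */
            free_postorder(n->right);
            free(n);                   /* ...the node itself last */
    }

    int main(void)
    {
            struct node *root = calloc(1, sizeof(*root));

            root->left = calloc(1, sizeof(*root));
            root->right = calloc(1, sizeof(*root));
            free_postorder(root);      /* no freed node is ever revisited */
            return 0;
    }
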
4146
+/*
4147
+ * Add a record of the swapped subtree roots for @subvol_root.
4148
+ *
4149
+ * @subvol_root: tree root of the subvolume tree getting swapped
4150
+ * @bg: block group under balance
4151
+ * @subvol_parent/slot: pointer to the subtree root in subvolume tree
4152
+ * @reloc_parent/slot: pointer to the subtree root in reloc tree
4153
+ * BOTH POINTERS ARE BEFORE TREE SWAP
4154
+ * @last_snapshot: last snapshot generation of the subvolume tree
4155
+ */
4156
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
4157
+ struct btrfs_root *subvol_root,
4158
+ struct btrfs_block_group *bg,
4159
+ struct extent_buffer *subvol_parent, int subvol_slot,
4160
+ struct extent_buffer *reloc_parent, int reloc_slot,
4161
+ u64 last_snapshot)
4162
+{
4163
+ struct btrfs_fs_info *fs_info = subvol_root->fs_info;
4164
+ struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
4165
+ struct btrfs_qgroup_swapped_block *block;
4166
+ struct rb_node **cur;
4167
+ struct rb_node *parent = NULL;
4168
+ int level = btrfs_header_level(subvol_parent) - 1;
4169
+ int ret = 0;
4170
+
4171
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4172
+ return 0;
4173
+
4174
+ if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
4175
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
4176
+ btrfs_err_rl(fs_info,
4177
+ "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
4178
+ __func__,
4179
+ btrfs_node_ptr_generation(subvol_parent, subvol_slot),
4180
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot));
4181
+ return -EUCLEAN;
4182
+ }
4183
+
4184
+ block = kmalloc(sizeof(*block), GFP_NOFS);
4185
+ if (!block) {
4186
+ ret = -ENOMEM;
4187
+ goto out;
4188
+ }
4189
+
4190
+ /*
4191
+ * @reloc_parent/slot is still before swap, while @block is going to
4192
+ * record the bytenr after swap, so we do the swap here.
4193
+ */
4194
+ block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
4195
+ block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
4196
+ reloc_slot);
4197
+ block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
4198
+ block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
4199
+ subvol_slot);
4200
+ block->last_snapshot = last_snapshot;
4201
+ block->level = level;
4202
+
4203
+ /*
4204
+ * If we have bg == NULL, we're called from btrfs_recover_relocation(),
4205
+ * where no one else can modify tree blocks, thus the qgroup will not change
4206
+ * no matter the value of trace_leaf.
4207
+ */
4208
+ if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
4209
+ block->trace_leaf = true;
4210
+ else
4211
+ block->trace_leaf = false;
4212
+ btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
4213
+
4214
+ /* Insert @block into @blocks */
4215
+ spin_lock(&blocks->lock);
4216
+ cur = &blocks->blocks[level].rb_node;
4217
+ while (*cur) {
4218
+ struct btrfs_qgroup_swapped_block *entry;
4219
+
4220
+ parent = *cur;
4221
+ entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
4222
+ node);
4223
+
4224
+ if (entry->subvol_bytenr < block->subvol_bytenr) {
4225
+ cur = &(*cur)->rb_left;
4226
+ } else if (entry->subvol_bytenr > block->subvol_bytenr) {
4227
+ cur = &(*cur)->rb_right;
4228
+ } else {
4229
+ if (entry->subvol_generation !=
4230
+ block->subvol_generation ||
4231
+ entry->reloc_bytenr != block->reloc_bytenr ||
4232
+ entry->reloc_generation !=
4233
+ block->reloc_generation) {
4234
+ /*
4235
+ * Duplicate but mismatched entry found.
4236
+ * Shouldn't happen.
4237
+ *
4238
+ * Marking qgroup inconsistent should be enough
4239
+ * for end users.
4240
+ */
4241
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4242
+ ret = -EEXIST;
4243
+ }
4244
+ kfree(block);
4245
+ goto out_unlock;
4246
+ }
4247
+ }
4248
+ rb_link_node(&block->node, parent, cur);
4249
+ rb_insert_color(&block->node, &blocks->blocks[level]);
4250
+ blocks->swapped = true;
4251
+out_unlock:
4252
+ spin_unlock(&blocks->lock);
4253
+out:
4254
+ if (ret < 0)
4255
+ fs_info->qgroup_flags |=
4256
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4257
+ return ret;
4258
+}
4259
+
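
The insertion loop in btrfs_qgroup_add_swapped_blocks() above is the classic "walk down to a null child pointer, bail out on an equal key" pattern, keyed by subvol_bytenr. The kernel version uses the rb-tree API (rb_link_node/rb_insert_color) with its own, internally consistent ordering; the plain BST below is a simplified, conventional-ordering sketch of the same duplicate-rejecting insert:

    #include <errno.h>
    #include <stdlib.h>

    struct block {
            unsigned long long subvol_bytenr;   /* the sort key */
            struct block *left, *right;
    };

    /* Returns 0 on insert, -EEXIST if the bytenr is already present. */
    static int insert_block(struct block **root, struct block *nb)
    {
            struct block **cur = root;

            while (*cur) {
                    if (nb->subvol_bytenr < (*cur)->subvol_bytenr)
                            cur = &(*cur)->left;
                    else if (nb->subvol_bytenr > (*cur)->subvol_bytenr)
                            cur = &(*cur)->right;
                    else
                            return -EEXIST;     /* duplicate: caller frees nb */
            }
            *cur = nb;
            return 0;
    }

    int main(void)
    {
            struct block *root = NULL;
            struct block *a = calloc(1, sizeof(*a));
            struct block *b = calloc(1, sizeof(*b));

            a->subvol_bytenr = 1;
            b->subvol_bytenr = 1;                /* same key on purpose */
            insert_block(&root, a);
            if (insert_block(&root, b) == -EEXIST)
                    free(b);                     /* mirrors the kfree(block) above */
            free(a);
            return 0;
    }
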
4260
+/*
4261
+ * Check if the tree block is a subtree root, and if so do the needed
4262
+ * delayed subtree trace for qgroup.
4263
+ *
4264
+ * This is called during btrfs_cow_block().
4265
+ */
4266
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
4267
+ struct btrfs_root *root,
4268
+ struct extent_buffer *subvol_eb)
4269
+{
4270
+ struct btrfs_fs_info *fs_info = root->fs_info;
4271
+ struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
4272
+ struct btrfs_qgroup_swapped_block *block;
4273
+ struct extent_buffer *reloc_eb = NULL;
4274
+ struct rb_node *node;
4275
+ bool found = false;
4276
+ bool swapped = false;
4277
+ int level = btrfs_header_level(subvol_eb);
4278
+ int ret = 0;
4279
+ int i;
4280
+
4281
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4282
+ return 0;
4283
+ if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
4284
+ return 0;
4285
+
4286
+ spin_lock(&blocks->lock);
4287
+ if (!blocks->swapped) {
4288
+ spin_unlock(&blocks->lock);
4289
+ return 0;
4290
+ }
4291
+ node = blocks->blocks[level].rb_node;
4292
+
4293
+ while (node) {
4294
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
4295
+ if (block->subvol_bytenr < subvol_eb->start) {
4296
+ node = node->rb_left;
4297
+ } else if (block->subvol_bytenr > subvol_eb->start) {
4298
+ node = node->rb_right;
4299
+ } else {
4300
+ found = true;
4301
+ break;
4302
+ }
4303
+ }
4304
+ if (!found) {
4305
+ spin_unlock(&blocks->lock);
4306
+ goto out;
4307
+ }
4308
+ /* Found one, remove it from @blocks first and update blocks->swapped */
4309
+ rb_erase(&block->node, &blocks->blocks[level]);
4310
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4311
+ if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
4312
+ swapped = true;
4313
+ break;
4314
+ }
4315
+ }
4316
+ blocks->swapped = swapped;
4317
+ spin_unlock(&blocks->lock);
4318
+
4319
+ /* Read out reloc subtree root */
4320
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
4321
+ block->reloc_generation, block->level,
4322
+ &block->first_key);
4323
+ if (IS_ERR(reloc_eb)) {
4324
+ ret = PTR_ERR(reloc_eb);
4325
+ reloc_eb = NULL;
4326
+ goto free_out;
4327
+ }
4328
+ if (!extent_buffer_uptodate(reloc_eb)) {
4329
+ ret = -EIO;
4330
+ goto free_out;
4331
+ }
4332
+
4333
+ ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
4334
+ block->last_snapshot, block->trace_leaf);
4335
+free_out:
4336
+ kfree(block);
4337
+ free_extent_buffer(reloc_eb);
4338
+out:
4339
+ if (ret < 0) {
4340
+ btrfs_err_rl(fs_info,
4341
+ "failed to account subtree at bytenr %llu: %d",
4342
+ subvol_eb->start, ret);
4343
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4344
+ }
4345
+ return ret;
4346
+}
4347
+
4348
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
4349
+{
4350
+ struct btrfs_qgroup_extent_record *entry;
4351
+ struct btrfs_qgroup_extent_record *next;
4352
+ struct rb_root *root;
4353
+
4354
+ root = &trans->delayed_refs.dirty_extent_root;
4355
+ rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
4356
+ ulist_free(entry->old_roots);
4357
+ kfree(entry);
4358
+ }
4359
+}