2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/fs/btrfs/qgroup.c
....@@ -11,7 +11,7 @@
1111 #include <linux/slab.h>
1212 #include <linux/workqueue.h>
1313 #include <linux/btrfs.h>
14
-#include <linux/sizes.h>
14
+#include <linux/sched/mm.h>
1515
1616 #include "ctree.h"
1717 #include "transaction.h"
....@@ -21,7 +21,8 @@
2121 #include "backref.h"
2222 #include "extent_io.h"
2323 #include "qgroup.h"
24
-
24
+#include "block-group.h"
25
+#include "sysfs.h"
2526
2627 /* TODO XXX FIXME
2728 * - subvol delete -> delete when ref goes to 0? delete limits also?
....@@ -30,7 +31,7 @@
3031 * - sync
3132 * - copy also limits on subvol creation
3233 * - limit
33
- * - caches fuer ulists
34
+ * - caches for ulists
3435 * - performance benchmarks
3536 * - check all ioctl parameters
3637 */
....@@ -220,7 +221,8 @@
220221 return qgroup;
221222 }
222223
223
-static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
224
+static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
225
+ struct btrfs_qgroup *qgroup)
224226 {
225227 struct btrfs_qgroup_list *list;
226228
....@@ -240,7 +242,6 @@
240242 list_del(&list->next_member);
241243 kfree(list);
242244 }
243
- kfree(qgroup);
244245 }
245246
246247 /* must be called with qgroup_lock held */
....@@ -252,7 +253,7 @@
252253 return -ENOENT;
253254
254255 rb_erase(&qgroup->node, &fs_info->qgroup_tree);
255
- __del_qgroup_rb(qgroup);
256
+ __del_qgroup_rb(fs_info, qgroup);
256257 return 0;
257258 }
258259
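The signature change above threads fs_info through __del_qgroup_rb(), and the kfree() moves out to the callers so they can tear down per-qgroup state (such as the sysfs entry added later in this patch) between unlinking and freeing. A minimal userspace C sketch of that unlink-then-free split, with illustrative stand-in names rather than real btrfs structures:

#include <stdio.h>
#include <stdlib.h>

struct qgroup {
	struct qgroup *next;		/* stand-in for the rb-tree linkage */
	unsigned long long id;
};

/* Detach only; freeing is now the caller's responsibility. */
static void unlink_qgroup(struct qgroup **head, struct qgroup *qg)
{
	for (; *head; head = &(*head)->next) {
		if (*head == qg) {
			*head = qg->next;
			return;
		}
	}
}

int main(void)
{
	struct qgroup *head = NULL;
	struct qgroup *qg = calloc(1, sizeof(*qg));

	if (!qg)
		return 1;
	qg->id = 257;
	qg->next = head;
	head = qg;

	unlink_qgroup(&head, qg);
	printf("qgroup %llu unlinked; extra teardown goes here\n", qg->id);
	free(qg);			/* only now is the memory released */
	return 0;
}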
....@@ -351,6 +352,9 @@
351352 goto out;
352353 }
353354
355
+ ret = btrfs_sysfs_add_qgroups(fs_info);
356
+ if (ret < 0)
357
+ goto out;
354358 /* default this to quota off, in case no status key is found */
355359 fs_info->qgroup_flags = 0;
356360
....@@ -412,6 +416,10 @@
412416 goto out;
413417 }
414418 }
419
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
420
+ if (ret < 0)
421
+ goto out;
422
+
415423 switch (found_key.type) {
416424 case BTRFS_QGROUP_INFO_KEY: {
417425 struct btrfs_qgroup_info_item *ptr;
....@@ -500,9 +508,48 @@
500508 ulist_free(fs_info->qgroup_ulist);
501509 fs_info->qgroup_ulist = NULL;
502510 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
511
+ btrfs_sysfs_del_qgroups(fs_info);
503512 }
504513
505514 return ret < 0 ? ret : 0;
515
+}
516
+
517
+/*
518
+ * Called in close_ctree() when quota is still enabled. This verifies we don't
519
+ * leak some reserved space.
520
+ *
521
+ * Return false if no reserved space is left.
522
+ * Return true if some reserved space is leaked.
523
+ */
524
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
525
+{
526
+ struct rb_node *node;
527
+ bool ret = false;
528
+
529
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
530
+ return ret;
531
+ /*
532
+ * Since we're unmounting, there is no race and no need to grab qgroup
533
+ * lock. And here we don't go post-order, to provide a more user-friendly
534
+ * sorted result.
535
+ */
536
+ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
537
+ struct btrfs_qgroup *qgroup;
538
+ int i;
539
+
540
+ qgroup = rb_entry(node, struct btrfs_qgroup, node);
541
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
542
+ if (qgroup->rsv.values[i]) {
543
+ ret = true;
544
+ btrfs_warn(fs_info,
545
+ "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
546
+ btrfs_qgroup_level(qgroup->qgroupid),
547
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
548
+ i, qgroup->rsv.values[i]);
549
+ }
550
+ }
551
+ }
552
+ return ret;
506553 }
507554
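btrfs_check_quota_leak() above walks every qgroup in key order and warns for each reservation type that still holds bytes at unmount. A simplified, runnable userspace analogue of that walk (an array instead of an rb-tree, invented names; BTRFS_QGROUP_RSV_LAST is replaced by a stand-in constant):

#include <stdbool.h>
#include <stdio.h>

#define RSV_TYPES 3			/* stand-in for BTRFS_QGROUP_RSV_LAST */

struct fake_qgroup {
	unsigned long long id;
	unsigned long long rsv[RSV_TYPES];
};

/* Returns true if any reservation leaked, mirroring the in-order walk. */
static bool check_leak(const struct fake_qgroup *groups, int count)
{
	bool leaked = false;

	for (int g = 0; g < count; g++) {
		for (int i = 0; i < RSV_TYPES; i++) {
			if (groups[g].rsv[i]) {
				leaked = true;
				fprintf(stderr,
					"qgroup %llu has unreleased space, type %d rsv %llu\n",
					groups[g].id, i, groups[g].rsv[i]);
			}
		}
	}
	return leaked;
}

int main(void)
{
	struct fake_qgroup groups[] = { { 256, { 0, 4096, 0 } }, { 257, { 0 } } };

	return check_leak(groups, 2) ? 1 : 0;
}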
508555 /*
....@@ -519,15 +566,18 @@
519566 while ((n = rb_first(&fs_info->qgroup_tree))) {
520567 qgroup = rb_entry(n, struct btrfs_qgroup, node);
521568 rb_erase(n, &fs_info->qgroup_tree);
522
- __del_qgroup_rb(qgroup);
569
+ __del_qgroup_rb(fs_info, qgroup);
570
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
571
+ kfree(qgroup);
523572 }
524573 /*
525
- * we call btrfs_free_qgroup_config() when umounting
574
+ * We call btrfs_free_qgroup_config() when unmounting
526575 * filesystem and disabling quota, so we set qgroup_ulist
527576 * to be null here to avoid double free.
528577 */
529578 ulist_free(fs_info->qgroup_ulist);
530579 fs_info->qgroup_ulist = NULL;
580
+ btrfs_sysfs_del_qgroups(fs_info);
531581 }
532582
533583 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
....@@ -887,12 +937,46 @@
887937 struct btrfs_key found_key;
888938 struct btrfs_qgroup *qgroup = NULL;
889939 struct btrfs_trans_handle *trans = NULL;
940
+ struct ulist *ulist = NULL;
890941 int ret = 0;
891942 int slot;
943
+
944
+ /*
945
+ * We need to have subvol_sem write locked, to prevent races between
946
+ * concurrent tasks trying to enable quotas, because we will unlock
947
+ * and relock qgroup_ioctl_lock before setting fs_info->quota_root
948
+ * and before setting BTRFS_FS_QUOTA_ENABLED.
949
+ */
950
+ lockdep_assert_held_write(&fs_info->subvol_sem);
892951
893952 mutex_lock(&fs_info->qgroup_ioctl_lock);
894953 if (fs_info->quota_root)
895954 goto out;
955
+
956
+ ulist = ulist_alloc(GFP_KERNEL);
957
+ if (!ulist) {
958
+ ret = -ENOMEM;
959
+ goto out;
960
+ }
961
+
962
+ ret = btrfs_sysfs_add_qgroups(fs_info);
963
+ if (ret < 0)
964
+ goto out;
965
+
966
+ /*
967
+ * Unlock qgroup_ioctl_lock before starting the transaction. This is to
968
+ * avoid lock acquisition inversion problems (reported by lockdep) between
969
+ * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
970
+ * start a transaction.
971
+ * After we started the transaction lock qgroup_ioctl_lock again and
972
+ * check if someone else created the quota root in the meanwhile. If so,
973
+ * just return success and release the transaction handle.
974
+ *
975
+ * Also we don't need to worry about someone else calling
976
+ * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
977
+ * that function returns 0 (success) when the sysfs entries already exist.
978
+ */
979
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
896980
897981 /*
898982 * 1 for quota root item
....@@ -903,24 +987,24 @@
903987 * would be a lot of overkill.
904988 */
905989 trans = btrfs_start_transaction(tree_root, 2);
990
+
991
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
906992 if (IS_ERR(trans)) {
907993 ret = PTR_ERR(trans);
908994 trans = NULL;
909995 goto out;
910996 }
911997
912
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
913
- if (!fs_info->qgroup_ulist) {
914
- ret = -ENOMEM;
915
- btrfs_abort_transaction(trans, ret);
998
+ if (fs_info->quota_root)
916999 goto out;
917
- }
1000
+
1001
+ fs_info->qgroup_ulist = ulist;
1002
+ ulist = NULL;
9181003
9191004 /*
9201005 * initially create the quota tree
9211006 */
922
- quota_root = btrfs_create_tree(trans, fs_info,
923
- BTRFS_QUOTA_TREE_OBJECTID);
1007
+ quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
9241008 if (IS_ERR(quota_root)) {
9251009 ret = PTR_ERR(quota_root);
9261010 btrfs_abort_transaction(trans, ret);
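The two hunks above implement the unlock/start-transaction/relock dance described in the big comment: qgroup_ioctl_lock is dropped before btrfs_start_transaction() (which takes the vfs freeze semaphores), then retaken, and the quota_root check is repeated because another task may have won the race. A runnable pthreads sketch of that general pattern, with invented names standing in for the btrfs locks:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ioctl_lock = PTHREAD_MUTEX_INITIALIZER;
static int quota_root_created;

/* Stand-in for btrfs_start_transaction(): takes other locks internally. */
static void start_transaction(void) { }

static int enable_quota(void)
{
	pthread_mutex_lock(&ioctl_lock);
	if (quota_root_created) {
		pthread_mutex_unlock(&ioctl_lock);
		return 0;
	}
	pthread_mutex_unlock(&ioctl_lock);	/* drop before the nested locks */

	start_transaction();

	pthread_mutex_lock(&ioctl_lock);
	if (quota_root_created) {		/* somebody else won the race */
		pthread_mutex_unlock(&ioctl_lock);
		return 0;
	}
	quota_root_created = 1;
	pthread_mutex_unlock(&ioctl_lock);
	return 0;
}

int main(void)
{
	enable_quota();
	printf("quota enabled: %d\n", quota_root_created);
	return 0;
}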
....@@ -976,6 +1060,10 @@
9761060 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9771061
9781062 if (found_key.type == BTRFS_ROOT_REF_KEY) {
1063
+
1064
+ /* Release locks on tree_root before we access quota_root */
1065
+ btrfs_release_path(path);
1066
+
9791067 ret = add_qgroup_item(trans, quota_root,
9801068 found_key.offset);
9811069 if (ret) {
....@@ -988,6 +1076,25 @@
9881076 ret = PTR_ERR(qgroup);
9891077 btrfs_abort_transaction(trans, ret);
9901078 goto out_free_path;
1079
+ }
1080
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1081
+ if (ret < 0) {
1082
+ btrfs_abort_transaction(trans, ret);
1083
+ goto out_free_path;
1084
+ }
1085
+ ret = btrfs_search_slot_for_read(tree_root, &found_key,
1086
+ path, 1, 0);
1087
+ if (ret < 0) {
1088
+ btrfs_abort_transaction(trans, ret);
1089
+ goto out_free_path;
1090
+ }
1091
+ if (ret > 0) {
1092
+ /*
1093
+ * Shouldn't happen, but in case it does we
1094
+ * don't need to do the btrfs_next_item, just
1095
+ * continue.
1096
+ */
1097
+ continue;
9911098 }
9921099 }
9931100 ret = btrfs_next_item(tree_root, path);
....@@ -1013,9 +1120,25 @@
10131120 btrfs_abort_transaction(trans, ret);
10141121 goto out_free_path;
10151122 }
1123
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1124
+ if (ret < 0) {
1125
+ btrfs_abort_transaction(trans, ret);
1126
+ goto out_free_path;
1127
+ }
10161128
1129
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
1130
+ /*
1131
+ * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
1132
+ * a deadlock with tasks concurrently doing other qgroup operations, such as
1133
+ * adding/removing qgroups or adding/deleting qgroup relations for example,
1134
+ * because all qgroup operations first start or join a transaction and then
1135
+ * lock the qgroup_ioctl_lock mutex.
1136
+ * We are safe from a concurrent task trying to enable quotas by calling
1137
+ * this function, since we are serialized by fs_info->subvol_sem.
1138
+ */
10171139 ret = btrfs_commit_transaction(trans);
10181140 trans = NULL;
1141
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
10191142 if (ret)
10201143 goto out_free_path;
10211144
....@@ -1035,24 +1158,40 @@
10351158 fs_info->qgroup_rescan_running = true;
10361159 btrfs_queue_work(fs_info->qgroup_rescan_workers,
10371160 &fs_info->qgroup_rescan_work);
1161
+ } else {
1162
+ /*
1163
+ * We have set both BTRFS_FS_QUOTA_ENABLED and
1164
+ * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
1165
+ * -EINPROGRESS. That can happen because someone started the
1166
+ * rescan worker by calling the quota rescan ioctl before we
1167
+ * attempted to initialize the rescan worker. Failure due to
1168
+ * quotas disabled in the meanwhile is not possible, because
1169
+ * we are holding a write lock on fs_info->subvol_sem, which
1170
+ * is also acquired when disabling quotas.
1171
+ * Ignore such an error; any other error would need to undo
1172
+ * everything we did in the transaction we just committed.
1173
+ */
1174
+ ASSERT(ret == -EINPROGRESS);
1175
+ ret = 0;
10381176 }
10391177
10401178 out_free_path:
10411179 btrfs_free_path(path);
10421180 out_free_root:
1043
- if (ret) {
1044
- free_extent_buffer(quota_root->node);
1045
- free_extent_buffer(quota_root->commit_root);
1046
- kfree(quota_root);
1047
- }
1181
+ if (ret)
1182
+ btrfs_put_root(quota_root);
10481183 out:
10491184 if (ret) {
10501185 ulist_free(fs_info->qgroup_ulist);
10511186 fs_info->qgroup_ulist = NULL;
1052
- if (trans)
1053
- btrfs_end_transaction(trans);
1187
+ btrfs_sysfs_del_qgroups(fs_info);
10541188 }
10551189 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1190
+ if (ret && trans)
1191
+ btrfs_end_transaction(trans);
1192
+ else if (trans)
1193
+ ret = btrfs_end_transaction(trans);
1194
+ ulist_free(ulist);
10561195 return ret;
10571196 }
10581197
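The new tail of btrfs_quota_enable() ends the transaction outside qgroup_ioctl_lock and is careful not to let a late btrfs_end_transaction() result clobber an earlier failure. A small runnable C sketch of that first-error-wins convention (stand-in functions, not the btrfs API):

#include <errno.h>
#include <stdio.h>

/* Stand-in for btrfs_end_transaction(); it can fail on its own. */
static int end_transaction(void) { return 0; }

static int finish(int ret, int have_trans)
{
	if (ret && have_trans)
		end_transaction();		/* keep the original error */
	else if (have_trans)
		ret = end_transaction();	/* otherwise propagate this one */
	return ret;
}

int main(void)
{
	printf("%d\n", finish(-EIO, 1));	/* stays -EIO */
	printf("%d\n", finish(0, 1));		/* reports end_transaction() */
	return 0;
}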
....@@ -1062,24 +1201,66 @@
10621201 struct btrfs_trans_handle *trans = NULL;
10631202 int ret = 0;
10641203
1204
+ /*
1205
+ * We need to have subvol_sem write locked to prevent races with
1206
+ * snapshot creation.
1207
+ */
1208
+ lockdep_assert_held_write(&fs_info->subvol_sem);
1209
+
1210
+ /*
1211
+ * Lock the cleaner mutex to prevent races with concurrent relocation,
1212
+ * because relocation may be building backrefs for blocks of the quota
1213
+ * root while we are deleting the root. This is like dropping fs roots
1214
+ * of deleted snapshots/subvolumes; we need the same protection.
1215
+ *
1216
+ * This also prevents races between concurrent tasks trying to disable
1217
+ * quotas, because we will unlock and relock qgroup_ioctl_lock across
1218
+ * BTRFS_FS_QUOTA_ENABLED changes.
1219
+ */
1220
+ mutex_lock(&fs_info->cleaner_mutex);
1221
+
10651222 mutex_lock(&fs_info->qgroup_ioctl_lock);
10661223 if (!fs_info->quota_root)
10671224 goto out;
1225
+
1226
+ /*
1227
+ * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
1228
+ * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
1229
+ * to lock that mutex while holding a transaction handle and the rescan
1230
+ * worker needs to commit a transaction.
1231
+ */
1232
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
1233
+
1234
+ /*
1235
+ * Request qgroup rescan worker to complete and wait for it. This wait
1236
+ * must be done before transaction start for quota disable since it may
1237
+ * deadlock with a transaction commit done by the qgroup rescan worker.
1238
+ */
1239
+ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1240
+ btrfs_qgroup_wait_for_completion(fs_info, false);
10681241
10691242 /*
10701243 * 1 For the root item
10711244 *
10721245 * We should also reserve enough items for the quota tree deletion in
10731246 * btrfs_clean_quota_tree but this is not done.
1247
+ *
1248
+ * Also, we must always start a transaction without holding the mutex
1249
+ * qgroup_ioctl_lock, see btrfs_quota_enable().
10741250 */
10751251 trans = btrfs_start_transaction(fs_info->tree_root, 1);
1252
+
1253
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
10761254 if (IS_ERR(trans)) {
10771255 ret = PTR_ERR(trans);
1256
+ trans = NULL;
1257
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
10781258 goto out;
10791259 }
10801260
1081
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1082
- btrfs_qgroup_wait_for_completion(fs_info, false);
1261
+ if (!fs_info->quota_root)
1262
+ goto out;
1263
+
10831264 spin_lock(&fs_info->qgroup_lock);
10841265 quota_root = fs_info->quota_root;
10851266 fs_info->quota_root = NULL;
....@@ -1091,30 +1272,34 @@
10911272 ret = btrfs_clean_quota_tree(trans, quota_root);
10921273 if (ret) {
10931274 btrfs_abort_transaction(trans, ret);
1094
- goto end_trans;
1275
+ goto out;
10951276 }
10961277
10971278 ret = btrfs_del_root(trans, &quota_root->root_key);
10981279 if (ret) {
10991280 btrfs_abort_transaction(trans, ret);
1100
- goto end_trans;
1281
+ goto out;
11011282 }
11021283
1284
+ spin_lock(&fs_info->trans_lock);
11031285 list_del(&quota_root->dirty_list);
1286
+ spin_unlock(&fs_info->trans_lock);
11041287
11051288 btrfs_tree_lock(quota_root->node);
1106
- clean_tree_block(fs_info, quota_root->node);
1289
+ btrfs_clean_tree_block(quota_root->node);
11071290 btrfs_tree_unlock(quota_root->node);
11081291 btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
11091292
1110
- free_extent_buffer(quota_root->node);
1111
- free_extent_buffer(quota_root->commit_root);
1112
- kfree(quota_root);
1293
+ btrfs_put_root(quota_root);
11131294
1114
-end_trans:
1115
- ret = btrfs_end_transaction(trans);
11161295 out:
11171296 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1297
+ if (ret && trans)
1298
+ btrfs_end_transaction(trans);
1299
+ else if (trans)
1300
+ ret = btrfs_end_transaction(trans);
1301
+ mutex_unlock(&fs_info->cleaner_mutex);
1302
+
11181303 return ret;
11191304 }
11201305
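btrfs_quota_disable() now clears BTRFS_FS_QUOTA_ENABLED and waits for the rescan worker before starting its transaction, so the worker cannot be mid-commit while the quota root is being deleted. A runnable pthreads sketch of that flag-then-wait shutdown ordering (names are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
static bool quota_enabled = true;
static bool rescan_running = true;

static void *rescan_worker(void *arg)
{
	(void)arg;
	/* A real worker also checks !quota_enabled and bails out early. */
	pthread_mutex_lock(&lock);
	rescan_running = false;
	pthread_cond_signal(&done);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, rescan_worker, NULL);

	/* 1) flip the flag, 2) wait for the worker, 3) only then tear down. */
	pthread_mutex_lock(&lock);
	quota_enabled = false;
	while (rescan_running)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	printf("safe to delete the quota root now\n");
	return 0;
}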
....@@ -1129,7 +1314,7 @@
11291314 * The easy accounting, we're updating qgroup relationship whose child qgroup
11301315 * only has exclusive extents.
11311316 *
1132
- * In this case, all exclsuive extents will also be exlusive for parent, so
1317
+ * In this case, all exclusive extents will also be exclusive for parent, so
11331318 * excl/rfer just get added/removed.
11341319 *
11351320 * So is qgroup reservation space, which should also be added/removed to
....@@ -1246,25 +1431,27 @@
12461431 u64 dst)
12471432 {
12481433 struct btrfs_fs_info *fs_info = trans->fs_info;
1249
- struct btrfs_root *quota_root;
12501434 struct btrfs_qgroup *parent;
12511435 struct btrfs_qgroup *member;
12521436 struct btrfs_qgroup_list *list;
12531437 struct ulist *tmp;
1438
+ unsigned int nofs_flag;
12541439 int ret = 0;
12551440
12561441 /* Check the level of src and dst first */
12571442 if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
12581443 return -EINVAL;
12591444
1445
+ /* We hold a transaction handle open, must do a NOFS allocation. */
1446
+ nofs_flag = memalloc_nofs_save();
12601447 tmp = ulist_alloc(GFP_KERNEL);
1448
+ memalloc_nofs_restore(nofs_flag);
12611449 if (!tmp)
12621450 return -ENOMEM;
12631451
12641452 mutex_lock(&fs_info->qgroup_ioctl_lock);
1265
- quota_root = fs_info->quota_root;
1266
- if (!quota_root) {
1267
- ret = -EINVAL;
1453
+ if (!fs_info->quota_root) {
1454
+ ret = -ENOTCONN;
12681455 goto out;
12691456 }
12701457 member = find_qgroup_rb(fs_info, src);
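The memalloc_nofs_save()/memalloc_nofs_restore() pair above scopes the ulist allocation so that GFP_KERNEL behaves like GFP_NOFS while a transaction handle is held, without pushing GFP_NOFS into every callee. A kernel-context sketch of the idiom (it uses in-kernel APIs from <linux/sched/mm.h> and <linux/slab.h>, both included by this patch, so it is not a standalone userspace program):

static void *alloc_scoped_nofs(size_t size)
{
	unsigned int nofs_flag;
	void *mem;

	/*
	 * Inside the save/restore window, reclaim will not recurse into
	 * the filesystem, so a GFP_KERNEL allocation cannot deadlock
	 * against the transaction we already hold.
	 */
	nofs_flag = memalloc_nofs_save();
	mem = kmalloc(size, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	return mem;
}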
....@@ -1310,48 +1497,62 @@
13101497 u64 dst)
13111498 {
13121499 struct btrfs_fs_info *fs_info = trans->fs_info;
1313
- struct btrfs_root *quota_root;
13141500 struct btrfs_qgroup *parent;
13151501 struct btrfs_qgroup *member;
13161502 struct btrfs_qgroup_list *list;
13171503 struct ulist *tmp;
1504
+ bool found = false;
1505
+ unsigned int nofs_flag;
13181506 int ret = 0;
1319
- int err;
1507
+ int ret2;
13201508
1509
+ /* We hold a transaction handle open, must do a NOFS allocation. */
1510
+ nofs_flag = memalloc_nofs_save();
13211511 tmp = ulist_alloc(GFP_KERNEL);
1512
+ memalloc_nofs_restore(nofs_flag);
13221513 if (!tmp)
13231514 return -ENOMEM;
13241515
1325
- quota_root = fs_info->quota_root;
1326
- if (!quota_root) {
1327
- ret = -EINVAL;
1516
+ if (!fs_info->quota_root) {
1517
+ ret = -ENOTCONN;
13281518 goto out;
13291519 }
13301520
13311521 member = find_qgroup_rb(fs_info, src);
13321522 parent = find_qgroup_rb(fs_info, dst);
1333
- if (!member || !parent) {
1334
- ret = -EINVAL;
1335
- goto out;
1336
- }
1523
+ /*
1524
+ * The parent/member pair doesn't exist, then try to delete the dead
1525
+ * relation items only.
1526
+ */
1527
+ if (!member || !parent)
1528
+ goto delete_item;
13371529
13381530 /* check if such qgroup relation exist firstly */
13391531 list_for_each_entry(list, &member->groups, next_group) {
1340
- if (list->group == parent)
1341
- goto exist;
1532
+ if (list->group == parent) {
1533
+ found = true;
1534
+ break;
1535
+ }
13421536 }
1343
- ret = -ENOENT;
1344
- goto out;
1345
-exist:
1346
- ret = del_qgroup_relation_item(trans, src, dst);
1347
- err = del_qgroup_relation_item(trans, dst, src);
1348
- if (err && !ret)
1349
- ret = err;
13501537
1351
- spin_lock(&fs_info->qgroup_lock);
1352
- del_relation_rb(fs_info, src, dst);
1353
- ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
1354
- spin_unlock(&fs_info->qgroup_lock);
1538
+delete_item:
1539
+ ret = del_qgroup_relation_item(trans, src, dst);
1540
+ if (ret < 0 && ret != -ENOENT)
1541
+ goto out;
1542
+ ret2 = del_qgroup_relation_item(trans, dst, src);
1543
+ if (ret2 < 0 && ret2 != -ENOENT)
1544
+ goto out;
1545
+
1546
+ /* At least one deletion succeeded, return 0 */
1547
+ if (!ret || !ret2)
1548
+ ret = 0;
1549
+
1550
+ if (found) {
1551
+ spin_lock(&fs_info->qgroup_lock);
1552
+ del_relation_rb(fs_info, src, dst);
1553
+ ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
1554
+ spin_unlock(&fs_info->qgroup_lock);
1555
+ }
13551556 out:
13561557 ulist_free(tmp);
13571558 return ret;
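The rewritten relation removal above now tolerates half-present relations: each direction's item may already be gone (-ENOENT), and the call succeeds if at least one item was actually deleted. A runnable C sketch of that tolerant double-delete logic (stand-in helpers, not the real tree operations):

#include <errno.h>
#include <stdio.h>

/* Stand-in for del_qgroup_relation_item(); -ENOENT means already gone. */
static int del_item(int present) { return present ? 0 : -ENOENT; }

static int del_relation(int fwd_present, int rev_present)
{
	int ret, ret2;

	ret = del_item(fwd_present);
	if (ret < 0 && ret != -ENOENT)
		return ret;
	ret2 = del_item(rev_present);
	if (ret2 < 0 && ret2 != -ENOENT)
		return ret2;

	/* At least one direction existed and was deleted: success. */
	if (!ret || !ret2)
		return 0;
	return -ENOENT;
}

int main(void)
{
	printf("%d\n", del_relation(1, 0));	/* 0: stale reverse item */
	printf("%d\n", del_relation(0, 0));	/* -ENOENT: nothing existed */
	return 0;
}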
....@@ -1378,11 +1579,11 @@
13781579 int ret = 0;
13791580
13801581 mutex_lock(&fs_info->qgroup_ioctl_lock);
1381
- quota_root = fs_info->quota_root;
1382
- if (!quota_root) {
1383
- ret = -EINVAL;
1582
+ if (!fs_info->quota_root) {
1583
+ ret = -ENOTCONN;
13841584 goto out;
13851585 }
1586
+ quota_root = fs_info->quota_root;
13861587 qgroup = find_qgroup_rb(fs_info, qgroupid);
13871588 if (qgroup) {
13881589 ret = -EEXIST;
....@@ -1397,8 +1598,11 @@
13971598 qgroup = add_qgroup_rb(fs_info, qgroupid);
13981599 spin_unlock(&fs_info->qgroup_lock);
13991600
1400
- if (IS_ERR(qgroup))
1601
+ if (IS_ERR(qgroup)) {
14011602 ret = PTR_ERR(qgroup);
1603
+ goto out;
1604
+ }
1605
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
14021606 out:
14031607 mutex_unlock(&fs_info->qgroup_ioctl_lock);
14041608 return ret;
....@@ -1407,15 +1611,13 @@
14071611 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
14081612 {
14091613 struct btrfs_fs_info *fs_info = trans->fs_info;
1410
- struct btrfs_root *quota_root;
14111614 struct btrfs_qgroup *qgroup;
14121615 struct btrfs_qgroup_list *list;
14131616 int ret = 0;
14141617
14151618 mutex_lock(&fs_info->qgroup_ioctl_lock);
1416
- quota_root = fs_info->quota_root;
1417
- if (!quota_root) {
1418
- ret = -EINVAL;
1619
+ if (!fs_info->quota_root) {
1620
+ ret = -ENOTCONN;
14191621 goto out;
14201622 }
14211623
....@@ -1423,13 +1625,14 @@
14231625 if (!qgroup) {
14241626 ret = -ENOENT;
14251627 goto out;
1426
- } else {
1427
- /* check if there are no children of this qgroup */
1428
- if (!list_empty(&qgroup->members)) {
1429
- ret = -EBUSY;
1430
- goto out;
1431
- }
14321628 }
1629
+
1630
+ /* Check if there are no children of this qgroup */
1631
+ if (!list_empty(&qgroup->members)) {
1632
+ ret = -EBUSY;
1633
+ goto out;
1634
+ }
1635
+
14331636 ret = del_qgroup_item(trans, qgroupid);
14341637 if (ret && ret != -ENOENT)
14351638 goto out;
....@@ -1446,6 +1649,14 @@
14461649 spin_lock(&fs_info->qgroup_lock);
14471650 del_qgroup_rb(fs_info, qgroupid);
14481651 spin_unlock(&fs_info->qgroup_lock);
1652
+
1653
+ /*
1654
+ * Remove the qgroup from sysfs now without holding the qgroup_lock
1655
+ * spinlock, since the sysfs_remove_group() function needs to take
1656
+ * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
1657
+ */
1658
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
1659
+ kfree(qgroup);
14491660 out:
14501661 mutex_unlock(&fs_info->qgroup_ioctl_lock);
14511662 return ret;
....@@ -1455,7 +1666,6 @@
14551666 struct btrfs_qgroup_limit *limit)
14561667 {
14571668 struct btrfs_fs_info *fs_info = trans->fs_info;
1458
- struct btrfs_root *quota_root;
14591669 struct btrfs_qgroup *qgroup;
14601670 int ret = 0;
14611671 /* Sometimes we would want to clear the limit on this qgroup.
....@@ -1465,9 +1675,8 @@
14651675 const u64 CLEAR_VALUE = -1;
14661676
14671677 mutex_lock(&fs_info->qgroup_ioctl_lock);
1468
- quota_root = fs_info->quota_root;
1469
- if (!quota_root) {
1470
- ret = -EINVAL;
1678
+ if (!fs_info->quota_root) {
1679
+ ret = -ENOTCONN;
14711680 goto out;
14721681 }
14731682
....@@ -1546,12 +1755,18 @@
15461755 parent_node = *p;
15471756 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
15481757 node);
1549
- if (bytenr < entry->bytenr)
1758
+ if (bytenr < entry->bytenr) {
15501759 p = &(*p)->rb_left;
1551
- else if (bytenr > entry->bytenr)
1760
+ } else if (bytenr > entry->bytenr) {
15521761 p = &(*p)->rb_right;
1553
- else
1762
+ } else {
1763
+ if (record->data_rsv && !entry->data_rsv) {
1764
+ entry->data_rsv = record->data_rsv;
1765
+ entry->data_rsv_refroot =
1766
+ record->data_rsv_refroot;
1767
+ }
15541768 return 1;
1769
+ }
15551770 }
15561771
15571772 rb_link_node(&record->node, parent_node, p);
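The hunk above turns a plain duplicate check into an insert-or-merge: when an extent record already exists, a data reservation carried by the new record is folded into the existing node instead of being lost. A runnable C sketch using a simple binary search tree in place of the kernel rb-tree (names invented):

#include <stdio.h>

struct record {
	unsigned long long bytenr;
	unsigned long long data_rsv;	/* merged on duplicate insert */
	struct record *left, *right;
};

/* Returns 1 if bytenr already existed (merging data_rsv), 0 if inserted. */
static int insert_or_merge(struct record **node, struct record *rec)
{
	while (*node) {
		if (rec->bytenr < (*node)->bytenr) {
			node = &(*node)->left;
		} else if (rec->bytenr > (*node)->bytenr) {
			node = &(*node)->right;
		} else {
			if (rec->data_rsv && !(*node)->data_rsv)
				(*node)->data_rsv = rec->data_rsv;
			return 1;
		}
	}
	*node = rec;
	return 0;
}

int main(void)
{
	struct record *root = NULL;
	struct record a = { 4096, 0 };
	struct record b = { 4096, 512 };

	printf("%d\n", insert_or_merge(&root, &a));	/* 0: inserted */
	printf("%d\n", insert_or_merge(&root, &b));	/* 1: merged */
	printf("rsv now %llu\n", root->data_rsv);	/* 512 */
	return 0;
}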
....@@ -1597,7 +1812,7 @@
15971812 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
15981813 || bytenr == 0 || num_bytes == 0)
15991814 return 0;
1600
- record = kmalloc(sizeof(*record), gfp_flag);
1815
+ record = kzalloc(sizeof(*record), gfp_flag);
16011816 if (!record)
16021817 return -ENOMEM;
16031818
....@@ -1719,6 +1934,357 @@
17191934 return 0;
17201935 }
17211936
1937
+/*
1938
+ * Helper function to trace a subtree tree block swap.
1939
+ *
1940
+ * The swap will happen in the highest tree block, but there may be a lot of
1941
+ * tree blocks involved.
1942
+ *
1943
+ * For example:
1944
+ * OO = Old tree blocks
1945
+ * NN = New tree blocks allocated during balance
1946
+ *
1947
+ * File tree (257) Reloc tree for 257
1948
+ * L2 OO NN
1949
+ * / \ / \
1950
+ * L1 OO OO (a) OO NN (a)
1951
+ * / \ / \ / \ / \
1952
+ * L0 OO OO OO OO OO OO NN NN
1953
+ * (b) (c) (b) (c)
1954
+ *
1955
+ * When calling qgroup_trace_extent_swap(), we will pass:
1956
+ * @src_eb = OO(a)
1957
+ * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
1958
+ * @dst_level = 0
1959
+ * @root_level = 1
1960
+ *
1961
+ * In that case, qgroup_trace_extent_swap() will search from OO(a) to
1962
+ * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
1963
+ *
1964
+ * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
1965
+ *
1966
+ * 1) Tree search from @src_eb
1967
+ * It should act as a simplified btrfs_search_slot().
1968
+ * The key for search can be extracted from @dst_path->nodes[dst_level]
1969
+ * (first key).
1970
+ *
1971
+ * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
1972
+ * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
1973
+ * They should be marked during previous (@dst_level = 1) iteration.
1974
+ *
1975
+ * 3) Mark file extents in leaves dirty
1976
+ * We don't have a good way to pick out only the new file extents.
1977
+ * So we still follow the old method by scanning all file extents in
1978
+ * the leaf.
1979
+ *
1980
+ * This function can free us from keeping two paths, thus later we only need
1981
+ * to care about how to iterate all new tree blocks in the reloc tree.
1982
+ */
1983
+static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
1984
+ struct extent_buffer *src_eb,
1985
+ struct btrfs_path *dst_path,
1986
+ int dst_level, int root_level,
1987
+ bool trace_leaf)
1988
+{
1989
+ struct btrfs_key key;
1990
+ struct btrfs_path *src_path;
1991
+ struct btrfs_fs_info *fs_info = trans->fs_info;
1992
+ u32 nodesize = fs_info->nodesize;
1993
+ int cur_level = root_level;
1994
+ int ret;
1995
+
1996
+ BUG_ON(dst_level > root_level);
1997
+ /* Level mismatch */
1998
+ if (btrfs_header_level(src_eb) != root_level)
1999
+ return -EINVAL;
2000
+
2001
+ src_path = btrfs_alloc_path();
2002
+ if (!src_path) {
2003
+ ret = -ENOMEM;
2004
+ goto out;
2005
+ }
2006
+
2007
+ if (dst_level)
2008
+ btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
2009
+ else
2010
+ btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
2011
+
2012
+ /* For src_path */
2013
+ atomic_inc(&src_eb->refs);
2014
+ src_path->nodes[root_level] = src_eb;
2015
+ src_path->slots[root_level] = dst_path->slots[root_level];
2016
+ src_path->locks[root_level] = 0;
2017
+
2018
+ /* A simplified version of btrfs_search_slot() */
2019
+ while (cur_level >= dst_level) {
2020
+ struct btrfs_key src_key;
2021
+ struct btrfs_key dst_key;
2022
+
2023
+ if (src_path->nodes[cur_level] == NULL) {
2024
+ struct btrfs_key first_key;
2025
+ struct extent_buffer *eb;
2026
+ int parent_slot;
2027
+ u64 child_gen;
2028
+ u64 child_bytenr;
2029
+
2030
+ eb = src_path->nodes[cur_level + 1];
2031
+ parent_slot = src_path->slots[cur_level + 1];
2032
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2033
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2034
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
2035
+
2036
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
2037
+ cur_level, &first_key);
2038
+ if (IS_ERR(eb)) {
2039
+ ret = PTR_ERR(eb);
2040
+ goto out;
2041
+ } else if (!extent_buffer_uptodate(eb)) {
2042
+ free_extent_buffer(eb);
2043
+ ret = -EIO;
2044
+ goto out;
2045
+ }
2046
+
2047
+ src_path->nodes[cur_level] = eb;
2048
+
2049
+ btrfs_tree_read_lock(eb);
2050
+ btrfs_set_lock_blocking_read(eb);
2051
+ src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
2052
+ }
2053
+
2054
+ src_path->slots[cur_level] = dst_path->slots[cur_level];
2055
+ if (cur_level) {
2056
+ btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
2057
+ &dst_key, dst_path->slots[cur_level]);
2058
+ btrfs_node_key_to_cpu(src_path->nodes[cur_level],
2059
+ &src_key, src_path->slots[cur_level]);
2060
+ } else {
2061
+ btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
2062
+ &dst_key, dst_path->slots[cur_level]);
2063
+ btrfs_item_key_to_cpu(src_path->nodes[cur_level],
2064
+ &src_key, src_path->slots[cur_level]);
2065
+ }
2066
+ /* Content mismatch, something went wrong */
2067
+ if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
2068
+ ret = -ENOENT;
2069
+ goto out;
2070
+ }
2071
+ cur_level--;
2072
+ }
2073
+
2074
+ /*
2075
+ * Now both @dst_path and @src_path have been populated, record the tree
2076
+ * blocks for qgroup accounting.
2077
+ */
2078
+ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
2079
+ nodesize, GFP_NOFS);
2080
+ if (ret < 0)
2081
+ goto out;
2082
+ ret = btrfs_qgroup_trace_extent(trans,
2083
+ dst_path->nodes[dst_level]->start,
2084
+ nodesize, GFP_NOFS);
2085
+ if (ret < 0)
2086
+ goto out;
2087
+
2088
+ /* Record leaf file extents */
2089
+ if (dst_level == 0 && trace_leaf) {
2090
+ ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
2091
+ if (ret < 0)
2092
+ goto out;
2093
+ ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
2094
+ }
2095
+out:
2096
+ btrfs_free_path(src_path);
2097
+ return ret;
2098
+}
2099
+
2100
+/*
2101
+ * Helper function to do recursive generation-aware depth-first search, to
2102
+ * locate all new tree blocks in a subtree of reloc tree.
2103
+ *
2104
+ * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
2105
+ * reloc tree
2106
+ * L2 NN (a)
2107
+ * / \
2108
+ * L1 OO NN (b)
2109
+ * / \ / \
2110
+ * L0 OO OO OO NN
2111
+ * (c) (d)
2112
+ * If we pass:
2113
+ * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
2114
+ * @cur_level = 1
2115
+ * @root_level = 1
2116
+ *
2117
+ * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
2118
+ * the above tree blocks along with their counterparts in the file tree.
2119
+ * During the search, old tree blocks like OO(c) will be skipped, as the tree
2120
+ * block swap won't affect OO(c).
2121
+ */
2122
+static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
2123
+ struct extent_buffer *src_eb,
2124
+ struct btrfs_path *dst_path,
2125
+ int cur_level, int root_level,
2126
+ u64 last_snapshot, bool trace_leaf)
2127
+{
2128
+ struct btrfs_fs_info *fs_info = trans->fs_info;
2129
+ struct extent_buffer *eb;
2130
+ bool need_cleanup = false;
2131
+ int ret = 0;
2132
+ int i;
2133
+
2134
+ /* Level sanity check */
2135
+ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
2136
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
2137
+ root_level < cur_level) {
2138
+ btrfs_err_rl(fs_info,
2139
+ "%s: bad levels, cur_level=%d root_level=%d",
2140
+ __func__, cur_level, root_level);
2141
+ return -EUCLEAN;
2142
+ }
2143
+
2144
+ /* Read the tree block if needed */
2145
+ if (dst_path->nodes[cur_level] == NULL) {
2146
+ struct btrfs_key first_key;
2147
+ int parent_slot;
2148
+ u64 child_gen;
2149
+ u64 child_bytenr;
2150
+
2151
+ /*
2152
+ * dst_path->nodes[root_level] must be initialized before
2153
+ * calling this function.
2154
+ */
2155
+ if (cur_level == root_level) {
2156
+ btrfs_err_rl(fs_info,
2157
+ "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
2158
+ __func__, root_level, root_level, cur_level);
2159
+ return -EUCLEAN;
2160
+ }
2161
+
2162
+ /*
2163
+ * We need to get child blockptr/gen from parent before we can
2164
+ * read it.
2165
+ */
2166
+ eb = dst_path->nodes[cur_level + 1];
2167
+ parent_slot = dst_path->slots[cur_level + 1];
2168
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2169
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2170
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
2171
+
2172
+ /* This node is old, no need to trace */
2173
+ if (child_gen < last_snapshot)
2174
+ goto out;
2175
+
2176
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
2177
+ cur_level, &first_key);
2178
+ if (IS_ERR(eb)) {
2179
+ ret = PTR_ERR(eb);
2180
+ goto out;
2181
+ } else if (!extent_buffer_uptodate(eb)) {
2182
+ free_extent_buffer(eb);
2183
+ ret = -EIO;
2184
+ goto out;
2185
+ }
2186
+
2187
+ dst_path->nodes[cur_level] = eb;
2188
+ dst_path->slots[cur_level] = 0;
2189
+
2190
+ btrfs_tree_read_lock(eb);
2191
+ btrfs_set_lock_blocking_read(eb);
2192
+ dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
2193
+ need_cleanup = true;
2194
+ }
2195
+
2196
+ /* Now record this tree block and its counter part for qgroups */
2197
+ ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
2198
+ root_level, trace_leaf);
2199
+ if (ret < 0)
2200
+ goto cleanup;
2201
+
2202
+ eb = dst_path->nodes[cur_level];
2203
+
2204
+ if (cur_level > 0) {
2205
+ /* Iterate all child tree blocks */
2206
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
2207
+ /* Skip old tree blocks as they won't be swapped */
2208
+ if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
2209
+ continue;
2210
+ dst_path->slots[cur_level] = i;
2211
+
2212
+ /* Recursive call (at most 7 times) */
2213
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
2214
+ dst_path, cur_level - 1, root_level,
2215
+ last_snapshot, trace_leaf);
2216
+ if (ret < 0)
2217
+ goto cleanup;
2218
+ }
2219
+ }
2220
+
2221
+cleanup:
2222
+ if (need_cleanup) {
2223
+ /* Clean up */
2224
+ btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
2225
+ dst_path->locks[cur_level]);
2226
+ free_extent_buffer(dst_path->nodes[cur_level]);
2227
+ dst_path->nodes[cur_level] = NULL;
2228
+ dst_path->slots[cur_level] = 0;
2229
+ dst_path->locks[cur_level] = 0;
2230
+ }
2231
+out:
2232
+ return ret;
2233
+}
2234
+
2235
+static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2236
+ struct extent_buffer *src_eb,
2237
+ struct extent_buffer *dst_eb,
2238
+ u64 last_snapshot, bool trace_leaf)
2239
+{
2240
+ struct btrfs_fs_info *fs_info = trans->fs_info;
2241
+ struct btrfs_path *dst_path = NULL;
2242
+ int level;
2243
+ int ret;
2244
+
2245
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2246
+ return 0;
2247
+
2248
+ /* Wrong parameter order */
2249
+ if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
2250
+ btrfs_err_rl(fs_info,
2251
+ "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2252
+ btrfs_header_generation(src_eb),
2253
+ btrfs_header_generation(dst_eb));
2254
+ return -EUCLEAN;
2255
+ }
2256
+
2257
+ if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
2258
+ ret = -EIO;
2259
+ goto out;
2260
+ }
2261
+
2262
+ level = btrfs_header_level(dst_eb);
2263
+ dst_path = btrfs_alloc_path();
2264
+ if (!dst_path) {
2265
+ ret = -ENOMEM;
2266
+ goto out;
2267
+ }
2268
+ /* For dst_path */
2269
+ atomic_inc(&dst_eb->refs);
2270
+ dst_path->nodes[level] = dst_eb;
2271
+ dst_path->slots[level] = 0;
2272
+ dst_path->locks[level] = 0;
2273
+
2274
+ /* Do the generation-aware depth-first search */
2275
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
2276
+ level, last_snapshot, trace_leaf);
2277
+ if (ret < 0)
2278
+ goto out;
2279
+ ret = 0;
2280
+
2281
+out:
2282
+ btrfs_free_path(dst_path);
2283
+ if (ret < 0)
2284
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2285
+ return ret;
2286
+}
2287
+
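The core idea of qgroup_trace_new_subtree_blocks() above is a generation-gated depth-first search: any child whose generation predates last_snapshot cannot have been touched by the swap, so its whole subtree is pruned. A runnable C sketch of just that pruning rule (plain pointers instead of extent buffers; names invented):

#include <stdio.h>

struct node {
	unsigned long long gen;
	struct node *child[2];
};

/* Visit only subtrees that may contain blocks newer than last_snapshot. */
static void trace_new_blocks(struct node *n, unsigned long long last_snapshot)
{
	if (!n || n->gen < last_snapshot)
		return;		/* old subtree: untouched by the swap */

	printf("trace block gen=%llu\n", n->gen);
	for (int i = 0; i < 2; i++)
		trace_new_blocks(n->child[i], last_snapshot);
}

int main(void)
{
	struct node c = { 9 };			/* OO(c): old, skipped */
	struct node d = { 11 };			/* NN(d): new, traced */
	struct node b = { 12, { &c, &d } };	/* NN(b): new, traced */

	trace_new_blocks(&b, 10);
	return 0;
}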
17222288 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
17232289 struct extent_buffer *root_eb,
17242290 u64 root_gen, int root_level)
....@@ -1759,7 +2325,7 @@
17592325 * walk back up the tree (adjusting slot pointers as we go)
17602326 * and restart the search process.
17612327 */
1762
- extent_buffer_get(root_eb); /* For path */
2328
+ atomic_inc(&root_eb->refs); /* For path */
17632329 path->nodes[root_level] = root_eb;
17642330 path->slots[root_level] = 0;
17652331 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
....@@ -1797,7 +2363,7 @@
17972363 path->slots[level] = 0;
17982364
17992365 btrfs_tree_read_lock(eb);
1800
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
2366
+ btrfs_set_lock_blocking_read(eb);
18012367 path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
18022368
18032369 ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
....@@ -1894,7 +2460,7 @@
18942460 * Update qgroup rfer/excl counters.
18952461 * Rfer update is easy, codes can explain themselves.
18962462 *
1897
- * Excl update is tricky, the update is split into 2 part.
2463
+ * Excl update is tricky, the update is split into 2 parts.
18982464 * Part 1: Possible exclusive <-> sharing detect:
18992465 * | A | !A |
19002466 * -------------------------------------
....@@ -2143,6 +2709,7 @@
21432709 struct btrfs_delayed_ref_root *delayed_refs;
21442710 struct ulist *new_roots = NULL;
21452711 struct rb_node *node;
2712
+ u64 num_dirty_extents = 0;
21462713 u64 qgroup_to_skip;
21472714 int ret = 0;
21482715
....@@ -2152,6 +2719,7 @@
21522719 record = rb_entry(node, struct btrfs_qgroup_extent_record,
21532720 node);
21542721
2722
+ num_dirty_extents++;
21552723 trace_btrfs_qgroup_account_extents(fs_info, record);
21562724
21572725 if (!ret) {
....@@ -2168,6 +2736,11 @@
21682736 goto cleanup;
21692737 }
21702738
2739
+ /* Free the reserved data space */
2740
+ btrfs_qgroup_free_refroot(fs_info,
2741
+ record->data_rsv_refroot,
2742
+ record->data_rsv,
2743
+ BTRFS_QGROUP_RSV_DATA);
21712744 /*
21722745 * Use SEQ_LAST as time_seq to do special search, which
21732746 * doesn't lock tree or delayed_refs and search current
....@@ -2197,19 +2770,29 @@
21972770 kfree(record);
21982771
21992772 }
2773
+ trace_qgroup_num_dirty_extents(fs_info, trans->transid,
2774
+ num_dirty_extents);
22002775 return ret;
22012776 }
22022777
22032778 /*
2204
- * called from commit_transaction. Writes all changed qgroups to disk.
2779
+ * Writes all changed qgroups to disk.
2780
+ * Called by the transaction commit path and the qgroup assign ioctl.
22052781 */
22062782 int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
22072783 {
22082784 struct btrfs_fs_info *fs_info = trans->fs_info;
2209
- struct btrfs_root *quota_root = fs_info->quota_root;
22102785 int ret = 0;
22112786
2212
- if (!quota_root)
2787
+ /*
2788
+ * In case we are called from the qgroup assign ioctl, assert that we
2789
+ * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
2790
+ * disable operation (ioctl) and access a freed quota root.
2791
+ */
2792
+ if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
2793
+ lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
2794
+
2795
+ if (!fs_info->quota_root)
22132796 return ret;
22142797
22152798 spin_lock(&fs_info->qgroup_lock);
....@@ -2353,14 +2936,7 @@
23532936 dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
23542937 dstgroup->rsv_excl = inherit->lim.rsv_excl;
23552938
2356
- ret = update_qgroup_limit_item(trans, dstgroup);
2357
- if (ret) {
2358
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2359
- btrfs_info(fs_info,
2360
- "unable to update quota limit for %llu",
2361
- dstgroup->qgroupid);
2362
- goto unlock;
2363
- }
2939
+ qgroup_dirty(fs_info, dstgroup);
23642940 }
23652941
23662942 if (srcid) {
....@@ -2455,6 +3031,8 @@
24553031
24563032 unlock:
24573033 spin_unlock(&fs_info->qgroup_lock);
3034
+ if (!ret)
3035
+ ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
24583036 out:
24593037 if (!committing)
24603038 mutex_unlock(&fs_info->qgroup_ioctl_lock);
....@@ -2463,20 +3041,8 @@
24633041 return ret;
24643042 }
24653043
2466
-/*
2467
- * Two limits to commit transaction in advance.
2468
- *
2469
- * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
2470
- * For SIZE, it will be in byte unit as threshold.
2471
- */
2472
-#define QGROUP_FREE_RATIO 32
2473
-#define QGROUP_FREE_SIZE SZ_32M
2474
-static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
2475
- const struct btrfs_qgroup *qg, u64 num_bytes)
3044
+static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
24763045 {
2477
- u64 free;
2478
- u64 threshold;
2479
-
24803046 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
24813047 qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
24823048 return false;
....@@ -2485,39 +3051,12 @@
24853051 qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
24863052 return false;
24873053
2488
- /*
2489
- * Even if we passed the check, it's better to check if reservation
2490
- * for meta_pertrans is pushing us near limit.
2491
- * If there is too much pertrans reservation or it's near the limit,
2492
- * let's try commit transaction to free some, using transaction_kthread
2493
- */
2494
- if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
2495
- BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
2496
- if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
2497
- free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
2498
- threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
2499
- QGROUP_FREE_SIZE);
2500
- } else {
2501
- free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
2502
- threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
2503
- QGROUP_FREE_SIZE);
2504
- }
2505
-
2506
- /*
2507
- * Use transaction_kthread to commit transaction, so we no
2508
- * longer need to bother nested transaction nor lock context.
2509
- */
2510
- if (free < threshold)
2511
- btrfs_commit_transaction_locksafe(fs_info);
2512
- }
2513
-
25143054 return true;
25153055 }
25163056
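With the early-commit heuristic gone, qgroup_check_limits() reduces to a pure predicate: reject the reservation if it would push reserved plus used bytes past a configured limit. A runnable C sketch of that check (flag values and field names are stand-ins):

#include <stdbool.h>
#include <stdio.h>

#define LIMIT_MAX_RFER 0x1
#define LIMIT_MAX_EXCL 0x2

struct qlimits {
	unsigned flags;
	long long rfer, excl, reserved;
	long long max_rfer, max_excl;
};

static bool check_limits(const struct qlimits *qg, long long num_bytes)
{
	if ((qg->flags & LIMIT_MAX_RFER) &&
	    qg->reserved + qg->rfer + num_bytes > qg->max_rfer)
		return false;
	if ((qg->flags & LIMIT_MAX_EXCL) &&
	    qg->reserved + qg->excl + num_bytes > qg->max_excl)
		return false;
	return true;
}

int main(void)
{
	struct qlimits qg = { LIMIT_MAX_RFER, 900, 900, 50, 1000, 0 };

	printf("%d\n", check_limits(&qg, 40));	/* 1: 900+50+40 <= 1000 */
	printf("%d\n", check_limits(&qg, 60));	/* 0: would exceed max_rfer */
	return 0;
}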
25173057 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
25183058 enum btrfs_qgroup_rsv_type type)
25193059 {
2520
- struct btrfs_root *quota_root;
25213060 struct btrfs_qgroup *qgroup;
25223061 struct btrfs_fs_info *fs_info = root->fs_info;
25233062 u64 ref_root = root->root_key.objectid;
....@@ -2536,8 +3075,7 @@
25363075 enforce = false;
25373076
25383077 spin_lock(&fs_info->qgroup_lock);
2539
- quota_root = fs_info->quota_root;
2540
- if (!quota_root)
3078
+ if (!fs_info->quota_root)
25413079 goto out;
25423080
25433081 qgroup = find_qgroup_rb(fs_info, ref_root);
....@@ -2560,7 +3098,7 @@
25603098
25613099 qg = unode_aux_to_qgroup(unode);
25623100
2563
- if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
3101
+ if (enforce && !qgroup_check_limits(qg, num_bytes)) {
25643102 ret = -EDQUOT;
25653103 goto out;
25663104 }
....@@ -2583,7 +3121,6 @@
25833121
25843122 qg = unode_aux_to_qgroup(unode);
25853123
2586
- trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
25873124 qgroup_rsv_add(fs_info, qg, num_bytes, type);
25883125 }
25893126
....@@ -2605,7 +3142,6 @@
26053142 u64 ref_root, u64 num_bytes,
26063143 enum btrfs_qgroup_rsv_type type)
26073144 {
2608
- struct btrfs_root *quota_root;
26093145 struct btrfs_qgroup *qgroup;
26103146 struct ulist_node *unode;
26113147 struct ulist_iterator uiter;
....@@ -2623,8 +3159,7 @@
26233159 }
26243160 spin_lock(&fs_info->qgroup_lock);
26253161
2626
- quota_root = fs_info->quota_root;
2627
- if (!quota_root)
3162
+ if (!fs_info->quota_root)
26283163 goto out;
26293164
26303165 qgroup = find_qgroup_rb(fs_info, ref_root);
....@@ -2650,7 +3185,6 @@
26503185
26513186 qg = unode_aux_to_qgroup(unode);
26523187
2653
- trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
26543188 qgroup_rsv_release(fs_info, qg, num_bytes, type);
26553189
26563190 list_for_each_entry(glist, &qg->groups, next_group) {
....@@ -2734,9 +3268,6 @@
27343268 mutex_unlock(&fs_info->qgroup_rescan_lock);
27353269 goto out;
27363270 }
2737
- extent_buffer_get(scratch_leaf);
2738
- btrfs_tree_read_lock(scratch_leaf);
2739
- btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
27403271 slot = path->slots[0];
27413272 btrfs_release_path(path);
27423273 mutex_unlock(&fs_info->qgroup_rescan_lock);
....@@ -2762,10 +3293,8 @@
27623293 goto out;
27633294 }
27643295 out:
2765
- if (scratch_leaf) {
2766
- btrfs_tree_read_unlock_blocking(scratch_leaf);
3296
+ if (scratch_leaf)
27673297 free_extent_buffer(scratch_leaf);
2768
- }
27693298
27703299 if (done && !ret) {
27713300 ret = 1;
....@@ -2777,7 +3306,8 @@
27773306 static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
27783307 {
27793308 return btrfs_fs_closing(fs_info) ||
2780
- test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
3309
+ test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
3310
+ !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
27813311 }
27823312
27833313 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
....@@ -2789,6 +3319,7 @@
27893319 int err = -ENOMEM;
27903320 int ret = 0;
27913321 bool stopped = false;
3322
+ bool did_leaf_rescans = false;
27923323
27933324 path = btrfs_alloc_path();
27943325 if (!path)
....@@ -2807,11 +3338,10 @@
28073338 err = PTR_ERR(trans);
28083339 break;
28093340 }
2810
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
2811
- err = -EINTR;
2812
- } else {
2813
- err = qgroup_rescan_leaf(trans, path);
2814
- }
3341
+
3342
+ err = qgroup_rescan_leaf(trans, path);
3343
+ did_leaf_rescans = true;
3344
+
28153345 if (err > 0)
28163346 btrfs_commit_transaction(trans);
28173347 else
....@@ -2825,22 +3355,29 @@
28253355 if (err > 0 &&
28263356 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
28273357 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2828
- } else if (err < 0) {
3358
+ } else if (err < 0 || stopped) {
28293359 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
28303360 }
28313361 mutex_unlock(&fs_info->qgroup_rescan_lock);
28323362
28333363 /*
2834
- * only update status, since the previous part has already updated the
2835
- * qgroup info.
3364
+ * Only update status, since the previous part has already updated the
3365
+ * qgroup info, and only if we did any actual work. This also prevents
3366
+ * a race with a concurrent quota disable, which has already set
3367
+ * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
3368
+ * btrfs_quota_disable().
28363369 */
2837
- trans = btrfs_start_transaction(fs_info->quota_root, 1);
2838
- if (IS_ERR(trans)) {
2839
- err = PTR_ERR(trans);
3370
+ if (did_leaf_rescans) {
3371
+ trans = btrfs_start_transaction(fs_info->quota_root, 1);
3372
+ if (IS_ERR(trans)) {
3373
+ err = PTR_ERR(trans);
3374
+ trans = NULL;
3375
+ btrfs_err(fs_info,
3376
+ "fail to start transaction for status update: %d",
3377
+ err);
3378
+ }
3379
+ } else {
28403380 trans = NULL;
2841
- btrfs_err(fs_info,
2842
- "fail to start transaction for status update: %d",
2843
- err);
28443381 }
28453382
28463383 mutex_lock(&fs_info->qgroup_rescan_lock);
....@@ -2902,7 +3439,6 @@
29023439 }
29033440
29043441 mutex_lock(&fs_info->qgroup_rescan_lock);
2905
- spin_lock(&fs_info->qgroup_lock);
29063442
29073443 if (init_flags) {
29083444 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
....@@ -2914,10 +3450,12 @@
29143450 btrfs_warn(fs_info,
29153451 "qgroup rescan init failed, qgroup is not enabled");
29163452 ret = -EINVAL;
3453
+ } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
3454
+ /* Quota disable is in progress */
3455
+ ret = -EBUSY;
29173456 }
29183457
29193458 if (ret) {
2920
- spin_unlock(&fs_info->qgroup_lock);
29213459 mutex_unlock(&fs_info->qgroup_rescan_lock);
29223460 return ret;
29233461 }
....@@ -2928,14 +3466,9 @@
29283466 sizeof(fs_info->qgroup_rescan_progress));
29293467 fs_info->qgroup_rescan_progress.objectid = progress_objectid;
29303468 init_completion(&fs_info->qgroup_rescan_completion);
2931
-
2932
- spin_unlock(&fs_info->qgroup_lock);
29333469 mutex_unlock(&fs_info->qgroup_rescan_lock);
29343470
2935
- memset(&fs_info->qgroup_rescan_work, 0,
2936
- sizeof(fs_info->qgroup_rescan_work));
29373471 btrfs_init_work(&fs_info->qgroup_rescan_work,
2938
- btrfs_qgroup_rescan_helper,
29393472 btrfs_qgroup_rescan_worker, NULL, NULL);
29403473 return 0;
29413474 }
....@@ -3009,9 +3542,7 @@
30093542 int ret = 0;
30103543
30113544 mutex_lock(&fs_info->qgroup_rescan_lock);
3012
- spin_lock(&fs_info->qgroup_lock);
30133545 running = fs_info->qgroup_rescan_running;
3014
- spin_unlock(&fs_info->qgroup_lock);
30153546 mutex_unlock(&fs_info->qgroup_rescan_lock);
30163547
30173548 if (!running)
....@@ -3042,40 +3573,169 @@
30423573 }
30433574 }
30443575
3576
+#define rbtree_iterate_from_safe(node, next, start) \
3577
+ for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
3578
+
3579
+static int qgroup_unreserve_range(struct btrfs_inode *inode,
3580
+ struct extent_changeset *reserved, u64 start,
3581
+ u64 len)
3582
+{
3583
+ struct rb_node *node;
3584
+ struct rb_node *next;
3585
+ struct ulist_node *entry;
3586
+ int ret = 0;
3587
+
3588
+ node = reserved->range_changed.root.rb_node;
3589
+ if (!node)
3590
+ return 0;
3591
+ while (node) {
3592
+ entry = rb_entry(node, struct ulist_node, rb_node);
3593
+ if (entry->val < start)
3594
+ node = node->rb_right;
3595
+ else
3596
+ node = node->rb_left;
3597
+ }
3598
+
3599
+ if (entry->val > start && rb_prev(&entry->rb_node))
3600
+ entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
3601
+ rb_node);
3602
+
3603
+ rbtree_iterate_from_safe(node, next, &entry->rb_node) {
3604
+ u64 entry_start;
3605
+ u64 entry_end;
3606
+ u64 entry_len;
3607
+ int clear_ret;
3608
+
3609
+ entry = rb_entry(node, struct ulist_node, rb_node);
3610
+ entry_start = entry->val;
3611
+ entry_end = entry->aux;
3612
+ entry_len = entry_end - entry_start + 1;
3613
+
3614
+ if (entry_start >= start + len)
3615
+ break;
3616
+ if (entry_start + entry_len <= start)
3617
+ continue;
3618
+ /*
3619
+ * Now the entry is in [start, start + len), revert the
3620
+ * EXTENT_QGROUP_RESERVED bit.
3621
+ */
3622
+ clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
3623
+ entry_end, EXTENT_QGROUP_RESERVED);
3624
+ if (!ret && clear_ret < 0)
3625
+ ret = clear_ret;
3626
+
3627
+ ulist_del(&reserved->range_changed, entry->val, entry->aux);
3628
+ if (likely(reserved->bytes_changed >= entry_len)) {
3629
+ reserved->bytes_changed -= entry_len;
3630
+ } else {
3631
+ WARN_ON(1);
3632
+ reserved->bytes_changed = 0;
3633
+ }
3634
+ }
3635
+
3636
+ return ret;
3637
+}
3638
+
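qgroup_unreserve_range() above positions itself at the last range starting at or before @start and then walks forward, clearing every reserved range that overlaps [start, start + len). A runnable C sketch of the overlap walk over a sorted array (inclusive range ends, matching the entry_start/entry_end arithmetic above):

#include <stdio.h>

struct range { unsigned long long start, end; };	/* inclusive ends */

static void unreserve(const struct range *r, int n,
		      unsigned long long start, unsigned long long len)
{
	for (int i = 0; i < n; i++) {
		if (r[i].start >= start + len)
			break;		/* sorted: nothing later can overlap */
		if (r[i].end < start)
			continue;	/* entirely before the target range */
		printf("clear [%llu, %llu]\n", r[i].start, r[i].end);
	}
}

int main(void)
{
	struct range r[] = { { 0, 4095 }, { 8192, 12287 }, { 65536, 69631 } };

	unreserve(r, 3, 4096, 16384);	/* only the middle range overlaps */
	return 0;
}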
30453639 /*
3046
- * Reserve qgroup space for range [start, start + len).
3640
+ * Try to free some space for qgroup.
30473641 *
3048
- * This function will either reserve space from related qgroups or doing
3049
- * nothing if the range is already reserved.
3642
+ * For qgroup, there are only 3 ways to free qgroup space:
3643
+ * - Flush nodatacow write
3644
+ * Any nodatacow write will free its reserved data space at run_delalloc_range().
3645
+ * In theory, we should only flush nodatacow inodes, but it's not yet
3646
+ * possible, so we need to flush the whole root.
30503647 *
3051
- * Return 0 for successful reserve
3052
- * Return <0 for error (including -EQUOT)
3648
+ * - Wait for ordered extents
3649
+ * When ordered extents are finished, their reserved metadata is finally
3650
+ * converted to per_trans status, which can be freed by later commit
3651
+ * transaction.
30533652 *
3054
- * NOTE: this function may sleep for memory allocation.
3055
- * if btrfs_qgroup_reserve_data() is called multiple times with
3056
- * same @reserved, caller must ensure when error happens it's OK
3057
- * to free *ALL* reserved space.
3653
+ * - Commit transaction
3654
+ * This would free the meta_per_trans space.
3655
+ * In theory this shouldn't provide much space, but every bit of extra
3656
+ * qgroup space helps.
30583657 */
3059
-int btrfs_qgroup_reserve_data(struct inode *inode,
3658
+static int try_flush_qgroup(struct btrfs_root *root)
3659
+{
3660
+ struct btrfs_trans_handle *trans;
3661
+ int ret;
3662
+ bool can_commit = true;
3663
+
3664
+ /*
3665
+ * If current process holds a transaction, we shouldn't flush, as we
3666
+ * assume all space reservation happens before a transaction handle is
3667
+ * held.
3668
+ *
3669
+ * But there are cases like btrfs_delayed_item_reserve_metadata() where
3670
+ * we try to reserve space with one transaction handle already held.
3671
+ * In that case we can't commit transaction, but at least try to end it
3672
+ * and hope the started data writes can free some space.
3673
+ */
3674
+ if (current->journal_info &&
3675
+ current->journal_info != BTRFS_SEND_TRANS_STUB)
3676
+ can_commit = false;
3677
+
3678
+ /*
3679
+ * We don't want to run flush again and again, so if there is a running
3680
+ * one, we won't try to start a new flush, but exit directly.
3681
+ */
3682
+ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
3683
+ /*
3684
+ * We are already holding a transaction, thus we can block other
3685
+ * threads from flushing. So exit right now. This increases
3686
+ * the chance of EDQUOT for heavy load and near limit cases.
3687
+ * But we can argue that if we're already near limit, EDQUOT is
3688
+ * unavoidable anyway.
3689
+ */
3690
+ if (!can_commit)
3691
+ return 0;
3692
+
3693
+ wait_event(root->qgroup_flush_wait,
3694
+ !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
3695
+ return 0;
3696
+ }
3697
+
3698
+ ret = btrfs_start_delalloc_snapshot(root);
3699
+ if (ret < 0)
3700
+ goto out;
3701
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
3702
+
3703
+ trans = btrfs_join_transaction(root);
3704
+ if (IS_ERR(trans)) {
3705
+ ret = PTR_ERR(trans);
3706
+ goto out;
3707
+ }
3708
+
3709
+ if (can_commit)
3710
+ ret = btrfs_commit_transaction(trans);
3711
+ else
3712
+ ret = btrfs_end_transaction(trans);
3713
+out:
3714
+ clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
3715
+ wake_up(&root->qgroup_flush_wait);
3716
+ return ret;
3717
+}
3718
+
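try_flush_qgroup() above uses test_and_set_bit() as a single-flusher gate: the first task does the expensive flush, later tasks either wait on qgroup_flush_wait or bail out if they cannot block. A runnable pthreads/C11 sketch of the gate (one-shot, with invented names; the kernel version clears the bit so flushing can repeat):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag flushing = ATOMIC_FLAG_INIT;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t flush_done = PTHREAD_COND_INITIALIZER;
static bool done;

static void flush(void)
{
	if (atomic_flag_test_and_set(&flushing)) {
		/* Someone else owns the flush; wait for their result. */
		pthread_mutex_lock(&lock);
		while (!done)
			pthread_cond_wait(&flush_done, &lock);
		pthread_mutex_unlock(&lock);
		return;
	}

	printf("flushing once...\n");	/* the expensive work, done once */

	pthread_mutex_lock(&lock);
	done = true;
	atomic_flag_clear(&flushing);
	pthread_cond_broadcast(&flush_done);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	flush();
	return 0;
}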
3719
+static int qgroup_reserve_data(struct btrfs_inode *inode,
30603720 struct extent_changeset **reserved_ret, u64 start,
30613721 u64 len)
30623722 {
3063
- struct btrfs_root *root = BTRFS_I(inode)->root;
3064
- struct ulist_node *unode;
3065
- struct ulist_iterator uiter;
3723
+ struct btrfs_root *root = inode->root;
30663724 struct extent_changeset *reserved;
3725
+ bool new_reserved = false;
30673726 u64 orig_reserved;
30683727 u64 to_reserve;
30693728 int ret;
30703729
30713730 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
3072
- !is_fstree(root->objectid) || len == 0)
3731
+ !is_fstree(root->root_key.objectid) || len == 0)
30733732 return 0;
30743733
30753734 /* @reserved parameter is mandatory for qgroup */
30763735 if (WARN_ON(!reserved_ret))
30773736 return -EINVAL;
30783737 if (!*reserved_ret) {
3738
+ new_reserved = true;
30793739 *reserved_ret = extent_changeset_alloc();
30803740 if (!*reserved_ret)
30813741 return -ENOMEM;
....@@ -3083,15 +3743,15 @@
30833743 reserved = *reserved_ret;
30843744 /* Record already reserved space */
30853745 orig_reserved = reserved->bytes_changed;
3086
- ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
3746
+ ret = set_record_extent_bits(&inode->io_tree, start,
30873747 start + len -1, EXTENT_QGROUP_RESERVED, reserved);
30883748
30893749 /* Newly reserved space */
30903750 to_reserve = reserved->bytes_changed - orig_reserved;
3091
- trace_btrfs_qgroup_reserve_data(inode, start, len,
3751
+ trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
30923752 to_reserve, QGROUP_RESERVE);
30933753 if (ret < 0)
3094
- goto cleanup;
3754
+ goto out;
30953755 ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
30963756 if (ret < 0)
30973757 goto cleanup;
....@@ -3099,23 +3759,49 @@
30993759 return ret;
31003760
31013761 cleanup:
3102
- /* cleanup *ALL* already reserved ranges */
3103
- ULIST_ITER_INIT(&uiter);
3104
- while ((unode = ulist_next(&reserved->range_changed, &uiter)))
3105
- clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
3106
- unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
3107
- /* Also free data bytes of already reserved one */
3108
- btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
3109
- orig_reserved, BTRFS_QGROUP_RSV_DATA);
3110
- extent_changeset_release(reserved);
3762
+ qgroup_unreserve_range(inode, reserved, start, len);
3763
+out:
3764
+ if (new_reserved) {
3765
+ extent_changeset_release(reserved);
3766
+ kfree(reserved);
3767
+ *reserved_ret = NULL;
3768
+ }
31113769 return ret;
31123770 }
31133771
3772
+/*
3773
+ * Reserve qgroup space for range [start, start + len).
3774
+ *
3775
+ * This function will either reserve space from related qgroups or do nothing
3776
+ * if the range is already reserved.
3777
+ *
3778
+ * Return 0 for successful reservation
3779
+ * Return <0 for error (including -EDQUOT)
3780
+ *
3781
+ * NOTE: This function may sleep for memory allocation, dirty page flushing and
3782
+ * transaction commit, so the caller should not hold any dirty page locked.
3783
+ */
3784
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
3785
+ struct extent_changeset **reserved_ret, u64 start,
3786
+ u64 len)
3787
+{
3788
+ int ret;
3789
+
3790
+ ret = qgroup_reserve_data(inode, reserved_ret, start, len);
3791
+ if (ret <= 0 && ret != -EDQUOT)
3792
+ return ret;
3793
+
3794
+ ret = try_flush_qgroup(inode->root);
3795
+ if (ret < 0)
3796
+ return ret;
3797
+ return qgroup_reserve_data(inode, reserved_ret, start, len);
3798
+}
3799
+
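The btrfs_qgroup_reserve_data() wrapper above encodes a try/flush/retry-once policy: only -EDQUOT triggers the flush, and the reservation is retried a single time afterwards. A runnable C sketch of that control flow (stand-in reserve/flush functions):

#include <errno.h>
#include <stdio.h>

static int attempts;

/* Stand-in reservation: fails with -EDQUOT until a flush frees space. */
static int reserve(void) { return attempts++ ? 0 : -EDQUOT; }
static int flush(void) { return 0; }

static int reserve_with_flush(void)
{
	int ret = reserve();

	if (ret != -EDQUOT)
		return ret;	/* success or a hard error: no retry */
	ret = flush();
	if (ret < 0)
		return ret;
	return reserve();	/* exactly one retry after flushing */
}

int main(void)
{
	printf("%d\n", reserve_with_flush());	/* 0 after flush + retry */
	return 0;
}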
31143800 /* Free ranges specified by @reserved, normally in error path */
3115
-static int qgroup_free_reserved_data(struct inode *inode,
3801
+static int qgroup_free_reserved_data(struct btrfs_inode *inode,
31163802 struct extent_changeset *reserved, u64 start, u64 len)
31173803 {
3118
- struct btrfs_root *root = BTRFS_I(inode)->root;
3804
+ struct btrfs_root *root = inode->root;
31193805 struct ulist_node *unode;
31203806 struct ulist_iterator uiter;
31213807 struct extent_changeset changeset;
....@@ -3151,14 +3837,14 @@
31513837 * EXTENT_QGROUP_RESERVED, we won't double free.
31523838 * So not need to rush.
31533839 */
3154
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
3155
- free_start, free_start + free_len - 1,
3840
+ ret = clear_record_extent_bits(&inode->io_tree, free_start,
3841
+ free_start + free_len - 1,
31563842 EXTENT_QGROUP_RESERVED, &changeset);
31573843 if (ret < 0)
31583844 goto out;
31593845 freed += changeset.bytes_changed;
31603846 }
3161
- btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed,
3847
+ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
31623848 BTRFS_QGROUP_RSV_DATA);
31633849 ret = freed;
31643850 out:
....@@ -3166,7 +3852,7 @@
31663852 return ret;
31673853 }
31683854
3169
-static int __btrfs_qgroup_release_data(struct inode *inode,
3855
+static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
31703856 struct extent_changeset *reserved, u64 start, u64 len,
31713857 int free)
31723858 {
....@@ -3174,8 +3860,7 @@
31743860 int trace_op = QGROUP_RELEASE;
31753861 int ret;
31763862
3177
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
3178
- &BTRFS_I(inode)->root->fs_info->flags))
3863
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
31793864 return 0;
31803865
31813866 /* In release case, we shouldn't have @reserved */
....@@ -3183,18 +3868,18 @@
31833868 if (free && reserved)
31843869 return qgroup_free_reserved_data(inode, reserved, start, len);
31853870 extent_changeset_init(&changeset);
3186
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
3187
- start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
3871
+ ret = clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
3872
+ EXTENT_QGROUP_RESERVED, &changeset);
31883873 if (ret < 0)
31893874 goto out;
31903875
31913876 if (free)
31923877 trace_op = QGROUP_FREE;
3193
- trace_btrfs_qgroup_release_data(inode, start, len,
3878
+ trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
31943879 changeset.bytes_changed, trace_op);
31953880 if (free)
3196
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
3197
- BTRFS_I(inode)->root->objectid,
3881
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
3882
+ inode->root->root_key.objectid,
31983883 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
31993884 ret = changeset.bytes_changed;
32003885 out:
....@@ -3214,7 +3899,7 @@
32143899 *
32153900 * NOTE: This function may sleep for memory allocation.
32163901 */
3217
-int btrfs_qgroup_free_data(struct inode *inode,
3902
+int btrfs_qgroup_free_data(struct btrfs_inode *inode,
32183903 struct extent_changeset *reserved, u64 start, u64 len)
32193904 {
32203905 return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
....@@ -3235,7 +3920,7 @@
32353920 *
32363921 * NOTE: This function may sleep for memory allocation.
32373922 */
3238
-int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
3923
+int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
32393924 {
32403925 return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
32413926 }
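The free/release pair above differs only in the last argument: free returns
the bytes to the qgroup counters immediately, while release only clears the
in-memory EXTENT_QGROUP_RESERVED bits and leaves the space to be accounted
against the extents that actually hit disk. A hedged sketch of a caller
pairing them; do_the_write() is hypothetical, the rest is the API from this
file:

	static int example_write(struct btrfs_inode *inode, u64 start, u64 len)
	{
		struct extent_changeset *reserved = NULL;
		int ret;

		ret = btrfs_qgroup_reserve_data(inode, &reserved, start, len);
		if (ret < 0)
			return ret;

		ret = do_the_write(inode, start, len);	/* hypothetical */
		if (ret < 0)
			/* Error path: hand the reserved bytes back */
			btrfs_qgroup_free_data(inode, reserved, start, len);
		else
			/* Data persisted: keep accounting, drop the flag */
			btrfs_qgroup_release_data(inode, start, len);

		extent_changeset_free(reserved);
		return ret;
	}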
....@@ -3280,14 +3965,14 @@
32803965 return num_bytes;
32813966 }
32823967
3283
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3284
- enum btrfs_qgroup_rsv_type type, bool enforce)
3968
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3969
+ enum btrfs_qgroup_rsv_type type, bool enforce)
32853970 {
32863971 struct btrfs_fs_info *fs_info = root->fs_info;
32873972 int ret;
32883973
32893974 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3290
- !is_fstree(root->objectid) || num_bytes == 0)
3975
+ !is_fstree(root->root_key.objectid) || num_bytes == 0)
32913976 return 0;
32923977
32933978 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
....@@ -3307,18 +3992,33 @@
33073992 return ret;
33083993 }
33093994
3995
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3996
+ enum btrfs_qgroup_rsv_type type, bool enforce)
3997
+{
3998
+ int ret;
3999
+
4000
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
4001
+ if (ret <= 0 && ret != -EDQUOT)
4002
+ return ret;
4003
+
4004
+ ret = try_flush_qgroup(root);
4005
+ if (ret < 0)
4006
+ return ret;
4007
+ return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
4008
+}
4009
+
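Together with the convert/free helpers below, the usual lifecycle of a
metadata reservation is: reserve as PREALLOC before the operation, then on
success convert it to PERTRANS (freed in bulk at transaction commit), or on
failure hand it straight back. A hedged sketch of a caller; do_tree_mod() is
hypothetical, the qgroup helpers are the ones declared in qgroup.h:

	static int example_op(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, int nbytes)
	{
		int ret;

		ret = btrfs_qgroup_reserve_meta_prealloc(root, nbytes, true);
		if (ret < 0)
			return ret;

		ret = do_tree_mod(trans, root);	/* hypothetical */
		if (ret < 0)
			/* Error: return the prealloc reservation directly */
			btrfs_qgroup_free_meta_prealloc(root, nbytes);
		else
			/* Success: count it as pertrans until commit */
			btrfs_qgroup_convert_reserved_meta(root, nbytes);
		return ret;
	}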
33104010 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
33114011 {
33124012 struct btrfs_fs_info *fs_info = root->fs_info;
33134013
33144014 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3315
- !is_fstree(root->objectid))
4015
+ !is_fstree(root->root_key.objectid))
33164016 return;
33174017
33184018 /* TODO: Update trace point to handle such free */
33194019 trace_qgroup_meta_free_all_pertrans(root);
33204020 /* Special value -1 means to free all reserved space */
3321
- btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
4021
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
33224022 BTRFS_QGROUP_RSV_META_PERTRANS);
33234023 }
33244024
....@@ -3328,7 +4028,7 @@
33284028 struct btrfs_fs_info *fs_info = root->fs_info;
33294029
33304030 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3331
- !is_fstree(root->objectid))
4031
+ !is_fstree(root->root_key.objectid))
33324032 return;
33334033
33344034 /*
....@@ -3339,13 +4039,13 @@
33394039 num_bytes = sub_root_meta_rsv(root, num_bytes, type);
33404040 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
33414041 trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
3342
- btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type);
4042
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
4043
+ num_bytes, type);
33434044 }
33444045
33454046 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
33464047 int num_bytes)
33474048 {
3348
- struct btrfs_root *quota_root = fs_info->quota_root;
33494049 struct btrfs_qgroup *qgroup;
33504050 struct ulist_node *unode;
33514051 struct ulist_iterator uiter;
....@@ -3353,7 +4053,7 @@
33534053
33544054 if (num_bytes == 0)
33554055 return;
3356
- if (!quota_root)
4056
+ if (!fs_info->quota_root)
33574057 return;
33584058
33594059 spin_lock(&fs_info->qgroup_lock);
....@@ -3393,20 +4093,20 @@
33934093 struct btrfs_fs_info *fs_info = root->fs_info;
33944094
33954095 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3396
- !is_fstree(root->objectid))
4096
+ !is_fstree(root->root_key.objectid))
33974097 return;
33984098 /* Same as btrfs_qgroup_free_meta_prealloc() */
33994099 num_bytes = sub_root_meta_rsv(root, num_bytes,
34004100 BTRFS_QGROUP_RSV_META_PREALLOC);
34014101 trace_qgroup_meta_convert(root, num_bytes);
3402
- qgroup_convert_meta(fs_info, root->objectid, num_bytes);
4102
+ qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
34034103 }
34044104
34054105 /*
34064106 * Check for leaked qgroup reserved space, normally at inode destroy
34074107 * time
34084108 */
3409
-void btrfs_qgroup_check_reserved_leak(struct inode *inode)
4109
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
34104110 {
34114111 struct extent_changeset changeset;
34124112 struct ulist_node *unode;
....@@ -3414,21 +4114,279 @@
34144114 int ret;
34154115
34164116 extent_changeset_init(&changeset);
3417
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
4117
+ ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
34184118 EXTENT_QGROUP_RESERVED, &changeset);
34194119
34204120 WARN_ON(ret < 0);
34214121 if (WARN_ON(changeset.bytes_changed)) {
34224122 ULIST_ITER_INIT(&iter);
34234123 while ((unode = ulist_next(&changeset.range_changed, &iter))) {
3424
- btrfs_warn(BTRFS_I(inode)->root->fs_info,
3425
- "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
3426
- inode->i_ino, unode->val, unode->aux);
4124
+ btrfs_warn(inode->root->fs_info,
4125
+ "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
4126
+ btrfs_ino(inode), unode->val, unode->aux);
34274127 }
3428
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
3429
- BTRFS_I(inode)->root->objectid,
4128
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
4129
+ inode->root->root_key.objectid,
34304130 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
34314131
34324132 }
34334133 extent_changeset_release(&changeset);
34344134 }
4135
+
4136
+void btrfs_qgroup_init_swapped_blocks(
4137
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks)
4138
+{
4139
+ int i;
4140
+
4141
+ spin_lock_init(&swapped_blocks->lock);
4142
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
4143
+ swapped_blocks->blocks[i] = RB_ROOT;
4144
+ swapped_blocks->swapped = false;
4145
+}
4146
+
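For reference, the per-root structure initialized here keeps one rb-tree of
swapped subtree roots per tree level, all guarded by a single spinlock; a
sketch of the declaration, assuming the qgroup.h layout this series adds:

	struct btrfs_qgroup_swapped_blocks {
		spinlock_t lock;
		/* The rb-trees are only touched with @lock held */
		bool swapped;
		struct rb_root blocks[BTRFS_MAX_LEVEL];
	};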
4147
+/*
4148
+ * Delete all swapped block records of @root.
4149
+ * Every record here means we skipped a full subtree scan for qgroup.
4150
+ *
4151
+ * Called when committing a transaction.
4152
+ */
4153
+void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
4154
+{
4155
+ struct btrfs_qgroup_swapped_blocks *swapped_blocks;
4156
+ int i;
4157
+
4158
+ swapped_blocks = &root->swapped_blocks;
4159
+
4160
+ spin_lock(&swapped_blocks->lock);
4161
+ if (!swapped_blocks->swapped)
4162
+ goto out;
4163
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4164
+ struct rb_root *cur_root = &swapped_blocks->blocks[i];
4165
+ struct btrfs_qgroup_swapped_block *entry;
4166
+ struct btrfs_qgroup_swapped_block *next;
4167
+
4168
+ rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
4169
+ node)
4170
+ kfree(entry);
4171
+ swapped_blocks->blocks[i] = RB_ROOT;
4172
+ }
4173
+ swapped_blocks->swapped = false;
4174
+out:
4175
+ spin_unlock(&swapped_blocks->lock);
4176
+}
4177
+
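The teardown above relies on rbtree_postorder_for_each_entry_safe(): children
are visited before their parent and the next entry is fetched before the loop
body runs, so each node can be kfree()d without any rb_erase() or rebalancing.
The same idiom in isolation, with a hypothetical item type:

	#include <linux/rbtree.h>
	#include <linux/slab.h>
	#include <linux/types.h>

	struct item {
		struct rb_node node;
		u64 key;
	};

	/* Free every node of @root; no rb_erase() needed in post-order. */
	static void free_all(struct rb_root *root)
	{
		struct item *cur;
		struct item *next;

		rbtree_postorder_for_each_entry_safe(cur, next, root, node)
			kfree(cur);
		*root = RB_ROOT;
	}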
4178
+/*
4179
+ * Add a record of the swapped subtree roots to @subvol_root.
4180
+ *
4181
+ * @subvol_root: tree root of the subvolume tree that got swapped
4182
+ * @bg: block group under balance
4183
+ * @subvol_parent/slot: pointer to the subtree root in the subvolume tree
4184
+ * @reloc_parent/slot: pointer to the subtree root in the reloc tree
4185
+ * BOTH POINTERS ARE BEFORE TREE SWAP
4186
+ * @last_snapshot: last snapshot generation of the subvolume tree
4187
+ */
4188
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
4189
+ struct btrfs_root *subvol_root,
4190
+ struct btrfs_block_group *bg,
4191
+ struct extent_buffer *subvol_parent, int subvol_slot,
4192
+ struct extent_buffer *reloc_parent, int reloc_slot,
4193
+ u64 last_snapshot)
4194
+{
4195
+ struct btrfs_fs_info *fs_info = subvol_root->fs_info;
4196
+ struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
4197
+ struct btrfs_qgroup_swapped_block *block;
4198
+ struct rb_node **cur;
4199
+ struct rb_node *parent = NULL;
4200
+ int level = btrfs_header_level(subvol_parent) - 1;
4201
+ int ret = 0;
4202
+
4203
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4204
+ return 0;
4205
+
4206
+ if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
4207
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
4208
+ btrfs_err_rl(fs_info,
4209
+ "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
4210
+ __func__,
4211
+ btrfs_node_ptr_generation(subvol_parent, subvol_slot),
4212
+ btrfs_node_ptr_generation(reloc_parent, reloc_slot));
4213
+ return -EUCLEAN;
4214
+ }
4215
+
4216
+ block = kmalloc(sizeof(*block), GFP_NOFS);
4217
+ if (!block) {
4218
+ ret = -ENOMEM;
4219
+ goto out;
4220
+ }
4221
+
4222
+ /*
4223
+ * @reloc_parent/slot is still before swap, while @block is going to
4224
+ * record the bytenr after swap, so we do the swap here.
4225
+ */
4226
+ block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
4227
+ block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
4228
+ reloc_slot);
4229
+ block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
4230
+ block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
4231
+ subvol_slot);
4232
+ block->last_snapshot = last_snapshot;
4233
+ block->level = level;
4234
+
4235
+ /*
4236
+ * If bg == NULL, we're called from btrfs_recover_relocation(), and
4237
+ * no one else can modify tree blocks; thus qgroup accounting will not
4238
+ * change no matter the value of trace_leaf.
4239
+ */
4240
+ if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
4241
+ block->trace_leaf = true;
4242
+ else
4243
+ block->trace_leaf = false;
4244
+ btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
4245
+
4246
+ /* Insert @block into @blocks */
4247
+ spin_lock(&blocks->lock);
4248
+ cur = &blocks->blocks[level].rb_node;
4249
+ while (*cur) {
4250
+ struct btrfs_qgroup_swapped_block *entry;
4251
+
4252
+ parent = *cur;
4253
+ entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
4254
+ node);
4255
+
4256
+ if (entry->subvol_bytenr < block->subvol_bytenr) {
4257
+ cur = &(*cur)->rb_left;
4258
+ } else if (entry->subvol_bytenr > block->subvol_bytenr) {
4259
+ cur = &(*cur)->rb_right;
4260
+ } else {
4261
+ if (entry->subvol_generation !=
4262
+ block->subvol_generation ||
4263
+ entry->reloc_bytenr != block->reloc_bytenr ||
4264
+ entry->reloc_generation !=
4265
+ block->reloc_generation) {
4266
+ /*
4267
+ * Duplicate but mismatched entry found.
4268
+ * Shouldn't happen.
4269
+ *
4270
+ * Marking qgroup inconsistent should be enough
4271
+ * for end users.
4272
+ */
4273
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4274
+ ret = -EEXIST;
4275
+ }
4276
+ kfree(block);
4277
+ goto out_unlock;
4278
+ }
4279
+ }
4280
+ rb_link_node(&block->node, parent, cur);
4281
+ rb_insert_color(&block->node, &blocks->blocks[level]);
4282
+ blocks->swapped = true;
4283
+out_unlock:
4284
+ spin_unlock(&blocks->lock);
4285
+out:
4286
+ if (ret < 0)
4287
+ fs_info->qgroup_flags |=
4288
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4289
+ return ret;
4290
+}
4291
+
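The insertion loop above is the standard hand-rolled rbtree descent: walk
left while existing entries compare lower, right while they compare higher,
and reject true duplicates at an equal key. The same idiom reduced to a
generic u64 key, with a hypothetical item type:

	#include <linux/rbtree.h>
	#include <linux/types.h>

	struct item {
		struct rb_node node;
		u64 key;
	};

	/* Insert @ins into @root; return false if the key already exists. */
	static bool insert_item(struct rb_root *root, struct item *ins)
	{
		struct rb_node **cur = &root->rb_node;
		struct rb_node *parent = NULL;

		while (*cur) {
			struct item *entry = rb_entry(*cur, struct item, node);

			parent = *cur;
			if (entry->key < ins->key)
				cur = &(*cur)->rb_left;
			else if (entry->key > ins->key)
				cur = &(*cur)->rb_right;
			else
				return false;	/* duplicate key */
		}
		rb_link_node(&ins->node, parent, cur);
		rb_insert_color(&ins->node, root);
		return true;
	}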
4292
+/*
4293
+ * Check if the tree block is a subtree root, and if so do the delayed
4294
+ * subtree tracing needed for qgroup accounting.
4295
+ *
4296
+ * This is called during btrfs_cow_block().
4297
+ */
4298
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
4299
+ struct btrfs_root *root,
4300
+ struct extent_buffer *subvol_eb)
4301
+{
4302
+ struct btrfs_fs_info *fs_info = root->fs_info;
4303
+ struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
4304
+ struct btrfs_qgroup_swapped_block *block;
4305
+ struct extent_buffer *reloc_eb = NULL;
4306
+ struct rb_node *node;
4307
+ bool found = false;
4308
+ bool swapped = false;
4309
+ int level = btrfs_header_level(subvol_eb);
4310
+ int ret = 0;
4311
+ int i;
4312
+
4313
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4314
+ return 0;
4315
+ if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
4316
+ return 0;
4317
+
4318
+ spin_lock(&blocks->lock);
4319
+ if (!blocks->swapped) {
4320
+ spin_unlock(&blocks->lock);
4321
+ return 0;
4322
+ }
4323
+ node = blocks->blocks[level].rb_node;
4324
+
4325
+ while (node) {
4326
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
4327
+ if (block->subvol_bytenr < subvol_eb->start) {
4328
+ node = node->rb_left;
4329
+ } else if (block->subvol_bytenr > subvol_eb->start) {
4330
+ node = node->rb_right;
4331
+ } else {
4332
+ found = true;
4333
+ break;
4334
+ }
4335
+ }
4336
+ if (!found) {
4337
+ spin_unlock(&blocks->lock);
4338
+ goto out;
4339
+ }
4340
+ /* Found one, remove it from @blocks first and update blocks->swapped */
4341
+ rb_erase(&block->node, &blocks->blocks[level]);
4342
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4343
+ if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
4344
+ swapped = true;
4345
+ break;
4346
+ }
4347
+ }
4348
+ blocks->swapped = swapped;
4349
+ spin_unlock(&blocks->lock);
4350
+
4351
+ /* Read out reloc subtree root */
4352
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
4353
+ block->reloc_generation, block->level,
4354
+ &block->first_key);
4355
+ if (IS_ERR(reloc_eb)) {
4356
+ ret = PTR_ERR(reloc_eb);
4357
+ reloc_eb = NULL;
4358
+ goto free_out;
4359
+ }
4360
+ if (!extent_buffer_uptodate(reloc_eb)) {
4361
+ ret = -EIO;
4362
+ goto free_out;
4363
+ }
4364
+
4365
+ ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
4366
+ block->last_snapshot, block->trace_leaf);
4367
+free_out:
4368
+ kfree(block);
4369
+ free_extent_buffer(reloc_eb);
4370
+out:
4371
+ if (ret < 0) {
4372
+ btrfs_err_rl(fs_info,
4373
+ "failed to account subtree at bytenr %llu: %d",
4374
+ subvol_eb->start, ret);
4375
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4376
+ }
4377
+ return ret;
4378
+}
4379
+
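The read-out above follows the usual extent-buffer error convention:
read_tree_block() returns an ERR_PTR() on setup failure, while a buffer that
was read but failed verification comes back non-uptodate, so both conditions
must be tested. A minimal generic sketch of the ERR_PTR idiom itself, with a
hypothetical get_obj():

	#include <linux/err.h>

	struct obj;
	struct obj *get_obj(int id);	/* hypothetical: may return ERR_PTR */

	static int use_obj(int id)
	{
		struct obj *o = get_obj(id);

		/* Error is encoded in the pointer value itself */
		if (IS_ERR(o))
			return PTR_ERR(o);
		/* ... use o ... */
		return 0;
	}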
4380
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
4381
+{
4382
+ struct btrfs_qgroup_extent_record *entry;
4383
+ struct btrfs_qgroup_extent_record *next;
4384
+ struct rb_root *root;
4385
+
4386
+ root = &trans->delayed_refs.dirty_extent_root;
4387
+ rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
4388
+ ulist_free(entry->old_roots);
4389
+ kfree(entry);
4390
+ }
4391
+ *root = RB_ROOT;
4392
+}