2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/fs/xfs/xfs_mount.c
@@ -12,9 +12,6 @@
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_dir2.h"
 #include "xfs_ialloc.h"
@@ -27,14 +24,14 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
-#include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_sysfs.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_reflink.h"
 #include "xfs_extent_busy.h"
-
+#include "xfs_health.h"
+#include "xfs_trace.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
@@ -83,9 +80,9 @@
         }
 
         if (hole < 0) {
-                xfs_uuid_table = kmem_realloc(xfs_uuid_table,
+                xfs_uuid_table = krealloc(xfs_uuid_table,
                         (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
-                        KM_SLEEP);
+                        GFP_KERNEL | __GFP_NOFAIL);
                 hole = xfs_uuid_table_size++;
         }
         xfs_uuid_table[hole] = *uuid;
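
Note on the hunk above: kmem_realloc()/KM_SLEEP was XFS's private wrapper; krealloc() with GFP_KERNEL | __GFP_NOFAIL expresses the same "sleep until the allocation succeeds" contract through the generic allocator. A minimal userspace sketch of the same grow-by-one table pattern, with plain realloc() standing in for krealloc() (realloc() can fail, so the sketch has to report an error where the kernel path cannot):

#include <stdlib.h>

struct uuid_entry { unsigned char b[16]; };

static struct uuid_entry *uuid_table;
static int uuid_table_size;

/* Grow the table by one slot and store *uuid, as xfs_uuid_mount() does
 * when no existing hole is found.  realloc() preserves the old contents;
 * the new tail slot becomes the hole. */
static int uuid_table_add(const struct uuid_entry *uuid)
{
        struct uuid_entry *tmp;

        tmp = realloc(uuid_table,
                      (uuid_table_size + 1) * sizeof(*uuid_table));
        if (!tmp)
                return -1;      /* __GFP_NOFAIL has no userspace analogue */
        uuid_table = tmp;
        uuid_table[uuid_table_size++] = *uuid;
        return 0;
}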
@@ -149,8 +146,8 @@
                 spin_unlock(&mp->m_perag_lock);
                 ASSERT(pag);
                 ASSERT(atomic_read(&pag->pag_ref) == 0);
+                xfs_iunlink_destroy(pag);
                 xfs_buf_hash_destroy(pag);
-                mutex_destroy(&pag->pag_ici_reclaim_lock);
                 call_rcu(&pag->rcu_head, __xfs_free_perag);
         }
 }
@@ -197,26 +194,30 @@
                 }
 
                 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
-                if (!pag)
+                if (!pag) {
+                        error = -ENOMEM;
                         goto out_unwind_new_pags;
+                }
                 pag->pag_agno = index;
                 pag->pag_mount = mp;
                 spin_lock_init(&pag->pag_ici_lock);
-                mutex_init(&pag->pag_ici_reclaim_lock);
                 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
-                if (xfs_buf_hash_init(pag))
+
+                error = xfs_buf_hash_init(pag);
+                if (error)
                         goto out_free_pag;
                 init_waitqueue_head(&pag->pagb_wait);
                 spin_lock_init(&pag->pagb_lock);
                 pag->pagb_count = 0;
                 pag->pagb_tree = RB_ROOT;
 
-                if (radix_tree_preload(GFP_NOFS))
+                error = radix_tree_preload(GFP_NOFS);
+                if (error)
                         goto out_hash_destroy;
 
                 spin_lock(&mp->m_perag_lock);
                 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
-                        BUG();
+                        WARN_ON_ONCE(1);
                         spin_unlock(&mp->m_perag_lock);
                         radix_tree_preload_end();
                         error = -EEXIST;
@@ -227,6 +228,10 @@
                 /* first new pag is fully initialized */
                 if (first_initialised == NULLAGNUMBER)
                         first_initialised = index;
+                error = xfs_iunlink_init(pag);
+                if (error)
+                        goto out_hash_destroy;
+                spin_lock_init(&pag->pag_state_lock);
         }
 
         index = xfs_set_inode_alloc(mp, agcount);
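
The hunks above make xfs_initialize_perag() return explicit errnos and extend its unwind ladder for the new iunlink state. The shape is the kernel's standard goto-based unwinding: each acquired resource adds a label, and a failure jumps to the label that releases only what was already set up, in reverse order. A stripped-down sketch of the idiom (setup_hash()/setup_iunlink() are hypothetical stand-ins, not the XFS functions):

#include <errno.h>
#include <stdlib.h>

struct pag_like {
        void *hash;
        void *iunlink;
};

static int setup_hash(struct pag_like *p)
{
        p->hash = malloc(64);
        return p->hash ? 0 : -ENOMEM;
}

static int setup_iunlink(struct pag_like *p)
{
        p->iunlink = malloc(64);
        return p->iunlink ? 0 : -ENOMEM;
}

static int pag_init(struct pag_like **out)
{
        struct pag_like *p;
        int error;

        p = calloc(1, sizeof(*p));
        if (!p)
                return -ENOMEM;         /* nothing to unwind yet */
        error = setup_hash(p);
        if (error)
                goto out_free_pag;
        error = setup_iunlink(p);
        if (error)
                goto out_hash_destroy;  /* undo in reverse order */
        *out = p;
        return 0;

out_hash_destroy:
        free(p->hash);
out_free_pag:
        free(p);
        return error;
}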
@@ -240,7 +245,6 @@
 out_hash_destroy:
         xfs_buf_hash_destroy(pag);
 out_free_pag:
-        mutex_destroy(&pag->pag_ici_reclaim_lock);
         kmem_free(pag);
 out_unwind_new_pags:
         /* unwind any prior newly initialized pags */
@@ -249,7 +253,7 @@
                 if (!pag)
                         break;
                 xfs_buf_hash_destroy(pag);
-                mutex_destroy(&pag->pag_ici_reclaim_lock);
+                xfs_iunlink_destroy(pag);
                 kmem_free(pag);
         }
         return error;
@@ -307,7 +311,7 @@
         /*
          * Initialize the mount structure from the superblock.
          */
-        xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+        xfs_sb_from_disk(sbp, bp->b_addr);
 
         /*
          * If we haven't validated the superblock, do so now before we try
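
XFS_BUF_TO_SBP() was only a cast of the buffer's data pointer to the on-disk superblock type, so passing bp->b_addr is equivalent and more direct; xfs_sb_from_disk() still performs the big-endian decode. A hedged userspace sketch of that decode step, with an invented two-field superblock standing in for the real structure:

#include <endian.h>
#include <stdint.h>
#include <string.h>

/* Miniature stand-ins: the real xfs_dsb/xfs_sb have many more fields,
 * but the decode pattern is identical.  Packed so the struct matches
 * the byte-exact on-disk layout. */
struct disk_sb {
        uint32_t sb_blocksize;
        uint64_t sb_dblocks;
} __attribute__((packed));

struct core_sb {
        uint32_t sb_blocksize;
        uint64_t sb_dblocks;
};

/* Decode the on-disk (big-endian) superblock image at b_addr into the
 * native-endian in-core form, as xfs_sb_from_disk() does. */
static void sb_from_disk(struct core_sb *to, const void *b_addr)
{
        struct disk_sb from;

        memcpy(&from, b_addr, sizeof(from));    /* b_addr may be unaligned */
        to->sb_blocksize = be32toh(from.sb_blocksize);
        to->sb_dblocks = be64toh(from.sb_dblocks);
}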
@@ -357,132 +361,122 @@
 }
 
 /*
- * Update alignment values based on mount options and sb values
+ * If the sunit/swidth change would move the precomputed root inode value, we
+ * must reject the ondisk change because repair will stumble over that.
+ * However, we allow the mount to proceed because we never rejected this
+ * combination before.  Sets *update_sb to true if the superblock should be
+ * updated, false otherwise.
+ */
+static inline int
+xfs_check_new_dalign(
+        struct xfs_mount        *mp,
+        int                     new_dalign,
+        bool                    *update_sb)
+{
+        struct xfs_sb           *sbp = &mp->m_sb;
+        xfs_ino_t               calc_ino;
+
+        calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign);
+        trace_xfs_check_new_dalign(mp, new_dalign, calc_ino);
+
+        if (sbp->sb_rootino == calc_ino) {
+                *update_sb = true;
+                return 0;
+        }
+
+        xfs_warn(mp,
+"Cannot change stripe alignment; would require moving root inode.");
+
+        /*
+         * XXX: Next time we add a new incompat feature, this should start
+         * returning -EINVAL to fail the mount.  Until then, spit out a warning
+         * that we're ignoring the administrator's instructions.
+         */
+        xfs_warn(mp, "Skipping superblock stripe alignment update.");
+        *update_sb = false;
+        return 0;
+}
+
+/*
+ * If we were provided with new sunit/swidth values as mount options, make sure
+ * that they pass basic alignment and superblock feature checks, and convert
+ * them into the same units (FSB) that everything else expects.  This step
+ * /must/ be done before computing the inode geometry.
  */
 STATIC int
-xfs_update_alignment(xfs_mount_t *mp)
+xfs_validate_new_dalign(
+        struct xfs_mount        *mp)
 {
-        xfs_sb_t        *sbp = &(mp->m_sb);
+        if (mp->m_dalign == 0)
+                return 0;
 
-        if (mp->m_dalign) {
+        /*
+         * If stripe unit and stripe width are not multiples
+         * of the fs blocksize turn off alignment.
+         */
+        if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
+            (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+                xfs_warn(mp,
+        "alignment check failed: sunit/swidth vs. blocksize(%d)",
+                        mp->m_sb.sb_blocksize);
+                return -EINVAL;
+        } else {
                 /*
-                 * If stripe unit and stripe width are not multiples
-                 * of the fs blocksize turn off alignment.
+                 * Convert the stripe unit and width to FSBs.
                  */
-                if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
-                    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+                mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+                if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
                         xfs_warn(mp,
-                "alignment check failed: sunit/swidth vs. blocksize(%d)",
-                                sbp->sb_blocksize);
+                "alignment check failed: sunit/swidth vs. agsize(%d)",
+                                mp->m_sb.sb_agblocks);
                         return -EINVAL;
-                } else {
-                        /*
-                         * Convert the stripe unit and width to FSBs.
-                         */
-                        mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
-                        if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
-                                xfs_warn(mp,
-                        "alignment check failed: sunit/swidth vs. agsize(%d)",
-                                        sbp->sb_agblocks);
-                                return -EINVAL;
-                        } else if (mp->m_dalign) {
-                                mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
-                        } else {
-                                xfs_warn(mp,
-                        "alignment check failed: sunit(%d) less than bsize(%d)",
-                                        mp->m_dalign, sbp->sb_blocksize);
-                                return -EINVAL;
-                        }
-                }
-
-                /*
-                 * Update superblock with new values
-                 * and log changes
-                 */
-                if (xfs_sb_version_hasdalign(sbp)) {
-                        if (sbp->sb_unit != mp->m_dalign) {
-                                sbp->sb_unit = mp->m_dalign;
-                                mp->m_update_sb = true;
-                        }
-                        if (sbp->sb_width != mp->m_swidth) {
-                                sbp->sb_width = mp->m_swidth;
-                                mp->m_update_sb = true;
-                        }
+                } else if (mp->m_dalign) {
+                        mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
                 } else {
                         xfs_warn(mp,
-        "cannot change alignment: superblock does not support data alignment");
+                "alignment check failed: sunit(%d) less than bsize(%d)",
+                                mp->m_dalign, mp->m_sb.sb_blocksize);
                         return -EINVAL;
                 }
-        } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
-                    xfs_sb_version_hasdalign(&mp->m_sb)) {
-                mp->m_dalign = sbp->sb_unit;
-                mp->m_swidth = sbp->sb_width;
+        }
+
+        if (!xfs_sb_version_hasdalign(&mp->m_sb)) {
+                xfs_warn(mp,
+"cannot change alignment: superblock does not support data alignment");
+                return -EINVAL;
         }
 
         return 0;
 }
 
-/*
- * Set the maximum inode count for this filesystem
- */
-STATIC void
-xfs_set_maxicount(xfs_mount_t *mp)
+/* Update alignment values based on mount options and sb values. */
+STATIC int
+xfs_update_alignment(
+        struct xfs_mount        *mp)
 {
-        xfs_sb_t        *sbp = &(mp->m_sb);
-        uint64_t        icount;
+        struct xfs_sb           *sbp = &mp->m_sb;
 
-        if (sbp->sb_imax_pct) {
-                /*
-                 * Make sure the maximum inode count is a multiple
-                 * of the units we allocate inodes in.
-                 */
-                icount = sbp->sb_dblocks * sbp->sb_imax_pct;
-                do_div(icount, 100);
-                do_div(icount, mp->m_ialloc_blks);
-                mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
-                                   sbp->sb_inopblog;
-        } else {
-                mp->m_maxicount = 0;
-        }
-}
+        if (mp->m_dalign) {
+                bool            update_sb;
+                int             error;
 
-/*
- * Set the default minimum read and write sizes unless
- * already specified in a mount option.
- * We use smaller I/O sizes when the file system
- * is being used for NFS service (wsync mount option).
- */
-STATIC void
-xfs_set_rw_sizes(xfs_mount_t *mp)
-{
-        xfs_sb_t        *sbp = &(mp->m_sb);
-        int             readio_log, writeio_log;
+                if (sbp->sb_unit == mp->m_dalign &&
+                    sbp->sb_width == mp->m_swidth)
+                        return 0;
 
-        if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
-                if (mp->m_flags & XFS_MOUNT_WSYNC) {
-                        readio_log = XFS_WSYNC_READIO_LOG;
-                        writeio_log = XFS_WSYNC_WRITEIO_LOG;
-                } else {
-                        readio_log = XFS_READIO_LOG_LARGE;
-                        writeio_log = XFS_WRITEIO_LOG_LARGE;
-                }
-        } else {
-                readio_log = mp->m_readio_log;
-                writeio_log = mp->m_writeio_log;
+                error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb);
+                if (error || !update_sb)
+                        return error;
+
+                sbp->sb_unit = mp->m_dalign;
+                sbp->sb_width = mp->m_swidth;
+                mp->m_update_sb = true;
+        } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
+                   xfs_sb_version_hasdalign(&mp->m_sb)) {
+                mp->m_dalign = sbp->sb_unit;
+                mp->m_swidth = sbp->sb_width;
         }
 
-        if (sbp->sb_blocklog > readio_log) {
-                mp->m_readio_log = sbp->sb_blocklog;
-        } else {
-                mp->m_readio_log = readio_log;
-        }
-        mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
-        if (sbp->sb_blocklog > writeio_log) {
-                mp->m_writeio_log = sbp->sb_blocklog;
-        } else {
-                mp->m_writeio_log = writeio_log;
-        }
-        mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
+        return 0;
 }
 
 /*
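
The refactor above splits option validation (xfs_validate_new_dalign(), run before inode geometry is computed) from the decision to commit new values to the superblock (xfs_check_new_dalign() via xfs_update_alignment()). The validation itself is unit arithmetic: sunit/swidth arrive in 512-byte basic blocks (BBs), must be whole multiples of the filesystem block size, and are then converted to filesystem blocks (FSBs). A self-contained sketch with the BBTOB()/XFS_BB_TO_FSBT() macros expanded (assumes blocklog >= 9, which XFS guarantees; warnings omitted):

#include <stdint.h>

#define BBSHIFT 9       /* 512-byte "basic blocks", as in XFS */

/* Mirror of the checks in xfs_validate_new_dalign(): dalign/swidth
 * arrive in BBs and leave in filesystem blocks. */
static int validate_dalign(uint32_t *dalign, uint32_t *swidth,
                           uint32_t blocklog, uint32_t agblocks)
{
        uint32_t blockmask = (1U << blocklog) - 1;

        if (*dalign == 0)
                return 0;               /* no alignment requested */
        if (((*dalign << BBSHIFT) & blockmask) ||
            ((*swidth << BBSHIFT) & blockmask))
                return -1;      /* not multiples of the fs block size */

        *dalign >>= blocklog - BBSHIFT; /* BB -> FSB */
        if (*dalign == 0)
                return -1;      /* sunit smaller than one fs block */
        if (agblocks % *dalign)
                return -1;      /* AG size not stripe aligned */
        *swidth >>= blocklog - BBSHIFT;
        return 0;
}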
@@ -500,29 +494,6 @@
                 do_div(space, 100);
                 mp->m_low_space[i] = space * (i + 1);
         }
-}
-
-
-/*
- * Set whether we're using inode alignment.
- */
-STATIC void
-xfs_set_inoalignment(xfs_mount_t *mp)
-{
-        if (xfs_sb_version_hasalign(&mp->m_sb) &&
-            mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
-                mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
-        else
-                mp->m_inoalign_mask = 0;
-        /*
-         * If we are using stripe alignment, check whether
-         * the stripe unit is a multiple of the inode alignment
-         */
-        if (mp->m_dalign && mp->m_inoalign_mask &&
-            !(mp->m_dalign & mp->m_inoalign_mask))
-                mp->m_sinoalign = mp->m_dalign;
-        else
-                mp->m_sinoalign = 0;
 }
 
 /*
@@ -639,7 +610,7 @@
             (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
              !xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
              mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
-                mp->m_flags |= XFS_MOUNT_BAD_SUMMARY;
+                xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
 
         /*
          * We can safely re-initialise incore superblock counters from the
@@ -654,10 +625,51 @@
          */
         if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) ||
              XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) &&
-            !(mp->m_flags & XFS_MOUNT_BAD_SUMMARY))
+            !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
                 return 0;
 
         return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+}
+
+/*
+ * Flush and reclaim dirty inodes in preparation for unmount.  Inodes and
+ * internal inode structures can be sitting in the CIL and AIL at this point,
+ * so we need to unpin them, write them back and/or reclaim them before unmount
+ * can proceed.
+ *
+ * An inode cluster that has been freed can have its buffer still pinned in
+ * memory because the transaction is still sitting in an iclog.  The stale
+ * inodes on that buffer will be pinned to the buffer until the transaction
+ * hits the disk and the callbacks run.  Pushing the AIL will skip the stale
+ * inodes and may never see the pinned buffer, so nothing will push out the
+ * iclog and unpin the buffer.
+ *
+ * Hence we need to force the log to unpin everything first.  However, log
+ * forces don't wait for the discards they issue to complete, so we have to
+ * explicitly wait for them to complete here as well.
+ *
+ * Then we can tell the world we are unmounting so that error handling knows
+ * that the filesystem is going away and we should error out anything that we
+ * have been retrying in the background.  This will prevent never-ending
+ * retries in AIL pushing from hanging the unmount.
+ *
+ * Finally, we can push the AIL to clean all the remaining dirty objects, then
+ * reclaim the remaining inodes that are still in memory at this point in time.
+ */
+static void
+xfs_unmount_flush_inodes(
+        struct xfs_mount        *mp)
+{
+        xfs_log_force(mp, XFS_LOG_SYNC);
+        xfs_extent_busy_wait_all(mp);
+        flush_workqueue(xfs_discard_wq);
+
+        mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+
+        xfs_ail_push_all_sync(mp->m_ail);
+        cancel_delayed_work_sync(&mp->m_reclaim_work);
+        xfs_reclaim_inodes(mp);
+        xfs_health_unmount(mp);
 }
 
 /*
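
The long comment above pins down why xfs_unmount_flush_inodes() runs in this exact order: the log force unpins stale inode buffers that an AIL push alone would never reach, the discard wait closes the gap the log force leaves, and the UNMOUNTING flag makes background retries fail instead of hanging the unmount. A schematic userspace rendering of that sequence with stubbed steps, purely to make the ordering constraints visible (every function below is a stand-in, not a kernel API):

#include <stdbool.h>
#include <stdio.h>

static bool unmounting;

/* Stubs for the log, busy-extent, AIL and inode-reclaim machinery. */
static void log_force_sync(void)     { puts("1. force log: unpin stale inode buffers"); }
static void wait_busy_extents(void)  { puts("2. wait for busy extents and discards"); }
static void ail_push_all_sync(void)  { puts("4. write back everything left in the AIL"); }
static void reclaim_all_inodes(void) { puts("5. reclaim remaining in-core inodes"); }

static void unmount_flush_inodes(void)
{
        /* The log force must come before the AIL push; otherwise buffers
         * pinned by an in-memory iclog would never become writable. */
        log_force_sync();
        wait_busy_extents();
        unmounting = true;      /* 3. error out background retries from now on */
        ail_push_all_sync();
        reclaim_all_inodes();
}

int main(void)
{
        unmount_flush_inodes();
        return unmounting ? 0 : 1;
}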
@@ -676,6 +688,7 @@
 {
         struct xfs_sb           *sbp = &(mp->m_sb);
         struct xfs_inode        *rip;
+        struct xfs_ino_geometry *igeo = M_IGEO(mp);
         uint64_t                resblks;
         uint                    quotamount = 0;
         uint                    quotaflags = 0;
@@ -730,28 +743,38 @@
         }
 
         /*
-         * Check if sb_agblocks is aligned at stripe boundary
-         * If sb_agblocks is NOT aligned turn off m_dalign since
-         * allocator alignment is within an ag, therefore ag has
-         * to be aligned at stripe boundary.
+         * If we were given new sunit/swidth options, do some basic validation
+         * checks and convert the incore dalign and swidth values to the
+         * same units (FSB) that everything else uses.  This /must/ happen
+         * before computing the inode geometry.
          */
-        error = xfs_update_alignment(mp);
+        error = xfs_validate_new_dalign(mp);
         if (error)
                 goto out;
 
         xfs_alloc_compute_maxlevels(mp);
         xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
         xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
-        xfs_ialloc_compute_maxlevels(mp);
+        xfs_ialloc_setup_geometry(mp);
         xfs_rmapbt_compute_maxlevels(mp);
         xfs_refcountbt_compute_maxlevels(mp);
 
-        xfs_set_maxicount(mp);
+        /*
+         * Check if sb_agblocks is aligned at stripe boundary.  If sb_agblocks
+         * is NOT aligned turn off m_dalign since allocator alignment is within
+         * an ag, therefore ag has to be aligned at stripe boundary.  Note that
+         * we must compute the free space and rmap btree geometry before doing
+         * this.
+         */
+        error = xfs_update_alignment(mp);
+        if (error)
+                goto out;
 
         /* enable fail_at_unmount as default */
         mp->m_fail_unmount = true;
 
-        error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
+        error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
+                               NULL, mp->m_super->s_id);
         if (error)
                 goto out;
 
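
Later in this patch (see the @@ -773,31 @@ hunk below) the open-coded replacement for xfs_set_rw_sizes() keeps only the write-size half: clamp m_allocsize_log to at least sb_blocklog, then derive the block count from the difference of the two base-2 logs. With 4 KiB blocks (sb_blocklog = 12) and a 64 KiB preferred allocation size (m_allocsize_log = 16), that is 1 << (16 - 12) = 16 blocks. The same arithmetic as a tiny checked sketch:

#include <assert.h>
#include <stdint.h>

/* Preferred allocation size in filesystem blocks, from two base-2 logs,
 * exactly as the hunk computes m_allocsize_blocks. */
static uint32_t allocsize_blocks(uint32_t allocsize_log, uint32_t sb_blocklog)
{
        if (allocsize_log < sb_blocklog)        /* the max_t() clamp */
                allocsize_log = sb_blocklog;
        return 1U << (allocsize_log - sb_blocklog);
}

int main(void)
{
        assert(allocsize_blocks(16, 12) == 16); /* 64 KiB of 4 KiB blocks */
        assert(allocsize_blocks(10, 12) == 1);  /* clamped to one block */
        return 0;
}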
@@ -773,31 +796,15 @@
                 goto out_remove_errortag;
 
         /*
-         * Set the minimum read and write sizes
+         * Update the preferred write size based on the information from the
+         * on-disk superblock.
          */
-        xfs_set_rw_sizes(mp);
+        mp->m_allocsize_log =
+                max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log);
+        mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog);
 
         /* set the low space thresholds for dynamic preallocation */
         xfs_set_low_space_thresholds(mp);
-
-        /*
-         * Set the inode cluster size.
-         * This may still be overridden by the file system
-         * block size if it is larger than the chosen cluster size.
-         *
-         * For v5 filesystems, scale the cluster size with the inode size to
-         * keep a constant ratio of inode per cluster buffer, but only if mkfs
-         * has set the inode alignment value appropriately for larger cluster
-         * sizes.
-         */
-        mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
-        if (xfs_sb_version_hascrc(&mp->m_sb)) {
-                int     new_size = mp->m_inode_cluster_size;
-
-                new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
-                if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
-                        mp->m_inode_cluster_size = new_size;
-        }
 
         /*
          * If enabled, sparse inode chunk alignment is expected to match the
@@ -806,19 +813,14 @@
          */
         if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
             mp->m_sb.sb_spino_align !=
-                        XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
+                        XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
                 xfs_warn(mp,
         "Sparse inode block alignment (%u) must match cluster size (%llu).",
                          mp->m_sb.sb_spino_align,
-                         XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
+                         XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
                 error = -EINVAL;
                 goto out_remove_uuid;
         }
-
-        /*
-         * Set inode alignment fields
-         */
-        xfs_set_inoalignment(mp);
 
         /*
          * Check that the data (and log if separate) is an ok size.
@@ -865,9 +867,8 @@
                 goto out_free_dir;
         }
 
-        if (!sbp->sb_logblocks) {
+        if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
                 xfs_warn(mp, "no log defined");
-                XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
                 error = -EFSCORRUPTED;
                 goto out_free_perag;
         }
@@ -905,12 +906,10 @@
 
         ASSERT(rip != NULL);
 
-        if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) {
+        if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) {
                 xfs_warn(mp, "corrupted root inode %llu: not a directory",
                         (unsigned long long)rip->i_ino);
                 xfs_iunlock(rip, XFS_ILOCK_EXCL);
-                XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
-                                 mp);
                 error = -EFSCORRUPTED;
                 goto out_rele_rip;
         }
@@ -969,9 +968,17 @@
         /*
          * Finish recovering the file system.  This part needed to be delayed
          * until after the root and real-time bitmap inodes were consistently
-         * read in.
+         * read in.  Temporarily create per-AG space reservations for metadata
+         * btree shape changes because space freeing transactions (for inode
+         * inactivation) require the per-AG reservation in lieu of reserving
+         * blocks.
          */
+        error = xfs_fs_reserve_ag_blocks(mp);
+        if (error && error == -ENOSPC)
+                xfs_warn(mp,
+        "ENOSPC reserving per-AG metadata pool, log recovery may fail.");
         error = xfs_log_mount_finish(mp);
+        xfs_fs_unreserve_ag_blocks(mp);
         if (error) {
                 xfs_warn(mp, "log mount finish failed");
                 goto out_rtunmount;
@@ -1047,7 +1054,7 @@
         /* Clean out dquots that might be in memory after quotacheck. */
         xfs_qm_unmount(mp);
         /*
-         * Cancel all delayed reclaim work and reclaim the inodes directly.
+         * Flush all inode reclamation work and flush the log.
          * We have to do this /after/ rtunmount and qm_unmount because those
          * two will have scheduled delayed reclaim for the rt/quota inodes.
          *
@@ -1057,10 +1064,8 @@
          * qm_unmount_quotas and therefore rely on qm_unmount to release the
          * quota inodes.
          */
-        cancel_delayed_work_sync(&mp->m_reclaim_work);
-        xfs_reclaim_inodes(mp, SYNC_WAIT);
+        xfs_unmount_flush_inodes(mp);
 out_log_dealloc:
-        mp->m_flags |= XFS_MOUNT_UNMOUNTING;
         xfs_log_mount_cancel(mp);
 out_fail_wait:
         if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -1095,52 +1100,13 @@
         uint64_t                resblks;
         int                     error;
 
-        xfs_icache_disable_reclaim(mp);
+        xfs_stop_block_reaping(mp);
         xfs_fs_unreserve_ag_blocks(mp);
         xfs_qm_unmount_quotas(mp);
         xfs_rtunmount_inodes(mp);
         xfs_irele(mp->m_rootip);
 
-        /*
-         * We can potentially deadlock here if we have an inode cluster
-         * that has been freed has its buffer still pinned in memory because
-         * the transaction is still sitting in a iclog. The stale inodes
-         * on that buffer will have their flush locks held until the
-         * transaction hits the disk and the callbacks run. the inode
-         * flush takes the flush lock unconditionally and with nothing to
-         * push out the iclog we will never get that unlocked. hence we
-         * need to force the log first.
-         */
-        xfs_log_force(mp, XFS_LOG_SYNC);
-
-        /*
-         * Wait for all busy extents to be freed, including completion of
-         * any discard operation.
-         */
-        xfs_extent_busy_wait_all(mp);
-        flush_workqueue(xfs_discard_wq);
-
-        /*
-         * We now need to tell the world we are unmounting. This will allow
-         * us to detect that the filesystem is going away and we should error
-         * out anything that we have been retrying in the background. This will
-         * prevent neverending retries in AIL pushing from hanging the unmount.
-         */
-        mp->m_flags |= XFS_MOUNT_UNMOUNTING;
-
-        /*
-         * Flush all pending changes from the AIL.
-         */
-        xfs_ail_push_all_sync(mp->m_ail);
-
-        /*
-         * And reclaim all inodes. At this point there should be no dirty
-         * inodes and none should be pinned or locked, but use synchronous
-         * reclaim just to be sure. We can stop background inode reclaim
-         * here as well if it is still running.
-         */
-        cancel_delayed_work_sync(&mp->m_reclaim_work);
-        xfs_reclaim_inodes(mp, SYNC_WAIT);
+        xfs_unmount_flush_inodes(mp);
 
         xfs_qm_unmount(mp);
 
@@ -1216,8 +1182,7 @@
 int
 xfs_log_sbcount(xfs_mount_t *mp)
 {
-        /* allow this to proceed during the freeze sequence... */
-        if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
+        if (!xfs_log_writable(mp))
                 return 0;
 
         /*
@@ -1228,39 +1193,6 @@
                 return 0;
 
         return xfs_sync_sb(mp, true);
-}
-
-/*
- * Deltas for the inode count are +/-64, hence we use a large batch size
- * of 128 so we don't need to take the counter lock on every update.
- */
-#define XFS_ICOUNT_BATCH        128
-int
-xfs_mod_icount(
-        struct xfs_mount        *mp,
-        int64_t                 delta)
-{
-        percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
-        if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
-                ASSERT(0);
-                percpu_counter_add(&mp->m_icount, -delta);
-                return -EINVAL;
-        }
-        return 0;
-}
-
-int
-xfs_mod_ifree(
-        struct xfs_mount        *mp,
-        int64_t                 delta)
-{
-        percpu_counter_add(&mp->m_ifree, delta);
-        if (percpu_counter_compare(&mp->m_ifree, 0) < 0) {
-                ASSERT(0);
-                percpu_counter_add(&mp->m_ifree, -delta);
-                return -EINVAL;
-        }
-        return 0;
 }
 
 /*
@@ -1341,10 +1273,9 @@
                 spin_unlock(&mp->m_sb_lock);
                 return 0;
         }
-        printk_once(KERN_WARNING
-                "Filesystem \"%s\": reserve blocks depleted! "
-                "Consider increasing reserve pool size.",
-                mp->m_fsname);
+        xfs_warn_once(mp,
+"Reserve blocks depleted! Consider increasing reserve pool size.");
+
 fdblocks_enospc:
         spin_unlock(&mp->m_sb_lock);
         return -ENOSPC;
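
xfs_warn_once() is the printk_once() pattern with the device name prefixed for free: a static flag suppresses every emission after the first, so a hot path can warn without flooding the log. A minimal userspace equivalent (single-threaded sketch; the kernel macro uses the same static-flag trick):

#include <stdio.h>

/* Print the message the first time this call site is reached; drop all
 * later calls.  Each macro expansion gets its own static flag. */
#define warn_once(...)                                  \
        do {                                            \
                static int warned;                      \
                if (!warned) {                          \
                        warned = 1;                     \
                        fprintf(stderr, __VA_ARGS__);   \
                }                                       \
        } while (0)

int main(void)
{
        for (int i = 0; i < 3; i++)
                warn_once("Reserve blocks depleted! (printed once)\n");
        return 0;
}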
@@ -1366,33 +1297,6 @@
         mp->m_sb.sb_frextents = lcounter;
         spin_unlock(&mp->m_sb_lock);
         return ret;
-}
-
-/*
- * xfs_getsb() is called to obtain the buffer for the superblock.
- * The buffer is returned locked and read in from disk.
- * The buffer should be released with a call to xfs_brelse().
- *
- * If the flags parameter is BUF_TRYLOCK, then we'll only return
- * the superblock buffer if it can be locked without sleeping.
- * If it can't then we'll return NULL.
- */
-struct xfs_buf *
-xfs_getsb(
-        struct xfs_mount        *mp,
-        int                     flags)
-{
-        struct xfs_buf          *bp = mp->m_sb_bp;
-
-        if (!xfs_buf_trylock(bp)) {
-                if (flags & XBF_TRYLOCK)
-                        return NULL;
-                xfs_buf_lock(bp);
-        }
-
-        xfs_buf_hold(bp);
-        ASSERT(bp->b_flags & XBF_DONE);
-        return bp;
 }
 
 /*
13981302 /*
....@@ -1436,7 +1340,26 @@
14361340 if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
14371341 return;
14381342
1439
- spin_lock(&mp->m_sb_lock);
1440
- mp->m_flags |= XFS_MOUNT_BAD_SUMMARY;
1441
- spin_unlock(&mp->m_sb_lock);
1343
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
1344
+}
1345
+
1346
+/*
1347
+ * Update the in-core delayed block counter.
1348
+ *
1349
+ * We prefer to update the counter without having to take a spinlock for every
1350
+ * counter update (i.e. batching). Each change to delayed allocation
1351
+ * reservations can change can easily exceed the default percpu counter
1352
+ * batching, so we use a larger batch factor here.
1353
+ *
1354
+ * Note that we don't currently have any callers requiring fast summation
1355
+ * (e.g. percpu_counter_read) so we can use a big batch value here.
1356
+ */
1357
+#define XFS_DELALLOC_BATCH (4096)
1358
+void
1359
+xfs_mod_delalloc(
1360
+ struct xfs_mount *mp,
1361
+ int64_t delta)
1362
+{
1363
+ percpu_counter_add_batch(&mp->m_delalloc_blks, delta,
1364
+ XFS_DELALLOC_BATCH);
14421365 }
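
The batch factor in the new xfs_mod_delalloc() trades summation accuracy for update cheapness: each CPU folds up to XFS_DELALLOC_BATCH worth of deltas into its local counter before touching shared state, which is fine because nothing needs a fast precise read of m_delalloc_blks. A sketch of the same idea with a thread-local accumulator (percpu_counter is kernel-only, so this is an analogy rather than the kernel API):

#include <stdatomic.h>
#include <stdint.h>

#define DELALLOC_BATCH  4096    /* mirrors XFS_DELALLOC_BATCH */

static atomic_int_fast64_t global_delalloc;     /* shared total */
static _Thread_local int64_t local_delalloc;    /* per-thread slack */

/* Batched update in the spirit of percpu_counter_add_batch(): deltas
 * accumulate thread-locally and spill into the shared counter only once
 * they exceed the batch, so most updates touch no shared cacheline.  The
 * global value may lag by up to DELALLOC_BATCH per thread. */
static void mod_delalloc(int64_t delta)
{
        local_delalloc += delta;
        if (local_delalloc >= DELALLOC_BATCH ||
            local_delalloc <= -DELALLOC_BATCH) {
                atomic_fetch_add(&global_delalloc, local_delalloc);
                local_delalloc = 0;
        }
}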