2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/fs/xfs/xfs_mount.c
@@ -12,9 +12,6 @@
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_inode.h"
 #include "xfs_dir2.h"
 #include "xfs_ialloc.h"
@@ -27,14 +24,14 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
-#include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_sysfs.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_reflink.h"
 #include "xfs_extent_busy.h"
-
+#include "xfs_health.h"
+#include "xfs_trace.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
@@ -83,9 +80,9 @@
 	}
 
 	if (hole < 0) {
-		xfs_uuid_table = kmem_realloc(xfs_uuid_table,
+		xfs_uuid_table = krealloc(xfs_uuid_table,
 			(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
-			KM_SLEEP);
+			GFP_KERNEL | __GFP_NOFAIL);
 		hole = xfs_uuid_table_size++;
 	}
 	xfs_uuid_table[hole] = *uuid;
@@ -129,7 +126,6 @@
 {
 	struct xfs_perag	*pag = container_of(head, struct xfs_perag, rcu_head);
 
-	ASSERT(atomic_read(&pag->pag_ref) == 0);
 	kmem_free(pag);
 }
 
@@ -148,9 +144,9 @@
 		pag = radix_tree_delete(&mp->m_perag_tree, agno);
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
-		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
+		xfs_iunlink_destroy(pag);
 		xfs_buf_hash_destroy(pag);
-		mutex_destroy(&pag->pag_ici_reclaim_lock);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
 	}
 }
@@ -197,26 +193,30 @@
 		}
 
 		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
-		if (!pag)
+		if (!pag) {
+			error = -ENOMEM;
 			goto out_unwind_new_pags;
+		}
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
 		spin_lock_init(&pag->pag_ici_lock);
-		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
-		if (xfs_buf_hash_init(pag))
+
+		error = xfs_buf_hash_init(pag);
+		if (error)
 			goto out_free_pag;
 		init_waitqueue_head(&pag->pagb_wait);
 		spin_lock_init(&pag->pagb_lock);
 		pag->pagb_count = 0;
 		pag->pagb_tree = RB_ROOT;
 
-		if (radix_tree_preload(GFP_NOFS))
+		error = radix_tree_preload(GFP_NOFS);
+		if (error)
 			goto out_hash_destroy;
 
 		spin_lock(&mp->m_perag_lock);
 		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
-			BUG();
+			WARN_ON_ONCE(1);
 			spin_unlock(&mp->m_perag_lock);
 			radix_tree_preload_end();
 			error = -EEXIST;
@@ -227,6 +227,10 @@
 		/* first new pag is fully initialized */
 		if (first_initialised == NULLAGNUMBER)
 			first_initialised = index;
+		error = xfs_iunlink_init(pag);
+		if (error)
+			goto out_hash_destroy;
+		spin_lock_init(&pag->pag_state_lock);
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
@@ -240,7 +244,6 @@
 out_hash_destroy:
 	xfs_buf_hash_destroy(pag);
 out_free_pag:
-	mutex_destroy(&pag->pag_ici_reclaim_lock);
 	kmem_free(pag);
 out_unwind_new_pags:
 	/* unwind any prior newly initialized pags */
@@ -249,7 +252,7 @@
 		if (!pag)
 			break;
 		xfs_buf_hash_destroy(pag);
-		mutex_destroy(&pag->pag_ici_reclaim_lock);
+		xfs_iunlink_destroy(pag);
 		kmem_free(pag);
 	}
 	return error;
@@ -307,7 +310,7 @@
 	/*
 	 * Initialize the mount structure from the superblock.
 	 */
-	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+	xfs_sb_from_disk(sbp, bp->b_addr);
 
 	/*
 	 * If we haven't validated the superblock, do so now before we try
@@ -357,132 +360,122 @@
 }
 
 /*
- * Update alignment values based on mount options and sb values
+ * If the sunit/swidth change would move the precomputed root inode value, we
+ * must reject the ondisk change because repair will stumble over that.
+ * However, we allow the mount to proceed because we never rejected this
+ * combination before.  Returns true to update the sb, false otherwise.
+ */
+static inline int
+xfs_check_new_dalign(
+	struct xfs_mount	*mp,
+	int			new_dalign,
+	bool			*update_sb)
+{
+	struct xfs_sb		*sbp = &mp->m_sb;
+	xfs_ino_t		calc_ino;
+
+	calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign);
+	trace_xfs_check_new_dalign(mp, new_dalign, calc_ino);
+
+	if (sbp->sb_rootino == calc_ino) {
+		*update_sb = true;
+		return 0;
+	}
+
+	xfs_warn(mp,
+"Cannot change stripe alignment; would require moving root inode.");
+
+	/*
+	 * XXX: Next time we add a new incompat feature, this should start
+	 * returning -EINVAL to fail the mount.  Until then, spit out a warning
+	 * that we're ignoring the administrator's instructions.
+	 */
+	xfs_warn(mp, "Skipping superblock stripe alignment update.");
+	*update_sb = false;
+	return 0;
+}
+
+/*
+ * If we were provided with new sunit/swidth values as mount options, make sure
+ * that they pass basic alignment and superblock feature checks, and convert
+ * them into the same units (FSB) that everything else expects.  This step
+ * /must/ be done before computing the inode geometry.
  */
 STATIC int
-xfs_update_alignment(xfs_mount_t *mp)
+xfs_validate_new_dalign(
+	struct xfs_mount	*mp)
 {
-	xfs_sb_t	*sbp = &(mp->m_sb);
+	if (mp->m_dalign == 0)
+		return 0;
 
-	if (mp->m_dalign) {
+	/*
+	 * If stripe unit and stripe width are not multiples
+	 * of the fs blocksize turn off alignment.
+	 */
+	if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
+	    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+		xfs_warn(mp,
+	"alignment check failed: sunit/swidth vs. blocksize(%d)",
+			mp->m_sb.sb_blocksize);
+		return -EINVAL;
+	} else {
 		/*
-		 * If stripe unit and stripe width are not multiples
-		 * of the fs blocksize turn off alignment.
+		 * Convert the stripe unit and width to FSBs.
 		 */
-		if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
-		    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+		mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+		if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
 			xfs_warn(mp,
-	"alignment check failed: sunit/swidth vs. blocksize(%d)",
-				sbp->sb_blocksize);
+		"alignment check failed: sunit/swidth vs. agsize(%d)",
+				mp->m_sb.sb_agblocks);
 			return -EINVAL;
-		} else {
-			/*
-			 * Convert the stripe unit and width to FSBs.
-			 */
-			mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
-			if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
-				xfs_warn(mp,
-			"alignment check failed: sunit/swidth vs. agsize(%d)",
-					sbp->sb_agblocks);
-				return -EINVAL;
-			} else if (mp->m_dalign) {
-				mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
-			} else {
-				xfs_warn(mp,
-		"alignment check failed: sunit(%d) less than bsize(%d)",
-					mp->m_dalign, sbp->sb_blocksize);
-				return -EINVAL;
-			}
-		}
-
-		/*
-		 * Update superblock with new values
-		 * and log changes
-		 */
-		if (xfs_sb_version_hasdalign(sbp)) {
-			if (sbp->sb_unit != mp->m_dalign) {
-				sbp->sb_unit = mp->m_dalign;
-				mp->m_update_sb = true;
-			}
-			if (sbp->sb_width != mp->m_swidth) {
-				sbp->sb_width = mp->m_swidth;
-				mp->m_update_sb = true;
-			}
+		} else if (mp->m_dalign) {
+			mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
 		} else {
 			xfs_warn(mp,
-	"cannot change alignment: superblock does not support data alignment");
+		"alignment check failed: sunit(%d) less than bsize(%d)",
+				mp->m_dalign, mp->m_sb.sb_blocksize);
 			return -EINVAL;
 		}
-	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
-		    xfs_sb_version_hasdalign(&mp->m_sb)) {
-		mp->m_dalign = sbp->sb_unit;
-		mp->m_swidth = sbp->sb_width;
+	}
+
+	if (!xfs_sb_version_hasdalign(&mp->m_sb)) {
+		xfs_warn(mp,
+"cannot change alignment: superblock does not support data alignment");
+		return -EINVAL;
 	}
 
 	return 0;
 }
 
-/*
- * Set the maximum inode count for this filesystem
- */
-STATIC void
-xfs_set_maxicount(xfs_mount_t *mp)
+/* Update alignment values based on mount options and sb values. */
+STATIC int
+xfs_update_alignment(
+	struct xfs_mount	*mp)
 {
-	xfs_sb_t	*sbp = &(mp->m_sb);
-	uint64_t	icount;
+	struct xfs_sb		*sbp = &mp->m_sb;
 
-	if (sbp->sb_imax_pct) {
-		/*
-		 * Make sure the maximum inode count is a multiple
-		 * of the units we allocate inodes in.
-		 */
-		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
-		do_div(icount, 100);
-		do_div(icount, mp->m_ialloc_blks);
-		mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
-				   sbp->sb_inopblog;
-	} else {
-		mp->m_maxicount = 0;
-	}
-}
+	if (mp->m_dalign) {
+		bool		update_sb;
+		int		error;
 
-/*
- * Set the default minimum read and write sizes unless
- * already specified in a mount option.
- * We use smaller I/O sizes when the file system
- * is being used for NFS service (wsync mount option).
- */
-STATIC void
-xfs_set_rw_sizes(xfs_mount_t *mp)
-{
-	xfs_sb_t	*sbp = &(mp->m_sb);
-	int		readio_log, writeio_log;
+		if (sbp->sb_unit == mp->m_dalign &&
+		    sbp->sb_width == mp->m_swidth)
+			return 0;
 
-	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
-		if (mp->m_flags & XFS_MOUNT_WSYNC) {
-			readio_log = XFS_WSYNC_READIO_LOG;
-			writeio_log = XFS_WSYNC_WRITEIO_LOG;
-		} else {
-			readio_log = XFS_READIO_LOG_LARGE;
-			writeio_log = XFS_WRITEIO_LOG_LARGE;
-		}
-	} else {
-		readio_log = mp->m_readio_log;
-		writeio_log = mp->m_writeio_log;
+		error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb);
+		if (error || !update_sb)
+			return error;
+
+		sbp->sb_unit = mp->m_dalign;
+		sbp->sb_width = mp->m_swidth;
+		mp->m_update_sb = true;
+	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
+		    xfs_sb_version_hasdalign(&mp->m_sb)) {
+		mp->m_dalign = sbp->sb_unit;
+		mp->m_swidth = sbp->sb_width;
 	}
 
-	if (sbp->sb_blocklog > readio_log) {
-		mp->m_readio_log = sbp->sb_blocklog;
-	} else {
-		mp->m_readio_log = readio_log;
-	}
-	mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
-	if (sbp->sb_blocklog > writeio_log) {
-		mp->m_writeio_log = sbp->sb_blocklog;
-	} else {
-		mp->m_writeio_log = writeio_log;
-	}
-	mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
+	return 0;
 }
 
 /*
@@ -500,29 +493,6 @@
 		do_div(space, 100);
 		mp->m_low_space[i] = space * (i + 1);
 	}
-}
-
-
-/*
- * Set whether we're using inode alignment.
- */
-STATIC void
-xfs_set_inoalignment(xfs_mount_t *mp)
-{
-	if (xfs_sb_version_hasalign(&mp->m_sb) &&
-	    mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
-		mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
-	else
-		mp->m_inoalign_mask = 0;
-	/*
-	 * If we are using stripe alignment, check whether
-	 * the stripe unit is a multiple of the inode alignment
-	 */
-	if (mp->m_dalign && mp->m_inoalign_mask &&
-	    !(mp->m_dalign & mp->m_inoalign_mask))
-		mp->m_sinoalign = mp->m_dalign;
-	else
-		mp->m_sinoalign = 0;
 }
 
 /*
@@ -639,7 +609,7 @@
 	    (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
 	     !xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
 	     mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
-		mp->m_flags |= XFS_MOUNT_BAD_SUMMARY;
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
 
 	/*
 	 * We can safely re-initialise incore superblock counters from the
@@ -654,10 +624,51 @@
 	 */
 	if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) ||
 	     XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) &&
-	    !(mp->m_flags & XFS_MOUNT_BAD_SUMMARY))
+	    !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
 		return 0;
 
 	return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+}
+
+/*
+ * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
+ * internal inode structures can be sitting in the CIL and AIL at this point,
+ * so we need to unpin them, write them back and/or reclaim them before unmount
+ * can proceed.
+ *
+ * An inode cluster that has been freed can have its buffer still pinned in
+ * memory because the transaction is still sitting in a iclog. The stale inodes
+ * on that buffer will be pinned to the buffer until the transaction hits the
+ * disk and the callbacks run. Pushing the AIL will skip the stale inodes and
+ * may never see the pinned buffer, so nothing will push out the iclog and
+ * unpin the buffer.
+ *
+ * Hence we need to force the log to unpin everything first. However, log
+ * forces don't wait for the discards they issue to complete, so we have to
+ * explicitly wait for them to complete here as well.
+ *
+ * Then we can tell the world we are unmounting so that error handling knows
+ * that the filesystem is going away and we should error out anything that we
+ * have been retrying in the background. This will prevent never-ending
+ * retries in AIL pushing from hanging the unmount.
+ *
+ * Finally, we can push the AIL to clean all the remaining dirty objects, then
+ * reclaim the remaining inodes that are still in memory at this point in time.
+ */
+static void
+xfs_unmount_flush_inodes(
+	struct xfs_mount	*mp)
+{
+	xfs_log_force(mp, XFS_LOG_SYNC);
+	xfs_extent_busy_wait_all(mp);
+	flush_workqueue(xfs_discard_wq);
+
+	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+
+	xfs_ail_push_all_sync(mp->m_ail);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	xfs_reclaim_inodes(mp);
+	xfs_health_unmount(mp);
 }
 
 /*
@@ -676,6 +687,7 @@
 {
 	struct xfs_sb		*sbp = &(mp->m_sb);
 	struct xfs_inode	*rip;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
 	uint64_t		resblks;
 	uint			quotamount = 0;
 	uint			quotaflags = 0;
@@ -730,28 +742,38 @@
 	}
 
 	/*
-	 * Check if sb_agblocks is aligned at stripe boundary
-	 * If sb_agblocks is NOT aligned turn off m_dalign since
-	 * allocator alignment is within an ag, therefore ag has
-	 * to be aligned at stripe boundary.
+	 * If we were given new sunit/swidth options, do some basic validation
+	 * checks and convert the incore dalign and swidth values to the
+	 * same units (FSB) that everything else uses.  This /must/ happen
+	 * before computing the inode geometry.
 	 */
-	error = xfs_update_alignment(mp);
+	error = xfs_validate_new_dalign(mp);
 	if (error)
 		goto out;
 
 	xfs_alloc_compute_maxlevels(mp);
 	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
 	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
-	xfs_ialloc_compute_maxlevels(mp);
+	xfs_ialloc_setup_geometry(mp);
 	xfs_rmapbt_compute_maxlevels(mp);
 	xfs_refcountbt_compute_maxlevels(mp);
 
-	xfs_set_maxicount(mp);
+	/*
+	 * Check if sb_agblocks is aligned at stripe boundary.  If sb_agblocks
+	 * is NOT aligned turn off m_dalign since allocator alignment is within
+	 * an ag, therefore ag has to be aligned at stripe boundary.  Note that
+	 * we must compute the free space and rmap btree geometry before doing
+	 * this.
+	 */
+	error = xfs_update_alignment(mp);
+	if (error)
+		goto out;
 
 	/* enable fail_at_unmount as default */
 	mp->m_fail_unmount = true;
 
-	error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
+	error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
+			       NULL, mp->m_super->s_id);
 	if (error)
 		goto out;
 
@@ -773,31 +795,15 @@
 		goto out_remove_errortag;
 
 	/*
-	 * Set the minimum read and write sizes
+	 * Update the preferred write size based on the information from the
+	 * on-disk superblock.
 	 */
-	xfs_set_rw_sizes(mp);
+	mp->m_allocsize_log =
+		max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log);
+	mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog);
 
 	/* set the low space thresholds for dynamic preallocation */
 	xfs_set_low_space_thresholds(mp);
-
-	/*
-	 * Set the inode cluster size.
-	 * This may still be overridden by the file system
-	 * block size if it is larger than the chosen cluster size.
-	 *
-	 * For v5 filesystems, scale the cluster size with the inode size to
-	 * keep a constant ratio of inode per cluster buffer, but only if mkfs
-	 * has set the inode alignment value appropriately for larger cluster
-	 * sizes.
-	 */
-	mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		int	new_size = mp->m_inode_cluster_size;
-
-		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
-		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
-			mp->m_inode_cluster_size = new_size;
-	}
 
 	/*
 	 * If enabled, sparse inode chunk alignment is expected to match the
@@ -806,19 +812,14 @@
 	 */
 	if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
	    mp->m_sb.sb_spino_align !=
-			XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
+			XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
 		xfs_warn(mp,
 	"Sparse inode block alignment (%u) must match cluster size (%llu).",
 			 mp->m_sb.sb_spino_align,
-			 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
+			 XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
 		error = -EINVAL;
 		goto out_remove_uuid;
 	}
-
-	/*
-	 * Set inode alignment fields
-	 */
-	xfs_set_inoalignment(mp);
 
 	/*
 	 * Check that the data (and log if separate) is an ok size.
@@ -865,9 +866,8 @@
 		goto out_free_dir;
 	}
 
-	if (!sbp->sb_logblocks) {
+	if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
 		xfs_warn(mp, "no log defined");
-		XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
 		error = -EFSCORRUPTED;
 		goto out_free_perag;
 	}
@@ -905,12 +905,10 @@
 
 	ASSERT(rip != NULL);
 
-	if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) {
+	if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) {
 		xfs_warn(mp, "corrupted root inode %llu: not a directory",
 			(unsigned long long)rip->i_ino);
 		xfs_iunlock(rip, XFS_ILOCK_EXCL);
-		XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
-				 mp);
 		error = -EFSCORRUPTED;
 		goto out_rele_rip;
 	}
@@ -969,9 +967,17 @@
 	/*
 	 * Finish recovering the file system.  This part needed to be delayed
 	 * until after the root and real-time bitmap inodes were consistently
-	 * read in.
+	 * read in.  Temporarily create per-AG space reservations for metadata
+	 * btree shape changes because space freeing transactions (for inode
+	 * inactivation) require the per-AG reservation in lieu of reserving
+	 * blocks.
 	 */
+	error = xfs_fs_reserve_ag_blocks(mp);
+	if (error && error == -ENOSPC)
+		xfs_warn(mp,
+	"ENOSPC reserving per-AG metadata pool, log recovery may fail.");
 	error = xfs_log_mount_finish(mp);
+	xfs_fs_unreserve_ag_blocks(mp);
 	if (error) {
 		xfs_warn(mp, "log mount finish failed");
 		goto out_rtunmount;
@@ -1047,7 +1053,7 @@
 	/* Clean out dquots that might be in memory after quotacheck. */
 	xfs_qm_unmount(mp);
 	/*
-	 * Cancel all delayed reclaim work and reclaim the inodes directly.
+	 * Flush all inode reclamation work and flush the log.
 	 * We have to do this /after/ rtunmount and qm_unmount because those
 	 * two will have scheduled delayed reclaim for the rt/quota inodes.
 	 *
@@ -1057,10 +1063,8 @@
 	 * qm_unmount_quotas and therefore rely on qm_unmount to release the
 	 * quota inodes.
 	 */
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	xfs_reclaim_inodes(mp, SYNC_WAIT);
+	xfs_unmount_flush_inodes(mp);
  out_log_dealloc:
-	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
 	xfs_log_mount_cancel(mp);
  out_fail_wait:
 	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -1095,52 +1099,13 @@
 	uint64_t		resblks;
 	int			error;
 
-	xfs_icache_disable_reclaim(mp);
+	xfs_stop_block_reaping(mp);
 	xfs_fs_unreserve_ag_blocks(mp);
 	xfs_qm_unmount_quotas(mp);
 	xfs_rtunmount_inodes(mp);
 	xfs_irele(mp->m_rootip);
 
-	/*
-	 * We can potentially deadlock here if we have an inode cluster
-	 * that has been freed has its buffer still pinned in memory because
-	 * the transaction is still sitting in a iclog. The stale inodes
-	 * on that buffer will have their flush locks held until the
-	 * transaction hits the disk and the callbacks run. the inode
-	 * flush takes the flush lock unconditionally and with nothing to
-	 * push out the iclog we will never get that unlocked. hence we
-	 * need to force the log first.
-	 */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
-	/*
-	 * Wait for all busy extents to be freed, including completion of
-	 * any discard operation.
-	 */
-	xfs_extent_busy_wait_all(mp);
-	flush_workqueue(xfs_discard_wq);
-
-	/*
-	 * We now need to tell the world we are unmounting. This will allow
-	 * us to detect that the filesystem is going away and we should error
-	 * out anything that we have been retrying in the background. This will
-	 * prevent neverending retries in AIL pushing from hanging the unmount.
-	 */
-	mp->m_flags |= XFS_MOUNT_UNMOUNTING;
-
-	/*
-	 * Flush all pending changes from the AIL.
-	 */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/*
-	 * And reclaim all inodes. At this point there should be no dirty
-	 * inodes and none should be pinned or locked, but use synchronous
-	 * reclaim just to be sure. We can stop background inode reclaim
-	 * here as well if it is still running.
-	 */
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	xfs_reclaim_inodes(mp, SYNC_WAIT);
+	xfs_unmount_flush_inodes(mp);
 
 	xfs_qm_unmount(mp);
 
@@ -1216,8 +1181,7 @@
 int
 xfs_log_sbcount(xfs_mount_t *mp)
 {
-	/* allow this to proceed during the freeze sequence... */
-	if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
+	if (!xfs_log_writable(mp))
 		return 0;
 
 	/*
@@ -1228,39 +1192,6 @@
 		return 0;
 
 	return xfs_sync_sb(mp, true);
-}
-
-/*
- * Deltas for the inode count are +/-64, hence we use a large batch size
- * of 128 so we don't need to take the counter lock on every update.
- */
-#define XFS_ICOUNT_BATCH	128
-int
-xfs_mod_icount(
-	struct xfs_mount	*mp,
-	int64_t			delta)
-{
-	percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
-	if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
-		ASSERT(0);
-		percpu_counter_add(&mp->m_icount, -delta);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-int
-xfs_mod_ifree(
-	struct xfs_mount	*mp,
-	int64_t			delta)
-{
-	percpu_counter_add(&mp->m_ifree, delta);
-	if (percpu_counter_compare(&mp->m_ifree, 0) < 0) {
-		ASSERT(0);
-		percpu_counter_add(&mp->m_ifree, -delta);
-		return -EINVAL;
-	}
-	return 0;
 }
 
 /*
@@ -1341,10 +1272,9 @@
 		spin_unlock(&mp->m_sb_lock);
 		return 0;
 	}
-	printk_once(KERN_WARNING
-		"Filesystem \"%s\": reserve blocks depleted! "
-		"Consider increasing reserve pool size.",
-		mp->m_fsname);
+	xfs_warn_once(mp,
+"Reserve blocks depleted! Consider increasing reserve pool size.");
+
 fdblocks_enospc:
 	spin_unlock(&mp->m_sb_lock);
 	return -ENOSPC;
@@ -1366,33 +1296,6 @@
 	mp->m_sb.sb_frextents = lcounter;
 	spin_unlock(&mp->m_sb_lock);
 	return ret;
-}
-
-/*
- * xfs_getsb() is called to obtain the buffer for the superblock.
- * The buffer is returned locked and read in from disk.
- * The buffer should be released with a call to xfs_brelse().
- *
- * If the flags parameter is BUF_TRYLOCK, then we'll only return
- * the superblock buffer if it can be locked without sleeping.
- * If it can't then we'll return NULL.
- */
-struct xfs_buf *
-xfs_getsb(
-	struct xfs_mount	*mp,
-	int			flags)
-{
-	struct xfs_buf		*bp = mp->m_sb_bp;
-
-	if (!xfs_buf_trylock(bp)) {
-		if (flags & XBF_TRYLOCK)
-			return NULL;
-		xfs_buf_lock(bp);
-	}
-
-	xfs_buf_hold(bp);
-	ASSERT(bp->b_flags & XBF_DONE);
-	return bp;
 }
 
 /*
@@ -1436,7 +1339,26 @@
 	if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
 		return;
 
-	spin_lock(&mp->m_sb_lock);
-	mp->m_flags |= XFS_MOUNT_BAD_SUMMARY;
-	spin_unlock(&mp->m_sb_lock);
+	xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
+}
+
+/*
+ * Update the in-core delayed block counter.
+ *
+ * We prefer to update the counter without having to take a spinlock for every
+ * counter update (i.e. batching).  Each change to delayed allocation
+ * reservations can easily exceed the default percpu counter
+ * batching, so we use a larger batch factor here.
+ *
+ * Note that we don't currently have any callers requiring fast summation
+ * (e.g. percpu_counter_read) so we can use a big batch value here.
+ */
+#define XFS_DELALLOC_BATCH	(4096)
+void
+xfs_mod_delalloc(
+	struct xfs_mount	*mp,
+	int64_t			delta)
+{
+	percpu_counter_add_batch(&mp->m_delalloc_blks, delta,
+			XFS_DELALLOC_BATCH);
 }