2024-05-11 04dd17822334871b23ea2862f7798fb0e0007777
kernel/fs/xfs/xfs_inode.c
....@@ -3,7 +3,6 @@
33 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
44 * All Rights Reserved.
55 */
6
-#include <linux/log2.h>
76 #include <linux/iversion.h>
87
98 #include "xfs.h"
....@@ -16,10 +15,7 @@
1615 #include "xfs_mount.h"
1716 #include "xfs_defer.h"
1817 #include "xfs_inode.h"
19
-#include "xfs_da_format.h"
20
-#include "xfs_da_btree.h"
2118 #include "xfs_dir2.h"
22
-#include "xfs_attr_sf.h"
2319 #include "xfs_attr.h"
2420 #include "xfs_trans_space.h"
2521 #include "xfs_trans.h"
....@@ -32,7 +28,6 @@
3228 #include "xfs_error.h"
3329 #include "xfs_quota.h"
3430 #include "xfs_filestream.h"
35
-#include "xfs_cksum.h"
3631 #include "xfs_trace.h"
3732 #include "xfs_icache.h"
3833 #include "xfs_symlink.h"
....@@ -40,7 +35,6 @@
4035 #include "xfs_log.h"
4136 #include "xfs_bmap_btree.h"
4237 #include "xfs_reflink.h"
43
-#include "xfs_dir2_priv.h"
4438
4539 kmem_zone_t *xfs_inode_zone;
4640
....@@ -50,7 +44,6 @@
5044 */
5145 #define XFS_ITRUNC_MAX_EXTENTS 2
5246
53
-STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
5447 STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
5548 STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
5649
....@@ -61,6 +54,12 @@
6154 xfs_get_extsz_hint(
6255 struct xfs_inode *ip)
6356 {
57
+ /*
58
+ * No point in aligning allocations if we need to COW to actually
59
+ * write to them.
60
+ */
61
+ if (xfs_is_always_cow_inode(ip))
62
+ return 0;
6463 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
6564 return ip->i_d.di_extsize;
6665 if (XFS_IS_REALTIME_INODE(ip))
....@@ -112,7 +111,7 @@
112111 {
113112 uint lock_mode = XFS_ILOCK_SHARED;
114113
115
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
114
+ if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE &&
116115 (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
117116 lock_mode = XFS_ILOCK_EXCL;
118117 xfs_ilock(ip, lock_mode);
....@@ -125,7 +124,8 @@
125124 {
126125 uint lock_mode = XFS_ILOCK_SHARED;
127126
128
- if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
127
+ if (ip->i_afp &&
128
+ ip->i_afp->if_format == XFS_DINODE_FMT_BTREE &&
129129 (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
130130 lock_mode = XFS_ILOCK_EXCL;
131131 xfs_ilock(ip, lock_mode);
....@@ -144,17 +144,17 @@
144144 *
145145 * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
146146 *
147
- * mmap_sem locking order:
147
+ * mmap_lock locking order:
148148 *
149
- * i_rwsem -> page lock -> mmap_sem
150
- * mmap_sem -> i_mmap_lock -> page_lock
149
+ * i_rwsem -> page lock -> mmap_lock
150
+ * mmap_lock -> i_mmap_lock -> page_lock
151151 *
152
- * The difference in mmap_sem locking order mean that we cannot hold the
152
+ * The difference in mmap_lock locking order mean that we cannot hold the
153153 * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
154
- * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
154
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
155155 * in get_user_pages() to map the user pages into the kernel address space for
156156 * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
157
- * page faults already hold the mmap_sem.
157
+ * page faults already hold the mmap_lock.
158158 *
159159 * Hence to serialise fully against both syscall and mmap based IO, we need to
160160 * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
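Note (illustration only, not part of the patch): the block above documents which of i_rwsem, i_mmap_lock, the page lock and mmap_lock may nest inside which, and the rename from mmap_sem to mmap_lock does not change that ordering. The underlying rule is the usual one: every path that can hold two of these locks at once must take them in one agreed order. A minimal user-space sketch of that rule, using hypothetical pthread mutexes rather than the XFS locks:

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for two locks that several code paths may need together. */
static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;	/* think i_rwsem */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;	/* think i_mmap_lock */

/*
 * Every path that needs both locks takes A before B.  If one path took
 * B then A while another took A then B, the two could block on each
 * other forever, which is why the comment above forbids taking
 * i_mmap_lock inside the syscall read/write paths.
 */
static void with_both_locks(void (*work)(void))
{
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	work();
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
}

static void say_hi(void) { printf("holding both locks\n"); }

int main(void) { with_both_locks(say_hi); return 0; }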
....@@ -441,17 +441,17 @@
441441 */
442442 static void
443443 xfs_lock_inodes(
444
- xfs_inode_t **ips,
445
- int inodes,
446
- uint lock_mode)
444
+ struct xfs_inode **ips,
445
+ int inodes,
446
+ uint lock_mode)
447447 {
448
- int attempts = 0, i, j, try_lock;
449
- xfs_log_item_t *lp;
448
+ int attempts = 0, i, j, try_lock;
449
+ struct xfs_log_item *lp;
450450
451451 /*
452452 * Currently supports between 2 and 5 inodes with exclusive locking. We
453453 * support an arbitrary depth of locking here, but absolute limits on
454
- * inodes depend on the the type of locking and the limits placed by
454
+ * inodes depend on the type of locking and the limits placed by
455455 * lockdep annotations in xfs_lock_inumorder. These are all checked by
456456 * the asserts.
457457 */
....@@ -485,7 +485,7 @@
485485 */
486486 if (!try_lock) {
487487 for (j = (i - 1); j >= 0 && !try_lock; j--) {
488
- lp = (xfs_log_item_t *)ips[j]->i_itemp;
488
+ lp = &ips[j]->i_itemp->ili_item;
489489 if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
490490 try_lock++;
491491 }
....@@ -551,7 +551,7 @@
551551 struct xfs_inode *temp;
552552 uint mode_temp;
553553 int attempts = 0;
554
- xfs_log_item_t *lp;
554
+ struct xfs_log_item *lp;
555555
556556 ASSERT(hweight32(ip0_mode) == 1);
557557 ASSERT(hweight32(ip1_mode) == 1);
....@@ -585,7 +585,7 @@
585585 * the second lock. If we can't get it, we must release the first one
586586 * and try again.
587587 */
588
- lp = (xfs_log_item_t *)ip0->i_itemp;
588
+ lp = &ip0->i_itemp->ili_item;
589589 if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
590590 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
591591 xfs_iunlock(ip0, ip0_mode);
....@@ -596,22 +596,6 @@
596596 } else {
597597 xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
598598 }
599
-}
600
-
601
-void
602
-__xfs_iflock(
603
- struct xfs_inode *ip)
604
-{
605
- wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
606
- DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
607
-
608
- do {
609
- prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
610
- if (xfs_isiflocked(ip))
611
- io_schedule();
612
- } while (!xfs_iflock_nowait(ip));
613
-
614
- finish_wait(wq, &wait.wq_entry);
615599 }
616600
617601 STATIC uint
....@@ -714,6 +698,68 @@
714698 return error;
715699 }
716700
701
+/* Propagate di_flags from a parent inode to a child inode. */
702
+static void
703
+xfs_inode_inherit_flags(
704
+ struct xfs_inode *ip,
705
+ const struct xfs_inode *pip)
706
+{
707
+ unsigned int di_flags = 0;
708
+ umode_t mode = VFS_I(ip)->i_mode;
709
+
710
+ if (S_ISDIR(mode)) {
711
+ if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
712
+ di_flags |= XFS_DIFLAG_RTINHERIT;
713
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
714
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
715
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
716
+ }
717
+ if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
718
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
719
+ } else if (S_ISREG(mode)) {
720
+ if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) &&
721
+ xfs_sb_version_hasrealtime(&ip->i_mount->m_sb))
722
+ di_flags |= XFS_DIFLAG_REALTIME;
723
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
724
+ di_flags |= XFS_DIFLAG_EXTSIZE;
725
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
726
+ }
727
+ }
728
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
729
+ xfs_inherit_noatime)
730
+ di_flags |= XFS_DIFLAG_NOATIME;
731
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
732
+ xfs_inherit_nodump)
733
+ di_flags |= XFS_DIFLAG_NODUMP;
734
+ if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
735
+ xfs_inherit_sync)
736
+ di_flags |= XFS_DIFLAG_SYNC;
737
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
738
+ xfs_inherit_nosymlinks)
739
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
740
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
741
+ xfs_inherit_nodefrag)
742
+ di_flags |= XFS_DIFLAG_NODEFRAG;
743
+ if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
744
+ di_flags |= XFS_DIFLAG_FILESTREAM;
745
+
746
+ ip->i_d.di_flags |= di_flags;
747
+}
748
+
749
+/* Propagate di_flags2 from a parent inode to a child inode. */
750
+static void
751
+xfs_inode_inherit_flags2(
752
+ struct xfs_inode *ip,
753
+ const struct xfs_inode *pip)
754
+{
755
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
756
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
757
+ ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
758
+ }
759
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
760
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
761
+}
762
+
717763 /*
718764 * Allocate an inode on disk and return a copy of its in-core version.
719765 * The in-core inode is locked exclusively. Set mode, nlink, and rdev
....@@ -756,6 +802,7 @@
756802 xfs_buf_t **ialloc_context,
757803 xfs_inode_t **ipp)
758804 {
805
+ struct inode *dir = pip ? VFS_I(pip) : NULL;
759806 struct xfs_mount *mp = tp->t_mountp;
760807 xfs_ino_t ino;
761808 xfs_inode_t *ip;
....@@ -801,26 +848,17 @@
801848 return error;
802849 ASSERT(ip != NULL);
803850 inode = VFS_I(ip);
804
-
805
- /*
806
- * We always convert v1 inodes to v2 now - we only support filesystems
807
- * with >= v2 inode capability, so there is no reason for ever leaving
808
- * an inode in v1 format.
809
- */
810
- if (ip->i_d.di_version == 1)
811
- ip->i_d.di_version = 2;
812
-
813
- inode->i_mode = mode;
814851 set_nlink(inode, nlink);
815
- ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
816
- ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
817852 inode->i_rdev = rdev;
818
- xfs_set_projid(ip, prid);
853
+ ip->i_d.di_projid = prid;
819854
820
- if (pip && XFS_INHERIT_GID(pip)) {
821
- ip->i_d.di_gid = pip->i_d.di_gid;
822
- if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
823
- inode->i_mode |= S_ISGID;
855
+ if (dir && !(dir->i_mode & S_ISGID) &&
856
+ (mp->m_flags & XFS_MOUNT_GRPID)) {
857
+ inode->i_uid = current_fsuid();
858
+ inode->i_gid = dir->i_gid;
859
+ inode->i_mode = mode;
860
+ } else {
861
+ inode_init_owner(inode, dir, mode);
824862 }
825863
826864 /*
....@@ -828,13 +866,12 @@
828866 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
829867 * (and only if the irix_sgid_inherit compatibility variable is set).
830868 */
831
- if ((irix_sgid_inherit) &&
832
- (inode->i_mode & S_ISGID) &&
833
- (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
869
+ if (irix_sgid_inherit &&
870
+ (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid))
834871 inode->i_mode &= ~S_ISGID;
835872
836873 ip->i_d.di_size = 0;
837
- ip->i_d.di_nextents = 0;
874
+ ip->i_df.if_nextents = 0;
838875 ASSERT(ip->i_d.di_nblocks == 0);
839876
840877 tv = current_time(inode);
....@@ -847,14 +884,12 @@
847884 ip->i_d.di_dmstate = 0;
848885 ip->i_d.di_flags = 0;
849886
850
- if (ip->i_d.di_version == 3) {
887
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
851888 inode_set_iversion(inode, 1);
852
- ip->i_d.di_flags2 = 0;
889
+ ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2;
853890 ip->i_d.di_cowextsize = 0;
854
- ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
855
- ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
891
+ ip->i_d.di_crtime = tv;
856892 }
857
-
858893
859894 flags = XFS_ILOG_CORE;
860895 switch (mode & S_IFMT) {
....@@ -862,70 +897,19 @@
862897 case S_IFCHR:
863898 case S_IFBLK:
864899 case S_IFSOCK:
865
- ip->i_d.di_format = XFS_DINODE_FMT_DEV;
900
+ ip->i_df.if_format = XFS_DINODE_FMT_DEV;
866901 ip->i_df.if_flags = 0;
867902 flags |= XFS_ILOG_DEV;
868903 break;
869904 case S_IFREG:
870905 case S_IFDIR:
871
- if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
872
- uint di_flags = 0;
873
-
874
- if (S_ISDIR(mode)) {
875
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
876
- di_flags |= XFS_DIFLAG_RTINHERIT;
877
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
878
- di_flags |= XFS_DIFLAG_EXTSZINHERIT;
879
- ip->i_d.di_extsize = pip->i_d.di_extsize;
880
- }
881
- if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
882
- di_flags |= XFS_DIFLAG_PROJINHERIT;
883
- } else if (S_ISREG(mode)) {
884
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
885
- di_flags |= XFS_DIFLAG_REALTIME;
886
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
887
- di_flags |= XFS_DIFLAG_EXTSIZE;
888
- ip->i_d.di_extsize = pip->i_d.di_extsize;
889
- }
890
- }
891
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
892
- xfs_inherit_noatime)
893
- di_flags |= XFS_DIFLAG_NOATIME;
894
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
895
- xfs_inherit_nodump)
896
- di_flags |= XFS_DIFLAG_NODUMP;
897
- if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
898
- xfs_inherit_sync)
899
- di_flags |= XFS_DIFLAG_SYNC;
900
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
901
- xfs_inherit_nosymlinks)
902
- di_flags |= XFS_DIFLAG_NOSYMLINKS;
903
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
904
- xfs_inherit_nodefrag)
905
- di_flags |= XFS_DIFLAG_NODEFRAG;
906
- if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
907
- di_flags |= XFS_DIFLAG_FILESTREAM;
908
-
909
- ip->i_d.di_flags |= di_flags;
910
- }
911
- if (pip &&
912
- (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
913
- pip->i_d.di_version == 3 &&
914
- ip->i_d.di_version == 3) {
915
- uint64_t di_flags2 = 0;
916
-
917
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
918
- di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
919
- ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
920
- }
921
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
922
- di_flags2 |= XFS_DIFLAG2_DAX;
923
-
924
- ip->i_d.di_flags2 |= di_flags2;
925
- }
906
+ if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY))
907
+ xfs_inode_inherit_flags(ip, pip);
908
+ if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY))
909
+ xfs_inode_inherit_flags2(ip, pip);
926910 /* FALLTHROUGH */
927911 case S_IFLNK:
928
- ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
912
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
929913 ip->i_df.if_flags = XFS_IFEXTENTS;
930914 ip->i_df.if_bytes = 0;
931915 ip->i_df.if_u1.if_root = NULL;
....@@ -933,11 +917,6 @@
933917 default:
934918 ASSERT(0);
935919 }
936
- /*
937
- * Attribute fork settings for new inode.
938
- */
939
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
940
- ip->i_d.di_anextents = 0;
941920
942921 /*
943922 * Log the new values stuffed into the inode.
....@@ -1116,17 +1095,15 @@
11161095 /*
11171096 * Increment the link count on an inode & log the change.
11181097 */
1119
-static int
1098
+static void
11201099 xfs_bumplink(
11211100 xfs_trans_t *tp,
11221101 xfs_inode_t *ip)
11231102 {
11241103 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
11251104
1126
- ASSERT(ip->i_d.di_version > 1);
11271105 inc_nlink(VFS_I(ip));
11281106 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1129
- return 0;
11301107 }
11311108
11321109 int
....@@ -1160,8 +1137,7 @@
11601137 /*
11611138 * Make sure that we have allocated dquot(s) on disk.
11621139 */
1163
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
1164
- xfs_kgid_to_gid(current_fsgid()), prid,
1140
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
11651141 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
11661142 &udqp, &gdqp, &pdqp);
11671143 if (error)
....@@ -1221,8 +1197,7 @@
12211197 unlock_dp_on_error = false;
12221198
12231199 error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1224
- resblks ?
1225
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1200
+ resblks - XFS_IALLOC_SPACE_RES(mp));
12261201 if (error) {
12271202 ASSERT(error != -ENOSPC);
12281203 goto out_trans_cancel;
....@@ -1235,9 +1210,7 @@
12351210 if (error)
12361211 goto out_trans_cancel;
12371212
1238
- error = xfs_bumplink(tp, dp);
1239
- if (error)
1240
- goto out_trans_cancel;
1213
+ xfs_bumplink(tp, dp);
12411214 }
12421215
12431216 /*
....@@ -1313,8 +1286,7 @@
13131286 /*
13141287 * Make sure that we have allocated dquot(s) on disk.
13151288 */
1316
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
1317
- xfs_kgid_to_gid(current_fsgid()), prid,
1289
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
13181290 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
13191291 &udqp, &gdqp, &pdqp);
13201292 if (error)
....@@ -1427,7 +1399,7 @@
14271399 * the tree quota mechanism could be circumvented.
14281400 */
14291401 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1430
- (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1402
+ tdp->i_d.di_projid != sip->i_d.di_projid)) {
14311403 error = -EXDEV;
14321404 goto error_return;
14331405 }
....@@ -1454,9 +1426,7 @@
14541426 xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
14551427 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
14561428
1457
- error = xfs_bumplink(tp, sip);
1458
- if (error)
1459
- goto error_return;
1429
+ xfs_bumplink(tp, sip);
14601430
14611431 /*
14621432 * If this is a synchronous mount, make sure that the
....@@ -1524,10 +1494,8 @@
15241494 struct xfs_mount *mp = ip->i_mount;
15251495 struct xfs_trans *tp = *tpp;
15261496 xfs_fileoff_t first_unmap_block;
1527
- xfs_fileoff_t last_block;
15281497 xfs_filblks_t unmap_len;
15291498 int error = 0;
1530
- int done = 0;
15311499
15321500 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
15331501 ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
....@@ -1547,33 +1515,27 @@
15471515 * the end of the file (in a crash where the space is allocated
15481516 * but the inode size is not yet updated), simply remove any
15491517 * blocks which show up between the new EOF and the maximum
1550
- * possible file size. If the first block to be removed is
1551
- * beyond the maximum file size (ie it is the same as last_block),
1552
- * then there is nothing to do.
1518
+ * possible file size.
1519
+ *
1520
+ * We have to free all the blocks to the bmbt maximum offset, even if
1521
+ * the page cache can't scale that far.
15531522 */
15541523 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1555
- last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
1556
- if (first_unmap_block == last_block)
1524
+ if (first_unmap_block >= XFS_MAX_FILEOFF) {
1525
+ WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
15571526 return 0;
1527
+ }
15581528
1559
- ASSERT(first_unmap_block < last_block);
1560
- unmap_len = last_block - first_unmap_block + 1;
1561
- while (!done) {
1529
+ unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
1530
+ while (unmap_len > 0) {
15621531 ASSERT(tp->t_firstblock == NULLFSBLOCK);
1563
- error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
1564
- XFS_ITRUNC_MAX_EXTENTS, &done);
1532
+ error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
1533
+ flags, XFS_ITRUNC_MAX_EXTENTS);
15651534 if (error)
15661535 goto out;
15671536
1568
- /*
1569
- * Duplicate the transaction that has the permanent
1570
- * reservation and commit the old transaction.
1571
- */
1537
+ /* free the just unmapped extents */
15721538 error = xfs_defer_finish(&tp);
1573
- if (error)
1574
- goto out;
1575
-
1576
- error = xfs_trans_roll_inode(&tp, ip);
15771539 if (error)
15781540 goto out;
15791541 }
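Note (illustration only, not part of the patch): the loop above no longer tracks a separate "done" flag; __xfs_bunmapi() is handed the remaining length by reference, trims off at most XFS_ITRUNC_MAX_EXTENTS extents per call, and the loop simply runs until that length reaches zero, finishing the deferred frees after each pass. The control-flow pattern, stripped of the XFS specifics and with made-up names, is:

#include <stdint.h>
#include <stdio.h>

#define MAX_PER_PASS	2	/* plays the role of XFS_ITRUNC_MAX_EXTENTS */

/* Hypothetical worker: processes up to max items and shrinks *len. */
static int unmap_some(uint64_t *len, unsigned int max)
{
	uint64_t did = (*len < max) ? *len : max;

	*len -= did;
	printf("unmapped %llu, %llu left\n",
	       (unsigned long long)did, (unsigned long long)*len);
	return 0;
}

int main(void)
{
	uint64_t remaining = 7;

	while (remaining > 0) {
		if (unmap_some(&remaining, MAX_PER_PASS))
			return 1;
		/* the real code calls xfs_defer_finish() here */
	}
	return 0;
}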
....@@ -1581,7 +1543,7 @@
15811543 if (whichfork == XFS_DATA_FORK) {
15821544 /* Remove all pending CoW reservations. */
15831545 error = xfs_reflink_cancel_cow_blocks(ip, &tp,
1584
- first_unmap_block, last_block, true);
1546
+ first_unmap_block, XFS_MAX_FILEOFF, true);
15851547 if (error)
15861548 goto out;
15871549
....@@ -1662,7 +1624,7 @@
16621624 return 0;
16631625 /*
16641626 * If we can't get the iolock just skip truncating the blocks
1665
- * past EOF because we could deadlock with the mmap_sem
1627
+ * past EOF because we could deadlock with the mmap_lock
16661628 * otherwise. We'll get another chance to drop them once the
16671629 * last reference to the inode is dropped, so we'll never leak
16681630 * blocks permanently.
....@@ -1714,7 +1676,7 @@
17141676 if (error)
17151677 goto error_trans_cancel;
17161678
1717
- ASSERT(ip->i_d.di_nextents == 0);
1679
+ ASSERT(ip->i_df.if_nextents == 0);
17181680
17191681 error = xfs_trans_commit(tp);
17201682 if (error)
....@@ -1883,7 +1845,7 @@
18831845
18841846 if (S_ISREG(VFS_I(ip)->i_mode) &&
18851847 (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1886
- ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
1848
+ ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
18871849 truncate = 1;
18881850
18891851 error = xfs_qm_dqattach(ip);
....@@ -1909,7 +1871,6 @@
19091871 }
19101872
19111873 ASSERT(!ip->i_afp);
1912
- ASSERT(ip->i_d.di_anextents == 0);
19131874 ASSERT(ip->i_d.di_forkoff == 0);
19141875
19151876 /*
....@@ -1926,6 +1887,336 @@
19261887 }
19271888
19281889 /*
1890
+ * In-Core Unlinked List Lookups
1891
+ * =============================
1892
+ *
1893
+ * Every inode is supposed to be reachable from some other piece of metadata
1894
+ * with the exception of the root directory. Inodes with a connection to a
1895
+ * file descriptor but not linked from anywhere in the on-disk directory tree
1896
+ * are collectively known as unlinked inodes, though the filesystem itself
1897
+ * maintains links to these inodes so that on-disk metadata are consistent.
1898
+ *
1899
+ * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
1900
+ * header contains a number of buckets that point to an inode, and each inode
1901
+ * record has a pointer to the next inode in the hash chain. This
1902
+ * singly-linked list causes scaling problems in the iunlink remove function
1903
+ * because we must walk that list to find the inode that points to the inode
1904
+ * being removed from the unlinked hash bucket list.
1905
+ *
1906
+ * What if we modelled the unlinked list as a collection of records capturing
1907
+ * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
1908
+ * have a fast way to look up unlinked list predecessors, which avoids the
1909
+ * slow list walk. That's exactly what we do here (in-core) with a per-AG
1910
+ * rhashtable.
1911
+ *
1912
+ * Because this is a backref cache, we ignore operational failures since the
1913
+ * iunlink code can fall back to the slow bucket walk. The only errors that
1914
+ * should bubble out are for obviously incorrect situations.
1915
+ *
1916
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
1917
+ * access or have otherwise provided for concurrency control.
1918
+ */
1919
+
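Note (illustration only, not part of the patch): the rhashtable added below is keyed on iu_next_unlinked, i.e. on the "Y" of each "X.next_unlinked = Y" relation, so a lookup by Y returns its predecessor X in one step instead of walking the bucket chain. The same idea in a tiny stand-alone form, with a plain array standing in for the per-AG hash table:

#include <assert.h>
#include <stdio.h>

#define NULLINO		(-1)
#define NINODES		16

/* next_ino[i] models "i.next_unlinked"; head models the AGI bucket head. */
static int next_ino[NINODES];
static int head = NULLINO;

/*
 * Backref map: pred_of[y] == x records "x.next_unlinked == y", which is
 * what the patch stores in the per-AG rhashtable (keyed on the next
 * pointer) so that removal never has to walk the chain.
 */
static int pred_of[NINODES];

static void iunlink_insert(int ino)
{
	next_ino[ino] = head;		/* point the new entry at the old head */
	if (head != NULLINO)
		pred_of[head] = ino;	/* remember ino.next == old head */
	head = ino;
}

static void iunlink_remove(int ino)
{
	int nxt = next_ino[ino];

	if (head == ino) {
		head = nxt;			/* removing the list head */
		if (nxt != NULLINO)
			pred_of[nxt] = NULLINO;	/* drop the stale backref */
	} else {
		int prev = pred_of[ino];	/* O(1), no bucket walk */

		next_ino[prev] = nxt;
		if (nxt != NULLINO)
			pred_of[nxt] = prev;	/* repoint the backref */
		pred_of[ino] = NULLINO;
	}
	next_ino[ino] = NULLINO;
}

int main(void)
{
	int i;

	for (i = 0; i < NINODES; i++)
		next_ino[i] = pred_of[i] = NULLINO;

	iunlink_insert(3);
	iunlink_insert(7);
	iunlink_insert(9);		/* bucket: 9 -> 7 -> 3 */
	iunlink_remove(7);		/* predecessor of 7 found via pred_of */
	assert(head == 9 && next_ino[9] == 3);
	printf("bucket: %d -> %d\n", head, next_ino[head]);
	return 0;
}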
1920
+/* Capture a "X.next_unlinked = Y" relationship. */
1921
+struct xfs_iunlink {
1922
+ struct rhash_head iu_rhash_head;
1923
+ xfs_agino_t iu_agino; /* X */
1924
+ xfs_agino_t iu_next_unlinked; /* Y */
1925
+};
1926
+
1927
+/* Unlinked list predecessor lookup hashtable construction */
1928
+static int
1929
+xfs_iunlink_obj_cmpfn(
1930
+ struct rhashtable_compare_arg *arg,
1931
+ const void *obj)
1932
+{
1933
+ const xfs_agino_t *key = arg->key;
1934
+ const struct xfs_iunlink *iu = obj;
1935
+
1936
+ if (iu->iu_next_unlinked != *key)
1937
+ return 1;
1938
+ return 0;
1939
+}
1940
+
1941
+static const struct rhashtable_params xfs_iunlink_hash_params = {
1942
+ .min_size = XFS_AGI_UNLINKED_BUCKETS,
1943
+ .key_len = sizeof(xfs_agino_t),
1944
+ .key_offset = offsetof(struct xfs_iunlink,
1945
+ iu_next_unlinked),
1946
+ .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
1947
+ .automatic_shrinking = true,
1948
+ .obj_cmpfn = xfs_iunlink_obj_cmpfn,
1949
+};
1950
+
1951
+/*
1952
+ * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
1953
+ * relation is found.
1954
+ */
1955
+static xfs_agino_t
1956
+xfs_iunlink_lookup_backref(
1957
+ struct xfs_perag *pag,
1958
+ xfs_agino_t agino)
1959
+{
1960
+ struct xfs_iunlink *iu;
1961
+
1962
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
1963
+ xfs_iunlink_hash_params);
1964
+ return iu ? iu->iu_agino : NULLAGINO;
1965
+}
1966
+
1967
+/*
1968
+ * Take ownership of an iunlink cache entry and insert it into the hash table.
1969
+ * If successful, the entry will be owned by the cache; if not, it is freed.
1970
+ * Either way, the caller does not own @iu after this call.
1971
+ */
1972
+static int
1973
+xfs_iunlink_insert_backref(
1974
+ struct xfs_perag *pag,
1975
+ struct xfs_iunlink *iu)
1976
+{
1977
+ int error;
1978
+
1979
+ error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
1980
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
1981
+ /*
1982
+ * Fail loudly if there already was an entry because that's a sign of
1983
+ * corruption of in-memory data. Also fail loudly if we see an error
1984
+ * code we didn't anticipate from the rhashtable code. Currently we
1985
+ * only anticipate ENOMEM.
1986
+ */
1987
+ if (error) {
1988
+ WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
1989
+ kmem_free(iu);
1990
+ }
1991
+ /*
1992
+ * Absorb any runtime errors that aren't a result of corruption because
1993
+ * this is a cache and we can always fall back to bucket list scanning.
1994
+ */
1995
+ if (error != 0 && error != -EEXIST)
1996
+ error = 0;
1997
+ return error;
1998
+}
1999
+
2000
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
2001
+static int
2002
+xfs_iunlink_add_backref(
2003
+ struct xfs_perag *pag,
2004
+ xfs_agino_t prev_agino,
2005
+ xfs_agino_t this_agino)
2006
+{
2007
+ struct xfs_iunlink *iu;
2008
+
2009
+ if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
2010
+ return 0;
2011
+
2012
+ iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
2013
+ iu->iu_agino = prev_agino;
2014
+ iu->iu_next_unlinked = this_agino;
2015
+
2016
+ return xfs_iunlink_insert_backref(pag, iu);
2017
+}
2018
+
2019
+/*
2020
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
2021
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
2022
+ * wasn't any such entry then we don't bother.
2023
+ */
2024
+static int
2025
+xfs_iunlink_change_backref(
2026
+ struct xfs_perag *pag,
2027
+ xfs_agino_t agino,
2028
+ xfs_agino_t next_unlinked)
2029
+{
2030
+ struct xfs_iunlink *iu;
2031
+ int error;
2032
+
2033
+ /* Look up the old entry; if there wasn't one then exit. */
2034
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
2035
+ xfs_iunlink_hash_params);
2036
+ if (!iu)
2037
+ return 0;
2038
+
2039
+ /*
2040
+ * Remove the entry. This shouldn't ever return an error, but if we
2041
+ * couldn't remove the old entry we don't want to add it again to the
2042
+ * hash table, and if the entry disappeared on us then someone's
2043
+ * violated the locking rules and we need to fail loudly. Either way
2044
+ * we cannot remove the inode because internal state is or would have
2045
+ * been corrupt.
2046
+ */
2047
+ error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
2048
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
2049
+ if (error)
2050
+ return error;
2051
+
2052
+ /* If there is no new next entry just free our item and return. */
2053
+ if (next_unlinked == NULLAGINO) {
2054
+ kmem_free(iu);
2055
+ return 0;
2056
+ }
2057
+
2058
+ /* Update the entry and re-add it to the hash table. */
2059
+ iu->iu_next_unlinked = next_unlinked;
2060
+ return xfs_iunlink_insert_backref(pag, iu);
2061
+}
2062
+
2063
+/* Set up the in-core predecessor structures. */
2064
+int
2065
+xfs_iunlink_init(
2066
+ struct xfs_perag *pag)
2067
+{
2068
+ return rhashtable_init(&pag->pagi_unlinked_hash,
2069
+ &xfs_iunlink_hash_params);
2070
+}
2071
+
2072
+/* Free the in-core predecessor structures. */
2073
+static void
2074
+xfs_iunlink_free_item(
2075
+ void *ptr,
2076
+ void *arg)
2077
+{
2078
+ struct xfs_iunlink *iu = ptr;
2079
+ bool *freed_anything = arg;
2080
+
2081
+ *freed_anything = true;
2082
+ kmem_free(iu);
2083
+}
2084
+
2085
+void
2086
+xfs_iunlink_destroy(
2087
+ struct xfs_perag *pag)
2088
+{
2089
+ bool freed_anything = false;
2090
+
2091
+ rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
2092
+ xfs_iunlink_free_item, &freed_anything);
2093
+
2094
+ ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
2095
+}
2096
+
2097
+/*
2098
+ * Point the AGI unlinked bucket at an inode and log the results. The caller
2099
+ * is responsible for validating the old value.
2100
+ */
2101
+STATIC int
2102
+xfs_iunlink_update_bucket(
2103
+ struct xfs_trans *tp,
2104
+ xfs_agnumber_t agno,
2105
+ struct xfs_buf *agibp,
2106
+ unsigned int bucket_index,
2107
+ xfs_agino_t new_agino)
2108
+{
2109
+ struct xfs_agi *agi = agibp->b_addr;
2110
+ xfs_agino_t old_value;
2111
+ int offset;
2112
+
2113
+ ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
2114
+
2115
+ old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2116
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
2117
+ old_value, new_agino);
2118
+
2119
+ /*
2120
+ * We should never find the head of the list already set to the value
2121
+ * passed in because either we're adding or removing ourselves from the
2122
+ * head of the list.
2123
+ */
2124
+ if (old_value == new_agino) {
2125
+ xfs_buf_mark_corrupt(agibp);
2126
+ return -EFSCORRUPTED;
2127
+ }
2128
+
2129
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
2130
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
2131
+ (sizeof(xfs_agino_t) * bucket_index);
2132
+ xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
2133
+ return 0;
2134
+}
2135
+
2136
+/* Set an on-disk inode's next_unlinked pointer. */
2137
+STATIC void
2138
+xfs_iunlink_update_dinode(
2139
+ struct xfs_trans *tp,
2140
+ xfs_agnumber_t agno,
2141
+ xfs_agino_t agino,
2142
+ struct xfs_buf *ibp,
2143
+ struct xfs_dinode *dip,
2144
+ struct xfs_imap *imap,
2145
+ xfs_agino_t next_agino)
2146
+{
2147
+ struct xfs_mount *mp = tp->t_mountp;
2148
+ int offset;
2149
+
2150
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
2151
+
2152
+ trace_xfs_iunlink_update_dinode(mp, agno, agino,
2153
+ be32_to_cpu(dip->di_next_unlinked), next_agino);
2154
+
2155
+ dip->di_next_unlinked = cpu_to_be32(next_agino);
2156
+ offset = imap->im_boffset +
2157
+ offsetof(struct xfs_dinode, di_next_unlinked);
2158
+
2159
+ /* need to recalc the inode CRC if appropriate */
2160
+ xfs_dinode_calc_crc(mp, dip);
2161
+ xfs_trans_inode_buf(tp, ibp);
2162
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
2163
+}
2164
+
2165
+/* Set an in-core inode's unlinked pointer and return the old value. */
2166
+STATIC int
2167
+xfs_iunlink_update_inode(
2168
+ struct xfs_trans *tp,
2169
+ struct xfs_inode *ip,
2170
+ xfs_agnumber_t agno,
2171
+ xfs_agino_t next_agino,
2172
+ xfs_agino_t *old_next_agino)
2173
+{
2174
+ struct xfs_mount *mp = tp->t_mountp;
2175
+ struct xfs_dinode *dip;
2176
+ struct xfs_buf *ibp;
2177
+ xfs_agino_t old_value;
2178
+ int error;
2179
+
2180
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
2181
+
2182
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0);
2183
+ if (error)
2184
+ return error;
2185
+
2186
+ /* Make sure the old pointer isn't garbage. */
2187
+ old_value = be32_to_cpu(dip->di_next_unlinked);
2188
+ if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
2189
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
2190
+ sizeof(*dip), __this_address);
2191
+ error = -EFSCORRUPTED;
2192
+ goto out;
2193
+ }
2194
+
2195
+ /*
2196
+ * Since we're updating a linked list, we should never find that the
2197
+ * current pointer is the same as the new value, unless we're
2198
+ * terminating the list.
2199
+ */
2200
+ *old_next_agino = old_value;
2201
+ if (old_value == next_agino) {
2202
+ if (next_agino != NULLAGINO) {
2203
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
2204
+ dip, sizeof(*dip), __this_address);
2205
+ error = -EFSCORRUPTED;
2206
+ }
2207
+ goto out;
2208
+ }
2209
+
2210
+ /* Ok, update the new pointer. */
2211
+ xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
2212
+ ibp, dip, &ip->i_imap, next_agino);
2213
+ return 0;
2214
+out:
2215
+ xfs_trans_brelse(tp, ibp);
2216
+ return error;
2217
+}
2218
+
2219
+/*
19292220 * This is called when the inode's link count has gone to 0 or we are creating
19302221 * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
19312222 *
....@@ -1934,76 +2225,177 @@
19342225 */
19352226 STATIC int
19362227 xfs_iunlink(
1937
- struct xfs_trans *tp,
1938
- struct xfs_inode *ip)
2228
+ struct xfs_trans *tp,
2229
+ struct xfs_inode *ip)
19392230 {
1940
- xfs_mount_t *mp = tp->t_mountp;
1941
- xfs_agi_t *agi;
1942
- xfs_dinode_t *dip;
1943
- xfs_buf_t *agibp;
1944
- xfs_buf_t *ibp;
1945
- xfs_agino_t agino;
1946
- short bucket_index;
1947
- int offset;
1948
- int error;
2231
+ struct xfs_mount *mp = tp->t_mountp;
2232
+ struct xfs_agi *agi;
2233
+ struct xfs_buf *agibp;
2234
+ xfs_agino_t next_agino;
2235
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2236
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2237
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2238
+ int error;
19492239
19502240 ASSERT(VFS_I(ip)->i_nlink == 0);
19512241 ASSERT(VFS_I(ip)->i_mode != 0);
2242
+ trace_xfs_iunlink(ip);
19522243
1953
- /*
1954
- * Get the agi buffer first. It ensures lock ordering
1955
- * on the list.
1956
- */
1957
- error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
2244
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
2245
+ error = xfs_read_agi(mp, tp, agno, &agibp);
19582246 if (error)
19592247 return error;
1960
- agi = XFS_BUF_TO_AGI(agibp);
2248
+ agi = agibp->b_addr;
19612249
19622250 /*
1963
- * Get the index into the agi hash table for the
1964
- * list this inode will go on.
2251
+ * Get the index into the agi hash table for the list this inode will
2252
+ * go on. Make sure the pointer isn't garbage and that this inode
2253
+ * isn't already on the list.
19652254 */
1966
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1967
- ASSERT(agino != 0);
1968
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1969
- ASSERT(agi->agi_unlinked[bucket_index]);
1970
- ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
2255
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2256
+ if (next_agino == agino ||
2257
+ !xfs_verify_agino_or_null(mp, agno, next_agino)) {
2258
+ xfs_buf_mark_corrupt(agibp);
2259
+ return -EFSCORRUPTED;
2260
+ }
19712261
1972
- if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
2262
+ if (next_agino != NULLAGINO) {
2263
+ xfs_agino_t old_agino;
2264
+
19732265 /*
1974
- * There is already another inode in the bucket we need
1975
- * to add ourselves to. Add us at the front of the list.
1976
- * Here we put the head pointer into our next pointer,
1977
- * and then we fall through to point the head at us.
2266
+ * There is already another inode in the bucket, so point this
2267
+ * inode to the current head of the list.
19782268 */
1979
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1980
- 0, 0);
2269
+ error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
2270
+ &old_agino);
2271
+ if (error)
2272
+ return error;
2273
+ ASSERT(old_agino == NULLAGINO);
2274
+
2275
+ /*
2276
+ * agino has been unlinked, add a backref from the next inode
2277
+ * back to agino.
2278
+ */
2279
+ error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
2280
+ if (error)
2281
+ return error;
2282
+ }
2283
+
2284
+ /* Point the head of the list to point to this inode. */
2285
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
2286
+}
2287
+
2288
+/* Return the imap, dinode pointer, and buffer for an inode. */
2289
+STATIC int
2290
+xfs_iunlink_map_ino(
2291
+ struct xfs_trans *tp,
2292
+ xfs_agnumber_t agno,
2293
+ xfs_agino_t agino,
2294
+ struct xfs_imap *imap,
2295
+ struct xfs_dinode **dipp,
2296
+ struct xfs_buf **bpp)
2297
+{
2298
+ struct xfs_mount *mp = tp->t_mountp;
2299
+ int error;
2300
+
2301
+ imap->im_blkno = 0;
2302
+ error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
2303
+ if (error) {
2304
+ xfs_warn(mp, "%s: xfs_imap returned error %d.",
2305
+ __func__, error);
2306
+ return error;
2307
+ }
2308
+
2309
+ error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0);
2310
+ if (error) {
2311
+ xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
2312
+ __func__, error);
2313
+ return error;
2314
+ }
2315
+
2316
+ return 0;
2317
+}
2318
+
2319
+/*
2320
+ * Walk the unlinked chain from @head_agino until we find the inode that
2321
+ * points to @target_agino. Return the inode number, map, dinode pointer,
2322
+ * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
2323
+ *
2324
+ * @tp, @pag, @head_agino, and @target_agino are input parameters.
2325
+ * @agino, @imap, @dipp, and @bpp are all output parameters.
2326
+ *
2327
+ * Do not call this function if @target_agino is the head of the list.
2328
+ */
2329
+STATIC int
2330
+xfs_iunlink_map_prev(
2331
+ struct xfs_trans *tp,
2332
+ xfs_agnumber_t agno,
2333
+ xfs_agino_t head_agino,
2334
+ xfs_agino_t target_agino,
2335
+ xfs_agino_t *agino,
2336
+ struct xfs_imap *imap,
2337
+ struct xfs_dinode **dipp,
2338
+ struct xfs_buf **bpp,
2339
+ struct xfs_perag *pag)
2340
+{
2341
+ struct xfs_mount *mp = tp->t_mountp;
2342
+ xfs_agino_t next_agino;
2343
+ int error;
2344
+
2345
+ ASSERT(head_agino != target_agino);
2346
+ *bpp = NULL;
2347
+
2348
+ /* See if our backref cache can find it faster. */
2349
+ *agino = xfs_iunlink_lookup_backref(pag, target_agino);
2350
+ if (*agino != NULLAGINO) {
2351
+ error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
19812352 if (error)
19822353 return error;
19832354
1984
- ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
1985
- dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1986
- offset = ip->i_imap.im_boffset +
1987
- offsetof(xfs_dinode_t, di_next_unlinked);
2355
+ if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
2356
+ return 0;
19882357
1989
- /* need to recalc the inode CRC if appropriate */
1990
- xfs_dinode_calc_crc(mp, dip);
1991
-
1992
- xfs_trans_inode_buf(tp, ibp);
1993
- xfs_trans_log_buf(tp, ibp, offset,
1994
- (offset + sizeof(xfs_agino_t) - 1));
1995
- xfs_inobp_check(mp, ibp);
2358
+ /*
2359
+ * If we get here the cache contents were corrupt, so drop the
2360
+ * buffer and fall back to walking the bucket list.
2361
+ */
2362
+ xfs_trans_brelse(tp, *bpp);
2363
+ *bpp = NULL;
2364
+ WARN_ON_ONCE(1);
19962365 }
19972366
1998
- /*
1999
- * Point the bucket head pointer at the inode being inserted.
2000
- */
2001
- ASSERT(agino != 0);
2002
- agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
2003
- offset = offsetof(xfs_agi_t, agi_unlinked) +
2004
- (sizeof(xfs_agino_t) * bucket_index);
2005
- xfs_trans_log_buf(tp, agibp, offset,
2006
- (offset + sizeof(xfs_agino_t) - 1));
2367
+ trace_xfs_iunlink_map_prev_fallback(mp, agno);
2368
+
2369
+ /* Otherwise, walk the entire bucket until we find it. */
2370
+ next_agino = head_agino;
2371
+ while (next_agino != target_agino) {
2372
+ xfs_agino_t unlinked_agino;
2373
+
2374
+ if (*bpp)
2375
+ xfs_trans_brelse(tp, *bpp);
2376
+
2377
+ *agino = next_agino;
2378
+ error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
2379
+ bpp);
2380
+ if (error)
2381
+ return error;
2382
+
2383
+ unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
2384
+ /*
2385
+ * Make sure this pointer is valid and isn't an obvious
2386
+ * infinite loop.
2387
+ */
2388
+ if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
2389
+ next_agino == unlinked_agino) {
2390
+ XFS_CORRUPTION_ERROR(__func__,
2391
+ XFS_ERRLEVEL_LOW, mp,
2392
+ *dipp, sizeof(**dipp));
2393
+ error = -EFSCORRUPTED;
2394
+ return error;
2395
+ }
2396
+ next_agino = unlinked_agino;
2397
+ }
2398
+
20072399 return 0;
20082400 }
20092401
....@@ -2012,181 +2404,190 @@
20122404 */
20132405 STATIC int
20142406 xfs_iunlink_remove(
2015
- xfs_trans_t *tp,
2016
- xfs_inode_t *ip)
2407
+ struct xfs_trans *tp,
2408
+ struct xfs_inode *ip)
20172409 {
2018
- xfs_ino_t next_ino;
2019
- xfs_mount_t *mp;
2020
- xfs_agi_t *agi;
2021
- xfs_dinode_t *dip;
2022
- xfs_buf_t *agibp;
2023
- xfs_buf_t *ibp;
2024
- xfs_agnumber_t agno;
2025
- xfs_agino_t agino;
2026
- xfs_agino_t next_agino;
2027
- xfs_buf_t *last_ibp;
2028
- xfs_dinode_t *last_dip = NULL;
2029
- short bucket_index;
2030
- int offset, last_offset = 0;
2031
- int error;
2410
+ struct xfs_mount *mp = tp->t_mountp;
2411
+ struct xfs_agi *agi;
2412
+ struct xfs_buf *agibp;
2413
+ struct xfs_buf *last_ibp;
2414
+ struct xfs_dinode *last_dip = NULL;
2415
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2416
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2417
+ xfs_agino_t next_agino;
2418
+ xfs_agino_t head_agino;
2419
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2420
+ int error;
20322421
2033
- mp = tp->t_mountp;
2034
- agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2422
+ trace_xfs_iunlink_remove(ip);
20352423
2036
- /*
2037
- * Get the agi buffer first. It ensures lock ordering
2038
- * on the list.
2039
- */
2424
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
20402425 error = xfs_read_agi(mp, tp, agno, &agibp);
20412426 if (error)
20422427 return error;
2043
-
2044
- agi = XFS_BUF_TO_AGI(agibp);
2428
+ agi = agibp->b_addr;
20452429
20462430 /*
2047
- * Get the index into the agi hash table for the
2048
- * list this inode will go on.
2431
+ * Get the index into the agi hash table for the list this inode will
2432
+ * go on. Make sure the head pointer isn't garbage.
20492433 */
2050
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2051
- if (!xfs_verify_agino(mp, agno, agino))
2052
- return -EFSCORRUPTED;
2053
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2054
- if (!xfs_verify_agino(mp, agno,
2055
- be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
2434
+ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2435
+ if (!xfs_verify_agino(mp, agno, head_agino)) {
20562436 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
20572437 agi, sizeof(*agi));
20582438 return -EFSCORRUPTED;
20592439 }
20602440
2061
- if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
2062
- /*
2063
- * We're at the head of the list. Get the inode's on-disk
2064
- * buffer to see if there is anyone after us on the list.
2065
- * Only modify our next pointer if it is not already NULLAGINO.
2066
- * This saves us the overhead of dealing with the buffer when
2067
- * there is no need to change it.
2068
- */
2069
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2070
- 0, 0);
2071
- if (error) {
2072
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
2073
- __func__, error);
2441
+ /*
2442
+ * Set our inode's next_unlinked pointer to NULL and then return
2443
+ * the old pointer value so that we can update whatever was previous
2444
+ * to us in the list to point to whatever was next in the list.
2445
+ */
2446
+ error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
2447
+ if (error)
2448
+ return error;
2449
+
2450
+ /*
2451
+ * If there was a backref pointing from the next inode back to this
2452
+ * one, remove it because we've removed this inode from the list.
2453
+ *
2454
+ * Later, if this inode was in the middle of the list we'll update
2455
+ * this inode's backref to point from the next inode.
2456
+ */
2457
+ if (next_agino != NULLAGINO) {
2458
+ error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
2459
+ NULLAGINO);
2460
+ if (error)
20742461 return error;
2075
- }
2076
- next_agino = be32_to_cpu(dip->di_next_unlinked);
2077
- ASSERT(next_agino != 0);
2078
- if (next_agino != NULLAGINO) {
2079
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2080
- offset = ip->i_imap.im_boffset +
2081
- offsetof(xfs_dinode_t, di_next_unlinked);
2082
-
2083
- /* need to recalc the inode CRC if appropriate */
2084
- xfs_dinode_calc_crc(mp, dip);
2085
-
2086
- xfs_trans_inode_buf(tp, ibp);
2087
- xfs_trans_log_buf(tp, ibp, offset,
2088
- (offset + sizeof(xfs_agino_t) - 1));
2089
- xfs_inobp_check(mp, ibp);
2090
- } else {
2091
- xfs_trans_brelse(tp, ibp);
2092
- }
2093
- /*
2094
- * Point the bucket head pointer at the next inode.
2095
- */
2096
- ASSERT(next_agino != 0);
2097
- ASSERT(next_agino != agino);
2098
- agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
2099
- offset = offsetof(xfs_agi_t, agi_unlinked) +
2100
- (sizeof(xfs_agino_t) * bucket_index);
2101
- xfs_trans_log_buf(tp, agibp, offset,
2102
- (offset + sizeof(xfs_agino_t) - 1));
2103
- } else {
2104
- /*
2105
- * We need to search the list for the inode being freed.
2106
- */
2107
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2108
- last_ibp = NULL;
2109
- while (next_agino != agino) {
2110
- struct xfs_imap imap;
2111
-
2112
- if (last_ibp)
2113
- xfs_trans_brelse(tp, last_ibp);
2114
-
2115
- imap.im_blkno = 0;
2116
- next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
2117
-
2118
- error = xfs_imap(mp, tp, next_ino, &imap, 0);
2119
- if (error) {
2120
- xfs_warn(mp,
2121
- "%s: xfs_imap returned error %d.",
2122
- __func__, error);
2123
- return error;
2124
- }
2125
-
2126
- error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
2127
- &last_ibp, 0, 0);
2128
- if (error) {
2129
- xfs_warn(mp,
2130
- "%s: xfs_imap_to_bp returned error %d.",
2131
- __func__, error);
2132
- return error;
2133
- }
2134
-
2135
- last_offset = imap.im_boffset;
2136
- next_agino = be32_to_cpu(last_dip->di_next_unlinked);
2137
- if (!xfs_verify_agino(mp, agno, next_agino)) {
2138
- XFS_CORRUPTION_ERROR(__func__,
2139
- XFS_ERRLEVEL_LOW, mp,
2140
- last_dip, sizeof(*last_dip));
2141
- return -EFSCORRUPTED;
2142
- }
2143
- }
2144
-
2145
- /*
2146
- * Now last_ibp points to the buffer previous to us on the
2147
- * unlinked list. Pull us from the list.
2148
- */
2149
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2150
- 0, 0);
2151
- if (error) {
2152
- xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
2153
- __func__, error);
2154
- return error;
2155
- }
2156
- next_agino = be32_to_cpu(dip->di_next_unlinked);
2157
- ASSERT(next_agino != 0);
2158
- ASSERT(next_agino != agino);
2159
- if (next_agino != NULLAGINO) {
2160
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2161
- offset = ip->i_imap.im_boffset +
2162
- offsetof(xfs_dinode_t, di_next_unlinked);
2163
-
2164
- /* need to recalc the inode CRC if appropriate */
2165
- xfs_dinode_calc_crc(mp, dip);
2166
-
2167
- xfs_trans_inode_buf(tp, ibp);
2168
- xfs_trans_log_buf(tp, ibp, offset,
2169
- (offset + sizeof(xfs_agino_t) - 1));
2170
- xfs_inobp_check(mp, ibp);
2171
- } else {
2172
- xfs_trans_brelse(tp, ibp);
2173
- }
2174
- /*
2175
- * Point the previous inode on the list to the next inode.
2176
- */
2177
- last_dip->di_next_unlinked = cpu_to_be32(next_agino);
2178
- ASSERT(next_agino != 0);
2179
- offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2180
-
2181
- /* need to recalc the inode CRC if appropriate */
2182
- xfs_dinode_calc_crc(mp, last_dip);
2183
-
2184
- xfs_trans_inode_buf(tp, last_ibp);
2185
- xfs_trans_log_buf(tp, last_ibp, offset,
2186
- (offset + sizeof(xfs_agino_t) - 1));
2187
- xfs_inobp_check(mp, last_ibp);
21882462 }
2189
- return 0;
2463
+
2464
+ if (head_agino != agino) {
2465
+ struct xfs_imap imap;
2466
+ xfs_agino_t prev_agino;
2467
+
2468
+ /* We need to search the list for the inode being freed. */
2469
+ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
2470
+ &prev_agino, &imap, &last_dip, &last_ibp,
2471
+ agibp->b_pag);
2472
+ if (error)
2473
+ return error;
2474
+
2475
+ /* Point the previous inode on the list to the next inode. */
2476
+ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
2477
+ last_dip, &imap, next_agino);
2478
+
2479
+ /*
2480
+ * Now we deal with the backref for this inode. If this inode
2481
+ * pointed at a real inode, change the backref that pointed to
2482
+ * us to point to our old next. If this inode was the end of
2483
+ * the list, delete the backref that pointed to us. Note that
2484
+ * change_backref takes care of deleting the backref if
2485
+ * next_agino is NULLAGINO.
2486
+ */
2487
+ return xfs_iunlink_change_backref(agibp->b_pag, agino,
2488
+ next_agino);
2489
+ }
2490
+
2491
+ /* Point the head of the list to the next unlinked inode. */
2492
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
2493
+ next_agino);
2494
+}
2495
+
2496
+/*
2497
+ * Look up the inode number specified and if it is not already marked XFS_ISTALE
2498
+ * mark it stale. We should only find clean inodes in this lookup that aren't
2499
+ * already stale.
2500
+ */
2501
+static void
2502
+xfs_ifree_mark_inode_stale(
2503
+ struct xfs_buf *bp,
2504
+ struct xfs_inode *free_ip,
2505
+ xfs_ino_t inum)
2506
+{
2507
+ struct xfs_mount *mp = bp->b_mount;
2508
+ struct xfs_perag *pag = bp->b_pag;
2509
+ struct xfs_inode_log_item *iip;
2510
+ struct xfs_inode *ip;
2511
+
2512
+retry:
2513
+ rcu_read_lock();
2514
+ ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
2515
+
2516
+ /* Inode not in memory, nothing to do */
2517
+ if (!ip) {
2518
+ rcu_read_unlock();
2519
+ return;
2520
+ }
2521
+
2522
+ /*
2523
+ * because this is an RCU protected lookup, we could find a recently
2524
+ * freed or even reallocated inode during the lookup. We need to check
2525
+ * under the i_flags_lock for a valid inode here. Skip it if it is not
2526
+ * valid, the wrong inode or stale.
2527
+ */
2528
+ spin_lock(&ip->i_flags_lock);
2529
+ if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
2530
+ goto out_iflags_unlock;
2531
+
2532
+ /*
2533
+ * Don't try to lock/unlock the current inode, but we _cannot_ skip the
2534
+ * other inodes that we did not find in the list attached to the buffer
2535
+ * and are not already marked stale. If we can't lock it, back off and
2536
+ * retry.
2537
+ */
2538
+ if (ip != free_ip) {
2539
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2540
+ spin_unlock(&ip->i_flags_lock);
2541
+ rcu_read_unlock();
2542
+ delay(1);
2543
+ goto retry;
2544
+ }
2545
+ }
2546
+ ip->i_flags |= XFS_ISTALE;
2547
+
2548
+ /*
2549
+ * If the inode is flushing, it is already attached to the buffer. All
2550
+ * we needed to do here is mark the inode stale so buffer IO completion
2551
+ * will remove it from the AIL.
2552
+ */
2553
+ iip = ip->i_itemp;
2554
+ if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
2555
+ ASSERT(!list_empty(&iip->ili_item.li_bio_list));
2556
+ ASSERT(iip->ili_last_fields);
2557
+ goto out_iunlock;
2558
+ }
2559
+
2560
+ /*
2561
+ * Inodes not attached to the buffer can be released immediately.
2562
+ * Everything else has to go through xfs_iflush_abort() on journal
2563
+ * commit as the flock synchronises removal of the inode from the
2564
+ * cluster buffer against inode reclaim.
2565
+ */
2566
+ if (!iip || list_empty(&iip->ili_item.li_bio_list))
2567
+ goto out_iunlock;
2568
+
2569
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
2570
+ spin_unlock(&ip->i_flags_lock);
2571
+ rcu_read_unlock();
2572
+
2573
+ /* we have a dirty inode in memory that has not yet been flushed. */
2574
+ spin_lock(&iip->ili_lock);
2575
+ iip->ili_last_fields = iip->ili_fields;
2576
+ iip->ili_fields = 0;
2577
+ iip->ili_fsync_fields = 0;
2578
+ spin_unlock(&iip->ili_lock);
2579
+ ASSERT(iip->ili_last_fields);
2580
+
2581
+ if (ip != free_ip)
2582
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
2583
+ return;
2584
+
2585
+out_iunlock:
2586
+ if (ip != free_ip)
2587
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
2588
+out_iflags_unlock:
2589
+ spin_unlock(&ip->i_flags_lock);
2590
+ rcu_read_unlock();
21902591 }
21912592
21922593 /*
....@@ -2196,31 +2597,23 @@
21962597 */
21972598 STATIC int
21982599 xfs_ifree_cluster(
2199
- xfs_inode_t *free_ip,
2200
- xfs_trans_t *tp,
2600
+ struct xfs_inode *free_ip,
2601
+ struct xfs_trans *tp,
22012602 struct xfs_icluster *xic)
22022603 {
2203
- xfs_mount_t *mp = free_ip->i_mount;
2204
- int blks_per_cluster;
2205
- int inodes_per_cluster;
2604
+ struct xfs_mount *mp = free_ip->i_mount;
2605
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
2606
+ struct xfs_buf *bp;
2607
+ xfs_daddr_t blkno;
2608
+ xfs_ino_t inum = xic->first_ino;
22062609 int nbufs;
22072610 int i, j;
22082611 int ioffset;
2209
- xfs_daddr_t blkno;
2210
- xfs_buf_t *bp;
2211
- xfs_inode_t *ip;
2212
- xfs_inode_log_item_t *iip;
2213
- struct xfs_log_item *lip;
2214
- struct xfs_perag *pag;
2215
- xfs_ino_t inum;
2612
+ int error;
22162613
2217
- inum = xic->first_ino;
2218
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2219
- blks_per_cluster = xfs_icluster_size_fsb(mp);
2220
- inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
2221
- nbufs = mp->m_ialloc_blks / blks_per_cluster;
2614
+ nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
22222615
2223
- for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
2616
+ for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
22242617 /*
22252618 * The allocation bitmap tells us which inodes of the chunk were
22262619 * physically allocated. Skip the cluster if an inode falls into
....@@ -2228,7 +2621,7 @@
22282621 */
22292622 ioffset = inum - xic->first_ino;
22302623 if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2231
- ASSERT(ioffset % inodes_per_cluster == 0);
2624
+ ASSERT(ioffset % igeo->inodes_per_cluster == 0);
22322625 continue;
22332626 }
22342627
....@@ -2237,18 +2630,18 @@
22372630
22382631 /*
22392632 * We obtain and lock the backing buffer first in the process
2240
- * here, as we have to ensure that any dirty inode that we
2241
- * can't get the flush lock on is attached to the buffer.
2633
+ * here to ensure dirty inodes attached to the buffer remain in
2634
+ * the flushing state while we mark them stale.
2635
+ *
22422636 * If we scan the in-memory inodes first, then buffer IO can
22432637 * complete before we get a lock on it, and hence we may fail
22442638 * to mark all the active inodes on the buffer stale.
22452639 */
2246
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2247
- mp->m_bsize * blks_per_cluster,
2248
- XBF_UNMAPPED);
2249
-
2250
- if (!bp)
2251
- return -ENOMEM;
2640
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2641
+ mp->m_bsize * igeo->blocks_per_cluster,
2642
+ XBF_UNMAPPED, &bp);
2643
+ if (error)
2644
+ return error;
22522645
22532646 /*
22542647 * This buffer may not have been correctly initialised as we
....@@ -2259,159 +2652,30 @@
22592652 * want it to fail. We can acheive this by adding a write
22602653 * verifier to the buffer.
22612654 */
2262
- bp->b_ops = &xfs_inode_buf_ops;
2655
+ bp->b_ops = &xfs_inode_buf_ops;
22632656
22642657 /*
2265
- * Walk the inodes already attached to the buffer and mark them
2266
- * stale. These will all have the flush locks held, so an
2267
- * in-memory inode walk can't lock them. By marking them all
2268
- * stale first, we will not attempt to lock them in the loop
2269
- * below as the XFS_ISTALE flag will be set.
2658
+ * Now we need to set all the cached clean inodes as XFS_ISTALE,
2659
+ * too. This requires lookups, and will skip inodes that we've
2660
+ * already marked XFS_ISTALE.
22702661 */
2271
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
2272
- if (lip->li_type == XFS_LI_INODE) {
2273
- iip = (xfs_inode_log_item_t *)lip;
2274
- ASSERT(iip->ili_logged == 1);
2275
- lip->li_cb = xfs_istale_done;
2276
- xfs_trans_ail_copy_lsn(mp->m_ail,
2277
- &iip->ili_flush_lsn,
2278
- &iip->ili_item.li_lsn);
2279
- xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2280
- }
2281
- }
2282
-
2283
-
2284
- /*
2285
- * For each inode in memory attempt to add it to the inode
2286
- * buffer and set it up for being staled on buffer IO
2287
- * completion. This is safe as we've locked out tail pushing
2288
- * and flushing by locking the buffer.
2289
- *
2290
- * We have already marked every inode that was part of a
2291
- * transaction stale above, which means there is no point in
2292
- * even trying to lock them.
2293
- */
2294
- for (i = 0; i < inodes_per_cluster; i++) {
2295
-retry:
2296
- rcu_read_lock();
2297
- ip = radix_tree_lookup(&pag->pag_ici_root,
2298
- XFS_INO_TO_AGINO(mp, (inum + i)));
2299
-
2300
- /* Inode not in memory, nothing to do */
2301
- if (!ip) {
2302
- rcu_read_unlock();
2303
- continue;
2304
- }
2305
-
2306
- /*
2307
- * because this is an RCU protected lookup, we could
2308
- * find a recently freed or even reallocated inode
2309
- * during the lookup. We need to check under the
2310
- * i_flags_lock for a valid inode here. Skip it if it
2311
- * is not valid, the wrong inode or stale.
2312
- */
2313
- spin_lock(&ip->i_flags_lock);
2314
- if (ip->i_ino != inum + i ||
2315
- __xfs_iflags_test(ip, XFS_ISTALE)) {
2316
- spin_unlock(&ip->i_flags_lock);
2317
- rcu_read_unlock();
2318
- continue;
2319
- }
2320
- spin_unlock(&ip->i_flags_lock);
2321
-
2322
- /*
2323
- * Don't try to lock/unlock the current inode, but we
2324
- * _cannot_ skip the other inodes that we did not find
2325
- * in the list attached to the buffer and are not
2326
- * already marked stale. If we can't lock it, back off
2327
- * and retry.
2328
- */
2329
- if (ip != free_ip) {
2330
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2331
- rcu_read_unlock();
2332
- delay(1);
2333
- goto retry;
2334
- }
2335
-
2336
- /*
2337
- * Check the inode number again in case we're
2338
- * racing with freeing in xfs_reclaim_inode().
2339
- * See the comments in that function for more
2340
- * information as to why the initial check is
2341
- * not sufficient.
2342
- */
2343
- if (ip->i_ino != inum + i) {
2344
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
2345
- rcu_read_unlock();
2346
- continue;
2347
- }
2348
- }
2349
- rcu_read_unlock();
2350
-
2351
- xfs_iflock(ip);
2352
- xfs_iflags_set(ip, XFS_ISTALE);
2353
-
2354
- /*
2355
- * we don't need to attach clean inodes or those only
2356
- * with unlogged changes (which we throw away, anyway).
2357
- */
2358
- iip = ip->i_itemp;
2359
- if (!iip || xfs_inode_clean(ip)) {
2360
- ASSERT(ip != free_ip);
2361
- xfs_ifunlock(ip);
2362
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
2363
- continue;
2364
- }
2365
-
2366
- iip->ili_last_fields = iip->ili_fields;
2367
- iip->ili_fields = 0;
2368
- iip->ili_fsync_fields = 0;
2369
- iip->ili_logged = 1;
2370
- xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2371
- &iip->ili_item.li_lsn);
2372
-
2373
- xfs_buf_attach_iodone(bp, xfs_istale_done,
2374
- &iip->ili_item);
2375
-
2376
- if (ip != free_ip)
2377
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
2378
- }
2662
+ for (i = 0; i < igeo->inodes_per_cluster; i++)
2663
+ xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
23792664
23802665 xfs_trans_stale_inode_buf(tp, bp);
23812666 xfs_trans_binval(tp, bp);
23822667 }
2383
-
2384
- xfs_perag_put(pag);
23852668 return 0;
23862669 }
23872670
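Editor's note: the hunk above replaces the open-coded walk of in-memory inodes with one xfs_ifree_mark_inode_stale() call per inode in the cluster, and sizes the cluster buffer as mp->m_bsize * igeo->blocks_per_cluster. As a rough illustration of the cluster arithmetic (mirroring the mask the old xfs_iflush_cluster() used), here is a minimal sketch assuming igeo->inodes_per_cluster is a power of two; the helper name is the editor's and is not part of XFS:

static xfs_agino_t
example_cluster_start_agino(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_ino_geometry	*igeo = &mp->m_ino_geo;

	/* Mask off the low bits to land on the first inode of the cluster. */
	return XFS_INO_TO_AGINO(mp, ino) & ~(igeo->inodes_per_cluster - 1);
}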
23882671 /*
2389
- * Free any local-format buffers sitting around before we reset to
2390
- * extents format.
2391
- */
2392
-static inline void
2393
-xfs_ifree_local_data(
2394
- struct xfs_inode *ip,
2395
- int whichfork)
2396
-{
2397
- struct xfs_ifork *ifp;
2398
-
2399
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
2400
- return;
2401
-
2402
- ifp = XFS_IFORK_PTR(ip, whichfork);
2403
- xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
2404
-}
2405
-
2406
-/*
2407
- * This is called to return an inode to the inode free list.
2408
- * The inode should already be truncated to 0 length and have
2409
- * no pages associated with it. This routine also assumes that
2410
- * the inode is already a part of the transaction.
2672
+ * This is called to return an inode to the inode free list. The inode should
2673
+ * already be truncated to 0 length and have no pages associated with it. This
2674
+ * routine also assumes that the inode is already a part of the transaction.
24112675 *
2412
- * The on-disk copy of the inode will have been added to the list
2413
- * of unlinked inodes in the AGI. We need to remove the inode from
2414
- * that list atomically with respect to freeing it here.
2676
+ * The on-disk copy of the inode will have been added to the list of unlinked
2677
+ * inodes in the AGI. We need to remove the inode from that list atomically with
2678
+ * respect to freeing it here.
24152679 */
24162680 int
24172681 xfs_ifree(
....@@ -2420,38 +2684,50 @@
24202684 {
24212685 int error;
24222686 struct xfs_icluster xic = { 0 };
2687
+ struct xfs_inode_log_item *iip = ip->i_itemp;
24232688
24242689 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
24252690 ASSERT(VFS_I(ip)->i_nlink == 0);
2426
- ASSERT(ip->i_d.di_nextents == 0);
2427
- ASSERT(ip->i_d.di_anextents == 0);
2691
+ ASSERT(ip->i_df.if_nextents == 0);
24282692 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
24292693 ASSERT(ip->i_d.di_nblocks == 0);
24302694
24312695 /*
2432
- * Pull the on-disk inode from the AGI unlinked list.
2696
+ * Free the inode first so that we guarantee that the AGI lock is going
2697
+ * to be taken before we remove the inode from the unlinked list. This
2698
+ * makes the AGI lock -> unlinked list modification order the same as
2699
+ * used in O_TMPFILE creation.
24332700 */
2434
- error = xfs_iunlink_remove(tp, ip);
2435
- if (error)
2436
- return error;
2437
-
24382701 error = xfs_difree(tp, ip->i_ino, &xic);
24392702 if (error)
24402703 return error;
24412704
2442
- xfs_ifree_local_data(ip, XFS_DATA_FORK);
2443
- xfs_ifree_local_data(ip, XFS_ATTR_FORK);
2705
+ error = xfs_iunlink_remove(tp, ip);
2706
+ if (error)
2707
+ return error;
2708
+
2709
+ /*
2710
+ * Free any local-format data sitting around before we reset the
2711
+ * data fork to extents format. Note that the attr fork data has
2712
+ * already been freed by xfs_attr_inactive.
2713
+ */
2714
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
2715
+ kmem_free(ip->i_df.if_u1.if_data);
2716
+ ip->i_df.if_u1.if_data = NULL;
2717
+ ip->i_df.if_bytes = 0;
2718
+ }
24442719
24452720 VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
24462721 ip->i_d.di_flags = 0;
2447
- ip->i_d.di_flags2 = 0;
2722
+ ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2;
24482723 ip->i_d.di_dmevmask = 0;
24492724 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
2450
- ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2451
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2725
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
24522726
24532727 /* Don't attempt to replay owner changes for a deleted inode */
2454
- ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
2728
+ spin_lock(&iip->ili_lock);
2729
+ iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
2730
+ spin_unlock(&iip->ili_lock);
24552731
24562732 /*
24572733 * Bump the generation count so no one will be confused
....@@ -2480,7 +2756,7 @@
24802756 trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
24812757
24822758 /* Give the log a push to start the unpinning I/O */
2483
- xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL);
2759
+ xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
24842760
24852761 }
24862762
....@@ -2769,9 +3045,7 @@
27693045 error = xfs_droplink(tp, dp2);
27703046 if (error)
27713047 goto out_trans_abort;
2772
- error = xfs_bumplink(tp, dp1);
2773
- if (error)
2774
- goto out_trans_abort;
3048
+ xfs_bumplink(tp, dp1);
27753049 }
27763050
27773051 /*
....@@ -2795,9 +3069,7 @@
27953069 error = xfs_droplink(tp, dp1);
27963070 if (error)
27973071 goto out_trans_abort;
2798
- error = xfs_bumplink(tp, dp2);
2799
- if (error)
2800
- goto out_trans_abort;
3072
+ xfs_bumplink(tp, dp2);
28013073 }
28023074
28033075 /*
....@@ -2835,7 +3107,7 @@
28353107 /*
28363108 * xfs_rename_alloc_whiteout()
28373109 *
2838
- * Return a referenced, unlinked, unlocked inode that that can be used as a
3110
+ * Return a referenced, unlinked, unlocked inode that can be used as a
28393111 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
28403112 * crash between allocating the inode and linking it into the rename transaction
28413113 * recovery will free the inode and we won't leak it.
....@@ -2882,6 +3154,7 @@
28823154 struct xfs_trans *tp;
28833155 struct xfs_inode *wip = NULL; /* whiteout inode */
28843156 struct xfs_inode *inodes[__XFS_SORT_INODES];
3157
+ int i;
28853158 int num_inodes = __XFS_SORT_INODES;
28863159 bool new_parent = (src_dp != target_dp);
28873160 bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
....@@ -2899,7 +3172,6 @@
28993172 * appropriately.
29003173 */
29013174 if (flags & RENAME_WHITEOUT) {
2902
- ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
29033175 error = xfs_rename_alloc_whiteout(target_dp, &wip);
29043176 if (error)
29053177 return error;
....@@ -2956,7 +3228,7 @@
29563228 * tree quota mechanism would be circumvented.
29573229 */
29583230 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2959
- (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
3231
+ target_dp->i_d.di_projid != src_ip->i_d.di_projid)) {
29603232 error = -EXDEV;
29613233 goto out_trans_cancel;
29623234 }
....@@ -2995,6 +3267,30 @@
29953267 }
29963268
29973269 /*
3270
+ * Lock the AGI buffers we need to handle bumping the nlink of the
3271
+ * whiteout inode off the unlinked list and to handle dropping the
3272
+ * nlink of the target inode. Per locking order rules, do this in
3273
+ * increasing AG order and before directory block allocation tries to
3274
+ * grab AGFs because we grab AGIs before AGFs.
3275
+ *
3276
+ * The (vfs) caller must ensure that if src is a directory then
3277
+ * target_ip is either null or an empty directory.
3278
+ */
3279
+ for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
3280
+ if (inodes[i] == wip ||
3281
+ (inodes[i] == target_ip &&
3282
+ (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
3283
+ struct xfs_buf *bp;
3284
+ xfs_agnumber_t agno;
3285
+
3286
+ agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
3287
+ error = xfs_read_agi(mp, tp, agno, &bp);
3288
+ if (error)
3289
+ goto out_trans_cancel;
3290
+ }
3291
+ }
3292
+
3293
+ /*
29983294 * Directory entry creation below may acquire the AGF. Remove
29993295 * the whiteout from the unlinked list first to preserve correct
30003296 * AGI/AGF locking order. This dirties the transaction so failures
....@@ -3013,7 +3309,6 @@
30133309 goto out_trans_cancel;
30143310
30153311 xfs_bumplink(tp, wip);
3016
- xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
30173312 VFS_I(wip)->i_state &= ~I_LINKABLE;
30183313 }
30193314
....@@ -3035,9 +3330,7 @@
30353330 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
30363331
30373332 if (new_parent && src_is_directory) {
3038
- error = xfs_bumplink(tp, target_dp);
3039
- if (error)
3040
- goto out_trans_cancel;
3333
+ xfs_bumplink(tp, target_dp);
30413334 }
30423335 } else { /* target_ip != NULL */
30433336 /*
....@@ -3148,373 +3441,76 @@
31483441 return error;
31493442 }
31503443
3151
-STATIC int
3152
-xfs_iflush_cluster(
3153
- struct xfs_inode *ip,
3154
- struct xfs_buf *bp)
3155
-{
3156
- struct xfs_mount *mp = ip->i_mount;
3157
- struct xfs_perag *pag;
3158
- unsigned long first_index, mask;
3159
- unsigned long inodes_per_cluster;
3160
- int cilist_size;
3161
- struct xfs_inode **cilist;
3162
- struct xfs_inode *cip;
3163
- int nr_found;
3164
- int clcount = 0;
3165
- int i;
3166
-
3167
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
3168
-
3169
- inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
3170
- cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
3171
- cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
3172
- if (!cilist)
3173
- goto out_put;
3174
-
3175
- mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
3176
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
3177
- rcu_read_lock();
3178
- /* really need a gang lookup range call here */
3179
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
3180
- first_index, inodes_per_cluster);
3181
- if (nr_found == 0)
3182
- goto out_free;
3183
-
3184
- for (i = 0; i < nr_found; i++) {
3185
- cip = cilist[i];
3186
- if (cip == ip)
3187
- continue;
3188
-
3189
- /*
3190
- * because this is an RCU protected lookup, we could find a
3191
- * recently freed or even reallocated inode during the lookup.
3192
- * We need to check under the i_flags_lock for a valid inode
3193
- * here. Skip it if it is not valid or the wrong inode.
3194
- */
3195
- spin_lock(&cip->i_flags_lock);
3196
- if (!cip->i_ino ||
3197
- __xfs_iflags_test(cip, XFS_ISTALE)) {
3198
- spin_unlock(&cip->i_flags_lock);
3199
- continue;
3200
- }
3201
-
3202
- /*
3203
- * Once we fall off the end of the cluster, no point checking
3204
- * any more inodes in the list because they will also all be
3205
- * outside the cluster.
3206
- */
3207
- if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
3208
- spin_unlock(&cip->i_flags_lock);
3209
- break;
3210
- }
3211
- spin_unlock(&cip->i_flags_lock);
3212
-
3213
- /*
3214
- * Do an un-protected check to see if the inode is dirty and
3215
- * is a candidate for flushing. These checks will be repeated
3216
- * later after the appropriate locks are acquired.
3217
- */
3218
- if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
3219
- continue;
3220
-
3221
- /*
3222
- * Try to get locks. If any are unavailable or it is pinned,
3223
- * then this inode cannot be flushed and is skipped.
3224
- */
3225
-
3226
- if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
3227
- continue;
3228
- if (!xfs_iflock_nowait(cip)) {
3229
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
3230
- continue;
3231
- }
3232
- if (xfs_ipincount(cip)) {
3233
- xfs_ifunlock(cip);
3234
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
3235
- continue;
3236
- }
3237
-
3238
-
3239
- /*
3240
- * Check the inode number again, just to be certain we are not
3241
- * racing with freeing in xfs_reclaim_inode(). See the comments
3242
- * in that function for more information as to why the initial
3243
- * check is not sufficient.
3244
- */
3245
- if (!cip->i_ino) {
3246
- xfs_ifunlock(cip);
3247
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
3248
- continue;
3249
- }
3250
-
3251
- /*
3252
- * arriving here means that this inode can be flushed. First
3253
- * re-check that it's dirty before flushing.
3254
- */
3255
- if (!xfs_inode_clean(cip)) {
3256
- int error;
3257
- error = xfs_iflush_int(cip, bp);
3258
- if (error) {
3259
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
3260
- goto cluster_corrupt_out;
3261
- }
3262
- clcount++;
3263
- } else {
3264
- xfs_ifunlock(cip);
3265
- }
3266
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
3267
- }
3268
-
3269
- if (clcount) {
3270
- XFS_STATS_INC(mp, xs_icluster_flushcnt);
3271
- XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3272
- }
3273
-
3274
-out_free:
3275
- rcu_read_unlock();
3276
- kmem_free(cilist);
3277
-out_put:
3278
- xfs_perag_put(pag);
3279
- return 0;
3280
-
3281
-
3282
-cluster_corrupt_out:
3283
- /*
3284
- * Corruption detected in the clustering loop. Invalidate the
3285
- * inode buffer and shut down the filesystem.
3286
- */
3287
- rcu_read_unlock();
3288
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3289
-
3290
- /*
3291
- * We'll always have an inode attached to the buffer for completion
3292
- * process by the time we are called from xfs_iflush(). Hence we have
3293
- * always need to do IO completion processing to abort the inodes
3294
- * attached to the buffer. handle them just like the shutdown case in
3295
- * xfs_buf_submit().
3296
- */
3297
- ASSERT(bp->b_iodone);
3298
- bp->b_flags &= ~XBF_DONE;
3299
- xfs_buf_stale(bp);
3300
- xfs_buf_ioerror(bp, -EIO);
3301
- xfs_buf_ioend(bp);
3302
-
3303
- /* abort the corrupt inode, as it was not attached to the buffer */
3304
- xfs_iflush_abort(cip, false);
3305
- kmem_free(cilist);
3306
- xfs_perag_put(pag);
3307
- return -EFSCORRUPTED;
3308
-}
3309
-
3310
-/*
3311
- * Flush dirty inode metadata into the backing buffer.
3312
- *
3313
- * The caller must have the inode lock and the inode flush lock held. The
3314
- * inode lock will still be held upon return to the caller, and the inode
3315
- * flush lock will be released after the inode has reached the disk.
3316
- *
3317
- * The caller must write out the buffer returned in *bpp and release it.
3318
- */
3319
-int
3444
+static int
33203445 xfs_iflush(
3321
- struct xfs_inode *ip,
3322
- struct xfs_buf **bpp)
3323
-{
3324
- struct xfs_mount *mp = ip->i_mount;
3325
- struct xfs_buf *bp = NULL;
3326
- struct xfs_dinode *dip;
3327
- int error;
3328
-
3329
- XFS_STATS_INC(mp, xs_iflush_count);
3330
-
3331
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3332
- ASSERT(xfs_isiflocked(ip));
3333
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3334
- ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3335
-
3336
- *bpp = NULL;
3337
-
3338
- xfs_iunpin_wait(ip);
3339
-
3340
- /*
3341
- * For stale inodes we cannot rely on the backing buffer remaining
3342
- * stale in cache for the remaining life of the stale inode and so
3343
- * xfs_imap_to_bp() below may give us a buffer that no longer contains
3344
- * inodes below. We have to check this after ensuring the inode is
3345
- * unpinned so that it is safe to reclaim the stale inode after the
3346
- * flush call.
3347
- */
3348
- if (xfs_iflags_test(ip, XFS_ISTALE)) {
3349
- xfs_ifunlock(ip);
3350
- return 0;
3351
- }
3352
-
3353
- /*
3354
- * This may have been unpinned because the filesystem is shutting
3355
- * down forcibly. If that's the case we must not write this inode
3356
- * to disk, because the log record didn't make it to disk.
3357
- *
3358
- * We also have to remove the log item from the AIL in this case,
3359
- * as we wait for an empty AIL as part of the unmount process.
3360
- */
3361
- if (XFS_FORCED_SHUTDOWN(mp)) {
3362
- error = -EIO;
3363
- goto abort_out;
3364
- }
3365
-
3366
- /*
3367
- * Get the buffer containing the on-disk inode. We are doing a try-lock
3368
- * operation here, so we may get an EAGAIN error. In that case, we
3369
- * simply want to return with the inode still dirty.
3370
- *
3371
- * If we get any other error, we effectively have a corruption situation
3372
- * and we cannot flush the inode, so we treat it the same as failing
3373
- * xfs_iflush_int().
3374
- */
3375
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
3376
- 0);
3377
- if (error == -EAGAIN) {
3378
- xfs_ifunlock(ip);
3379
- return error;
3380
- }
3381
- if (error)
3382
- goto corrupt_out;
3383
-
3384
- /*
3385
- * First flush out the inode that xfs_iflush was called with.
3386
- */
3387
- error = xfs_iflush_int(ip, bp);
3388
- if (error)
3389
- goto corrupt_out;
3390
-
3391
- /*
3392
- * If the buffer is pinned then push on the log now so we won't
3393
- * get stuck waiting in the write for too long.
3394
- */
3395
- if (xfs_buf_ispinned(bp))
3396
- xfs_log_force(mp, 0);
3397
-
3398
- /*
3399
- * inode clustering: try to gather other inodes into this write
3400
- *
3401
- * Note: Any error during clustering will result in the filesystem
3402
- * being shut down and completion callbacks run on the cluster buffer.
3403
- * As we have already flushed and attached this inode to the buffer,
3404
- * it has already been aborted and released by xfs_iflush_cluster() and
3405
- * so we have no further error handling to do here.
3406
- */
3407
- error = xfs_iflush_cluster(ip, bp);
3408
- if (error)
3409
- return error;
3410
-
3411
- *bpp = bp;
3412
- return 0;
3413
-
3414
-corrupt_out:
3415
- if (bp)
3416
- xfs_buf_relse(bp);
3417
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3418
-abort_out:
3419
- /* abort the corrupt inode, as it was not attached to the buffer */
3420
- xfs_iflush_abort(ip, false);
3421
- return error;
3422
-}
3423
-
3424
-/*
3425
- * If there are inline format data / attr forks attached to this inode,
3426
- * make sure they're not corrupt.
3427
- */
3428
-bool
3429
-xfs_inode_verify_forks(
3430
- struct xfs_inode *ip)
3431
-{
3432
- struct xfs_ifork *ifp;
3433
- xfs_failaddr_t fa;
3434
-
3435
- fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops);
3436
- if (fa) {
3437
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
3438
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
3439
- ifp->if_u1.if_data, ifp->if_bytes, fa);
3440
- return false;
3441
- }
3442
-
3443
- fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops);
3444
- if (fa) {
3445
- ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
3446
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
3447
- ifp ? ifp->if_u1.if_data : NULL,
3448
- ifp ? ifp->if_bytes : 0, fa);
3449
- return false;
3450
- }
3451
- return true;
3452
-}
3453
-
3454
-STATIC int
3455
-xfs_iflush_int(
34563446 struct xfs_inode *ip,
34573447 struct xfs_buf *bp)
34583448 {
34593449 struct xfs_inode_log_item *iip = ip->i_itemp;
34603450 struct xfs_dinode *dip;
34613451 struct xfs_mount *mp = ip->i_mount;
3452
+ int error;
34623453
34633454 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3464
- ASSERT(xfs_isiflocked(ip));
3465
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3466
- ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3467
- ASSERT(iip != NULL && iip->ili_fields != 0);
3468
- ASSERT(ip->i_d.di_version > 1);
3455
+ ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
3456
+ ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
3457
+ ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3458
+ ASSERT(iip->ili_item.li_buf == bp);
34693459
3470
- /* set *dip = inode's place in the buffer */
34713460 dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
34723461
3462
+ /*
3463
+ * We don't flush the inode if any of the following checks fail, but we
3464
+ * do still update the log item and attach to the backing buffer as if
3465
+ * the flush happened. This is a formality to facilitate predictable
3466
+ * error handling as the caller will shutdown and fail the buffer.
3467
+ */
3468
+ error = -EFSCORRUPTED;
34733469 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
34743470 mp, XFS_ERRTAG_IFLUSH_1)) {
34753471 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
34763472 "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
34773473 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3478
- goto corrupt_out;
3474
+ goto flush_out;
34793475 }
34803476 if (S_ISREG(VFS_I(ip)->i_mode)) {
34813477 if (XFS_TEST_ERROR(
3482
- (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3483
- (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3478
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3479
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
34843480 mp, XFS_ERRTAG_IFLUSH_3)) {
34853481 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
34863482 "%s: Bad regular inode %Lu, ptr "PTR_FMT,
34873483 __func__, ip->i_ino, ip);
3488
- goto corrupt_out;
3484
+ goto flush_out;
34893485 }
34903486 } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
34913487 if (XFS_TEST_ERROR(
3492
- (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3493
- (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3494
- (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3488
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3489
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
3490
+ ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
34953491 mp, XFS_ERRTAG_IFLUSH_4)) {
34963492 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
34973493 "%s: Bad directory inode %Lu, ptr "PTR_FMT,
34983494 __func__, ip->i_ino, ip);
3499
- goto corrupt_out;
3495
+ goto flush_out;
35003496 }
35013497 }
3502
- if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3498
+ if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
35033499 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
35043500 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
35053501 "%s: detected corrupt incore inode %Lu, "
35063502 "total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
35073503 __func__, ip->i_ino,
3508
- ip->i_d.di_nextents + ip->i_d.di_anextents,
3504
+ ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
35093505 ip->i_d.di_nblocks, ip);
3510
- goto corrupt_out;
3506
+ goto flush_out;
35113507 }
35123508 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
35133509 mp, XFS_ERRTAG_IFLUSH_6)) {
35143510 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
35153511 "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
35163512 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
3517
- goto corrupt_out;
3513
+ goto flush_out;
35183514 }
35193515
35203516 /*
....@@ -3526,12 +3522,19 @@
35263522 * backwards compatibility with old kernels that predate logging all
35273523 * inode changes.
35283524 */
3529
- if (ip->i_d.di_version < 3)
3525
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb))
35303526 ip->i_d.di_flushiter++;
35313527
3532
- /* Check the inline fork data before we write out. */
3533
- if (!xfs_inode_verify_forks(ip))
3534
- goto corrupt_out;
3528
+ /*
3529
+ * If there are inline format data / attr forks attached to this inode,
3530
+ * make sure they are not corrupt.
3531
+ */
3532
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
3533
+ xfs_ifork_verify_local_data(ip))
3534
+ goto flush_out;
3535
+ if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL &&
3536
+ xfs_ifork_verify_local_attr(ip))
3537
+ goto flush_out;
35353538
35363539 /*
35373540 * Copy the dirty parts of the inode into the on-disk inode. We always
....@@ -3547,7 +3550,6 @@
35473550 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
35483551 if (XFS_IFORK_Q(ip))
35493552 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3550
- xfs_inobp_check(mp, bp);
35513553
35523554 /*
35533555 * We've recorded everything logged in the inode, so we'd like to clear
....@@ -3560,45 +3562,144 @@
35603562 *
35613563 * What we do is move the bits to the ili_last_fields field. When
35623564 * logging the inode, these bits are moved back to the ili_fields field.
3563
- * In the xfs_iflush_done() routine we clear ili_last_fields, since we
3564
- * know that the information those bits represent is permanently on
3565
+ * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
3566
+ * we know that the information those bits represent is permanently on
35653567 * disk. As long as the flush completes before the inode is logged
35663568 * again, then both ili_fields and ili_last_fields will be cleared.
3567
- *
3568
- * We can play with the ili_fields bits here, because the inode lock
3569
- * must be held exclusively in order to set bits there and the flush
3570
- * lock protects the ili_last_fields bits. Set ili_logged so the flush
3571
- * done routine can tell whether or not to look in the AIL. Also, store
3572
- * the current LSN of the inode so that we can tell whether the item has
3573
- * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
3574
- * need the AIL lock, because it is a 64 bit value that cannot be read
3575
- * atomically.
35763569 */
3570
+ error = 0;
3571
+flush_out:
3572
+ spin_lock(&iip->ili_lock);
35773573 iip->ili_last_fields = iip->ili_fields;
35783574 iip->ili_fields = 0;
35793575 iip->ili_fsync_fields = 0;
3580
- iip->ili_logged = 1;
3576
+ spin_unlock(&iip->ili_lock);
35813577
3578
+ /*
3579
+ * Store the current LSN of the inode so that we can tell whether the
3580
+ * item has moved in the AIL from xfs_buf_inode_iodone().
3581
+ */
35823582 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
35833583 &iip->ili_item.li_lsn);
35843584
3585
- /*
3586
- * Attach the function xfs_iflush_done to the inode's
3587
- * buffer. This will remove the inode from the AIL
3588
- * and unlock the inode's flush lock when the inode is
3589
- * completely written to disk.
3590
- */
3591
- xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
3592
-
35933585 /* generate the checksum. */
35943586 xfs_dinode_calc_crc(mp, dip);
3587
+ return error;
3588
+}
35953589
3596
- ASSERT(!list_empty(&bp->b_li_list));
3597
- ASSERT(bp->b_iodone != NULL);
3590
+/*
3591
+ * Non-blocking flush of dirty inode metadata into the backing buffer.
3592
+ *
3593
+ * The caller must have a reference to the inode and hold the cluster buffer
3594
+ * locked. The function will walk across all the inodes on the cluster buffer it
3595
+ * can find and lock without blocking, and flush them to the cluster buffer.
3596
+ *
3597
+ * On successful flushing of at least one inode, the caller must write out the
3598
+ * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
3599
+ * the caller needs to release the buffer. On failure, the filesystem will be
3600
+ * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
3601
+ * will be returned.
3602
+ */
3603
+int
3604
+xfs_iflush_cluster(
3605
+ struct xfs_buf *bp)
3606
+{
3607
+ struct xfs_mount *mp = bp->b_mount;
3608
+ struct xfs_log_item *lip, *n;
3609
+ struct xfs_inode *ip;
3610
+ struct xfs_inode_log_item *iip;
3611
+ int clcount = 0;
3612
+ int error = 0;
3613
+
3614
+ /*
3615
+ * We must use the safe variant here as on shutdown xfs_iflush_abort()
3616
+ * can remove itself from the list.
3617
+ */
3618
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
3619
+ iip = (struct xfs_inode_log_item *)lip;
3620
+ ip = iip->ili_inode;
3621
+
3622
+ /*
3623
+ * Quick and dirty check to avoid locks if possible.
3624
+ */
3625
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
3626
+ continue;
3627
+ if (xfs_ipincount(ip))
3628
+ continue;
3629
+
3630
+ /*
3631
+ * The inode is still attached to the buffer, which means it is
3632
+ * dirty but reclaim might try to grab it. Check carefully for
3633
+ * that, and grab the ilock while still holding the i_flags_lock
3634
+ * to guarantee reclaim will not be able to reclaim this inode
3635
+ * once we drop the i_flags_lock.
3636
+ */
3637
+ spin_lock(&ip->i_flags_lock);
3638
+ ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
3639
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
3640
+ spin_unlock(&ip->i_flags_lock);
3641
+ continue;
3642
+ }
3643
+
3644
+ /*
3645
+ * ILOCK will pin the inode against reclaim and prevent
3646
+ * concurrent transactions modifying the inode while we are
3647
+ * flushing the inode. If we get the lock, set the flushing
3648
+ * state before we drop the i_flags_lock.
3649
+ */
3650
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3651
+ spin_unlock(&ip->i_flags_lock);
3652
+ continue;
3653
+ }
3654
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
3655
+ spin_unlock(&ip->i_flags_lock);
3656
+
3657
+ /*
3658
+ * Abort flushing this inode if we are shut down because the
3659
+ * inode may not currently be in the AIL. This can occur when
3660
+ * log I/O failure unpins the inode without inserting into the
3661
+ * AIL, leaving a dirty/unpinned inode attached to the buffer
3662
+ * that otherwise looks like it should be flushed.
3663
+ */
3664
+ if (XFS_FORCED_SHUTDOWN(mp)) {
3665
+ xfs_iunpin_wait(ip);
3666
+ xfs_iflush_abort(ip);
3667
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
3668
+ error = -EIO;
3669
+ continue;
3670
+ }
3671
+
3672
+ /* don't block waiting on a log force to unpin dirty inodes */
3673
+ if (xfs_ipincount(ip)) {
3674
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
3675
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
3676
+ continue;
3677
+ }
3678
+
3679
+ if (!xfs_inode_clean(ip))
3680
+ error = xfs_iflush(ip, bp);
3681
+ else
3682
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
3683
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
3684
+ if (error)
3685
+ break;
3686
+ clcount++;
3687
+ }
3688
+
3689
+ if (error) {
3690
+ bp->b_flags |= XBF_ASYNC;
3691
+ xfs_buf_ioend_fail(bp);
3692
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3693
+ return error;
3694
+ }
3695
+
3696
+ if (!clcount)
3697
+ return -EAGAIN;
3698
+
3699
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
3700
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
35983701 return 0;
35993702
3600
-corrupt_out:
3601
- return -EFSCORRUPTED;
36023703 }
36033704
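Editor's note: a minimal sketch of a caller honouring the xfs_iflush_cluster() contract described above (cluster buffer locked and held on entry; write it out on success, release it on -EAGAIN, nothing left to clean up on -EFSCORRUPTED). The function name and the delwri list parameter are hypothetical, not taken from this patch:

static int
example_push_inode_cluster(
	struct xfs_buf		*bp,		/* locked, held cluster buffer */
	struct list_head	*buffer_list)	/* caller's delayed write list */
{
	int			error;

	error = xfs_iflush_cluster(bp);
	if (!error) {
		/* At least one inode was flushed: queue the buffer for writeback. */
		xfs_buf_delwri_queue(bp, buffer_list);
		xfs_buf_relse(bp);
		return 0;
	}
	if (error == -EAGAIN) {
		/* Nothing could be flushed without blocking: just drop the buffer. */
		xfs_buf_relse(bp);
		return 0;
	}
	/* -EFSCORRUPTED: the fs is shut down and the buffer was already released. */
	return error;
}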
36043705 /* Release an inode. */
....@@ -3609,3 +3710,115 @@
36093710 trace_xfs_irele(ip, _RET_IP_);
36103711 iput(VFS_I(ip));
36113712 }
3713
+
3714
+/*
3715
+ * Ensure all committed transactions touching the inode are written to the log.
3716
+ */
3717
+int
3718
+xfs_log_force_inode(
3719
+ struct xfs_inode *ip)
3720
+{
3721
+ xfs_csn_t seq = 0;
3722
+
3723
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
3724
+ if (xfs_ipincount(ip))
3725
+ seq = ip->i_itemp->ili_commit_seq;
3726
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
3727
+
3728
+ if (!seq)
3729
+ return 0;
3730
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
3731
+}
3732
+
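Editor's note: xfs_log_force_inode() is the waiting counterpart to the asynchronous push used in the unpin path earlier in this patch (a flags value of 0 passed to xfs_log_force_seq()), since it passes XFS_LOG_SYNC. A trivial, hypothetical caller that needs the inode's logged changes on stable storage before continuing would simply do:

static int
example_stabilise_inode_metadata(
	struct xfs_inode	*ip)
{
	/* Forces the log up to the inode's last commit sequence and waits. */
	return xfs_log_force_inode(ip);
}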
3733
+/*
3734
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
3735
+ * abide by the vfs locking order (lowest pointer value goes first) and break the
3736
+ * layout leases before proceeding. The loop is needed because we cannot call
3737
+ * the blocking break_layout() with the iolocks held, and therefore have to
3738
+ * back out both locks.
3739
+ */
3740
+static int
3741
+xfs_iolock_two_inodes_and_break_layout(
3742
+ struct inode *src,
3743
+ struct inode *dest)
3744
+{
3745
+ int error;
3746
+
3747
+ if (src > dest)
3748
+ swap(src, dest);
3749
+
3750
+retry:
3751
+ /* Wait to break both inodes' layouts before we start locking. */
3752
+ error = break_layout(src, true);
3753
+ if (error)
3754
+ return error;
3755
+ if (src != dest) {
3756
+ error = break_layout(dest, true);
3757
+ if (error)
3758
+ return error;
3759
+ }
3760
+
3761
+ /* Lock one inode and make sure nobody got in and leased it. */
3762
+ inode_lock(src);
3763
+ error = break_layout(src, false);
3764
+ if (error) {
3765
+ inode_unlock(src);
3766
+ if (error == -EWOULDBLOCK)
3767
+ goto retry;
3768
+ return error;
3769
+ }
3770
+
3771
+ if (src == dest)
3772
+ return 0;
3773
+
3774
+ /* Lock the other inode and make sure nobody got in and leased it. */
3775
+ inode_lock_nested(dest, I_MUTEX_NONDIR2);
3776
+ error = break_layout(dest, false);
3777
+ if (error) {
3778
+ inode_unlock(src);
3779
+ inode_unlock(dest);
3780
+ if (error == -EWOULDBLOCK)
3781
+ goto retry;
3782
+ return error;
3783
+ }
3784
+
3785
+ return 0;
3786
+}
3787
+
3788
+/*
3789
+ * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
3790
+ * mmap activity.
3791
+ */
3792
+int
3793
+xfs_ilock2_io_mmap(
3794
+ struct xfs_inode *ip1,
3795
+ struct xfs_inode *ip2)
3796
+{
3797
+ int ret;
3798
+
3799
+ ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
3800
+ if (ret)
3801
+ return ret;
3802
+ if (ip1 == ip2)
3803
+ xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
3804
+ else
3805
+ xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
3806
+ ip2, XFS_MMAPLOCK_EXCL);
3807
+ return 0;
3808
+}
3809
+
3810
+/* Unlock both inodes to allow IO and mmap activity. */
3811
+void
3812
+xfs_iunlock2_io_mmap(
3813
+ struct xfs_inode *ip1,
3814
+ struct xfs_inode *ip2)
3815
+{
3816
+ bool same_inode = (ip1 == ip2);
3817
+
3818
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3819
+ if (!same_inode)
3820
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3821
+ inode_unlock(VFS_I(ip2));
3822
+ if (!same_inode)
3823
+ inode_unlock(VFS_I(ip1));
3824
+}
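Editor's note: a short sketch of how the two helpers above are meant to bracket a two-file operation; example_do_two_file_work() is a placeholder, not an XFS function:

static int
example_two_file_operation(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	int			error;

	/* Blocks new syscall I/O and mmap faults on both inodes. */
	error = xfs_ilock2_io_mmap(ip1, ip2);
	if (error)
		return error;

	error = example_do_two_file_work(ip1, ip2);	/* placeholder */

	xfs_iunlock2_io_mmap(ip1, ip2);
	return error;
}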