2024-05-11 04dd17822334871b23ea2862f7798fb0e0007777
kernel/fs/xfs/xfs_inode.c
....@@ -3,7 +3,6 @@
33 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
44 * All Rights Reserved.
55 */
6
-#include <linux/log2.h>
76 #include <linux/iversion.h>
87
98 #include "xfs.h"
....@@ -16,10 +15,7 @@
1615 #include "xfs_mount.h"
1716 #include "xfs_defer.h"
1817 #include "xfs_inode.h"
19
-#include "xfs_da_format.h"
20
-#include "xfs_da_btree.h"
2118 #include "xfs_dir2.h"
22
-#include "xfs_attr_sf.h"
2319 #include "xfs_attr.h"
2420 #include "xfs_trans_space.h"
2521 #include "xfs_trans.h"
....@@ -32,7 +28,6 @@
3228 #include "xfs_error.h"
3329 #include "xfs_quota.h"
3430 #include "xfs_filestream.h"
35
-#include "xfs_cksum.h"
3631 #include "xfs_trace.h"
3732 #include "xfs_icache.h"
3833 #include "xfs_symlink.h"
....@@ -40,7 +35,6 @@
4035 #include "xfs_log.h"
4136 #include "xfs_bmap_btree.h"
4237 #include "xfs_reflink.h"
43
-#include "xfs_dir2_priv.h"
4438
4539 kmem_zone_t *xfs_inode_zone;
4640
....@@ -50,7 +44,6 @@
5044 */
5145 #define XFS_ITRUNC_MAX_EXTENTS 2
5246
53
-STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
5447 STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
5548 STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
5649
....@@ -61,6 +54,12 @@
6154 xfs_get_extsz_hint(
6255 struct xfs_inode *ip)
6356 {
57
+ /*
58
+ * No point in aligning allocations if we need to COW to actually
59
+ * write to them.
60
+ */
61
+ if (xfs_is_always_cow_inode(ip))
62
+ return 0;
6463 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
6564 return ip->i_d.di_extsize;
6665 if (XFS_IS_REALTIME_INODE(ip))
....@@ -112,7 +111,7 @@
112111 {
113112 uint lock_mode = XFS_ILOCK_SHARED;
114113
115
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
114
+ if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE &&
116115 (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
117116 lock_mode = XFS_ILOCK_EXCL;
118117 xfs_ilock(ip, lock_mode);
....@@ -125,7 +124,8 @@
125124 {
126125 uint lock_mode = XFS_ILOCK_SHARED;
127126
128
- if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
127
+ if (ip->i_afp &&
128
+ ip->i_afp->if_format == XFS_DINODE_FMT_BTREE &&
129129 (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
130130 lock_mode = XFS_ILOCK_EXCL;
131131 xfs_ilock(ip, lock_mode);
....@@ -144,17 +144,17 @@
144144 *
145145 * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
146146 *
147
- * mmap_sem locking order:
147
+ * mmap_lock locking order:
148148 *
149
- * i_rwsem -> page lock -> mmap_sem
150
- * mmap_sem -> i_mmap_lock -> page_lock
149
+ * i_rwsem -> page lock -> mmap_lock
150
+ * mmap_lock -> i_mmap_lock -> page_lock
151151 *
152
- * The difference in mmap_sem locking order mean that we cannot hold the
152
+ * The difference in mmap_lock locking order mean that we cannot hold the
153153 * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
154
- * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
154
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
155155 * in get_user_pages() to map the user pages into the kernel address space for
156156 * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
157
- * page faults already hold the mmap_sem.
157
+ * page faults already hold the mmap_lock.
158158 *
159159 * Hence to serialise fully against both syscall and mmap based IO, we need to
160160 * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
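Note (illustration only, not part of the patch): the block above documents which of i_rwsem, i_mmap_lock, the page lock and mmap_lock may nest inside which, and the rename from mmap_sem to mmap_lock does not change that ordering. The underlying rule is the usual one: every path that can hold two of these locks at once must take them in one agreed order. A minimal user-space sketch of that rule, using hypothetical pthread mutexes rather than the XFS locks:

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for two locks that several code paths may need together. */
static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;	/* think i_rwsem */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;	/* think i_mmap_lock */

/*
 * Every path that needs both locks takes A before B.  If one path took
 * B then A while another took A then B, the two could block on each
 * other forever, which is why the comment above forbids taking
 * i_mmap_lock inside the syscall read/write paths.
 */
static void with_both_locks(void (*work)(void))
{
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	work();
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
}

static void say_hi(void) { printf("holding both locks\n"); }

int main(void) { with_both_locks(say_hi); return 0; }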
....@@ -441,17 +441,17 @@
441441 */
442442 static void
443443 xfs_lock_inodes(
444
- xfs_inode_t **ips,
445
- int inodes,
446
- uint lock_mode)
444
+ struct xfs_inode **ips,
445
+ int inodes,
446
+ uint lock_mode)
447447 {
448
- int attempts = 0, i, j, try_lock;
449
- xfs_log_item_t *lp;
448
+ int attempts = 0, i, j, try_lock;
449
+ struct xfs_log_item *lp;
450450
451451 /*
452452 * Currently supports between 2 and 5 inodes with exclusive locking. We
453453 * support an arbitrary depth of locking here, but absolute limits on
454
- * inodes depend on the the type of locking and the limits placed by
454
+ * inodes depend on the type of locking and the limits placed by
455455 * lockdep annotations in xfs_lock_inumorder. These are all checked by
456456 * the asserts.
457457 */
....@@ -485,7 +485,7 @@
485485 */
486486 if (!try_lock) {
487487 for (j = (i - 1); j >= 0 && !try_lock; j--) {
488
- lp = (xfs_log_item_t *)ips[j]->i_itemp;
488
+ lp = &ips[j]->i_itemp->ili_item;
489489 if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
490490 try_lock++;
491491 }
....@@ -551,7 +551,7 @@
551551 struct xfs_inode *temp;
552552 uint mode_temp;
553553 int attempts = 0;
554
- xfs_log_item_t *lp;
554
+ struct xfs_log_item *lp;
555555
556556 ASSERT(hweight32(ip0_mode) == 1);
557557 ASSERT(hweight32(ip1_mode) == 1);
....@@ -585,7 +585,7 @@
585585 * the second lock. If we can't get it, we must release the first one
586586 * and try again.
587587 */
588
- lp = (xfs_log_item_t *)ip0->i_itemp;
588
+ lp = &ip0->i_itemp->ili_item;
589589 if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
590590 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
591591 xfs_iunlock(ip0, ip0_mode);
....@@ -596,22 +596,6 @@
596596 } else {
597597 xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
598598 }
599
-}
600
-
601
-void
602
-__xfs_iflock(
603
- struct xfs_inode *ip)
604
-{
605
- wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
606
- DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
607
-
608
- do {
609
- prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
610
- if (xfs_isiflocked(ip))
611
- io_schedule();
612
- } while (!xfs_iflock_nowait(ip));
613
-
614
- finish_wait(wq, &wait.wq_entry);
615599 }
616600
617601 STATIC uint
....@@ -714,6 +698,68 @@
714698 return error;
715699 }
716700
701
+/* Propagate di_flags from a parent inode to a child inode. */
702
+static void
703
+xfs_inode_inherit_flags(
704
+ struct xfs_inode *ip,
705
+ const struct xfs_inode *pip)
706
+{
707
+ unsigned int di_flags = 0;
708
+ umode_t mode = VFS_I(ip)->i_mode;
709
+
710
+ if (S_ISDIR(mode)) {
711
+ if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
712
+ di_flags |= XFS_DIFLAG_RTINHERIT;
713
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
714
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
715
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
716
+ }
717
+ if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
718
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
719
+ } else if (S_ISREG(mode)) {
720
+ if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) &&
721
+ xfs_sb_version_hasrealtime(&ip->i_mount->m_sb))
722
+ di_flags |= XFS_DIFLAG_REALTIME;
723
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
724
+ di_flags |= XFS_DIFLAG_EXTSIZE;
725
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
726
+ }
727
+ }
728
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
729
+ xfs_inherit_noatime)
730
+ di_flags |= XFS_DIFLAG_NOATIME;
731
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
732
+ xfs_inherit_nodump)
733
+ di_flags |= XFS_DIFLAG_NODUMP;
734
+ if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
735
+ xfs_inherit_sync)
736
+ di_flags |= XFS_DIFLAG_SYNC;
737
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
738
+ xfs_inherit_nosymlinks)
739
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
740
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
741
+ xfs_inherit_nodefrag)
742
+ di_flags |= XFS_DIFLAG_NODEFRAG;
743
+ if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
744
+ di_flags |= XFS_DIFLAG_FILESTREAM;
745
+
746
+ ip->i_d.di_flags |= di_flags;
747
+}
748
+
749
+/* Propagate di_flags2 from a parent inode to a child inode. */
750
+static void
751
+xfs_inode_inherit_flags2(
752
+ struct xfs_inode *ip,
753
+ const struct xfs_inode *pip)
754
+{
755
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
756
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
757
+ ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
758
+ }
759
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
760
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
761
+}
762
+
717763 /*
718764 * Allocate an inode on disk and return a copy of its in-core version.
719765 * The in-core inode is locked exclusively. Set mode, nlink, and rdev
....@@ -756,6 +802,7 @@
756802 xfs_buf_t **ialloc_context,
757803 xfs_inode_t **ipp)
758804 {
805
+ struct inode *dir = pip ? VFS_I(pip) : NULL;
759806 struct xfs_mount *mp = tp->t_mountp;
760807 xfs_ino_t ino;
761808 xfs_inode_t *ip;
....@@ -801,26 +848,17 @@
801848 return error;
802849 ASSERT(ip != NULL);
803850 inode = VFS_I(ip);
804
-
805
- /*
806
- * We always convert v1 inodes to v2 now - we only support filesystems
807
- * with >= v2 inode capability, so there is no reason for ever leaving
808
- * an inode in v1 format.
809
- */
810
- if (ip->i_d.di_version == 1)
811
- ip->i_d.di_version = 2;
812
-
813
- inode->i_mode = mode;
814851 set_nlink(inode, nlink);
815
- ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
816
- ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
817852 inode->i_rdev = rdev;
818
- xfs_set_projid(ip, prid);
853
+ ip->i_d.di_projid = prid;
819854
820
- if (pip && XFS_INHERIT_GID(pip)) {
821
- ip->i_d.di_gid = pip->i_d.di_gid;
822
- if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
823
- inode->i_mode |= S_ISGID;
855
+ if (dir && !(dir->i_mode & S_ISGID) &&
856
+ (mp->m_flags & XFS_MOUNT_GRPID)) {
857
+ inode->i_uid = current_fsuid();
858
+ inode->i_gid = dir->i_gid;
859
+ inode->i_mode = mode;
860
+ } else {
861
+ inode_init_owner(inode, dir, mode);
824862 }
825863
826864 /*
....@@ -828,13 +866,12 @@
828866 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
829867 * (and only if the irix_sgid_inherit compatibility variable is set).
830868 */
831
- if ((irix_sgid_inherit) &&
832
- (inode->i_mode & S_ISGID) &&
833
- (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
869
+ if (irix_sgid_inherit &&
870
+ (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid))
834871 inode->i_mode &= ~S_ISGID;
835872
836873 ip->i_d.di_size = 0;
837
- ip->i_d.di_nextents = 0;
874
+ ip->i_df.if_nextents = 0;
838875 ASSERT(ip->i_d.di_nblocks == 0);
839876
840877 tv = current_time(inode);
....@@ -847,14 +884,12 @@
847884 ip->i_d.di_dmstate = 0;
848885 ip->i_d.di_flags = 0;
849886
850
- if (ip->i_d.di_version == 3) {
887
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
851888 inode_set_iversion(inode, 1);
852
- ip->i_d.di_flags2 = 0;
889
+ ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2;
853890 ip->i_d.di_cowextsize = 0;
854
- ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
855
- ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
891
+ ip->i_d.di_crtime = tv;
856892 }
857
-
858893
859894 flags = XFS_ILOG_CORE;
860895 switch (mode & S_IFMT) {
....@@ -862,70 +897,19 @@
862897 case S_IFCHR:
863898 case S_IFBLK:
864899 case S_IFSOCK:
865
- ip->i_d.di_format = XFS_DINODE_FMT_DEV;
900
+ ip->i_df.if_format = XFS_DINODE_FMT_DEV;
866901 ip->i_df.if_flags = 0;
867902 flags |= XFS_ILOG_DEV;
868903 break;
869904 case S_IFREG:
870905 case S_IFDIR:
871
- if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
872
- uint di_flags = 0;
873
-
874
- if (S_ISDIR(mode)) {
875
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
876
- di_flags |= XFS_DIFLAG_RTINHERIT;
877
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
878
- di_flags |= XFS_DIFLAG_EXTSZINHERIT;
879
- ip->i_d.di_extsize = pip->i_d.di_extsize;
880
- }
881
- if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
882
- di_flags |= XFS_DIFLAG_PROJINHERIT;
883
- } else if (S_ISREG(mode)) {
884
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
885
- di_flags |= XFS_DIFLAG_REALTIME;
886
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
887
- di_flags |= XFS_DIFLAG_EXTSIZE;
888
- ip->i_d.di_extsize = pip->i_d.di_extsize;
889
- }
890
- }
891
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
892
- xfs_inherit_noatime)
893
- di_flags |= XFS_DIFLAG_NOATIME;
894
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
895
- xfs_inherit_nodump)
896
- di_flags |= XFS_DIFLAG_NODUMP;
897
- if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
898
- xfs_inherit_sync)
899
- di_flags |= XFS_DIFLAG_SYNC;
900
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
901
- xfs_inherit_nosymlinks)
902
- di_flags |= XFS_DIFLAG_NOSYMLINKS;
903
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
904
- xfs_inherit_nodefrag)
905
- di_flags |= XFS_DIFLAG_NODEFRAG;
906
- if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
907
- di_flags |= XFS_DIFLAG_FILESTREAM;
908
-
909
- ip->i_d.di_flags |= di_flags;
910
- }
911
- if (pip &&
912
- (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
913
- pip->i_d.di_version == 3 &&
914
- ip->i_d.di_version == 3) {
915
- uint64_t di_flags2 = 0;
916
-
917
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
918
- di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
919
- ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
920
- }
921
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
922
- di_flags2 |= XFS_DIFLAG2_DAX;
923
-
924
- ip->i_d.di_flags2 |= di_flags2;
925
- }
906
+ if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY))
907
+ xfs_inode_inherit_flags(ip, pip);
908
+ if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY))
909
+ xfs_inode_inherit_flags2(ip, pip);
926910 /* FALLTHROUGH */
927911 case S_IFLNK:
928
- ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
912
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
929913 ip->i_df.if_flags = XFS_IFEXTENTS;
930914 ip->i_df.if_bytes = 0;
931915 ip->i_df.if_u1.if_root = NULL;
....@@ -933,11 +917,6 @@
933917 default:
934918 ASSERT(0);
935919 }
936
- /*
937
- * Attribute fork settings for new inode.
938
- */
939
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
940
- ip->i_d.di_anextents = 0;
941920
942921 /*
943922 * Log the new values stuffed into the inode.
....@@ -1116,17 +1095,15 @@
11161095 /*
11171096 * Increment the link count on an inode & log the change.
11181097 */
1119
-static int
1098
+static void
11201099 xfs_bumplink(
11211100 xfs_trans_t *tp,
11221101 xfs_inode_t *ip)
11231102 {
11241103 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
11251104
1126
- ASSERT(ip->i_d.di_version > 1);
11271105 inc_nlink(VFS_I(ip));
11281106 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1129
- return 0;
11301107 }
11311108
11321109 int
....@@ -1160,8 +1137,7 @@
11601137 /*
11611138 * Make sure that we have allocated dquot(s) on disk.
11621139 */
1163
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
1164
- xfs_kgid_to_gid(current_fsgid()), prid,
1140
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
11651141 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
11661142 &udqp, &gdqp, &pdqp);
11671143 if (error)
....@@ -1221,8 +1197,7 @@
12211197 unlock_dp_on_error = false;
12221198
12231199 error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1224
- resblks ?
1225
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1200
+ resblks - XFS_IALLOC_SPACE_RES(mp));
12261201 if (error) {
12271202 ASSERT(error != -ENOSPC);
12281203 goto out_trans_cancel;
....@@ -1235,9 +1210,7 @@
12351210 if (error)
12361211 goto out_trans_cancel;
12371212
1238
- error = xfs_bumplink(tp, dp);
1239
- if (error)
1240
- goto out_trans_cancel;
1213
+ xfs_bumplink(tp, dp);
12411214 }
12421215
12431216 /*
....@@ -1313,8 +1286,7 @@
13131286 /*
13141287 * Make sure that we have allocated dquot(s) on disk.
13151288 */
1316
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
1317
- xfs_kgid_to_gid(current_fsgid()), prid,
1289
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
13181290 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
13191291 &udqp, &gdqp, &pdqp);
13201292 if (error)
....@@ -1427,7 +1399,7 @@
14271399 * the tree quota mechanism could be circumvented.
14281400 */
14291401 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1430
- (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1402
+ tdp->i_d.di_projid != sip->i_d.di_projid)) {
14311403 error = -EXDEV;
14321404 goto error_return;
14331405 }
....@@ -1454,9 +1426,7 @@
14541426 xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
14551427 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
14561428
1457
- error = xfs_bumplink(tp, sip);
1458
- if (error)
1459
- goto error_return;
1429
+ xfs_bumplink(tp, sip);
14601430
14611431 /*
14621432 * If this is a synchronous mount, make sure that the
....@@ -1524,10 +1494,8 @@
15241494 struct xfs_mount *mp = ip->i_mount;
15251495 struct xfs_trans *tp = *tpp;
15261496 xfs_fileoff_t first_unmap_block;
1527
- xfs_fileoff_t last_block;
15281497 xfs_filblks_t unmap_len;
15291498 int error = 0;
1530
- int done = 0;
15311499
15321500 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
15331501 ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
....@@ -1547,33 +1515,27 @@
15471515 * the end of the file (in a crash where the space is allocated
15481516 * but the inode size is not yet updated), simply remove any
15491517 * blocks which show up between the new EOF and the maximum
1550
- * possible file size. If the first block to be removed is
1551
- * beyond the maximum file size (ie it is the same as last_block),
1552
- * then there is nothing to do.
1518
+ * possible file size.
1519
+ *
1520
+ * We have to free all the blocks to the bmbt maximum offset, even if
1521
+ * the page cache can't scale that far.
15531522 */
15541523 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1555
- last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
1556
- if (first_unmap_block == last_block)
1524
+ if (first_unmap_block >= XFS_MAX_FILEOFF) {
1525
+ WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
15571526 return 0;
1527
+ }
15581528
1559
- ASSERT(first_unmap_block < last_block);
1560
- unmap_len = last_block - first_unmap_block + 1;
1561
- while (!done) {
1529
+ unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
1530
+ while (unmap_len > 0) {
15621531 ASSERT(tp->t_firstblock == NULLFSBLOCK);
1563
- error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
1564
- XFS_ITRUNC_MAX_EXTENTS, &done);
1532
+ error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
1533
+ flags, XFS_ITRUNC_MAX_EXTENTS);
15651534 if (error)
15661535 goto out;
15671536
1568
- /*
1569
- * Duplicate the transaction that has the permanent
1570
- * reservation and commit the old transaction.
1571
- */
1537
+ /* free the just unmapped extents */
15721538 error = xfs_defer_finish(&tp);
1573
- if (error)
1574
- goto out;
1575
-
1576
- error = xfs_trans_roll_inode(&tp, ip);
15771539 if (error)
15781540 goto out;
15791541 }
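Note (illustration only, not part of the patch): the loop above no longer tracks a separate "done" flag; __xfs_bunmapi() is handed the remaining length by reference, trims off at most XFS_ITRUNC_MAX_EXTENTS extents per call, and the loop simply runs until that length reaches zero, finishing the deferred frees after each pass. The control-flow pattern, stripped of the XFS specifics and with made-up names, is:

#include <stdint.h>
#include <stdio.h>

#define MAX_PER_PASS	2	/* plays the role of XFS_ITRUNC_MAX_EXTENTS */

/* Hypothetical worker: processes up to max items and shrinks *len. */
static int unmap_some(uint64_t *len, unsigned int max)
{
	uint64_t did = (*len < max) ? *len : max;

	*len -= did;
	printf("unmapped %llu, %llu left\n",
	       (unsigned long long)did, (unsigned long long)*len);
	return 0;
}

int main(void)
{
	uint64_t remaining = 7;

	while (remaining > 0) {
		if (unmap_some(&remaining, MAX_PER_PASS))
			return 1;
		/* the real code calls xfs_defer_finish() here */
	}
	return 0;
}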
....@@ -1581,7 +1543,7 @@
15811543 if (whichfork == XFS_DATA_FORK) {
15821544 /* Remove all pending CoW reservations. */
15831545 error = xfs_reflink_cancel_cow_blocks(ip, &tp,
1584
- first_unmap_block, last_block, true);
1546
+ first_unmap_block, XFS_MAX_FILEOFF, true);
15851547 if (error)
15861548 goto out;
15871549
....@@ -1662,7 +1624,7 @@
16621624 return 0;
16631625 /*
16641626 * If we can't get the iolock just skip truncating the blocks
1665
- * past EOF because we could deadlock with the mmap_sem
1627
+ * past EOF because we could deadlock with the mmap_lock
16661628 * otherwise. We'll get another chance to drop them once the
16671629 * last reference to the inode is dropped, so we'll never leak
16681630 * blocks permanently.
....@@ -1714,7 +1676,7 @@
17141676 if (error)
17151677 goto error_trans_cancel;
17161678
1717
- ASSERT(ip->i_d.di_nextents == 0);
1679
+ ASSERT(ip->i_df.if_nextents == 0);
17181680
17191681 error = xfs_trans_commit(tp);
17201682 if (error)
....@@ -1883,7 +1845,7 @@
18831845
18841846 if (S_ISREG(VFS_I(ip)->i_mode) &&
18851847 (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1886
- ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
1848
+ ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
18871849 truncate = 1;
18881850
18891851 error = xfs_qm_dqattach(ip);
....@@ -1909,7 +1871,6 @@
19091871 }
19101872
19111873 ASSERT(!ip->i_afp);
1912
- ASSERT(ip->i_d.di_anextents == 0);
19131874 ASSERT(ip->i_d.di_forkoff == 0);
19141875
19151876 /*
....@@ -1926,6 +1887,336 @@
19261887 }
19271888
19281889 /*
1890
+ * In-Core Unlinked List Lookups
1891
+ * =============================
1892
+ *
1893
+ * Every inode is supposed to be reachable from some other piece of metadata
1894
+ * with the exception of the root directory. Inodes with a connection to a
1895
+ * file descriptor but not linked from anywhere in the on-disk directory tree
1896
+ * are collectively known as unlinked inodes, though the filesystem itself
1897
+ * maintains links to these inodes so that on-disk metadata are consistent.
1898
+ *
1899
+ * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
1900
+ * header contains a number of buckets that point to an inode, and each inode
1901
+ * record has a pointer to the next inode in the hash chain. This
1902
+ * singly-linked list causes scaling problems in the iunlink remove function
1903
+ * because we must walk that list to find the inode that points to the inode
1904
+ * being removed from the unlinked hash bucket list.
1905
+ *
1906
+ * What if we modelled the unlinked list as a collection of records capturing
1907
+ * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
1908
+ * have a fast way to look up unlinked list predecessors, which avoids the
1909
+ * slow list walk. That's exactly what we do here (in-core) with a per-AG
1910
+ * rhashtable.
1911
+ *
1912
+ * Because this is a backref cache, we ignore operational failures since the
1913
+ * iunlink code can fall back to the slow bucket walk. The only errors that
1914
+ * should bubble out are for obviously incorrect situations.
1915
+ *
1916
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
1917
+ * access or have otherwise provided for concurrency control.
1918
+ */
1919
+
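Note (illustration only, not part of the patch): the rhashtable added below is keyed on iu_next_unlinked, i.e. on the "Y" of each "X.next_unlinked = Y" relation, so a lookup by Y returns its predecessor X in one step instead of walking the bucket chain. The same idea in a tiny stand-alone form, with a plain array standing in for the per-AG hash table:

#include <assert.h>
#include <stdio.h>

#define NULLINO		(-1)
#define NINODES		16

/* next_ino[i] models "i.next_unlinked"; head models the AGI bucket head. */
static int next_ino[NINODES];
static int head = NULLINO;

/*
 * Backref map: pred_of[y] == x records "x.next_unlinked == y", which is
 * what the patch stores in the per-AG rhashtable (keyed on the next
 * pointer) so that removal never has to walk the chain.
 */
static int pred_of[NINODES];

static void iunlink_insert(int ino)
{
	next_ino[ino] = head;		/* point the new entry at the old head */
	if (head != NULLINO)
		pred_of[head] = ino;	/* remember ino.next == old head */
	head = ino;
}

static void iunlink_remove(int ino)
{
	int nxt = next_ino[ino];

	if (head == ino) {
		head = nxt;			/* removing the list head */
		if (nxt != NULLINO)
			pred_of[nxt] = NULLINO;	/* drop the stale backref */
	} else {
		int prev = pred_of[ino];	/* O(1), no bucket walk */

		next_ino[prev] = nxt;
		if (nxt != NULLINO)
			pred_of[nxt] = prev;	/* repoint the backref */
		pred_of[ino] = NULLINO;
	}
	next_ino[ino] = NULLINO;
}

int main(void)
{
	int i;

	for (i = 0; i < NINODES; i++)
		next_ino[i] = pred_of[i] = NULLINO;

	iunlink_insert(3);
	iunlink_insert(7);
	iunlink_insert(9);		/* bucket: 9 -> 7 -> 3 */
	iunlink_remove(7);		/* predecessor of 7 found via pred_of */
	assert(head == 9 && next_ino[9] == 3);
	printf("bucket: %d -> %d\n", head, next_ino[head]);
	return 0;
}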
1920
+/* Capture a "X.next_unlinked = Y" relationship. */
1921
+struct xfs_iunlink {
1922
+ struct rhash_head iu_rhash_head;
1923
+ xfs_agino_t iu_agino; /* X */
1924
+ xfs_agino_t iu_next_unlinked; /* Y */
1925
+};
1926
+
1927
+/* Unlinked list predecessor lookup hashtable construction */
1928
+static int
1929
+xfs_iunlink_obj_cmpfn(
1930
+ struct rhashtable_compare_arg *arg,
1931
+ const void *obj)
1932
+{
1933
+ const xfs_agino_t *key = arg->key;
1934
+ const struct xfs_iunlink *iu = obj;
1935
+
1936
+ if (iu->iu_next_unlinked != *key)
1937
+ return 1;
1938
+ return 0;
1939
+}
1940
+
1941
+static const struct rhashtable_params xfs_iunlink_hash_params = {
1942
+ .min_size = XFS_AGI_UNLINKED_BUCKETS,
1943
+ .key_len = sizeof(xfs_agino_t),
1944
+ .key_offset = offsetof(struct xfs_iunlink,
1945
+ iu_next_unlinked),
1946
+ .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
1947
+ .automatic_shrinking = true,
1948
+ .obj_cmpfn = xfs_iunlink_obj_cmpfn,
1949
+};
1950
+
1951
+/*
1952
+ * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
1953
+ * relation is found.
1954
+ */
1955
+static xfs_agino_t
1956
+xfs_iunlink_lookup_backref(
1957
+ struct xfs_perag *pag,
1958
+ xfs_agino_t agino)
1959
+{
1960
+ struct xfs_iunlink *iu;
1961
+
1962
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
1963
+ xfs_iunlink_hash_params);
1964
+ return iu ? iu->iu_agino : NULLAGINO;
1965
+}
1966
+
1967
+/*
1968
+ * Take ownership of an iunlink cache entry and insert it into the hash table.
1969
+ * If successful, the entry will be owned by the cache; if not, it is freed.
1970
+ * Either way, the caller does not own @iu after this call.
1971
+ */
1972
+static int
1973
+xfs_iunlink_insert_backref(
1974
+ struct xfs_perag *pag,
1975
+ struct xfs_iunlink *iu)
1976
+{
1977
+ int error;
1978
+
1979
+ error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
1980
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
1981
+ /*
1982
+ * Fail loudly if there already was an entry because that's a sign of
1983
+ * corruption of in-memory data. Also fail loudly if we see an error
1984
+ * code we didn't anticipate from the rhashtable code. Currently we
1985
+ * only anticipate ENOMEM.
1986
+ */
1987
+ if (error) {
1988
+ WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
1989
+ kmem_free(iu);
1990
+ }
1991
+ /*
1992
+ * Absorb any runtime errors that aren't a result of corruption because
1993
+ * this is a cache and we can always fall back to bucket list scanning.
1994
+ */
1995
+ if (error != 0 && error != -EEXIST)
1996
+ error = 0;
1997
+ return error;
1998
+}
1999
+
2000
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
2001
+static int
2002
+xfs_iunlink_add_backref(
2003
+ struct xfs_perag *pag,
2004
+ xfs_agino_t prev_agino,
2005
+ xfs_agino_t this_agino)
2006
+{
2007
+ struct xfs_iunlink *iu;
2008
+
2009
+ if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
2010
+ return 0;
2011
+
2012
+ iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
2013
+ iu->iu_agino = prev_agino;
2014
+ iu->iu_next_unlinked = this_agino;
2015
+
2016
+ return xfs_iunlink_insert_backref(pag, iu);
2017
+}
2018
+
2019
+/*
2020
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
2021
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
2022
+ * wasn't any such entry then we don't bother.
2023
+ */
2024
+static int
2025
+xfs_iunlink_change_backref(
2026
+ struct xfs_perag *pag,
2027
+ xfs_agino_t agino,
2028
+ xfs_agino_t next_unlinked)
2029
+{
2030
+ struct xfs_iunlink *iu;
2031
+ int error;
2032
+
2033
+ /* Look up the old entry; if there wasn't one then exit. */
2034
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
2035
+ xfs_iunlink_hash_params);
2036
+ if (!iu)
2037
+ return 0;
2038
+
2039
+ /*
2040
+ * Remove the entry. This shouldn't ever return an error, but if we
2041
+ * couldn't remove the old entry we don't want to add it again to the
2042
+ * hash table, and if the entry disappeared on us then someone's
2043
+ * violated the locking rules and we need to fail loudly. Either way
2044
+ * we cannot remove the inode because internal state is or would have
2045
+ * been corrupt.
2046
+ */
2047
+ error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
2048
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
2049
+ if (error)
2050
+ return error;
2051
+
2052
+ /* If there is no new next entry just free our item and return. */
2053
+ if (next_unlinked == NULLAGINO) {
2054
+ kmem_free(iu);
2055
+ return 0;
2056
+ }
2057
+
2058
+ /* Update the entry and re-add it to the hash table. */
2059
+ iu->iu_next_unlinked = next_unlinked;
2060
+ return xfs_iunlink_insert_backref(pag, iu);
2061
+}
2062
+
2063
+/* Set up the in-core predecessor structures. */
2064
+int
2065
+xfs_iunlink_init(
2066
+ struct xfs_perag *pag)
2067
+{
2068
+ return rhashtable_init(&pag->pagi_unlinked_hash,
2069
+ &xfs_iunlink_hash_params);
2070
+}
2071
+
2072
+/* Free the in-core predecessor structures. */
2073
+static void
2074
+xfs_iunlink_free_item(
2075
+ void *ptr,
2076
+ void *arg)
2077
+{
2078
+ struct xfs_iunlink *iu = ptr;
2079
+ bool *freed_anything = arg;
2080
+
2081
+ *freed_anything = true;
2082
+ kmem_free(iu);
2083
+}
2084
+
2085
+void
2086
+xfs_iunlink_destroy(
2087
+ struct xfs_perag *pag)
2088
+{
2089
+ bool freed_anything = false;
2090
+
2091
+ rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
2092
+ xfs_iunlink_free_item, &freed_anything);
2093
+
2094
+ ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
2095
+}
2096
+
2097
+/*
2098
+ * Point the AGI unlinked bucket at an inode and log the results. The caller
2099
+ * is responsible for validating the old value.
2100
+ */
2101
+STATIC int
2102
+xfs_iunlink_update_bucket(
2103
+ struct xfs_trans *tp,
2104
+ xfs_agnumber_t agno,
2105
+ struct xfs_buf *agibp,
2106
+ unsigned int bucket_index,
2107
+ xfs_agino_t new_agino)
2108
+{
2109
+ struct xfs_agi *agi = agibp->b_addr;
2110
+ xfs_agino_t old_value;
2111
+ int offset;
2112
+
2113
+ ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
2114
+
2115
+ old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2116
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
2117
+ old_value, new_agino);
2118
+
2119
+ /*
2120
+ * We should never find the head of the list already set to the value
2121
+ * passed in because either we're adding or removing ourselves from the
2122
+ * head of the list.
2123
+ */
2124
+ if (old_value == new_agino) {
2125
+ xfs_buf_mark_corrupt(agibp);
2126
+ return -EFSCORRUPTED;
2127
+ }
2128
+
2129
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
2130
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
2131
+ (sizeof(xfs_agino_t) * bucket_index);
2132
+ xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
2133
+ return 0;
2134
+}
2135
+
2136
+/* Set an on-disk inode's next_unlinked pointer. */
2137
+STATIC void
2138
+xfs_iunlink_update_dinode(
2139
+ struct xfs_trans *tp,
2140
+ xfs_agnumber_t agno,
2141
+ xfs_agino_t agino,
2142
+ struct xfs_buf *ibp,
2143
+ struct xfs_dinode *dip,
2144
+ struct xfs_imap *imap,
2145
+ xfs_agino_t next_agino)
2146
+{
2147
+ struct xfs_mount *mp = tp->t_mountp;
2148
+ int offset;
2149
+
2150
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
2151
+
2152
+ trace_xfs_iunlink_update_dinode(mp, agno, agino,
2153
+ be32_to_cpu(dip->di_next_unlinked), next_agino);
2154
+
2155
+ dip->di_next_unlinked = cpu_to_be32(next_agino);
2156
+ offset = imap->im_boffset +
2157
+ offsetof(struct xfs_dinode, di_next_unlinked);
2158
+
2159
+ /* need to recalc the inode CRC if appropriate */
2160
+ xfs_dinode_calc_crc(mp, dip);
2161
+ xfs_trans_inode_buf(tp, ibp);
2162
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
2163
+}
2164
+
2165
+/* Set an in-core inode's unlinked pointer and return the old value. */
2166
+STATIC int
2167
+xfs_iunlink_update_inode(
2168
+ struct xfs_trans *tp,
2169
+ struct xfs_inode *ip,
2170
+ xfs_agnumber_t agno,
2171
+ xfs_agino_t next_agino,
2172
+ xfs_agino_t *old_next_agino)
2173
+{
2174
+ struct xfs_mount *mp = tp->t_mountp;
2175
+ struct xfs_dinode *dip;
2176
+ struct xfs_buf *ibp;
2177
+ xfs_agino_t old_value;
2178
+ int error;
2179
+
2180
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
2181
+
2182
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0);
2183
+ if (error)
2184
+ return error;
2185
+
2186
+ /* Make sure the old pointer isn't garbage. */
2187
+ old_value = be32_to_cpu(dip->di_next_unlinked);
2188
+ if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
2189
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
2190
+ sizeof(*dip), __this_address);
2191
+ error = -EFSCORRUPTED;
2192
+ goto out;
2193
+ }
2194
+
2195
+ /*
2196
+ * Since we're updating a linked list, we should never find that the
2197
+ * current pointer is the same as the new value, unless we're
2198
+ * terminating the list.
2199
+ */
2200
+ *old_next_agino = old_value;
2201
+ if (old_value == next_agino) {
2202
+ if (next_agino != NULLAGINO) {
2203
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
2204
+ dip, sizeof(*dip), __this_address);
2205
+ error = -EFSCORRUPTED;
2206
+ }
2207
+ goto out;
2208
+ }
2209
+
2210
+ /* Ok, update the new pointer. */
2211
+ xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
2212
+ ibp, dip, &ip->i_imap, next_agino);
2213
+ return 0;
2214
+out:
2215
+ xfs_trans_brelse(tp, ibp);
2216
+ return error;
2217
+}
2218
+
2219
+/*
19292220 * This is called when the inode's link count has gone to 0 or we are creating
19302221 * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
19312222 *
....@@ -1934,76 +2225,177 @@
19342225 */
19352226 STATIC int
19362227 xfs_iunlink(
1937
- struct xfs_trans *tp,
1938
- struct xfs_inode *ip)
2228
+ struct xfs_trans *tp,
2229
+ struct xfs_inode *ip)
19392230 {
1940
- xfs_mount_t *mp = tp->t_mountp;
1941
- xfs_agi_t *agi;
1942
- xfs_dinode_t *dip;
1943
- xfs_buf_t *agibp;
1944
- xfs_buf_t *ibp;
1945
- xfs_agino_t agino;
1946
- short bucket_index;
1947
- int offset;
1948
- int error;
2231
+ struct xfs_mount *mp = tp->t_mountp;
2232
+ struct xfs_agi *agi;
2233
+ struct xfs_buf *agibp;
2234
+ xfs_agino_t next_agino;
2235
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2236
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2237
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2238
+ int error;
19492239
19502240 ASSERT(VFS_I(ip)->i_nlink == 0);
19512241 ASSERT(VFS_I(ip)->i_mode != 0);
2242
+ trace_xfs_iunlink(ip);
19522243
1953
- /*
1954
- * Get the agi buffer first. It ensures lock ordering
1955
- * on the list.
1956
- */
1957
- error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
2244
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
2245
+ error = xfs_read_agi(mp, tp, agno, &agibp);
19582246 if (error)
19592247 return error;
1960
- agi = XFS_BUF_TO_AGI(agibp);
2248
+ agi = agibp->b_addr;
19612249
19622250 /*
1963
- * Get the index into the agi hash table for the
1964
- * list this inode will go on.
2251
+ * Get the index into the agi hash table for the list this inode will
2252
+ * go on. Make sure the pointer isn't garbage and that this inode
2253
+ * isn't already on the list.
19652254 */
1966
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1967
- ASSERT(agino != 0);
1968
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1969
- ASSERT(agi->agi_unlinked[bucket_index]);
1970
- ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
2255
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2256
+ if (next_agino == agino ||
2257
+ !xfs_verify_agino_or_null(mp, agno, next_agino)) {
2258
+ xfs_buf_mark_corrupt(agibp);
2259
+ return -EFSCORRUPTED;
2260
+ }
19712261
1972
- if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
2262
+ if (next_agino != NULLAGINO) {
2263
+ xfs_agino_t old_agino;
2264
+
19732265 /*
1974
- * There is already another inode in the bucket we need
1975
- * to add ourselves to. Add us at the front of the list.
1976
- * Here we put the head pointer into our next pointer,
1977
- * and then we fall through to point the head at us.
2266
+ * There is already another inode in the bucket, so point this
2267
+ * inode to the current head of the list.
19782268 */
1979
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1980
- 0, 0);
2269
+ error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
2270
+ &old_agino);
2271
+ if (error)
2272
+ return error;
2273
+ ASSERT(old_agino == NULLAGINO);
2274
+
2275
+ /*
2276
+ * agino has been unlinked, add a backref from the next inode
2277
+ * back to agino.
2278
+ */
2279
+ error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
2280
+ if (error)
2281
+ return error;
2282
+ }
2283
+
2284
+ /* Point the head of the list to point to this inode. */
2285
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
2286
+}
2287
+
2288
+/* Return the imap, dinode pointer, and buffer for an inode. */
2289
+STATIC int
2290
+xfs_iunlink_map_ino(
2291
+ struct xfs_trans *tp,
2292
+ xfs_agnumber_t agno,
2293
+ xfs_agino_t agino,
2294
+ struct xfs_imap *imap,
2295
+ struct xfs_dinode **dipp,
2296
+ struct xfs_buf **bpp)
2297
+{
2298
+ struct xfs_mount *mp = tp->t_mountp;
2299
+ int error;
2300
+
2301
+ imap->im_blkno = 0;
2302
+ error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
2303
+ if (error) {
2304
+ xfs_warn(mp, "%s: xfs_imap returned error %d.",
2305
+ __func__, error);
2306
+ return error;
2307
+ }
2308
+
2309
+ error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0);
2310
+ if (error) {
2311
+ xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
2312
+ __func__, error);
2313
+ return error;
2314
+ }
2315
+
2316
+ return 0;
2317
+}
2318
+
2319
+/*
2320
+ * Walk the unlinked chain from @head_agino until we find the inode that
2321
+ * points to @target_agino. Return the inode number, map, dinode pointer,
2322
+ * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
2323
+ *
2324
+ * @tp, @pag, @head_agino, and @target_agino are input parameters.
2325
+ * @agino, @imap, @dipp, and @bpp are all output parameters.
2326
+ *
2327
+ * Do not call this function if @target_agino is the head of the list.
2328
+ */
2329
+STATIC int
2330
+xfs_iunlink_map_prev(
2331
+ struct xfs_trans *tp,
2332
+ xfs_agnumber_t agno,
2333
+ xfs_agino_t head_agino,
2334
+ xfs_agino_t target_agino,
2335
+ xfs_agino_t *agino,
2336
+ struct xfs_imap *imap,
2337
+ struct xfs_dinode **dipp,
2338
+ struct xfs_buf **bpp,
2339
+ struct xfs_perag *pag)
2340
+{
2341
+ struct xfs_mount *mp = tp->t_mountp;
2342
+ xfs_agino_t next_agino;
2343
+ int error;
2344
+
2345
+ ASSERT(head_agino != target_agino);
2346
+ *bpp = NULL;
2347
+
2348
+ /* See if our backref cache can find it faster. */
2349
+ *agino = xfs_iunlink_lookup_backref(pag, target_agino);
2350
+ if (*agino != NULLAGINO) {
2351
+ error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
19812352 if (error)
19822353 return error;
19832354
1984
- ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
1985
- dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1986
- offset = ip->i_imap.im_boffset +
1987
- offsetof(xfs_dinode_t, di_next_unlinked);
2355
+ if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
2356
+ return 0;
19882357
1989
- /* need to recalc the inode CRC if appropriate */
1990
- xfs_dinode_calc_crc(mp, dip);
1991
-
1992
- xfs_trans_inode_buf(tp, ibp);
1993
- xfs_trans_log_buf(tp, ibp, offset,
1994
- (offset + sizeof(xfs_agino_t) - 1));
1995
- xfs_inobp_check(mp, ibp);
2358
+ /*
2359
+ * If we get here the cache contents were corrupt, so drop the
2360
+ * buffer and fall back to walking the bucket list.
2361
+ */
2362
+ xfs_trans_brelse(tp, *bpp);
2363
+ *bpp = NULL;
2364
+ WARN_ON_ONCE(1);
19962365 }
19972366
1998
- /*
1999
- * Point the bucket head pointer at the inode being inserted.
2000
- */
2001
- ASSERT(agino != 0);
2002
- agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
2003
- offset = offsetof(xfs_agi_t, agi_unlinked) +
2004
- (sizeof(xfs_agino_t) * bucket_index);
2005
- xfs_trans_log_buf(tp, agibp, offset,
2006
- (offset + sizeof(xfs_agino_t) - 1));
2367
+ trace_xfs_iunlink_map_prev_fallback(mp, agno);
2368
+
2369
+ /* Otherwise, walk the entire bucket until we find it. */
2370
+ next_agino = head_agino;
2371
+ while (next_agino != target_agino) {
2372
+ xfs_agino_t unlinked_agino;
2373
+
2374
+ if (*bpp)
2375
+ xfs_trans_brelse(tp, *bpp);
2376
+
2377
+ *agino = next_agino;
2378
+ error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
2379
+ bpp);
2380
+ if (error)
2381
+ return error;
2382
+
2383
+ unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
2384
+ /*
2385
+ * Make sure this pointer is valid and isn't an obvious
2386
+ * infinite loop.
2387
+ */
2388
+ if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
2389
+ next_agino == unlinked_agino) {
2390
+ XFS_CORRUPTION_ERROR(__func__,
2391
+ XFS_ERRLEVEL_LOW, mp,
2392
+ *dipp, sizeof(**dipp));
2393
+ error = -EFSCORRUPTED;
2394
+ return error;
2395
+ }
2396
+ next_agino = unlinked_agino;
2397
+ }
2398
+
20072399 return 0;
20082400 }
20092401
....@@ -2012,181 +2404,190 @@
20122404 */
20132405 STATIC int
20142406 xfs_iunlink_remove(
2015
- xfs_trans_t *tp,
2016
- xfs_inode_t *ip)
2407
+ struct xfs_trans *tp,
2408
+ struct xfs_inode *ip)
20172409 {
2018
- xfs_ino_t next_ino;
2019
- xfs_mount_t *mp;
2020
- xfs_agi_t *agi;
2021
- xfs_dinode_t *dip;
2022
- xfs_buf_t *agibp;
2023
- xfs_buf_t *ibp;
2024
- xfs_agnumber_t agno;
2025
- xfs_agino_t agino;
2026
- xfs_agino_t next_agino;
2027
- xfs_buf_t *last_ibp;
2028
- xfs_dinode_t *last_dip = NULL;
2029
- short bucket_index;
2030
- int offset, last_offset = 0;
2031
- int error;
2410
+ struct xfs_mount *mp = tp->t_mountp;
2411
+ struct xfs_agi *agi;
2412
+ struct xfs_buf *agibp;
2413
+ struct xfs_buf *last_ibp;
2414
+ struct xfs_dinode *last_dip = NULL;
2415
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2416
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2417
+ xfs_agino_t next_agino;
2418
+ xfs_agino_t head_agino;
2419
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2420
+ int error;
20322421
2033
- mp = tp->t_mountp;
2034
- agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2422
+ trace_xfs_iunlink_remove(ip);
20352423
2036
- /*
2037
- * Get the agi buffer first. It ensures lock ordering
2038
- * on the list.
2039
- */
2424
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
20402425 error = xfs_read_agi(mp, tp, agno, &agibp);
20412426 if (error)
20422427 return error;
2043
-
2044
- agi = XFS_BUF_TO_AGI(agibp);
2428
+ agi = agibp->b_addr;
20452429
20462430 /*
2047
- * Get the index into the agi hash table for the
2048
- * list this inode will go on.
2431
+ * Get the index into the agi hash table for the list this inode will
2432
+ * go on. Make sure the head pointer isn't garbage.
20492433 */
2050
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2051
- if (!xfs_verify_agino(mp, agno, agino))
2052
- return -EFSCORRUPTED;
2053
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2054
- if (!xfs_verify_agino(mp, agno,
2055
- be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
2434
+ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2435
+ if (!xfs_verify_agino(mp, agno, head_agino)) {
20562436 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
20572437 agi, sizeof(*agi));
20582438 return -EFSCORRUPTED;
20592439 }
20602440
2061
- if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
2062
- /*
2063
- * We're at the head of the list. Get the inode's on-disk
2064
- * buffer to see if there is anyone after us on the list.
2065
- * Only modify our next pointer if it is not already NULLAGINO.
2066
- * This saves us the overhead of dealing with the buffer when
2067
- * there is no need to change it.
2068
- */
2069
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2070
- 0, 0);
2071
- if (error) {
2072
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
2073
- __func__, error);
2441
+ /*
2442
+ * Set our inode's next_unlinked pointer to NULL and then return
2443
+ * the old pointer value so that we can update whatever was previous
2444
+ * to us in the list to point to whatever was next in the list.
2445
+ */
2446
+ error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
2447
+ if (error)
2448
+ return error;
2449
+
2450
+ /*
2451
+ * If there was a backref pointing from the next inode back to this
2452
+ * one, remove it because we've removed this inode from the list.
2453
+ *
2454
+ * Later, if this inode was in the middle of the list we'll update
2455
+ * this inode's backref to point from the next inode.
2456
+ */
2457
+ if (next_agino != NULLAGINO) {
2458
+ error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
2459
+ NULLAGINO);
2460
+ if (error)
20742461 return error;
2075
- }
2076
- next_agino = be32_to_cpu(dip->di_next_unlinked);
2077
- ASSERT(next_agino != 0);
2078
- if (next_agino != NULLAGINO) {
2079
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2080
- offset = ip->i_imap.im_boffset +
2081
- offsetof(xfs_dinode_t, di_next_unlinked);
2082
-
2083
- /* need to recalc the inode CRC if appropriate */
2084
- xfs_dinode_calc_crc(mp, dip);
2085
-
2086
- xfs_trans_inode_buf(tp, ibp);
2087
- xfs_trans_log_buf(tp, ibp, offset,
2088
- (offset + sizeof(xfs_agino_t) - 1));
2089
- xfs_inobp_check(mp, ibp);
2090
- } else {
2091
- xfs_trans_brelse(tp, ibp);
2092
- }
2093
- /*
2094
- * Point the bucket head pointer at the next inode.
2095
- */
2096
- ASSERT(next_agino != 0);
2097
- ASSERT(next_agino != agino);
2098
- agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
2099
- offset = offsetof(xfs_agi_t, agi_unlinked) +
2100
- (sizeof(xfs_agino_t) * bucket_index);
2101
- xfs_trans_log_buf(tp, agibp, offset,
2102
- (offset + sizeof(xfs_agino_t) - 1));
2103
- } else {
2104
- /*
2105
- * We need to search the list for the inode being freed.
2106
- */
2107
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2108
- last_ibp = NULL;
2109
- while (next_agino != agino) {
2110
- struct xfs_imap imap;
2111
-
2112
- if (last_ibp)
2113
- xfs_trans_brelse(tp, last_ibp);
2114
-
2115
- imap.im_blkno = 0;
2116
- next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
2117
-
2118
- error = xfs_imap(mp, tp, next_ino, &imap, 0);
2119
- if (error) {
2120
- xfs_warn(mp,
2121
- "%s: xfs_imap returned error %d.",
2122
- __func__, error);
2123
- return error;
2124
- }
2125
-
2126
- error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
2127
- &last_ibp, 0, 0);
2128
- if (error) {
2129
- xfs_warn(mp,
2130
- "%s: xfs_imap_to_bp returned error %d.",
2131
- __func__, error);
2132
- return error;
2133
- }
2134
-
2135
- last_offset = imap.im_boffset;
2136
- next_agino = be32_to_cpu(last_dip->di_next_unlinked);
2137
- if (!xfs_verify_agino(mp, agno, next_agino)) {
2138
- XFS_CORRUPTION_ERROR(__func__,
2139
- XFS_ERRLEVEL_LOW, mp,
2140
- last_dip, sizeof(*last_dip));
2141
- return -EFSCORRUPTED;
2142
- }
2143
- }
2144
-
2145
- /*
2146
- * Now last_ibp points to the buffer previous to us on the
2147
- * unlinked list. Pull us from the list.
2148
- */
2149
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2150
- 0, 0);
2151
- if (error) {
2152
- xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
2153
- __func__, error);
2154
- return error;
2155
- }
2156
- next_agino = be32_to_cpu(dip->di_next_unlinked);
2157
- ASSERT(next_agino != 0);
2158
- ASSERT(next_agino != agino);
2159
- if (next_agino != NULLAGINO) {
2160
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2161
- offset = ip->i_imap.im_boffset +
2162
- offsetof(xfs_dinode_t, di_next_unlinked);
2163
-
2164
- /* need to recalc the inode CRC if appropriate */
2165
- xfs_dinode_calc_crc(mp, dip);
2166
-
2167
- xfs_trans_inode_buf(tp, ibp);
2168
- xfs_trans_log_buf(tp, ibp, offset,
2169
- (offset + sizeof(xfs_agino_t) - 1));
2170
- xfs_inobp_check(mp, ibp);
2171
- } else {
2172
- xfs_trans_brelse(tp, ibp);
2173
- }
2174
- /*
2175
- * Point the previous inode on the list to the next inode.
2176
- */
2177
- last_dip->di_next_unlinked = cpu_to_be32(next_agino);
2178
- ASSERT(next_agino != 0);
2179
- offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2180
-
2181
- /* need to recalc the inode CRC if appropriate */
2182
- xfs_dinode_calc_crc(mp, last_dip);
2183
-
2184
- xfs_trans_inode_buf(tp, last_ibp);
2185
- xfs_trans_log_buf(tp, last_ibp, offset,
2186
- (offset + sizeof(xfs_agino_t) - 1));
2187
- xfs_inobp_check(mp, last_ibp);
21882462 }
2189
- return 0;
2463
+
2464
+ if (head_agino != agino) {
2465
+ struct xfs_imap imap;
2466
+ xfs_agino_t prev_agino;
2467
+
2468
+ /* We need to search the list for the inode being freed. */
2469
+ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
2470
+ &prev_agino, &imap, &last_dip, &last_ibp,
2471
+ agibp->b_pag);
2472
+ if (error)
2473
+ return error;
2474
+
2475
+ /* Point the previous inode on the list to the next inode. */
2476
+ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
2477
+ last_dip, &imap, next_agino);
2478
+
2479
+ /*
2480
+ * Now we deal with the backref for this inode. If this inode
2481
+ * pointed at a real inode, change the backref that pointed to
2482
+ * us to point to our old next. If this inode was the end of
2483
+ * the list, delete the backref that pointed to us. Note that
2484
+ * change_backref takes care of deleting the backref if
2485
+ * next_agino is NULLAGINO.
2486
+ */
2487
+ return xfs_iunlink_change_backref(agibp->b_pag, agino,
2488
+ next_agino);
2489
+ }
2490
+
2491
+ /* Point the head of the list to the next unlinked inode. */
2492
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
2493
+ next_agino);
2494
+}
2495
+
2496
+/*
2497
+ * Look up the inode number specified and if it is not already marked XFS_ISTALE
2498
+ * mark it stale. We should only find clean inodes in this lookup that aren't
2499
+ * already stale.
2500
+ */
2501
+static void
2502
+xfs_ifree_mark_inode_stale(
2503
+ struct xfs_buf *bp,
2504
+ struct xfs_inode *free_ip,
2505
+ xfs_ino_t inum)
2506
+{
2507
+ struct xfs_mount *mp = bp->b_mount;
2508
+ struct xfs_perag *pag = bp->b_pag;
2509
+ struct xfs_inode_log_item *iip;
2510
+ struct xfs_inode *ip;
2511
+
2512
+retry:
2513
+ rcu_read_lock();
2514
+ ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
2515
+
2516
+ /* Inode not in memory, nothing to do */
2517
+ if (!ip) {
2518
+ rcu_read_unlock();
2519
+ return;
2520
+ }
2521
+
2522
+ /*
2523
+ * because this is an RCU protected lookup, we could find a recently
2524
+ * freed or even reallocated inode during the lookup. We need to check
2525
+ * under the i_flags_lock for a valid inode here. Skip it if it is not
2526
+ * valid, the wrong inode or stale.
2527
+ */
2528
+ spin_lock(&ip->i_flags_lock);
2529
+ if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
2530
+ goto out_iflags_unlock;
2531
+
2532
+ /*
2533
+ * Don't try to lock/unlock the current inode, but we _cannot_ skip the
2534
+ * other inodes that we did not find in the list attached to the buffer
2535
+ * and are not already marked stale. If we can't lock it, back off and
2536
+ * retry.
2537
+ */
2538
+ if (ip != free_ip) {
2539
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2540
+ spin_unlock(&ip->i_flags_lock);
2541
+ rcu_read_unlock();
2542
+ delay(1);
2543
+ goto retry;
2544
+ }
2545
+ }
2546
+ ip->i_flags |= XFS_ISTALE;
2547
+
2548
+ /*
2549
+ * If the inode is flushing, it is already attached to the buffer. All
2550
+ * we needed to do here is mark the inode stale so buffer IO completion
2551
+ * will remove it from the AIL.
2552
+ */
2553
+ iip = ip->i_itemp;
2554
+ if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
2555
+ ASSERT(!list_empty(&iip->ili_item.li_bio_list));
2556
+ ASSERT(iip->ili_last_fields);
2557
+ goto out_iunlock;
2558
+ }
2559
+
2560
+ /*
2561
+ * Inodes not attached to the buffer can be released immediately.
2562
+ * Everything else has to go through xfs_iflush_abort() on journal
2563
+ * commit as the flock synchronises removal of the inode from the
2564
+ * cluster buffer against inode reclaim.
2565
+ */
2566
+ if (!iip || list_empty(&iip->ili_item.li_bio_list))
2567
+ goto out_iunlock;
2568
+
2569
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
2570
+ spin_unlock(&ip->i_flags_lock);
2571
+ rcu_read_unlock();
2572
+
2573
+ /* we have a dirty inode in memory that has not yet been flushed. */
2574
+ spin_lock(&iip->ili_lock);
2575
+ iip->ili_last_fields = iip->ili_fields;
2576
+ iip->ili_fields = 0;
2577
+ iip->ili_fsync_fields = 0;
2578
+ spin_unlock(&iip->ili_lock);
2579
+ ASSERT(iip->ili_last_fields);
2580
+
2581
+ if (ip != free_ip)
2582
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
2583
+ return;
2584
+
2585
+out_iunlock:
2586
+ if (ip != free_ip)
2587
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
2588
+out_iflags_unlock:
2589
+ spin_unlock(&ip->i_flags_lock);
2590
+ rcu_read_unlock();
21902591 }
21912592
21922593 /*
....@@ -2196,31 +2597,23 @@
21962597 */
21972598 STATIC int
21982599 xfs_ifree_cluster(
2199
- xfs_inode_t *free_ip,
2200
- xfs_trans_t *tp,
2600
+ struct xfs_inode *free_ip,
2601
+ struct xfs_trans *tp,
22012602 struct xfs_icluster *xic)
22022603 {
2203
- xfs_mount_t *mp = free_ip->i_mount;
2204
- int blks_per_cluster;
2205
- int inodes_per_cluster;
2604
+ struct xfs_mount *mp = free_ip->i_mount;
2605
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
2606
+ struct xfs_buf *bp;
2607
+ xfs_daddr_t blkno;
2608
+ xfs_ino_t inum = xic->first_ino;
22062609 int nbufs;
22072610 int i, j;
22082611 int ioffset;
2209
- xfs_daddr_t blkno;
2210
- xfs_buf_t *bp;
2211
- xfs_inode_t *ip;
2212
- xfs_inode_log_item_t *iip;
2213
- struct xfs_log_item *lip;
2214
- struct xfs_perag *pag;
2215
- xfs_ino_t inum;
2612
+ int error;
22162613
2217
- inum = xic->first_ino;
2218
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2219
- blks_per_cluster = xfs_icluster_size_fsb(mp);
2220
- inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
2221
- nbufs = mp->m_ialloc_blks / blks_per_cluster;
2614
+ nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
22222615
2223
- for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
2616
+ for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
22242617 /*
22252618 * The allocation bitmap tells us which inodes of the chunk were
22262619 * physically allocated. Skip the cluster if an inode falls into
....@@ -2228,7 +2621,7 @@
22282621 */
22292622 ioffset = inum - xic->first_ino;
22302623 if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
2231
- ASSERT(ioffset % inodes_per_cluster == 0);
2624
+ ASSERT(ioffset % igeo->inodes_per_cluster == 0);
22322625 continue;
22332626 }
22342627
....@@ -2237,18 +2630,18 @@
22372630
22382631 /*
22392632 * We obtain and lock the backing buffer first in the process
2240
- * here, as we have to ensure that any dirty inode that we
2241
- * can't get the flush lock on is attached to the buffer.
2633
+ * here to ensure dirty inodes attached to the buffer remain in
2634
+ * the flushing state while we mark them stale.
2635
+ *
22422636 * If we scan the in-memory inodes first, then buffer IO can
22432637 * complete before we get a lock on it, and hence we may fail
22442638 * to mark all the active inodes on the buffer stale.
22452639 */
2246
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2247
- mp->m_bsize * blks_per_cluster,
2248
- XBF_UNMAPPED);
2249
-
2250
- if (!bp)
2251
- return -ENOMEM;
2640
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2641
+ mp->m_bsize * igeo->blocks_per_cluster,
2642
+ XBF_UNMAPPED, &bp);
2643
+ if (error)
2644
+ return error;
22522645
22532646 /*
22542647 * This buffer may not have been correctly initialised as we
....@@ -2259,159 +2652,30 @@
22592652 * want it to fail. We can acheive this by adding a write
22602653 * verifier to the buffer.
22612654 */
2262
- bp->b_ops = &xfs_inode_buf_ops;
2655
+ bp->b_ops = &xfs_inode_buf_ops;
22632656
22642657 /*
2265
- * Walk the inodes already attached to the buffer and mark them
2266
- * stale. These will all have the flush locks held, so an
2267
- * in-memory inode walk can't lock them. By marking them all
2268
- * stale first, we will not attempt to lock them in the loop
2269
- * below as the XFS_ISTALE flag will be set.
2658
+ * Now we need to set all the cached clean inodes as XFS_ISTALE,
2659
+ * too. This requires lookups, and will skip inodes that we've
2660
+ * already marked XFS_ISTALE.
22702661 */
2271
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
2272
- if (lip->li_type == XFS_LI_INODE) {
2273
- iip = (xfs_inode_log_item_t *)lip;
2274
- ASSERT(iip->ili_logged == 1);
2275
- lip->li_cb = xfs_istale_done;
2276
- xfs_trans_ail_copy_lsn(mp->m_ail,
2277
- &iip->ili_flush_lsn,
2278
- &iip->ili_item.li_lsn);
2279
- xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2280
- }
2281
- }
2282
-
2283
-
2284
- /*
2285
- * For each inode in memory attempt to add it to the inode
2286
- * buffer and set it up for being staled on buffer IO
2287
- * completion. This is safe as we've locked out tail pushing
2288
- * and flushing by locking the buffer.
2289
- *
2290
- * We have already marked every inode that was part of a
2291
- * transaction stale above, which means there is no point in
2292
- * even trying to lock them.
2293
- */
2294
- for (i = 0; i < inodes_per_cluster; i++) {
2295
-retry:
2296
- rcu_read_lock();
2297
- ip = radix_tree_lookup(&pag->pag_ici_root,
2298
- XFS_INO_TO_AGINO(mp, (inum + i)));
2299
-
2300
- /* Inode not in memory, nothing to do */
2301
- if (!ip) {
2302
- rcu_read_unlock();
2303
- continue;
2304
- }
2305
-
2306
- /*
2307
- * because this is an RCU protected lookup, we could
2308
- * find a recently freed or even reallocated inode
2309
- * during the lookup. We need to check under the
2310
- * i_flags_lock for a valid inode here. Skip it if it
2311
- * is not valid, the wrong inode or stale.
2312
- */
2313
- spin_lock(&ip->i_flags_lock);
2314
- if (ip->i_ino != inum + i ||
2315
- __xfs_iflags_test(ip, XFS_ISTALE)) {
2316
- spin_unlock(&ip->i_flags_lock);
2317
- rcu_read_unlock();
2318
- continue;
2319
- }
2320
- spin_unlock(&ip->i_flags_lock);
2321
-
2322
- /*
2323
- * Don't try to lock/unlock the current inode, but we
2324
- * _cannot_ skip the other inodes that we did not find
2325
- * in the list attached to the buffer and are not
2326
- * already marked stale. If we can't lock it, back off
2327
- * and retry.
2328
- */
2329
- if (ip != free_ip) {
2330
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2331
- rcu_read_unlock();
2332
- delay(1);
2333
- goto retry;
2334
- }
2335
-
2336
- /*
2337
- * Check the inode number again in case we're
2338
- * racing with freeing in xfs_reclaim_inode().
2339
- * See the comments in that function for more
2340
- * information as to why the initial check is
2341
- * not sufficient.
2342
- */
2343
- if (ip->i_ino != inum + i) {
2344
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
2345
- rcu_read_unlock();
2346
- continue;
2347
- }
2348
- }
2349
- rcu_read_unlock();
2350
-
2351
- xfs_iflock(ip);
2352
- xfs_iflags_set(ip, XFS_ISTALE);
2353
-
2354
- /*
2355
- * we don't need to attach clean inodes or those only
2356
- * with unlogged changes (which we throw away, anyway).
2357
- */
2358
- iip = ip->i_itemp;
2359
- if (!iip || xfs_inode_clean(ip)) {
2360
- ASSERT(ip != free_ip);
2361
- xfs_ifunlock(ip);
2362
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
2363
- continue;
2364
- }
2365
-
2366
- iip->ili_last_fields = iip->ili_fields;
2367
- iip->ili_fields = 0;
2368
- iip->ili_fsync_fields = 0;
2369
- iip->ili_logged = 1;
2370
- xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2371
- &iip->ili_item.li_lsn);
2372
-
2373
- xfs_buf_attach_iodone(bp, xfs_istale_done,
2374
- &iip->ili_item);
2375
-
2376
- if (ip != free_ip)
2377
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
2378
- }
2662
+ for (i = 0; i < igeo->inodes_per_cluster; i++)
2663
+ xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
23792664
23802665 xfs_trans_stale_inode_buf(tp, bp);
23812666 xfs_trans_binval(tp, bp);
23822667 }
2383
-
2384
- xfs_perag_put(pag);
23852668 return 0;
23862669 }
23872670
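Editor's note: the hunk above replaces the open-coded walk of in-memory inodes with one xfs_ifree_mark_inode_stale() call per inode in the cluster, and sizes the cluster buffer as mp->m_bsize * igeo->blocks_per_cluster. As a rough illustration of the cluster arithmetic (mirroring the mask the old xfs_iflush_cluster() used), here is a minimal sketch assuming igeo->inodes_per_cluster is a power of two; the helper name is the editor's and is not part of XFS:

static xfs_agino_t
example_cluster_start_agino(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_ino_geometry	*igeo = &mp->m_ino_geo;

	/* Mask off the low bits to land on the first inode of the cluster. */
	return XFS_INO_TO_AGINO(mp, ino) & ~(igeo->inodes_per_cluster - 1);
}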
23882671 /*
2389
- * Free any local-format buffers sitting around before we reset to
2390
- * extents format.
2391
- */
2392
-static inline void
2393
-xfs_ifree_local_data(
2394
- struct xfs_inode *ip,
2395
- int whichfork)
2396
-{
2397
- struct xfs_ifork *ifp;
2398
-
2399
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
2400
- return;
2401
-
2402
- ifp = XFS_IFORK_PTR(ip, whichfork);
2403
- xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
2404
-}
2405
-
2406
-/*
2407
- * This is called to return an inode to the inode free list.
2408
- * The inode should already be truncated to 0 length and have
2409
- * no pages associated with it. This routine also assumes that
2410
- * the inode is already a part of the transaction.
2672
+ * This is called to return an inode to the inode free list. The inode should
2673
+ * already be truncated to 0 length and have no pages associated with it. This
2674
+ * routine also assumes that the inode is already a part of the transaction.
24112675 *
2412
- * The on-disk copy of the inode will have been added to the list
2413
- * of unlinked inodes in the AGI. We need to remove the inode from
2414
- * that list atomically with respect to freeing it here.
2676
+ * The on-disk copy of the inode will have been added to the list of unlinked
2677
+ * inodes in the AGI. We need to remove the inode from that list atomically with
2678
+ * respect to freeing it here.
24152679 */
24162680 int
24172681 xfs_ifree(
....@@ -2420,38 +2684,50 @@
24202684 {
24212685 int error;
24222686 struct xfs_icluster xic = { 0 };
2687
+ struct xfs_inode_log_item *iip = ip->i_itemp;
24232688
24242689 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
24252690 ASSERT(VFS_I(ip)->i_nlink == 0);
2426
- ASSERT(ip->i_d.di_nextents == 0);
2427
- ASSERT(ip->i_d.di_anextents == 0);
2691
+ ASSERT(ip->i_df.if_nextents == 0);
24282692 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
24292693 ASSERT(ip->i_d.di_nblocks == 0);
24302694
24312695 /*
2432
- * Pull the on-disk inode from the AGI unlinked list.
2696
+ * Free the inode first so that we guarantee that the AGI lock is going
2697
+ * to be taken before we remove the inode from the unlinked list. This
2698
+ * makes the AGI lock -> unlinked list modification order the same as
2699
+ * used in O_TMPFILE creation.
24332700 */
2434
- error = xfs_iunlink_remove(tp, ip);
2435
- if (error)
2436
- return error;
2437
-
24382701 error = xfs_difree(tp, ip->i_ino, &xic);
24392702 if (error)
24402703 return error;
24412704
2442
- xfs_ifree_local_data(ip, XFS_DATA_FORK);
2443
- xfs_ifree_local_data(ip, XFS_ATTR_FORK);
2705
+ error = xfs_iunlink_remove(tp, ip);
2706
+ if (error)
2707
+ return error;
2708
+
2709
+ /*
2710
+ * Free any local-format data sitting around before we reset the
2711
+ * data fork to extents format. Note that the attr fork data has
2712
+ * already been freed by xfs_attr_inactive.
2713
+ */
2714
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
2715
+ kmem_free(ip->i_df.if_u1.if_data);
2716
+ ip->i_df.if_u1.if_data = NULL;
2717
+ ip->i_df.if_bytes = 0;
2718
+ }
24442719
24452720 VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
24462721 ip->i_d.di_flags = 0;
2447
- ip->i_d.di_flags2 = 0;
2722
+ ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2;
24482723 ip->i_d.di_dmevmask = 0;
24492724 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
2450
- ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2451
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2725
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
24522726
24532727 /* Don't attempt to replay owner changes for a deleted inode */
2454
- ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
2728
+ spin_lock(&iip->ili_lock);
2729
+ iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
2730
+ spin_unlock(&iip->ili_lock);
24552731
24562732 /*
24572733 * Bump the generation count so no one will be confused
....@@ -2480,7 +2756,7 @@
24802756 trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
24812757
24822758 /* Give the log a push to start the unpinning I/O */
2483
- xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL);
2759
+ xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
24842760
24852761 }
24862762
....@@ -2769,9 +3045,7 @@
27693045 error = xfs_droplink(tp, dp2);
27703046 if (error)
27713047 goto out_trans_abort;
2772
- error = xfs_bumplink(tp, dp1);
2773
- if (error)
2774
- goto out_trans_abort;
3048
+ xfs_bumplink(tp, dp1);
27753049 }
27763050
27773051 /*
....@@ -2795,9 +3069,7 @@
27953069 error = xfs_droplink(tp, dp1);
27963070 if (error)
27973071 goto out_trans_abort;
2798
- error = xfs_bumplink(tp, dp2);
2799
- if (error)
2800
- goto out_trans_abort;
3072
+ xfs_bumplink(tp, dp2);
28013073 }
28023074
28033075 /*
....@@ -2835,7 +3107,7 @@
28353107 /*
28363108 * xfs_rename_alloc_whiteout()
28373109 *
2838
- * Return a referenced, unlinked, unlocked inode that that can be used as a
3110
+ * Return a referenced, unlinked, unlocked inode that can be used as a
28393111 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
28403112 * crash between allocating the inode and linking it into the rename transaction
28413113 * recovery will free the inode and we won't leak it.
....@@ -2882,6 +3154,7 @@
28823154 struct xfs_trans *tp;
28833155 struct xfs_inode *wip = NULL; /* whiteout inode */
28843156 struct xfs_inode *inodes[__XFS_SORT_INODES];
3157
+ int i;
28853158 int num_inodes = __XFS_SORT_INODES;
28863159 bool new_parent = (src_dp != target_dp);
28873160 bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
....@@ -2899,7 +3172,6 @@
28993172 * appropriately.
29003173 */
29013174 if (flags & RENAME_WHITEOUT) {
2902
- ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
29033175 error = xfs_rename_alloc_whiteout(target_dp, &wip);
29043176 if (error)
29053177 return error;
....@@ -2956,7 +3228,7 @@
29563228 * tree quota mechanism would be circumvented.
29573229 */
29583230 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2959
- (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
3231
+ target_dp->i_d.di_projid != src_ip->i_d.di_projid)) {
29603232 error = -EXDEV;
29613233 goto out_trans_cancel;
29623234 }
....@@ -2995,6 +3267,30 @@
29953267 }
29963268
29973269 /*
3270
+ * Lock the AGI buffers we need to handle bumping the nlink of the
3271
+ * whiteout inode off the unlinked list and to handle dropping the
3272
+ * nlink of the target inode. Per locking order rules, do this in
3273
+ * increasing AG order and before directory block allocation tries to
3274
+ * grab AGFs because we grab AGIs before AGFs.
3275
+ *
3276
+ * The (vfs) caller must ensure that if src is a directory then
3277
+ * target_ip is either null or an empty directory.
3278
+ */
3279
+ for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
3280
+ if (inodes[i] == wip ||
3281
+ (inodes[i] == target_ip &&
3282
+ (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
3283
+ struct xfs_buf *bp;
3284
+ xfs_agnumber_t agno;
3285
+
3286
+ agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
3287
+ error = xfs_read_agi(mp, tp, agno, &bp);
3288
+ if (error)
3289
+ goto out_trans_cancel;
3290
+ }
3291
+ }
3292
+
3293
+ /*
29983294 * Directory entry creation below may acquire the AGF. Remove
29993295 * the whiteout from the unlinked list first to preserve correct
30003296 * AGI/AGF locking order. This dirties the transaction so failures
....@@ -3013,7 +3309,6 @@
30133309 goto out_trans_cancel;
30143310
30153311 xfs_bumplink(tp, wip);
3016
- xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
30173312 VFS_I(wip)->i_state &= ~I_LINKABLE;
30183313 }
30193314
....@@ -3035,9 +3330,7 @@
30353330 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
30363331
30373332 if (new_parent && src_is_directory) {
3038
- error = xfs_bumplink(tp, target_dp);
3039
- if (error)
3040
- goto out_trans_cancel;
3333
+ xfs_bumplink(tp, target_dp);
30413334 }
30423335 } else { /* target_ip != NULL */
30433336 /*
....@@ -3148,373 +3441,76 @@
31483441 return error;
31493442 }
31503443
3151
-STATIC int
3152
-xfs_iflush_cluster(
3153
- struct xfs_inode *ip,
3154
- struct xfs_buf *bp)
3155
-{
3156
- struct xfs_mount *mp = ip->i_mount;
3157
- struct xfs_perag *pag;
3158
- unsigned long first_index, mask;
3159
- unsigned long inodes_per_cluster;
3160
- int cilist_size;
3161
- struct xfs_inode **cilist;
3162
- struct xfs_inode *cip;
3163
- int nr_found;
3164
- int clcount = 0;
3165
- int i;
3166
-
3167
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
3168
-
3169
- inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
3170
- cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
3171
- cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
3172
- if (!cilist)
3173
- goto out_put;
3174
-
3175
- mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
3176
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
3177
- rcu_read_lock();
3178
- /* really need a gang lookup range call here */
3179
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
3180
- first_index, inodes_per_cluster);
3181
- if (nr_found == 0)
3182
- goto out_free;
3183
-
3184
- for (i = 0; i < nr_found; i++) {
3185
- cip = cilist[i];
3186
- if (cip == ip)
3187
- continue;
3188
-
3189
- /*
3190
- * because this is an RCU protected lookup, we could find a
3191
- * recently freed or even reallocated inode during the lookup.
3192
- * We need to check under the i_flags_lock for a valid inode
3193
- * here. Skip it if it is not valid or the wrong inode.
3194
- */
3195
- spin_lock(&cip->i_flags_lock);
3196
- if (!cip->i_ino ||
3197
- __xfs_iflags_test(cip, XFS_ISTALE)) {
3198
- spin_unlock(&cip->i_flags_lock);
3199
- continue;
3200
- }
3201
-
3202
- /*
3203
- * Once we fall off the end of the cluster, no point checking
3204
- * any more inodes in the list because they will also all be
3205
- * outside the cluster.
3206
- */
3207
- if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
3208
- spin_unlock(&cip->i_flags_lock);
3209
- break;
3210
- }
3211
- spin_unlock(&cip->i_flags_lock);
3212
-
3213
- /*
3214
- * Do an un-protected check to see if the inode is dirty and
3215
- * is a candidate for flushing. These checks will be repeated
3216
- * later after the appropriate locks are acquired.
3217
- */
3218
- if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
3219
- continue;
3220
-
3221
- /*
3222
- * Try to get locks. If any are unavailable or it is pinned,
3223
- * then this inode cannot be flushed and is skipped.
3224
- */
3225
-
3226
- if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
3227
- continue;
3228
- if (!xfs_iflock_nowait(cip)) {
3229
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
3230
- continue;
3231
- }
3232
- if (xfs_ipincount(cip)) {
3233
- xfs_ifunlock(cip);
3234
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
3235
- continue;
3236
- }
3237
-
3238
-
3239
- /*
3240
- * Check the inode number again, just to be certain we are not
3241
- * racing with freeing in xfs_reclaim_inode(). See the comments
3242
- * in that function for more information as to why the initial
3243
- * check is not sufficient.
3244
- */
3245
- if (!cip->i_ino) {
3246
- xfs_ifunlock(cip);
3247
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
3248
- continue;
3249
- }
3250
-
3251
- /*
3252
- * arriving here means that this inode can be flushed. First
3253
- * re-check that it's dirty before flushing.
3254
- */
3255
- if (!xfs_inode_clean(cip)) {
3256
- int error;
3257
- error = xfs_iflush_int(cip, bp);
3258
- if (error) {
3259
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
3260
- goto cluster_corrupt_out;
3261
- }
3262
- clcount++;
3263
- } else {
3264
- xfs_ifunlock(cip);
3265
- }
3266
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
3267
- }
3268
-
3269
- if (clcount) {
3270
- XFS_STATS_INC(mp, xs_icluster_flushcnt);
3271
- XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3272
- }
3273
-
3274
-out_free:
3275
- rcu_read_unlock();
3276
- kmem_free(cilist);
3277
-out_put:
3278
- xfs_perag_put(pag);
3279
- return 0;
3280
-
3281
-
3282
-cluster_corrupt_out:
3283
- /*
3284
- * Corruption detected in the clustering loop. Invalidate the
3285
- * inode buffer and shut down the filesystem.
3286
- */
3287
- rcu_read_unlock();
3288
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3289
-
3290
- /*
3291
- * We'll always have an inode attached to the buffer for completion
3292
- * process by the time we are called from xfs_iflush(). Hence we have
3293
- * always need to do IO completion processing to abort the inodes
3294
- * attached to the buffer. handle them just like the shutdown case in
3295
- * xfs_buf_submit().
3296
- */
3297
- ASSERT(bp->b_iodone);
3298
- bp->b_flags &= ~XBF_DONE;
3299
- xfs_buf_stale(bp);
3300
- xfs_buf_ioerror(bp, -EIO);
3301
- xfs_buf_ioend(bp);
3302
-
3303
- /* abort the corrupt inode, as it was not attached to the buffer */
3304
- xfs_iflush_abort(cip, false);
3305
- kmem_free(cilist);
3306
- xfs_perag_put(pag);
3307
- return -EFSCORRUPTED;
3308
-}
3309
-
3310
-/*
3311
- * Flush dirty inode metadata into the backing buffer.
3312
- *
3313
- * The caller must have the inode lock and the inode flush lock held. The
3314
- * inode lock will still be held upon return to the caller, and the inode
3315
- * flush lock will be released after the inode has reached the disk.
3316
- *
3317
- * The caller must write out the buffer returned in *bpp and release it.
3318
- */
3319
-int
3444
+static int
33203445 xfs_iflush(
3321
- struct xfs_inode *ip,
3322
- struct xfs_buf **bpp)
3323
-{
3324
- struct xfs_mount *mp = ip->i_mount;
3325
- struct xfs_buf *bp = NULL;
3326
- struct xfs_dinode *dip;
3327
- int error;
3328
-
3329
- XFS_STATS_INC(mp, xs_iflush_count);
3330
-
3331
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3332
- ASSERT(xfs_isiflocked(ip));
3333
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3334
- ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3335
-
3336
- *bpp = NULL;
3337
-
3338
- xfs_iunpin_wait(ip);
3339
-
3340
- /*
3341
- * For stale inodes we cannot rely on the backing buffer remaining
3342
- * stale in cache for the remaining life of the stale inode and so
3343
- * xfs_imap_to_bp() below may give us a buffer that no longer contains
3344
- * inodes below. We have to check this after ensuring the inode is
3345
- * unpinned so that it is safe to reclaim the stale inode after the
3346
- * flush call.
3347
- */
3348
- if (xfs_iflags_test(ip, XFS_ISTALE)) {
3349
- xfs_ifunlock(ip);
3350
- return 0;
3351
- }
3352
-
3353
- /*
3354
- * This may have been unpinned because the filesystem is shutting
3355
- * down forcibly. If that's the case we must not write this inode
3356
- * to disk, because the log record didn't make it to disk.
3357
- *
3358
- * We also have to remove the log item from the AIL in this case,
3359
- * as we wait for an empty AIL as part of the unmount process.
3360
- */
3361
- if (XFS_FORCED_SHUTDOWN(mp)) {
3362
- error = -EIO;
3363
- goto abort_out;
3364
- }
3365
-
3366
- /*
3367
- * Get the buffer containing the on-disk inode. We are doing a try-lock
3368
- * operation here, so we may get an EAGAIN error. In that case, we
3369
- * simply want to return with the inode still dirty.
3370
- *
3371
- * If we get any other error, we effectively have a corruption situation
3372
- * and we cannot flush the inode, so we treat it the same as failing
3373
- * xfs_iflush_int().
3374
- */
3375
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
3376
- 0);
3377
- if (error == -EAGAIN) {
3378
- xfs_ifunlock(ip);
3379
- return error;
3380
- }
3381
- if (error)
3382
- goto corrupt_out;
3383
-
3384
- /*
3385
- * First flush out the inode that xfs_iflush was called with.
3386
- */
3387
- error = xfs_iflush_int(ip, bp);
3388
- if (error)
3389
- goto corrupt_out;
3390
-
3391
- /*
3392
- * If the buffer is pinned then push on the log now so we won't
3393
- * get stuck waiting in the write for too long.
3394
- */
3395
- if (xfs_buf_ispinned(bp))
3396
- xfs_log_force(mp, 0);
3397
-
3398
- /*
3399
- * inode clustering: try to gather other inodes into this write
3400
- *
3401
- * Note: Any error during clustering will result in the filesystem
3402
- * being shut down and completion callbacks run on the cluster buffer.
3403
- * As we have already flushed and attached this inode to the buffer,
3404
- * it has already been aborted and released by xfs_iflush_cluster() and
3405
- * so we have no further error handling to do here.
3406
- */
3407
- error = xfs_iflush_cluster(ip, bp);
3408
- if (error)
3409
- return error;
3410
-
3411
- *bpp = bp;
3412
- return 0;
3413
-
3414
-corrupt_out:
3415
- if (bp)
3416
- xfs_buf_relse(bp);
3417
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3418
-abort_out:
3419
- /* abort the corrupt inode, as it was not attached to the buffer */
3420
- xfs_iflush_abort(ip, false);
3421
- return error;
3422
-}
3423
-
3424
-/*
3425
- * If there are inline format data / attr forks attached to this inode,
3426
- * make sure they're not corrupt.
3427
- */
3428
-bool
3429
-xfs_inode_verify_forks(
3430
- struct xfs_inode *ip)
3431
-{
3432
- struct xfs_ifork *ifp;
3433
- xfs_failaddr_t fa;
3434
-
3435
- fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops);
3436
- if (fa) {
3437
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
3438
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
3439
- ifp->if_u1.if_data, ifp->if_bytes, fa);
3440
- return false;
3441
- }
3442
-
3443
- fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops);
3444
- if (fa) {
3445
- ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
3446
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
3447
- ifp ? ifp->if_u1.if_data : NULL,
3448
- ifp ? ifp->if_bytes : 0, fa);
3449
- return false;
3450
- }
3451
- return true;
3452
-}
3453
-
3454
-STATIC int
3455
-xfs_iflush_int(
34563446 struct xfs_inode *ip,
34573447 struct xfs_buf *bp)
34583448 {
34593449 struct xfs_inode_log_item *iip = ip->i_itemp;
34603450 struct xfs_dinode *dip;
34613451 struct xfs_mount *mp = ip->i_mount;
3452
+ int error;
34623453
34633454 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3464
- ASSERT(xfs_isiflocked(ip));
3465
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3466
- ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3467
- ASSERT(iip != NULL && iip->ili_fields != 0);
3468
- ASSERT(ip->i_d.di_version > 1);
3455
+ ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
3456
+ ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
3457
+ ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3458
+ ASSERT(iip->ili_item.li_buf == bp);
34693459
3470
- /* set *dip = inode's place in the buffer */
34713460 dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
34723461
3462
+ /*
3463
+ * We don't flush the inode if any of the following checks fail, but we
3464
+ * do still update the log item and attach to the backing buffer as if
3465
+ * the flush happened. This is a formality to facilitate predictable
3466
+ * error handling as the caller will shutdown and fail the buffer.
3467
+ */
3468
+ error = -EFSCORRUPTED;
34733469 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
34743470 mp, XFS_ERRTAG_IFLUSH_1)) {
34753471 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
34763472 "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
34773473 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3478
- goto corrupt_out;
3474
+ goto flush_out;
34793475 }
34803476 if (S_ISREG(VFS_I(ip)->i_mode)) {
34813477 if (XFS_TEST_ERROR(
3482
- (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3483
- (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3478
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3479
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
34843480 mp, XFS_ERRTAG_IFLUSH_3)) {
34853481 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
34863482 "%s: Bad regular inode %Lu, ptr "PTR_FMT,
34873483 __func__, ip->i_ino, ip);
3488
- goto corrupt_out;
3484
+ goto flush_out;
34893485 }
34903486 } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
34913487 if (XFS_TEST_ERROR(
3492
- (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3493
- (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3494
- (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3488
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3489
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
3490
+ ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
34953491 mp, XFS_ERRTAG_IFLUSH_4)) {
34963492 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
34973493 "%s: Bad directory inode %Lu, ptr "PTR_FMT,
34983494 __func__, ip->i_ino, ip);
3499
- goto corrupt_out;
3495
+ goto flush_out;
35003496 }
35013497 }
3502
- if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3498
+ if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
35033499 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
35043500 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
35053501 "%s: detected corrupt incore inode %Lu, "
35063502 "total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
35073503 __func__, ip->i_ino,
3508
- ip->i_d.di_nextents + ip->i_d.di_anextents,
3504
+ ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
35093505 ip->i_d.di_nblocks, ip);
3510
- goto corrupt_out;
3506
+ goto flush_out;
35113507 }
35123508 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
35133509 mp, XFS_ERRTAG_IFLUSH_6)) {
35143510 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
35153511 "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
35163512 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
3517
- goto corrupt_out;
3513
+ goto flush_out;
35183514 }
35193515
35203516 /*
....@@ -3526,12 +3522,19 @@
35263522 * backwards compatibility with old kernels that predate logging all
35273523 * inode changes.
35283524 */
3529
- if (ip->i_d.di_version < 3)
3525
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb))
35303526 ip->i_d.di_flushiter++;
35313527
3532
- /* Check the inline fork data before we write out. */
3533
- if (!xfs_inode_verify_forks(ip))
3534
- goto corrupt_out;
3528
+ /*
3529
+ * If there are inline format data / attr forks attached to this inode,
3530
+ * make sure they are not corrupt.
3531
+ */
3532
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
3533
+ xfs_ifork_verify_local_data(ip))
3534
+ goto flush_out;
3535
+ if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL &&
3536
+ xfs_ifork_verify_local_attr(ip))
3537
+ goto flush_out;
35353538
35363539 /*
35373540 * Copy the dirty parts of the inode into the on-disk inode. We always
....@@ -3547,7 +3550,6 @@
35473550 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
35483551 if (XFS_IFORK_Q(ip))
35493552 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3550
- xfs_inobp_check(mp, bp);
35513553
35523554 /*
35533555 * We've recorded everything logged in the inode, so we'd like to clear
....@@ -3560,45 +3562,144 @@
35603562 *
35613563 * What we do is move the bits to the ili_last_fields field. When
35623564 * logging the inode, these bits are moved back to the ili_fields field.
3563
- * In the xfs_iflush_done() routine we clear ili_last_fields, since we
3564
- * know that the information those bits represent is permanently on
3565
+ * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
3566
+ * we know that the information those bits represent is permanently on
35653567 * disk. As long as the flush completes before the inode is logged
35663568 * again, then both ili_fields and ili_last_fields will be cleared.
3567
- *
3568
- * We can play with the ili_fields bits here, because the inode lock
3569
- * must be held exclusively in order to set bits there and the flush
3570
- * lock protects the ili_last_fields bits. Set ili_logged so the flush
3571
- * done routine can tell whether or not to look in the AIL. Also, store
3572
- * the current LSN of the inode so that we can tell whether the item has
3573
- * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
3574
- * need the AIL lock, because it is a 64 bit value that cannot be read
3575
- * atomically.
35763569 */
3570
+ error = 0;
3571
+flush_out:
3572
+ spin_lock(&iip->ili_lock);
35773573 iip->ili_last_fields = iip->ili_fields;
35783574 iip->ili_fields = 0;
35793575 iip->ili_fsync_fields = 0;
3580
- iip->ili_logged = 1;
3576
+ spin_unlock(&iip->ili_lock);
35813577
3578
+ /*
3579
+ * Store the current LSN of the inode so that we can tell whether the
3580
+ * item has moved in the AIL from xfs_buf_inode_iodone().
3581
+ */
35823582 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
35833583 &iip->ili_item.li_lsn);
35843584
3585
- /*
3586
- * Attach the function xfs_iflush_done to the inode's
3587
- * buffer. This will remove the inode from the AIL
3588
- * and unlock the inode's flush lock when the inode is
3589
- * completely written to disk.
3590
- */
3591
- xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
3592
-
35933585 /* generate the checksum. */
35943586 xfs_dinode_calc_crc(mp, dip);
3587
+ return error;
3588
+}
35953589
3596
- ASSERT(!list_empty(&bp->b_li_list));
3597
- ASSERT(bp->b_iodone != NULL);
3590
+/*
3591
+ * Non-blocking flush of dirty inode metadata into the backing buffer.
3592
+ *
3593
+ * The caller must have a reference to the inode and hold the cluster buffer
3594
+ * locked. The function will walk across all the inodes on the cluster buffer it
3595
+ * can find and lock without blocking, and flush them to the cluster buffer.
3596
+ *
3597
+ * On successful flushing of at least one inode, the caller must write out the
3598
+ * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
3599
+ * the caller needs to release the buffer. On failure, the filesystem will be
3600
+ * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
3601
+ * will be returned.
3602
+ */
3603
+int
3604
+xfs_iflush_cluster(
3605
+ struct xfs_buf *bp)
3606
+{
3607
+ struct xfs_mount *mp = bp->b_mount;
3608
+ struct xfs_log_item *lip, *n;
3609
+ struct xfs_inode *ip;
3610
+ struct xfs_inode_log_item *iip;
3611
+ int clcount = 0;
3612
+ int error = 0;
3613
+
3614
+ /*
3615
+ * We must use the safe variant here as on shutdown xfs_iflush_abort()
3616
+ * can remove itself from the list.
3617
+ */
3618
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
3619
+ iip = (struct xfs_inode_log_item *)lip;
3620
+ ip = iip->ili_inode;
3621
+
3622
+ /*
3623
+ * Quick and dirty check to avoid locks if possible.
3624
+ */
3625
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
3626
+ continue;
3627
+ if (xfs_ipincount(ip))
3628
+ continue;
3629
+
3630
+ /*
3631
+ * The inode is still attached to the buffer, which means it is
3632
+ * dirty but reclaim might try to grab it. Check carefully for
3633
+ * that, and grab the ilock while still holding the i_flags_lock
3634
+ * to guarantee reclaim will not be able to reclaim this inode
3635
+ * once we drop the i_flags_lock.
3636
+ */
3637
+ spin_lock(&ip->i_flags_lock);
3638
+ ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
3639
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
3640
+ spin_unlock(&ip->i_flags_lock);
3641
+ continue;
3642
+ }
3643
+
3644
+ /*
3645
+ * ILOCK will pin the inode against reclaim and prevent
3646
+ * concurrent transactions modifying the inode while we are
3647
+ * flushing the inode. If we get the lock, set the flushing
3648
+ * state before we drop the i_flags_lock.
3649
+ */
3650
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3651
+ spin_unlock(&ip->i_flags_lock);
3652
+ continue;
3653
+ }
3654
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
3655
+ spin_unlock(&ip->i_flags_lock);
3656
+
3657
+ /*
3658
+ * Abort flushing this inode if we are shut down because the
3659
+ * inode may not currently be in the AIL. This can occur when
3660
+ * log I/O failure unpins the inode without inserting into the
3661
+ * AIL, leaving a dirty/unpinned inode attached to the buffer
3662
+ * that otherwise looks like it should be flushed.
3663
+ */
3664
+ if (XFS_FORCED_SHUTDOWN(mp)) {
3665
+ xfs_iunpin_wait(ip);
3666
+ xfs_iflush_abort(ip);
3667
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
3668
+ error = -EIO;
3669
+ continue;
3670
+ }
3671
+
3672
+ /* don't block waiting on a log force to unpin dirty inodes */
3673
+ if (xfs_ipincount(ip)) {
3674
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
3675
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
3676
+ continue;
3677
+ }
3678
+
3679
+ if (!xfs_inode_clean(ip))
3680
+ error = xfs_iflush(ip, bp);
3681
+ else
3682
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
3683
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
3684
+ if (error)
3685
+ break;
3686
+ clcount++;
3687
+ }
3688
+
3689
+ if (error) {
3690
+ bp->b_flags |= XBF_ASYNC;
3691
+ xfs_buf_ioend_fail(bp);
3692
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3693
+ return error;
3694
+ }
3695
+
3696
+ if (!clcount)
3697
+ return -EAGAIN;
3698
+
3699
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
3700
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
35983701 return 0;
35993702
3600
-corrupt_out:
3601
- return -EFSCORRUPTED;
36023703 }
36033704
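Editor's note: a minimal sketch of a caller honouring the xfs_iflush_cluster() contract described above (cluster buffer locked and held on entry; write it out on success, release it on -EAGAIN, nothing left to clean up on -EFSCORRUPTED). The function name and the delwri list parameter are hypothetical, not taken from this patch:

static int
example_push_inode_cluster(
	struct xfs_buf		*bp,		/* locked, held cluster buffer */
	struct list_head	*buffer_list)	/* caller's delayed write list */
{
	int			error;

	error = xfs_iflush_cluster(bp);
	if (!error) {
		/* At least one inode was flushed: queue the buffer for writeback. */
		xfs_buf_delwri_queue(bp, buffer_list);
		xfs_buf_relse(bp);
		return 0;
	}
	if (error == -EAGAIN) {
		/* Nothing could be flushed without blocking: just drop the buffer. */
		xfs_buf_relse(bp);
		return 0;
	}
	/* -EFSCORRUPTED: the fs is shut down and the buffer was already released. */
	return error;
}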
36043705 /* Release an inode. */
....@@ -3609,3 +3710,115 @@
36093710 trace_xfs_irele(ip, _RET_IP_);
36103711 iput(VFS_I(ip));
36113712 }
3713
+
3714
+/*
3715
+ * Ensure all committed transactions touching the inode are written to the log.
3716
+ */
3717
+int
3718
+xfs_log_force_inode(
3719
+ struct xfs_inode *ip)
3720
+{
3721
+ xfs_csn_t seq = 0;
3722
+
3723
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
3724
+ if (xfs_ipincount(ip))
3725
+ seq = ip->i_itemp->ili_commit_seq;
3726
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
3727
+
3728
+ if (!seq)
3729
+ return 0;
3730
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
3731
+}
3732
+
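Editor's note: xfs_log_force_inode() is the waiting counterpart to the asynchronous push used in the unpin path earlier in this patch (a flags value of 0 passed to xfs_log_force_seq()), since it passes XFS_LOG_SYNC. A trivial, hypothetical caller that needs the inode's logged changes on stable storage before continuing would simply do:

static int
example_stabilise_inode_metadata(
	struct xfs_inode	*ip)
{
	/* Forces the log up to the inode's last commit sequence and waits. */
	return xfs_log_force_inode(ip);
}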
3733
+/*
3734
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
3735
+ * abide by the vfs locking order (lowest pointer value goes first) and break the
3736
+ * layout leases before proceeding. The loop is needed because we cannot call
3737
+ * the blocking break_layout() with the iolocks held, and therefore have to
3738
+ * back out both locks.
3739
+ */
3740
+static int
3741
+xfs_iolock_two_inodes_and_break_layout(
3742
+ struct inode *src,
3743
+ struct inode *dest)
3744
+{
3745
+ int error;
3746
+
3747
+ if (src > dest)
3748
+ swap(src, dest);
3749
+
3750
+retry:
3751
+ /* Wait to break both inodes' layouts before we start locking. */
3752
+ error = break_layout(src, true);
3753
+ if (error)
3754
+ return error;
3755
+ if (src != dest) {
3756
+ error = break_layout(dest, true);
3757
+ if (error)
3758
+ return error;
3759
+ }
3760
+
3761
+ /* Lock one inode and make sure nobody got in and leased it. */
3762
+ inode_lock(src);
3763
+ error = break_layout(src, false);
3764
+ if (error) {
3765
+ inode_unlock(src);
3766
+ if (error == -EWOULDBLOCK)
3767
+ goto retry;
3768
+ return error;
3769
+ }
3770
+
3771
+ if (src == dest)
3772
+ return 0;
3773
+
3774
+ /* Lock the other inode and make sure nobody got in and leased it. */
3775
+ inode_lock_nested(dest, I_MUTEX_NONDIR2);
3776
+ error = break_layout(dest, false);
3777
+ if (error) {
3778
+ inode_unlock(src);
3779
+ inode_unlock(dest);
3780
+ if (error == -EWOULDBLOCK)
3781
+ goto retry;
3782
+ return error;
3783
+ }
3784
+
3785
+ return 0;
3786
+}
3787
+
3788
+/*
3789
+ * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
3790
+ * mmap activity.
3791
+ */
3792
+int
3793
+xfs_ilock2_io_mmap(
3794
+ struct xfs_inode *ip1,
3795
+ struct xfs_inode *ip2)
3796
+{
3797
+ int ret;
3798
+
3799
+ ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
3800
+ if (ret)
3801
+ return ret;
3802
+ if (ip1 == ip2)
3803
+ xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
3804
+ else
3805
+ xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
3806
+ ip2, XFS_MMAPLOCK_EXCL);
3807
+ return 0;
3808
+}
3809
+
3810
+/* Unlock both inodes to allow IO and mmap activity. */
3811
+void
3812
+xfs_iunlock2_io_mmap(
3813
+ struct xfs_inode *ip1,
3814
+ struct xfs_inode *ip2)
3815
+{
3816
+ bool same_inode = (ip1 == ip2);
3817
+
3818
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3819
+ if (!same_inode)
3820
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3821
+ inode_unlock(VFS_I(ip2));
3822
+ if (!same_inode)
3823
+ inode_unlock(VFS_I(ip1));
3824
+}
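Editor's note: a short sketch of how the two helpers above are meant to bracket a two-file operation; example_do_two_file_work() is a placeholder, not an XFS function:

static int
example_two_file_operation(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	int			error;

	/* Blocks new syscall I/O and mmap faults on both inodes. */
	error = xfs_ilock2_io_mmap(ip1, ip2);
	if (error)
		return error;

	error = example_do_two_file_work(ip1, ip2);	/* placeholder */

	xfs_iunlock2_io_mmap(ip1, ip2);
	return error;
}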