From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] xfs: update xfs_inode.c

---
 kernel/fs/xfs/xfs_inode.c | 2063 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 1138 insertions(+), 925 deletions(-)

diff --git a/kernel/fs/xfs/xfs_inode.c b/kernel/fs/xfs/xfs_inode.c
index cd81d6d..1900883 100644
--- a/kernel/fs/xfs/xfs_inode.c
+++ b/kernel/fs/xfs/xfs_inode.c
@@ -3,7 +3,6 @@
  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  */
-#include <linux/log2.h>
 #include <linux/iversion.h>
 
 #include "xfs.h"
@@ -16,10 +15,7 @@
 #include "xfs_mount.h"
 #include "xfs_defer.h"
 #include "xfs_inode.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
 #include "xfs_dir2.h"
-#include "xfs_attr_sf.h"
 #include "xfs_attr.h"
 #include "xfs_trans_space.h"
 #include "xfs_trans.h"
@@ -32,7 +28,6 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_filestream.h"
-#include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
@@ -40,7 +35,6 @@
 #include "xfs_log.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_reflink.h"
-#include "xfs_dir2_priv.h"
 
 kmem_zone_t *xfs_inode_zone;
 
@@ -50,7 +44,6 @@
  */
 #define	XFS_ITRUNC_MAX_EXTENTS	2
 
-STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
 STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
 STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
 
@@ -61,6 +54,12 @@ xfs_get_extsz_hint(
 	struct xfs_inode	*ip)
 {
+	/*
+	 * No point in aligning allocations if we need to COW to actually
+	 * write to them.
+	 */
+	if (xfs_is_always_cow_inode(ip))
+		return 0;
 	if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
 		return ip->i_d.di_extsize;
 	if (XFS_IS_REALTIME_INODE(ip))
@@ -112,7 +111,7 @@
 {
 	uint			lock_mode = XFS_ILOCK_SHARED;
 
-	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+	if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE &&
 	    (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
 		lock_mode = XFS_ILOCK_EXCL;
 	xfs_ilock(ip, lock_mode);
@@ -125,7 +124,8 @@
 {
 	uint			lock_mode = XFS_ILOCK_SHARED;
 
-	if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
+	if (ip->i_afp &&
+	    ip->i_afp->if_format == XFS_DINODE_FMT_BTREE &&
 	    (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
 		lock_mode = XFS_ILOCK_EXCL;
 	xfs_ilock(ip, lock_mode);
@@ -144,17 +144,17 @@
  *
  * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
  *
- * mmap_sem locking order:
+ * mmap_lock locking order:
  *
- * i_rwsem -> page lock -> mmap_sem
- * mmap_sem -> i_mmap_lock -> page_lock
+ * i_rwsem -> page lock -> mmap_lock
+ * mmap_lock -> i_mmap_lock -> page_lock
  *
- * The difference in mmap_sem locking order mean that we cannot hold the
+ * The difference in mmap_lock locking order means that we cannot hold the
  * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
- * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
  * in get_user_pages() to map the user pages into the kernel address space for
  * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
- * page faults already hold the mmap_sem.
+ * page faults already hold the mmap_lock.
  *
  * Hence to serialise fully against both syscall and mmap based IO, we need to
 * take both the i_rwsem and the i_mmap_lock.
These locks should *only* be both @@ -441,17 +441,17 @@ */ static void xfs_lock_inodes( - xfs_inode_t **ips, - int inodes, - uint lock_mode) + struct xfs_inode **ips, + int inodes, + uint lock_mode) { - int attempts = 0, i, j, try_lock; - xfs_log_item_t *lp; + int attempts = 0, i, j, try_lock; + struct xfs_log_item *lp; /* * Currently supports between 2 and 5 inodes with exclusive locking. We * support an arbitrary depth of locking here, but absolute limits on - * inodes depend on the the type of locking and the limits placed by + * inodes depend on the type of locking and the limits placed by * lockdep annotations in xfs_lock_inumorder. These are all checked by * the asserts. */ @@ -485,7 +485,7 @@ */ if (!try_lock) { for (j = (i - 1); j >= 0 && !try_lock; j--) { - lp = (xfs_log_item_t *)ips[j]->i_itemp; + lp = &ips[j]->i_itemp->ili_item; if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) try_lock++; } @@ -551,7 +551,7 @@ struct xfs_inode *temp; uint mode_temp; int attempts = 0; - xfs_log_item_t *lp; + struct xfs_log_item *lp; ASSERT(hweight32(ip0_mode) == 1); ASSERT(hweight32(ip1_mode) == 1); @@ -585,7 +585,7 @@ * the second lock. If we can't get it, we must release the first one * and try again. */ - lp = (xfs_log_item_t *)ip0->i_itemp; + lp = &ip0->i_itemp->ili_item; if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) { if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) { xfs_iunlock(ip0, ip0_mode); @@ -596,22 +596,6 @@ } else { xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1)); } -} - -void -__xfs_iflock( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); - - do { - prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - if (xfs_isiflocked(ip)) - io_schedule(); - } while (!xfs_iflock_nowait(ip)); - - finish_wait(wq, &wait.wq_entry); } STATIC uint @@ -714,6 +698,68 @@ return error; } +/* Propagate di_flags from a parent inode to a child inode. */ +static void +xfs_inode_inherit_flags( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + unsigned int di_flags = 0; + umode_t mode = VFS_I(ip)->i_mode; + + if (S_ISDIR(mode)) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(mode)) { + if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) && + xfs_sb_version_hasrealtime(&ip->i_mount->m_sb)) + di_flags |= XFS_DIFLAG_REALTIME; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } + if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + di_flags |= XFS_DIFLAG_NOATIME; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && + xfs_inherit_nodefrag) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + + ip->i_d.di_flags |= di_flags; +} + +/* Propagate di_flags2 from a parent inode to a child inode. 
*/ +static void +xfs_inode_inherit_flags2( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + } + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; +} + /* * Allocate an inode on disk and return a copy of its in-core version. * The in-core inode is locked exclusively. Set mode, nlink, and rdev @@ -756,6 +802,7 @@ xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { + struct inode *dir = pip ? VFS_I(pip) : NULL; struct xfs_mount *mp = tp->t_mountp; xfs_ino_t ino; xfs_inode_t *ip; @@ -801,26 +848,17 @@ return error; ASSERT(ip != NULL); inode = VFS_I(ip); - - /* - * We always convert v1 inodes to v2 now - we only support filesystems - * with >= v2 inode capability, so there is no reason for ever leaving - * an inode in v1 format. - */ - if (ip->i_d.di_version == 1) - ip->i_d.di_version = 2; - - inode->i_mode = mode; set_nlink(inode, nlink); - ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); - ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); inode->i_rdev = rdev; - xfs_set_projid(ip, prid); + ip->i_d.di_projid = prid; - if (pip && XFS_INHERIT_GID(pip)) { - ip->i_d.di_gid = pip->i_d.di_gid; - if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) - inode->i_mode |= S_ISGID; + if (dir && !(dir->i_mode & S_ISGID) && + (mp->m_flags & XFS_MOUNT_GRPID)) { + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + inode->i_mode = mode; + } else { + inode_init_owner(inode, dir, mode); } /* @@ -828,13 +866,12 @@ * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). */ - if ((irix_sgid_inherit) && - (inode->i_mode & S_ISGID) && - (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) + if (irix_sgid_inherit && + (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid)) inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; - ip->i_d.di_nextents = 0; + ip->i_df.if_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); tv = current_time(inode); @@ -847,14 +884,12 @@ ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); - ip->i_d.di_flags2 = 0; + ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2; ip->i_d.di_cowextsize = 0; - ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec; - ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec; + ip->i_d.di_crtime = tv; } - flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { @@ -862,70 +897,19 @@ case S_IFCHR: case S_IFBLK: case S_IFSOCK: - ip->i_d.di_format = XFS_DINODE_FMT_DEV; + ip->i_df.if_format = XFS_DINODE_FMT_DEV; ip->i_df.if_flags = 0; flags |= XFS_ILOG_DEV; break; case S_IFREG: case S_IFDIR: - if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint di_flags = 0; - - if (S_ISDIR(mode)) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_RTINHERIT; - if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { - di_flags |= XFS_DIFLAG_EXTSZINHERIT; - ip->i_d.di_extsize = pip->i_d.di_extsize; - } - if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; - } else if (S_ISREG(mode)) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_REALTIME; - if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { - di_flags |= XFS_DIFLAG_EXTSIZE; - ip->i_d.di_extsize = pip->i_d.di_extsize; - } - } - if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && - xfs_inherit_noatime) - di_flags |= 
XFS_DIFLAG_NOATIME; - if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && - xfs_inherit_nodump) - di_flags |= XFS_DIFLAG_NODUMP; - if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && - xfs_inherit_sync) - di_flags |= XFS_DIFLAG_SYNC; - if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && - xfs_inherit_nosymlinks) - di_flags |= XFS_DIFLAG_NOSYMLINKS; - if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && - xfs_inherit_nodefrag) - di_flags |= XFS_DIFLAG_NODEFRAG; - if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) - di_flags |= XFS_DIFLAG_FILESTREAM; - - ip->i_d.di_flags |= di_flags; - } - if (pip && - (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && - pip->i_d.di_version == 3 && - ip->i_d.di_version == 3) { - uint64_t di_flags2 = 0; - - if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; - } - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; - - ip->i_d.di_flags2 |= di_flags2; - } + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) + xfs_inode_inherit_flags(ip, pip); + if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) + xfs_inode_inherit_flags2(ip, pip); /* FALLTHROUGH */ case S_IFLNK: - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_flags = XFS_IFEXTENTS; ip->i_df.if_bytes = 0; ip->i_df.if_u1.if_root = NULL; @@ -933,11 +917,6 @@ default: ASSERT(0); } - /* - * Attribute fork settings for new inode. - */ - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_anextents = 0; /* * Log the new values stuffed into the inode. @@ -1116,17 +1095,15 @@ /* * Increment the link count on an inode & log the change. */ -static int +static void xfs_bumplink( xfs_trans_t *tp, xfs_inode_t *ip) { xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - ASSERT(ip->i_d.di_version > 1); inc_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return 0; } int @@ -1160,8 +1137,7 @@ /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1221,8 +1197,7 @@ unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, - resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + resblks - XFS_IALLOC_SPACE_RES(mp)); if (error) { ASSERT(error != -ENOSPC); goto out_trans_cancel; @@ -1235,9 +1210,7 @@ if (error) goto out_trans_cancel; - error = xfs_bumplink(tp, dp); - if (error) - goto out_trans_cancel; + xfs_bumplink(tp, dp); } /* @@ -1313,8 +1286,7 @@ /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1427,7 +1399,7 @@ * the tree quota mechanism could be circumvented. 
*/ if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + tdp->i_d.di_projid != sip->i_d.di_projid)) { error = -EXDEV; goto error_return; } @@ -1454,9 +1426,7 @@ xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); - error = xfs_bumplink(tp, sip); - if (error) - goto error_return; + xfs_bumplink(tp, sip); /* * If this is a synchronous mount, make sure that the @@ -1524,10 +1494,8 @@ struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_fileoff_t last_block; xfs_filblks_t unmap_len; int error = 0; - int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!atomic_read(&VFS_I(ip)->i_count) || @@ -1547,33 +1515,27 @@ * the end of the file (in a crash where the space is allocated * but the inode size is not yet updated), simply remove any * blocks which show up between the new EOF and the maximum - * possible file size. If the first block to be removed is - * beyond the maximum file size (ie it is the same as last_block), - * then there is nothing to do. + * possible file size. + * + * We have to free all the blocks to the bmbt maximum offset, even if + * the page cache can't scale that far. */ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - if (first_unmap_block == last_block) + if (first_unmap_block >= XFS_MAX_FILEOFF) { + WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); return 0; + } - ASSERT(first_unmap_block < last_block); - unmap_len = last_block - first_unmap_block + 1; - while (!done) { + unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; + while (unmap_len > 0) { ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, - XFS_ITRUNC_MAX_EXTENTS, &done); + error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, + flags, XFS_ITRUNC_MAX_EXTENTS); if (error) goto out; - /* - * Duplicate the transaction that has the permanent - * reservation and commit the old transaction. - */ + /* free the just unmapped extents */ error = xfs_defer_finish(&tp); - if (error) - goto out; - - error = xfs_trans_roll_inode(&tp, ip); if (error) goto out; } @@ -1581,7 +1543,7 @@ if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, - first_unmap_block, last_block, true); + first_unmap_block, XFS_MAX_FILEOFF, true); if (error) goto out; @@ -1662,7 +1624,7 @@ return 0; /* * If we can't get the iolock just skip truncating the blocks - * past EOF because we could deadlock with the mmap_sem + * past EOF because we could deadlock with the mmap_lock * otherwise. We'll get another chance to drop them once the * last reference to the inode is dropped, so we'll never leak * blocks permanently. 
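/*
 * [Editorial aside -- illustration only, not part of the patch]
 *
 * The xfs_itruncate_extents_flags() hunk above drops the old "done" flag:
 * __xfs_bunmapi() now takes the remaining length by reference, unmaps at
 * most XFS_ITRUNC_MAX_EXTENTS extents per transaction (working back from
 * the end of the range), and shrinks that length, so the loop condition is
 * simply "unmap_len > 0". Below is a minimal stand-alone sketch of that
 * pattern; fake_bunmapi() and MAX_EXTENTS_PER_TX are hypothetical names,
 * and each "extent" is pretended to be 16 blocks long.
 * Build with: cc -std=c99 -o trunc_loop trunc_loop.c
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_EXTENTS_PER_TX	2	/* mirrors XFS_ITRUNC_MAX_EXTENTS */

/* Unmap up to MAX_EXTENTS_PER_TX extents off the tail of [start, start + *len). */
static int fake_bunmapi(uint64_t start, uint64_t *len)
{
	for (int i = 0; i < MAX_EXTENTS_PER_TX && *len > 0; i++) {
		uint64_t step = *len < 16 ? *len : 16;

		*len -= step;
		printf("tx: unmap blocks [%llu, %llu)\n",
		       (unsigned long long)(start + *len),
		       (unsigned long long)(start + *len + step));
	}
	return 0;	/* the kernel helper returns an error code on failure */
}

int main(void)
{
	uint64_t first_unmap_block = 100;	/* first block past the new EOF */
	uint64_t unmap_len = 70;		/* blocks out to the maximum file offset */

	/* Same loop shape as the patched xfs_itruncate_extents_flags(). */
	while (unmap_len > 0) {
		if (fake_bunmapi(first_unmap_block, &unmap_len))
			return 1;
		/* the kernel would xfs_defer_finish() the transaction here */
	}
	return 0;
}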
@@ -1714,7 +1676,7 @@ if (error) goto error_trans_cancel; - ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_df.if_nextents == 0); error = xfs_trans_commit(tp); if (error) @@ -1883,7 +1845,7 @@ if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || - ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; error = xfs_qm_dqattach(ip); @@ -1909,7 +1871,6 @@ } ASSERT(!ip->i_afp); - ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_d.di_forkoff == 0); /* @@ -1926,6 +1887,336 @@ } /* + * In-Core Unlinked List Lookups + * ============================= + * + * Every inode is supposed to be reachable from some other piece of metadata + * with the exception of the root directory. Inodes with a connection to a + * file descriptor but not linked from anywhere in the on-disk directory tree + * are collectively known as unlinked inodes, though the filesystem itself + * maintains links to these inodes so that on-disk metadata are consistent. + * + * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI + * header contains a number of buckets that point to an inode, and each inode + * record has a pointer to the next inode in the hash chain. This + * singly-linked list causes scaling problems in the iunlink remove function + * because we must walk that list to find the inode that points to the inode + * being removed from the unlinked hash bucket list. + * + * What if we modelled the unlinked list as a collection of records capturing + * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd + * have a fast way to look up unlinked list predecessors, which avoids the + * slow list walk. That's exactly what we do here (in-core) with a per-AG + * rhashtable. + * + * Because this is a backref cache, we ignore operational failures since the + * iunlink code can fall back to the slow bucket walk. The only errors that + * should bubble out are for obviously incorrect situations. + * + * All users of the backref cache MUST hold the AGI buffer lock to serialize + * access or have otherwise provided for concurrency control. + */ + +/* Capture a "X.next_unlinked = Y" relationship. */ +struct xfs_iunlink { + struct rhash_head iu_rhash_head; + xfs_agino_t iu_agino; /* X */ + xfs_agino_t iu_next_unlinked; /* Y */ +}; + +/* Unlinked list predecessor lookup hashtable construction */ +static int +xfs_iunlink_obj_cmpfn( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const xfs_agino_t *key = arg->key; + const struct xfs_iunlink *iu = obj; + + if (iu->iu_next_unlinked != *key) + return 1; + return 0; +} + +static const struct rhashtable_params xfs_iunlink_hash_params = { + .min_size = XFS_AGI_UNLINKED_BUCKETS, + .key_len = sizeof(xfs_agino_t), + .key_offset = offsetof(struct xfs_iunlink, + iu_next_unlinked), + .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = xfs_iunlink_obj_cmpfn, +}; + +/* + * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such + * relation is found. + */ +static xfs_agino_t +xfs_iunlink_lookup_backref( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + struct xfs_iunlink *iu; + + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + return iu ? iu->iu_agino : NULLAGINO; +} + +/* + * Take ownership of an iunlink cache entry and insert it into the hash table. + * If successful, the entry will be owned by the cache; if not, it is freed. 
+ * Either way, the caller does not own @iu after this call. + */ +static int +xfs_iunlink_insert_backref( + struct xfs_perag *pag, + struct xfs_iunlink *iu) +{ + int error; + + error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + /* + * Fail loudly if there already was an entry because that's a sign of + * corruption of in-memory data. Also fail loudly if we see an error + * code we didn't anticipate from the rhashtable code. Currently we + * only anticipate ENOMEM. + */ + if (error) { + WARN(error != -ENOMEM, "iunlink cache insert error %d", error); + kmem_free(iu); + } + /* + * Absorb any runtime errors that aren't a result of corruption because + * this is a cache and we can always fall back to bucket list scanning. + */ + if (error != 0 && error != -EEXIST) + error = 0; + return error; +} + +/* Remember that @prev_agino.next_unlinked = @this_agino. */ +static int +xfs_iunlink_add_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t this_agino) +{ + struct xfs_iunlink *iu; + + if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) + return 0; + + iu = kmem_zalloc(sizeof(*iu), KM_NOFS); + iu->iu_agino = prev_agino; + iu->iu_next_unlinked = this_agino; + + return xfs_iunlink_insert_backref(pag, iu); +} + +/* + * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. + * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there + * wasn't any such entry then we don't bother. + */ +static int +xfs_iunlink_change_backref( + struct xfs_perag *pag, + xfs_agino_t agino, + xfs_agino_t next_unlinked) +{ + struct xfs_iunlink *iu; + int error; + + /* Look up the old entry; if there wasn't one then exit. */ + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + if (!iu) + return 0; + + /* + * Remove the entry. This shouldn't ever return an error, but if we + * couldn't remove the old entry we don't want to add it again to the + * hash table, and if the entry disappeared on us then someone's + * violated the locking rules and we need to fail loudly. Either way + * we cannot remove the inode because internal state is or would have + * been corrupt. + */ + error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + if (error) + return error; + + /* If there is no new next entry just free our item and return. */ + if (next_unlinked == NULLAGINO) { + kmem_free(iu); + return 0; + } + + /* Update the entry and re-add it to the hash table. */ + iu->iu_next_unlinked = next_unlinked; + return xfs_iunlink_insert_backref(pag, iu); +} + +/* Set up the in-core predecessor structures. */ +int +xfs_iunlink_init( + struct xfs_perag *pag) +{ + return rhashtable_init(&pag->pagi_unlinked_hash, + &xfs_iunlink_hash_params); +} + +/* Free the in-core predecessor structures. */ +static void +xfs_iunlink_free_item( + void *ptr, + void *arg) +{ + struct xfs_iunlink *iu = ptr; + bool *freed_anything = arg; + + *freed_anything = true; + kmem_free(iu); +} + +void +xfs_iunlink_destroy( + struct xfs_perag *pag) +{ + bool freed_anything = false; + + rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, + xfs_iunlink_free_item, &freed_anything); + + ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); +} + +/* + * Point the AGI unlinked bucket at an inode and log the results. The caller + * is responsible for validating the old value. 
+ */ +STATIC int +xfs_iunlink_update_bucket( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf *agibp, + unsigned int bucket_index, + xfs_agino_t new_agino) +{ + struct xfs_agi *agi = agibp->b_addr; + xfs_agino_t old_value; + int offset; + + ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); + + old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); + trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, + old_value, new_agino); + + /* + * We should never find the head of the list already set to the value + * passed in because either we're adding or removing ourselves from the + * head of the list. + */ + if (old_value == new_agino) { + xfs_buf_mark_corrupt(agibp); + return -EFSCORRUPTED; + } + + agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); + offset = offsetof(struct xfs_agi, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +} + +/* Set an on-disk inode's next_unlinked pointer. */ +STATIC void +xfs_iunlink_update_dinode( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_buf *ibp, + struct xfs_dinode *dip, + struct xfs_imap *imap, + xfs_agino_t next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + int offset; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + trace_xfs_iunlink_update_dinode(mp, agno, agino, + be32_to_cpu(dip->di_next_unlinked), next_agino); + + dip->di_next_unlinked = cpu_to_be32(next_agino); + offset = imap->im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); +} + +/* Set an in-core inode's unlinked pointer and return the old value. */ +STATIC int +xfs_iunlink_update_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_agnumber_t agno, + xfs_agino_t next_agino, + xfs_agino_t *old_next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_dinode *dip; + struct xfs_buf *ibp; + xfs_agino_t old_value; + int error; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); + if (error) + return error; + + /* Make sure the old pointer isn't garbage. */ + old_value = be32_to_cpu(dip->di_next_unlinked); + if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); + error = -EFSCORRUPTED; + goto out; + } + + /* + * Since we're updating a linked list, we should never find that the + * current pointer is the same as the new value, unless we're + * terminating the list. + */ + *old_next_agino = old_value; + if (old_value == next_agino) { + if (next_agino != NULLAGINO) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + dip, sizeof(*dip), __this_address); + error = -EFSCORRUPTED; + } + goto out; + } + + /* Ok, update the new pointer. */ + xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), + ibp, dip, &ip->i_imap, next_agino); + return 0; +out: + xfs_trans_brelse(tp, ibp); + return error; +} + +/* * This is called when the inode's link count has gone to 0 or we are creating * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. 
* @@ -1934,76 +2225,177 @@ */ STATIC int xfs_iunlink( - struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_mount_t *mp = tp->t_mountp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agino_t agino; - short bucket_index; - int offset; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + xfs_agino_t next_agino; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ - error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - ASSERT(agino != 0); - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - ASSERT(agi->agi_unlinked[bucket_index]); - ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(mp, agno, next_agino)) { + xfs_buf_mark_corrupt(agibp); + return -EFSCORRUPTED; + } - if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { + if (next_agino != NULLAGINO) { + xfs_agino_t old_agino; + /* - * There is already another inode in the bucket we need - * to add ourselves to. Add us at the front of the list. - * Here we put the head pointer into our next pointer, - * and then we fall through to point the head at us. + * There is already another inode in the bucket, so point this + * inode to the current head of the list. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); + error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, + &old_agino); + if (error) + return error; + ASSERT(old_agino == NULLAGINO); + + /* + * agino has been unlinked, add a backref from the next inode + * back to agino. + */ + error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino); + if (error) + return error; + } + + /* Point the head of the list to point to this inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); +} + +/* Return the imap, dinode pointer, and buffer for an inode. */ +STATIC int +xfs_iunlink_map_ino( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = tp->t_mountp; + int error; + + imap->im_blkno = 0; + error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } + + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", + __func__, error); + return error; + } + + return 0; +} + +/* + * Walk the unlinked chain from @head_agino until we find the inode that + * points to @target_agino. 
Return the inode number, map, dinode pointer, + * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. + * + * @tp, @pag, @head_agino, and @target_agino are input parameters. + * @agino, @imap, @dipp, and @bpp are all output parameters. + * + * Do not call this function if @target_agino is the head of the list. + */ +STATIC int +xfs_iunlink_map_prev( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t head_agino, + xfs_agino_t target_agino, + xfs_agino_t *agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + struct xfs_perag *pag) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_agino_t next_agino; + int error; + + ASSERT(head_agino != target_agino); + *bpp = NULL; + + /* See if our backref cache can find it faster. */ + *agino = xfs_iunlink_lookup_backref(pag, target_agino); + if (*agino != NULLAGINO) { + error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); if (error) return error; - ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); - dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); + if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) + return 0; - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); + /* + * If we get here the cache contents were corrupt, so drop the + * buffer and fall back to walking the bucket list. + */ + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + WARN_ON_ONCE(1); } - /* - * Point the bucket head pointer at the inode being inserted. - */ - ASSERT(agino != 0); - agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); + trace_xfs_iunlink_map_prev_fallback(mp, agno); + + /* Otherwise, walk the entire bucket until we find it. */ + next_agino = head_agino; + while (next_agino != target_agino) { + xfs_agino_t unlinked_agino; + + if (*bpp) + xfs_trans_brelse(tp, *bpp); + + *agino = next_agino; + error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, + bpp); + if (error) + return error; + + unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); + /* + * Make sure this pointer is valid and isn't an obvious + * infinite loop. 
+ */ + if (!xfs_verify_agino(mp, agno, unlinked_agino) || + next_agino == unlinked_agino) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, + *dipp, sizeof(**dipp)); + error = -EFSCORRUPTED; + return error; + } + next_agino = unlinked_agino; + } + return 0; } @@ -2012,181 +2404,190 @@ */ STATIC int xfs_iunlink_remove( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_ino_t next_ino; - xfs_mount_t *mp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agnumber_t agno; - xfs_agino_t agino; - xfs_agino_t next_agino; - xfs_buf_t *last_ibp; - xfs_dinode_t *last_dip = NULL; - short bucket_index; - int offset, last_offset = 0; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + struct xfs_buf *last_ibp; + struct xfs_dinode *last_dip = NULL; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t next_agino; + xfs_agino_t head_agino; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; - mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + trace_xfs_iunlink_remove(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ + /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the head pointer isn't garbage. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - if (!xfs_verify_agino(mp, agno, agino)) - return -EFSCORRUPTED; - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - if (!xfs_verify_agino(mp, agno, - be32_to_cpu(agi->agi_unlinked[bucket_index]))) { + head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(mp, agno, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; } - if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { - /* - * We're at the head of the list. Get the inode's on-disk - * buffer to see if there is anyone after us on the list. - * Only modify our next pointer if it is not already NULLAGINO. - * This saves us the overhead of dealing with the buffer when - * there is no need to change it. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", - __func__, error); + /* + * Set our inode's next_unlinked pointer to NULL and then return + * the old pointer value so that we can update whatever was previous + * to us in the list to point to whatever was next in the list. + */ + error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); + if (error) + return error; + + /* + * If there was a backref pointing from the next inode back to this + * one, remove it because we've removed this inode from the list. + * + * Later, if this inode was in the middle of the list we'll update + * this inode's backref to point from the next inode. 
+ */ + if (next_agino != NULLAGINO) { + error = xfs_iunlink_change_backref(agibp->b_pag, next_agino, + NULLAGINO); + if (error) return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the bucket head pointer at the next inode. - */ - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - } else { - /* - * We need to search the list for the inode being freed. - */ - next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - last_ibp = NULL; - while (next_agino != agino) { - struct xfs_imap imap; - - if (last_ibp) - xfs_trans_brelse(tp, last_ibp); - - imap.im_blkno = 0; - next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); - - error = xfs_imap(mp, tp, next_ino, &imap, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap returned error %d.", - __func__, error); - return error; - } - - error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, - &last_ibp, 0, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } - - last_offset = imap.im_boffset; - next_agino = be32_to_cpu(last_dip->di_next_unlinked); - if (!xfs_verify_agino(mp, agno, next_agino)) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, - last_dip, sizeof(*last_dip)); - return -EFSCORRUPTED; - } - } - - /* - * Now last_ibp points to the buffer previous to us on the - * unlinked list. Pull us from the list. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", - __func__, error); - return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the previous inode on the list to the next inode. - */ - last_dip->di_next_unlinked = cpu_to_be32(next_agino); - ASSERT(next_agino != 0); - offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, last_dip); - - xfs_trans_inode_buf(tp, last_ibp); - xfs_trans_log_buf(tp, last_ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, last_ibp); } - return 0; + + if (head_agino != agino) { + struct xfs_imap imap; + xfs_agino_t prev_agino; + + /* We need to search the list for the inode being freed. 
*/ + error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, + &prev_agino, &imap, &last_dip, &last_ibp, + agibp->b_pag); + if (error) + return error; + + /* Point the previous inode on the list to the next inode. */ + xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, + last_dip, &imap, next_agino); + + /* + * Now we deal with the backref for this inode. If this inode + * pointed at a real inode, change the backref that pointed to + * us to point to our old next. If this inode was the end of + * the list, delete the backref that pointed to us. Note that + * change_backref takes care of deleting the backref if + * next_agino is NULLAGINO. + */ + return xfs_iunlink_change_backref(agibp->b_pag, agino, + next_agino); + } + + /* Point the head of the list to the next unlinked inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); +} + +/* + * Look up the inode number specified and if it is not already marked XFS_ISTALE + * mark it stale. We should only find clean inodes in this lookup that aren't + * already stale. + */ +static void +xfs_ifree_mark_inode_stale( + struct xfs_buf *bp, + struct xfs_inode *free_ip, + xfs_ino_t inum) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_perag *pag = bp->b_pag; + struct xfs_inode_log_item *iip; + struct xfs_inode *ip; + +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); + + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); + return; + } + + /* + * because this is an RCU protected lookup, we could find a recently + * freed or even reallocated inode during the lookup. We need to check + * under the i_flags_lock for a valid inode here. Skip it if it is not + * valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) + goto out_iflags_unlock; + + /* + * Don't try to lock/unlock the current inode, but we _cannot_ skip the + * other inodes that we did not find in the list attached to the buffer + * and are not already marked stale. If we can't lock it, back off and + * retry. + */ + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + delay(1); + goto retry; + } + } + ip->i_flags |= XFS_ISTALE; + + /* + * If the inode is flushing, it is already attached to the buffer. All + * we needed to do here is mark the inode stale so buffer IO completion + * will remove it from the AIL. + */ + iip = ip->i_itemp; + if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { + ASSERT(!list_empty(&iip->ili_item.li_bio_list)); + ASSERT(iip->ili_last_fields); + goto out_iunlock; + } + + /* + * Inodes not attached to the buffer can be released immediately. + * Everything else has to go through xfs_iflush_abort() on journal + * commit as the flock synchronises removal of the inode from the + * cluster buffer against inode reclaim. + */ + if (!iip || list_empty(&iip->ili_item.li_bio_list)) + goto out_iunlock; + + __xfs_iflags_set(ip, XFS_IFLUSHING); + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + /* we have a dirty inode in memory that has not yet been flushed. 
*/ + spin_lock(&iip->ili_lock); + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + spin_unlock(&iip->ili_lock); + ASSERT(iip->ili_last_fields); + + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return; + +out_iunlock: + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_iflags_unlock: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); } /* @@ -2196,31 +2597,23 @@ */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, + struct xfs_inode *free_ip, + struct xfs_trans *tp, struct xfs_icluster *xic) { - xfs_mount_t *mp = free_ip->i_mount; - int blks_per_cluster; - int inodes_per_cluster; + struct xfs_mount *mp = free_ip->i_mount; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + struct xfs_buf *bp; + xfs_daddr_t blkno; + xfs_ino_t inum = xic->first_ino; int nbufs; int i, j; int ioffset; - xfs_daddr_t blkno; - xfs_buf_t *bp; - xfs_inode_t *ip; - xfs_inode_log_item_t *iip; - struct xfs_log_item *lip; - struct xfs_perag *pag; - xfs_ino_t inum; + int error; - inum = xic->first_ino; - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - blks_per_cluster = xfs_icluster_size_fsb(mp); - inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; - nbufs = mp->m_ialloc_blks / blks_per_cluster; + nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; - for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { + for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { /* * The allocation bitmap tells us which inodes of the chunk were * physically allocated. Skip the cluster if an inode falls into @@ -2228,7 +2621,7 @@ */ ioffset = inum - xic->first_ino; if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { - ASSERT(ioffset % inodes_per_cluster == 0); + ASSERT(ioffset % igeo->inodes_per_cluster == 0); continue; } @@ -2237,18 +2630,18 @@ /* * We obtain and lock the backing buffer first in the process - * here, as we have to ensure that any dirty inode that we - * can't get the flush lock on is attached to the buffer. + * here to ensure dirty inodes attached to the buffer remain in + * the flushing state while we mark them stale. + * * If we scan the in-memory inodes first, then buffer IO can * complete before we get a lock on it, and hence we may fail * to mark all the active inodes on the buffer stale. */ - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, - XBF_UNMAPPED); - - if (!bp) - return -ENOMEM; + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * igeo->blocks_per_cluster, + XBF_UNMAPPED, &bp); + if (error) + return error; /* * This buffer may not have been correctly initialised as we @@ -2259,159 +2652,30 @@ * want it to fail. We can acheive this by adding a write * verifier to the buffer. */ - bp->b_ops = &xfs_inode_buf_ops; + bp->b_ops = &xfs_inode_buf_ops; /* - * Walk the inodes already attached to the buffer and mark them - * stale. These will all have the flush locks held, so an - * in-memory inode walk can't lock them. By marking them all - * stale first, we will not attempt to lock them in the loop - * below as the XFS_ISTALE flag will be set. + * Now we need to set all the cached clean inodes as XFS_ISTALE, + * too. This requires lookups, and will skip inodes that we've + * already marked XFS_ISTALE. 
*/ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { - if (lip->li_type == XFS_LI_INODE) { - iip = (xfs_inode_log_item_t *)lip; - ASSERT(iip->ili_logged == 1); - lip->li_cb = xfs_istale_done; - xfs_trans_ail_copy_lsn(mp->m_ail, - &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - } - } - - - /* - * For each inode in memory attempt to add it to the inode - * buffer and set it up for being staled on buffer IO - * completion. This is safe as we've locked out tail pushing - * and flushing by locking the buffer. - * - * We have already marked every inode that was part of a - * transaction stale above, which means there is no point in - * even trying to lock them. - */ - for (i = 0; i < inodes_per_cluster; i++) { -retry: - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, (inum + i))); - - /* Inode not in memory, nothing to do */ - if (!ip) { - rcu_read_unlock(); - continue; - } - - /* - * because this is an RCU protected lookup, we could - * find a recently freed or even reallocated inode - * during the lookup. We need to check under the - * i_flags_lock for a valid inode here. Skip it if it - * is not valid, the wrong inode or stale. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum + i || - __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - continue; - } - spin_unlock(&ip->i_flags_lock); - - /* - * Don't try to lock/unlock the current inode, but we - * _cannot_ skip the other inodes that we did not find - * in the list attached to the buffer and are not - * already marked stale. If we can't lock it, back off - * and retry. - */ - if (ip != free_ip) { - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; - } - - /* - * Check the inode number again in case we're - * racing with freeing in xfs_reclaim_inode(). - * See the comments in that function for more - * information as to why the initial check is - * not sufficient. - */ - if (ip->i_ino != inum + i) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - rcu_read_unlock(); - continue; - } - } - rcu_read_unlock(); - - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_ISTALE); - - /* - * we don't need to attach clean inodes or those only - * with unlogged changes (which we throw away, anyway). - */ - iip = ip->i_itemp; - if (!iip || xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - - iip->ili_last_fields = iip->ili_fields; - iip->ili_fields = 0; - iip->ili_fsync_fields = 0; - iip->ili_logged = 1; - xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, - &iip->ili_item.li_lsn); - - xfs_buf_attach_iodone(bp, xfs_istale_done, - &iip->ili_item); - - if (ip != free_ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + for (i = 0; i < igeo->inodes_per_cluster; i++) + xfs_ifree_mark_inode_stale(bp, free_ip, inum + i); xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } - - xfs_perag_put(pag); return 0; } /* - * Free any local-format buffers sitting around before we reset to - * extents format. - */ -static inline void -xfs_ifree_local_data( - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_ifork *ifp; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return; - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); -} - -/* - * This is called to return an inode to the inode free list. 
- * The inode should already be truncated to 0 length and have - * no pages associated with it. This routine also assumes that - * the inode is already a part of the transaction. + * This is called to return an inode to the inode free list. The inode should + * already be truncated to 0 length and have no pages associated with it. This + * routine also assumes that the inode is already a part of the transaction. * - * The on-disk copy of the inode will have been added to the list - * of unlinked inodes in the AGI. We need to remove the inode from - * that list atomically with respect to freeing it here. + * The on-disk copy of the inode will have been added to the list of unlinked + * inodes in the AGI. We need to remove the inode from that list atomically with + * respect to freeing it here. */ int xfs_ifree( @@ -2420,38 +2684,50 @@ { int error; struct xfs_icluster xic = { 0 }; + struct xfs_inode_log_item *iip = ip->i_itemp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(ip->i_d.di_nextents == 0); - ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_df.if_nextents == 0); ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); ASSERT(ip->i_d.di_nblocks == 0); /* - * Pull the on-disk inode from the AGI unlinked list. + * Free the inode first so that we guarantee that the AGI lock is going + * to be taken before we remove the inode from the unlinked list. This + * makes the AGI lock -> unlinked list modification order the same as + * used in O_TMPFILE creation. */ - error = xfs_iunlink_remove(tp, ip); - if (error) - return error; - error = xfs_difree(tp, ip->i_ino, &xic); if (error) return error; - xfs_ifree_local_data(ip, XFS_DATA_FORK); - xfs_ifree_local_data(ip, XFS_ATTR_FORK); + error = xfs_iunlink_remove(tp, ip); + if (error) + return error; + + /* + * Free any local-format data sitting around before we reset the + * data fork to extents format. Note that the attr fork data has + * already been freed by xfs_attr_inactive. 
+ */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + kmem_free(ip->i_df.if_u1.if_data); + ip->i_df.if_u1.if_data = NULL; + ip->i_df.if_bytes = 0; + } VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; - ip->i_d.di_flags2 = 0; + ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; /* Don't attempt to replay owner changes for a deleted inode */ - ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); + spin_lock(&iip->ili_lock); + iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); + spin_unlock(&iip->ili_lock); /* * Bump the generation count so no one will be confused @@ -2480,7 +2756,7 @@ trace_xfs_inode_unpin_nowait(ip, _RET_IP_); /* Give the log a push to start the unpinning I/O */ - xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL); + xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); } @@ -2769,9 +3045,7 @@ error = xfs_droplink(tp, dp2); if (error) goto out_trans_abort; - error = xfs_bumplink(tp, dp1); - if (error) - goto out_trans_abort; + xfs_bumplink(tp, dp1); } /* @@ -2795,9 +3069,7 @@ error = xfs_droplink(tp, dp1); if (error) goto out_trans_abort; - error = xfs_bumplink(tp, dp2); - if (error) - goto out_trans_abort; + xfs_bumplink(tp, dp2); } /* @@ -2835,7 +3107,7 @@ /* * xfs_rename_alloc_whiteout() * - * Return a referenced, unlinked, unlocked inode that that can be used as a + * Return a referenced, unlinked, unlocked inode that can be used as a * whiteout in a rename transaction. We use a tmpfile inode here so that if we * crash between allocating the inode and linking it into the rename transaction * recovery will free the inode and we won't leak it. @@ -2882,6 +3154,7 @@ struct xfs_trans *tp; struct xfs_inode *wip = NULL; /* whiteout inode */ struct xfs_inode *inodes[__XFS_SORT_INODES]; + int i; int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); @@ -2899,7 +3172,6 @@ * appropriately. */ if (flags & RENAME_WHITEOUT) { - ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); error = xfs_rename_alloc_whiteout(target_dp, &wip); if (error) return error; @@ -2956,7 +3228,7 @@ * tree quota mechanism would be circumvented. */ if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + target_dp->i_d.di_projid != src_ip->i_d.di_projid)) { error = -EXDEV; goto out_trans_cancel; } @@ -2995,6 +3267,30 @@ } /* + * Lock the AGI buffers we need to handle bumping the nlink of the + * whiteout inode off the unlinked list and to handle dropping the + * nlink of the target inode. Per locking order rules, do this in + * increasing AG order and before directory block allocation tries to + * grab AGFs because we grab AGIs before AGFs. + * + * The (vfs) caller must ensure that if src is a directory then + * target_ip is either null or an empty directory. 
+ */ + for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { + if (inodes[i] == wip || + (inodes[i] == target_ip && + (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { + struct xfs_buf *bp; + xfs_agnumber_t agno; + + agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino); + error = xfs_read_agi(mp, tp, agno, &bp); + if (error) + goto out_trans_cancel; + } + } + + /* * Directory entry creation below may acquire the AGF. Remove * the whiteout from the unlinked list first to preserve correct * AGI/AGF locking order. This dirties the transaction so failures @@ -3013,7 +3309,6 @@ goto out_trans_cancel; xfs_bumplink(tp, wip); - xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); VFS_I(wip)->i_state &= ~I_LINKABLE; } @@ -3035,9 +3330,7 @@ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); if (new_parent && src_is_directory) { - error = xfs_bumplink(tp, target_dp); - if (error) - goto out_trans_cancel; + xfs_bumplink(tp, target_dp); } } else { /* target_ip != NULL */ /* @@ -3148,373 +3441,76 @@ return error; } -STATIC int -xfs_iflush_cluster( - struct xfs_inode *ip, - struct xfs_buf *bp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - unsigned long first_index, mask; - unsigned long inodes_per_cluster; - int cilist_size; - struct xfs_inode **cilist; - struct xfs_inode *cip; - int nr_found; - int clcount = 0; - int i; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - - inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; - cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); - cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); - if (!cilist) - goto out_put; - - mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; - rcu_read_lock(); - /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, - first_index, inodes_per_cluster); - if (nr_found == 0) - goto out_free; - - for (i = 0; i < nr_found; i++) { - cip = cilist[i]; - if (cip == ip) - continue; - - /* - * because this is an RCU protected lookup, we could find a - * recently freed or even reallocated inode during the lookup. - * We need to check under the i_flags_lock for a valid inode - * here. Skip it if it is not valid or the wrong inode. - */ - spin_lock(&cip->i_flags_lock); - if (!cip->i_ino || - __xfs_iflags_test(cip, XFS_ISTALE)) { - spin_unlock(&cip->i_flags_lock); - continue; - } - - /* - * Once we fall off the end of the cluster, no point checking - * any more inodes in the list because they will also all be - * outside the cluster. - */ - if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { - spin_unlock(&cip->i_flags_lock); - break; - } - spin_unlock(&cip->i_flags_lock); - - /* - * Do an un-protected check to see if the inode is dirty and - * is a candidate for flushing. These checks will be repeated - * later after the appropriate locks are acquired. - */ - if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) - continue; - - /* - * Try to get locks. If any are unavailable or it is pinned, - * then this inode cannot be flushed and is skipped. - */ - - if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) - continue; - if (!xfs_iflock_nowait(cip)) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - if (xfs_ipincount(cip)) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - - /* - * Check the inode number again, just to be certain we are not - * racing with freeing in xfs_reclaim_inode(). 
See the comments - * in that function for more information as to why the initial - * check is not sufficient. - */ - if (!cip->i_ino) { - xfs_ifunlock(cip); - xfs_iunlock(cip, XFS_ILOCK_SHARED); - continue; - } - - /* - * arriving here means that this inode can be flushed. First - * re-check that it's dirty before flushing. - */ - if (!xfs_inode_clean(cip)) { - int error; - error = xfs_iflush_int(cip, bp); - if (error) { - xfs_iunlock(cip, XFS_ILOCK_SHARED); - goto cluster_corrupt_out; - } - clcount++; - } else { - xfs_ifunlock(cip); - } - xfs_iunlock(cip, XFS_ILOCK_SHARED); - } - - if (clcount) { - XFS_STATS_INC(mp, xs_icluster_flushcnt); - XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); - } - -out_free: - rcu_read_unlock(); - kmem_free(cilist); -out_put: - xfs_perag_put(pag); - return 0; - - -cluster_corrupt_out: - /* - * Corruption detected in the clustering loop. Invalidate the - * inode buffer and shut down the filesystem. - */ - rcu_read_unlock(); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - - /* - * We'll always have an inode attached to the buffer for completion - * process by the time we are called from xfs_iflush(). Hence we have - * always need to do IO completion processing to abort the inodes - * attached to the buffer. handle them just like the shutdown case in - * xfs_buf_submit(). - */ - ASSERT(bp->b_iodone); - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp); - - /* abort the corrupt inode, as it was not attached to the buffer */ - xfs_iflush_abort(cip, false); - kmem_free(cilist); - xfs_perag_put(pag); - return -EFSCORRUPTED; -} - -/* - * Flush dirty inode metadata into the backing buffer. - * - * The caller must have the inode lock and the inode flush lock held. The - * inode lock will still be held upon return to the caller, and the inode - * flush lock will be released after the inode has reached the disk. - * - * The caller must write out the buffer returned in *bpp and release it. - */ -int +static int xfs_iflush( - struct xfs_inode *ip, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp = NULL; - struct xfs_dinode *dip; - int error; - - XFS_STATS_INC(mp, xs_iflush_count); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - - *bpp = NULL; - - xfs_iunpin_wait(ip); - - /* - * For stale inodes we cannot rely on the backing buffer remaining - * stale in cache for the remaining life of the stale inode and so - * xfs_imap_to_bp() below may give us a buffer that no longer contains - * inodes below. We have to check this after ensuring the inode is - * unpinned so that it is safe to reclaim the stale inode after the - * flush call. - */ - if (xfs_iflags_test(ip, XFS_ISTALE)) { - xfs_ifunlock(ip); - return 0; - } - - /* - * This may have been unpinned because the filesystem is shutting - * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk. - * - * We also have to remove the log item from the AIL in this case, - * as we wait for an empty AIL as part of the unmount process. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - goto abort_out; - } - - /* - * Get the buffer containing the on-disk inode. We are doing a try-lock - * operation here, so we may get an EAGAIN error. In that case, we - * simply want to return with the inode still dirty. 
- *
- * If we get any other error, we effectively have a corruption situation
- * and we cannot flush the inode, so we treat it the same as failing
- * xfs_iflush_int().
- */
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
- 0);
- if (error == -EAGAIN) {
- xfs_ifunlock(ip);
- return error;
- }
- if (error)
- goto corrupt_out;
-
- /*
- * First flush out the inode that xfs_iflush was called with.
- */
- error = xfs_iflush_int(ip, bp);
- if (error)
- goto corrupt_out;
-
- /*
- * If the buffer is pinned then push on the log now so we won't
- * get stuck waiting in the write for too long.
- */
- if (xfs_buf_ispinned(bp))
- xfs_log_force(mp, 0);
-
- /*
- * inode clustering: try to gather other inodes into this write
- *
- * Note: Any error during clustering will result in the filesystem
- * being shut down and completion callbacks run on the cluster buffer.
- * As we have already flushed and attached this inode to the buffer,
- * it has already been aborted and released by xfs_iflush_cluster() and
- * so we have no further error handling to do here.
- */
- error = xfs_iflush_cluster(ip, bp);
- if (error)
- return error;
-
- *bpp = bp;
- return 0;
-
-corrupt_out:
- if (bp)
- xfs_buf_relse(bp);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-abort_out:
- /* abort the corrupt inode, as it was not attached to the buffer */
- xfs_iflush_abort(ip, false);
- return error;
-}
-
-/*
- * If there are inline format data / attr forks attached to this inode,
- * make sure they're not corrupt.
- */
-bool
-xfs_inode_verify_forks(
- struct xfs_inode *ip)
-{
- struct xfs_ifork *ifp;
- xfs_failaddr_t fa;
-
- fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops);
- if (fa) {
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
- ifp->if_u1.if_data, ifp->if_bytes, fa);
- return false;
- }
-
- fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops);
- if (fa) {
- ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
- ifp ? ifp->if_u1.if_data : NULL,
- ifp ? ifp->if_bytes : 0, fa);
- return false;
- }
- return true;
-}
-
-STATIC int
-xfs_iflush_int(
 struct xfs_inode *ip,
 struct xfs_buf *bp)
 {
 struct xfs_inode_log_item *iip = ip->i_itemp;
 struct xfs_dinode *dip;
 struct xfs_mount *mp = ip->i_mount;
+ int error;
 
 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(xfs_isiflocked(ip));
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
- ASSERT(iip != NULL && iip->ili_fields != 0);
- ASSERT(ip->i_d.di_version > 1);
+ ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
+ ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
+ ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
+ ASSERT(iip->ili_item.li_buf == bp);
 
- /* set *dip = inode's place in the buffer */
 dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
+ /*
+ * We don't flush the inode if any of the following checks fail, but we
+ * do still update the log item and attach to the backing buffer as if
+ * the flush happened. This is a formality to facilitate predictable
+ * error handling as the caller will shut down and fail the buffer.
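+ *
+ * error is primed to -EFSCORRUPTED here; each verifier failure
+ * below jumps to flush_out with that value, and only the
+ * fall-through path resets it to zero once all checks pass.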
+ */ + error = -EFSCORRUPTED; if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); - goto corrupt_out; + goto flush_out; } if (S_ISREG(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE, mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && - (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > + if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %Lu, " "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, __func__, ip->i_ino, - ip->i_d.di_nextents + ip->i_d.di_anextents, + ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), ip->i_d.di_nblocks, ip); - goto corrupt_out; + goto flush_out; } if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_d.di_forkoff, ip); - goto corrupt_out; + goto flush_out; } /* @@ -3526,12 +3522,19 @@ * backwards compatibility with old kernels that predate logging all * inode changes. */ - if (ip->i_d.di_version < 3) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) ip->i_d.di_flushiter++; - /* Check the inline fork data before we write out. */ - if (!xfs_inode_verify_forks(ip)) - goto corrupt_out; + /* + * If there are inline format data / attr forks attached to this inode, + * make sure they are not corrupt. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_data(ip)) + goto flush_out; + if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_attr(ip)) + goto flush_out; /* * Copy the dirty parts of the inode into the on-disk inode. We always @@ -3547,7 +3550,6 @@ xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); if (XFS_IFORK_Q(ip)) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); - xfs_inobp_check(mp, bp); /* * We've recorded everything logged in the inode, so we'd like to clear @@ -3560,45 +3562,144 @@ * * What we do is move the bits to the ili_last_fields field. When * logging the inode, these bits are moved back to the ili_fields field. - * In the xfs_iflush_done() routine we clear ili_last_fields, since we - * know that the information those bits represent is permanently on + * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since + * we know that the information those bits represent is permanently on * disk. 
As long as the flush completes before the inode is logged * again, then both ili_fields and ili_last_fields will be cleared. - * - * We can play with the ili_fields bits here, because the inode lock - * must be held exclusively in order to set bits there and the flush - * lock protects the ili_last_fields bits. Set ili_logged so the flush - * done routine can tell whether or not to look in the AIL. Also, store - * the current LSN of the inode so that we can tell whether the item has - * moved in the AIL from xfs_iflush_done(). In order to read the lsn we - * need the AIL lock, because it is a 64 bit value that cannot be read - * atomically. */ + error = 0; +flush_out: + spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; - iip->ili_logged = 1; + spin_unlock(&iip->ili_lock); + /* + * Store the current LSN of the inode so that we can tell whether the + * item has moved in the AIL from xfs_buf_inode_iodone(). + */ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); - /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. - */ - xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); - /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); + return error; +} - ASSERT(!list_empty(&bp->b_li_list)); - ASSERT(bp->b_iodone != NULL); +/* + * Non-blocking flush of dirty inode metadata into the backing buffer. + * + * The caller must have a reference to the inode and hold the cluster buffer + * locked. The function will walk across all the inodes on the cluster buffer it + * can find and lock without blocking, and flush them to the cluster buffer. + * + * On successful flushing of at least one inode, the caller must write out the + * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and + * the caller needs to release the buffer. On failure, the filesystem will be + * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED + * will be returned. + */ +int +xfs_iflush_cluster( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_log_item *lip, *n; + struct xfs_inode *ip; + struct xfs_inode_log_item *iip; + int clcount = 0; + int error = 0; + + /* + * We must use the safe variant here as on shutdown xfs_iflush_abort() + * can remove itself from the list. + */ + list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { + iip = (struct xfs_inode_log_item *)lip; + ip = iip->ili_inode; + + /* + * Quick and dirty check to avoid locks if possible. + */ + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) + continue; + if (xfs_ipincount(ip)) + continue; + + /* + * The inode is still attached to the buffer, which means it is + * dirty but reclaim might try to grab it. Check carefully for + * that, and grab the ilock while still holding the i_flags_lock + * to guarantee reclaim will not be able to reclaim this inode + * once we drop the i_flags_lock. + */ + spin_lock(&ip->i_flags_lock); + ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { + spin_unlock(&ip->i_flags_lock); + continue; + } + + /* + * ILOCK will pin the inode against reclaim and prevent + * concurrent transactions modifying the inode while we are + * flushing the inode. If we get the lock, set the flushing + * state before we drop the i_flags_lock. 
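+ *
+ * XFS_IFLUSHING replaces the old flush lock: all i_flags updates
+ * nest under i_flags_lock, so publishing the flag before the lock
+ * is dropped guarantees reclaim will observe it.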
+ */
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Abort flushing this inode if we are shut down because the
+ * inode may not currently be in the AIL. This can occur when
+ * log I/O failure unpins the inode without inserting into the
+ * AIL, leaving a dirty/unpinned inode attached to the buffer
+ * that otherwise looks like it should be flushed.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_iunpin_wait(ip);
+ xfs_iflush_abort(ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ error = -EIO;
+ continue;
+ }
+
+ /* don't block waiting on a log force to unpin dirty inodes */
+ if (xfs_ipincount(ip)) {
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ continue;
+ }
+
+ if (!xfs_inode_clean(ip))
+ error = xfs_iflush(ip, bp);
+ else
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (error)
+ break;
+ clcount++;
+ }
+
+ if (error) {
+ bp->b_flags |= XBF_ASYNC;
+ xfs_buf_ioend_fail(bp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ if (!clcount)
+ return -EAGAIN;
+
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
 return 0;
-corrupt_out:
- return -EFSCORRUPTED;
 }
 
 /* Release an inode. */
@@ -3609,3 +3710,115 @@
 trace_xfs_irele(ip, _RET_IP_);
 iput(VFS_I(ip));
 }
+
+/*
+ * Ensure all committed transactions touching the inode are written to the log.
+ */
+int
+xfs_log_force_inode(
+ struct xfs_inode *ip)
+{
+ xfs_csn_t seq = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ seq = ip->i_itemp->ili_commit_seq;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!seq)
+ return 0;
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
+}
+
+/*
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide by vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding. The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
+ */
+static int
+xfs_iolock_two_inodes_and_break_layout(
+ struct inode *src,
+ struct inode *dest)
+{
+ int error;
+
+ if (src > dest)
+ swap(src, dest);
+
+retry:
+ /* Wait to break both inodes' layouts before we start locking. */
+ error = break_layout(src, true);
+ if (error)
+ return error;
+ if (src != dest) {
+ error = break_layout(dest, true);
+ if (error)
+ return error;
+ }
+
+ /* Lock one inode and make sure nobody got in and leased it. */
+ inode_lock(src);
+ error = break_layout(src, false);
+ if (error) {
+ inode_unlock(src);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ if (src == dest)
+ return 0;
+
+ /* Lock the other inode and make sure nobody got in and leased it. */
+ inode_lock_nested(dest, I_MUTEX_NONDIR2);
+ error = break_layout(dest, false);
+ if (error) {
+ inode_unlock(src);
+ inode_unlock(dest);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
+ * mmap activity.
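+ *
+ * The caller undoes both levels of locking with xfs_iunlock2_io_mmap()
+ * below.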
+ */ +int +xfs_ilock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + int ret; + + ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); + if (ret) + return ret; + if (ip1 == ip2) + xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); + else + xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, + ip2, XFS_MMAPLOCK_EXCL); + return 0; +} + +/* Unlock both inodes to allow IO and mmap activity. */ +void +xfs_iunlock2_io_mmap( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + bool same_inode = (ip1 == ip2); + + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + inode_unlock(VFS_I(ip2)); + if (!same_inode) + inode_unlock(VFS_I(ip1)); +} -- Gitblit v1.6.2
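
A minimal usage sketch of the two locking helpers added at the end of this
patch. xfs_do_locked_work() is a hypothetical placeholder for whatever
operation (e.g. a reflink remap) needs both inodes quiesced; only
xfs_ilock2_io_mmap() and xfs_iunlock2_io_mmap() come from the patch itself:

/* Hypothetical caller, for illustration only -- not part of this patch. */
static int
xfs_do_two_inode_op(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	int			error;

	/*
	 * Takes i_rwsem on both inodes (breaking any layout leases) and
	 * then XFS_MMAPLOCK_EXCL, in the order the locking rules require.
	 */
	error = xfs_ilock2_io_mmap(ip1, ip2);
	if (error)
		return error;

	error = xfs_do_locked_work(ip1, ip2);	/* placeholder */

	/* Drops the mmap locks and then i_rwsem on both inodes. */
	xfs_iunlock2_io_mmap(ip1, ip2);
	return error;
}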