From 95099d4622f8cb224d94e314c7a8e0df60b13f87 Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Sat, 09 Dec 2023 08:38:01 +0000
Subject: [PATCH] enable docker ppp
---
kernel/fs/xfs/xfs_log.c | 2083 ++++++++++++++++++++++++++---------------------------------
1 files changed, 912 insertions(+), 1,171 deletions(-)
diff --git a/kernel/fs/xfs/xfs_log.c b/kernel/fs/xfs/xfs_log.c
index 8b1b086..22d7d74 100644
--- a/kernel/fs/xfs/xfs_log.c
+++ b/kernel/fs/xfs/xfs_log.c
@@ -16,24 +16,14 @@
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_log_recover.h"
-#include "xfs_inode.h"
#include "xfs_trace.h"
-#include "xfs_fsops.h"
-#include "xfs_cksum.h"
#include "xfs_sysfs.h"
#include "xfs_sb.h"
+#include "xfs_health.h"
kmem_zone_t *xfs_log_ticket_zone;
/* Local miscellaneous function prototypes */
-STATIC int
-xlog_commit_record(
- struct xlog *log,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp);
-
STATIC struct xlog *
xlog_alloc_log(
struct xfs_mount *mp,
@@ -44,20 +34,12 @@
xlog_space_left(
struct xlog *log,
atomic64_t *head);
-STATIC int
-xlog_sync(
- struct xlog *log,
- struct xlog_in_core *iclog);
STATIC void
xlog_dealloc_log(
struct xlog *log);
/* local state machine functions */
-STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
-STATIC void
-xlog_state_do_callback(
- struct xlog *log,
- int aborted,
+STATIC void xlog_state_done_syncing(
struct xlog_in_core *iclog);
STATIC int
xlog_state_get_iclog_space(
@@ -67,33 +49,19 @@
struct xlog_ticket *ticket,
int *continued_write,
int *logoffsetp);
-STATIC int
-xlog_state_release_iclog(
- struct xlog *log,
- struct xlog_in_core *iclog);
STATIC void
xlog_state_switch_iclogs(
struct xlog *log,
struct xlog_in_core *iclog,
int eventual_size);
STATIC void
-xlog_state_want_sync(
- struct xlog *log,
- struct xlog_in_core *iclog);
-
-STATIC void
xlog_grant_push_ail(
struct xlog *log,
int need_bytes);
STATIC void
-xlog_regrant_reserve_log_space(
+xlog_sync(
struct xlog *log,
- struct xlog_ticket *ticket);
-STATIC void
-xlog_ungrant_log_space(
- struct xlog *log,
- struct xlog_ticket *ticket);
-
+ struct xlog_in_core *iclog);
#if defined(DEBUG)
STATIC void
xlog_verify_dest_ptr(
@@ -106,8 +74,7 @@
xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
- int count,
- bool syncing);
+ int count);
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
@@ -116,7 +83,7 @@
#else
#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
-#define xlog_verify_iclog(a,b,c,d)
+#define xlog_verify_iclog(a,b,c)
#define xlog_verify_tail_lsn(a,b,c)
#endif
@@ -225,15 +192,42 @@
{
struct xlog_ticket *tic;
int need_bytes;
+ bool woken_task = false;
list_for_each_entry(tic, &head->waiters, t_queue) {
+
+ /*
+ * There is a chance that the size of the CIL checkpoints in
+ * progress at the last AIL push target calculation resulted in
+ * limiting the target to the log head (l_last_sync_lsn) at the
+ * time. This may not reflect where the log head is now as the
+ * CIL checkpoints may have completed.
+ *
+ * Hence when we are woken here, it may be that the head of the
+ * log that has moved rather than the tail. As the tail didn't
+ * move, there still won't be space available for the
+ * reservation we require. However, if the AIL has already
+ * pushed to the target defined by the old log head location, we
+ * will hang here waiting for something else to update the AIL
+ * push target.
+ *
+ * Therefore, if there isn't space to wake the first waiter on
+ * the grant head, we need to push the AIL again to ensure the
+ * target reflects both the current log tail and log head
+ * position before we wait for the tail to move again.
+ */
+
need_bytes = xlog_ticket_reservation(log, head, tic);
- if (*free_bytes < need_bytes)
+ if (*free_bytes < need_bytes) {
+ if (!woken_task)
+ xlog_grant_push_ail(log, need_bytes);
return false;
+ }
*free_bytes -= need_bytes;
trace_xfs_log_grant_wake_up(log, tic);
wake_up_process(tic->t_task);
+ woken_task = true;
}
return true;
@@ -353,6 +347,25 @@
tic->t_res_num++;
}
+bool
+xfs_log_writable(
+ struct xfs_mount *mp)
+{
+ /*
+ * Never write to the log on norecovery mounts, if the block device is
+ * read-only, or if the filesystem is shutdown. Read-only mounts still
+ * allow internal writes for log recovery and unmount purposes, so don't
+ * restrict that case here.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return false;
+ if (xfs_readonly_buftarg(mp->m_log->l_targ))
+ return false;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return false;
+ return true;
+}
+
/*
* Replenish the byte reservation required by moving the grant write head.
*/
@@ -439,11 +452,7 @@
XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
- tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
- KM_SLEEP | KM_MAYFAIL);
- if (!tic)
- return -ENOMEM;
-
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -473,110 +482,67 @@
return error;
}
-
-/*
- * NOTES:
- *
- * 1. currblock field gets updated at startup and after in-core logs
- * marked as with WANT_SYNC.
- */
-
-/*
- * This routine is called when a user of a log manager ticket is done with
- * the reservation. If the ticket was ever used, then a commit record for
- * the associated transaction is written out as a log operation header with
- * no data. The flag XLOG_TIC_INITED is set when the first write occurs with
- * a given ticket. If the ticket was one with a permanent reservation, then
- * a few operations are done differently. Permanent reservation tickets by
- * default don't release the reservation. They just commit the current
- * transaction with the belief that the reservation is still needed. A flag
- * must be passed in before permanent reservations are actually released.
- * When these type of tickets are not released, they need to be set into
- * the inited state again. By doing this, a start record will be written
- * out when the next write occurs.
- */
-xfs_lsn_t
-xfs_log_done(
- struct xfs_mount *mp,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- bool regrant)
-{
- struct xlog *log = mp->m_log;
- xfs_lsn_t lsn = 0;
-
- if (XLOG_FORCED_SHUTDOWN(log) ||
- /*
- * If nothing was ever written, don't write out commit record.
- * If we get an error, just continue and give back the log ticket.
- */
- (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
- (xlog_commit_record(log, ticket, iclog, &lsn)))) {
- lsn = (xfs_lsn_t) -1;
- regrant = false;
- }
-
-
- if (!regrant) {
- trace_xfs_log_done_nonperm(log, ticket);
-
- /*
- * Release ticket if not permanent reservation or a specific
- * request has been made to release a permanent reservation.
- */
- xlog_ungrant_log_space(log, ticket);
- } else {
- trace_xfs_log_done_perm(log, ticket);
-
- xlog_regrant_reserve_log_space(log, ticket);
- /* If this ticket was a permanent reservation and we aren't
- * trying to release it, reset the inited flags; so next time
- * we write, a start record will be written out.
- */
- ticket->t_flags |= XLOG_TIC_INITED;
- }
-
- xfs_log_ticket_put(ticket);
- return lsn;
-}
-
-/*
- * Attaches a new iclog I/O completion callback routine during
- * transaction commit. If the log is in error state, a non-zero
- * return code is handed back and the caller is responsible for
- * executing the callback at an appropriate time.
- */
-int
-xfs_log_notify(
- struct xlog_in_core *iclog,
- xfs_log_callback_t *cb)
-{
- int abortflg;
-
- spin_lock(&iclog->ic_callback_lock);
- abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
- if (!abortflg) {
- ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
- (iclog->ic_state == XLOG_STATE_WANT_SYNC));
- cb->cb_next = NULL;
- *(iclog->ic_callback_tail) = cb;
- iclog->ic_callback_tail = &(cb->cb_next);
- }
- spin_unlock(&iclog->ic_callback_lock);
- return abortflg;
-}
-
-int
-xfs_log_release_iclog(
- struct xfs_mount *mp,
+static bool
+__xlog_state_release_iclog(
+ struct xlog *log,
struct xlog_in_core *iclog)
{
- if (xlog_state_release_iclog(mp->m_log, iclog)) {
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ lockdep_assert_held(&log->l_icloglock);
+
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+ /* update tail before writing to iclog */
+ xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+
+ iclog->ic_state = XLOG_STATE_SYNCING;
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ xlog_verify_tail_lsn(log, iclog, tail_lsn);
+ /* cycle incremented when incrementing curr_block */
+ return true;
+ }
+
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ return false;
+}
+
+/*
+ * Flush iclog to disk if this is the last reference to the given iclog and the
+ * it is in the WANT_SYNC state.
+ */
+static int
+xlog_state_release_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog)
+{
+ lockdep_assert_held(&log->l_icloglock);
+
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
return -EIO;
+
+ if (atomic_dec_and_test(&iclog->ic_refcnt) &&
+ __xlog_state_release_iclog(log, iclog)) {
+ spin_unlock(&log->l_icloglock);
+ xlog_sync(log, iclog);
+ spin_lock(&log->l_icloglock);
}
return 0;
+}
+
+void
+xfs_log_release_iclog(
+ struct xlog_in_core *iclog)
+{
+ struct xlog *log = iclog->ic_log;
+ bool sync = false;
+
+ if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) {
+ if (iclog->ic_state != XLOG_STATE_IOERROR)
+ sync = __xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ }
+
+ if (sync)
+ xlog_sync(log, iclog);
}
/*
@@ -799,6 +765,9 @@
if (readonly)
mp->m_flags |= XFS_MOUNT_RDONLY;
+ /* Make sure the log is dead if we're returning failure. */
+ ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR));
+
return error;
}
@@ -806,45 +775,78 @@
* The mount has failed. Cancel the recovery if it hasn't completed and destroy
* the log.
*/
-int
+void
xfs_log_mount_cancel(
struct xfs_mount *mp)
{
- int error;
-
- error = xlog_recover_cancel(mp->m_log);
+ xlog_recover_cancel(mp->m_log);
xfs_log_unmount(mp);
-
- return error;
}
/*
- * Final log writes as part of unmount.
- *
- * Mark the filesystem clean as unmount happens. Note that during relocation
- * this routine needs to be executed as part of source-bag while the
- * deallocation must not be done until source-end.
+ * Wait for the iclog to be written disk, or return an error if the log has been
+ * shut down.
*/
-
-/* Actually write the unmount record to disk. */
-static void
-xfs_log_write_unmount_record(
- struct xfs_mount *mp)
+static int
+xlog_wait_on_iclog(
+ struct xlog_in_core *iclog)
+ __releases(iclog->ic_log->l_icloglock)
{
- /* the data section must be 32 bit size aligned */
- struct xfs_unmount_log_format magic = {
+ struct xlog *log = iclog->ic_log;
+
+ if (!XLOG_FORCED_SHUTDOWN(log) &&
+ iclog->ic_state != XLOG_STATE_ACTIVE &&
+ iclog->ic_state != XLOG_STATE_DIRTY) {
+ XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+ } else {
+ spin_unlock(&log->l_icloglock);
+ }
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return -EIO;
+ return 0;
+}
+
+/*
+ * Write out an unmount record using the ticket provided. We have to account for
+ * the data space used in the unmount ticket as this write is not done from a
+ * transaction context that has already done the accounting for us.
+ */
+static int
+xlog_write_unmount_record(
+ struct xlog *log,
+ struct xlog_ticket *ticket,
+ xfs_lsn_t *lsn,
+ uint flags)
+{
+ struct xfs_unmount_log_format ulf = {
.magic = XLOG_UNMOUNT_TYPE,
};
struct xfs_log_iovec reg = {
- .i_addr = &magic,
- .i_len = sizeof(magic),
+ .i_addr = &ulf,
+ .i_len = sizeof(ulf),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
.lv_niovecs = 1,
.lv_iovecp = ®,
};
- struct xlog *log = mp->m_log;
+
+ /* account for space used by record data */
+ ticket->t_curr_res -= sizeof(ulf);
+ return xlog_write(log, &vec, ticket, lsn, NULL, flags, false);
+}
+
+/*
+ * Mark the filesystem clean by writing an unmount record to the head of the
+ * log.
+ */
+static void
+xlog_unmount_write(
+ struct xlog *log)
+{
+ struct xfs_mount *mp = log->l_mp;
struct xlog_in_core *iclog;
struct xlog_ticket *tic = NULL;
xfs_lsn_t lsn;
@@ -855,23 +857,7 @@
if (error)
goto out_err;
- /*
- * If we think the summary counters are bad, clear the unmount header
- * flag in the unmount record so that the summary counters will be
- * recalculated during log recovery at next mount. Refer to
- * xlog_check_unmount_rec for more details.
- */
- if (XFS_TEST_ERROR((mp->m_flags & XFS_MOUNT_BAD_SUMMARY), mp,
- XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
- xfs_alert(mp, "%s: will fix summary counters at next mount",
- __func__);
- flags &= ~XLOG_UNMOUNT_TRANS;
- }
-
- /* remove inited flag, and account for space used */
- tic->t_flags = 0;
- tic->t_curr_res -= sizeof(magic);
- error = xlog_write(log, &vec, tic, &lsn, NULL, flags);
+ error = xlog_write_unmount_record(log, tic, &lsn, flags);
/*
* At this point, we're umounting anyway, so there's no point in
* transitioning log state to IOERROR. Just continue...
@@ -883,29 +869,30 @@
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
atomic_inc(&iclog->ic_refcnt);
- xlog_state_want_sync(log, iclog);
- spin_unlock(&log->l_icloglock);
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(log, iclog, 0);
+ else
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_state == XLOG_STATE_IOERROR);
error = xlog_state_release_iclog(log, iclog);
-
- spin_lock(&log->l_icloglock);
- switch (iclog->ic_state) {
- default:
- if (!XLOG_FORCED_SHUTDOWN(log)) {
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- break;
- }
- /* fall through */
- case XLOG_STATE_ACTIVE:
- case XLOG_STATE_DIRTY:
- spin_unlock(&log->l_icloglock);
- break;
- }
+ xlog_wait_on_iclog(iclog);
if (tic) {
trace_xfs_log_umount_write(log, tic);
- xlog_ungrant_log_space(log, tic);
- xfs_log_ticket_put(tic);
+ xfs_log_ticket_ungrant(log, tic);
}
+}
+
+static void
+xfs_log_unmount_verify_iclog(
+ struct xlog *log)
+{
+ struct xlog_in_core *iclog = log->l_iclog;
+
+ do {
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ ASSERT(iclog->ic_offset == 0);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
}
/*
@@ -915,79 +902,36 @@
* currently architecture converted and "Unmount" is a bit foo.
* As far as I know, there weren't any dependencies on the old behaviour.
*/
-
-static int
-xfs_log_unmount_write(xfs_mount_t *mp)
+static void
+xfs_log_unmount_write(
+ struct xfs_mount *mp)
{
- struct xlog *log = mp->m_log;
- xlog_in_core_t *iclog;
-#ifdef DEBUG
- xlog_in_core_t *first_iclog;
-#endif
- int error;
+ struct xlog *log = mp->m_log;
+
+ if (!xfs_log_writable(mp))
+ return;
+
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return;
/*
- * Don't write out unmount record on norecovery mounts or ro devices.
- * Or, if we are doing a forced umount (typically because of IO errors).
+ * If we think the summary counters are bad, avoid writing the unmount
+ * record to force log recovery at next mount, after which the summary
+ * counters will be recalculated. Refer to xlog_check_unmount_rec for
+ * more details.
*/
- if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
- xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
- return 0;
+ if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
+ XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+ xfs_alert(mp, "%s: will fix summary counters at next mount",
+ __func__);
+ return;
}
- error = xfs_log_force(mp, XFS_LOG_SYNC);
- ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
-
-#ifdef DEBUG
- first_iclog = iclog = log->l_iclog;
- do {
- if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
- ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
- ASSERT(iclog->ic_offset == 0);
- }
- iclog = iclog->ic_next;
- } while (iclog != first_iclog);
-#endif
- if (! (XLOG_FORCED_SHUTDOWN(log))) {
- xfs_log_write_unmount_record(mp);
- } else {
- /*
- * We're already in forced_shutdown mode, couldn't
- * even attempt to write out the unmount transaction.
- *
- * Go through the motions of sync'ing and releasing
- * the iclog, even though no I/O will actually happen,
- * we need to wait for other log I/Os that may already
- * be in progress. Do this as a separate section of
- * code so we'll know if we ever get stuck here that
- * we're in this odd situation of trying to unmount
- * a file system that went into forced_shutdown as
- * the result of an unmount..
- */
- spin_lock(&log->l_icloglock);
- iclog = log->l_iclog;
- atomic_inc(&iclog->ic_refcnt);
-
- xlog_state_want_sync(log, iclog);
- spin_unlock(&log->l_icloglock);
- error = xlog_state_release_iclog(log, iclog);
-
- spin_lock(&log->l_icloglock);
-
- if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE
- || iclog->ic_state == XLOG_STATE_DIRTY
- || iclog->ic_state == XLOG_STATE_IOERROR) ) {
-
- xlog_wait(&iclog->ic_force_wait,
- &log->l_icloglock);
- } else {
- spin_unlock(&log->l_icloglock);
- }
- }
-
- return error;
-} /* xfs_log_unmount_write */
+ xfs_log_unmount_verify_iclog(log);
+ xlog_unmount_write(log);
+}
/*
* Empty the log for unmount/freeze.
@@ -1243,53 +1187,40 @@
}
-/*
- * Log function which is called when an io completes.
- *
- * The log manager needs its own routine, in order to control what
- * happens with the buffer after the write completes.
- */
static void
-xlog_iodone(xfs_buf_t *bp)
+xlog_ioend_work(
+ struct work_struct *work)
{
- struct xlog_in_core *iclog = bp->b_log_item;
- struct xlog *l = iclog->ic_log;
- int aborted = 0;
+ struct xlog_in_core *iclog =
+ container_of(work, struct xlog_in_core, ic_end_io_work);
+ struct xlog *log = iclog->ic_log;
+ int error;
+
+ error = blk_status_to_errno(iclog->ic_bio.bi_status);
+#ifdef DEBUG
+ /* treat writes with injected CRC errors as failed */
+ if (iclog->ic_fail_crc)
+ error = -EIO;
+#endif
/*
- * Race to shutdown the filesystem if we see an error or the iclog is in
- * IOABORT state. The IOABORT state is only set in DEBUG mode to inject
- * CRC errors into log recovery.
+ * Race to shutdown the filesystem if we see an error.
*/
- if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR) ||
- iclog->ic_state & XLOG_STATE_IOABORT) {
- if (iclog->ic_state & XLOG_STATE_IOABORT)
- iclog->ic_state &= ~XLOG_STATE_IOABORT;
-
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_buf_stale(bp);
- xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
- /*
- * This flag will be propagated to the trans-committed
- * callback routines to let them know that the log-commit
- * didn't succeed.
- */
- aborted = XFS_LI_ABORTED;
- } else if (iclog->ic_state & XLOG_STATE_IOERROR) {
- aborted = XFS_LI_ABORTED;
+ if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
+ xfs_alert(log->l_mp, "log I/O error %d", error);
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
- /* log I/O is always issued ASYNC */
- ASSERT(bp->b_flags & XBF_ASYNC);
- xlog_state_done_syncing(iclog, aborted);
+ xlog_state_done_syncing(iclog);
+ bio_uninit(&iclog->ic_bio);
/*
- * drop the buffer lock now that we are done. Nothing references
- * the buffer after this, so an unmount waiting on this lock can now
- * tear it down safely. As such, it is unsafe to reference the buffer
- * (bp) after the unlock as we could race with it being freed.
+ * Drop the lock to signal that we are done. Nothing references the
+ * iclog after this, so an unmount waiting on this lock can now tear it
+ * down safely. As such, it is unsafe to reference the iclog after the
+ * unlock as we could race with it being freed.
*/
- xfs_buf_unlock(bp);
+ up(&iclog->ic_sema);
}
/*
@@ -1300,65 +1231,26 @@
* If the filesystem blocksize is too large, we may need to choose a
* larger size since the directory code currently logs entire blocks.
*/
-
STATIC void
xlog_get_iclog_buffer_size(
struct xfs_mount *mp,
struct xlog *log)
{
- int size;
- int xhdrs;
-
if (mp->m_logbufs <= 0)
- log->l_iclog_bufs = XLOG_MAX_ICLOGS;
- else
- log->l_iclog_bufs = mp->m_logbufs;
+ mp->m_logbufs = XLOG_MAX_ICLOGS;
+ if (mp->m_logbsize <= 0)
+ mp->m_logbsize = XLOG_BIG_RECORD_BSIZE;
+
+ log->l_iclog_bufs = mp->m_logbufs;
+ log->l_iclog_size = mp->m_logbsize;
/*
- * Buffer size passed in from mount system call.
+ * # headers = size / 32k - one header holds cycles from 32k of data.
*/
- if (mp->m_logbsize > 0) {
- size = log->l_iclog_size = mp->m_logbsize;
- log->l_iclog_size_log = 0;
- while (size != 1) {
- log->l_iclog_size_log++;
- size >>= 1;
- }
-
- if (xfs_sb_version_haslogv2(&mp->m_sb)) {
- /* # headers = size / 32k
- * one header holds cycles from 32k of data
- */
-
- xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
- if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
- xhdrs++;
- log->l_iclog_hsize = xhdrs << BBSHIFT;
- log->l_iclog_heads = xhdrs;
- } else {
- ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
- log->l_iclog_hsize = BBSIZE;
- log->l_iclog_heads = 1;
- }
- goto done;
- }
-
- /* All machines use 32kB buffers by default. */
- log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
- log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
-
- /* the default log size is 16k or 32k which is one header sector */
- log->l_iclog_hsize = BBSIZE;
- log->l_iclog_heads = 1;
-
-done:
- /* are we being asked to make the sizes selected above visible? */
- if (mp->m_logbufs == 0)
- mp->m_logbufs = log->l_iclog_bufs;
- if (mp->m_logbsize == 0)
- mp->m_logbsize = log->l_iclog_size;
-} /* xlog_get_iclog_buffer_size */
-
+ log->l_iclog_heads =
+ DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE);
+ log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT;
+}
void
xfs_log_work_queue(
@@ -1421,7 +1313,6 @@
xlog_rec_header_t *head;
xlog_in_core_t **iclogp;
xlog_in_core_t *iclog, *prev_iclog=NULL;
- xfs_buf_t *bp;
int i;
int error = -ENOMEM;
uint log2_size = 0;
@@ -1479,30 +1370,6 @@
xlog_get_iclog_buffer_size(mp, log);
- /*
- * Use a NULL block for the extra log buffer used during splits so that
- * it will trigger errors if we ever try to do IO on it without first
- * having set it up properly.
- */
- error = -ENOMEM;
- bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
- BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
- if (!bp)
- goto out_free_log;
-
- /*
- * The iclogbuf buffer locks are held over IO but we are not going to do
- * IO yet. Hence unlock the buffer so that the log IO path can grab it
- * when appropriately.
- */
- ASSERT(xfs_buf_islocked(bp));
- xfs_buf_unlock(bp);
-
- /* use high priority wq for log I/O completion */
- bp->b_ioend_wq = mp->m_log_workqueue;
- bp->b_iodone = xlog_iodone;
- log->l_xbuf = bp;
-
spin_lock_init(&log->l_icloglock);
init_waitqueue_head(&log->l_flush_wait);
@@ -1515,29 +1382,23 @@
* xlog_in_core_t in xfs_log_priv.h for details.
*/
ASSERT(log->l_iclog_size >= 4096);
- for (i=0; i < log->l_iclog_bufs; i++) {
- *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
- if (!*iclogp)
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
+ size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
+ sizeof(struct bio_vec);
+
+ iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
+ if (!iclog)
goto out_free_iclog;
- iclog = *iclogp;
+ *iclogp = iclog;
iclog->ic_prev = prev_iclog;
prev_iclog = iclog;
- bp = xfs_buf_get_uncached(mp->m_logdev_targp,
- BTOBB(log->l_iclog_size),
- XBF_NO_IOACCT);
- if (!bp)
+ iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
+ KM_MAYFAIL | KM_ZERO);
+ if (!iclog->ic_data)
goto out_free_iclog;
-
- ASSERT(xfs_buf_islocked(bp));
- xfs_buf_unlock(bp);
-
- /* use high priority wq for log I/O completion */
- bp->b_ioend_wq = mp->m_log_workqueue;
- bp->b_iodone = xlog_iodone;
- iclog->ic_bp = bp;
- iclog->ic_data = bp->b_addr;
#ifdef DEBUG
log->l_iclog_bak[i] = &iclog->ic_header;
#endif
@@ -1551,58 +1412,62 @@
head->h_fmt = cpu_to_be32(XLOG_FMT);
memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
- iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize;
+ iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize;
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
spin_lock_init(&iclog->ic_callback_lock);
- iclog->ic_callback_tail = &(iclog->ic_callback);
+ INIT_LIST_HEAD(&iclog->ic_callbacks);
iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
+ INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work);
+ sema_init(&iclog->ic_sema, 1);
iclogp = &iclog->ic_next;
}
*iclogp = log->l_iclog; /* complete ring */
log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
+ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
+ WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
+ mp->m_super->s_id);
+ if (!log->l_ioend_workqueue)
+ goto out_free_iclog;
+
error = xlog_cil_init(log);
if (error)
- goto out_free_iclog;
+ goto out_destroy_workqueue;
return log;
+out_destroy_workqueue:
+ destroy_workqueue(log->l_ioend_workqueue);
out_free_iclog:
for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
prev_iclog = iclog->ic_next;
- if (iclog->ic_bp)
- xfs_buf_free(iclog->ic_bp);
+ kmem_free(iclog->ic_data);
kmem_free(iclog);
if (prev_iclog == log->l_iclog)
break;
}
- spinlock_destroy(&log->l_icloglock);
- xfs_buf_free(log->l_xbuf);
out_free_log:
kmem_free(log);
out:
return ERR_PTR(error);
} /* xlog_alloc_log */
-
/*
* Write out the commit record of a transaction associated with the given
- * ticket. Return the lsn of the commit record.
+ * ticket to close off a running log write. Return the lsn of the commit record.
*/
-STATIC int
+int
xlog_commit_record(
struct xlog *log,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp)
+ xfs_lsn_t *lsn)
{
- struct xfs_mount *mp = log->l_mp;
- int error;
struct xfs_log_iovec reg = {
.i_addr = NULL,
.i_len = 0,
@@ -1612,24 +1477,27 @@
.lv_niovecs = 1,
.lv_iovecp = ®,
};
+ int error;
- ASSERT_ALWAYS(iclog);
- error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
- XLOG_COMMIT_TRANS);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return -EIO;
+
+ error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS,
+ false);
if (error)
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
return error;
}
/*
- * Push on the buffer cache code if we ever use more than 75% of the on-disk
- * log space. This code pushes on the lsn which would supposedly free up
- * the 25% which we want to leave free. We may need to adopt a policy which
- * pushes on an lsn which is further along in the log once we reach the high
- * water mark. In this manner, we would be creating a low water mark.
+ * Compute the LSN that we'd need to push the log tail towards in order to have
+ * (a) enough on-disk log space to log the number of bytes specified, (b) at
+ * least 25% of the log space free, and (c) at least 256 blocks free. If the
+ * log free space already meets all three thresholds, this function returns
+ * NULLCOMMITLSN.
*/
-STATIC void
-xlog_grant_push_ail(
+xfs_lsn_t
+xlog_grant_push_threshold(
struct xlog *log,
int need_bytes)
{
@@ -1655,7 +1523,7 @@
free_threshold = max(free_threshold, (log->l_logBBsize >> 2));
free_threshold = max(free_threshold, 256);
if (free_blocks >= free_threshold)
- return;
+ return NULLCOMMITLSN;
xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
&threshold_block);
@@ -1675,13 +1543,33 @@
if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
threshold_lsn = last_sync_lsn;
+ return threshold_lsn;
+}
+
+/*
+ * Push the tail of the log if we need to do so to maintain the free log space
+ * thresholds set out by xlog_grant_push_threshold. We may need to adopt a
+ * policy which pushes on an lsn which is further along in the log once we
+ * reach the high water mark. In this manner, we would be creating a low water
+ * mark.
+ */
+STATIC void
+xlog_grant_push_ail(
+ struct xlog *log,
+ int need_bytes)
+{
+ xfs_lsn_t threshold_lsn;
+
+ threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
+ if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log))
+ return;
+
/*
* Get the transaction layer to kick the dirty buffers out to
* disk asynchronously. No point in trying to do this if
* the filesystem is shutting down.
*/
- if (!XLOG_FORCED_SHUTDOWN(log))
- xfs_ail_push(log->l_ailp, threshold_lsn);
+ xfs_ail_push(log->l_ailp, threshold_lsn);
}
/*
@@ -1751,9 +1639,7 @@
int i;
int xheads;
- xheads = size / XLOG_HEADER_CYCLE_SIZE;
- if (size % XLOG_HEADER_CYCLE_SIZE)
- xheads++;
+ xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE);
for (i = 1; i < xheads; i++) {
crc = crc32c(crc, &xhdr[i].hic_xheader,
@@ -1767,42 +1653,167 @@
return xfs_end_cksum(crc);
}
-/*
- * The bdstrat callback function for log bufs. This gives us a central
- * place to trap bufs in case we get hit by a log I/O error and need to
- * shutdown. Actually, in practice, even when we didn't get a log error,
- * we transition the iclogs to IOERROR state *after* flushing all existing
- * iclogs to disk. This is because we don't want anymore new transactions to be
- * started or completed afterwards.
- *
- * We lock the iclogbufs here so that we can serialise against IO completion
- * during unmount. We might be processing a shutdown triggered during unmount,
- * and that can occur asynchronously to the unmount thread, and hence we need to
- * ensure that completes before tearing down the iclogbufs. Hence we need to
- * hold the buffer lock across the log IO to acheive that.
- */
-STATIC int
-xlog_bdstrat(
- struct xfs_buf *bp)
+static void
+xlog_bio_end_io(
+ struct bio *bio)
{
- struct xlog_in_core *iclog = bp->b_log_item;
+ struct xlog_in_core *iclog = bio->bi_private;
- xfs_buf_lock(bp);
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
- xfs_buf_ioerror(bp, -EIO);
- xfs_buf_stale(bp);
- xfs_buf_ioend(bp);
+ queue_work(iclog->ic_log->l_ioend_workqueue,
+ &iclog->ic_end_io_work);
+}
+
+static int
+xlog_map_iclog_data(
+ struct bio *bio,
+ void *data,
+ size_t count)
+{
+ do {
+ struct page *page = kmem_to_page(data);
+ unsigned int off = offset_in_page(data);
+ size_t len = min_t(size_t, count, PAGE_SIZE - off);
+
+ if (bio_add_page(bio, page, len, off) != len)
+ return -EIO;
+
+ data += len;
+ count -= len;
+ } while (count);
+
+ return 0;
+}
+
+STATIC void
+xlog_write_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ uint64_t bno,
+ unsigned int count,
+ bool need_flush)
+{
+ ASSERT(bno < log->l_logBBsize);
+
+ /*
+ * We lock the iclogbufs here so that we can serialise against I/O
+ * completion during unmount. We might be processing a shutdown
+ * triggered during unmount, and that can occur asynchronously to the
+ * unmount thread, and hence we need to ensure that completes before
+ * tearing down the iclogbufs. Hence we need to hold the buffer lock
+ * across the log IO to archieve that.
+ */
+ down(&iclog->ic_sema);
+ if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) {
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
- * doing it here. Similarly, IO completion will unlock the
- * buffer, so we don't do it here.
+ * doing it here. We kick of the state machine and unlock
+ * the buffer manually, the code needs to be kept in sync
+ * with the I/O completion path.
*/
- return 0;
+ xlog_state_done_syncing(iclog);
+ up(&iclog->ic_sema);
+ return;
}
- xfs_buf_submit(bp);
- return 0;
+ bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
+ bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
+ iclog->ic_bio.bi_end_io = xlog_bio_end_io;
+ iclog->ic_bio.bi_private = iclog;
+
+ /*
+ * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more
+ * IOs coming immediately after this one. This prevents the block layer
+ * writeback throttle from throttling log writes behind background
+ * metadata writeback and causing priority inversions.
+ */
+ iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
+ REQ_IDLE | REQ_FUA;
+ if (need_flush)
+ iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+
+ if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ return;
+ }
+ if (is_vmalloc_addr(iclog->ic_data))
+ flush_kernel_vmap_range(iclog->ic_data, count);
+
+ /*
+ * If this log buffer would straddle the end of the log we will have
+ * to split it up into two bios, so that we can continue at the start.
+ */
+ if (bno + BTOBB(count) > log->l_logBBsize) {
+ struct bio *split;
+
+ split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno,
+ GFP_NOIO, &fs_bio_set);
+ bio_chain(split, &iclog->ic_bio);
+ submit_bio(split);
+
+ /* restart at logical offset zero for the remainder */
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
+ }
+
+ submit_bio(&iclog->ic_bio);
+}
+
+/*
+ * We need to bump cycle number for the part of the iclog that is
+ * written to the start of the log. Watch out for the header magic
+ * number case, though.
+ */
+static void
+xlog_split_iclog(
+ struct xlog *log,
+ void *data,
+ uint64_t bno,
+ unsigned int count)
+{
+ unsigned int split_offset = BBTOB(log->l_logBBsize - bno);
+ unsigned int i;
+
+ for (i = split_offset; i < count; i += BBSIZE) {
+ uint32_t cycle = get_unaligned_be32(data + i);
+
+ if (++cycle == XLOG_HEADER_MAGIC_NUM)
+ cycle++;
+ put_unaligned_be32(cycle, data + i);
+ }
+}
+
+static int
+xlog_calc_iclog_size(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ uint32_t *roundoff)
+{
+ uint32_t count_init, count;
+ bool use_lsunit;
+
+ use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
+ log->l_mp->m_sb.sb_logsunit > 1;
+
+ /* Add for LR header */
+ count_init = log->l_iclog_hsize + iclog->ic_offset;
+
+ /* Round out the log write size */
+ if (use_lsunit) {
+ /* we have a v2 stripe unit to use */
+ count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
+ } else {
+ count = BBTOB(BTOBB(count_init));
+ }
+
+ ASSERT(count >= count_init);
+ *roundoff = count - count_init;
+
+ if (use_lsunit)
+ ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
+ else
+ ASSERT(*roundoff < BBTOB(1));
+ return count;
}
/*
@@ -1825,46 +1836,23 @@
* log will require grabbing the lock though.
*
* The entire log manager uses a logical block numbering scheme. Only
- * log_sync (and then only bwrite()) know about the fact that the log may
- * not start with block zero on a given device. The log block start offset
- * is added immediately before calling bwrite().
+ * xlog_write_iclog knows about the fact that the log may not start with
+ * block zero on a given device.
*/
-
-STATIC int
+STATIC void
xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog)
{
- xfs_buf_t *bp;
- int i;
- uint count; /* byte count of bwrite */
- uint count_init; /* initial count before roundup */
- int roundoff; /* roundoff to BB or stripe */
- int split = 0; /* split write into two regions */
- int error;
- int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
- int size;
+ unsigned int count; /* byte count of bwrite */
+ unsigned int roundoff; /* roundoff to BB or stripe */
+ uint64_t bno;
+ unsigned int size;
+ bool need_flush = true, split = false;
- XFS_STATS_INC(log->l_mp, xs_log_writes);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
- /* Add for LR header */
- count_init = log->l_iclog_hsize + iclog->ic_offset;
-
- /* Round out the log write size */
- if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
- /* we have a v2 stripe unit to use */
- count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
- } else {
- count = BBTOB(BTOBB(count_init));
- }
- roundoff = count - count_init;
- ASSERT(roundoff >= 0);
- ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 &&
- roundoff < log->l_mp->m_sb.sb_logsunit)
- ||
- (log->l_mp->m_sb.sb_logsunit <= 1 &&
- roundoff < BBTOB(1)));
+ count = xlog_calc_iclog_size(log, iclog, &roundoff);
/* move grant heads by roundoff in sync */
xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
@@ -1875,41 +1863,19 @@
/* real byte length */
size = iclog->ic_offset;
- if (v2)
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb))
size += roundoff;
iclog->ic_header.h_len = cpu_to_be32(size);
- bp = iclog->ic_bp;
- XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
-
+ XFS_STATS_INC(log->l_mp, xs_log_writes);
XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
+ bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
+
/* Do we need to split this write into 2 parts? */
- if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
- char *dptr;
-
- split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
- count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
- iclog->ic_bwritecnt = 2;
-
- /*
- * Bump the cycle numbers at the start of each block in the
- * part of the iclog that ends up in the buffer that gets
- * written to the start of the log.
- *
- * Watch out for the header magic number case, though.
- */
- dptr = (char *)&iclog->ic_header + count;
- for (i = 0; i < split; i += BBSIZE) {
- uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
- if (++cycle == XLOG_HEADER_MAGIC_NUM)
- cycle++;
- *(__be32 *)dptr = cpu_to_be32(cycle);
-
- dptr += BBSIZE;
- }
- } else {
- iclog->ic_bwritecnt = 1;
+ if (bno + BTOBB(count) > log->l_logBBsize) {
+ xlog_split_iclog(log, &iclog->ic_header, bno, count);
+ split = true;
}
/* calculcate the checksum */
@@ -1922,18 +1888,15 @@
* write on I/O completion and shutdown the fs. The subsequent mount
* detects the bad CRC and attempts to recover.
*/
+#ifdef DEBUG
if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
- iclog->ic_state |= XLOG_STATE_IOABORT;
+ iclog->ic_fail_crc = true;
xfs_warn(log->l_mp,
"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
be64_to_cpu(iclog->ic_header.h_lsn));
}
-
- bp->b_io_length = BTOBB(count);
- bp->b_log_item = iclog;
- bp->b_flags &= ~XBF_FLUSH;
- bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
+#endif
/*
* Flush the data device before flushing the log to make sure all meta
@@ -1943,50 +1906,14 @@
* synchronously here; for an internal log we can simply use the block
* layer state machine for preflushes.
*/
- if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
+ if (log->l_targ != log->l_mp->m_ddev_targp || split) {
xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
- else
- bp->b_flags |= XBF_FLUSH;
-
- ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
- ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
-
- xlog_verify_iclog(log, iclog, count, true);
-
- /* account for log which doesn't start at block #0 */
- XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
-
- /*
- * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
- * is shutting down.
- */
- error = xlog_bdstrat(bp);
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_sync");
- return error;
+ need_flush = false;
}
- if (split) {
- bp = iclog->ic_log->l_xbuf;
- XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */
- xfs_buf_associate_memory(bp,
- (char *)&iclog->ic_header + count, split);
- bp->b_log_item = iclog;
- bp->b_flags &= ~XBF_FLUSH;
- bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
- ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
- ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
-
- /* account for internal log which doesn't start at block #0 */
- XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
- error = xlog_bdstrat(bp);
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
- return error;
- }
- }
- return 0;
-} /* xlog_sync */
+ xlog_verify_iclog(log, iclog, count);
+ xlog_write_iclog(log, iclog, bno, count, need_flush);
+}
/*
* Deallocate a log structure
@@ -2006,38 +1933,27 @@
*/
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
- xfs_buf_lock(iclog->ic_bp);
- xfs_buf_unlock(iclog->ic_bp);
+ down(&iclog->ic_sema);
+ up(&iclog->ic_sema);
iclog = iclog->ic_next;
}
- /*
- * Always need to ensure that the extra buffer does not point to memory
- * owned by another log buffer before we free it. Also, cycle the lock
- * first to ensure we've completed IO on it.
- */
- xfs_buf_lock(log->l_xbuf);
- xfs_buf_unlock(log->l_xbuf);
- xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
- xfs_buf_free(log->l_xbuf);
-
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
- xfs_buf_free(iclog->ic_bp);
next_iclog = iclog->ic_next;
+ kmem_free(iclog->ic_data);
kmem_free(iclog);
iclog = next_iclog;
}
- spinlock_destroy(&log->l_icloglock);
log->l_mp->m_log = NULL;
+ destroy_workqueue(log->l_ioend_workqueue);
kmem_free(log);
-} /* xlog_dealloc_log */
+}
/*
* Update counters atomically now that memcpy is done.
*/
-/* ARGSUSED */
static inline void
xlog_state_finish_copy(
struct xlog *log,
@@ -2045,16 +1961,11 @@
int record_cnt,
int copy_bytes)
{
- spin_lock(&log->l_icloglock);
+ lockdep_assert_held(&log->l_icloglock);
be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt);
iclog->ic_offset += copy_bytes;
-
- spin_unlock(&log->l_icloglock);
-} /* xlog_state_finish_copy */
-
-
-
+}
/*
* print out info relating to regions written which consume
@@ -2070,7 +1981,7 @@
/* match with XLOG_REG_TYPE_* in xfs_log.h */
#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str
- static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = {
+ static char *res_type_str[] = {
REG_TYPE_STR(BFORMAT, "bformat"),
REG_TYPE_STR(BCHUNK, "bchunk"),
REG_TYPE_STR(EFI_FORMAT, "efi_format"),
@@ -2090,8 +2001,15 @@
REG_TYPE_STR(UNMOUNT, "unmount"),
REG_TYPE_STR(COMMIT, "commit"),
REG_TYPE_STR(TRANSHDR, "trans header"),
- REG_TYPE_STR(ICREATE, "inode create")
+ REG_TYPE_STR(ICREATE, "inode create"),
+ REG_TYPE_STR(RUI_FORMAT, "rui_format"),
+ REG_TYPE_STR(RUD_FORMAT, "rud_format"),
+ REG_TYPE_STR(CUI_FORMAT, "cui_format"),
+ REG_TYPE_STR(CUD_FORMAT, "cud_format"),
+ REG_TYPE_STR(BUI_FORMAT, "bui_format"),
+ REG_TYPE_STR(BUD_FORMAT, "bud_format"),
};
+ BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
#undef REG_TYPE_STR
xfs_warn(mp, "ticket reservation summary:");
@@ -2168,22 +2086,20 @@
}
/*
- * Calculate the potential space needed by the log vector. Each region gets
- * its own xlog_op_header_t and may need to be double word aligned.
+ * Calculate the potential space needed by the log vector. We may need a start
+ * record, and each region gets its own struct xlog_op_header and may need to be
+ * double word aligned.
*/
static int
xlog_write_calc_vec_length(
struct xlog_ticket *ticket,
- struct xfs_log_vec *log_vector)
+ struct xfs_log_vec *log_vector,
+ bool need_start_rec)
{
struct xfs_log_vec *lv;
- int headers = 0;
+ int headers = need_start_rec ? 1 : 0;
int len = 0;
int i;
-
- /* acct for start rec of xact */
- if (ticket->t_flags & XLOG_TIC_INITED)
- headers++;
for (lv = log_vector; lv; lv = lv->lv_next) {
/* we don't write ordered log vectors */
@@ -2206,27 +2122,16 @@
return len;
}
-/*
- * If first write for transaction, insert start record We can't be trying to
- * commit if we are inited. We can't have any "partial_copy" if we are inited.
- */
-static int
+static void
xlog_write_start_rec(
struct xlog_op_header *ophdr,
struct xlog_ticket *ticket)
{
- if (!(ticket->t_flags & XLOG_TIC_INITED))
- return 0;
-
ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
ophdr->oh_clientid = ticket->t_clientid;
ophdr->oh_len = 0;
ophdr->oh_flags = XLOG_START_TRANS;
ophdr->oh_res2 = 0;
-
- ticket->t_flags &= ~XLOG_TIC_INITED;
-
- return sizeof(struct xlog_op_header);
}
static xlog_op_header_t *
@@ -2324,15 +2229,18 @@
int log_offset,
struct xlog_in_core **commit_iclog)
{
+ int error;
+
if (*partial_copy) {
/*
* This iclog has already been marked WANT_SYNC by
* xlog_state_get_iclog_space.
*/
+ spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
*record_cnt = 0;
*data_cnt = 0;
- return xlog_state_release_iclog(log, iclog);
+ goto release_iclog;
}
*partial_copy = 0;
@@ -2340,21 +2248,29 @@
if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
/* no more space in this iclog - push it. */
+ spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
*record_cnt = 0;
*data_cnt = 0;
- spin_lock(&log->l_icloglock);
- xlog_state_want_sync(log, iclog);
- spin_unlock(&log->l_icloglock);
-
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(log, iclog, 0);
+ else
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_state == XLOG_STATE_IOERROR);
if (!commit_iclog)
- return xlog_state_release_iclog(log, iclog);
+ goto release_iclog;
+ spin_unlock(&log->l_icloglock);
ASSERT(flags & XLOG_COMMIT_TRANS);
*commit_iclog = iclog;
}
return 0;
+
+release_iclog:
+ error = xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ return error;
}
/*
@@ -2404,39 +2320,28 @@
struct xlog_ticket *ticket,
xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog,
- uint flags)
+ uint flags,
+ bool need_start_rec)
{
struct xlog_in_core *iclog = NULL;
- struct xfs_log_iovec *vecp;
- struct xfs_log_vec *lv;
+ struct xfs_log_vec *lv = log_vector;
+ struct xfs_log_iovec *vecp = lv->lv_iovecp;
+ int index = 0;
int len;
- int index;
int partial_copy = 0;
int partial_copy_len = 0;
int contwr = 0;
int record_cnt = 0;
int data_cnt = 0;
- int error;
-
- *start_lsn = 0;
-
- len = xlog_write_calc_vec_length(ticket, log_vector);
+ int error = 0;
/*
- * Region headers and bytes are already accounted for.
- * We only need to take into account start records and
- * split regions in this function.
+ * If this is a commit or unmount transaction, we don't need a start
+ * record to be written. We do, however, have to account for the
+ * commit or unmount header that gets written. Hence we always have
+ * to account for an extra xlog_op_header here.
*/
- if (ticket->t_flags & XLOG_TIC_INITED)
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
- /*
- * Commit record headers need to be accounted for. These
- * come in as separate writes so are easy to detect.
- */
- if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
@@ -2444,9 +2349,8 @@
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
- index = 0;
- lv = log_vector;
- vecp = lv->lv_iovecp;
+ len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec);
+ *start_lsn = 0;
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2470,7 +2374,6 @@
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
struct xfs_log_iovec *reg;
struct xlog_op_header *ophdr;
- int start_rec_copy;
int copy_len;
int copy_off;
bool ordered = false;
@@ -2486,11 +2389,15 @@
ASSERT(reg->i_len % sizeof(int32_t) == 0);
ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
- start_rec_copy = xlog_write_start_rec(ptr, ticket);
- if (start_rec_copy) {
- record_cnt++;
+ /*
+ * Before we start formatting log vectors, we need to
+ * write a start record. Only do this for the first
+ * iclog we write to.
+ */
+ if (need_start_rec) {
+ xlog_write_start_rec(ptr, ticket);
xlog_write_adv_cnt(&ptr, &len, &log_offset,
- start_rec_copy);
+ sizeof(struct xlog_op_header));
}
ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
@@ -2522,8 +2429,13 @@
xlog_write_adv_cnt(&ptr, &len, &log_offset,
copy_len);
}
- copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+ copy_len += sizeof(struct xlog_op_header);
record_cnt++;
+ if (need_start_rec) {
+ copy_len += sizeof(struct xlog_op_header);
+ record_cnt++;
+ need_start_rec = false;
+ }
data_cnt += contwr ? copy_len : 0;
error = xlog_write_copy_finish(log, iclog, flags,
@@ -2567,158 +2479,284 @@
ASSERT(len == 0);
+ spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- if (!commit_iclog)
- return xlog_state_release_iclog(log, iclog);
+ if (commit_iclog) {
+ ASSERT(flags & XLOG_COMMIT_TRANS);
+ *commit_iclog = iclog;
+ } else {
+ error = xlog_state_release_iclog(log, iclog);
+ }
+ spin_unlock(&log->l_icloglock);
- ASSERT(flags & XLOG_COMMIT_TRANS);
- *commit_iclog = iclog;
- return 0;
+ return error;
}
-
-/*****************************************************************************
- *
- * State Machine functions
- *
- *****************************************************************************
- */
-
-/* Clean iclogs starting from the head. This ordering must be
- * maintained, so an iclog doesn't become ACTIVE beyond one that
- * is SYNCING. This is also required to maintain the notion that we use
- * a ordered wait queue to hold off would be writers to the log when every
- * iclog is trying to sync to disk.
- *
- * State Change: DIRTY -> ACTIVE
- */
-STATIC void
-xlog_state_clean_log(
- struct xlog *log)
+static void
+xlog_state_activate_iclog(
+ struct xlog_in_core *iclog,
+ int *iclogs_changed)
{
- xlog_in_core_t *iclog;
- int changed = 0;
+ ASSERT(list_empty_careful(&iclog->ic_callbacks));
- iclog = log->l_iclog;
- do {
- if (iclog->ic_state == XLOG_STATE_DIRTY) {
- iclog->ic_state = XLOG_STATE_ACTIVE;
- iclog->ic_offset = 0;
- ASSERT(iclog->ic_callback == NULL);
- /*
- * If the number of ops in this iclog indicate it just
- * contains the dummy transaction, we can
- * change state into IDLE (the second time around).
- * Otherwise we should change the state into
- * NEED a dummy.
- * We don't need to cover the dummy.
- */
- if (!changed &&
- (be32_to_cpu(iclog->ic_header.h_num_logops) ==
- XLOG_COVER_OPS)) {
- changed = 1;
- } else {
- /*
- * We have two dirty iclogs so start over
- * This could also be num of ops indicates
- * this is not the dummy going out.
- */
- changed = 2;
- }
- iclog->ic_header.h_num_logops = 0;
- memset(iclog->ic_header.h_cycle_data, 0,
- sizeof(iclog->ic_header.h_cycle_data));
- iclog->ic_header.h_lsn = 0;
- } else if (iclog->ic_state == XLOG_STATE_ACTIVE)
- /* do nothing */;
- else
- break; /* stop cleaning */
- iclog = iclog->ic_next;
- } while (iclog != log->l_iclog);
-
- /* log is locked when we are called */
/*
- * Change state for the dummy log recording.
- * We usually go to NEED. But we go to NEED2 if the changed indicates
- * we are done writing the dummy record.
- * If we are done with the second dummy recored (DONE2), then
- * we go to IDLE.
+ * If the number of ops in this iclog indicate it just contains the
+ * dummy transaction, we can change state into IDLE (the second time
+ * around). Otherwise we should change the state into NEED a dummy.
+ * We don't need to cover the dummy.
*/
- if (changed) {
- switch (log->l_covered_state) {
- case XLOG_STATE_COVER_IDLE:
- case XLOG_STATE_COVER_NEED:
- case XLOG_STATE_COVER_NEED2:
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
-
- case XLOG_STATE_COVER_DONE:
- if (changed == 1)
- log->l_covered_state = XLOG_STATE_COVER_NEED2;
- else
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
-
- case XLOG_STATE_COVER_DONE2:
- if (changed == 1)
- log->l_covered_state = XLOG_STATE_COVER_IDLE;
- else
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
-
- default:
- ASSERT(0);
- }
+ if (*iclogs_changed == 0 &&
+ iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) {
+ *iclogs_changed = 1;
+ } else {
+ /*
+ * We have two dirty iclogs so start over. This could also be
+ * num of ops indicating this is not the dummy going out.
+ */
+ *iclogs_changed = 2;
}
-} /* xlog_state_clean_log */
+
+ iclog->ic_state = XLOG_STATE_ACTIVE;
+ iclog->ic_offset = 0;
+ iclog->ic_header.h_num_logops = 0;
+ memset(iclog->ic_header.h_cycle_data, 0,
+ sizeof(iclog->ic_header.h_cycle_data));
+ iclog->ic_header.h_lsn = 0;
+}
+
+/*
+ * Loop through all iclogs and mark all iclogs currently marked DIRTY as
+ * ACTIVE after iclog I/O has completed.
+ */
+static void
+xlog_state_activate_iclogs(
+ struct xlog *log,
+ int *iclogs_changed)
+{
+ struct xlog_in_core *iclog = log->l_iclog;
+
+ do {
+ if (iclog->ic_state == XLOG_STATE_DIRTY)
+ xlog_state_activate_iclog(iclog, iclogs_changed);
+ /*
+ * The ordering of marking iclogs ACTIVE must be maintained, so
+ * an iclog doesn't become ACTIVE beyond one that is SYNCING.
+ */
+ else if (iclog->ic_state != XLOG_STATE_ACTIVE)
+ break;
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+}
+
+static int
+xlog_covered_state(
+ int prev_state,
+ int iclogs_changed)
+{
+ /*
+ * We usually go to NEED. But we go to NEED2 if the changed indicates we
+ * are done writing the dummy record. If we are done with the second
+ * dummy recored (DONE2), then we go to IDLE.
+ */
+ switch (prev_state) {
+ case XLOG_STATE_COVER_IDLE:
+ case XLOG_STATE_COVER_NEED:
+ case XLOG_STATE_COVER_NEED2:
+ break;
+ case XLOG_STATE_COVER_DONE:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_NEED2;
+ break;
+ case XLOG_STATE_COVER_DONE2:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_IDLE;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ return XLOG_STATE_COVER_NEED;
+}
+
+STATIC void
+xlog_state_clean_iclog(
+ struct xlog *log,
+ struct xlog_in_core *dirty_iclog)
+{
+ int iclogs_changed = 0;
+
+ dirty_iclog->ic_state = XLOG_STATE_DIRTY;
+
+ xlog_state_activate_iclogs(log, &iclogs_changed);
+ wake_up_all(&dirty_iclog->ic_force_wait);
+
+ if (iclogs_changed) {
+ log->l_covered_state = xlog_covered_state(log->l_covered_state,
+ iclogs_changed);
+ }
+}
STATIC xfs_lsn_t
xlog_get_lowest_lsn(
- struct xlog *log)
+ struct xlog *log)
{
- xlog_in_core_t *lsn_log;
- xfs_lsn_t lowest_lsn, lsn;
+ struct xlog_in_core *iclog = log->l_iclog;
+ xfs_lsn_t lowest_lsn = 0, lsn;
- lsn_log = log->l_iclog;
- lowest_lsn = 0;
do {
- if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
- lsn = be64_to_cpu(lsn_log->ic_header.h_lsn);
- if ((lsn && !lowest_lsn) ||
- (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
+ if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+ iclog->ic_state == XLOG_STATE_DIRTY)
+ continue;
+
+ lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0)
lowest_lsn = lsn;
- }
- }
- lsn_log = lsn_log->ic_next;
- } while (lsn_log != log->l_iclog);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+
return lowest_lsn;
}
+/*
+ * Completion of a iclog IO does not imply that a transaction has completed, as
+ * transactions can be large enough to span many iclogs. We cannot change the
+ * tail of the log half way through a transaction as this may be the only
+ * transaction in the log and moving the tail to point to the middle of it
+ * will prevent recovery from finding the start of the transaction. Hence we
+ * should only update the last_sync_lsn if this iclog contains transaction
+ * completion callbacks on it.
+ *
+ * We have to do this before we drop the icloglock to ensure we are the only one
+ * that can update it.
+ *
+ * If we are moving the last_sync_lsn forwards, we also need to ensure we kick
+ * the reservation grant head pushing. This is due to the fact that the push
+ * target is bound by the current last_sync_lsn value. Hence if we have a large
+ * amount of log space bound up in this committing transaction then the
+ * last_sync_lsn value may be the limiting factor preventing tail pushing from
+ * freeing space in the log. Hence once we've updated the last_sync_lsn we
+ * should push the AIL to ensure the push target (and hence the grant head) is
+ * no longer bound by the old log head location and can move forwards and make
+ * progress again.
+ */
+static void
+xlog_state_set_callback(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ xfs_lsn_t header_lsn)
+{
+ iclog->ic_state = XLOG_STATE_CALLBACK;
+
+ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
+ header_lsn) <= 0);
+
+ if (list_empty_careful(&iclog->ic_callbacks))
+ return;
+
+ atomic64_set(&log->l_last_sync_lsn, header_lsn);
+ xlog_grant_push_ail(log, 0);
+}
+
+/*
+ * Return true if we need to stop processing, false to continue to the next
+ * iclog. The caller will need to run callbacks if the iclog is returned in the
+ * XLOG_STATE_CALLBACK state.
+ */
+static bool
+xlog_state_iodone_process_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ bool *ioerror)
+{
+ xfs_lsn_t lowest_lsn;
+ xfs_lsn_t header_lsn;
+
+ switch (iclog->ic_state) {
+ case XLOG_STATE_ACTIVE:
+ case XLOG_STATE_DIRTY:
+ /*
+ * Skip all iclogs in the ACTIVE & DIRTY states:
+ */
+ return false;
+ case XLOG_STATE_IOERROR:
+ /*
+ * Between marking a filesystem SHUTDOWN and stopping the log,
+ * we do flush all iclogs to disk (if there wasn't a log I/O
+ * error). So, we do want things to go smoothly in case of just
+ * a SHUTDOWN w/o a LOG_IO_ERROR.
+ */
+ *ioerror = true;
+ return false;
+ case XLOG_STATE_DONE_SYNC:
+ /*
+ * Now that we have an iclog that is in the DONE_SYNC state, do
+ * one more check here to see if we have chased our tail around.
+ * If this is not the lowest lsn iclog, then we will leave it
+ * for another completion to process.
+ */
+ header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ lowest_lsn = xlog_get_lowest_lsn(log);
+ if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
+ return false;
+ xlog_state_set_callback(log, iclog, header_lsn);
+ return false;
+ default:
+ /*
+ * Can only perform callbacks in order. Since this iclog is not
+ * in the DONE_SYNC state, we skip the rest and just try to
+ * clean up.
+ */
+ return true;
+ }
+}
+
+/*
+ * Keep processing entries in the iclog callback list until we come around and
+ * it is empty. We need to atomically see that the list is empty and change the
+ * state to DIRTY so that we don't miss any more callbacks being added.
+ *
+ * This function is called with the icloglock held and returns with it held. We
+ * drop it while running callbacks, however, as holding it over thousands of
+ * callbacks is unnecessary and causes excessive contention if we do.
+ */
+static void
+xlog_state_do_iclog_callbacks(
+ struct xlog *log,
+ struct xlog_in_core *iclog)
+ __releases(&log->l_icloglock)
+ __acquires(&log->l_icloglock)
+{
+ spin_unlock(&log->l_icloglock);
+ spin_lock(&iclog->ic_callback_lock);
+ while (!list_empty(&iclog->ic_callbacks)) {
+ LIST_HEAD(tmp);
+
+ list_splice_init(&iclog->ic_callbacks, &tmp);
+
+ spin_unlock(&iclog->ic_callback_lock);
+ xlog_cil_process_committed(&tmp);
+ spin_lock(&iclog->ic_callback_lock);
+ }
+
+ /*
+ * Pick up the icloglock while still holding the callback lock so we
+ * serialise against anyone trying to add more callbacks to this iclog
+ * now we've finished processing.
+ */
+ spin_lock(&log->l_icloglock);
+ spin_unlock(&iclog->ic_callback_lock);
+}
STATIC void
xlog_state_do_callback(
- struct xlog *log,
- int aborted,
- struct xlog_in_core *ciclog)
+ struct xlog *log)
{
- xlog_in_core_t *iclog;
- xlog_in_core_t *first_iclog; /* used to know when we've
- * processed all iclogs once */
- xfs_log_callback_t *cb, *cb_next;
- int flushcnt = 0;
- xfs_lsn_t lowest_lsn;
- int ioerrors; /* counter: iclogs with errors */
- int loopdidcallbacks; /* flag: inner loop did callbacks*/
- int funcdidcallbacks; /* flag: function did callbacks */
- int repeats; /* for issuing console warnings if
- * looping too many times */
+ struct xlog_in_core *iclog;
+ struct xlog_in_core *first_iclog;
+ bool cycled_icloglock;
+ bool ioerror;
+ int flushcnt = 0;
+ int repeats = 0;
spin_lock(&log->l_icloglock);
- first_iclog = iclog = log->l_iclog;
- ioerrors = 0;
- funcdidcallbacks = 0;
- repeats = 0;
-
do {
/*
* Scan all iclogs starting with the one pointed to by the
@@ -2730,140 +2768,31 @@
*/
first_iclog = log->l_iclog;
iclog = log->l_iclog;
- loopdidcallbacks = 0;
+ cycled_icloglock = false;
+ ioerror = false;
repeats++;
do {
+ if (xlog_state_iodone_process_iclog(log, iclog,
+ &ioerror))
+ break;
- /* skip all iclogs in the ACTIVE & DIRTY states */
- if (iclog->ic_state &
- (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
+ if (iclog->ic_state != XLOG_STATE_CALLBACK &&
+ iclog->ic_state != XLOG_STATE_IOERROR) {
iclog = iclog->ic_next;
continue;
}
/*
- * Between marking a filesystem SHUTDOWN and stopping
- * the log, we do flush all iclogs to disk (if there
- * wasn't a log I/O error). So, we do want things to
- * go smoothly in case of just a SHUTDOWN w/o a
- * LOG_IO_ERROR.
+ * Running callbacks will drop the icloglock which means
+ * we'll have to run at least one more complete loop.
*/
- if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
- /*
- * Can only perform callbacks in order. Since
- * this iclog is not in the DONE_SYNC/
- * DO_CALLBACK state, we skip the rest and
- * just try to clean up. If we set our iclog
- * to DO_CALLBACK, we will not process it when
- * we retry since a previous iclog is in the
- * CALLBACK and the state cannot change since
- * we are holding the l_icloglock.
- */
- if (!(iclog->ic_state &
- (XLOG_STATE_DONE_SYNC |
- XLOG_STATE_DO_CALLBACK))) {
- if (ciclog && (ciclog->ic_state ==
- XLOG_STATE_DONE_SYNC)) {
- ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
- }
- break;
- }
- /*
- * We now have an iclog that is in either the
- * DO_CALLBACK or DONE_SYNC states. The other
- * states (WANT_SYNC, SYNCING, or CALLBACK were
- * caught by the above if and are going to
- * clean (i.e. we aren't doing their callbacks)
- * see the above if.
- */
-
- /*
- * We will do one more check here to see if we
- * have chased our tail around.
- */
-
- lowest_lsn = xlog_get_lowest_lsn(log);
- if (lowest_lsn &&
- XFS_LSN_CMP(lowest_lsn,
- be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
- iclog = iclog->ic_next;
- continue; /* Leave this iclog for
- * another thread */
- }
-
- iclog->ic_state = XLOG_STATE_CALLBACK;
-
-
- /*
- * Completion of a iclog IO does not imply that
- * a transaction has completed, as transactions
- * can be large enough to span many iclogs. We
- * cannot change the tail of the log half way
- * through a transaction as this may be the only
- * transaction in the log and moving th etail to
- * point to the middle of it will prevent
- * recovery from finding the start of the
- * transaction. Hence we should only update the
- * last_sync_lsn if this iclog contains
- * transaction completion callbacks on it.
- *
- * We have to do this before we drop the
- * icloglock to ensure we are the only one that
- * can update it.
- */
- ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
- be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
- if (iclog->ic_callback)
- atomic64_set(&log->l_last_sync_lsn,
- be64_to_cpu(iclog->ic_header.h_lsn));
-
- } else
- ioerrors++;
-
- spin_unlock(&log->l_icloglock);
-
- /*
- * Keep processing entries in the callback list until
- * we come around and it is empty. We need to
- * atomically see that the list is empty and change the
- * state to DIRTY so that we don't miss any more
- * callbacks being added.
- */
- spin_lock(&iclog->ic_callback_lock);
- cb = iclog->ic_callback;
- while (cb) {
- iclog->ic_callback_tail = &(iclog->ic_callback);
- iclog->ic_callback = NULL;
- spin_unlock(&iclog->ic_callback_lock);
-
- /* perform callbacks in the order given */
- for (; cb; cb = cb_next) {
- cb_next = cb->cb_next;
- cb->cb_func(cb->cb_arg, aborted);
- }
- spin_lock(&iclog->ic_callback_lock);
- cb = iclog->ic_callback;
- }
-
- loopdidcallbacks++;
- funcdidcallbacks++;
-
- spin_lock(&log->l_icloglock);
- ASSERT(iclog->ic_callback == NULL);
- spin_unlock(&iclog->ic_callback_lock);
- if (!(iclog->ic_state & XLOG_STATE_IOERROR))
- iclog->ic_state = XLOG_STATE_DIRTY;
-
- /*
- * Transition from DIRTY to ACTIVE if applicable.
- * NOP if STATE_IOERROR.
- */
- xlog_state_clean_log(log);
-
- /* wake up threads waiting in xfs_log_force() */
- wake_up_all(&iclog->ic_force_wait);
-
+ cycled_icloglock = true;
+ xlog_state_do_iclog_callbacks(log, iclog);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ wake_up_all(&iclog->ic_force_wait);
+ else
+ xlog_state_clean_iclog(log, iclog);
iclog = iclog->ic_next;
} while (first_iclog != iclog);
@@ -2874,45 +2803,10 @@
"%s: possible infinite loop (%d iterations)",
__func__, flushcnt);
}
- } while (!ioerrors && loopdidcallbacks);
+ } while (!ioerror && cycled_icloglock);
-#ifdef DEBUG
- /*
- * Make one last gasp attempt to see if iclogs are being left in limbo.
- * If the above loop finds an iclog earlier than the current iclog and
- * in one of the syncing states, the current iclog is put into
- * DO_CALLBACK and the callbacks are deferred to the completion of the
- * earlier iclog. Walk the iclogs in order and make sure that no iclog
- * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
- * states.
- *
- * Note that SYNCING|IOABORT is a valid state so we cannot just check
- * for ic_state == SYNCING.
- */
- if (funcdidcallbacks) {
- first_iclog = iclog = log->l_iclog;
- do {
- ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
- /*
- * Terminate the loop if iclogs are found in states
- * which will cause other threads to clean up iclogs.
- *
- * SYNCING - i/o completion will go through logs
- * DONE_SYNC - interrupt thread should be waiting for
- * l_icloglock
- * IOERROR - give up hope all ye who enter here
- */
- if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state & XLOG_STATE_SYNCING ||
- iclog->ic_state == XLOG_STATE_DONE_SYNC ||
- iclog->ic_state == XLOG_STATE_IOERROR )
- break;
- iclog = iclog->ic_next;
- } while (first_iclog != iclog);
- }
-#endif
-
- if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
+ if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE ||
+ log->l_iclog->ic_state == XLOG_STATE_IOERROR)
wake_up_all(&log->l_flush_wait);
spin_unlock(&log->l_icloglock);
@@ -2934,30 +2828,20 @@
*/
STATIC void
xlog_state_done_syncing(
- xlog_in_core_t *iclog,
- int aborted)
+ struct xlog_in_core *iclog)
{
- struct xlog *log = iclog->ic_log;
+ struct xlog *log = iclog->ic_log;
spin_lock(&log->l_icloglock);
-
- ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
- iclog->ic_state == XLOG_STATE_IOERROR);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
- ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
-
/*
* If we got an error, either on the first buffer, or in the case of
- * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
- * and none should ever be attempted to be written to disk
- * again.
+ * split log writes, on the second, we shut down the file system and
+ * no iclogs should ever be attempted to be written to disk again.
*/
- if (iclog->ic_state != XLOG_STATE_IOERROR) {
- if (--iclog->ic_bwritecnt == 1) {
- spin_unlock(&log->l_icloglock);
- return;
- }
+ if (!XLOG_FORCED_SHUTDOWN(log)) {
+ ASSERT(iclog->ic_state == XLOG_STATE_SYNCING);
iclog->ic_state = XLOG_STATE_DONE_SYNC;
}
@@ -2968,9 +2852,8 @@
*/
wake_up_all(&iclog->ic_write_wait);
spin_unlock(&log->l_icloglock);
- xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
-} /* xlog_state_done_syncing */
-
+ xlog_state_do_callback(log);
+}
/*
* If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
@@ -3002,7 +2885,6 @@
int log_offset;
xlog_rec_header_t *head;
xlog_in_core_t *iclog;
- int error;
restart:
spin_lock(&log->l_icloglock);
@@ -3051,24 +2933,22 @@
* can fit into remaining data section.
*/
if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
+ int error = 0;
+
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
/*
- * If I'm the only one writing to this iclog, sync it to disk.
- * We need to do an atomic compare and decrement here to avoid
- * racing with concurrent atomic_dec_and_lock() calls in
+ * If we are the only one writing to this iclog, sync it to
+ * disk. We need to do an atomic compare and decrement here to
+ * avoid racing with concurrent atomic_dec_and_lock() calls in
* xlog_state_release_iclog() when there is more than one
* reference to the iclog.
*/
- if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
- /* we are the only one */
- spin_unlock(&log->l_icloglock);
+ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
error = xlog_state_release_iclog(log, iclog);
- if (error)
- return error;
- } else {
- spin_unlock(&log->l_icloglock);
- }
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
goto restart;
}
@@ -3092,21 +2972,21 @@
*logoffsetp = log_offset;
return 0;
-} /* xlog_state_get_iclog_space */
+}
-/* The first cnt-1 times through here we don't need to
- * move the grant write head because the permanent
- * reservation has reserved cnt times the unit amount.
- * Release part of current permanent unit reservation and
- * reset current reservation to be one units worth. Also
- * move grant reservation head forward.
+/*
+ * The first cnt-1 times a ticket goes through here we don't need to move the
+ * grant write head because the permanent reservation has reserved cnt times the
+ * unit amount. Release part of current permanent unit reservation and reset
+ * current reservation to be one units worth. Also move grant reservation head
+ * forward.
*/
-STATIC void
-xlog_regrant_reserve_log_space(
+void
+xfs_log_ticket_regrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- trace_xfs_log_regrant_reserve_enter(log, ticket);
+ trace_xfs_log_ticket_regrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
@@ -3118,21 +2998,20 @@
ticket->t_curr_res = ticket->t_unit_res;
xlog_tic_reset_res(ticket);
- trace_xfs_log_regrant_reserve_sub(log, ticket);
+ trace_xfs_log_ticket_regrant_sub(log, ticket);
/* just return if we still have some of the pre-reserved space */
- if (ticket->t_cnt > 0)
- return;
+ if (!ticket->t_cnt) {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant,
+ ticket->t_unit_res);
+ trace_xfs_log_ticket_regrant_exit(log, ticket);
- xlog_grant_add_space(log, &log->l_reserve_head.grant,
- ticket->t_unit_res);
+ ticket->t_curr_res = ticket->t_unit_res;
+ xlog_tic_reset_res(ticket);
+ }
- trace_xfs_log_regrant_reserve_exit(log, ticket);
-
- ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
-} /* xlog_regrant_reserve_log_space */
-
+ xfs_log_ticket_put(ticket);
+}
/*
* Give back the space left from a reservation.
@@ -3148,18 +3027,19 @@
* space, the count will stay at zero and the only space remaining will be
* in the current reservation field.
*/
-STATIC void
-xlog_ungrant_log_space(
+void
+xfs_log_ticket_ungrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- int bytes;
+ int bytes;
+
+ trace_xfs_log_ticket_ungrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
- trace_xfs_log_ungrant_enter(log, ticket);
- trace_xfs_log_ungrant_sub(log, ticket);
+ trace_xfs_log_ticket_ungrant_sub(log, ticket);
/*
* If this is a permanent reservation ticket, we may be able to free
@@ -3174,71 +3054,15 @@
xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
- trace_xfs_log_ungrant_exit(log, ticket);
+ trace_xfs_log_ticket_ungrant_exit(log, ticket);
xfs_log_space_wake(log->l_mp);
+ xfs_log_ticket_put(ticket);
}
/*
- * Flush iclog to disk if this is the last reference to the given iclog and
- * the WANT_SYNC bit is set.
- *
- * When this function is entered, the iclog is not necessarily in the
- * WANT_SYNC state. It may be sitting around waiting to get filled.
- *
- *
- */
-STATIC int
-xlog_state_release_iclog(
- struct xlog *log,
- struct xlog_in_core *iclog)
-{
- int sync = 0; /* do we sync? */
-
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- return -EIO;
-
- ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
- if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
- return 0;
-
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
- spin_unlock(&log->l_icloglock);
- return -EIO;
- }
- ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
- iclog->ic_state == XLOG_STATE_WANT_SYNC);
-
- if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
- /* update tail before writing to iclog */
- xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
- sync++;
- iclog->ic_state = XLOG_STATE_SYNCING;
- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
- xlog_verify_tail_lsn(log, iclog, tail_lsn);
- /* cycle incremented when incrementing curr_block */
- }
- spin_unlock(&log->l_icloglock);
-
- /*
- * We let the log lock go, so it's possible that we hit a log I/O
- * error or some other SHUTDOWN condition that marks the iclog
- * as XLOG_STATE_IOERROR before the bwrite. However, we know that
- * this iclog has consistent data, so we ignore IOERROR
- * flags after this point.
- */
- if (sync)
- return xlog_sync(log, iclog);
- return 0;
-} /* xlog_state_release_iclog */
-
-
-/*
- * This routine will mark the current iclog in the ring as WANT_SYNC
- * and move the current iclog pointer to the next iclog in the ring.
- * When this routine is called from xlog_state_get_iclog_space(), the
- * exact size of the iclog has not yet been determined. All we know is
- * that every data block. We have run out of space in this log record.
+ * This routine will mark the current iclog in the ring as WANT_SYNC and move
+ * the current iclog pointer to the next iclog in the ring.
*/
STATIC void
xlog_state_switch_iclogs(
@@ -3247,6 +3071,8 @@
int eventual_size)
{
ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ assert_spin_locked(&log->l_icloglock);
+
if (!eventual_size)
eventual_size = iclog->ic_offset;
iclog->ic_state = XLOG_STATE_WANT_SYNC;
@@ -3281,7 +3107,7 @@
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
-} /* xlog_state_switch_iclogs */
+}
/*
* Write out all data in the in-core log as of this exact moment in time.
@@ -3326,7 +3152,7 @@
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- if (iclog->ic_state & XLOG_STATE_IOERROR)
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
goto out_error;
if (iclog->ic_state == XLOG_STATE_DIRTY ||
@@ -3341,9 +3167,6 @@
* previous iclog and go to sleep.
*/
iclog = iclog->ic_prev;
- if (iclog->ic_state == XLOG_STATE_ACTIVE ||
- iclog->ic_state == XLOG_STATE_DIRTY)
- goto out_unlock;
} else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
if (atomic_read(&iclog->ic_refcnt) == 0) {
/*
@@ -3356,14 +3179,10 @@
atomic_inc(&iclog->ic_refcnt);
lsn = be64_to_cpu(iclog->ic_header.h_lsn);
xlog_state_switch_iclogs(log, iclog, 0);
- spin_unlock(&log->l_icloglock);
-
if (xlog_state_release_iclog(log, iclog))
- return -EIO;
+ goto out_error;
- spin_lock(&log->l_icloglock);
- if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn ||
- iclog->ic_state == XLOG_STATE_DIRTY)
+ if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
goto out_unlock;
} else {
/*
@@ -3383,17 +3202,8 @@
;
}
- if (!(flags & XFS_LOG_SYNC))
- goto out_unlock;
-
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- goto out_error;
- XFS_STATS_INC(mp, xs_log_force_sleep);
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- return -EIO;
- return 0;
-
+ if (flags & XFS_LOG_SYNC)
+ return xlog_wait_on_iclog(iclog);
out_unlock:
spin_unlock(&log->l_icloglock);
return 0;
@@ -3403,19 +3213,18 @@
}
static int
-__xfs_log_force_lsn(
- struct xfs_mount *mp,
+xlog_force_lsn(
+ struct xlog *log,
xfs_lsn_t lsn,
uint flags,
int *log_flushed,
bool already_slept)
{
- struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- if (iclog->ic_state & XLOG_STATE_IOERROR)
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
goto out_error;
while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
@@ -3423,9 +3232,6 @@
if (iclog == log->l_iclog)
goto out_unlock;
}
-
- if (iclog->ic_state == XLOG_STATE_DIRTY)
- goto out_unlock;
if (iclog->ic_state == XLOG_STATE_ACTIVE) {
/*
@@ -3444,39 +3250,22 @@
* will go out then.
*/
if (!already_slept &&
- (iclog->ic_prev->ic_state &
- (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
- ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
-
- XFS_STATS_INC(mp, xs_log_force_sleep);
-
+ (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) {
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
return -EAGAIN;
}
atomic_inc(&iclog->ic_refcnt);
xlog_state_switch_iclogs(log, iclog, 0);
- spin_unlock(&log->l_icloglock);
if (xlog_state_release_iclog(log, iclog))
- return -EIO;
+ goto out_error;
if (log_flushed)
*log_flushed = 1;
- spin_lock(&log->l_icloglock);
}
- if (!(flags & XFS_LOG_SYNC) ||
- (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)))
- goto out_unlock;
-
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- goto out_error;
-
- XFS_STATS_INC(mp, xs_log_force_sleep);
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- return -EIO;
- return 0;
-
+ if (flags & XFS_LOG_SYNC)
+ return xlog_wait_on_iclog(iclog);
out_unlock:
spin_unlock(&log->l_icloglock);
return 0;
@@ -3500,54 +3289,31 @@
* to disk, that thread will wake up all threads waiting on the queue.
*/
int
-xfs_log_force_lsn(
+xfs_log_force_seq(
struct xfs_mount *mp,
- xfs_lsn_t lsn,
+ xfs_csn_t seq,
uint flags,
int *log_flushed)
{
+ struct xlog *log = mp->m_log;
+ xfs_lsn_t lsn;
int ret;
- ASSERT(lsn != 0);
+ ASSERT(seq != 0);
XFS_STATS_INC(mp, xs_log_force);
- trace_xfs_log_force(mp, lsn, _RET_IP_);
+ trace_xfs_log_force(mp, seq, _RET_IP_);
- lsn = xlog_cil_force_lsn(mp->m_log, lsn);
+ lsn = xlog_cil_force_seq(log, seq);
if (lsn == NULLCOMMITLSN)
return 0;
- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false);
- if (ret == -EAGAIN)
- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true);
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, false);
+ if (ret == -EAGAIN) {
+ XFS_STATS_INC(mp, xs_log_force_sleep);
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, true);
+ }
return ret;
}
-
-/*
- * Called when we want to mark the current iclog as being ready to sync to
- * disk.
- */
-STATIC void
-xlog_state_want_sync(
- struct xlog *log,
- struct xlog_in_core *iclog)
-{
- assert_spin_locked(&log->l_icloglock);
-
- if (iclog->ic_state == XLOG_STATE_ACTIVE) {
- xlog_state_switch_iclogs(log, iclog, 0);
- } else {
- ASSERT(iclog->ic_state &
- (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
- }
-}
-
-
-/*****************************************************************************
- *
- * TICKET functions
- *
- *****************************************************************************
- */
/*
* Free a used ticket when its refcount falls to zero.
@@ -3558,7 +3324,7 @@
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
if (atomic_dec_and_test(&ticket->t_ref))
- kmem_zone_free(xfs_log_ticket_zone, ticket);
+ kmem_cache_free(xfs_log_ticket_zone, ticket);
}
xlog_ticket_t *
@@ -3676,15 +3442,12 @@
int unit_bytes,
int cnt,
char client,
- bool permanent,
- xfs_km_flags_t alloc_flags)
+ bool permanent)
{
struct xlog_ticket *tic;
int unit_res;
- tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
- if (!tic)
- return NULL;
+ tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
@@ -3697,7 +3460,6 @@
tic->t_ocnt = cnt;
tic->t_tid = prandom_u32();
tic->t_clientid = client;
- tic->t_flags = XLOG_TIC_INITED;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
@@ -3706,13 +3468,6 @@
return tic;
}
-
-/******************************************************************************
- *
- * Log debug routines
- *
- ******************************************************************************
- */
#if defined(DEBUG)
/*
* Make sure that the destination ptr is within the valid data region of
@@ -3798,7 +3553,7 @@
if (blocks < BTOBB(iclog->ic_offset) + 1)
xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
}
-} /* xlog_verify_tail_lsn */
+}
/*
* Perform a number of checks on the iclog before writing to disk.
@@ -3819,8 +3574,7 @@
xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
- int count,
- bool syncing)
+ int count)
{
xlog_op_header_t *ophead;
xlog_in_core_t *icptr;
@@ -3864,7 +3618,7 @@
/* clientid is only 1 byte */
p = &ophead->oh_clientid;
field_offset = p - base_ptr;
- if (!syncing || (field_offset & 0x1ff)) {
+ if (field_offset & 0x1ff) {
clientid = ophead->oh_clientid;
} else {
idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
@@ -3887,7 +3641,7 @@
/* check length */
p = &ophead->oh_len;
field_offset = p - base_ptr;
- if (!syncing || (field_offset & 0x1ff)) {
+ if (field_offset & 0x1ff) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
idx = BTOBBT((uintptr_t)&ophead->oh_len -
@@ -3902,7 +3656,7 @@
}
ptr += sizeof(xlog_op_header_t) + op_len;
}
-} /* xlog_verify_iclog */
+}
#endif
/*
@@ -3915,7 +3669,7 @@
xlog_in_core_t *iclog, *ic;
iclog = log->l_iclog;
- if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
+ if (iclog->ic_state != XLOG_STATE_IOERROR) {
/*
* Mark all the incore logs IOERROR.
* From now on, no log flushes will result.
@@ -3975,7 +3729,7 @@
* Somebody could've already done the hard work for us.
* No need to get locks for this.
*/
- if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
+ if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) {
ASSERT(XLOG_FORCED_SHUTDOWN(log));
return 1;
}
@@ -4026,21 +3780,8 @@
spin_lock(&log->l_cilp->xc_push_lock);
wake_up_all(&log->l_cilp->xc_commit_wait);
spin_unlock(&log->l_cilp->xc_push_lock);
- xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
+ xlog_state_do_callback(log);
-#ifdef XFSERRORDEBUG
- {
- xlog_in_core_t *iclog;
-
- spin_lock(&log->l_icloglock);
- iclog = log->l_iclog;
- do {
- ASSERT(iclog->ic_callback == 0);
- iclog = iclog->ic_next;
- } while (iclog != log->l_iclog);
- spin_unlock(&log->l_icloglock);
- }
-#endif
/* return non-zero if log IOERROR transition had already happened */
return retval;
}
--
Gitblit v1.6.2