From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Tue, 20 Feb 2024 01:20:52 +0000 Subject: [PATCH] add new system file --- kernel/fs/xfs/xfs_log.c | 2083 ++++++++++++++++++++++++++--------------------------------- 1 files changed, 912 insertions(+), 1,171 deletions(-) diff --git a/kernel/fs/xfs/xfs_log.c b/kernel/fs/xfs/xfs_log.c index 8b1b086..22d7d74 100644 --- a/kernel/fs/xfs/xfs_log.c +++ b/kernel/fs/xfs/xfs_log.c @@ -16,24 +16,14 @@ #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_log_recover.h" -#include "xfs_inode.h" #include "xfs_trace.h" -#include "xfs_fsops.h" -#include "xfs_cksum.h" #include "xfs_sysfs.h" #include "xfs_sb.h" +#include "xfs_health.h" kmem_zone_t *xfs_log_ticket_zone; /* Local miscellaneous function prototypes */ -STATIC int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp); - STATIC struct xlog * xlog_alloc_log( struct xfs_mount *mp, @@ -44,20 +34,12 @@ xlog_space_left( struct xlog *log, atomic64_t *head); -STATIC int -xlog_sync( - struct xlog *log, - struct xlog_in_core *iclog); STATIC void xlog_dealloc_log( struct xlog *log); /* local state machine functions */ -STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); -STATIC void -xlog_state_do_callback( - struct xlog *log, - int aborted, +STATIC void xlog_state_done_syncing( struct xlog_in_core *iclog); STATIC int xlog_state_get_iclog_space( @@ -67,33 +49,19 @@ struct xlog_ticket *ticket, int *continued_write, int *logoffsetp); -STATIC int -xlog_state_release_iclog( - struct xlog *log, - struct xlog_in_core *iclog); STATIC void xlog_state_switch_iclogs( struct xlog *log, struct xlog_in_core *iclog, int eventual_size); STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog); - -STATIC void xlog_grant_push_ail( struct xlog *log, int need_bytes); STATIC void -xlog_regrant_reserve_log_space( +xlog_sync( struct xlog *log, - struct xlog_ticket *ticket); -STATIC void -xlog_ungrant_log_space( - struct xlog *log, - struct xlog_ticket *ticket); - + struct xlog_in_core *iclog); #if defined(DEBUG) STATIC void xlog_verify_dest_ptr( @@ -106,8 +74,7 @@ xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, - int count, - bool syncing); + int count); STATIC void xlog_verify_tail_lsn( struct xlog *log, @@ -116,7 +83,7 @@ #else #define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) -#define xlog_verify_iclog(a,b,c,d) +#define xlog_verify_iclog(a,b,c) #define xlog_verify_tail_lsn(a,b,c) #endif @@ -225,15 +192,42 @@ { struct xlog_ticket *tic; int need_bytes; + bool woken_task = false; list_for_each_entry(tic, &head->waiters, t_queue) { + + /* + * There is a chance that the size of the CIL checkpoints in + * progress at the last AIL push target calculation resulted in + * limiting the target to the log head (l_last_sync_lsn) at the + * time. This may not reflect where the log head is now as the + * CIL checkpoints may have completed. + * + * Hence when we are woken here, it may be that the head of the + * log that has moved rather than the tail. As the tail didn't + * move, there still won't be space available for the + * reservation we require. However, if the AIL has already + * pushed to the target defined by the old log head location, we + * will hang here waiting for something else to update the AIL + * push target. 
+ * + * Therefore, if there isn't space to wake the first waiter on + * the grant head, we need to push the AIL again to ensure the + * target reflects both the current log tail and log head + * position before we wait for the tail to move again. + */ + need_bytes = xlog_ticket_reservation(log, head, tic); - if (*free_bytes < need_bytes) + if (*free_bytes < need_bytes) { + if (!woken_task) + xlog_grant_push_ail(log, need_bytes); return false; + } *free_bytes -= need_bytes; trace_xfs_log_grant_wake_up(log, tic); wake_up_process(tic->t_task); + woken_task = true; } return true; @@ -353,6 +347,25 @@ tic->t_res_num++; } +bool +xfs_log_writable( + struct xfs_mount *mp) +{ + /* + * Never write to the log on norecovery mounts, if the block device is + * read-only, or if the filesystem is shutdown. Read-only mounts still + * allow internal writes for log recovery and unmount purposes, so don't + * restrict that case here. + */ + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + return false; + if (xfs_readonly_buftarg(mp->m_log->l_targ)) + return false; + if (XFS_FORCED_SHUTDOWN(mp)) + return false; + return true; +} + /* * Replenish the byte reservation required by moving the grant write head. */ @@ -439,11 +452,7 @@ XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, - KM_SLEEP | KM_MAYFAIL); - if (!tic) - return -ENOMEM; - + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent); *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt @@ -473,110 +482,67 @@ return error; } - -/* - * NOTES: - * - * 1. currblock field gets updated at startup and after in-core logs - * marked as with WANT_SYNC. - */ - -/* - * This routine is called when a user of a log manager ticket is done with - * the reservation. If the ticket was ever used, then a commit record for - * the associated transaction is written out as a log operation header with - * no data. The flag XLOG_TIC_INITED is set when the first write occurs with - * a given ticket. If the ticket was one with a permanent reservation, then - * a few operations are done differently. Permanent reservation tickets by - * default don't release the reservation. They just commit the current - * transaction with the belief that the reservation is still needed. A flag - * must be passed in before permanent reservations are actually released. - * When these type of tickets are not released, they need to be set into - * the inited state again. By doing this, a start record will be written - * out when the next write occurs. - */ -xfs_lsn_t -xfs_log_done( - struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant) -{ - struct xlog *log = mp->m_log; - xfs_lsn_t lsn = 0; - - if (XLOG_FORCED_SHUTDOWN(log) || - /* - * If nothing was ever written, don't write out commit record. - * If we get an error, just continue and give back the log ticket. - */ - (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(log, ticket, iclog, &lsn)))) { - lsn = (xfs_lsn_t) -1; - regrant = false; - } - - - if (!regrant) { - trace_xfs_log_done_nonperm(log, ticket); - - /* - * Release ticket if not permanent reservation or a specific - * request has been made to release a permanent reservation. 
- */ - xlog_ungrant_log_space(log, ticket); - } else { - trace_xfs_log_done_perm(log, ticket); - - xlog_regrant_reserve_log_space(log, ticket); - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. - */ - ticket->t_flags |= XLOG_TIC_INITED; - } - - xfs_log_ticket_put(ticket); - return lsn; -} - -/* - * Attaches a new iclog I/O completion callback routine during - * transaction commit. If the log is in error state, a non-zero - * return code is handed back and the caller is responsible for - * executing the callback at an appropriate time. - */ -int -xfs_log_notify( - struct xlog_in_core *iclog, - xfs_log_callback_t *cb) -{ - int abortflg; - - spin_lock(&iclog->ic_callback_lock); - abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); - if (!abortflg) { - ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || - (iclog->ic_state == XLOG_STATE_WANT_SYNC)); - cb->cb_next = NULL; - *(iclog->ic_callback_tail) = cb; - iclog->ic_callback_tail = &(cb->cb_next); - } - spin_unlock(&iclog->ic_callback_lock); - return abortflg; -} - -int -xfs_log_release_iclog( - struct xfs_mount *mp, +static bool +__xlog_state_release_iclog( + struct xlog *log, struct xlog_in_core *iclog) { - if (xlog_state_release_iclog(mp->m_log, iclog)) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + lockdep_assert_held(&log->l_icloglock); + + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { + /* update tail before writing to iclog */ + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); + + iclog->ic_state = XLOG_STATE_SYNCING; + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); + /* cycle incremented when incrementing curr_block */ + return true; + } + + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + return false; +} + +/* + * Flush iclog to disk if this is the last reference to the given iclog and the + * it is in the WANT_SYNC state. + */ +static int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog) +{ + lockdep_assert_held(&log->l_icloglock); + + if (iclog->ic_state == XLOG_STATE_IOERROR) return -EIO; + + if (atomic_dec_and_test(&iclog->ic_refcnt) && + __xlog_state_release_iclog(log, iclog)) { + spin_unlock(&log->l_icloglock); + xlog_sync(log, iclog); + spin_lock(&log->l_icloglock); } return 0; +} + +void +xfs_log_release_iclog( + struct xlog_in_core *iclog) +{ + struct xlog *log = iclog->ic_log; + bool sync = false; + + if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { + if (iclog->ic_state != XLOG_STATE_IOERROR) + sync = __xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + } + + if (sync) + xlog_sync(log, iclog); } /* @@ -799,6 +765,9 @@ if (readonly) mp->m_flags |= XFS_MOUNT_RDONLY; + /* Make sure the log is dead if we're returning failure. */ + ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR)); + return error; } @@ -806,45 +775,78 @@ * The mount has failed. Cancel the recovery if it hasn't completed and destroy * the log. */ -int +void xfs_log_mount_cancel( struct xfs_mount *mp) { - int error; - - error = xlog_recover_cancel(mp->m_log); + xlog_recover_cancel(mp->m_log); xfs_log_unmount(mp); - - return error; } /* - * Final log writes as part of unmount. - * - * Mark the filesystem clean as unmount happens. Note that during relocation - * this routine needs to be executed as part of source-bag while the - * deallocation must not be done until source-end. 
+ * Wait for the iclog to be written disk, or return an error if the log has been + * shut down. */ - -/* Actually write the unmount record to disk. */ -static void -xfs_log_write_unmount_record( - struct xfs_mount *mp) +static int +xlog_wait_on_iclog( + struct xlog_in_core *iclog) + __releases(iclog->ic_log->l_icloglock) { - /* the data section must be 32 bit size aligned */ - struct xfs_unmount_log_format magic = { + struct xlog *log = iclog->ic_log; + + if (!XLOG_FORCED_SHUTDOWN(log) && + iclog->ic_state != XLOG_STATE_ACTIVE && + iclog->ic_state != XLOG_STATE_DIRTY) { + XFS_STATS_INC(log->l_mp, xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + return 0; +} + +/* + * Write out an unmount record using the ticket provided. We have to account for + * the data space used in the unmount ticket as this write is not done from a + * transaction context that has already done the accounting for us. + */ +static int +xlog_write_unmount_record( + struct xlog *log, + struct xlog_ticket *ticket, + xfs_lsn_t *lsn, + uint flags) +{ + struct xfs_unmount_log_format ulf = { .magic = XLOG_UNMOUNT_TYPE, }; struct xfs_log_iovec reg = { - .i_addr = &magic, - .i_len = sizeof(magic), + .i_addr = &ulf, + .i_len = sizeof(ulf), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { .lv_niovecs = 1, .lv_iovecp = ®, }; - struct xlog *log = mp->m_log; + + /* account for space used by record data */ + ticket->t_curr_res -= sizeof(ulf); + return xlog_write(log, &vec, ticket, lsn, NULL, flags, false); +} + +/* + * Mark the filesystem clean by writing an unmount record to the head of the + * log. + */ +static void +xlog_unmount_write( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; struct xlog_in_core *iclog; struct xlog_ticket *tic = NULL; xfs_lsn_t lsn; @@ -855,23 +857,7 @@ if (error) goto out_err; - /* - * If we think the summary counters are bad, clear the unmount header - * flag in the unmount record so that the summary counters will be - * recalculated during log recovery at next mount. Refer to - * xlog_check_unmount_rec for more details. - */ - if (XFS_TEST_ERROR((mp->m_flags & XFS_MOUNT_BAD_SUMMARY), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { - xfs_alert(mp, "%s: will fix summary counters at next mount", - __func__); - flags &= ~XLOG_UNMOUNT_TRANS; - } - - /* remove inited flag, and account for space used */ - tic->t_flags = 0; - tic->t_curr_res -= sizeof(magic); - error = xlog_write(log, &vec, tic, &lsn, NULL, flags); + error = xlog_write_unmount_record(log, tic, &lsn, flags); /* * At this point, we're umounting anyway, so there's no point in * transitioning log state to IOERROR. Just continue... 
@@ -883,29 +869,30 @@ spin_lock(&log->l_icloglock); iclog = log->l_iclog; atomic_inc(&iclog->ic_refcnt); - xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR); error = xlog_state_release_iclog(log, iclog); - - spin_lock(&log->l_icloglock); - switch (iclog->ic_state) { - default: - if (!XLOG_FORCED_SHUTDOWN(log)) { - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - break; - } - /* fall through */ - case XLOG_STATE_ACTIVE: - case XLOG_STATE_DIRTY: - spin_unlock(&log->l_icloglock); - break; - } + xlog_wait_on_iclog(iclog); if (tic) { trace_xfs_log_umount_write(log, tic); - xlog_ungrant_log_space(log, tic); - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); } +} + +static void +xfs_log_unmount_verify_iclog( + struct xlog *log) +{ + struct xlog_in_core *iclog = log->l_iclog; + + do { + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + ASSERT(iclog->ic_offset == 0); + } while ((iclog = iclog->ic_next) != log->l_iclog); } /* @@ -915,79 +902,36 @@ * currently architecture converted and "Unmount" is a bit foo. * As far as I know, there weren't any dependencies on the old behaviour. */ - -static int -xfs_log_unmount_write(xfs_mount_t *mp) +static void +xfs_log_unmount_write( + struct xfs_mount *mp) { - struct xlog *log = mp->m_log; - xlog_in_core_t *iclog; -#ifdef DEBUG - xlog_in_core_t *first_iclog; -#endif - int error; + struct xlog *log = mp->m_log; + + if (!xfs_log_writable(mp)) + return; + + xfs_log_force(mp, XFS_LOG_SYNC); + + if (XLOG_FORCED_SHUTDOWN(log)) + return; /* - * Don't write out unmount record on norecovery mounts or ro devices. - * Or, if we are doing a forced umount (typically because of IO errors). + * If we think the summary counters are bad, avoid writing the unmount + * record to force log recovery at next mount, after which the summary + * counters will be recalculated. Refer to xlog_check_unmount_rec for + * more details. */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY || - xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); - return 0; + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, + XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + xfs_alert(mp, "%s: will fix summary counters at next mount", + __func__); + return; } - error = xfs_log_force(mp, XFS_LOG_SYNC); - ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); - -#ifdef DEBUG - first_iclog = iclog = log->l_iclog; - do { - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { - ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE); - ASSERT(iclog->ic_offset == 0); - } - iclog = iclog->ic_next; - } while (iclog != first_iclog); -#endif - if (! (XLOG_FORCED_SHUTDOWN(log))) { - xfs_log_write_unmount_record(mp); - } else { - /* - * We're already in forced_shutdown mode, couldn't - * even attempt to write out the unmount transaction. - * - * Go through the motions of sync'ing and releasing - * the iclog, even though no I/O will actually happen, - * we need to wait for other log I/Os that may already - * be in progress. Do this as a separate section of - * code so we'll know if we ever get stuck here that - * we're in this odd situation of trying to unmount - * a file system that went into forced_shutdown as - * the result of an unmount.. 
- */ - spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - atomic_inc(&iclog->ic_refcnt); - - xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); - error = xlog_state_release_iclog(log, iclog); - - spin_lock(&log->l_icloglock); - - if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE - || iclog->ic_state == XLOG_STATE_DIRTY - || iclog->ic_state == XLOG_STATE_IOERROR) ) { - - xlog_wait(&iclog->ic_force_wait, - &log->l_icloglock); - } else { - spin_unlock(&log->l_icloglock); - } - } - - return error; -} /* xfs_log_unmount_write */ + xfs_log_unmount_verify_iclog(log); + xlog_unmount_write(log); +} /* * Empty the log for unmount/freeze. @@ -1243,53 +1187,40 @@ } -/* - * Log function which is called when an io completes. - * - * The log manager needs its own routine, in order to control what - * happens with the buffer after the write completes. - */ static void -xlog_iodone(xfs_buf_t *bp) +xlog_ioend_work( + struct work_struct *work) { - struct xlog_in_core *iclog = bp->b_log_item; - struct xlog *l = iclog->ic_log; - int aborted = 0; + struct xlog_in_core *iclog = + container_of(work, struct xlog_in_core, ic_end_io_work); + struct xlog *log = iclog->ic_log; + int error; + + error = blk_status_to_errno(iclog->ic_bio.bi_status); +#ifdef DEBUG + /* treat writes with injected CRC errors as failed */ + if (iclog->ic_fail_crc) + error = -EIO; +#endif /* - * Race to shutdown the filesystem if we see an error or the iclog is in - * IOABORT state. The IOABORT state is only set in DEBUG mode to inject - * CRC errors into log recovery. + * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR) || - iclog->ic_state & XLOG_STATE_IOABORT) { - if (iclog->ic_state & XLOG_STATE_IOABORT) - iclog->ic_state &= ~XLOG_STATE_IOABORT; - - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_stale(bp); - xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); - /* - * This flag will be propagated to the trans-committed - * callback routines to let them know that the log-commit - * didn't succeed. - */ - aborted = XFS_LI_ABORTED; - } else if (iclog->ic_state & XLOG_STATE_IOERROR) { - aborted = XFS_LI_ABORTED; + if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { + xfs_alert(log->l_mp, "log I/O error %d", error); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } - /* log I/O is always issued ASYNC */ - ASSERT(bp->b_flags & XBF_ASYNC); - xlog_state_done_syncing(iclog, aborted); + xlog_state_done_syncing(iclog); + bio_uninit(&iclog->ic_bio); /* - * drop the buffer lock now that we are done. Nothing references - * the buffer after this, so an unmount waiting on this lock can now - * tear it down safely. As such, it is unsafe to reference the buffer - * (bp) after the unlock as we could race with it being freed. + * Drop the lock to signal that we are done. Nothing references the + * iclog after this, so an unmount waiting on this lock can now tear it + * down safely. As such, it is unsafe to reference the iclog after the + * unlock as we could race with it being freed. */ - xfs_buf_unlock(bp); + up(&iclog->ic_sema); } /* @@ -1300,65 +1231,26 @@ * If the filesystem blocksize is too large, we may need to choose a * larger size since the directory code currently logs entire blocks. 
*/ - STATIC void xlog_get_iclog_buffer_size( struct xfs_mount *mp, struct xlog *log) { - int size; - int xhdrs; - if (mp->m_logbufs <= 0) - log->l_iclog_bufs = XLOG_MAX_ICLOGS; - else - log->l_iclog_bufs = mp->m_logbufs; + mp->m_logbufs = XLOG_MAX_ICLOGS; + if (mp->m_logbsize <= 0) + mp->m_logbsize = XLOG_BIG_RECORD_BSIZE; + + log->l_iclog_bufs = mp->m_logbufs; + log->l_iclog_size = mp->m_logbsize; /* - * Buffer size passed in from mount system call. + * # headers = size / 32k - one header holds cycles from 32k of data. */ - if (mp->m_logbsize > 0) { - size = log->l_iclog_size = mp->m_logbsize; - log->l_iclog_size_log = 0; - while (size != 1) { - log->l_iclog_size_log++; - size >>= 1; - } - - if (xfs_sb_version_haslogv2(&mp->m_sb)) { - /* # headers = size / 32k - * one header holds cycles from 32k of data - */ - - xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; - if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE) - xhdrs++; - log->l_iclog_hsize = xhdrs << BBSHIFT; - log->l_iclog_heads = xhdrs; - } else { - ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE); - log->l_iclog_hsize = BBSIZE; - log->l_iclog_heads = 1; - } - goto done; - } - - /* All machines use 32kB buffers by default. */ - log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; - log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; - - /* the default log size is 16k or 32k which is one header sector */ - log->l_iclog_hsize = BBSIZE; - log->l_iclog_heads = 1; - -done: - /* are we being asked to make the sizes selected above visible? */ - if (mp->m_logbufs == 0) - mp->m_logbufs = log->l_iclog_bufs; - if (mp->m_logbsize == 0) - mp->m_logbsize = log->l_iclog_size; -} /* xlog_get_iclog_buffer_size */ - + log->l_iclog_heads = + DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE); + log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT; +} void xfs_log_work_queue( @@ -1421,7 +1313,6 @@ xlog_rec_header_t *head; xlog_in_core_t **iclogp; xlog_in_core_t *iclog, *prev_iclog=NULL; - xfs_buf_t *bp; int i; int error = -ENOMEM; uint log2_size = 0; @@ -1479,30 +1370,6 @@ xlog_get_iclog_buffer_size(mp, log); - /* - * Use a NULL block for the extra log buffer used during splits so that - * it will trigger errors if we ever try to do IO on it without first - * having set it up properly. - */ - error = -ENOMEM; - bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, - BTOBB(log->l_iclog_size), XBF_NO_IOACCT); - if (!bp) - goto out_free_log; - - /* - * The iclogbuf buffer locks are held over IO but we are not going to do - * IO yet. Hence unlock the buffer so that the log IO path can grab it - * when appropriately. - */ - ASSERT(xfs_buf_islocked(bp)); - xfs_buf_unlock(bp); - - /* use high priority wq for log I/O completion */ - bp->b_ioend_wq = mp->m_log_workqueue; - bp->b_iodone = xlog_iodone; - log->l_xbuf = bp; - spin_lock_init(&log->l_icloglock); init_waitqueue_head(&log->l_flush_wait); @@ -1515,29 +1382,23 @@ * xlog_in_core_t in xfs_log_priv.h for details. 
*/ ASSERT(log->l_iclog_size >= 4096); - for (i=0; i < log->l_iclog_bufs; i++) { - *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); - if (!*iclogp) + for (i = 0; i < log->l_iclog_bufs; i++) { + int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); + size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * + sizeof(struct bio_vec); + + iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL); + if (!iclog) goto out_free_iclog; - iclog = *iclogp; + *iclogp = iclog; iclog->ic_prev = prev_iclog; prev_iclog = iclog; - bp = xfs_buf_get_uncached(mp->m_logdev_targp, - BTOBB(log->l_iclog_size), - XBF_NO_IOACCT); - if (!bp) + iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, + KM_MAYFAIL | KM_ZERO); + if (!iclog->ic_data) goto out_free_iclog; - - ASSERT(xfs_buf_islocked(bp)); - xfs_buf_unlock(bp); - - /* use high priority wq for log I/O completion */ - bp->b_ioend_wq = mp->m_log_workqueue; - bp->b_iodone = xlog_iodone; - iclog->ic_bp = bp; - iclog->ic_data = bp->b_addr; #ifdef DEBUG log->l_iclog_bak[i] = &iclog->ic_header; #endif @@ -1551,58 +1412,62 @@ head->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); - iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize; + iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); spin_lock_init(&iclog->ic_callback_lock); - iclog->ic_callback_tail = &(iclog->ic_callback); + INIT_LIST_HEAD(&iclog->ic_callbacks); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); + INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work); + sema_init(&iclog->ic_sema, 1); iclogp = &iclog->ic_next; } *iclogp = log->l_iclog; /* complete ring */ log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, + mp->m_super->s_id); + if (!log->l_ioend_workqueue) + goto out_free_iclog; + error = xlog_cil_init(log); if (error) - goto out_free_iclog; + goto out_destroy_workqueue; return log; +out_destroy_workqueue: + destroy_workqueue(log->l_ioend_workqueue); out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - if (iclog->ic_bp) - xfs_buf_free(iclog->ic_bp); + kmem_free(iclog->ic_data); kmem_free(iclog); if (prev_iclog == log->l_iclog) break; } - spinlock_destroy(&log->l_icloglock); - xfs_buf_free(log->l_xbuf); out_free_log: kmem_free(log); out: return ERR_PTR(error); } /* xlog_alloc_log */ - /* * Write out the commit record of a transaction associated with the given - * ticket. Return the lsn of the commit record. + * ticket to close off a running log write. Return the lsn of the commit record. 
*/ -STATIC int +int xlog_commit_record( struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp) + xfs_lsn_t *lsn) { - struct xfs_mount *mp = log->l_mp; - int error; struct xfs_log_iovec reg = { .i_addr = NULL, .i_len = 0, @@ -1612,24 +1477,27 @@ .lv_niovecs = 1, .lv_iovecp = ®, }; + int error; - ASSERT_ALWAYS(iclog); - error = xlog_write(log, &vec, ticket, commitlsnp, iclog, - XLOG_COMMIT_TRANS); + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + + error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, + false); if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); return error; } /* - * Push on the buffer cache code if we ever use more than 75% of the on-disk - * log space. This code pushes on the lsn which would supposedly free up - * the 25% which we want to leave free. We may need to adopt a policy which - * pushes on an lsn which is further along in the log once we reach the high - * water mark. In this manner, we would be creating a low water mark. + * Compute the LSN that we'd need to push the log tail towards in order to have + * (a) enough on-disk log space to log the number of bytes specified, (b) at + * least 25% of the log space free, and (c) at least 256 blocks free. If the + * log free space already meets all three thresholds, this function returns + * NULLCOMMITLSN. */ -STATIC void -xlog_grant_push_ail( +xfs_lsn_t +xlog_grant_push_threshold( struct xlog *log, int need_bytes) { @@ -1655,7 +1523,7 @@ free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); free_threshold = max(free_threshold, 256); if (free_blocks >= free_threshold) - return; + return NULLCOMMITLSN; xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, &threshold_block); @@ -1675,13 +1543,33 @@ if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) threshold_lsn = last_sync_lsn; + return threshold_lsn; +} + +/* + * Push the tail of the log if we need to do so to maintain the free log space + * thresholds set out by xlog_grant_push_threshold. We may need to adopt a + * policy which pushes on an lsn which is further along in the log once we + * reach the high water mark. In this manner, we would be creating a low water + * mark. + */ +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn; + + threshold_lsn = xlog_grant_push_threshold(log, need_bytes); + if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log)) + return; + /* * Get the transaction layer to kick the dirty buffers out to * disk asynchronously. No point in trying to do this if * the filesystem is shutting down. */ - if (!XLOG_FORCED_SHUTDOWN(log)) - xfs_ail_push(log->l_ailp, threshold_lsn); + xfs_ail_push(log->l_ailp, threshold_lsn); } /* @@ -1751,9 +1639,7 @@ int i; int xheads; - xheads = size / XLOG_HEADER_CYCLE_SIZE; - if (size % XLOG_HEADER_CYCLE_SIZE) - xheads++; + xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE); for (i = 1; i < xheads; i++) { crc = crc32c(crc, &xhdr[i].hic_xheader, @@ -1767,42 +1653,167 @@ return xfs_end_cksum(crc); } -/* - * The bdstrat callback function for log bufs. This gives us a central - * place to trap bufs in case we get hit by a log I/O error and need to - * shutdown. Actually, in practice, even when we didn't get a log error, - * we transition the iclogs to IOERROR state *after* flushing all existing - * iclogs to disk. This is because we don't want anymore new transactions to be - * started or completed afterwards. 
- * - * We lock the iclogbufs here so that we can serialise against IO completion - * during unmount. We might be processing a shutdown triggered during unmount, - * and that can occur asynchronously to the unmount thread, and hence we need to - * ensure that completes before tearing down the iclogbufs. Hence we need to - * hold the buffer lock across the log IO to acheive that. - */ -STATIC int -xlog_bdstrat( - struct xfs_buf *bp) +static void +xlog_bio_end_io( + struct bio *bio) { - struct xlog_in_core *iclog = bp->b_log_item; + struct xlog_in_core *iclog = bio->bi_private; - xfs_buf_lock(bp); - if (iclog->ic_state & XLOG_STATE_IOERROR) { - xfs_buf_ioerror(bp, -EIO); - xfs_buf_stale(bp); - xfs_buf_ioend(bp); + queue_work(iclog->ic_log->l_ioend_workqueue, + &iclog->ic_end_io_work); +} + +static int +xlog_map_iclog_data( + struct bio *bio, + void *data, + size_t count) +{ + do { + struct page *page = kmem_to_page(data); + unsigned int off = offset_in_page(data); + size_t len = min_t(size_t, count, PAGE_SIZE - off); + + if (bio_add_page(bio, page, len, off) != len) + return -EIO; + + data += len; + count -= len; + } while (count); + + return 0; +} + +STATIC void +xlog_write_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + uint64_t bno, + unsigned int count, + bool need_flush) +{ + ASSERT(bno < log->l_logBBsize); + + /* + * We lock the iclogbufs here so that we can serialise against I/O + * completion during unmount. We might be processing a shutdown + * triggered during unmount, and that can occur asynchronously to the + * unmount thread, and hence we need to ensure that completes before + * tearing down the iclogbufs. Hence we need to hold the buffer lock + * across the log IO to archieve that. + */ + down(&iclog->ic_sema); + if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) { /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. Similarly, IO completion will unlock the - * buffer, so we don't do it here. + * doing it here. We kick of the state machine and unlock + * the buffer manually, the code needs to be kept in sync + * with the I/O completion path. */ - return 0; + xlog_state_done_syncing(iclog); + up(&iclog->ic_sema); + return; } - xfs_buf_submit(bp); - return 0; + bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE)); + bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev); + iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; + iclog->ic_bio.bi_end_io = xlog_bio_end_io; + iclog->ic_bio.bi_private = iclog; + + /* + * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more + * IOs coming immediately after this one. This prevents the block layer + * writeback throttle from throttling log writes behind background + * metadata writeback and causing priority inversions. + */ + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | + REQ_IDLE | REQ_FUA; + if (need_flush) + iclog->ic_bio.bi_opf |= REQ_PREFLUSH; + + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + return; + } + if (is_vmalloc_addr(iclog->ic_data)) + flush_kernel_vmap_range(iclog->ic_data, count); + + /* + * If this log buffer would straddle the end of the log we will have + * to split it up into two bios, so that we can continue at the start. 
+ */ + if (bno + BTOBB(count) > log->l_logBBsize) { + struct bio *split; + + split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno, + GFP_NOIO, &fs_bio_set); + bio_chain(split, &iclog->ic_bio); + submit_bio(split); + + /* restart at logical offset zero for the remainder */ + iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart; + } + + submit_bio(&iclog->ic_bio); +} + +/* + * We need to bump cycle number for the part of the iclog that is + * written to the start of the log. Watch out for the header magic + * number case, though. + */ +static void +xlog_split_iclog( + struct xlog *log, + void *data, + uint64_t bno, + unsigned int count) +{ + unsigned int split_offset = BBTOB(log->l_logBBsize - bno); + unsigned int i; + + for (i = split_offset; i < count; i += BBSIZE) { + uint32_t cycle = get_unaligned_be32(data + i); + + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + put_unaligned_be32(cycle, data + i); + } +} + +static int +xlog_calc_iclog_size( + struct xlog *log, + struct xlog_in_core *iclog, + uint32_t *roundoff) +{ + uint32_t count_init, count; + bool use_lsunit; + + use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) && + log->l_mp->m_sb.sb_logsunit > 1; + + /* Add for LR header */ + count_init = log->l_iclog_hsize + iclog->ic_offset; + + /* Round out the log write size */ + if (use_lsunit) { + /* we have a v2 stripe unit to use */ + count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); + } else { + count = BBTOB(BTOBB(count_init)); + } + + ASSERT(count >= count_init); + *roundoff = count - count_init; + + if (use_lsunit) + ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit); + else + ASSERT(*roundoff < BBTOB(1)); + return count; } /* @@ -1825,46 +1836,23 @@ * log will require grabbing the lock though. * * The entire log manager uses a logical block numbering scheme. Only - * log_sync (and then only bwrite()) know about the fact that the log may - * not start with block zero on a given device. The log block start offset - * is added immediately before calling bwrite(). + * xlog_write_iclog knows about the fact that the log may not start with + * block zero on a given device. 
*/ - -STATIC int +STATIC void xlog_sync( struct xlog *log, struct xlog_in_core *iclog) { - xfs_buf_t *bp; - int i; - uint count; /* byte count of bwrite */ - uint count_init; /* initial count before roundup */ - int roundoff; /* roundoff to BB or stripe */ - int split = 0; /* split write into two regions */ - int error; - int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); - int size; + unsigned int count; /* byte count of bwrite */ + unsigned int roundoff; /* roundoff to BB or stripe */ + uint64_t bno; + unsigned int size; + bool need_flush = true, split = false; - XFS_STATS_INC(log->l_mp, xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); - /* Add for LR header */ - count_init = log->l_iclog_hsize + iclog->ic_offset; - - /* Round out the log write size */ - if (v2 && log->l_mp->m_sb.sb_logsunit > 1) { - /* we have a v2 stripe unit to use */ - count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); - } else { - count = BBTOB(BTOBB(count_init)); - } - roundoff = count - count_init; - ASSERT(roundoff >= 0); - ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 && - roundoff < log->l_mp->m_sb.sb_logsunit) - || - (log->l_mp->m_sb.sb_logsunit <= 1 && - roundoff < BBTOB(1))); + count = xlog_calc_iclog_size(log, iclog, &roundoff); /* move grant heads by roundoff in sync */ xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); @@ -1875,41 +1863,19 @@ /* real byte length */ size = iclog->ic_offset; - if (v2) + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) size += roundoff; iclog->ic_header.h_len = cpu_to_be32(size); - bp = iclog->ic_bp; - XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); - + XFS_STATS_INC(log->l_mp, xs_log_writes); XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count)); + bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); + /* Do we need to split this write into 2 parts? */ - if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { - char *dptr; - - split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); - count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); - iclog->ic_bwritecnt = 2; - - /* - * Bump the cycle numbers at the start of each block in the - * part of the iclog that ends up in the buffer that gets - * written to the start of the log. - * - * Watch out for the header magic number case, though. - */ - dptr = (char *)&iclog->ic_header + count; - for (i = 0; i < split; i += BBSIZE) { - uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); - if (++cycle == XLOG_HEADER_MAGIC_NUM) - cycle++; - *(__be32 *)dptr = cpu_to_be32(cycle); - - dptr += BBSIZE; - } - } else { - iclog->ic_bwritecnt = 1; + if (bno + BTOBB(count) > log->l_logBBsize) { + xlog_split_iclog(log, &iclog->ic_header, bno, count); + split = true; } /* calculcate the checksum */ @@ -1922,18 +1888,15 @@ * write on I/O completion and shutdown the fs. The subsequent mount * detects the bad CRC and attempts to recover. */ +#ifdef DEBUG if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); - iclog->ic_state |= XLOG_STATE_IOABORT; + iclog->ic_fail_crc = true; xfs_warn(log->l_mp, "Intentionally corrupted log record at LSN 0x%llx. 
Shutdown imminent.", be64_to_cpu(iclog->ic_header.h_lsn)); } - - bp->b_io_length = BTOBB(count); - bp->b_log_item = iclog; - bp->b_flags &= ~XBF_FLUSH; - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); +#endif /* * Flush the data device before flushing the log to make sure all meta @@ -1943,50 +1906,14 @@ * synchronously here; for an internal log we can simply use the block * layer state machine for preflushes. */ - if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + if (log->l_targ != log->l_mp->m_ddev_targp || split) { xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); - else - bp->b_flags |= XBF_FLUSH; - - ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); - ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); - - xlog_verify_iclog(log, iclog, count, true); - - /* account for log which doesn't start at block #0 */ - XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); - - /* - * Don't call xfs_bwrite here. We do log-syncs even when the filesystem - * is shutting down. - */ - error = xlog_bdstrat(bp); - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_sync"); - return error; + need_flush = false; } - if (split) { - bp = iclog->ic_log->l_xbuf; - XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ - xfs_buf_associate_memory(bp, - (char *)&iclog->ic_header + count, split); - bp->b_log_item = iclog; - bp->b_flags &= ~XBF_FLUSH; - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); - ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); - ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); - - /* account for internal log which doesn't start at block #0 */ - XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); - error = xlog_bdstrat(bp); - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); - return error; - } - } - return 0; -} /* xlog_sync */ + xlog_verify_iclog(log, iclog, count); + xlog_write_iclog(log, iclog, bno, count, need_flush); +} /* * Deallocate a log structure @@ -2006,38 +1933,27 @@ */ iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { - xfs_buf_lock(iclog->ic_bp); - xfs_buf_unlock(iclog->ic_bp); + down(&iclog->ic_sema); + up(&iclog->ic_sema); iclog = iclog->ic_next; } - /* - * Always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. Also, cycle the lock - * first to ensure we've completed IO on it. - */ - xfs_buf_lock(log->l_xbuf); - xfs_buf_unlock(log->l_xbuf); - xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); - xfs_buf_free(log->l_xbuf); - iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { - xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; + kmem_free(iclog->ic_data); kmem_free(iclog); iclog = next_iclog; } - spinlock_destroy(&log->l_icloglock); log->l_mp->m_log = NULL; + destroy_workqueue(log->l_ioend_workqueue); kmem_free(log); -} /* xlog_dealloc_log */ +} /* * Update counters atomically now that memcpy is done. 
*/ -/* ARGSUSED */ static inline void xlog_state_finish_copy( struct xlog *log, @@ -2045,16 +1961,11 @@ int record_cnt, int copy_bytes) { - spin_lock(&log->l_icloglock); + lockdep_assert_held(&log->l_icloglock); be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); iclog->ic_offset += copy_bytes; - - spin_unlock(&log->l_icloglock); -} /* xlog_state_finish_copy */ - - - +} /* * print out info relating to regions written which consume @@ -2070,7 +1981,7 @@ /* match with XLOG_REG_TYPE_* in xfs_log.h */ #define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str - static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = { + static char *res_type_str[] = { REG_TYPE_STR(BFORMAT, "bformat"), REG_TYPE_STR(BCHUNK, "bchunk"), REG_TYPE_STR(EFI_FORMAT, "efi_format"), @@ -2090,8 +2001,15 @@ REG_TYPE_STR(UNMOUNT, "unmount"), REG_TYPE_STR(COMMIT, "commit"), REG_TYPE_STR(TRANSHDR, "trans header"), - REG_TYPE_STR(ICREATE, "inode create") + REG_TYPE_STR(ICREATE, "inode create"), + REG_TYPE_STR(RUI_FORMAT, "rui_format"), + REG_TYPE_STR(RUD_FORMAT, "rud_format"), + REG_TYPE_STR(CUI_FORMAT, "cui_format"), + REG_TYPE_STR(CUD_FORMAT, "cud_format"), + REG_TYPE_STR(BUI_FORMAT, "bui_format"), + REG_TYPE_STR(BUD_FORMAT, "bud_format"), }; + BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1); #undef REG_TYPE_STR xfs_warn(mp, "ticket reservation summary:"); @@ -2168,22 +2086,20 @@ } /* - * Calculate the potential space needed by the log vector. Each region gets - * its own xlog_op_header_t and may need to be double word aligned. + * Calculate the potential space needed by the log vector. We may need a start + * record, and each region gets its own struct xlog_op_header and may need to be + * double word aligned. */ static int xlog_write_calc_vec_length( struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector) + struct xfs_log_vec *log_vector, + bool need_start_rec) { struct xfs_log_vec *lv; - int headers = 0; + int headers = need_start_rec ? 1 : 0; int len = 0; int i; - - /* acct for start rec of xact */ - if (ticket->t_flags & XLOG_TIC_INITED) - headers++; for (lv = log_vector; lv; lv = lv->lv_next) { /* we don't write ordered log vectors */ @@ -2206,27 +2122,16 @@ return len; } -/* - * If first write for transaction, insert start record We can't be trying to - * commit if we are inited. We can't have any "partial_copy" if we are inited. - */ -static int +static void xlog_write_start_rec( struct xlog_op_header *ophdr, struct xlog_ticket *ticket) { - if (!(ticket->t_flags & XLOG_TIC_INITED)) - return 0; - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); ophdr->oh_clientid = ticket->t_clientid; ophdr->oh_len = 0; ophdr->oh_flags = XLOG_START_TRANS; ophdr->oh_res2 = 0; - - ticket->t_flags &= ~XLOG_TIC_INITED; - - return sizeof(struct xlog_op_header); } static xlog_op_header_t * @@ -2324,15 +2229,18 @@ int log_offset, struct xlog_in_core **commit_iclog) { + int error; + if (*partial_copy) { /* * This iclog has already been marked WANT_SYNC by * xlog_state_get_iclog_space. */ + spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); *record_cnt = 0; *data_cnt = 0; - return xlog_state_release_iclog(log, iclog); + goto release_iclog; } *partial_copy = 0; @@ -2340,21 +2248,29 @@ if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { /* no more space in this iclog - push it. 
*/ + spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); *record_cnt = 0; *data_cnt = 0; - spin_lock(&log->l_icloglock); - xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); - + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR); if (!commit_iclog) - return xlog_state_release_iclog(log, iclog); + goto release_iclog; + spin_unlock(&log->l_icloglock); ASSERT(flags & XLOG_COMMIT_TRANS); *commit_iclog = iclog; } return 0; + +release_iclog: + error = xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + return error; } /* @@ -2404,39 +2320,28 @@ struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, - uint flags) + uint flags, + bool need_start_rec) { struct xlog_in_core *iclog = NULL; - struct xfs_log_iovec *vecp; - struct xfs_log_vec *lv; + struct xfs_log_vec *lv = log_vector; + struct xfs_log_iovec *vecp = lv->lv_iovecp; + int index = 0; int len; - int index; int partial_copy = 0; int partial_copy_len = 0; int contwr = 0; int record_cnt = 0; int data_cnt = 0; - int error; - - *start_lsn = 0; - - len = xlog_write_calc_vec_length(ticket, log_vector); + int error = 0; /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. + * If this is a commit or unmount transaction, we don't need a start + * record to be written. We do, however, have to account for the + * commit or unmount header that gets written. Hence we always have + * to account for an extra xlog_op_header here. */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - - /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. - */ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - + ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); @@ -2444,9 +2349,8 @@ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } - index = 0; - lv = log_vector; - vecp = lv->lv_iovecp; + len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); + *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2470,7 +2374,6 @@ while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; - int start_rec_copy; int copy_len; int copy_off; bool ordered = false; @@ -2486,11 +2389,15 @@ ASSERT(reg->i_len % sizeof(int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - start_rec_copy = xlog_write_start_rec(ptr, ticket); - if (start_rec_copy) { - record_cnt++; + /* + * Before we start formatting log vectors, we need to + * write a start record. Only do this for the first + * iclog we write to. 
+ */ + if (need_start_rec) { + xlog_write_start_rec(ptr, ticket); xlog_write_adv_cnt(&ptr, &len, &log_offset, - start_rec_copy); + sizeof(struct xlog_op_header)); } ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); @@ -2522,8 +2429,13 @@ xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); } - copy_len += start_rec_copy + sizeof(xlog_op_header_t); + copy_len += sizeof(struct xlog_op_header); record_cnt++; + if (need_start_rec) { + copy_len += sizeof(struct xlog_op_header); + record_cnt++; + need_start_rec = false; + } data_cnt += contwr ? copy_len : 0; error = xlog_write_copy_finish(log, iclog, flags, @@ -2567,158 +2479,284 @@ ASSERT(len == 0); + spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (!commit_iclog) - return xlog_state_release_iclog(log, iclog); + if (commit_iclog) { + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } else { + error = xlog_state_release_iclog(log, iclog); + } + spin_unlock(&log->l_icloglock); - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - return 0; + return error; } - -/***************************************************************************** - * - * State Machine functions - * - ***************************************************************************** - */ - -/* Clean iclogs starting from the head. This ordering must be - * maintained, so an iclog doesn't become ACTIVE beyond one that - * is SYNCING. This is also required to maintain the notion that we use - * a ordered wait queue to hold off would be writers to the log when every - * iclog is trying to sync to disk. - * - * State Change: DIRTY -> ACTIVE - */ -STATIC void -xlog_state_clean_log( - struct xlog *log) +static void +xlog_state_activate_iclog( + struct xlog_in_core *iclog, + int *iclogs_changed) { - xlog_in_core_t *iclog; - int changed = 0; + ASSERT(list_empty_careful(&iclog->ic_callbacks)); - iclog = log->l_iclog; - do { - if (iclog->ic_state == XLOG_STATE_DIRTY) { - iclog->ic_state = XLOG_STATE_ACTIVE; - iclog->ic_offset = 0; - ASSERT(iclog->ic_callback == NULL); - /* - * If the number of ops in this iclog indicate it just - * contains the dummy transaction, we can - * change state into IDLE (the second time around). - * Otherwise we should change the state into - * NEED a dummy. - * We don't need to cover the dummy. - */ - if (!changed && - (be32_to_cpu(iclog->ic_header.h_num_logops) == - XLOG_COVER_OPS)) { - changed = 1; - } else { - /* - * We have two dirty iclogs so start over - * This could also be num of ops indicates - * this is not the dummy going out. - */ - changed = 2; - } - iclog->ic_header.h_num_logops = 0; - memset(iclog->ic_header.h_cycle_data, 0, - sizeof(iclog->ic_header.h_cycle_data)); - iclog->ic_header.h_lsn = 0; - } else if (iclog->ic_state == XLOG_STATE_ACTIVE) - /* do nothing */; - else - break; /* stop cleaning */ - iclog = iclog->ic_next; - } while (iclog != log->l_iclog); - - /* log is locked when we are called */ /* - * Change state for the dummy log recording. - * We usually go to NEED. But we go to NEED2 if the changed indicates - * we are done writing the dummy record. - * If we are done with the second dummy recored (DONE2), then - * we go to IDLE. + * If the number of ops in this iclog indicate it just contains the + * dummy transaction, we can change state into IDLE (the second time + * around). Otherwise we should change the state into NEED a dummy. + * We don't need to cover the dummy. 
*/ - if (changed) { - switch (log->l_covered_state) { - case XLOG_STATE_COVER_IDLE: - case XLOG_STATE_COVER_NEED: - case XLOG_STATE_COVER_NEED2: - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; - - case XLOG_STATE_COVER_DONE: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_NEED2; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; - - case XLOG_STATE_COVER_DONE2: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_IDLE; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; - - default: - ASSERT(0); - } + if (*iclogs_changed == 0 && + iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { + *iclogs_changed = 1; + } else { + /* + * We have two dirty iclogs so start over. This could also be + * num of ops indicating this is not the dummy going out. + */ + *iclogs_changed = 2; } -} /* xlog_state_clean_log */ + + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_offset = 0; + iclog->ic_header.h_num_logops = 0; + memset(iclog->ic_header.h_cycle_data, 0, + sizeof(iclog->ic_header.h_cycle_data)); + iclog->ic_header.h_lsn = 0; +} + +/* + * Loop through all iclogs and mark all iclogs currently marked DIRTY as + * ACTIVE after iclog I/O has completed. + */ +static void +xlog_state_activate_iclogs( + struct xlog *log, + int *iclogs_changed) +{ + struct xlog_in_core *iclog = log->l_iclog; + + do { + if (iclog->ic_state == XLOG_STATE_DIRTY) + xlog_state_activate_iclog(iclog, iclogs_changed); + /* + * The ordering of marking iclogs ACTIVE must be maintained, so + * an iclog doesn't become ACTIVE beyond one that is SYNCING. + */ + else if (iclog->ic_state != XLOG_STATE_ACTIVE) + break; + } while ((iclog = iclog->ic_next) != log->l_iclog); +} + +static int +xlog_covered_state( + int prev_state, + int iclogs_changed) +{ + /* + * We usually go to NEED. But we go to NEED2 if the changed indicates we + * are done writing the dummy record. If we are done with the second + * dummy recored (DONE2), then we go to IDLE. 
+ */ + switch (prev_state) { + case XLOG_STATE_COVER_IDLE: + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + break; + case XLOG_STATE_COVER_DONE: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_NEED2; + break; + case XLOG_STATE_COVER_DONE2: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_IDLE; + break; + default: + ASSERT(0); + } + + return XLOG_STATE_COVER_NEED; +} + +STATIC void +xlog_state_clean_iclog( + struct xlog *log, + struct xlog_in_core *dirty_iclog) +{ + int iclogs_changed = 0; + + dirty_iclog->ic_state = XLOG_STATE_DIRTY; + + xlog_state_activate_iclogs(log, &iclogs_changed); + wake_up_all(&dirty_iclog->ic_force_wait); + + if (iclogs_changed) { + log->l_covered_state = xlog_covered_state(log->l_covered_state, + iclogs_changed); + } +} STATIC xfs_lsn_t xlog_get_lowest_lsn( - struct xlog *log) + struct xlog *log) { - xlog_in_core_t *lsn_log; - xfs_lsn_t lowest_lsn, lsn; + struct xlog_in_core *iclog = log->l_iclog; + xfs_lsn_t lowest_lsn = 0, lsn; - lsn_log = log->l_iclog; - lowest_lsn = 0; do { - if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { - lsn = be64_to_cpu(lsn_log->ic_header.h_lsn); - if ((lsn && !lowest_lsn) || - (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) + continue; + + lsn = be64_to_cpu(iclog->ic_header.h_lsn); + if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0) lowest_lsn = lsn; - } - } - lsn_log = lsn_log->ic_next; - } while (lsn_log != log->l_iclog); + } while ((iclog = iclog->ic_next) != log->l_iclog); + return lowest_lsn; } +/* + * Completion of a iclog IO does not imply that a transaction has completed, as + * transactions can be large enough to span many iclogs. We cannot change the + * tail of the log half way through a transaction as this may be the only + * transaction in the log and moving the tail to point to the middle of it + * will prevent recovery from finding the start of the transaction. Hence we + * should only update the last_sync_lsn if this iclog contains transaction + * completion callbacks on it. + * + * We have to do this before we drop the icloglock to ensure we are the only one + * that can update it. + * + * If we are moving the last_sync_lsn forwards, we also need to ensure we kick + * the reservation grant head pushing. This is due to the fact that the push + * target is bound by the current last_sync_lsn value. Hence if we have a large + * amount of log space bound up in this committing transaction then the + * last_sync_lsn value may be the limiting factor preventing tail pushing from + * freeing space in the log. Hence once we've updated the last_sync_lsn we + * should push the AIL to ensure the push target (and hence the grant head) is + * no longer bound by the old log head location and can move forwards and make + * progress again. + */ +static void +xlog_state_set_callback( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t header_lsn) +{ + iclog->ic_state = XLOG_STATE_CALLBACK; + + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + header_lsn) <= 0); + + if (list_empty_careful(&iclog->ic_callbacks)) + return; + + atomic64_set(&log->l_last_sync_lsn, header_lsn); + xlog_grant_push_ail(log, 0); +} + +/* + * Return true if we need to stop processing, false to continue to the next + * iclog. The caller will need to run callbacks if the iclog is returned in the + * XLOG_STATE_CALLBACK state. 
+ */ +static bool +xlog_state_iodone_process_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + bool *ioerror) +{ + xfs_lsn_t lowest_lsn; + xfs_lsn_t header_lsn; + + switch (iclog->ic_state) { + case XLOG_STATE_ACTIVE: + case XLOG_STATE_DIRTY: + /* + * Skip all iclogs in the ACTIVE & DIRTY states: + */ + return false; + case XLOG_STATE_IOERROR: + /* + * Between marking a filesystem SHUTDOWN and stopping the log, + * we do flush all iclogs to disk (if there wasn't a log I/O + * error). So, we do want things to go smoothly in case of just + * a SHUTDOWN w/o a LOG_IO_ERROR. + */ + *ioerror = true; + return false; + case XLOG_STATE_DONE_SYNC: + /* + * Now that we have an iclog that is in the DONE_SYNC state, do + * one more check here to see if we have chased our tail around. + * If this is not the lowest lsn iclog, then we will leave it + * for another completion to process. + */ + header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + lowest_lsn = xlog_get_lowest_lsn(log); + if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) + return false; + xlog_state_set_callback(log, iclog, header_lsn); + return false; + default: + /* + * Can only perform callbacks in order. Since this iclog is not + * in the DONE_SYNC state, we skip the rest and just try to + * clean up. + */ + return true; + } +} + +/* + * Keep processing entries in the iclog callback list until we come around and + * it is empty. We need to atomically see that the list is empty and change the + * state to DIRTY so that we don't miss any more callbacks being added. + * + * This function is called with the icloglock held and returns with it held. We + * drop it while running callbacks, however, as holding it over thousands of + * callbacks is unnecessary and causes excessive contention if we do. + */ +static void +xlog_state_do_iclog_callbacks( + struct xlog *log, + struct xlog_in_core *iclog) + __releases(&log->l_icloglock) + __acquires(&log->l_icloglock) +{ + spin_unlock(&log->l_icloglock); + spin_lock(&iclog->ic_callback_lock); + while (!list_empty(&iclog->ic_callbacks)) { + LIST_HEAD(tmp); + + list_splice_init(&iclog->ic_callbacks, &tmp); + + spin_unlock(&iclog->ic_callback_lock); + xlog_cil_process_committed(&tmp); + spin_lock(&iclog->ic_callback_lock); + } + + /* + * Pick up the icloglock while still holding the callback lock so we + * serialise against anyone trying to add more callbacks to this iclog + * now we've finished processing. 
+ */ + spin_lock(&log->l_icloglock); + spin_unlock(&iclog->ic_callback_lock); +} STATIC void xlog_state_do_callback( - struct xlog *log, - int aborted, - struct xlog_in_core *ciclog) + struct xlog *log) { - xlog_in_core_t *iclog; - xlog_in_core_t *first_iclog; /* used to know when we've - * processed all iclogs once */ - xfs_log_callback_t *cb, *cb_next; - int flushcnt = 0; - xfs_lsn_t lowest_lsn; - int ioerrors; /* counter: iclogs with errors */ - int loopdidcallbacks; /* flag: inner loop did callbacks*/ - int funcdidcallbacks; /* flag: function did callbacks */ - int repeats; /* for issuing console warnings if - * looping too many times */ + struct xlog_in_core *iclog; + struct xlog_in_core *first_iclog; + bool cycled_icloglock; + bool ioerror; + int flushcnt = 0; + int repeats = 0; spin_lock(&log->l_icloglock); - first_iclog = iclog = log->l_iclog; - ioerrors = 0; - funcdidcallbacks = 0; - repeats = 0; - do { /* * Scan all iclogs starting with the one pointed to by the @@ -2730,140 +2768,31 @@ */ first_iclog = log->l_iclog; iclog = log->l_iclog; - loopdidcallbacks = 0; + cycled_icloglock = false; + ioerror = false; repeats++; do { + if (xlog_state_iodone_process_iclog(log, iclog, + &ioerror)) + break; - /* skip all iclogs in the ACTIVE & DIRTY states */ - if (iclog->ic_state & - (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { + if (iclog->ic_state != XLOG_STATE_CALLBACK && + iclog->ic_state != XLOG_STATE_IOERROR) { iclog = iclog->ic_next; continue; } /* - * Between marking a filesystem SHUTDOWN and stopping - * the log, we do flush all iclogs to disk (if there - * wasn't a log I/O error). So, we do want things to - * go smoothly in case of just a SHUTDOWN w/o a - * LOG_IO_ERROR. + * Running callbacks will drop the icloglock which means + * we'll have to run at least one more complete loop. */ - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { - /* - * Can only perform callbacks in order. Since - * this iclog is not in the DONE_SYNC/ - * DO_CALLBACK state, we skip the rest and - * just try to clean up. If we set our iclog - * to DO_CALLBACK, we will not process it when - * we retry since a previous iclog is in the - * CALLBACK and the state cannot change since - * we are holding the l_icloglock. - */ - if (!(iclog->ic_state & - (XLOG_STATE_DONE_SYNC | - XLOG_STATE_DO_CALLBACK))) { - if (ciclog && (ciclog->ic_state == - XLOG_STATE_DONE_SYNC)) { - ciclog->ic_state = XLOG_STATE_DO_CALLBACK; - } - break; - } - /* - * We now have an iclog that is in either the - * DO_CALLBACK or DONE_SYNC states. The other - * states (WANT_SYNC, SYNCING, or CALLBACK were - * caught by the above if and are going to - * clean (i.e. we aren't doing their callbacks) - * see the above if. - */ - - /* - * We will do one more check here to see if we - * have chased our tail around. - */ - - lowest_lsn = xlog_get_lowest_lsn(log); - if (lowest_lsn && - XFS_LSN_CMP(lowest_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { - iclog = iclog->ic_next; - continue; /* Leave this iclog for - * another thread */ - } - - iclog->ic_state = XLOG_STATE_CALLBACK; - - - /* - * Completion of a iclog IO does not imply that - * a transaction has completed, as transactions - * can be large enough to span many iclogs. We - * cannot change the tail of the log half way - * through a transaction as this may be the only - * transaction in the log and moving th etail to - * point to the middle of it will prevent - * recovery from finding the start of the - * transaction. 
Hence we should only update the - * last_sync_lsn if this iclog contains - * transaction completion callbacks on it. - * - * We have to do this before we drop the - * icloglock to ensure we are the only one that - * can update it. - */ - ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), - be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - if (iclog->ic_callback) - atomic64_set(&log->l_last_sync_lsn, - be64_to_cpu(iclog->ic_header.h_lsn)); - - } else - ioerrors++; - - spin_unlock(&log->l_icloglock); - - /* - * Keep processing entries in the callback list until - * we come around and it is empty. We need to - * atomically see that the list is empty and change the - * state to DIRTY so that we don't miss any more - * callbacks being added. - */ - spin_lock(&iclog->ic_callback_lock); - cb = iclog->ic_callback; - while (cb) { - iclog->ic_callback_tail = &(iclog->ic_callback); - iclog->ic_callback = NULL; - spin_unlock(&iclog->ic_callback_lock); - - /* perform callbacks in the order given */ - for (; cb; cb = cb_next) { - cb_next = cb->cb_next; - cb->cb_func(cb->cb_arg, aborted); - } - spin_lock(&iclog->ic_callback_lock); - cb = iclog->ic_callback; - } - - loopdidcallbacks++; - funcdidcallbacks++; - - spin_lock(&log->l_icloglock); - ASSERT(iclog->ic_callback == NULL); - spin_unlock(&iclog->ic_callback_lock); - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) - iclog->ic_state = XLOG_STATE_DIRTY; - - /* - * Transition from DIRTY to ACTIVE if applicable. - * NOP if STATE_IOERROR. - */ - xlog_state_clean_log(log); - - /* wake up threads waiting in xfs_log_force() */ - wake_up_all(&iclog->ic_force_wait); - + cycled_icloglock = true; + xlog_state_do_iclog_callbacks(log, iclog); + if (XLOG_FORCED_SHUTDOWN(log)) + wake_up_all(&iclog->ic_force_wait); + else + xlog_state_clean_iclog(log, iclog); iclog = iclog->ic_next; } while (first_iclog != iclog); @@ -2874,45 +2803,10 @@ "%s: possible infinite loop (%d iterations)", __func__, flushcnt); } - } while (!ioerrors && loopdidcallbacks); + } while (!ioerror && cycled_icloglock); -#ifdef DEBUG - /* - * Make one last gasp attempt to see if iclogs are being left in limbo. - * If the above loop finds an iclog earlier than the current iclog and - * in one of the syncing states, the current iclog is put into - * DO_CALLBACK and the callbacks are deferred to the completion of the - * earlier iclog. Walk the iclogs in order and make sure that no iclog - * is in DO_CALLBACK unless an earlier iclog is in one of the syncing - * states. - * - * Note that SYNCING|IOABORT is a valid state so we cannot just check - * for ic_state == SYNCING. - */ - if (funcdidcallbacks) { - first_iclog = iclog = log->l_iclog; - do { - ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); - /* - * Terminate the loop if iclogs are found in states - * which will cause other threads to clean up iclogs. 
- * - * SYNCING - i/o completion will go through logs - * DONE_SYNC - interrupt thread should be waiting for - * l_icloglock - * IOERROR - give up hope all ye who enter here - */ - if (iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state & XLOG_STATE_SYNCING || - iclog->ic_state == XLOG_STATE_DONE_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR ) - break; - iclog = iclog->ic_next; - } while (first_iclog != iclog); - } -#endif - - if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) + if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE || + log->l_iclog->ic_state == XLOG_STATE_IOERROR) wake_up_all(&log->l_flush_wait); spin_unlock(&log->l_icloglock); @@ -2934,30 +2828,20 @@ */ STATIC void xlog_state_done_syncing( - xlog_in_core_t *iclog, - int aborted) + struct xlog_in_core *iclog) { - struct xlog *log = iclog->ic_log; + struct xlog *log = iclog->ic_log; spin_lock(&log->l_icloglock); - - ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || - iclog->ic_state == XLOG_STATE_IOERROR); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); - ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); - /* * If we got an error, either on the first buffer, or in the case of - * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, - * and none should ever be attempted to be written to disk - * again. + * split log writes, on the second, we shut down the file system and + * no iclogs should ever be attempted to be written to disk again. */ - if (iclog->ic_state != XLOG_STATE_IOERROR) { - if (--iclog->ic_bwritecnt == 1) { - spin_unlock(&log->l_icloglock); - return; - } + if (!XLOG_FORCED_SHUTDOWN(log)) { + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); iclog->ic_state = XLOG_STATE_DONE_SYNC; } @@ -2968,9 +2852,8 @@ */ wake_up_all(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); - xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ -} /* xlog_state_done_syncing */ - + xlog_state_do_callback(log); +} /* * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must @@ -3002,7 +2885,6 @@ int log_offset; xlog_rec_header_t *head; xlog_in_core_t *iclog; - int error; restart: spin_lock(&log->l_icloglock); @@ -3051,24 +2933,22 @@ * can fit into remaining data section. */ if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + int error = 0; + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); /* - * If I'm the only one writing to this iclog, sync it to disk. - * We need to do an atomic compare and decrement here to avoid - * racing with concurrent atomic_dec_and_lock() calls in + * If we are the only one writing to this iclog, sync it to + * disk. We need to do an atomic compare and decrement here to + * avoid racing with concurrent atomic_dec_and_lock() calls in * xlog_state_release_iclog() when there is more than one * reference to the iclog. */ - if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { - /* we are the only one */ - spin_unlock(&log->l_icloglock); + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) error = xlog_state_release_iclog(log, iclog); - if (error) - return error; - } else { - spin_unlock(&log->l_icloglock); - } + spin_unlock(&log->l_icloglock); + if (error) + return error; goto restart; } @@ -3092,21 +2972,21 @@ *logoffsetp = log_offset; return 0; -} /* xlog_state_get_iclog_space */ +} -/* The first cnt-1 times through here we don't need to - * move the grant write head because the permanent - * reservation has reserved cnt times the unit amount. 
- * Release part of current permanent unit reservation and - * reset current reservation to be one units worth. Also - * move grant reservation head forward. +/* + * The first cnt-1 times a ticket goes through here we don't need to move the + * grant write head because the permanent reservation has reserved cnt times the + * unit amount. Release part of current permanent unit reservation and reset + * current reservation to be one units worth. Also move grant reservation head + * forward. */ -STATIC void -xlog_regrant_reserve_log_space( +void +xfs_log_ticket_regrant( struct xlog *log, struct xlog_ticket *ticket) { - trace_xfs_log_regrant_reserve_enter(log, ticket); + trace_xfs_log_ticket_regrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; @@ -3118,21 +2998,20 @@ ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); - trace_xfs_log_regrant_reserve_sub(log, ticket); + trace_xfs_log_ticket_regrant_sub(log, ticket); /* just return if we still have some of the pre-reserved space */ - if (ticket->t_cnt > 0) - return; + if (!ticket->t_cnt) { + xlog_grant_add_space(log, &log->l_reserve_head.grant, + ticket->t_unit_res); + trace_xfs_log_ticket_regrant_exit(log, ticket); - xlog_grant_add_space(log, &log->l_reserve_head.grant, - ticket->t_unit_res); + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); + } - trace_xfs_log_regrant_reserve_exit(log, ticket); - - ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); -} /* xlog_regrant_reserve_log_space */ - + xfs_log_ticket_put(ticket); +} /* * Give back the space left from a reservation. @@ -3148,18 +3027,19 @@ * space, the count will stay at zero and the only space remaining will be * in the current reservation field. */ -STATIC void -xlog_ungrant_log_space( +void +xfs_log_ticket_ungrant( struct xlog *log, struct xlog_ticket *ticket) { - int bytes; + int bytes; + + trace_xfs_log_ticket_ungrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; - trace_xfs_log_ungrant_enter(log, ticket); - trace_xfs_log_ungrant_sub(log, ticket); + trace_xfs_log_ticket_ungrant_sub(log, ticket); /* * If this is a permanent reservation ticket, we may be able to free @@ -3174,71 +3054,15 @@ xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); - trace_xfs_log_ungrant_exit(log, ticket); + trace_xfs_log_ticket_ungrant_exit(log, ticket); xfs_log_space_wake(log->l_mp); + xfs_log_ticket_put(ticket); } /* - * Flush iclog to disk if this is the last reference to the given iclog and - * the WANT_SYNC bit is set. - * - * When this function is entered, the iclog is not necessarily in the - * WANT_SYNC state. It may be sitting around waiting to get filled. - * - * - */ -STATIC int -xlog_state_release_iclog( - struct xlog *log, - struct xlog_in_core *iclog) -{ - int sync = 0; /* do we sync? 
*/ - - if (iclog->ic_state & XLOG_STATE_IOERROR) - return -EIO; - - ASSERT(atomic_read(&iclog->ic_refcnt) > 0); - if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) - return 0; - - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return -EIO; - } - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_WANT_SYNC); - - if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { - /* update tail before writing to iclog */ - xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); - sync++; - iclog->ic_state = XLOG_STATE_SYNCING; - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); - xlog_verify_tail_lsn(log, iclog, tail_lsn); - /* cycle incremented when incrementing curr_block */ - } - spin_unlock(&log->l_icloglock); - - /* - * We let the log lock go, so it's possible that we hit a log I/O - * error or some other SHUTDOWN condition that marks the iclog - * as XLOG_STATE_IOERROR before the bwrite. However, we know that - * this iclog has consistent data, so we ignore IOERROR - * flags after this point. - */ - if (sync) - return xlog_sync(log, iclog); - return 0; -} /* xlog_state_release_iclog */ - - -/* - * This routine will mark the current iclog in the ring as WANT_SYNC - * and move the current iclog pointer to the next iclog in the ring. - * When this routine is called from xlog_state_get_iclog_space(), the - * exact size of the iclog has not yet been determined. All we know is - * that every data block. We have run out of space in this log record. + * This routine will mark the current iclog in the ring as WANT_SYNC and move + * the current iclog pointer to the next iclog in the ring. */ STATIC void xlog_state_switch_iclogs( @@ -3247,6 +3071,8 @@ int eventual_size) { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + assert_spin_locked(&log->l_icloglock); + if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; @@ -3281,7 +3107,7 @@ } ASSERT(iclog == log->l_iclog); log->l_iclog = iclog->ic_next; -} /* xlog_state_switch_iclogs */ +} /* * Write out all data in the in-core log as of this exact moment in time. @@ -3326,7 +3152,7 @@ spin_lock(&log->l_icloglock); iclog = log->l_iclog; - if (iclog->ic_state & XLOG_STATE_IOERROR) + if (iclog->ic_state == XLOG_STATE_IOERROR) goto out_error; if (iclog->ic_state == XLOG_STATE_DIRTY || @@ -3341,9 +3167,6 @@ * previous iclog and go to sleep. 
*/ iclog = iclog->ic_prev; - if (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { if (atomic_read(&iclog->ic_refcnt) == 0) { /* @@ -3356,14 +3179,10 @@ atomic_inc(&iclog->ic_refcnt); lsn = be64_to_cpu(iclog->ic_header.h_lsn); xlog_state_switch_iclogs(log, iclog, 0); - spin_unlock(&log->l_icloglock); - if (xlog_state_release_iclog(log, iclog)) - return -EIO; + goto out_error; - spin_lock(&log->l_icloglock); - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || - iclog->ic_state == XLOG_STATE_DIRTY) + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) goto out_unlock; } else { /* @@ -3383,17 +3202,8 @@ ; } - if (!(flags & XFS_LOG_SYNC)) - goto out_unlock; - - if (iclog->ic_state & XLOG_STATE_IOERROR) - goto out_error; - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state & XLOG_STATE_IOERROR) - return -EIO; - return 0; - + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3403,19 +3213,18 @@ } static int -__xfs_log_force_lsn( - struct xfs_mount *mp, +xlog_force_lsn( + struct xlog *log, xfs_lsn_t lsn, uint flags, int *log_flushed, bool already_slept) { - struct xlog *log = mp->m_log; struct xlog_in_core *iclog; spin_lock(&log->l_icloglock); iclog = log->l_iclog; - if (iclog->ic_state & XLOG_STATE_IOERROR) + if (iclog->ic_state == XLOG_STATE_IOERROR) goto out_error; while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { @@ -3423,9 +3232,6 @@ if (iclog == log->l_iclog) goto out_unlock; } - - if (iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; if (iclog->ic_state == XLOG_STATE_ACTIVE) { /* @@ -3444,39 +3250,22 @@ * will go out then. */ if (!already_slept && - (iclog->ic_prev->ic_state & - (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { - ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); - - XFS_STATS_INC(mp, xs_log_force_sleep); - + (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) { xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); return -EAGAIN; } atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); - spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) - return -EIO; + goto out_error; if (log_flushed) *log_flushed = 1; - spin_lock(&log->l_icloglock); } - if (!(flags & XFS_LOG_SYNC) || - (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) - goto out_unlock; - - if (iclog->ic_state & XLOG_STATE_IOERROR) - goto out_error; - - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state & XLOG_STATE_IOERROR) - return -EIO; - return 0; - + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3500,54 +3289,31 @@ * to disk, that thread will wake up all threads waiting on the queue. 
*/ int -xfs_log_force_lsn( +xfs_log_force_seq( struct xfs_mount *mp, - xfs_lsn_t lsn, + xfs_csn_t seq, uint flags, int *log_flushed) { + struct xlog *log = mp->m_log; + xfs_lsn_t lsn; int ret; - ASSERT(lsn != 0); + ASSERT(seq != 0); XFS_STATS_INC(mp, xs_log_force); - trace_xfs_log_force(mp, lsn, _RET_IP_); + trace_xfs_log_force(mp, seq, _RET_IP_); - lsn = xlog_cil_force_lsn(mp->m_log, lsn); + lsn = xlog_cil_force_seq(log, seq); if (lsn == NULLCOMMITLSN) return 0; - ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); - if (ret == -EAGAIN) - ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); + ret = xlog_force_lsn(log, lsn, flags, log_flushed, false); + if (ret == -EAGAIN) { + XFS_STATS_INC(mp, xs_log_force_sleep); + ret = xlog_force_lsn(log, lsn, flags, log_flushed, true); + } return ret; } - -/* - * Called when we want to mark the current iclog as being ready to sync to - * disk. - */ -STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog) -{ - assert_spin_locked(&log->l_icloglock); - - if (iclog->ic_state == XLOG_STATE_ACTIVE) { - xlog_state_switch_iclogs(log, iclog, 0); - } else { - ASSERT(iclog->ic_state & - (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); - } -} - - -/***************************************************************************** - * - * TICKET functions - * - ***************************************************************************** - */ /* * Free a used ticket when its refcount falls to zero. @@ -3558,7 +3324,7 @@ { ASSERT(atomic_read(&ticket->t_ref) > 0); if (atomic_dec_and_test(&ticket->t_ref)) - kmem_zone_free(xfs_log_ticket_zone, ticket); + kmem_cache_free(xfs_log_ticket_zone, ticket); } xlog_ticket_t * @@ -3676,15 +3442,12 @@ int unit_bytes, int cnt, char client, - bool permanent, - xfs_km_flags_t alloc_flags) + bool permanent) { struct xlog_ticket *tic; int unit_res; - tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); - if (!tic) - return NULL; + tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL); unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); @@ -3697,7 +3460,6 @@ tic->t_ocnt = cnt; tic->t_tid = prandom_u32(); tic->t_clientid = client; - tic->t_flags = XLOG_TIC_INITED; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; @@ -3706,13 +3468,6 @@ return tic; } - -/****************************************************************************** - * - * Log debug routines - * - ****************************************************************************** - */ #if defined(DEBUG) /* * Make sure that the destination ptr is within the valid data region of @@ -3798,7 +3553,7 @@ if (blocks < BTOBB(iclog->ic_offset) + 1) xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); } -} /* xlog_verify_tail_lsn */ +} /* * Perform a number of checks on the iclog before writing to disk. 
@@ -3819,8 +3574,7 @@ xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, - int count, - bool syncing) + int count) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; @@ -3864,7 +3618,7 @@ /* clientid is only 1 byte */ p = &ophead->oh_clientid; field_offset = p - base_ptr; - if (!syncing || (field_offset & 0x1ff)) { + if (field_offset & 0x1ff) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); @@ -3887,7 +3641,7 @@ /* check length */ p = &ophead->oh_len; field_offset = p - base_ptr; - if (!syncing || (field_offset & 0x1ff)) { + if (field_offset & 0x1ff) { op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((uintptr_t)&ophead->oh_len - @@ -3902,7 +3656,7 @@ } ptr += sizeof(xlog_op_header_t) + op_len; } -} /* xlog_verify_iclog */ +} #endif /* @@ -3915,7 +3669,7 @@ xlog_in_core_t *iclog, *ic; iclog = log->l_iclog; - if (! (iclog->ic_state & XLOG_STATE_IOERROR)) { + if (iclog->ic_state != XLOG_STATE_IOERROR) { /* * Mark all the incore logs IOERROR. * From now on, no log flushes will result. @@ -3975,7 +3729,7 @@ * Somebody could've already done the hard work for us. * No need to get locks for this. */ - if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { + if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) { ASSERT(XLOG_FORCED_SHUTDOWN(log)); return 1; } @@ -4026,21 +3780,8 @@ spin_lock(&log->l_cilp->xc_push_lock); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); - xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); + xlog_state_do_callback(log); -#ifdef XFSERRORDEBUG - { - xlog_in_core_t *iclog; - - spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - do { - ASSERT(iclog->ic_callback == 0); - iclog = iclog->ic_next; - } while (iclog != log->l_iclog); - spin_unlock(&log->l_icloglock); - } -#endif /* return non-zero if log IOERROR transition had already happened */ return retval; } -- Gitblit v1.6.2
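
Editor's note: the hunks above replace the old open-coded callback walk with xlog_state_do_iclog_callbacks(), which splices the pending callback list out of the iclog while holding the callback lock, drops the lock to run the callbacks, and then retakes the lock to re-check for late additions. The short userspace sketch below is an assumption-laden illustration of that locking pattern only, not the kernel code: every demo_* name is invented, pthread mutexes stand in for the ic_callback_lock spinlock, and a plain singly linked list stands in for the kernel list_head machinery.

/*
 * Illustrative userspace analogue of the "splice, drop the lock, run
 * callbacks, retake the lock and re-check" pattern.  Names are hypothetical.
 * Build with: cc -pthread demo.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_callback {
	struct demo_callback	*next;
	void			(*fn)(void *arg);
	void			*arg;
};

struct demo_iclog {
	pthread_mutex_t		cb_lock;	/* plays the role of ic_callback_lock */
	struct demo_callback	*callbacks;	/* plays the role of ic_callbacks */
};

static void demo_run_callbacks(struct demo_iclog *iclog)
{
	pthread_mutex_lock(&iclog->cb_lock);
	while (iclog->callbacks) {
		/* Splice the whole pending list out while the lock is held. */
		struct demo_callback *head = iclog->callbacks;

		iclog->callbacks = NULL;
		pthread_mutex_unlock(&iclog->cb_lock);

		/* Run the callbacks without holding the lock. */
		while (head) {
			struct demo_callback *next = head->next;

			head->fn(head->arg);
			free(head);
			head = next;
		}

		/* Re-check: more callbacks may have been queued meanwhile. */
		pthread_mutex_lock(&iclog->cb_lock);
	}
	pthread_mutex_unlock(&iclog->cb_lock);
}

static void demo_print(void *arg)
{
	printf("callback ran: %s\n", (const char *)arg);
}

int main(void)
{
	struct demo_iclog iclog = {
		.cb_lock = PTHREAD_MUTEX_INITIALIZER,
		.callbacks = NULL,
	};
	struct demo_callback *cb = malloc(sizeof(*cb));

	if (!cb)
		return 1;
	cb->next = NULL;
	cb->fn = demo_print;
	cb->arg = "transaction committed";

	/* Queue one callback, then drain the list. */
	pthread_mutex_lock(&iclog.cb_lock);
	cb->next = iclog.callbacks;
	iclog.callbacks = cb;
	pthread_mutex_unlock(&iclog.cb_lock);

	demo_run_callbacks(&iclog);
	return 0;
}

The design point, per the comment added in the patch, is that holding the lock across potentially thousands of completion callbacks would cause excessive contention; splicing the list out lets new callbacks keep queuing while the current batch runs, and the outer loop guarantees none are missed before the iclog is marked dirty.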