.. | .. |
---|
16 | 16 | #include "xfs_trans_priv.h" |
---|
17 | 17 | #include "xfs_log.h" |
---|
18 | 18 | #include "xfs_log_priv.h" |
---|
19 | | -#include "xfs_log_recover.h" |
---|
20 | | -#include "xfs_inode.h" |
---|
21 | 19 | #include "xfs_trace.h" |
---|
22 | | -#include "xfs_fsops.h" |
---|
23 | | -#include "xfs_cksum.h" |
---|
24 | 20 | #include "xfs_sysfs.h" |
---|
25 | 21 | #include "xfs_sb.h" |
---|
| 22 | +#include "xfs_health.h" |
---|
26 | 23 | |
---|
27 | 24 | kmem_zone_t *xfs_log_ticket_zone; |
---|
28 | 25 | |
---|
29 | 26 | /* Local miscellaneous function prototypes */ |
---|
30 | | -STATIC int |
---|
31 | | -xlog_commit_record( |
---|
32 | | - struct xlog *log, |
---|
33 | | - struct xlog_ticket *ticket, |
---|
34 | | - struct xlog_in_core **iclog, |
---|
35 | | - xfs_lsn_t *commitlsnp); |
---|
36 | | - |
---|
37 | 27 | STATIC struct xlog * |
---|
38 | 28 | xlog_alloc_log( |
---|
39 | 29 | struct xfs_mount *mp, |
---|
.. | .. |
---|
44 | 34 | xlog_space_left( |
---|
45 | 35 | struct xlog *log, |
---|
46 | 36 | atomic64_t *head); |
---|
47 | | -STATIC int |
---|
48 | | -xlog_sync( |
---|
49 | | - struct xlog *log, |
---|
50 | | - struct xlog_in_core *iclog); |
---|
51 | 37 | STATIC void |
---|
52 | 38 | xlog_dealloc_log( |
---|
53 | 39 | struct xlog *log); |
---|
54 | 40 | |
---|
55 | 41 | /* local state machine functions */ |
---|
56 | | -STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); |
---|
57 | | -STATIC void |
---|
58 | | -xlog_state_do_callback( |
---|
59 | | - struct xlog *log, |
---|
60 | | - int aborted, |
---|
| 42 | +STATIC void xlog_state_done_syncing( |
---|
61 | 43 | struct xlog_in_core *iclog); |
---|
62 | 44 | STATIC int |
---|
63 | 45 | xlog_state_get_iclog_space( |
---|
.. | .. |
---|
67 | 49 | struct xlog_ticket *ticket, |
---|
68 | 50 | int *continued_write, |
---|
69 | 51 | int *logoffsetp); |
---|
70 | | -STATIC int |
---|
71 | | -xlog_state_release_iclog( |
---|
72 | | - struct xlog *log, |
---|
73 | | - struct xlog_in_core *iclog); |
---|
74 | 52 | STATIC void |
---|
75 | 53 | xlog_state_switch_iclogs( |
---|
76 | 54 | struct xlog *log, |
---|
77 | 55 | struct xlog_in_core *iclog, |
---|
78 | 56 | int eventual_size); |
---|
79 | 57 | STATIC void |
---|
80 | | -xlog_state_want_sync( |
---|
81 | | - struct xlog *log, |
---|
82 | | - struct xlog_in_core *iclog); |
---|
83 | | - |
---|
84 | | -STATIC void |
---|
85 | 58 | xlog_grant_push_ail( |
---|
86 | 59 | struct xlog *log, |
---|
87 | 60 | int need_bytes); |
---|
88 | 61 | STATIC void |
---|
89 | | -xlog_regrant_reserve_log_space( |
---|
| 62 | +xlog_sync( |
---|
90 | 63 | struct xlog *log, |
---|
91 | | - struct xlog_ticket *ticket); |
---|
92 | | -STATIC void |
---|
93 | | -xlog_ungrant_log_space( |
---|
94 | | - struct xlog *log, |
---|
95 | | - struct xlog_ticket *ticket); |
---|
96 | | - |
---|
| 64 | + struct xlog_in_core *iclog); |
---|
97 | 65 | #if defined(DEBUG) |
---|
98 | 66 | STATIC void |
---|
99 | 67 | xlog_verify_dest_ptr( |
---|
.. | .. |
---|
106 | 74 | xlog_verify_iclog( |
---|
107 | 75 | struct xlog *log, |
---|
108 | 76 | struct xlog_in_core *iclog, |
---|
109 | | - int count, |
---|
110 | | - bool syncing); |
---|
| 77 | + int count); |
---|
111 | 78 | STATIC void |
---|
112 | 79 | xlog_verify_tail_lsn( |
---|
113 | 80 | struct xlog *log, |
---|
.. | .. |
---|
116 | 83 | #else |
---|
117 | 84 | #define xlog_verify_dest_ptr(a,b) |
---|
118 | 85 | #define xlog_verify_grant_tail(a) |
---|
119 | | -#define xlog_verify_iclog(a,b,c,d) |
---|
| 86 | +#define xlog_verify_iclog(a,b,c) |
---|
120 | 87 | #define xlog_verify_tail_lsn(a,b,c) |
---|
121 | 88 | #endif |
---|
122 | 89 | |
---|
.. | .. |
---|
225 | 192 | { |
---|
226 | 193 | struct xlog_ticket *tic; |
---|
227 | 194 | int need_bytes; |
---|
| 195 | + bool woken_task = false; |
---|
228 | 196 | |
---|
229 | 197 | list_for_each_entry(tic, &head->waiters, t_queue) { |
---|
| 198 | + |
---|
| 199 | + /* |
---|
| 200 | + * There is a chance that the size of the CIL checkpoints in |
---|
| 201 | + * progress at the last AIL push target calculation resulted in |
---|
| 202 | + * limiting the target to the log head (l_last_sync_lsn) at the |
---|
| 203 | + * time. This may not reflect where the log head is now as the |
---|
| 204 | + * CIL checkpoints may have completed. |
---|
| 205 | + * |
---|
| 206 | + * Hence when we are woken here, it may be the head of the |
---|
| 207 | + * log that has moved rather than the tail. As the tail didn't |
---|
| 208 | + * move, there still won't be space available for the |
---|
| 209 | + * reservation we require. However, if the AIL has already |
---|
| 210 | + * pushed to the target defined by the old log head location, we |
---|
| 211 | + * will hang here waiting for something else to update the AIL |
---|
| 212 | + * push target. |
---|
| 213 | + * |
---|
| 214 | + * Therefore, if there isn't space to wake the first waiter on |
---|
| 215 | + * the grant head, we need to push the AIL again to ensure the |
---|
| 216 | + * target reflects both the current log tail and log head |
---|
| 217 | + * position before we wait for the tail to move again. |
---|
| 218 | + */ |
---|
| 219 | + |
---|
230 | 220 | need_bytes = xlog_ticket_reservation(log, head, tic); |
---|
231 | | - if (*free_bytes < need_bytes) |
---|
| 221 | + if (*free_bytes < need_bytes) { |
---|
| 222 | + if (!woken_task) |
---|
| 223 | + xlog_grant_push_ail(log, need_bytes); |
---|
232 | 224 | return false; |
---|
| 225 | + } |
---|
233 | 226 | |
---|
234 | 227 | *free_bytes -= need_bytes; |
---|
235 | 228 | trace_xfs_log_grant_wake_up(log, tic); |
---|
236 | 229 | wake_up_process(tic->t_task); |
---|
| 230 | + woken_task = true; |
---|
237 | 231 | } |
---|
238 | 232 | |
---|
239 | 233 | return true; |
---|
.. | .. |
---|
353 | 347 | tic->t_res_num++; |
---|
354 | 348 | } |
---|
355 | 349 | |
---|
| 350 | +bool |
---|
| 351 | +xfs_log_writable( |
---|
| 352 | + struct xfs_mount *mp) |
---|
| 353 | +{ |
---|
| 354 | + /* |
---|
| 355 | + * Never write to the log on norecovery mounts, if the block device is |
---|
| 356 | + * read-only, or if the filesystem is shutdown. Read-only mounts still |
---|
| 357 | + * allow internal writes for log recovery and unmount purposes, so don't |
---|
| 358 | + * restrict that case here. |
---|
| 359 | + */ |
---|
| 360 | + if (mp->m_flags & XFS_MOUNT_NORECOVERY) |
---|
| 361 | + return false; |
---|
| 362 | + if (xfs_readonly_buftarg(mp->m_log->l_targ)) |
---|
| 363 | + return false; |
---|
| 364 | + if (XFS_FORCED_SHUTDOWN(mp)) |
---|
| 365 | + return false; |
---|
| 366 | + return true; |
---|
| 367 | +} |
---|
| 368 | + |
---|
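The new xfs_log_writable() helper centralises the "may we issue internal log writes?" checks; the xfs_log_unmount_write() hunk further down uses it exactly this way. A minimal usage sketch, where the example_internal_log_write() wrapper is hypothetical and only xfs_log_writable() comes from this patch:

    /*
     * Hypothetical caller: bail out early when internal log writes are not
     * allowed (norecovery mount, read-only log device, or a shut down fs).
     */
    static void example_internal_log_write(struct xfs_mount *mp)
    {
    	if (!xfs_log_writable(mp))
    		return;

    	/* ... safe to write e.g. an unmount or log cover record here ... */
    }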
356 | 369 | /* |
---|
357 | 370 | * Replenish the byte reservation required by moving the grant write head. |
---|
358 | 371 | */ |
---|
.. | .. |
---|
439 | 452 | XFS_STATS_INC(mp, xs_try_logspace); |
---|
440 | 453 | |
---|
441 | 454 | ASSERT(*ticp == NULL); |
---|
442 | | - tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, |
---|
443 | | - KM_SLEEP | KM_MAYFAIL); |
---|
444 | | - if (!tic) |
---|
445 | | - return -ENOMEM; |
---|
446 | | - |
---|
| 455 | + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent); |
---|
447 | 456 | *ticp = tic; |
---|
448 | 457 | |
---|
449 | 458 | xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt |
---|
.. | .. |
---|
473 | 482 | return error; |
---|
474 | 483 | } |
---|
475 | 484 | |
---|
476 | | - |
---|
477 | | -/* |
---|
478 | | - * NOTES: |
---|
479 | | - * |
---|
480 | | - * 1. currblock field gets updated at startup and after in-core logs |
---|
481 | | - * marked as with WANT_SYNC. |
---|
482 | | - */ |
---|
483 | | - |
---|
484 | | -/* |
---|
485 | | - * This routine is called when a user of a log manager ticket is done with |
---|
486 | | - * the reservation. If the ticket was ever used, then a commit record for |
---|
487 | | - * the associated transaction is written out as a log operation header with |
---|
488 | | - * no data. The flag XLOG_TIC_INITED is set when the first write occurs with |
---|
489 | | - * a given ticket. If the ticket was one with a permanent reservation, then |
---|
490 | | - * a few operations are done differently. Permanent reservation tickets by |
---|
491 | | - * default don't release the reservation. They just commit the current |
---|
492 | | - * transaction with the belief that the reservation is still needed. A flag |
---|
493 | | - * must be passed in before permanent reservations are actually released. |
---|
494 | | - * When these type of tickets are not released, they need to be set into |
---|
495 | | - * the inited state again. By doing this, a start record will be written |
---|
496 | | - * out when the next write occurs. |
---|
497 | | - */ |
---|
498 | | -xfs_lsn_t |
---|
499 | | -xfs_log_done( |
---|
500 | | - struct xfs_mount *mp, |
---|
501 | | - struct xlog_ticket *ticket, |
---|
502 | | - struct xlog_in_core **iclog, |
---|
503 | | - bool regrant) |
---|
504 | | -{ |
---|
505 | | - struct xlog *log = mp->m_log; |
---|
506 | | - xfs_lsn_t lsn = 0; |
---|
507 | | - |
---|
508 | | - if (XLOG_FORCED_SHUTDOWN(log) || |
---|
509 | | - /* |
---|
510 | | - * If nothing was ever written, don't write out commit record. |
---|
511 | | - * If we get an error, just continue and give back the log ticket. |
---|
512 | | - */ |
---|
513 | | - (((ticket->t_flags & XLOG_TIC_INITED) == 0) && |
---|
514 | | - (xlog_commit_record(log, ticket, iclog, &lsn)))) { |
---|
515 | | - lsn = (xfs_lsn_t) -1; |
---|
516 | | - regrant = false; |
---|
517 | | - } |
---|
518 | | - |
---|
519 | | - |
---|
520 | | - if (!regrant) { |
---|
521 | | - trace_xfs_log_done_nonperm(log, ticket); |
---|
522 | | - |
---|
523 | | - /* |
---|
524 | | - * Release ticket if not permanent reservation or a specific |
---|
525 | | - * request has been made to release a permanent reservation. |
---|
526 | | - */ |
---|
527 | | - xlog_ungrant_log_space(log, ticket); |
---|
528 | | - } else { |
---|
529 | | - trace_xfs_log_done_perm(log, ticket); |
---|
530 | | - |
---|
531 | | - xlog_regrant_reserve_log_space(log, ticket); |
---|
532 | | - /* If this ticket was a permanent reservation and we aren't |
---|
533 | | - * trying to release it, reset the inited flags; so next time |
---|
534 | | - * we write, a start record will be written out. |
---|
535 | | - */ |
---|
536 | | - ticket->t_flags |= XLOG_TIC_INITED; |
---|
537 | | - } |
---|
538 | | - |
---|
539 | | - xfs_log_ticket_put(ticket); |
---|
540 | | - return lsn; |
---|
541 | | -} |
---|
542 | | - |
---|
543 | | -/* |
---|
544 | | - * Attaches a new iclog I/O completion callback routine during |
---|
545 | | - * transaction commit. If the log is in error state, a non-zero |
---|
546 | | - * return code is handed back and the caller is responsible for |
---|
547 | | - * executing the callback at an appropriate time. |
---|
548 | | - */ |
---|
549 | | -int |
---|
550 | | -xfs_log_notify( |
---|
551 | | - struct xlog_in_core *iclog, |
---|
552 | | - xfs_log_callback_t *cb) |
---|
553 | | -{ |
---|
554 | | - int abortflg; |
---|
555 | | - |
---|
556 | | - spin_lock(&iclog->ic_callback_lock); |
---|
557 | | - abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); |
---|
558 | | - if (!abortflg) { |
---|
559 | | - ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || |
---|
560 | | - (iclog->ic_state == XLOG_STATE_WANT_SYNC)); |
---|
561 | | - cb->cb_next = NULL; |
---|
562 | | - *(iclog->ic_callback_tail) = cb; |
---|
563 | | - iclog->ic_callback_tail = &(cb->cb_next); |
---|
564 | | - } |
---|
565 | | - spin_unlock(&iclog->ic_callback_lock); |
---|
566 | | - return abortflg; |
---|
567 | | -} |
---|
568 | | - |
---|
569 | | -int |
---|
570 | | -xfs_log_release_iclog( |
---|
571 | | - struct xfs_mount *mp, |
---|
| 485 | +static bool |
---|
| 486 | +__xlog_state_release_iclog( |
---|
| 487 | + struct xlog *log, |
---|
572 | 488 | struct xlog_in_core *iclog) |
---|
573 | 489 | { |
---|
574 | | - if (xlog_state_release_iclog(mp->m_log, iclog)) { |
---|
575 | | - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); |
---|
| 490 | + lockdep_assert_held(&log->l_icloglock); |
---|
| 491 | + |
---|
| 492 | + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { |
---|
| 493 | + /* update tail before writing to iclog */ |
---|
| 494 | + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); |
---|
| 495 | + |
---|
| 496 | + iclog->ic_state = XLOG_STATE_SYNCING; |
---|
| 497 | + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); |
---|
| 498 | + xlog_verify_tail_lsn(log, iclog, tail_lsn); |
---|
| 499 | + /* cycle incremented when incrementing curr_block */ |
---|
| 500 | + return true; |
---|
| 501 | + } |
---|
| 502 | + |
---|
| 503 | + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); |
---|
| 504 | + return false; |
---|
| 505 | +} |
---|
| 506 | + |
---|
| 507 | +/* |
---|
| 508 | + * Flush iclog to disk if this is the last reference to the given iclog and |
---|
| 509 | + * it is in the WANT_SYNC state. |
---|
| 510 | + */ |
---|
| 511 | +static int |
---|
| 512 | +xlog_state_release_iclog( |
---|
| 513 | + struct xlog *log, |
---|
| 514 | + struct xlog_in_core *iclog) |
---|
| 515 | +{ |
---|
| 516 | + lockdep_assert_held(&log->l_icloglock); |
---|
| 517 | + |
---|
| 518 | + if (iclog->ic_state == XLOG_STATE_IOERROR) |
---|
576 | 519 | return -EIO; |
---|
| 520 | + |
---|
| 521 | + if (atomic_dec_and_test(&iclog->ic_refcnt) && |
---|
| 522 | + __xlog_state_release_iclog(log, iclog)) { |
---|
| 523 | + spin_unlock(&log->l_icloglock); |
---|
| 524 | + xlog_sync(log, iclog); |
---|
| 525 | + spin_lock(&log->l_icloglock); |
---|
577 | 526 | } |
---|
578 | 527 | |
---|
579 | 528 | return 0; |
---|
| 529 | +} |
---|
| 530 | + |
---|
| 531 | +void |
---|
| 532 | +xfs_log_release_iclog( |
---|
| 533 | + struct xlog_in_core *iclog) |
---|
| 534 | +{ |
---|
| 535 | + struct xlog *log = iclog->ic_log; |
---|
| 536 | + bool sync = false; |
---|
| 537 | + |
---|
| 538 | + if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { |
---|
| 539 | + if (iclog->ic_state != XLOG_STATE_IOERROR) |
---|
| 540 | + sync = __xlog_state_release_iclog(log, iclog); |
---|
| 541 | + spin_unlock(&log->l_icloglock); |
---|
| 542 | + } |
---|
| 543 | + |
---|
| 544 | + if (sync) |
---|
| 545 | + xlog_sync(log, iclog); |
---|
580 | 546 | } |
---|
581 | 547 | |
---|
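Note the locking pattern in the reworked xfs_log_release_iclog(): atomic_dec_and_lock() only takes l_icloglock when the reference count actually reaches zero, so the common "not the last reference" put stays lock-free, and the heavyweight xlog_sync() call is issued after the lock is dropped. A generic sketch of that pattern, with illustrative obj/obj_put/heavy_work names that are not from the patch:

    #include <linux/atomic.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct obj {
    	atomic_t	refcnt;
    	spinlock_t	lock;
    	bool		needs_sync;
    };

    static void heavy_work(struct obj *o);	/* stand-in for e.g. xlog_sync() */

    /* Drop a reference; only the final put pays for the lock. */
    static void obj_put(struct obj *o)
    {
    	bool sync = false;

    	if (atomic_dec_and_lock(&o->refcnt, &o->lock)) {
    		/* last reference: state transitions happen under the lock */
    		sync = o->needs_sync;
    		spin_unlock(&o->lock);
    	}

    	if (sync)
    		heavy_work(o);	/* done without the lock held */
    }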
582 | 548 | /* |
---|
.. | .. |
---|
799 | 765 | if (readonly) |
---|
800 | 766 | mp->m_flags |= XFS_MOUNT_RDONLY; |
---|
801 | 767 | |
---|
| 768 | + /* Make sure the log is dead if we're returning failure. */ |
---|
| 769 | + ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR)); |
---|
| 770 | + |
---|
802 | 771 | return error; |
---|
803 | 772 | } |
---|
804 | 773 | |
---|
.. | .. |
---|
806 | 775 | * The mount has failed. Cancel the recovery if it hasn't completed and destroy |
---|
807 | 776 | * the log. |
---|
808 | 777 | */ |
---|
809 | | -int |
---|
| 778 | +void |
---|
810 | 779 | xfs_log_mount_cancel( |
---|
811 | 780 | struct xfs_mount *mp) |
---|
812 | 781 | { |
---|
813 | | - int error; |
---|
814 | | - |
---|
815 | | - error = xlog_recover_cancel(mp->m_log); |
---|
| 782 | + xlog_recover_cancel(mp->m_log); |
---|
816 | 783 | xfs_log_unmount(mp); |
---|
817 | | - |
---|
818 | | - return error; |
---|
819 | 784 | } |
---|
820 | 785 | |
---|
821 | 786 | /* |
---|
822 | | - * Final log writes as part of unmount. |
---|
823 | | - * |
---|
824 | | - * Mark the filesystem clean as unmount happens. Note that during relocation |
---|
825 | | - * this routine needs to be executed as part of source-bag while the |
---|
826 | | - * deallocation must not be done until source-end. |
---|
| 787 | + * Wait for the iclog to be written to disk, or return an error if the log has been |
---|
| 788 | + * shut down. |
---|
827 | 789 | */ |
---|
828 | | - |
---|
829 | | -/* Actually write the unmount record to disk. */ |
---|
830 | | -static void |
---|
831 | | -xfs_log_write_unmount_record( |
---|
832 | | - struct xfs_mount *mp) |
---|
| 790 | +static int |
---|
| 791 | +xlog_wait_on_iclog( |
---|
| 792 | + struct xlog_in_core *iclog) |
---|
| 793 | + __releases(iclog->ic_log->l_icloglock) |
---|
833 | 794 | { |
---|
834 | | - /* the data section must be 32 bit size aligned */ |
---|
835 | | - struct xfs_unmount_log_format magic = { |
---|
| 795 | + struct xlog *log = iclog->ic_log; |
---|
| 796 | + |
---|
| 797 | + if (!XLOG_FORCED_SHUTDOWN(log) && |
---|
| 798 | + iclog->ic_state != XLOG_STATE_ACTIVE && |
---|
| 799 | + iclog->ic_state != XLOG_STATE_DIRTY) { |
---|
| 800 | + XFS_STATS_INC(log->l_mp, xs_log_force_sleep); |
---|
| 801 | + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); |
---|
| 802 | + } else { |
---|
| 803 | + spin_unlock(&log->l_icloglock); |
---|
| 804 | + } |
---|
| 805 | + |
---|
| 806 | + if (XLOG_FORCED_SHUTDOWN(log)) |
---|
| 807 | + return -EIO; |
---|
| 808 | + return 0; |
---|
| 809 | +} |
---|
| 810 | + |
---|
| 811 | +/* |
---|
| 812 | + * Write out an unmount record using the ticket provided. We have to account for |
---|
| 813 | + * the data space used in the unmount ticket as this write is not done from a |
---|
| 814 | + * transaction context that has already done the accounting for us. |
---|
| 815 | + */ |
---|
| 816 | +static int |
---|
| 817 | +xlog_write_unmount_record( |
---|
| 818 | + struct xlog *log, |
---|
| 819 | + struct xlog_ticket *ticket, |
---|
| 820 | + xfs_lsn_t *lsn, |
---|
| 821 | + uint flags) |
---|
| 822 | +{ |
---|
| 823 | + struct xfs_unmount_log_format ulf = { |
---|
836 | 824 | .magic = XLOG_UNMOUNT_TYPE, |
---|
837 | 825 | }; |
---|
838 | 826 | struct xfs_log_iovec reg = { |
---|
839 | | - .i_addr = &magic, |
---|
840 | | - .i_len = sizeof(magic), |
---|
| 827 | + .i_addr = &ulf, |
---|
| 828 | + .i_len = sizeof(ulf), |
---|
841 | 829 | .i_type = XLOG_REG_TYPE_UNMOUNT, |
---|
842 | 830 | }; |
---|
843 | 831 | struct xfs_log_vec vec = { |
---|
844 | 832 | .lv_niovecs = 1, |
---|
845 | 833 | .lv_iovecp = ®, |
---|
846 | 834 | }; |
---|
847 | | - struct xlog *log = mp->m_log; |
---|
| 835 | + |
---|
| 836 | + /* account for space used by record data */ |
---|
| 837 | + ticket->t_curr_res -= sizeof(ulf); |
---|
| 838 | + return xlog_write(log, &vec, ticket, lsn, NULL, flags, false); |
---|
| 839 | +} |
---|
| 840 | + |
---|
| 841 | +/* |
---|
| 842 | + * Mark the filesystem clean by writing an unmount record to the head of the |
---|
| 843 | + * log. |
---|
| 844 | + */ |
---|
| 845 | +static void |
---|
| 846 | +xlog_unmount_write( |
---|
| 847 | + struct xlog *log) |
---|
| 848 | +{ |
---|
| 849 | + struct xfs_mount *mp = log->l_mp; |
---|
848 | 850 | struct xlog_in_core *iclog; |
---|
849 | 851 | struct xlog_ticket *tic = NULL; |
---|
850 | 852 | xfs_lsn_t lsn; |
---|
.. | .. |
---|
855 | 857 | if (error) |
---|
856 | 858 | goto out_err; |
---|
857 | 859 | |
---|
858 | | - /* |
---|
859 | | - * If we think the summary counters are bad, clear the unmount header |
---|
860 | | - * flag in the unmount record so that the summary counters will be |
---|
861 | | - * recalculated during log recovery at next mount. Refer to |
---|
862 | | - * xlog_check_unmount_rec for more details. |
---|
863 | | - */ |
---|
864 | | - if (XFS_TEST_ERROR((mp->m_flags & XFS_MOUNT_BAD_SUMMARY), mp, |
---|
865 | | - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { |
---|
866 | | - xfs_alert(mp, "%s: will fix summary counters at next mount", |
---|
867 | | - __func__); |
---|
868 | | - flags &= ~XLOG_UNMOUNT_TRANS; |
---|
869 | | - } |
---|
870 | | - |
---|
871 | | - /* remove inited flag, and account for space used */ |
---|
872 | | - tic->t_flags = 0; |
---|
873 | | - tic->t_curr_res -= sizeof(magic); |
---|
874 | | - error = xlog_write(log, &vec, tic, &lsn, NULL, flags); |
---|
| 860 | + error = xlog_write_unmount_record(log, tic, &lsn, flags); |
---|
875 | 861 | /* |
---|
876 | 862 | * At this point, we're umounting anyway, so there's no point in |
---|
877 | 863 | * transitioning log state to IOERROR. Just continue... |
---|
.. | .. |
---|
883 | 869 | spin_lock(&log->l_icloglock); |
---|
884 | 870 | iclog = log->l_iclog; |
---|
885 | 871 | atomic_inc(&iclog->ic_refcnt); |
---|
886 | | - xlog_state_want_sync(log, iclog); |
---|
887 | | - spin_unlock(&log->l_icloglock); |
---|
| 872 | + if (iclog->ic_state == XLOG_STATE_ACTIVE) |
---|
| 873 | + xlog_state_switch_iclogs(log, iclog, 0); |
---|
| 874 | + else |
---|
| 875 | + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || |
---|
| 876 | + iclog->ic_state == XLOG_STATE_IOERROR); |
---|
888 | 877 | error = xlog_state_release_iclog(log, iclog); |
---|
889 | | - |
---|
890 | | - spin_lock(&log->l_icloglock); |
---|
891 | | - switch (iclog->ic_state) { |
---|
892 | | - default: |
---|
893 | | - if (!XLOG_FORCED_SHUTDOWN(log)) { |
---|
894 | | - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); |
---|
895 | | - break; |
---|
896 | | - } |
---|
897 | | - /* fall through */ |
---|
898 | | - case XLOG_STATE_ACTIVE: |
---|
899 | | - case XLOG_STATE_DIRTY: |
---|
900 | | - spin_unlock(&log->l_icloglock); |
---|
901 | | - break; |
---|
902 | | - } |
---|
| 878 | + xlog_wait_on_iclog(iclog); |
---|
903 | 879 | |
---|
904 | 880 | if (tic) { |
---|
905 | 881 | trace_xfs_log_umount_write(log, tic); |
---|
906 | | - xlog_ungrant_log_space(log, tic); |
---|
907 | | - xfs_log_ticket_put(tic); |
---|
| 882 | + xfs_log_ticket_ungrant(log, tic); |
---|
908 | 883 | } |
---|
| 884 | +} |
---|
| 885 | + |
---|
| 886 | +static void |
---|
| 887 | +xfs_log_unmount_verify_iclog( |
---|
| 888 | + struct xlog *log) |
---|
| 889 | +{ |
---|
| 890 | + struct xlog_in_core *iclog = log->l_iclog; |
---|
| 891 | + |
---|
| 892 | + do { |
---|
| 893 | + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); |
---|
| 894 | + ASSERT(iclog->ic_offset == 0); |
---|
| 895 | + } while ((iclog = iclog->ic_next) != log->l_iclog); |
---|
909 | 896 | } |
---|
910 | 897 | |
---|
911 | 898 | /* |
---|
.. | .. |
---|
915 | 902 | * currently architecture converted and "Unmount" is a bit foo. |
---|
916 | 903 | * As far as I know, there weren't any dependencies on the old behaviour. |
---|
917 | 904 | */ |
---|
918 | | - |
---|
919 | | -static int |
---|
920 | | -xfs_log_unmount_write(xfs_mount_t *mp) |
---|
| 905 | +static void |
---|
| 906 | +xfs_log_unmount_write( |
---|
| 907 | + struct xfs_mount *mp) |
---|
921 | 908 | { |
---|
922 | | - struct xlog *log = mp->m_log; |
---|
923 | | - xlog_in_core_t *iclog; |
---|
924 | | -#ifdef DEBUG |
---|
925 | | - xlog_in_core_t *first_iclog; |
---|
926 | | -#endif |
---|
927 | | - int error; |
---|
| 909 | + struct xlog *log = mp->m_log; |
---|
| 910 | + |
---|
| 911 | + if (!xfs_log_writable(mp)) |
---|
| 912 | + return; |
---|
| 913 | + |
---|
| 914 | + xfs_log_force(mp, XFS_LOG_SYNC); |
---|
| 915 | + |
---|
| 916 | + if (XLOG_FORCED_SHUTDOWN(log)) |
---|
| 917 | + return; |
---|
928 | 918 | |
---|
929 | 919 | /* |
---|
930 | | - * Don't write out unmount record on norecovery mounts or ro devices. |
---|
931 | | - * Or, if we are doing a forced umount (typically because of IO errors). |
---|
| 920 | + * If we think the summary counters are bad, avoid writing the unmount |
---|
| 921 | + * record to force log recovery at next mount, after which the summary |
---|
| 922 | + * counters will be recalculated. Refer to xlog_check_unmount_rec for |
---|
| 923 | + * more details. |
---|
932 | 924 | */ |
---|
933 | | - if (mp->m_flags & XFS_MOUNT_NORECOVERY || |
---|
934 | | - xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { |
---|
935 | | - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); |
---|
936 | | - return 0; |
---|
| 925 | + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, |
---|
| 926 | + XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { |
---|
| 927 | + xfs_alert(mp, "%s: will fix summary counters at next mount", |
---|
| 928 | + __func__); |
---|
| 929 | + return; |
---|
937 | 930 | } |
---|
938 | 931 | |
---|
939 | | - error = xfs_log_force(mp, XFS_LOG_SYNC); |
---|
940 | | - ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); |
---|
941 | | - |
---|
942 | | -#ifdef DEBUG |
---|
943 | | - first_iclog = iclog = log->l_iclog; |
---|
944 | | - do { |
---|
945 | | - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { |
---|
946 | | - ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE); |
---|
947 | | - ASSERT(iclog->ic_offset == 0); |
---|
948 | | - } |
---|
949 | | - iclog = iclog->ic_next; |
---|
950 | | - } while (iclog != first_iclog); |
---|
951 | | -#endif |
---|
952 | | - if (! (XLOG_FORCED_SHUTDOWN(log))) { |
---|
953 | | - xfs_log_write_unmount_record(mp); |
---|
954 | | - } else { |
---|
955 | | - /* |
---|
956 | | - * We're already in forced_shutdown mode, couldn't |
---|
957 | | - * even attempt to write out the unmount transaction. |
---|
958 | | - * |
---|
959 | | - * Go through the motions of sync'ing and releasing |
---|
960 | | - * the iclog, even though no I/O will actually happen, |
---|
961 | | - * we need to wait for other log I/Os that may already |
---|
962 | | - * be in progress. Do this as a separate section of |
---|
963 | | - * code so we'll know if we ever get stuck here that |
---|
964 | | - * we're in this odd situation of trying to unmount |
---|
965 | | - * a file system that went into forced_shutdown as |
---|
966 | | - * the result of an unmount.. |
---|
967 | | - */ |
---|
968 | | - spin_lock(&log->l_icloglock); |
---|
969 | | - iclog = log->l_iclog; |
---|
970 | | - atomic_inc(&iclog->ic_refcnt); |
---|
971 | | - |
---|
972 | | - xlog_state_want_sync(log, iclog); |
---|
973 | | - spin_unlock(&log->l_icloglock); |
---|
974 | | - error = xlog_state_release_iclog(log, iclog); |
---|
975 | | - |
---|
976 | | - spin_lock(&log->l_icloglock); |
---|
977 | | - |
---|
978 | | - if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE |
---|
979 | | - || iclog->ic_state == XLOG_STATE_DIRTY |
---|
980 | | - || iclog->ic_state == XLOG_STATE_IOERROR) ) { |
---|
981 | | - |
---|
982 | | - xlog_wait(&iclog->ic_force_wait, |
---|
983 | | - &log->l_icloglock); |
---|
984 | | - } else { |
---|
985 | | - spin_unlock(&log->l_icloglock); |
---|
986 | | - } |
---|
987 | | - } |
---|
988 | | - |
---|
989 | | - return error; |
---|
990 | | -} /* xfs_log_unmount_write */ |
---|
| 932 | + xfs_log_unmount_verify_iclog(log); |
---|
| 933 | + xlog_unmount_write(log); |
---|
| 934 | +} |
---|
991 | 935 | |
---|
992 | 936 | /* |
---|
993 | 937 | * Empty the log for unmount/freeze. |
---|
.. | .. |
---|
1243 | 1187 | } |
---|
1244 | 1188 | |
---|
1245 | 1189 | |
---|
1246 | | -/* |
---|
1247 | | - * Log function which is called when an io completes. |
---|
1248 | | - * |
---|
1249 | | - * The log manager needs its own routine, in order to control what |
---|
1250 | | - * happens with the buffer after the write completes. |
---|
1251 | | - */ |
---|
1252 | 1190 | static void |
---|
1253 | | -xlog_iodone(xfs_buf_t *bp) |
---|
| 1191 | +xlog_ioend_work( |
---|
| 1192 | + struct work_struct *work) |
---|
1254 | 1193 | { |
---|
1255 | | - struct xlog_in_core *iclog = bp->b_log_item; |
---|
1256 | | - struct xlog *l = iclog->ic_log; |
---|
1257 | | - int aborted = 0; |
---|
| 1194 | + struct xlog_in_core *iclog = |
---|
| 1195 | + container_of(work, struct xlog_in_core, ic_end_io_work); |
---|
| 1196 | + struct xlog *log = iclog->ic_log; |
---|
| 1197 | + int error; |
---|
| 1198 | + |
---|
| 1199 | + error = blk_status_to_errno(iclog->ic_bio.bi_status); |
---|
| 1200 | +#ifdef DEBUG |
---|
| 1201 | + /* treat writes with injected CRC errors as failed */ |
---|
| 1202 | + if (iclog->ic_fail_crc) |
---|
| 1203 | + error = -EIO; |
---|
| 1204 | +#endif |
---|
1258 | 1205 | |
---|
1259 | 1206 | /* |
---|
1260 | | - * Race to shutdown the filesystem if we see an error or the iclog is in |
---|
1261 | | - * IOABORT state. The IOABORT state is only set in DEBUG mode to inject |
---|
1262 | | - * CRC errors into log recovery. |
---|
| 1207 | + * Race to shutdown the filesystem if we see an error. |
---|
1263 | 1208 | */ |
---|
1264 | | - if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR) || |
---|
1265 | | - iclog->ic_state & XLOG_STATE_IOABORT) { |
---|
1266 | | - if (iclog->ic_state & XLOG_STATE_IOABORT) |
---|
1267 | | - iclog->ic_state &= ~XLOG_STATE_IOABORT; |
---|
1268 | | - |
---|
1269 | | - xfs_buf_ioerror_alert(bp, __func__); |
---|
1270 | | - xfs_buf_stale(bp); |
---|
1271 | | - xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); |
---|
1272 | | - /* |
---|
1273 | | - * This flag will be propagated to the trans-committed |
---|
1274 | | - * callback routines to let them know that the log-commit |
---|
1275 | | - * didn't succeed. |
---|
1276 | | - */ |
---|
1277 | | - aborted = XFS_LI_ABORTED; |
---|
1278 | | - } else if (iclog->ic_state & XLOG_STATE_IOERROR) { |
---|
1279 | | - aborted = XFS_LI_ABORTED; |
---|
| 1209 | + if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { |
---|
| 1210 | + xfs_alert(log->l_mp, "log I/O error %d", error); |
---|
| 1211 | + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); |
---|
1280 | 1212 | } |
---|
1281 | 1213 | |
---|
1282 | | - /* log I/O is always issued ASYNC */ |
---|
1283 | | - ASSERT(bp->b_flags & XBF_ASYNC); |
---|
1284 | | - xlog_state_done_syncing(iclog, aborted); |
---|
| 1214 | + xlog_state_done_syncing(iclog); |
---|
| 1215 | + bio_uninit(&iclog->ic_bio); |
---|
1285 | 1216 | |
---|
1286 | 1217 | /* |
---|
1287 | | - * drop the buffer lock now that we are done. Nothing references |
---|
1288 | | - * the buffer after this, so an unmount waiting on this lock can now |
---|
1289 | | - * tear it down safely. As such, it is unsafe to reference the buffer |
---|
1290 | | - * (bp) after the unlock as we could race with it being freed. |
---|
| 1218 | + * Drop the lock to signal that we are done. Nothing references the |
---|
| 1219 | + * iclog after this, so an unmount waiting on this lock can now tear it |
---|
| 1220 | + * down safely. As such, it is unsafe to reference the iclog after the |
---|
| 1221 | + * unlock as we could race with it being freed. |
---|
1291 | 1222 | */ |
---|
1292 | | - xfs_buf_unlock(bp); |
---|
| 1223 | + up(&iclog->ic_sema); |
---|
1293 | 1224 | } |
---|
1294 | 1225 | |
---|
1295 | 1226 | /* |
---|
.. | .. |
---|
1300 | 1231 | * If the filesystem blocksize is too large, we may need to choose a |
---|
1301 | 1232 | * larger size since the directory code currently logs entire blocks. |
---|
1302 | 1233 | */ |
---|
1303 | | - |
---|
1304 | 1234 | STATIC void |
---|
1305 | 1235 | xlog_get_iclog_buffer_size( |
---|
1306 | 1236 | struct xfs_mount *mp, |
---|
1307 | 1237 | struct xlog *log) |
---|
1308 | 1238 | { |
---|
1309 | | - int size; |
---|
1310 | | - int xhdrs; |
---|
1311 | | - |
---|
1312 | 1239 | if (mp->m_logbufs <= 0) |
---|
1313 | | - log->l_iclog_bufs = XLOG_MAX_ICLOGS; |
---|
1314 | | - else |
---|
1315 | | - log->l_iclog_bufs = mp->m_logbufs; |
---|
| 1240 | + mp->m_logbufs = XLOG_MAX_ICLOGS; |
---|
| 1241 | + if (mp->m_logbsize <= 0) |
---|
| 1242 | + mp->m_logbsize = XLOG_BIG_RECORD_BSIZE; |
---|
| 1243 | + |
---|
| 1244 | + log->l_iclog_bufs = mp->m_logbufs; |
---|
| 1245 | + log->l_iclog_size = mp->m_logbsize; |
---|
1316 | 1246 | |
---|
1317 | 1247 | /* |
---|
1318 | | - * Buffer size passed in from mount system call. |
---|
| 1248 | + * # headers = size / 32k - one header holds cycles from 32k of data. |
---|
1319 | 1249 | */ |
---|
1320 | | - if (mp->m_logbsize > 0) { |
---|
1321 | | - size = log->l_iclog_size = mp->m_logbsize; |
---|
1322 | | - log->l_iclog_size_log = 0; |
---|
1323 | | - while (size != 1) { |
---|
1324 | | - log->l_iclog_size_log++; |
---|
1325 | | - size >>= 1; |
---|
1326 | | - } |
---|
1327 | | - |
---|
1328 | | - if (xfs_sb_version_haslogv2(&mp->m_sb)) { |
---|
1329 | | - /* # headers = size / 32k |
---|
1330 | | - * one header holds cycles from 32k of data |
---|
1331 | | - */ |
---|
1332 | | - |
---|
1333 | | - xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; |
---|
1334 | | - if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE) |
---|
1335 | | - xhdrs++; |
---|
1336 | | - log->l_iclog_hsize = xhdrs << BBSHIFT; |
---|
1337 | | - log->l_iclog_heads = xhdrs; |
---|
1338 | | - } else { |
---|
1339 | | - ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE); |
---|
1340 | | - log->l_iclog_hsize = BBSIZE; |
---|
1341 | | - log->l_iclog_heads = 1; |
---|
1342 | | - } |
---|
1343 | | - goto done; |
---|
1344 | | - } |
---|
1345 | | - |
---|
1346 | | - /* All machines use 32kB buffers by default. */ |
---|
1347 | | - log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; |
---|
1348 | | - log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; |
---|
1349 | | - |
---|
1350 | | - /* the default log size is 16k or 32k which is one header sector */ |
---|
1351 | | - log->l_iclog_hsize = BBSIZE; |
---|
1352 | | - log->l_iclog_heads = 1; |
---|
1353 | | - |
---|
1354 | | -done: |
---|
1355 | | - /* are we being asked to make the sizes selected above visible? */ |
---|
1356 | | - if (mp->m_logbufs == 0) |
---|
1357 | | - mp->m_logbufs = log->l_iclog_bufs; |
---|
1358 | | - if (mp->m_logbsize == 0) |
---|
1359 | | - mp->m_logbsize = log->l_iclog_size; |
---|
1360 | | -} /* xlog_get_iclog_buffer_size */ |
---|
1361 | | - |
---|
| 1250 | + log->l_iclog_heads = |
---|
| 1251 | + DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE); |
---|
| 1252 | + log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT; |
---|
| 1253 | +} |
---|
1362 | 1254 | |
---|
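For reference, a worked example of the simplified sizing. The values assume the usual constants (XLOG_HEADER_CYCLE_SIZE is 32k, BBSHIFT is 9) and illustrative logbsize settings, not anything taken from the patch:

    /*
     * mp->m_logbsize = 32768 (the default):
     *   l_iclog_heads = DIV_ROUND_UP(32768, 32768)  = 1
     *   l_iclog_hsize = 1 << BBSHIFT                = 512 bytes
     *
     * mp->m_logbsize = 262144 (logbsize=256k mount option):
     *   l_iclog_heads = DIV_ROUND_UP(262144, 32768) = 8
     *   l_iclog_hsize = 8 << BBSHIFT                = 4096 bytes
     *
     * This matches what the removed v2-log branch computed with its
     * divide-and-remainder sequence, just in two lines.
     */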
1363 | 1255 | void |
---|
1364 | 1256 | xfs_log_work_queue( |
---|
.. | .. |
---|
1421 | 1313 | xlog_rec_header_t *head; |
---|
1422 | 1314 | xlog_in_core_t **iclogp; |
---|
1423 | 1315 | xlog_in_core_t *iclog, *prev_iclog=NULL; |
---|
1424 | | - xfs_buf_t *bp; |
---|
1425 | 1316 | int i; |
---|
1426 | 1317 | int error = -ENOMEM; |
---|
1427 | 1318 | uint log2_size = 0; |
---|
.. | .. |
---|
1479 | 1370 | |
---|
1480 | 1371 | xlog_get_iclog_buffer_size(mp, log); |
---|
1481 | 1372 | |
---|
1482 | | - /* |
---|
1483 | | - * Use a NULL block for the extra log buffer used during splits so that |
---|
1484 | | - * it will trigger errors if we ever try to do IO on it without first |
---|
1485 | | - * having set it up properly. |
---|
1486 | | - */ |
---|
1487 | | - error = -ENOMEM; |
---|
1488 | | - bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, |
---|
1489 | | - BTOBB(log->l_iclog_size), XBF_NO_IOACCT); |
---|
1490 | | - if (!bp) |
---|
1491 | | - goto out_free_log; |
---|
1492 | | - |
---|
1493 | | - /* |
---|
1494 | | - * The iclogbuf buffer locks are held over IO but we are not going to do |
---|
1495 | | - * IO yet. Hence unlock the buffer so that the log IO path can grab it |
---|
1496 | | - * when appropriately. |
---|
1497 | | - */ |
---|
1498 | | - ASSERT(xfs_buf_islocked(bp)); |
---|
1499 | | - xfs_buf_unlock(bp); |
---|
1500 | | - |
---|
1501 | | - /* use high priority wq for log I/O completion */ |
---|
1502 | | - bp->b_ioend_wq = mp->m_log_workqueue; |
---|
1503 | | - bp->b_iodone = xlog_iodone; |
---|
1504 | | - log->l_xbuf = bp; |
---|
1505 | | - |
---|
1506 | 1373 | spin_lock_init(&log->l_icloglock); |
---|
1507 | 1374 | init_waitqueue_head(&log->l_flush_wait); |
---|
1508 | 1375 | |
---|
.. | .. |
---|
1515 | 1382 | * xlog_in_core_t in xfs_log_priv.h for details. |
---|
1516 | 1383 | */ |
---|
1517 | 1384 | ASSERT(log->l_iclog_size >= 4096); |
---|
1518 | | - for (i=0; i < log->l_iclog_bufs; i++) { |
---|
1519 | | - *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); |
---|
1520 | | - if (!*iclogp) |
---|
| 1385 | + for (i = 0; i < log->l_iclog_bufs; i++) { |
---|
| 1386 | + int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); |
---|
| 1387 | + size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * |
---|
| 1388 | + sizeof(struct bio_vec); |
---|
| 1389 | + |
---|
| 1390 | + iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL); |
---|
| 1391 | + if (!iclog) |
---|
1521 | 1392 | goto out_free_iclog; |
---|
1522 | 1393 | |
---|
1523 | | - iclog = *iclogp; |
---|
| 1394 | + *iclogp = iclog; |
---|
1524 | 1395 | iclog->ic_prev = prev_iclog; |
---|
1525 | 1396 | prev_iclog = iclog; |
---|
1526 | 1397 | |
---|
1527 | | - bp = xfs_buf_get_uncached(mp->m_logdev_targp, |
---|
1528 | | - BTOBB(log->l_iclog_size), |
---|
1529 | | - XBF_NO_IOACCT); |
---|
1530 | | - if (!bp) |
---|
| 1398 | + iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, |
---|
| 1399 | + KM_MAYFAIL | KM_ZERO); |
---|
| 1400 | + if (!iclog->ic_data) |
---|
1531 | 1401 | goto out_free_iclog; |
---|
1532 | | - |
---|
1533 | | - ASSERT(xfs_buf_islocked(bp)); |
---|
1534 | | - xfs_buf_unlock(bp); |
---|
1535 | | - |
---|
1536 | | - /* use high priority wq for log I/O completion */ |
---|
1537 | | - bp->b_ioend_wq = mp->m_log_workqueue; |
---|
1538 | | - bp->b_iodone = xlog_iodone; |
---|
1539 | | - iclog->ic_bp = bp; |
---|
1540 | | - iclog->ic_data = bp->b_addr; |
---|
1541 | 1402 | #ifdef DEBUG |
---|
1542 | 1403 | log->l_iclog_bak[i] = &iclog->ic_header; |
---|
1543 | 1404 | #endif |
---|
.. | .. |
---|
1551 | 1412 | head->h_fmt = cpu_to_be32(XLOG_FMT); |
---|
1552 | 1413 | memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); |
---|
1553 | 1414 | |
---|
1554 | | - iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize; |
---|
| 1415 | + iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize; |
---|
1555 | 1416 | iclog->ic_state = XLOG_STATE_ACTIVE; |
---|
1556 | 1417 | iclog->ic_log = log; |
---|
1557 | 1418 | atomic_set(&iclog->ic_refcnt, 0); |
---|
1558 | 1419 | spin_lock_init(&iclog->ic_callback_lock); |
---|
1559 | | - iclog->ic_callback_tail = &(iclog->ic_callback); |
---|
| 1420 | + INIT_LIST_HEAD(&iclog->ic_callbacks); |
---|
1560 | 1421 | iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; |
---|
1561 | 1422 | |
---|
1562 | 1423 | init_waitqueue_head(&iclog->ic_force_wait); |
---|
1563 | 1424 | init_waitqueue_head(&iclog->ic_write_wait); |
---|
| 1425 | + INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work); |
---|
| 1426 | + sema_init(&iclog->ic_sema, 1); |
---|
1564 | 1427 | |
---|
1565 | 1428 | iclogp = &iclog->ic_next; |
---|
1566 | 1429 | } |
---|
1567 | 1430 | *iclogp = log->l_iclog; /* complete ring */ |
---|
1568 | 1431 | log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ |
---|
1569 | 1432 | |
---|
| 1433 | + log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", |
---|
| 1434 | + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, |
---|
| 1435 | + mp->m_super->s_id); |
---|
| 1436 | + if (!log->l_ioend_workqueue) |
---|
| 1437 | + goto out_free_iclog; |
---|
| 1438 | + |
---|
1570 | 1439 | error = xlog_cil_init(log); |
---|
1571 | 1440 | if (error) |
---|
1572 | | - goto out_free_iclog; |
---|
| 1441 | + goto out_destroy_workqueue; |
---|
1573 | 1442 | return log; |
---|
1574 | 1443 | |
---|
| 1444 | +out_destroy_workqueue: |
---|
| 1445 | + destroy_workqueue(log->l_ioend_workqueue); |
---|
1575 | 1446 | out_free_iclog: |
---|
1576 | 1447 | for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { |
---|
1577 | 1448 | prev_iclog = iclog->ic_next; |
---|
1578 | | - if (iclog->ic_bp) |
---|
1579 | | - xfs_buf_free(iclog->ic_bp); |
---|
| 1449 | + kmem_free(iclog->ic_data); |
---|
1580 | 1450 | kmem_free(iclog); |
---|
1581 | 1451 | if (prev_iclog == log->l_iclog) |
---|
1582 | 1452 | break; |
---|
1583 | 1453 | } |
---|
1584 | | - spinlock_destroy(&log->l_icloglock); |
---|
1585 | | - xfs_buf_free(log->l_xbuf); |
---|
1586 | 1454 | out_free_log: |
---|
1587 | 1455 | kmem_free(log); |
---|
1588 | 1456 | out: |
---|
1589 | 1457 | return ERR_PTR(error); |
---|
1590 | 1458 | } /* xlog_alloc_log */ |
---|
1591 | 1459 | |
---|
1592 | | - |
---|
1593 | 1460 | /* |
---|
1594 | 1461 | * Write out the commit record of a transaction associated with the given |
---|
1595 | | - * ticket. Return the lsn of the commit record. |
---|
| 1462 | + * ticket to close off a running log write. Return the lsn of the commit record. |
---|
1596 | 1463 | */ |
---|
1597 | | -STATIC int |
---|
| 1464 | +int |
---|
1598 | 1465 | xlog_commit_record( |
---|
1599 | 1466 | struct xlog *log, |
---|
1600 | 1467 | struct xlog_ticket *ticket, |
---|
1601 | 1468 | struct xlog_in_core **iclog, |
---|
1602 | | - xfs_lsn_t *commitlsnp) |
---|
| 1469 | + xfs_lsn_t *lsn) |
---|
1603 | 1470 | { |
---|
1604 | | - struct xfs_mount *mp = log->l_mp; |
---|
1605 | | - int error; |
---|
1606 | 1471 | struct xfs_log_iovec reg = { |
---|
1607 | 1472 | .i_addr = NULL, |
---|
1608 | 1473 | .i_len = 0, |
---|
.. | .. |
---|
1612 | 1477 | .lv_niovecs = 1, |
---|
1613 | 1478 | .lv_iovecp = ®, |
---|
1614 | 1479 | }; |
---|
| 1480 | + int error; |
---|
1615 | 1481 | |
---|
1616 | | - ASSERT_ALWAYS(iclog); |
---|
1617 | | - error = xlog_write(log, &vec, ticket, commitlsnp, iclog, |
---|
1618 | | - XLOG_COMMIT_TRANS); |
---|
| 1482 | + if (XLOG_FORCED_SHUTDOWN(log)) |
---|
| 1483 | + return -EIO; |
---|
| 1484 | + |
---|
| 1485 | + error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, |
---|
| 1486 | + false); |
---|
1619 | 1487 | if (error) |
---|
1620 | | - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); |
---|
| 1488 | + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); |
---|
1621 | 1489 | return error; |
---|
1622 | 1490 | } |
---|
1623 | 1491 | |
---|
1624 | 1492 | /* |
---|
1625 | | - * Push on the buffer cache code if we ever use more than 75% of the on-disk |
---|
1626 | | - * log space. This code pushes on the lsn which would supposedly free up |
---|
1627 | | - * the 25% which we want to leave free. We may need to adopt a policy which |
---|
1628 | | - * pushes on an lsn which is further along in the log once we reach the high |
---|
1629 | | - * water mark. In this manner, we would be creating a low water mark. |
---|
| 1493 | + * Compute the LSN that we'd need to push the log tail towards in order to have |
---|
| 1494 | + * (a) enough on-disk log space to log the number of bytes specified, (b) at |
---|
| 1495 | + * least 25% of the log space free, and (c) at least 256 blocks free. If the |
---|
| 1496 | + * log free space already meets all three thresholds, this function returns |
---|
| 1497 | + * NULLCOMMITLSN. |
---|
1630 | 1498 | */ |
---|
1631 | | -STATIC void |
---|
1632 | | -xlog_grant_push_ail( |
---|
| 1499 | +xfs_lsn_t |
---|
| 1500 | +xlog_grant_push_threshold( |
---|
1633 | 1501 | struct xlog *log, |
---|
1634 | 1502 | int need_bytes) |
---|
1635 | 1503 | { |
---|
.. | .. |
---|
1655 | 1523 | free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); |
---|
1656 | 1524 | free_threshold = max(free_threshold, 256); |
---|
1657 | 1525 | if (free_blocks >= free_threshold) |
---|
1658 | | - return; |
---|
| 1526 | + return NULLCOMMITLSN; |
---|
1659 | 1527 | |
---|
1660 | 1528 | xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, |
---|
1661 | 1529 | &threshold_block); |
---|
.. | .. |
---|
1675 | 1543 | if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) |
---|
1676 | 1544 | threshold_lsn = last_sync_lsn; |
---|
1677 | 1545 | |
---|
| 1546 | + return threshold_lsn; |
---|
| 1547 | +} |
---|
| 1548 | + |
---|
| 1549 | +/* |
---|
| 1550 | + * Push the tail of the log if we need to do so to maintain the free log space |
---|
| 1551 | + * thresholds set out by xlog_grant_push_threshold. We may need to adopt a |
---|
| 1552 | + * policy which pushes on an lsn which is further along in the log once we |
---|
| 1553 | + * reach the high water mark. In this manner, we would be creating a low water |
---|
| 1554 | + * mark. |
---|
| 1555 | + */ |
---|
| 1556 | +STATIC void |
---|
| 1557 | +xlog_grant_push_ail( |
---|
| 1558 | + struct xlog *log, |
---|
| 1559 | + int need_bytes) |
---|
| 1560 | +{ |
---|
| 1561 | + xfs_lsn_t threshold_lsn; |
---|
| 1562 | + |
---|
| 1563 | + threshold_lsn = xlog_grant_push_threshold(log, need_bytes); |
---|
| 1564 | + if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log)) |
---|
| 1565 | + return; |
---|
| 1566 | + |
---|
1678 | 1567 | /* |
---|
1679 | 1568 | * Get the transaction layer to kick the dirty buffers out to |
---|
1680 | 1569 | * disk asynchronously. No point in trying to do this if |
---|
1681 | 1570 | * the filesystem is shutting down. |
---|
1682 | 1571 | */ |
---|
1683 | | - if (!XLOG_FORCED_SHUTDOWN(log)) |
---|
1684 | | - xfs_ail_push(log->l_ailp, threshold_lsn); |
---|
| 1572 | + xfs_ail_push(log->l_ailp, threshold_lsn); |
---|
1685 | 1573 | } |
---|
1686 | 1574 | |
---|
1687 | 1575 | /* |
---|
.. | .. |
---|
1751 | 1639 | int i; |
---|
1752 | 1640 | int xheads; |
---|
1753 | 1641 | |
---|
1754 | | - xheads = size / XLOG_HEADER_CYCLE_SIZE; |
---|
1755 | | - if (size % XLOG_HEADER_CYCLE_SIZE) |
---|
1756 | | - xheads++; |
---|
| 1642 | + xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE); |
---|
1757 | 1643 | |
---|
1758 | 1644 | for (i = 1; i < xheads; i++) { |
---|
1759 | 1645 | crc = crc32c(crc, &xhdr[i].hic_xheader, |
---|
.. | .. |
---|
1767 | 1653 | return xfs_end_cksum(crc); |
---|
1768 | 1654 | } |
---|
1769 | 1655 | |
---|
1770 | | -/* |
---|
1771 | | - * The bdstrat callback function for log bufs. This gives us a central |
---|
1772 | | - * place to trap bufs in case we get hit by a log I/O error and need to |
---|
1773 | | - * shutdown. Actually, in practice, even when we didn't get a log error, |
---|
1774 | | - * we transition the iclogs to IOERROR state *after* flushing all existing |
---|
1775 | | - * iclogs to disk. This is because we don't want anymore new transactions to be |
---|
1776 | | - * started or completed afterwards. |
---|
1777 | | - * |
---|
1778 | | - * We lock the iclogbufs here so that we can serialise against IO completion |
---|
1779 | | - * during unmount. We might be processing a shutdown triggered during unmount, |
---|
1780 | | - * and that can occur asynchronously to the unmount thread, and hence we need to |
---|
1781 | | - * ensure that completes before tearing down the iclogbufs. Hence we need to |
---|
1782 | | - * hold the buffer lock across the log IO to acheive that. |
---|
1783 | | - */ |
---|
1784 | | -STATIC int |
---|
1785 | | -xlog_bdstrat( |
---|
1786 | | - struct xfs_buf *bp) |
---|
| 1656 | +static void |
---|
| 1657 | +xlog_bio_end_io( |
---|
| 1658 | + struct bio *bio) |
---|
1787 | 1659 | { |
---|
1788 | | - struct xlog_in_core *iclog = bp->b_log_item; |
---|
| 1660 | + struct xlog_in_core *iclog = bio->bi_private; |
---|
1789 | 1661 | |
---|
1790 | | - xfs_buf_lock(bp); |
---|
1791 | | - if (iclog->ic_state & XLOG_STATE_IOERROR) { |
---|
1792 | | - xfs_buf_ioerror(bp, -EIO); |
---|
1793 | | - xfs_buf_stale(bp); |
---|
1794 | | - xfs_buf_ioend(bp); |
---|
| 1662 | + queue_work(iclog->ic_log->l_ioend_workqueue, |
---|
| 1663 | + &iclog->ic_end_io_work); |
---|
| 1664 | +} |
---|
| 1665 | + |
---|
| 1666 | +static int |
---|
| 1667 | +xlog_map_iclog_data( |
---|
| 1668 | + struct bio *bio, |
---|
| 1669 | + void *data, |
---|
| 1670 | + size_t count) |
---|
| 1671 | +{ |
---|
| 1672 | + do { |
---|
| 1673 | + struct page *page = kmem_to_page(data); |
---|
| 1674 | + unsigned int off = offset_in_page(data); |
---|
| 1675 | + size_t len = min_t(size_t, count, PAGE_SIZE - off); |
---|
| 1676 | + |
---|
| 1677 | + if (bio_add_page(bio, page, len, off) != len) |
---|
| 1678 | + return -EIO; |
---|
| 1679 | + |
---|
| 1680 | + data += len; |
---|
| 1681 | + count -= len; |
---|
| 1682 | + } while (count); |
---|
| 1683 | + |
---|
| 1684 | + return 0; |
---|
| 1685 | +} |
---|
| 1686 | + |
---|
| 1687 | +STATIC void |
---|
| 1688 | +xlog_write_iclog( |
---|
| 1689 | + struct xlog *log, |
---|
| 1690 | + struct xlog_in_core *iclog, |
---|
| 1691 | + uint64_t bno, |
---|
| 1692 | + unsigned int count, |
---|
| 1693 | + bool need_flush) |
---|
| 1694 | +{ |
---|
| 1695 | + ASSERT(bno < log->l_logBBsize); |
---|
| 1696 | + |
---|
| 1697 | + /* |
---|
| 1698 | + * We lock the iclogbufs here so that we can serialise against I/O |
---|
| 1699 | + * completion during unmount. We might be processing a shutdown |
---|
| 1700 | + * triggered during unmount, and that can occur asynchronously to the |
---|
| 1701 | + * unmount thread, and hence we need to ensure that completes before |
---|
| 1702 | + * tearing down the iclogbufs. Hence we need to hold the buffer lock |
---|
| 1703 | + * across the log IO to achieve that. |
---|
| 1704 | + */ |
---|
| 1705 | + down(&iclog->ic_sema); |
---|
| 1706 | + if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) { |
---|
1795 | 1707 | /* |
---|
1796 | 1708 | * It would seem logical to return EIO here, but we rely on |
---|
1797 | 1709 | * the log state machine to propagate I/O errors instead of |
---|
1798 | | - * doing it here. Similarly, IO completion will unlock the |
---|
1799 | | - * buffer, so we don't do it here. |
---|
| 1710 | + * doing it here. We kick off the state machine and unlock |
---|
| 1711 | + * the buffer manually; the code needs to be kept in sync |
---|
| 1712 | + * with the I/O completion path. |
---|
1800 | 1713 | */ |
---|
1801 | | - return 0; |
---|
| 1714 | + xlog_state_done_syncing(iclog); |
---|
| 1715 | + up(&iclog->ic_sema); |
---|
| 1716 | + return; |
---|
1802 | 1717 | } |
---|
1803 | 1718 | |
---|
1804 | | - xfs_buf_submit(bp); |
---|
1805 | | - return 0; |
---|
| 1719 | + bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE)); |
---|
| 1720 | + bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev); |
---|
| 1721 | + iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; |
---|
| 1722 | + iclog->ic_bio.bi_end_io = xlog_bio_end_io; |
---|
| 1723 | + iclog->ic_bio.bi_private = iclog; |
---|
| 1724 | + |
---|
| 1725 | + /* |
---|
| 1726 | + * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more |
---|
| 1727 | + * IOs coming immediately after this one. This prevents the block layer |
---|
| 1728 | + * writeback throttle from throttling log writes behind background |
---|
| 1729 | + * metadata writeback and causing priority inversions. |
---|
| 1730 | + */ |
---|
| 1731 | + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | |
---|
| 1732 | + REQ_IDLE | REQ_FUA; |
---|
| 1733 | + if (need_flush) |
---|
| 1734 | + iclog->ic_bio.bi_opf |= REQ_PREFLUSH; |
---|
| 1735 | + |
---|
| 1736 | + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { |
---|
| 1737 | + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); |
---|
| 1738 | + return; |
---|
| 1739 | + } |
---|
| 1740 | + if (is_vmalloc_addr(iclog->ic_data)) |
---|
| 1741 | + flush_kernel_vmap_range(iclog->ic_data, count); |
---|
| 1742 | + |
---|
| 1743 | + /* |
---|
| 1744 | + * If this log buffer would straddle the end of the log we will have |
---|
| 1745 | + * to split it up into two bios, so that we can continue at the start. |
---|
| 1746 | + */ |
---|
| 1747 | + if (bno + BTOBB(count) > log->l_logBBsize) { |
---|
| 1748 | + struct bio *split; |
---|
| 1749 | + |
---|
| 1750 | + split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno, |
---|
| 1751 | + GFP_NOIO, &fs_bio_set); |
---|
| 1752 | + bio_chain(split, &iclog->ic_bio); |
---|
| 1753 | + submit_bio(split); |
---|
| 1754 | + |
---|
| 1755 | + /* restart at logical offset zero for the remainder */ |
---|
| 1756 | + iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart; |
---|
| 1757 | + } |
---|
| 1758 | + |
---|
| 1759 | + submit_bio(&iclog->ic_bio); |
---|
| 1760 | +} |
---|
| 1761 | + |
---|
| 1762 | +/* |
---|
| 1763 | + * We need to bump cycle number for the part of the iclog that is |
---|
| 1764 | + * written to the start of the log. Watch out for the header magic |
---|
| 1765 | + * number case, though. |
---|
| 1766 | + */ |
---|
| 1767 | +static void |
---|
| 1768 | +xlog_split_iclog( |
---|
| 1769 | + struct xlog *log, |
---|
| 1770 | + void *data, |
---|
| 1771 | + uint64_t bno, |
---|
| 1772 | + unsigned int count) |
---|
| 1773 | +{ |
---|
| 1774 | + unsigned int split_offset = BBTOB(log->l_logBBsize - bno); |
---|
| 1775 | + unsigned int i; |
---|
| 1776 | + |
---|
| 1777 | + for (i = split_offset; i < count; i += BBSIZE) { |
---|
| 1778 | + uint32_t cycle = get_unaligned_be32(data + i); |
---|
| 1779 | + |
---|
| 1780 | + if (++cycle == XLOG_HEADER_MAGIC_NUM) |
---|
| 1781 | + cycle++; |
---|
| 1782 | + put_unaligned_be32(cycle, data + i); |
---|
| 1783 | + } |
---|
| 1784 | +} |
---|
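To make the wrap handling concrete, a small worked example; the block counts are made up for illustration, only the helpers named are from the patch:

    /*
     * Assume l_logBBsize = 100 basic blocks, bno = 98, count = 4096 bytes
     * (8 basic blocks), so bno + BTOBB(count) = 106 > 100 and the write wraps:
     *
     *   xlog_split_iclog(): split_offset = BBTOB(100 - 98) = 1024, so the
     *   cycle word at the start of every 512-byte sector from offset 1024
     *   onwards (the part that will land at block 0) is bumped by one,
     *   skipping XLOG_HEADER_MAGIC_NUM, so recovery can tell that data
     *   belongs to the next cycle.
     *
     *   xlog_write_iclog(): bio_split() carves off the first 2 basic blocks
     *   for the end of the log, bio_chain() ties that completion to the
     *   parent bio, and the remainder is submitted again starting at
     *   l_logBBstart (the physical start of the log).
     */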
| 1785 | + |
---|
| 1786 | +static int |
---|
| 1787 | +xlog_calc_iclog_size( |
---|
| 1788 | + struct xlog *log, |
---|
| 1789 | + struct xlog_in_core *iclog, |
---|
| 1790 | + uint32_t *roundoff) |
---|
| 1791 | +{ |
---|
| 1792 | + uint32_t count_init, count; |
---|
| 1793 | + bool use_lsunit; |
---|
| 1794 | + |
---|
| 1795 | + use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) && |
---|
| 1796 | + log->l_mp->m_sb.sb_logsunit > 1; |
---|
| 1797 | + |
---|
| 1798 | + /* Add for LR header */ |
---|
| 1799 | + count_init = log->l_iclog_hsize + iclog->ic_offset; |
---|
| 1800 | + |
---|
| 1801 | + /* Round out the log write size */ |
---|
| 1802 | + if (use_lsunit) { |
---|
| 1803 | + /* we have a v2 stripe unit to use */ |
---|
| 1804 | + count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); |
---|
| 1805 | + } else { |
---|
| 1806 | + count = BBTOB(BTOBB(count_init)); |
---|
| 1807 | + } |
---|
| 1808 | + |
---|
| 1809 | + ASSERT(count >= count_init); |
---|
| 1810 | + *roundoff = count - count_init; |
---|
| 1811 | + |
---|
| 1812 | + if (use_lsunit) |
---|
| 1813 | + ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit); |
---|
| 1814 | + else |
---|
| 1815 | + ASSERT(*roundoff < BBTOB(1)); |
---|
| 1816 | + return count; |
---|
1806 | 1817 | } |
---|
1807 | 1818 | |
---|
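A worked example of the rounding that xlog_calc_iclog_size() now isolates; the byte counts are illustrative, not from the patch:

    /*
     * Assume l_iclog_hsize = 512 and iclog->ic_offset = 7000, so
     * count_init = 7512:
     *
     *   v1 log, or sb_logsunit <= 1:
     *     count    = BBTOB(BTOBB(7512)) = 7680   (round up to 512 bytes)
     *     roundoff = 168                         (< BBTOB(1))
     *
     *   v2 log with sb_logsunit = 32768:
     *     count    = XLOG_LSUNITTOB(XLOG_BTOLSUNIT(7512)) = 32768
     *     roundoff = 25256                       (< sb_logsunit)
     *
     * xlog_sync() then moves the grant heads forward by roundoff so the
     * in-core space accounting matches the padded on-disk write.
     */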
1808 | 1819 | /* |
---|
.. | .. |
---|
1825 | 1836 | * log will require grabbing the lock though. |
---|
1826 | 1837 | * |
---|
1827 | 1838 | * The entire log manager uses a logical block numbering scheme. Only |
---|
1828 | | - * log_sync (and then only bwrite()) know about the fact that the log may |
---|
1829 | | - * not start with block zero on a given device. The log block start offset |
---|
1830 | | - * is added immediately before calling bwrite(). |
---|
| 1839 | + * xlog_write_iclog knows about the fact that the log may not start with |
---|
| 1840 | + * block zero on a given device. |
---|
1831 | 1841 | */ |
---|
1832 | | - |
---|
1833 | | -STATIC int |
---|
| 1842 | +STATIC void |
---|
1834 | 1843 | xlog_sync( |
---|
1835 | 1844 | struct xlog *log, |
---|
1836 | 1845 | struct xlog_in_core *iclog) |
---|
1837 | 1846 | { |
---|
1838 | | - xfs_buf_t *bp; |
---|
1839 | | - int i; |
---|
1840 | | - uint count; /* byte count of bwrite */ |
---|
1841 | | - uint count_init; /* initial count before roundup */ |
---|
1842 | | - int roundoff; /* roundoff to BB or stripe */ |
---|
1843 | | - int split = 0; /* split write into two regions */ |
---|
1844 | | - int error; |
---|
1845 | | - int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); |
---|
1846 | | - int size; |
---|
| 1847 | + unsigned int count; /* byte count of bwrite */ |
---|
| 1848 | + unsigned int roundoff; /* roundoff to BB or stripe */ |
---|
| 1849 | + uint64_t bno; |
---|
| 1850 | + unsigned int size; |
---|
| 1851 | + bool need_flush = true, split = false; |
---|
1847 | 1852 | |
---|
1848 | | - XFS_STATS_INC(log->l_mp, xs_log_writes); |
---|
1849 | 1853 | ASSERT(atomic_read(&iclog->ic_refcnt) == 0); |
---|
1850 | 1854 | |
---|
1851 | | - /* Add for LR header */ |
---|
1852 | | - count_init = log->l_iclog_hsize + iclog->ic_offset; |
---|
1853 | | - |
---|
1854 | | - /* Round out the log write size */ |
---|
1855 | | - if (v2 && log->l_mp->m_sb.sb_logsunit > 1) { |
---|
1856 | | - /* we have a v2 stripe unit to use */ |
---|
1857 | | - count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); |
---|
1858 | | - } else { |
---|
1859 | | - count = BBTOB(BTOBB(count_init)); |
---|
1860 | | - } |
---|
1861 | | - roundoff = count - count_init; |
---|
1862 | | - ASSERT(roundoff >= 0); |
---|
1863 | | - ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 && |
---|
1864 | | - roundoff < log->l_mp->m_sb.sb_logsunit) |
---|
1865 | | - || |
---|
1866 | | - (log->l_mp->m_sb.sb_logsunit <= 1 && |
---|
1867 | | - roundoff < BBTOB(1))); |
---|
| 1855 | + count = xlog_calc_iclog_size(log, iclog, &roundoff); |
---|
1868 | 1856 | |
---|
1869 | 1857 | /* move grant heads by roundoff in sync */ |
---|
1870 | 1858 | xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); |
---|
.. | .. |
---|
1875 | 1863 | |
---|
1876 | 1864 | /* real byte length */ |
---|
1877 | 1865 | size = iclog->ic_offset; |
---|
1878 | | - if (v2) |
---|
| 1866 | + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) |
---|
1879 | 1867 | size += roundoff; |
---|
1880 | 1868 | iclog->ic_header.h_len = cpu_to_be32(size); |
---|
1881 | 1869 | |
---|
1882 | | - bp = iclog->ic_bp; |
---|
1883 | | - XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); |
---|
1884 | | - |
---|
| 1870 | + XFS_STATS_INC(log->l_mp, xs_log_writes); |
---|
1885 | 1871 | XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count)); |
---|
1886 | 1872 | |
---|
| 1873 | + bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); |
---|
| 1874 | + |
---|
1887 | 1875 | /* Do we need to split this write into 2 parts? */ |
---|
1888 | | - if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { |
---|
1889 | | - char *dptr; |
---|
1890 | | - |
---|
1891 | | - split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); |
---|
1892 | | - count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); |
---|
1893 | | - iclog->ic_bwritecnt = 2; |
---|
1894 | | - |
---|
1895 | | - /* |
---|
1896 | | - * Bump the cycle numbers at the start of each block in the |
---|
1897 | | - * part of the iclog that ends up in the buffer that gets |
---|
1898 | | - * written to the start of the log. |
---|
1899 | | - * |
---|
1900 | | - * Watch out for the header magic number case, though. |
---|
1901 | | - */ |
---|
1902 | | - dptr = (char *)&iclog->ic_header + count; |
---|
1903 | | - for (i = 0; i < split; i += BBSIZE) { |
---|
1904 | | - uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); |
---|
1905 | | - if (++cycle == XLOG_HEADER_MAGIC_NUM) |
---|
1906 | | - cycle++; |
---|
1907 | | - *(__be32 *)dptr = cpu_to_be32(cycle); |
---|
1908 | | - |
---|
1909 | | - dptr += BBSIZE; |
---|
1910 | | - } |
---|
1911 | | - } else { |
---|
1912 | | - iclog->ic_bwritecnt = 1; |
---|
| 1876 | + if (bno + BTOBB(count) > log->l_logBBsize) { |
---|
| 1877 | + xlog_split_iclog(log, &iclog->ic_header, bno, count); |
---|
| 1878 | + split = true; |
---|
1913 | 1879 | } |
---|
1914 | 1880 | |
---|
1915 | 1881 | /* calculcate the checksum */ |
---|
.. | .. |
---|
1922 | 1888 | * write on I/O completion and shutdown the fs. The subsequent mount |
---|
1923 | 1889 | * detects the bad CRC and attempts to recover. |
---|
1924 | 1890 | */ |
---|
| 1891 | +#ifdef DEBUG |
---|
1925 | 1892 | if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { |
---|
1926 | 1893 | iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); |
---|
1927 | | - iclog->ic_state |= XLOG_STATE_IOABORT; |
---|
| 1894 | + iclog->ic_fail_crc = true; |
---|
1928 | 1895 | xfs_warn(log->l_mp, |
---|
1929 | 1896 | "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", |
---|
1930 | 1897 | be64_to_cpu(iclog->ic_header.h_lsn)); |
---|
1931 | 1898 | } |
---|
1932 | | - |
---|
1933 | | - bp->b_io_length = BTOBB(count); |
---|
1934 | | - bp->b_log_item = iclog; |
---|
1935 | | - bp->b_flags &= ~XBF_FLUSH; |
---|
1936 | | - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); |
---|
| 1899 | +#endif |
---|
1937 | 1900 | |
---|
1938 | 1901 | /* |
---|
1939 | 1902 | * Flush the data device before flushing the log to make sure all meta |
---|
.. | .. |
---|
1943 | 1906 | * synchronously here; for an internal log we can simply use the block |
---|
1944 | 1907 | * layer state machine for preflushes. |
---|
1945 | 1908 | */ |
---|
1946 | | - if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) |
---|
| 1909 | + if (log->l_targ != log->l_mp->m_ddev_targp || split) { |
---|
1947 | 1910 | xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); |
---|
1948 | | - else |
---|
1949 | | - bp->b_flags |= XBF_FLUSH; |
---|
1950 | | - |
---|
1951 | | - ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); |
---|
1952 | | - ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); |
---|
1953 | | - |
---|
1954 | | - xlog_verify_iclog(log, iclog, count, true); |
---|
1955 | | - |
---|
1956 | | - /* account for log which doesn't start at block #0 */ |
---|
1957 | | - XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); |
---|
1958 | | - |
---|
1959 | | - /* |
---|
1960 | | - * Don't call xfs_bwrite here. We do log-syncs even when the filesystem |
---|
1961 | | - * is shutting down. |
---|
1962 | | - */ |
---|
1963 | | - error = xlog_bdstrat(bp); |
---|
1964 | | - if (error) { |
---|
1965 | | - xfs_buf_ioerror_alert(bp, "xlog_sync"); |
---|
1966 | | - return error; |
---|
| 1911 | + need_flush = false; |
---|
1967 | 1912 | } |
---|
1968 | | - if (split) { |
---|
1969 | | - bp = iclog->ic_log->l_xbuf; |
---|
1970 | | - XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ |
---|
1971 | | - xfs_buf_associate_memory(bp, |
---|
1972 | | - (char *)&iclog->ic_header + count, split); |
---|
1973 | | - bp->b_log_item = iclog; |
---|
1974 | | - bp->b_flags &= ~XBF_FLUSH; |
---|
1975 | | - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); |
---|
1976 | 1913 | |
---|
1977 | | - ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); |
---|
1978 | | - ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); |
---|
1979 | | - |
---|
1980 | | - /* account for internal log which doesn't start at block #0 */ |
---|
1981 | | - XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); |
---|
1982 | | - error = xlog_bdstrat(bp); |
---|
1983 | | - if (error) { |
---|
1984 | | - xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); |
---|
1985 | | - return error; |
---|
1986 | | - } |
---|
1987 | | - } |
---|
1988 | | - return 0; |
---|
1989 | | -} /* xlog_sync */ |
---|
| 1914 | + xlog_verify_iclog(log, iclog, count); |
---|
| 1915 | + xlog_write_iclog(log, iclog, bno, count, need_flush); |
---|
| 1916 | +} |
---|
1990 | 1917 | |
---|
1991 | 1918 | /* |
---|
1992 | 1919 | * Deallocate a log structure |
---|
.. | .. |
---|
2006 | 1933 | */ |
---|
2007 | 1934 | iclog = log->l_iclog; |
---|
2008 | 1935 | for (i = 0; i < log->l_iclog_bufs; i++) { |
---|
2009 | | - xfs_buf_lock(iclog->ic_bp); |
---|
2010 | | - xfs_buf_unlock(iclog->ic_bp); |
---|
| 1936 | + down(&iclog->ic_sema); |
---|
| 1937 | + up(&iclog->ic_sema); |
---|
2011 | 1938 | iclog = iclog->ic_next; |
---|
2012 | 1939 | } |
---|
2013 | 1940 | |
---|
2014 | | - /* |
---|
2015 | | - * Always need to ensure that the extra buffer does not point to memory |
---|
2016 | | - * owned by another log buffer before we free it. Also, cycle the lock |
---|
2017 | | - * first to ensure we've completed IO on it. |
---|
2018 | | - */ |
---|
2019 | | - xfs_buf_lock(log->l_xbuf); |
---|
2020 | | - xfs_buf_unlock(log->l_xbuf); |
---|
2021 | | - xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); |
---|
2022 | | - xfs_buf_free(log->l_xbuf); |
---|
2023 | | - |
---|
2024 | 1941 | iclog = log->l_iclog; |
---|
2025 | 1942 | for (i = 0; i < log->l_iclog_bufs; i++) { |
---|
2026 | | - xfs_buf_free(iclog->ic_bp); |
---|
2027 | 1943 | next_iclog = iclog->ic_next; |
---|
| 1944 | + kmem_free(iclog->ic_data); |
---|
2028 | 1945 | kmem_free(iclog); |
---|
2029 | 1946 | iclog = next_iclog; |
---|
2030 | 1947 | } |
---|
2031 | | - spinlock_destroy(&log->l_icloglock); |
---|
2032 | 1948 | |
---|
2033 | 1949 | log->l_mp->m_log = NULL; |
---|
| 1950 | + destroy_workqueue(log->l_ioend_workqueue); |
---|
2034 | 1951 | kmem_free(log); |
---|
2035 | | -} /* xlog_dealloc_log */ |
---|
| 1952 | +} |
---|
2036 | 1953 | |
---|
2037 | 1954 | /* |
---|
2038 | 1955 | * Update counters atomically now that memcpy is done. |
---|
2039 | 1956 | */ |
---|
2040 | | -/* ARGSUSED */ |
---|
2041 | 1957 | static inline void |
---|
2042 | 1958 | xlog_state_finish_copy( |
---|
2043 | 1959 | struct xlog *log, |
---|
.. | .. |
---|
2045 | 1961 | int record_cnt, |
---|
2046 | 1962 | int copy_bytes) |
---|
2047 | 1963 | { |
---|
2048 | | - spin_lock(&log->l_icloglock); |
---|
| 1964 | + lockdep_assert_held(&log->l_icloglock); |
---|
2049 | 1965 | |
---|
2050 | 1966 | be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); |
---|
2051 | 1967 | iclog->ic_offset += copy_bytes; |
---|
2052 | | - |
---|
2053 | | - spin_unlock(&log->l_icloglock); |
---|
2054 | | -} /* xlog_state_finish_copy */ |
---|
2055 | | - |
---|
2056 | | - |
---|
2057 | | - |
---|
| 1968 | +} |
---|
2058 | 1969 | |
---|
2059 | 1970 | /* |
---|
2060 | 1971 | * print out info relating to regions written which consume |
---|
.. | .. |
---|
2070 | 1981 | |
---|
2071 | 1982 | /* match with XLOG_REG_TYPE_* in xfs_log.h */ |
---|
2072 | 1983 | #define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str |
---|
2073 | | - static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = { |
---|
| 1984 | + static char *res_type_str[] = { |
---|
2074 | 1985 | REG_TYPE_STR(BFORMAT, "bformat"), |
---|
2075 | 1986 | REG_TYPE_STR(BCHUNK, "bchunk"), |
---|
2076 | 1987 | REG_TYPE_STR(EFI_FORMAT, "efi_format"), |
---|
.. | .. |
---|
2090 | 2001 | REG_TYPE_STR(UNMOUNT, "unmount"), |
---|
2091 | 2002 | REG_TYPE_STR(COMMIT, "commit"), |
---|
2092 | 2003 | REG_TYPE_STR(TRANSHDR, "trans header"), |
---|
2093 | | - REG_TYPE_STR(ICREATE, "inode create") |
---|
| 2004 | + REG_TYPE_STR(ICREATE, "inode create"), |
---|
| 2005 | + REG_TYPE_STR(RUI_FORMAT, "rui_format"), |
---|
| 2006 | + REG_TYPE_STR(RUD_FORMAT, "rud_format"), |
---|
| 2007 | + REG_TYPE_STR(CUI_FORMAT, "cui_format"), |
---|
| 2008 | + REG_TYPE_STR(CUD_FORMAT, "cud_format"), |
---|
| 2009 | + REG_TYPE_STR(BUI_FORMAT, "bui_format"), |
---|
| 2010 | + REG_TYPE_STR(BUD_FORMAT, "bud_format"), |
---|
2094 | 2011 | }; |
---|
| 2012 | + BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1); |
---|
2095 | 2013 | #undef REG_TYPE_STR |
---|
2096 | 2014 | |
---|
2097 | 2015 | xfs_warn(mp, "ticket reservation summary:"); |
---|
.. | .. |
---|
2168 | 2086 | } |
---|
2169 | 2087 | |
---|
2170 | 2088 | /* |
---|
2171 | | - * Calculate the potential space needed by the log vector. Each region gets |
---|
2172 | | - * its own xlog_op_header_t and may need to be double word aligned. |
---|
| 2089 | + * Calculate the potential space needed by the log vector. We may need a start |
---|
| 2090 | + * record, and each region gets its own struct xlog_op_header and may need to be |
---|
| 2091 | + * double word aligned. |
---|
2173 | 2092 | */ |
---|
2174 | 2093 | static int |
---|
2175 | 2094 | xlog_write_calc_vec_length( |
---|
2176 | 2095 | struct xlog_ticket *ticket, |
---|
2177 | | - struct xfs_log_vec *log_vector) |
---|
| 2096 | + struct xfs_log_vec *log_vector, |
---|
| 2097 | + bool need_start_rec) |
---|
2178 | 2098 | { |
---|
2179 | 2099 | struct xfs_log_vec *lv; |
---|
2180 | | - int headers = 0; |
---|
| 2100 | + int headers = need_start_rec ? 1 : 0; |
---|
2181 | 2101 | int len = 0; |
---|
2182 | 2102 | int i; |
---|
2183 | | - |
---|
2184 | | - /* acct for start rec of xact */ |
---|
2185 | | - if (ticket->t_flags & XLOG_TIC_INITED) |
---|
2186 | | - headers++; |
---|
2187 | 2103 | |
---|
2188 | 2104 | for (lv = log_vector; lv; lv = lv->lv_next) { |
---|
2189 | 2105 | /* we don't write ordered log vectors */ |
---|
.. | .. |
---|
2206 | 2122 | return len; |
---|
2207 | 2123 | } |
---|
2208 | 2124 | |
---|
2209 | | -/* |
---|
2210 | | - * If first write for transaction, insert start record We can't be trying to |
---|
2211 | | - * commit if we are inited. We can't have any "partial_copy" if we are inited. |
---|
2212 | | - */ |
---|
2213 | | -static int |
---|
| 2125 | +static void |
---|
2214 | 2126 | xlog_write_start_rec( |
---|
2215 | 2127 | struct xlog_op_header *ophdr, |
---|
2216 | 2128 | struct xlog_ticket *ticket) |
---|
2217 | 2129 | { |
---|
2218 | | - if (!(ticket->t_flags & XLOG_TIC_INITED)) |
---|
2219 | | - return 0; |
---|
2220 | | - |
---|
2221 | 2130 | ophdr->oh_tid = cpu_to_be32(ticket->t_tid); |
---|
2222 | 2131 | ophdr->oh_clientid = ticket->t_clientid; |
---|
2223 | 2132 | ophdr->oh_len = 0; |
---|
2224 | 2133 | ophdr->oh_flags = XLOG_START_TRANS; |
---|
2225 | 2134 | ophdr->oh_res2 = 0; |
---|
2226 | | - |
---|
2227 | | - ticket->t_flags &= ~XLOG_TIC_INITED; |
---|
2228 | | - |
---|
2229 | | - return sizeof(struct xlog_op_header); |
---|
2230 | 2135 | } |
---|
2231 | 2136 | |
---|
2232 | 2137 | static xlog_op_header_t * |
---|
.. | .. |
---|
2324 | 2229 | int log_offset, |
---|
2325 | 2230 | struct xlog_in_core **commit_iclog) |
---|
2326 | 2231 | { |
---|
| 2232 | + int error; |
---|
| 2233 | + |
---|
2327 | 2234 | if (*partial_copy) { |
---|
2328 | 2235 | /* |
---|
2329 | 2236 | * This iclog has already been marked WANT_SYNC by |
---|
2330 | 2237 | * xlog_state_get_iclog_space. |
---|
2331 | 2238 | */ |
---|
| 2239 | + spin_lock(&log->l_icloglock); |
---|
2332 | 2240 | xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); |
---|
2333 | 2241 | *record_cnt = 0; |
---|
2334 | 2242 | *data_cnt = 0; |
---|
2335 | | - return xlog_state_release_iclog(log, iclog); |
---|
| 2243 | + goto release_iclog; |
---|
2336 | 2244 | } |
---|
2337 | 2245 | |
---|
2338 | 2246 | *partial_copy = 0; |
---|
.. | .. |
---|
2340 | 2248 | |
---|
2341 | 2249 | if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { |
---|
2342 | 2250 | /* no more space in this iclog - push it. */ |
---|
| 2251 | + spin_lock(&log->l_icloglock); |
---|
2343 | 2252 | xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); |
---|
2344 | 2253 | *record_cnt = 0; |
---|
2345 | 2254 | *data_cnt = 0; |
---|
2346 | 2255 | |
---|
2347 | | - spin_lock(&log->l_icloglock); |
---|
2348 | | - xlog_state_want_sync(log, iclog); |
---|
2349 | | - spin_unlock(&log->l_icloglock); |
---|
2350 | | - |
---|
| 2256 | + if (iclog->ic_state == XLOG_STATE_ACTIVE) |
---|
| 2257 | + xlog_state_switch_iclogs(log, iclog, 0); |
---|
| 2258 | + else |
---|
| 2259 | + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || |
---|
| 2260 | + iclog->ic_state == XLOG_STATE_IOERROR); |
---|
2351 | 2261 | if (!commit_iclog) |
---|
2352 | | - return xlog_state_release_iclog(log, iclog); |
---|
| 2262 | + goto release_iclog; |
---|
| 2263 | + spin_unlock(&log->l_icloglock); |
---|
2353 | 2264 | ASSERT(flags & XLOG_COMMIT_TRANS); |
---|
2354 | 2265 | *commit_iclog = iclog; |
---|
2355 | 2266 | } |
---|
2356 | 2267 | |
---|
2357 | 2268 | return 0; |
---|
| 2269 | + |
---|
| 2270 | +release_iclog: |
---|
| 2271 | + error = xlog_state_release_iclog(log, iclog); |
---|
| 2272 | + spin_unlock(&log->l_icloglock); |
---|
| 2273 | + return error; |
---|
2358 | 2274 | } |
---|
2359 | 2275 | |
---|
2360 | 2276 | /* |
---|
.. | .. |
---|
2404 | 2320 | struct xlog_ticket *ticket, |
---|
2405 | 2321 | xfs_lsn_t *start_lsn, |
---|
2406 | 2322 | struct xlog_in_core **commit_iclog, |
---|
2407 | | - uint flags) |
---|
| 2323 | + uint flags, |
---|
| 2324 | + bool need_start_rec) |
---|
2408 | 2325 | { |
---|
2409 | 2326 | struct xlog_in_core *iclog = NULL; |
---|
2410 | | - struct xfs_log_iovec *vecp; |
---|
2411 | | - struct xfs_log_vec *lv; |
---|
| 2327 | + struct xfs_log_vec *lv = log_vector; |
---|
| 2328 | + struct xfs_log_iovec *vecp = lv->lv_iovecp; |
---|
| 2329 | + int index = 0; |
---|
2412 | 2330 | int len; |
---|
2413 | | - int index; |
---|
2414 | 2331 | int partial_copy = 0; |
---|
2415 | 2332 | int partial_copy_len = 0; |
---|
2416 | 2333 | int contwr = 0; |
---|
2417 | 2334 | int record_cnt = 0; |
---|
2418 | 2335 | int data_cnt = 0; |
---|
2419 | | - int error; |
---|
2420 | | - |
---|
2421 | | - *start_lsn = 0; |
---|
2422 | | - |
---|
2423 | | - len = xlog_write_calc_vec_length(ticket, log_vector); |
---|
| 2336 | + int error = 0; |
---|
2424 | 2337 | |
---|
2425 | 2338 | /* |
---|
2426 | | - * Region headers and bytes are already accounted for. |
---|
2427 | | - * We only need to take into account start records and |
---|
2428 | | - * split regions in this function. |
---|
| 2339 | + * If this is a commit or unmount transaction, we don't need a start |
---|
| 2340 | + * record to be written. We do, however, have to account for the |
---|
| 2341 | + * commit or unmount header that gets written. Hence we always have |
---|
| 2342 | + * to account for an extra xlog_op_header here. |
---|
2429 | 2343 | */ |
---|
2430 | | - if (ticket->t_flags & XLOG_TIC_INITED) |
---|
2431 | | - ticket->t_curr_res -= sizeof(xlog_op_header_t); |
---|
2432 | | - |
---|
2433 | | - /* |
---|
2434 | | - * Commit record headers need to be accounted for. These |
---|
2435 | | - * come in as separate writes so are easy to detect. |
---|
2436 | | - */ |
---|
2437 | | - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) |
---|
2438 | | - ticket->t_curr_res -= sizeof(xlog_op_header_t); |
---|
2439 | | - |
---|
| 2344 | + ticket->t_curr_res -= sizeof(struct xlog_op_header); |
---|
2440 | 2345 | if (ticket->t_curr_res < 0) { |
---|
2441 | 2346 | xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, |
---|
2442 | 2347 | "ctx ticket reservation ran out. Need to up reservation"); |
---|
.. | .. |
---|
2444 | 2349 | xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); |
---|
2445 | 2350 | } |
---|
2446 | 2351 | |
---|
2447 | | - index = 0; |
---|
2448 | | - lv = log_vector; |
---|
2449 | | - vecp = lv->lv_iovecp; |
---|
| 2352 | + len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); |
---|
| 2353 | + *start_lsn = 0; |
---|
2450 | 2354 | while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { |
---|
2451 | 2355 | void *ptr; |
---|
2452 | 2356 | int log_offset; |
---|
.. | .. |
---|
2470 | 2374 | while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { |
---|
2471 | 2375 | struct xfs_log_iovec *reg; |
---|
2472 | 2376 | struct xlog_op_header *ophdr; |
---|
2473 | | - int start_rec_copy; |
---|
2474 | 2377 | int copy_len; |
---|
2475 | 2378 | int copy_off; |
---|
2476 | 2379 | bool ordered = false; |
---|
.. | .. |
---|
2486 | 2389 | ASSERT(reg->i_len % sizeof(int32_t) == 0); |
---|
2487 | 2390 | ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); |
---|
2488 | 2391 | |
---|
2489 | | - start_rec_copy = xlog_write_start_rec(ptr, ticket); |
---|
2490 | | - if (start_rec_copy) { |
---|
2491 | | - record_cnt++; |
---|
| 2392 | + /* |
---|
| 2393 | + * Before we start formatting log vectors, we need to |
---|
| 2394 | + * write a start record. Only do this for the first |
---|
| 2395 | + * iclog we write to. |
---|
| 2396 | + */ |
---|
| 2397 | + if (need_start_rec) { |
---|
| 2398 | + xlog_write_start_rec(ptr, ticket); |
---|
2492 | 2399 | xlog_write_adv_cnt(&ptr, &len, &log_offset, |
---|
2493 | | - start_rec_copy); |
---|
| 2400 | + sizeof(struct xlog_op_header)); |
---|
2494 | 2401 | } |
---|
2495 | 2402 | |
---|
2496 | 2403 | ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); |
---|
.. | .. |
---|
2522 | 2429 | xlog_write_adv_cnt(&ptr, &len, &log_offset, |
---|
2523 | 2430 | copy_len); |
---|
2524 | 2431 | } |
---|
2525 | | - copy_len += start_rec_copy + sizeof(xlog_op_header_t); |
---|
| 2432 | + copy_len += sizeof(struct xlog_op_header); |
---|
2526 | 2433 | record_cnt++; |
---|
| 2434 | + if (need_start_rec) { |
---|
| 2435 | + copy_len += sizeof(struct xlog_op_header); |
---|
| 2436 | + record_cnt++; |
---|
| 2437 | + need_start_rec = false; |
---|
| 2438 | + } |
---|
2527 | 2439 | data_cnt += contwr ? copy_len : 0; |
---|
2528 | 2440 | |
---|
2529 | 2441 | error = xlog_write_copy_finish(log, iclog, flags, |
---|
.. | .. |
---|
2567 | 2479 | |
---|
2568 | 2480 | ASSERT(len == 0); |
---|
2569 | 2481 | |
---|
| 2482 | + spin_lock(&log->l_icloglock); |
---|
2570 | 2483 | xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); |
---|
2571 | | - if (!commit_iclog) |
---|
2572 | | - return xlog_state_release_iclog(log, iclog); |
---|
| 2484 | + if (commit_iclog) { |
---|
| 2485 | + ASSERT(flags & XLOG_COMMIT_TRANS); |
---|
| 2486 | + *commit_iclog = iclog; |
---|
| 2487 | + } else { |
---|
| 2488 | + error = xlog_state_release_iclog(log, iclog); |
---|
| 2489 | + } |
---|
| 2490 | + spin_unlock(&log->l_icloglock); |
---|
2573 | 2491 | |
---|
2574 | | - ASSERT(flags & XLOG_COMMIT_TRANS); |
---|
2575 | | - *commit_iclog = iclog; |
---|
2576 | | - return 0; |
---|
| 2492 | + return error; |
---|
2577 | 2493 | } |
---|
2578 | 2494 | |
---|
2579 | | - |
---|
2580 | | -/***************************************************************************** |
---|
2581 | | - * |
---|
2582 | | - * State Machine functions |
---|
2583 | | - * |
---|
2584 | | - ***************************************************************************** |
---|
2585 | | - */ |
---|
2586 | | - |
---|
2587 | | -/* Clean iclogs starting from the head. This ordering must be |
---|
2588 | | - * maintained, so an iclog doesn't become ACTIVE beyond one that |
---|
2589 | | - * is SYNCING. This is also required to maintain the notion that we use |
---|
2590 | | - * a ordered wait queue to hold off would be writers to the log when every |
---|
2591 | | - * iclog is trying to sync to disk. |
---|
2592 | | - * |
---|
2593 | | - * State Change: DIRTY -> ACTIVE |
---|
2594 | | - */ |
---|
2595 | | -STATIC void |
---|
2596 | | -xlog_state_clean_log( |
---|
2597 | | - struct xlog *log) |
---|
| 2495 | +static void |
---|
| 2496 | +xlog_state_activate_iclog( |
---|
| 2497 | + struct xlog_in_core *iclog, |
---|
| 2498 | + int *iclogs_changed) |
---|
2598 | 2499 | { |
---|
2599 | | - xlog_in_core_t *iclog; |
---|
2600 | | - int changed = 0; |
---|
| 2500 | + ASSERT(list_empty_careful(&iclog->ic_callbacks)); |
---|
2601 | 2501 | |
---|
2602 | | - iclog = log->l_iclog; |
---|
2603 | | - do { |
---|
2604 | | - if (iclog->ic_state == XLOG_STATE_DIRTY) { |
---|
2605 | | - iclog->ic_state = XLOG_STATE_ACTIVE; |
---|
2606 | | - iclog->ic_offset = 0; |
---|
2607 | | - ASSERT(iclog->ic_callback == NULL); |
---|
2608 | | - /* |
---|
2609 | | - * If the number of ops in this iclog indicate it just |
---|
2610 | | - * contains the dummy transaction, we can |
---|
2611 | | - * change state into IDLE (the second time around). |
---|
2612 | | - * Otherwise we should change the state into |
---|
2613 | | - * NEED a dummy. |
---|
2614 | | - * We don't need to cover the dummy. |
---|
2615 | | - */ |
---|
2616 | | - if (!changed && |
---|
2617 | | - (be32_to_cpu(iclog->ic_header.h_num_logops) == |
---|
2618 | | - XLOG_COVER_OPS)) { |
---|
2619 | | - changed = 1; |
---|
2620 | | - } else { |
---|
2621 | | - /* |
---|
2622 | | - * We have two dirty iclogs so start over |
---|
2623 | | - * This could also be num of ops indicates |
---|
2624 | | - * this is not the dummy going out. |
---|
2625 | | - */ |
---|
2626 | | - changed = 2; |
---|
2627 | | - } |
---|
2628 | | - iclog->ic_header.h_num_logops = 0; |
---|
2629 | | - memset(iclog->ic_header.h_cycle_data, 0, |
---|
2630 | | - sizeof(iclog->ic_header.h_cycle_data)); |
---|
2631 | | - iclog->ic_header.h_lsn = 0; |
---|
2632 | | - } else if (iclog->ic_state == XLOG_STATE_ACTIVE) |
---|
2633 | | - /* do nothing */; |
---|
2634 | | - else |
---|
2635 | | - break; /* stop cleaning */ |
---|
2636 | | - iclog = iclog->ic_next; |
---|
2637 | | - } while (iclog != log->l_iclog); |
---|
2638 | | - |
---|
2639 | | - /* log is locked when we are called */ |
---|
2640 | 2502 | /* |
---|
2641 | | - * Change state for the dummy log recording. |
---|
2642 | | - * We usually go to NEED. But we go to NEED2 if the changed indicates |
---|
2643 | | - * we are done writing the dummy record. |
---|
2644 | | - * If we are done with the second dummy recored (DONE2), then |
---|
2645 | | - * we go to IDLE. |
---|
| 2503 | + * If the number of ops in this iclog indicate it just contains the |
---|
| 2504 | + * dummy transaction, we can change state into IDLE (the second time |
---|
| 2505 | + * around). Otherwise we should change the state into NEED a dummy. |
---|
| 2506 | + * We don't need to cover the dummy. |
---|
2646 | 2507 | */ |
---|
2647 | | - if (changed) { |
---|
2648 | | - switch (log->l_covered_state) { |
---|
2649 | | - case XLOG_STATE_COVER_IDLE: |
---|
2650 | | - case XLOG_STATE_COVER_NEED: |
---|
2651 | | - case XLOG_STATE_COVER_NEED2: |
---|
2652 | | - log->l_covered_state = XLOG_STATE_COVER_NEED; |
---|
2653 | | - break; |
---|
2654 | | - |
---|
2655 | | - case XLOG_STATE_COVER_DONE: |
---|
2656 | | - if (changed == 1) |
---|
2657 | | - log->l_covered_state = XLOG_STATE_COVER_NEED2; |
---|
2658 | | - else |
---|
2659 | | - log->l_covered_state = XLOG_STATE_COVER_NEED; |
---|
2660 | | - break; |
---|
2661 | | - |
---|
2662 | | - case XLOG_STATE_COVER_DONE2: |
---|
2663 | | - if (changed == 1) |
---|
2664 | | - log->l_covered_state = XLOG_STATE_COVER_IDLE; |
---|
2665 | | - else |
---|
2666 | | - log->l_covered_state = XLOG_STATE_COVER_NEED; |
---|
2667 | | - break; |
---|
2668 | | - |
---|
2669 | | - default: |
---|
2670 | | - ASSERT(0); |
---|
2671 | | - } |
---|
| 2508 | + if (*iclogs_changed == 0 && |
---|
| 2509 | + iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { |
---|
| 2510 | + *iclogs_changed = 1; |
---|
| 2511 | + } else { |
---|
| 2512 | + /* |
---|
| 2513 | + * We have two dirty iclogs so start over. This could also be |
---|
| 2514 | + * num of ops indicating this is not the dummy going out. |
---|
| 2515 | + */ |
---|
| 2516 | + *iclogs_changed = 2; |
---|
2672 | 2517 | } |
---|
2673 | | -} /* xlog_state_clean_log */ |
---|
| 2518 | + |
---|
| 2519 | + iclog->ic_state = XLOG_STATE_ACTIVE; |
---|
| 2520 | + iclog->ic_offset = 0; |
---|
| 2521 | + iclog->ic_header.h_num_logops = 0; |
---|
| 2522 | + memset(iclog->ic_header.h_cycle_data, 0, |
---|
| 2523 | + sizeof(iclog->ic_header.h_cycle_data)); |
---|
| 2524 | + iclog->ic_header.h_lsn = 0; |
---|
| 2525 | +} |
---|
| 2526 | + |
---|
| 2527 | +/* |
---|
| 2528 | + * Loop through all iclogs and mark any that are currently DIRTY as |
---|
| 2529 | + * ACTIVE after iclog I/O has completed. |
---|
| 2530 | + */ |
---|
| 2531 | +static void |
---|
| 2532 | +xlog_state_activate_iclogs( |
---|
| 2533 | + struct xlog *log, |
---|
| 2534 | + int *iclogs_changed) |
---|
| 2535 | +{ |
---|
| 2536 | + struct xlog_in_core *iclog = log->l_iclog; |
---|
| 2537 | + |
---|
| 2538 | + do { |
---|
| 2539 | + if (iclog->ic_state == XLOG_STATE_DIRTY) |
---|
| 2540 | + xlog_state_activate_iclog(iclog, iclogs_changed); |
---|
| 2541 | + /* |
---|
| 2542 | + * The ordering of marking iclogs ACTIVE must be maintained, so |
---|
| 2543 | + * an iclog doesn't become ACTIVE beyond one that is SYNCING. |
---|
| 2544 | + */ |
---|
| 2545 | + else if (iclog->ic_state != XLOG_STATE_ACTIVE) |
---|
| 2546 | + break; |
---|
| 2547 | + } while ((iclog = iclog->ic_next) != log->l_iclog); |
---|
| 2548 | +} |
---|
| 2549 | + |
---|
| 2550 | +static int |
---|
| 2551 | +xlog_covered_state( |
---|
| 2552 | + int prev_state, |
---|
| 2553 | + int iclogs_changed) |
---|
| 2554 | +{ |
---|
| 2555 | + /* |
---|
| 2556 | + * We usually go to NEED. But we go to NEED2 if iclogs_changed indicates we |
---|
| 2557 | + * are done writing the dummy record. If we are done with the second |
---|
| 2558 | + * dummy record (DONE2), then we go to IDLE. |
---|
| 2559 | + */ |
---|
| 2560 | + switch (prev_state) { |
---|
| 2561 | + case XLOG_STATE_COVER_IDLE: |
---|
| 2562 | + case XLOG_STATE_COVER_NEED: |
---|
| 2563 | + case XLOG_STATE_COVER_NEED2: |
---|
| 2564 | + break; |
---|
| 2565 | + case XLOG_STATE_COVER_DONE: |
---|
| 2566 | + if (iclogs_changed == 1) |
---|
| 2567 | + return XLOG_STATE_COVER_NEED2; |
---|
| 2568 | + break; |
---|
| 2569 | + case XLOG_STATE_COVER_DONE2: |
---|
| 2570 | + if (iclogs_changed == 1) |
---|
| 2571 | + return XLOG_STATE_COVER_IDLE; |
---|
| 2572 | + break; |
---|
| 2573 | + default: |
---|
| 2574 | + ASSERT(0); |
---|
| 2575 | + } |
---|
| 2576 | + |
---|
| 2577 | + return XLOG_STATE_COVER_NEED; |
---|
| 2578 | +} |
---|
| 2579 | + |
---|
| 2580 | +STATIC void |
---|
| 2581 | +xlog_state_clean_iclog( |
---|
| 2582 | + struct xlog *log, |
---|
| 2583 | + struct xlog_in_core *dirty_iclog) |
---|
| 2584 | +{ |
---|
| 2585 | + int iclogs_changed = 0; |
---|
| 2586 | + |
---|
| 2587 | + dirty_iclog->ic_state = XLOG_STATE_DIRTY; |
---|
| 2588 | + |
---|
| 2589 | + xlog_state_activate_iclogs(log, &iclogs_changed); |
---|
| 2590 | + wake_up_all(&dirty_iclog->ic_force_wait); |
---|
| 2591 | + |
---|
| 2592 | + if (iclogs_changed) { |
---|
| 2593 | + log->l_covered_state = xlog_covered_state(log->l_covered_state, |
---|
| 2594 | + iclogs_changed); |
---|
| 2595 | + } |
---|
| 2596 | +} |
---|
2674 | 2597 | |
---|
2675 | 2598 | STATIC xfs_lsn_t |
---|
2676 | 2599 | xlog_get_lowest_lsn( |
---|
2677 | | - struct xlog *log) |
---|
| 2600 | + struct xlog *log) |
---|
2678 | 2601 | { |
---|
2679 | | - xlog_in_core_t *lsn_log; |
---|
2680 | | - xfs_lsn_t lowest_lsn, lsn; |
---|
| 2602 | + struct xlog_in_core *iclog = log->l_iclog; |
---|
| 2603 | + xfs_lsn_t lowest_lsn = 0, lsn; |
---|
2681 | 2604 | |
---|
2682 | | - lsn_log = log->l_iclog; |
---|
2683 | | - lowest_lsn = 0; |
---|
2684 | 2605 | do { |
---|
2685 | | - if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { |
---|
2686 | | - lsn = be64_to_cpu(lsn_log->ic_header.h_lsn); |
---|
2687 | | - if ((lsn && !lowest_lsn) || |
---|
2688 | | - (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { |
---|
| 2606 | + if (iclog->ic_state == XLOG_STATE_ACTIVE || |
---|
| 2607 | + iclog->ic_state == XLOG_STATE_DIRTY) |
---|
| 2608 | + continue; |
---|
| 2609 | + |
---|
| 2610 | + lsn = be64_to_cpu(iclog->ic_header.h_lsn); |
---|
| 2611 | + if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0) |
---|
2689 | 2612 | lowest_lsn = lsn; |
---|
2690 | | - } |
---|
2691 | | - } |
---|
2692 | | - lsn_log = lsn_log->ic_next; |
---|
2693 | | - } while (lsn_log != log->l_iclog); |
---|
| 2613 | + } while ((iclog = iclog->ic_next) != log->l_iclog); |
---|
| 2614 | + |
---|
2694 | 2615 | return lowest_lsn; |
---|
2695 | 2616 | } |
---|
2696 | 2617 | |
---|
| 2618 | +/* |
---|
| 2619 | + * Completion of an iclog IO does not imply that a transaction has completed, as |
---|
| 2620 | + * transactions can be large enough to span many iclogs. We cannot change the |
---|
| 2621 | + * tail of the log half way through a transaction as this may be the only |
---|
| 2622 | + * transaction in the log and moving the tail to point to the middle of it |
---|
| 2623 | + * will prevent recovery from finding the start of the transaction. Hence we |
---|
| 2624 | + * should only update the last_sync_lsn if this iclog contains transaction |
---|
| 2625 | + * completion callbacks on it. |
---|
| 2626 | + * |
---|
| 2627 | + * We have to do this before we drop the icloglock to ensure we are the only one |
---|
| 2628 | + * that can update it. |
---|
| 2629 | + * |
---|
| 2630 | + * If we are moving the last_sync_lsn forwards, we also need to ensure we kick |
---|
| 2631 | + * the reservation grant head pushing. This is due to the fact that the push |
---|
| 2632 | + * target is bound by the current last_sync_lsn value. Hence if we have a large |
---|
| 2633 | + * amount of log space bound up in this committing transaction then the |
---|
| 2634 | + * last_sync_lsn value may be the limiting factor preventing tail pushing from |
---|
| 2635 | + * freeing space in the log. Hence once we've updated the last_sync_lsn we |
---|
| 2636 | + * should push the AIL to ensure the push target (and hence the grant head) is |
---|
| 2637 | + * no longer bound by the old log head location and can move forwards and make |
---|
| 2638 | + * progress again. |
---|
| 2639 | + */ |
---|
| 2640 | +static void |
---|
| 2641 | +xlog_state_set_callback( |
---|
| 2642 | + struct xlog *log, |
---|
| 2643 | + struct xlog_in_core *iclog, |
---|
| 2644 | + xfs_lsn_t header_lsn) |
---|
| 2645 | +{ |
---|
| 2646 | + iclog->ic_state = XLOG_STATE_CALLBACK; |
---|
| 2647 | + |
---|
| 2648 | + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), |
---|
| 2649 | + header_lsn) <= 0); |
---|
| 2650 | + |
---|
| 2651 | + if (list_empty_careful(&iclog->ic_callbacks)) |
---|
| 2652 | + return; |
---|
| 2653 | + |
---|
| 2654 | + atomic64_set(&log->l_last_sync_lsn, header_lsn); |
---|
| 2655 | + xlog_grant_push_ail(log, 0); |
---|
| 2656 | +} |
---|
| 2657 | + |
---|
| 2658 | +/* |
---|
| 2659 | + * Return true if we need to stop processing, false to continue to the next |
---|
| 2660 | + * iclog. The caller will need to run callbacks if the iclog is returned in the |
---|
| 2661 | + * XLOG_STATE_CALLBACK state. |
---|
| 2662 | + */ |
---|
| 2663 | +static bool |
---|
| 2664 | +xlog_state_iodone_process_iclog( |
---|
| 2665 | + struct xlog *log, |
---|
| 2666 | + struct xlog_in_core *iclog, |
---|
| 2667 | + bool *ioerror) |
---|
| 2668 | +{ |
---|
| 2669 | + xfs_lsn_t lowest_lsn; |
---|
| 2670 | + xfs_lsn_t header_lsn; |
---|
| 2671 | + |
---|
| 2672 | + switch (iclog->ic_state) { |
---|
| 2673 | + case XLOG_STATE_ACTIVE: |
---|
| 2674 | + case XLOG_STATE_DIRTY: |
---|
| 2675 | + /* |
---|
| 2676 | + * Skip all iclogs in the ACTIVE & DIRTY states: |
---|
| 2677 | + */ |
---|
| 2678 | + return false; |
---|
| 2679 | + case XLOG_STATE_IOERROR: |
---|
| 2680 | + /* |
---|
| 2681 | + * Between marking a filesystem SHUTDOWN and stopping the log, |
---|
| 2682 | + * we do flush all iclogs to disk (if there wasn't a log I/O |
---|
| 2683 | + * error). So, we do want things to go smoothly in case of just |
---|
| 2684 | + * a SHUTDOWN w/o a LOG_IO_ERROR. |
---|
| 2685 | + */ |
---|
| 2686 | + *ioerror = true; |
---|
| 2687 | + return false; |
---|
| 2688 | + case XLOG_STATE_DONE_SYNC: |
---|
| 2689 | + /* |
---|
| 2690 | + * Now that we have an iclog that is in the DONE_SYNC state, do |
---|
| 2691 | + * one more check here to see if we have chased our tail around. |
---|
| 2692 | + * If this is not the lowest lsn iclog, then we will leave it |
---|
| 2693 | + * for another completion to process. |
---|
| 2694 | + */ |
---|
| 2695 | + header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); |
---|
| 2696 | + lowest_lsn = xlog_get_lowest_lsn(log); |
---|
| 2697 | + if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) |
---|
| 2698 | + return false; |
---|
| 2699 | + xlog_state_set_callback(log, iclog, header_lsn); |
---|
| 2700 | + return false; |
---|
| 2701 | + default: |
---|
| 2702 | + /* |
---|
| 2703 | + * Can only perform callbacks in order. Since this iclog is not |
---|
| 2704 | + * in the DONE_SYNC state, we skip the rest and just try to |
---|
| 2705 | + * clean up. |
---|
| 2706 | + */ |
---|
| 2707 | + return true; |
---|
| 2708 | + } |
---|
| 2709 | +} |
---|
| 2710 | + |
---|
| 2711 | +/* |
---|
| 2712 | + * Keep processing entries in the iclog callback list until we come around and |
---|
| 2713 | + * it is empty. We need to atomically see that the list is empty and change the |
---|
| 2714 | + * state to DIRTY so that we don't miss any more callbacks being added. |
---|
| 2715 | + * |
---|
| 2716 | + * This function is called with the icloglock held and returns with it held. We |
---|
| 2717 | + * drop it while running callbacks, however, as holding it over thousands of |
---|
| 2718 | + * callbacks is unnecessary and causes excessive contention if we do. |
---|
| 2719 | + */ |
---|
| 2720 | +static void |
---|
| 2721 | +xlog_state_do_iclog_callbacks( |
---|
| 2722 | + struct xlog *log, |
---|
| 2723 | + struct xlog_in_core *iclog) |
---|
| 2724 | + __releases(&log->l_icloglock) |
---|
| 2725 | + __acquires(&log->l_icloglock) |
---|
| 2726 | +{ |
---|
| 2727 | + spin_unlock(&log->l_icloglock); |
---|
| 2728 | + spin_lock(&iclog->ic_callback_lock); |
---|
| 2729 | + while (!list_empty(&iclog->ic_callbacks)) { |
---|
| 2730 | + LIST_HEAD(tmp); |
---|
| 2731 | + |
---|
| 2732 | + list_splice_init(&iclog->ic_callbacks, &tmp); |
---|
| 2733 | + |
---|
| 2734 | + spin_unlock(&iclog->ic_callback_lock); |
---|
| 2735 | + xlog_cil_process_committed(&tmp); |
---|
| 2736 | + spin_lock(&iclog->ic_callback_lock); |
---|
| 2737 | + } |
---|
| 2738 | + |
---|
| 2739 | + /* |
---|
| 2740 | + * Pick up the icloglock while still holding the callback lock so we |
---|
| 2741 | + * serialise against anyone trying to add more callbacks to this iclog |
---|
| 2742 | + * now we've finished processing. |
---|
| 2743 | + */ |
---|
| 2744 | + spin_lock(&log->l_icloglock); |
---|
| 2745 | + spin_unlock(&iclog->ic_callback_lock); |
---|
| 2746 | +} |
---|
2697 | 2747 | |
---|
2698 | 2748 | STATIC void |
---|
2699 | 2749 | xlog_state_do_callback( |
---|
2700 | | - struct xlog *log, |
---|
2701 | | - int aborted, |
---|
2702 | | - struct xlog_in_core *ciclog) |
---|
| 2750 | + struct xlog *log) |
---|
2703 | 2751 | { |
---|
2704 | | - xlog_in_core_t *iclog; |
---|
2705 | | - xlog_in_core_t *first_iclog; /* used to know when we've |
---|
2706 | | - * processed all iclogs once */ |
---|
2707 | | - xfs_log_callback_t *cb, *cb_next; |
---|
2708 | | - int flushcnt = 0; |
---|
2709 | | - xfs_lsn_t lowest_lsn; |
---|
2710 | | - int ioerrors; /* counter: iclogs with errors */ |
---|
2711 | | - int loopdidcallbacks; /* flag: inner loop did callbacks*/ |
---|
2712 | | - int funcdidcallbacks; /* flag: function did callbacks */ |
---|
2713 | | - int repeats; /* for issuing console warnings if |
---|
2714 | | - * looping too many times */ |
---|
| 2752 | + struct xlog_in_core *iclog; |
---|
| 2753 | + struct xlog_in_core *first_iclog; |
---|
| 2754 | + bool cycled_icloglock; |
---|
| 2755 | + bool ioerror; |
---|
| 2756 | + int flushcnt = 0; |
---|
| 2757 | + int repeats = 0; |
---|
2715 | 2758 | |
---|
2716 | 2759 | spin_lock(&log->l_icloglock); |
---|
2717 | | - first_iclog = iclog = log->l_iclog; |
---|
2718 | | - ioerrors = 0; |
---|
2719 | | - funcdidcallbacks = 0; |
---|
2720 | | - repeats = 0; |
---|
2721 | | - |
---|
2722 | 2760 | do { |
---|
2723 | 2761 | /* |
---|
2724 | 2762 | * Scan all iclogs starting with the one pointed to by the |
---|
.. | .. |
---|
2730 | 2768 | */ |
---|
2731 | 2769 | first_iclog = log->l_iclog; |
---|
2732 | 2770 | iclog = log->l_iclog; |
---|
2733 | | - loopdidcallbacks = 0; |
---|
| 2771 | + cycled_icloglock = false; |
---|
| 2772 | + ioerror = false; |
---|
2734 | 2773 | repeats++; |
---|
2735 | 2774 | |
---|
2736 | 2775 | do { |
---|
| 2776 | + if (xlog_state_iodone_process_iclog(log, iclog, |
---|
| 2777 | + &ioerror)) |
---|
| 2778 | + break; |
---|
2737 | 2779 | |
---|
2738 | | - /* skip all iclogs in the ACTIVE & DIRTY states */ |
---|
2739 | | - if (iclog->ic_state & |
---|
2740 | | - (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { |
---|
| 2780 | + if (iclog->ic_state != XLOG_STATE_CALLBACK && |
---|
| 2781 | + iclog->ic_state != XLOG_STATE_IOERROR) { |
---|
2741 | 2782 | iclog = iclog->ic_next; |
---|
2742 | 2783 | continue; |
---|
2743 | 2784 | } |
---|
2744 | 2785 | |
---|
2745 | 2786 | /* |
---|
2746 | | - * Between marking a filesystem SHUTDOWN and stopping |
---|
2747 | | - * the log, we do flush all iclogs to disk (if there |
---|
2748 | | - * wasn't a log I/O error). So, we do want things to |
---|
2749 | | - * go smoothly in case of just a SHUTDOWN w/o a |
---|
2750 | | - * LOG_IO_ERROR. |
---|
| 2787 | + * Running callbacks will drop the icloglock which means |
---|
| 2788 | + * we'll have to run at least one more complete loop. |
---|
2751 | 2789 | */ |
---|
2752 | | - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { |
---|
2753 | | - /* |
---|
2754 | | - * Can only perform callbacks in order. Since |
---|
2755 | | - * this iclog is not in the DONE_SYNC/ |
---|
2756 | | - * DO_CALLBACK state, we skip the rest and |
---|
2757 | | - * just try to clean up. If we set our iclog |
---|
2758 | | - * to DO_CALLBACK, we will not process it when |
---|
2759 | | - * we retry since a previous iclog is in the |
---|
2760 | | - * CALLBACK and the state cannot change since |
---|
2761 | | - * we are holding the l_icloglock. |
---|
2762 | | - */ |
---|
2763 | | - if (!(iclog->ic_state & |
---|
2764 | | - (XLOG_STATE_DONE_SYNC | |
---|
2765 | | - XLOG_STATE_DO_CALLBACK))) { |
---|
2766 | | - if (ciclog && (ciclog->ic_state == |
---|
2767 | | - XLOG_STATE_DONE_SYNC)) { |
---|
2768 | | - ciclog->ic_state = XLOG_STATE_DO_CALLBACK; |
---|
2769 | | - } |
---|
2770 | | - break; |
---|
2771 | | - } |
---|
2772 | | - /* |
---|
2773 | | - * We now have an iclog that is in either the |
---|
2774 | | - * DO_CALLBACK or DONE_SYNC states. The other |
---|
2775 | | - * states (WANT_SYNC, SYNCING, or CALLBACK were |
---|
2776 | | - * caught by the above if and are going to |
---|
2777 | | - * clean (i.e. we aren't doing their callbacks) |
---|
2778 | | - * see the above if. |
---|
2779 | | - */ |
---|
2780 | | - |
---|
2781 | | - /* |
---|
2782 | | - * We will do one more check here to see if we |
---|
2783 | | - * have chased our tail around. |
---|
2784 | | - */ |
---|
2785 | | - |
---|
2786 | | - lowest_lsn = xlog_get_lowest_lsn(log); |
---|
2787 | | - if (lowest_lsn && |
---|
2788 | | - XFS_LSN_CMP(lowest_lsn, |
---|
2789 | | - be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { |
---|
2790 | | - iclog = iclog->ic_next; |
---|
2791 | | - continue; /* Leave this iclog for |
---|
2792 | | - * another thread */ |
---|
2793 | | - } |
---|
2794 | | - |
---|
2795 | | - iclog->ic_state = XLOG_STATE_CALLBACK; |
---|
2796 | | - |
---|
2797 | | - |
---|
2798 | | - /* |
---|
2799 | | - * Completion of a iclog IO does not imply that |
---|
2800 | | - * a transaction has completed, as transactions |
---|
2801 | | - * can be large enough to span many iclogs. We |
---|
2802 | | - * cannot change the tail of the log half way |
---|
2803 | | - * through a transaction as this may be the only |
---|
2804 | | - * transaction in the log and moving th etail to |
---|
2805 | | - * point to the middle of it will prevent |
---|
2806 | | - * recovery from finding the start of the |
---|
2807 | | - * transaction. Hence we should only update the |
---|
2808 | | - * last_sync_lsn if this iclog contains |
---|
2809 | | - * transaction completion callbacks on it. |
---|
2810 | | - * |
---|
2811 | | - * We have to do this before we drop the |
---|
2812 | | - * icloglock to ensure we are the only one that |
---|
2813 | | - * can update it. |
---|
2814 | | - */ |
---|
2815 | | - ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), |
---|
2816 | | - be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); |
---|
2817 | | - if (iclog->ic_callback) |
---|
2818 | | - atomic64_set(&log->l_last_sync_lsn, |
---|
2819 | | - be64_to_cpu(iclog->ic_header.h_lsn)); |
---|
2820 | | - |
---|
2821 | | - } else |
---|
2822 | | - ioerrors++; |
---|
2823 | | - |
---|
2824 | | - spin_unlock(&log->l_icloglock); |
---|
2825 | | - |
---|
2826 | | - /* |
---|
2827 | | - * Keep processing entries in the callback list until |
---|
2828 | | - * we come around and it is empty. We need to |
---|
2829 | | - * atomically see that the list is empty and change the |
---|
2830 | | - * state to DIRTY so that we don't miss any more |
---|
2831 | | - * callbacks being added. |
---|
2832 | | - */ |
---|
2833 | | - spin_lock(&iclog->ic_callback_lock); |
---|
2834 | | - cb = iclog->ic_callback; |
---|
2835 | | - while (cb) { |
---|
2836 | | - iclog->ic_callback_tail = &(iclog->ic_callback); |
---|
2837 | | - iclog->ic_callback = NULL; |
---|
2838 | | - spin_unlock(&iclog->ic_callback_lock); |
---|
2839 | | - |
---|
2840 | | - /* perform callbacks in the order given */ |
---|
2841 | | - for (; cb; cb = cb_next) { |
---|
2842 | | - cb_next = cb->cb_next; |
---|
2843 | | - cb->cb_func(cb->cb_arg, aborted); |
---|
2844 | | - } |
---|
2845 | | - spin_lock(&iclog->ic_callback_lock); |
---|
2846 | | - cb = iclog->ic_callback; |
---|
2847 | | - } |
---|
2848 | | - |
---|
2849 | | - loopdidcallbacks++; |
---|
2850 | | - funcdidcallbacks++; |
---|
2851 | | - |
---|
2852 | | - spin_lock(&log->l_icloglock); |
---|
2853 | | - ASSERT(iclog->ic_callback == NULL); |
---|
2854 | | - spin_unlock(&iclog->ic_callback_lock); |
---|
2855 | | - if (!(iclog->ic_state & XLOG_STATE_IOERROR)) |
---|
2856 | | - iclog->ic_state = XLOG_STATE_DIRTY; |
---|
2857 | | - |
---|
2858 | | - /* |
---|
2859 | | - * Transition from DIRTY to ACTIVE if applicable. |
---|
2860 | | - * NOP if STATE_IOERROR. |
---|
2861 | | - */ |
---|
2862 | | - xlog_state_clean_log(log); |
---|
2863 | | - |
---|
2864 | | - /* wake up threads waiting in xfs_log_force() */ |
---|
2865 | | - wake_up_all(&iclog->ic_force_wait); |
---|
2866 | | - |
---|
| 2790 | + cycled_icloglock = true; |
---|
| 2791 | + xlog_state_do_iclog_callbacks(log, iclog); |
---|
| 2792 | + if (XLOG_FORCED_SHUTDOWN(log)) |
---|
| 2793 | + wake_up_all(&iclog->ic_force_wait); |
---|
| 2794 | + else |
---|
| 2795 | + xlog_state_clean_iclog(log, iclog); |
---|
2867 | 2796 | iclog = iclog->ic_next; |
---|
2868 | 2797 | } while (first_iclog != iclog); |
---|
2869 | 2798 | |
---|
.. | .. |
---|
2874 | 2803 | "%s: possible infinite loop (%d iterations)", |
---|
2875 | 2804 | __func__, flushcnt); |
---|
2876 | 2805 | } |
---|
2877 | | - } while (!ioerrors && loopdidcallbacks); |
---|
| 2806 | + } while (!ioerror && cycled_icloglock); |
---|
2878 | 2807 | |
---|
2879 | | -#ifdef DEBUG |
---|
2880 | | - /* |
---|
2881 | | - * Make one last gasp attempt to see if iclogs are being left in limbo. |
---|
2882 | | - * If the above loop finds an iclog earlier than the current iclog and |
---|
2883 | | - * in one of the syncing states, the current iclog is put into |
---|
2884 | | - * DO_CALLBACK and the callbacks are deferred to the completion of the |
---|
2885 | | - * earlier iclog. Walk the iclogs in order and make sure that no iclog |
---|
2886 | | - * is in DO_CALLBACK unless an earlier iclog is in one of the syncing |
---|
2887 | | - * states. |
---|
2888 | | - * |
---|
2889 | | - * Note that SYNCING|IOABORT is a valid state so we cannot just check |
---|
2890 | | - * for ic_state == SYNCING. |
---|
2891 | | - */ |
---|
2892 | | - if (funcdidcallbacks) { |
---|
2893 | | - first_iclog = iclog = log->l_iclog; |
---|
2894 | | - do { |
---|
2895 | | - ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); |
---|
2896 | | - /* |
---|
2897 | | - * Terminate the loop if iclogs are found in states |
---|
2898 | | - * which will cause other threads to clean up iclogs. |
---|
2899 | | - * |
---|
2900 | | - * SYNCING - i/o completion will go through logs |
---|
2901 | | - * DONE_SYNC - interrupt thread should be waiting for |
---|
2902 | | - * l_icloglock |
---|
2903 | | - * IOERROR - give up hope all ye who enter here |
---|
2904 | | - */ |
---|
2905 | | - if (iclog->ic_state == XLOG_STATE_WANT_SYNC || |
---|
2906 | | - iclog->ic_state & XLOG_STATE_SYNCING || |
---|
2907 | | - iclog->ic_state == XLOG_STATE_DONE_SYNC || |
---|
2908 | | - iclog->ic_state == XLOG_STATE_IOERROR ) |
---|
2909 | | - break; |
---|
2910 | | - iclog = iclog->ic_next; |
---|
2911 | | - } while (first_iclog != iclog); |
---|
2912 | | - } |
---|
2913 | | -#endif |
---|
2914 | | - |
---|
2915 | | - if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) |
---|
| 2808 | + if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE || |
---|
| 2809 | + log->l_iclog->ic_state == XLOG_STATE_IOERROR) |
---|
2916 | 2810 | wake_up_all(&log->l_flush_wait); |
---|
2917 | 2811 | |
---|
2918 | 2812 | spin_unlock(&log->l_icloglock); |
---|
.. | .. |
---|
2934 | 2828 | */ |
---|
2935 | 2829 | STATIC void |
---|
2936 | 2830 | xlog_state_done_syncing( |
---|
2937 | | - xlog_in_core_t *iclog, |
---|
2938 | | - int aborted) |
---|
| 2831 | + struct xlog_in_core *iclog) |
---|
2939 | 2832 | { |
---|
2940 | | - struct xlog *log = iclog->ic_log; |
---|
| 2833 | + struct xlog *log = iclog->ic_log; |
---|
2941 | 2834 | |
---|
2942 | 2835 | spin_lock(&log->l_icloglock); |
---|
2943 | | - |
---|
2944 | | - ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || |
---|
2945 | | - iclog->ic_state == XLOG_STATE_IOERROR); |
---|
2946 | 2836 | ASSERT(atomic_read(&iclog->ic_refcnt) == 0); |
---|
2947 | | - ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); |
---|
2948 | | - |
---|
2949 | 2837 | |
---|
2950 | 2838 | /* |
---|
2951 | 2839 | * If we got an error, either on the first buffer, or in the case of |
---|
2952 | | - * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, |
---|
2953 | | - * and none should ever be attempted to be written to disk |
---|
2954 | | - * again. |
---|
| 2840 | + * split log writes, on the second, we shut down the file system and |
---|
| 2841 | + * no iclogs should ever be attempted to be written to disk again. |
---|
2955 | 2842 | */ |
---|
2956 | | - if (iclog->ic_state != XLOG_STATE_IOERROR) { |
---|
2957 | | - if (--iclog->ic_bwritecnt == 1) { |
---|
2958 | | - spin_unlock(&log->l_icloglock); |
---|
2959 | | - return; |
---|
2960 | | - } |
---|
| 2843 | + if (!XLOG_FORCED_SHUTDOWN(log)) { |
---|
| 2844 | + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); |
---|
2961 | 2845 | iclog->ic_state = XLOG_STATE_DONE_SYNC; |
---|
2962 | 2846 | } |
---|
2963 | 2847 | |
---|
.. | .. |
---|
2968 | 2852 | */ |
---|
2969 | 2853 | wake_up_all(&iclog->ic_write_wait); |
---|
2970 | 2854 | spin_unlock(&log->l_icloglock); |
---|
2971 | | - xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ |
---|
2972 | | -} /* xlog_state_done_syncing */ |
---|
2973 | | - |
---|
| 2855 | + xlog_state_do_callback(log); |
---|
| 2856 | +} |
---|
2974 | 2857 | |
---|
2975 | 2858 | /* |
---|
2976 | 2859 | * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must |
---|
.. | .. |
---|
3002 | 2885 | int log_offset; |
---|
3003 | 2886 | xlog_rec_header_t *head; |
---|
3004 | 2887 | xlog_in_core_t *iclog; |
---|
3005 | | - int error; |
---|
3006 | 2888 | |
---|
3007 | 2889 | restart: |
---|
3008 | 2890 | spin_lock(&log->l_icloglock); |
---|
.. | .. |
---|
3051 | 2933 | * can fit into remaining data section. |
---|
3052 | 2934 | */ |
---|
3053 | 2935 | if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { |
---|
| 2936 | + int error = 0; |
---|
| 2937 | + |
---|
3054 | 2938 | xlog_state_switch_iclogs(log, iclog, iclog->ic_size); |
---|
3055 | 2939 | |
---|
3056 | 2940 | /* |
---|
3057 | | - * If I'm the only one writing to this iclog, sync it to disk. |
---|
3058 | | - * We need to do an atomic compare and decrement here to avoid |
---|
3059 | | - * racing with concurrent atomic_dec_and_lock() calls in |
---|
| 2941 | + * If we are the only one writing to this iclog, sync it to |
---|
| 2942 | + * disk. We need to do an atomic compare and decrement here to |
---|
| 2943 | + * avoid racing with concurrent atomic_dec_and_lock() calls in |
---|
3060 | 2944 | * xlog_state_release_iclog() when there is more than one |
---|
3061 | 2945 | * reference to the iclog. |
---|
3062 | 2946 | */ |
---|
3063 | | - if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { |
---|
3064 | | - /* we are the only one */ |
---|
3065 | | - spin_unlock(&log->l_icloglock); |
---|
| 2947 | + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) |
---|
3066 | 2948 | error = xlog_state_release_iclog(log, iclog); |
---|
3067 | | - if (error) |
---|
3068 | | - return error; |
---|
3069 | | - } else { |
---|
3070 | | - spin_unlock(&log->l_icloglock); |
---|
3071 | | - } |
---|
| 2949 | + spin_unlock(&log->l_icloglock); |
---|
| 2950 | + if (error) |
---|
| 2951 | + return error; |
---|
3072 | 2952 | goto restart; |
---|
3073 | 2953 | } |
---|
3074 | 2954 | |
---|
.. | .. |
---|
3092 | 2972 | |
---|
3093 | 2973 | *logoffsetp = log_offset; |
---|
3094 | 2974 | return 0; |
---|
3095 | | -} /* xlog_state_get_iclog_space */ |
---|
| 2975 | +} |
---|
3096 | 2976 | |
---|
3097 | | -/* The first cnt-1 times through here we don't need to |
---|
3098 | | - * move the grant write head because the permanent |
---|
3099 | | - * reservation has reserved cnt times the unit amount. |
---|
3100 | | - * Release part of current permanent unit reservation and |
---|
3101 | | - * reset current reservation to be one units worth. Also |
---|
3102 | | - * move grant reservation head forward. |
---|
| 2977 | +/* |
---|
| 2978 | + * The first cnt-1 times a ticket goes through here we don't need to move the |
---|
| 2979 | + * grant write head because the permanent reservation has reserved cnt times the |
---|
| 2980 | + * unit amount. Release part of the current permanent unit reservation and reset |
---|
| 2981 | + * the current reservation to be one unit's worth. Also move grant reservation head |
---|
| 2982 | + * forward. |
---|
3103 | 2983 | */ |
---|
3104 | | -STATIC void |
---|
3105 | | -xlog_regrant_reserve_log_space( |
---|
| 2984 | +void |
---|
| 2985 | +xfs_log_ticket_regrant( |
---|
3106 | 2986 | struct xlog *log, |
---|
3107 | 2987 | struct xlog_ticket *ticket) |
---|
3108 | 2988 | { |
---|
3109 | | - trace_xfs_log_regrant_reserve_enter(log, ticket); |
---|
| 2989 | + trace_xfs_log_ticket_regrant(log, ticket); |
---|
3110 | 2990 | |
---|
3111 | 2991 | if (ticket->t_cnt > 0) |
---|
3112 | 2992 | ticket->t_cnt--; |
---|
.. | .. |
---|
3118 | 2998 | ticket->t_curr_res = ticket->t_unit_res; |
---|
3119 | 2999 | xlog_tic_reset_res(ticket); |
---|
3120 | 3000 | |
---|
3121 | | - trace_xfs_log_regrant_reserve_sub(log, ticket); |
---|
| 3001 | + trace_xfs_log_ticket_regrant_sub(log, ticket); |
---|
3122 | 3002 | |
---|
3123 | 3003 | /* just return if we still have some of the pre-reserved space */ |
---|
3124 | | - if (ticket->t_cnt > 0) |
---|
3125 | | - return; |
---|
| 3004 | + if (!ticket->t_cnt) { |
---|
| 3005 | + xlog_grant_add_space(log, &log->l_reserve_head.grant, |
---|
| 3006 | + ticket->t_unit_res); |
---|
| 3007 | + trace_xfs_log_ticket_regrant_exit(log, ticket); |
---|
3126 | 3008 | |
---|
3127 | | - xlog_grant_add_space(log, &log->l_reserve_head.grant, |
---|
3128 | | - ticket->t_unit_res); |
---|
| 3009 | + ticket->t_curr_res = ticket->t_unit_res; |
---|
| 3010 | + xlog_tic_reset_res(ticket); |
---|
| 3011 | + } |
---|
3129 | 3012 | |
---|
3130 | | - trace_xfs_log_regrant_reserve_exit(log, ticket); |
---|
3131 | | - |
---|
3132 | | - ticket->t_curr_res = ticket->t_unit_res; |
---|
3133 | | - xlog_tic_reset_res(ticket); |
---|
3134 | | -} /* xlog_regrant_reserve_log_space */ |
---|
3135 | | - |
---|
| 3013 | + xfs_log_ticket_put(ticket); |
---|
| 3014 | +} |
---|
3136 | 3015 | |
---|
3137 | 3016 | /* |
---|
3138 | 3017 | * Give back the space left from a reservation. |
---|
.. | .. |
---|
3148 | 3027 | * space, the count will stay at zero and the only space remaining will be |
---|
3149 | 3028 | * in the current reservation field. |
---|
3150 | 3029 | */ |
---|
3151 | | -STATIC void |
---|
3152 | | -xlog_ungrant_log_space( |
---|
| 3030 | +void |
---|
| 3031 | +xfs_log_ticket_ungrant( |
---|
3153 | 3032 | struct xlog *log, |
---|
3154 | 3033 | struct xlog_ticket *ticket) |
---|
3155 | 3034 | { |
---|
3156 | | - int bytes; |
---|
| 3035 | + int bytes; |
---|
| 3036 | + |
---|
| 3037 | + trace_xfs_log_ticket_ungrant(log, ticket); |
---|
3157 | 3038 | |
---|
3158 | 3039 | if (ticket->t_cnt > 0) |
---|
3159 | 3040 | ticket->t_cnt--; |
---|
3160 | 3041 | |
---|
3161 | | - trace_xfs_log_ungrant_enter(log, ticket); |
---|
3162 | | - trace_xfs_log_ungrant_sub(log, ticket); |
---|
| 3042 | + trace_xfs_log_ticket_ungrant_sub(log, ticket); |
---|
3163 | 3043 | |
---|
3164 | 3044 | /* |
---|
3165 | 3045 | * If this is a permanent reservation ticket, we may be able to free |
---|
.. | .. |
---|
3174 | 3054 | xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); |
---|
3175 | 3055 | xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); |
---|
3176 | 3056 | |
---|
3177 | | - trace_xfs_log_ungrant_exit(log, ticket); |
---|
| 3057 | + trace_xfs_log_ticket_ungrant_exit(log, ticket); |
---|
3178 | 3058 | |
---|
3179 | 3059 | xfs_log_space_wake(log->l_mp); |
---|
| 3060 | + xfs_log_ticket_put(ticket); |
---|
3180 | 3061 | } |
---|
3181 | 3062 | |
---|
3182 | 3063 | /* |
---|
3183 | | - * Flush iclog to disk if this is the last reference to the given iclog and |
---|
3184 | | - * the WANT_SYNC bit is set. |
---|
3185 | | - * |
---|
3186 | | - * When this function is entered, the iclog is not necessarily in the |
---|
3187 | | - * WANT_SYNC state. It may be sitting around waiting to get filled. |
---|
3188 | | - * |
---|
3189 | | - * |
---|
3190 | | - */ |
---|
3191 | | -STATIC int |
---|
3192 | | -xlog_state_release_iclog( |
---|
3193 | | - struct xlog *log, |
---|
3194 | | - struct xlog_in_core *iclog) |
---|
3195 | | -{ |
---|
3196 | | - int sync = 0; /* do we sync? */ |
---|
3197 | | - |
---|
3198 | | - if (iclog->ic_state & XLOG_STATE_IOERROR) |
---|
3199 | | - return -EIO; |
---|
3200 | | - |
---|
3201 | | - ASSERT(atomic_read(&iclog->ic_refcnt) > 0); |
---|
3202 | | - if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) |
---|
3203 | | - return 0; |
---|
3204 | | - |
---|
3205 | | - if (iclog->ic_state & XLOG_STATE_IOERROR) { |
---|
3206 | | - spin_unlock(&log->l_icloglock); |
---|
3207 | | - return -EIO; |
---|
3208 | | - } |
---|
3209 | | - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || |
---|
3210 | | - iclog->ic_state == XLOG_STATE_WANT_SYNC); |
---|
3211 | | - |
---|
3212 | | - if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { |
---|
3213 | | - /* update tail before writing to iclog */ |
---|
3214 | | - xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); |
---|
3215 | | - sync++; |
---|
3216 | | - iclog->ic_state = XLOG_STATE_SYNCING; |
---|
3217 | | - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); |
---|
3218 | | - xlog_verify_tail_lsn(log, iclog, tail_lsn); |
---|
3219 | | - /* cycle incremented when incrementing curr_block */ |
---|
3220 | | - } |
---|
3221 | | - spin_unlock(&log->l_icloglock); |
---|
3222 | | - |
---|
3223 | | - /* |
---|
3224 | | - * We let the log lock go, so it's possible that we hit a log I/O |
---|
3225 | | - * error or some other SHUTDOWN condition that marks the iclog |
---|
3226 | | - * as XLOG_STATE_IOERROR before the bwrite. However, we know that |
---|
3227 | | - * this iclog has consistent data, so we ignore IOERROR |
---|
3228 | | - * flags after this point. |
---|
3229 | | - */ |
---|
3230 | | - if (sync) |
---|
3231 | | - return xlog_sync(log, iclog); |
---|
3232 | | - return 0; |
---|
3233 | | -} /* xlog_state_release_iclog */ |
---|
3234 | | - |
---|
3235 | | - |
---|
3236 | | -/* |
---|
3237 | | - * This routine will mark the current iclog in the ring as WANT_SYNC |
---|
3238 | | - * and move the current iclog pointer to the next iclog in the ring. |
---|
3239 | | - * When this routine is called from xlog_state_get_iclog_space(), the |
---|
3240 | | - * exact size of the iclog has not yet been determined. All we know is |
---|
3241 | | - * that every data block. We have run out of space in this log record. |
---|
| 3064 | + * This routine will mark the current iclog in the ring as WANT_SYNC and move |
---|
| 3065 | + * the current iclog pointer to the next iclog in the ring. |
---|
3242 | 3066 | */ |
---|
3243 | 3067 | STATIC void |
---|
3244 | 3068 | xlog_state_switch_iclogs( |
---|
.. | .. |
---|
3247 | 3071 | int eventual_size) |
---|
3248 | 3072 | { |
---|
3249 | 3073 | ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); |
---|
| 3074 | + assert_spin_locked(&log->l_icloglock); |
---|
| 3075 | + |
---|
3250 | 3076 | if (!eventual_size) |
---|
3251 | 3077 | eventual_size = iclog->ic_offset; |
---|
3252 | 3078 | iclog->ic_state = XLOG_STATE_WANT_SYNC; |
---|
.. | .. |
---|
3281 | 3107 | } |
---|
3282 | 3108 | ASSERT(iclog == log->l_iclog); |
---|
3283 | 3109 | log->l_iclog = iclog->ic_next; |
---|
3284 | | -} /* xlog_state_switch_iclogs */ |
---|
| 3110 | +} |
---|
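As the rewritten comment says, switching iclogs only marks the current in-core log `WANT_SYNC` and advances the ring pointer; nothing is written at this point, and the added `assert_spin_locked()` makes explicit that the whole operation runs under `l_icloglock`. A toy model of the rotation (plain C with made-up names, not the kernel structures):

```c
/* Toy iclog ring: a circular singly linked list with a "current" pointer. */
enum toy_state { TOY_ACTIVE, TOY_WANT_SYNC };

struct toy_iclog {
	enum toy_state		state;
	struct toy_iclog	*next;	/* last element points back to the first */
};

/* Mark the current iclog for syncing and hand back the next one to fill. */
static struct toy_iclog *toy_switch_iclogs(struct toy_iclog *cur)
{
	cur->state = TOY_WANT_SYNC;
	return cur->next;
}
```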
3285 | 3111 | |
---|
3286 | 3112 | /* |
---|
3287 | 3113 | * Write out all data in the in-core log as of this exact moment in time. |
---|
.. | .. |
---|
3326 | 3152 | |
---|
3327 | 3153 | spin_lock(&log->l_icloglock); |
---|
3328 | 3154 | iclog = log->l_iclog; |
---|
3329 | | - if (iclog->ic_state & XLOG_STATE_IOERROR) |
---|
| 3155 | + if (iclog->ic_state == XLOG_STATE_IOERROR) |
---|
3330 | 3156 | goto out_error; |
---|
3331 | 3157 | |
---|
3332 | 3158 | if (iclog->ic_state == XLOG_STATE_DIRTY || |
---|
.. | .. |
---|
3341 | 3167 | * previous iclog and go to sleep. |
---|
3342 | 3168 | */ |
---|
3343 | 3169 | iclog = iclog->ic_prev; |
---|
3344 | | - if (iclog->ic_state == XLOG_STATE_ACTIVE || |
---|
3345 | | - iclog->ic_state == XLOG_STATE_DIRTY) |
---|
3346 | | - goto out_unlock; |
---|
3347 | 3170 | } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { |
---|
3348 | 3171 | if (atomic_read(&iclog->ic_refcnt) == 0) { |
---|
3349 | 3172 | /* |
---|
.. | .. |
---|
3356 | 3179 | atomic_inc(&iclog->ic_refcnt); |
---|
3357 | 3180 | lsn = be64_to_cpu(iclog->ic_header.h_lsn); |
---|
3358 | 3181 | xlog_state_switch_iclogs(log, iclog, 0); |
---|
3359 | | - spin_unlock(&log->l_icloglock); |
---|
3360 | | - |
---|
3361 | 3182 | if (xlog_state_release_iclog(log, iclog)) |
---|
3362 | | - return -EIO; |
---|
| 3183 | + goto out_error; |
---|
3363 | 3184 | |
---|
3364 | | - spin_lock(&log->l_icloglock); |
---|
3365 | | - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || |
---|
3366 | | - iclog->ic_state == XLOG_STATE_DIRTY) |
---|
| 3185 | + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) |
---|
3367 | 3186 | goto out_unlock; |
---|
3368 | 3187 | } else { |
---|
3369 | 3188 | /* |
---|
.. | .. |
---|
3383 | 3202 | ; |
---|
3384 | 3203 | } |
---|
3385 | 3204 | |
---|
3386 | | - if (!(flags & XFS_LOG_SYNC)) |
---|
3387 | | - goto out_unlock; |
---|
3388 | | - |
---|
3389 | | - if (iclog->ic_state & XLOG_STATE_IOERROR) |
---|
3390 | | - goto out_error; |
---|
3391 | | - XFS_STATS_INC(mp, xs_log_force_sleep); |
---|
3392 | | - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); |
---|
3393 | | - if (iclog->ic_state & XLOG_STATE_IOERROR) |
---|
3394 | | - return -EIO; |
---|
3395 | | - return 0; |
---|
3396 | | - |
---|
| 3205 | + if (flags & XFS_LOG_SYNC) |
---|
| 3206 | + return xlog_wait_on_iclog(iclog); |
---|
3397 | 3207 | out_unlock: |
---|
3398 | 3208 | spin_unlock(&log->l_icloglock); |
---|
3399 | 3209 | return 0; |
---|
.. | .. |
---|
3403 | 3213 | } |
---|
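Both force paths now finish with `return xlog_wait_on_iclog(iclog);` in place of the open-coded sleep deleted above. Below is a hedged reconstruction of what that helper has to cover, pieced together from the removed sequence (the real helper may differ in detail); it is entered with `l_icloglock` held and releases it on every path:

```c
/* Sketch, not the actual helper: wait for an iclog force to complete. */
static int sketch_wait_on_iclog(struct xlog *log, struct xlog_in_core *iclog)
{
	if (iclog->ic_state != XLOG_STATE_ACTIVE &&
	    iclog->ic_state != XLOG_STATE_DIRTY) {
		/* Not on disk yet: sleep until I/O completion wakes ic_force_wait. */
		XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
		xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
	} else {
		spin_unlock(&log->l_icloglock);
	}

	/* A shutdown while we slept means the data never made it to disk. */
	if (iclog->ic_state == XLOG_STATE_IOERROR)
		return -EIO;
	return 0;
}
```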
3404 | 3214 | |
---|
3405 | 3215 | static int |
---|
3406 | | -__xfs_log_force_lsn( |
---|
3407 | | - struct xfs_mount *mp, |
---|
| 3216 | +xlog_force_lsn( |
---|
| 3217 | + struct xlog *log, |
---|
3408 | 3218 | xfs_lsn_t lsn, |
---|
3409 | 3219 | uint flags, |
---|
3410 | 3220 | int *log_flushed, |
---|
3411 | 3221 | bool already_slept) |
---|
3412 | 3222 | { |
---|
3413 | | - struct xlog *log = mp->m_log; |
---|
3414 | 3223 | struct xlog_in_core *iclog; |
---|
3415 | 3224 | |
---|
3416 | 3225 | spin_lock(&log->l_icloglock); |
---|
3417 | 3226 | iclog = log->l_iclog; |
---|
3418 | | - if (iclog->ic_state & XLOG_STATE_IOERROR) |
---|
| 3227 | + if (iclog->ic_state == XLOG_STATE_IOERROR) |
---|
3419 | 3228 | goto out_error; |
---|
3420 | 3229 | |
---|
3421 | 3230 | while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { |
---|
.. | .. |
---|
3423 | 3232 | if (iclog == log->l_iclog) |
---|
3424 | 3233 | goto out_unlock; |
---|
3425 | 3234 | } |
---|
3426 | | - |
---|
3427 | | - if (iclog->ic_state == XLOG_STATE_DIRTY) |
---|
3428 | | - goto out_unlock; |
---|
3429 | 3235 | |
---|
3430 | 3236 | if (iclog->ic_state == XLOG_STATE_ACTIVE) { |
---|
3431 | 3237 | /* |
---|
.. | .. |
---|
3444 | 3250 | * will go out then. |
---|
3445 | 3251 | */ |
---|
3446 | 3252 | if (!already_slept && |
---|
3447 | | - (iclog->ic_prev->ic_state & |
---|
3448 | | - (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { |
---|
3449 | | - ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); |
---|
3450 | | - |
---|
3451 | | - XFS_STATS_INC(mp, xs_log_force_sleep); |
---|
3452 | | - |
---|
| 3253 | + (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC || |
---|
| 3254 | + iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) { |
---|
3453 | 3255 | xlog_wait(&iclog->ic_prev->ic_write_wait, |
---|
3454 | 3256 | &log->l_icloglock); |
---|
3455 | 3257 | return -EAGAIN; |
---|
3456 | 3258 | } |
---|
3457 | 3259 | atomic_inc(&iclog->ic_refcnt); |
---|
3458 | 3260 | xlog_state_switch_iclogs(log, iclog, 0); |
---|
3459 | | - spin_unlock(&log->l_icloglock); |
---|
3460 | 3261 | if (xlog_state_release_iclog(log, iclog)) |
---|
3461 | | - return -EIO; |
---|
| 3262 | + goto out_error; |
---|
3462 | 3263 | if (log_flushed) |
---|
3463 | 3264 | *log_flushed = 1; |
---|
3464 | | - spin_lock(&log->l_icloglock); |
---|
3465 | 3265 | } |
---|
3466 | 3266 | |
---|
3467 | | - if (!(flags & XFS_LOG_SYNC) || |
---|
3468 | | - (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) |
---|
3469 | | - goto out_unlock; |
---|
3470 | | - |
---|
3471 | | - if (iclog->ic_state & XLOG_STATE_IOERROR) |
---|
3472 | | - goto out_error; |
---|
3473 | | - |
---|
3474 | | - XFS_STATS_INC(mp, xs_log_force_sleep); |
---|
3475 | | - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); |
---|
3476 | | - if (iclog->ic_state & XLOG_STATE_IOERROR) |
---|
3477 | | - return -EIO; |
---|
3478 | | - return 0; |
---|
3479 | | - |
---|
| 3267 | + if (flags & XFS_LOG_SYNC) |
---|
| 3268 | + return xlog_wait_on_iclog(iclog); |
---|
3480 | 3269 | out_unlock: |
---|
3481 | 3270 | spin_unlock(&log->l_icloglock); |
---|
3482 | 3271 | return 0; |
---|
.. | .. |
---|
3500 | 3289 | * to disk, that thread will wake up all threads waiting on the queue. |
---|
3501 | 3290 | */ |
---|
3502 | 3291 | int |
---|
3503 | | -xfs_log_force_lsn( |
---|
| 3292 | +xfs_log_force_seq( |
---|
3504 | 3293 | struct xfs_mount *mp, |
---|
3505 | | - xfs_lsn_t lsn, |
---|
| 3294 | + xfs_csn_t seq, |
---|
3506 | 3295 | uint flags, |
---|
3507 | 3296 | int *log_flushed) |
---|
3508 | 3297 | { |
---|
| 3298 | + struct xlog *log = mp->m_log; |
---|
| 3299 | + xfs_lsn_t lsn; |
---|
3509 | 3300 | int ret; |
---|
3510 | | - ASSERT(lsn != 0); |
---|
| 3301 | + ASSERT(seq != 0); |
---|
3511 | 3302 | |
---|
3512 | 3303 | XFS_STATS_INC(mp, xs_log_force); |
---|
3513 | | - trace_xfs_log_force(mp, lsn, _RET_IP_); |
---|
| 3304 | + trace_xfs_log_force(mp, seq, _RET_IP_); |
---|
3514 | 3305 | |
---|
3515 | | - lsn = xlog_cil_force_lsn(mp->m_log, lsn); |
---|
| 3306 | + lsn = xlog_cil_force_seq(log, seq); |
---|
3516 | 3307 | if (lsn == NULLCOMMITLSN) |
---|
3517 | 3308 | return 0; |
---|
3518 | 3309 | |
---|
3519 | | - ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); |
---|
3520 | | - if (ret == -EAGAIN) |
---|
3521 | | - ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); |
---|
| 3310 | + ret = xlog_force_lsn(log, lsn, flags, log_flushed, false); |
---|
| 3311 | + if (ret == -EAGAIN) { |
---|
| 3312 | + XFS_STATS_INC(mp, xs_log_force_sleep); |
---|
| 3313 | + ret = xlog_force_lsn(log, lsn, flags, log_flushed, true); |
---|
| 3314 | + } |
---|
3522 | 3315 | return ret; |
---|
3523 | 3316 | } |
---|
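The renamed entry point takes a CIL commit sequence rather than an LSN and resolves it through `xlog_cil_force_seq()` before forcing the covering iclog. A hypothetical caller that wants a prior commit on stable storage might look like this (the function name and the source of `seq` are illustrative, not part of the patch):

```c
/* Hypothetical caller sketch: seq would come from an earlier CIL commit. */
static int example_flush_to_seq(struct xfs_mount *mp, xfs_csn_t seq)
{
	int	log_flushed = 0;

	if (!seq)	/* nothing committed yet, nothing to force */
		return 0;

	/* XFS_LOG_SYNC: wait until the iclog covering seq is on disk. */
	return xfs_log_force_seq(mp, seq, XFS_LOG_SYNC, &log_flushed);
}
```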
3524 | | - |
---|
3525 | | -/* |
---|
3526 | | - * Called when we want to mark the current iclog as being ready to sync to |
---|
3527 | | - * disk. |
---|
3528 | | - */ |
---|
3529 | | -STATIC void |
---|
3530 | | -xlog_state_want_sync( |
---|
3531 | | - struct xlog *log, |
---|
3532 | | - struct xlog_in_core *iclog) |
---|
3533 | | -{ |
---|
3534 | | - assert_spin_locked(&log->l_icloglock); |
---|
3535 | | - |
---|
3536 | | - if (iclog->ic_state == XLOG_STATE_ACTIVE) { |
---|
3537 | | - xlog_state_switch_iclogs(log, iclog, 0); |
---|
3538 | | - } else { |
---|
3539 | | - ASSERT(iclog->ic_state & |
---|
3540 | | - (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); |
---|
3541 | | - } |
---|
3542 | | -} |
---|
3543 | | - |
---|
3544 | | - |
---|
3545 | | -/***************************************************************************** |
---|
3546 | | - * |
---|
3547 | | - * TICKET functions |
---|
3548 | | - * |
---|
3549 | | - ***************************************************************************** |
---|
3550 | | - */ |
---|
3551 | 3317 | |
---|
3552 | 3318 | /* |
---|
3553 | 3319 | * Free a used ticket when its refcount falls to zero. |
---|
.. | .. |
---|
3558 | 3324 | { |
---|
3559 | 3325 | ASSERT(atomic_read(&ticket->t_ref) > 0); |
---|
3560 | 3326 | if (atomic_dec_and_test(&ticket->t_ref)) |
---|
3561 | | - kmem_zone_free(xfs_log_ticket_zone, ticket); |
---|
| 3327 | + kmem_cache_free(xfs_log_ticket_zone, ticket); |
---|
3562 | 3328 | } |
---|
3563 | 3329 | |
---|
3564 | 3330 | xlog_ticket_t * |
---|
.. | .. |
---|
3676 | 3442 | int unit_bytes, |
---|
3677 | 3443 | int cnt, |
---|
3678 | 3444 | char client, |
---|
3679 | | - bool permanent, |
---|
3680 | | - xfs_km_flags_t alloc_flags) |
---|
| 3445 | + bool permanent) |
---|
3681 | 3446 | { |
---|
3682 | 3447 | struct xlog_ticket *tic; |
---|
3683 | 3448 | int unit_res; |
---|
3684 | 3449 | |
---|
3685 | | - tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); |
---|
3686 | | - if (!tic) |
---|
3687 | | - return NULL; |
---|
| 3450 | + tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL); |
---|
3688 | 3451 | |
---|
3689 | 3452 | unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); |
---|
3690 | 3453 | |
---|
.. | .. |
---|
3697 | 3460 | tic->t_ocnt = cnt; |
---|
3698 | 3461 | tic->t_tid = prandom_u32(); |
---|
3699 | 3462 | tic->t_clientid = client; |
---|
3700 | | - tic->t_flags = XLOG_TIC_INITED; |
---|
3701 | 3463 | if (permanent) |
---|
3702 | 3464 | tic->t_flags |= XLOG_TIC_PERM_RESERV; |
---|
3703 | 3465 | |
---|
.. | .. |
---|
3706 | 3468 | return tic; |
---|
3707 | 3469 | } |
---|
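Because the ticket is now allocated with `GFP_NOFS | __GFP_NOFAIL`, the allocation cannot fail, which is why both the `alloc_flags` parameter and the `return NULL` path disappear. A hypothetical caller before and after (the old flags value shown is illustrative only):

```c
/* Before: allocation could fail, so callers had to cope with NULL. */
tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
if (!tic)
	return -ENOMEM;

/* After: __GFP_NOFAIL guarantees a ticket, so the error path goes away. */
tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
```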
3708 | 3470 | |
---|
3709 | | - |
---|
3710 | | -/****************************************************************************** |
---|
3711 | | - * |
---|
3712 | | - * Log debug routines |
---|
3713 | | - * |
---|
3714 | | - ****************************************************************************** |
---|
3715 | | - */ |
---|
3716 | 3471 | #if defined(DEBUG) |
---|
3717 | 3472 | /* |
---|
3718 | 3473 | * Make sure that the destination ptr is within the valid data region of |
---|
.. | .. |
---|
3798 | 3553 | if (blocks < BTOBB(iclog->ic_offset) + 1) |
---|
3799 | 3554 | xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); |
---|
3800 | 3555 | } |
---|
3801 | | -} /* xlog_verify_tail_lsn */ |
---|
| 3556 | +} |
---|
3802 | 3557 | |
---|
3803 | 3558 | /* |
---|
3804 | 3559 | * Perform a number of checks on the iclog before writing to disk. |
---|
.. | .. |
---|
3819 | 3574 | xlog_verify_iclog( |
---|
3820 | 3575 | struct xlog *log, |
---|
3821 | 3576 | struct xlog_in_core *iclog, |
---|
3822 | | - int count, |
---|
3823 | | - bool syncing) |
---|
| 3577 | + int count) |
---|
3824 | 3578 | { |
---|
3825 | 3579 | xlog_op_header_t *ophead; |
---|
3826 | 3580 | xlog_in_core_t *icptr; |
---|
.. | .. |
---|
3864 | 3618 | /* clientid is only 1 byte */ |
---|
3865 | 3619 | p = &ophead->oh_clientid; |
---|
3866 | 3620 | field_offset = p - base_ptr; |
---|
3867 | | - if (!syncing || (field_offset & 0x1ff)) { |
---|
| 3621 | + if (field_offset & 0x1ff) { |
---|
3868 | 3622 | clientid = ophead->oh_clientid; |
---|
3869 | 3623 | } else { |
---|
3870 | 3624 | idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); |
---|
.. | .. |
---|
3887 | 3641 | /* check length */ |
---|
3888 | 3642 | p = &ophead->oh_len; |
---|
3889 | 3643 | field_offset = p - base_ptr; |
---|
3890 | | - if (!syncing || (field_offset & 0x1ff)) { |
---|
| 3644 | + if (field_offset & 0x1ff) { |
---|
3891 | 3645 | op_len = be32_to_cpu(ophead->oh_len); |
---|
3892 | 3646 | } else { |
---|
3893 | 3647 | idx = BTOBBT((uintptr_t)&ophead->oh_len - |
---|
.. | .. |
---|
3902 | 3656 | } |
---|
3903 | 3657 | ptr += sizeof(xlog_op_header_t) + op_len; |
---|
3904 | 3658 | } |
---|
3905 | | -} /* xlog_verify_iclog */ |
---|
| 3659 | +} |
---|
3906 | 3660 | #endif |
---|
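One detail worth spelling out in the verifier changes above: the `field_offset & 0x1ff` test asks whether a field begins exactly on a 512-byte basic-block boundary. The first word of each such block is overwritten with the cycle stamp before the record is written (the originals are parked in the header's cycle data), so a boundary-aligned field has to be read from that saved copy rather than from the data region; everything else can be read in place. A tiny standalone illustration of the mask arithmetic (a 512-byte block size is assumed):

```c
#include <assert.h>

/* 0x1ff masks the offset within a 512-byte basic block. */
static int starts_on_block_boundary(unsigned long field_offset)
{
	return (field_offset & 0x1ff) == 0;
}

int main(void)
{
	assert(!starts_on_block_boundary(0x204)); /* mid-block: field readable in place */
	assert(starts_on_block_boundary(0x600));  /* block start: use the saved header copy */
	return 0;
}
```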
3907 | 3661 | |
---|
3908 | 3662 | /* |
---|
.. | .. |
---|
3915 | 3669 | xlog_in_core_t *iclog, *ic; |
---|
3916 | 3670 | |
---|
3917 | 3671 | iclog = log->l_iclog; |
---|
3918 | | - if (! (iclog->ic_state & XLOG_STATE_IOERROR)) { |
---|
| 3672 | + if (iclog->ic_state != XLOG_STATE_IOERROR) { |
---|
3919 | 3673 | /* |
---|
3920 | 3674 | * Mark all the incore logs IOERROR. |
---|
3921 | 3675 | * From now on, no log flushes will result. |
---|
.. | .. |
---|
3975 | 3729 | * Somebody could've already done the hard work for us. |
---|
3976 | 3730 | * No need to get locks for this. |
---|
3977 | 3731 | */ |
---|
3978 | | - if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { |
---|
| 3732 | + if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) { |
---|
3979 | 3733 | ASSERT(XLOG_FORCED_SHUTDOWN(log)); |
---|
3980 | 3734 | return 1; |
---|
3981 | 3735 | } |
---|
.. | .. |
---|
4026 | 3780 | spin_lock(&log->l_cilp->xc_push_lock); |
---|
4027 | 3781 | wake_up_all(&log->l_cilp->xc_commit_wait); |
---|
4028 | 3782 | spin_unlock(&log->l_cilp->xc_push_lock); |
---|
4029 | | - xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); |
---|
| 3783 | + xlog_state_do_callback(log); |
---|
4030 | 3784 | |
---|
4031 | | -#ifdef XFSERRORDEBUG |
---|
4032 | | - { |
---|
4033 | | - xlog_in_core_t *iclog; |
---|
4034 | | - |
---|
4035 | | - spin_lock(&log->l_icloglock); |
---|
4036 | | - iclog = log->l_iclog; |
---|
4037 | | - do { |
---|
4038 | | - ASSERT(iclog->ic_callback == 0); |
---|
4039 | | - iclog = iclog->ic_next; |
---|
4040 | | - } while (iclog != log->l_iclog); |
---|
4041 | | - spin_unlock(&log->l_icloglock); |
---|
4042 | | - } |
---|
4043 | | -#endif |
---|
4044 | 3785 | /* return non-zero if log IOERROR transition had already happened */ |
---|
4045 | 3786 | return retval; |
---|
4046 | 3787 | } |
---|