hc
2024-05-16 8d2a02b24d66aa359e83eebc1ed3c0f85367a1cb
kernel/fs/xfs/xfs_log_recover.c
....@@ -13,29 +13,18 @@
1313 #include "xfs_sb.h"
1414 #include "xfs_mount.h"
1515 #include "xfs_defer.h"
16
-#include "xfs_da_format.h"
17
-#include "xfs_da_btree.h"
1816 #include "xfs_inode.h"
1917 #include "xfs_trans.h"
2018 #include "xfs_log.h"
2119 #include "xfs_log_priv.h"
2220 #include "xfs_log_recover.h"
23
-#include "xfs_inode_item.h"
24
-#include "xfs_extfree_item.h"
2521 #include "xfs_trans_priv.h"
2622 #include "xfs_alloc.h"
2723 #include "xfs_ialloc.h"
28
-#include "xfs_quota.h"
29
-#include "xfs_cksum.h"
3024 #include "xfs_trace.h"
3125 #include "xfs_icache.h"
32
-#include "xfs_bmap_btree.h"
3326 #include "xfs_error.h"
34
-#include "xfs_dir2.h"
35
-#include "xfs_rmap_item.h"
3627 #include "xfs_buf_item.h"
37
-#include "xfs_refcount_item.h"
38
-#include "xfs_bmap_item.h"
3928
4029 #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
4130
....@@ -59,17 +48,6 @@
5948 struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
6049
6150 /*
62
- * This structure is used during recovery to record the buf log items which
63
- * have been canceled and should not be replayed.
64
- */
65
-struct xfs_buf_cancel {
66
- xfs_daddr_t bc_blkno;
67
- uint bc_len;
68
- int bc_refcount;
69
- struct list_head bc_list;
70
-};
71
-
72
-/*
7351 * Sector aligned buffer routines for buffer create/read/write/access
7452 */
7553
....@@ -79,7 +57,7 @@
7957 * are valid, false otherwise.
8058 */
8159 static inline bool
82
-xlog_verify_bp(
60
+xlog_verify_bno(
8361 struct xlog *log,
8462 xfs_daddr_t blk_no,
8563 int bbcount)
....@@ -92,114 +70,98 @@
9270 }
9371
9472 /*
95
- * Allocate a buffer to hold log data. The buffer needs to be able
96
- * to map to a range of nbblks basic blocks at any valid (basic
97
- * block) offset within the log.
73
+ * Allocate a buffer to hold log data. The buffer needs to be able to map to
74
+ * a range of nbblks basic blocks at any valid offset within the log.
9875 */
99
-STATIC xfs_buf_t *
100
-xlog_get_bp(
76
+static char *
77
+xlog_alloc_buffer(
10178 struct xlog *log,
10279 int nbblks)
10380 {
104
- struct xfs_buf *bp;
81
+ int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
10582
10683 /*
10784 * Pass log block 0 since we don't have an addr yet, buffer will be
10885 * verified on read.
10986 */
110
- if (!xlog_verify_bp(log, 0, nbblks)) {
87
+ if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
11188 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
11289 nbblks);
113
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
11490 return NULL;
11591 }
11692
11793 /*
118
- * We do log I/O in units of log sectors (a power-of-2
119
- * multiple of the basic block size), so we round up the
120
- * requested size to accommodate the basic blocks required
121
- * for complete log sectors.
94
+ * We do log I/O in units of log sectors (a power-of-2 multiple of the
95
+ * basic block size), so we round up the requested size to accommodate
96
+ * the basic blocks required for complete log sectors.
12297 *
123
- * In addition, the buffer may be used for a non-sector-
124
- * aligned block offset, in which case an I/O of the
125
- * requested size could extend beyond the end of the
126
- * buffer. If the requested size is only 1 basic block it
127
- * will never straddle a sector boundary, so this won't be
128
- * an issue. Nor will this be a problem if the log I/O is
129
- * done in basic blocks (sector size 1). But otherwise we
130
- * extend the buffer by one extra log sector to ensure
131
- * there's space to accommodate this possibility.
98
+ * In addition, the buffer may be used for a non-sector-aligned block
99
+ * offset, in which case an I/O of the requested size could extend
100
+ * beyond the end of the buffer. If the requested size is only 1 basic
101
+ * block it will never straddle a sector boundary, so this won't be an
102
+ * issue. Nor will this be a problem if the log I/O is done in basic
103
+ * blocks (sector size 1). But otherwise we extend the buffer by one
104
+ * extra log sector to ensure there's space to accommodate this
105
+ * possibility.
132106 */
133107 if (nbblks > 1 && log->l_sectBBsize > 1)
134108 nbblks += log->l_sectBBsize;
135109 nbblks = round_up(nbblks, log->l_sectBBsize);
136
-
137
- bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
138
- if (bp)
139
- xfs_buf_unlock(bp);
140
- return bp;
141
-}
142
-
143
-STATIC void
144
-xlog_put_bp(
145
- xfs_buf_t *bp)
146
-{
147
- xfs_buf_free(bp);
110
+ return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
148111 }
149112
150113 /*
151114 * Return the address of the start of the given block number's data
152115 * in a log buffer. The buffer covers a log sector-aligned region.
153116 */
154
-STATIC char *
117
+static inline unsigned int
155118 xlog_align(
156119 struct xlog *log,
157
- xfs_daddr_t blk_no,
158
- int nbblks,
159
- struct xfs_buf *bp)
120
+ xfs_daddr_t blk_no)
160121 {
161
- xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
162
-
163
- ASSERT(offset + nbblks <= bp->b_length);
164
- return bp->b_addr + BBTOB(offset);
122
+ return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
165123 }
166124
167
-
168
-/*
169
- * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
170
- */
171
-STATIC int
172
-xlog_bread_noalign(
173
- struct xlog *log,
174
- xfs_daddr_t blk_no,
175
- int nbblks,
176
- struct xfs_buf *bp)
125
+static int
126
+xlog_do_io(
127
+ struct xlog *log,
128
+ xfs_daddr_t blk_no,
129
+ unsigned int nbblks,
130
+ char *data,
131
+ unsigned int op)
177132 {
178
- int error;
133
+ int error;
179134
180
- if (!xlog_verify_bp(log, blk_no, nbblks)) {
135
+ if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
181136 xfs_warn(log->l_mp,
182137 "Invalid log block/length (0x%llx, 0x%x) for buffer",
183138 blk_no, nbblks);
184
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
185139 return -EFSCORRUPTED;
186140 }
187141
188142 blk_no = round_down(blk_no, log->l_sectBBsize);
189143 nbblks = round_up(nbblks, log->l_sectBBsize);
190
-
191144 ASSERT(nbblks > 0);
192
- ASSERT(nbblks <= bp->b_length);
193145
194
- XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
195
- bp->b_flags |= XBF_READ;
196
- bp->b_io_length = nbblks;
197
- bp->b_error = 0;
198
-
199
- error = xfs_buf_submit(bp);
200
- if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
201
- xfs_buf_ioerror_alert(bp, __func__);
146
+ error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
147
+ BBTOB(nbblks), data, op);
148
+ if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) {
149
+ xfs_alert(log->l_mp,
150
+ "log recovery %s I/O error at daddr 0x%llx len %d error %d",
151
+ op == REQ_OP_WRITE ? "write" : "read",
152
+ blk_no, nbblks, error);
153
+ }
202154 return error;
155
+}
156
+
157
+STATIC int
158
+xlog_bread_noalign(
159
+ struct xlog *log,
160
+ xfs_daddr_t blk_no,
161
+ int nbblks,
162
+ char *data)
163
+{
164
+ return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
203165 }
204166
205167 STATIC int
....@@ -207,87 +169,25 @@
207169 struct xlog *log,
208170 xfs_daddr_t blk_no,
209171 int nbblks,
210
- struct xfs_buf *bp,
172
+ char *data,
211173 char **offset)
212174 {
213175 int error;
214176
215
- error = xlog_bread_noalign(log, blk_no, nbblks, bp);
216
- if (error)
217
- return error;
218
-
219
- *offset = xlog_align(log, blk_no, nbblks, bp);
220
- return 0;
177
+ error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
178
+ if (!error)
179
+ *offset = data + xlog_align(log, blk_no);
180
+ return error;
221181 }
222182
223
-/*
224
- * Read at an offset into the buffer. Returns with the buffer in it's original
225
- * state regardless of the result of the read.
226
- */
227
-STATIC int
228
-xlog_bread_offset(
229
- struct xlog *log,
230
- xfs_daddr_t blk_no, /* block to read from */
231
- int nbblks, /* blocks to read */
232
- struct xfs_buf *bp,
233
- char *offset)
234
-{
235
- char *orig_offset = bp->b_addr;
236
- int orig_len = BBTOB(bp->b_length);
237
- int error, error2;
238
-
239
- error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
240
- if (error)
241
- return error;
242
-
243
- error = xlog_bread_noalign(log, blk_no, nbblks, bp);
244
-
245
- /* must reset buffer pointer even on error */
246
- error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
247
- if (error)
248
- return error;
249
- return error2;
250
-}
251
-
252
-/*
253
- * Write out the buffer at the given block for the given number of blocks.
254
- * The buffer is kept locked across the write and is returned locked.
255
- * This can only be used for synchronous log writes.
256
- */
257183 STATIC int
258184 xlog_bwrite(
259185 struct xlog *log,
260186 xfs_daddr_t blk_no,
261187 int nbblks,
262
- struct xfs_buf *bp)
188
+ char *data)
263189 {
264
- int error;
265
-
266
- if (!xlog_verify_bp(log, blk_no, nbblks)) {
267
- xfs_warn(log->l_mp,
268
- "Invalid log block/length (0x%llx, 0x%x) for buffer",
269
- blk_no, nbblks);
270
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
271
- return -EFSCORRUPTED;
272
- }
273
-
274
- blk_no = round_down(blk_no, log->l_sectBBsize);
275
- nbblks = round_up(nbblks, log->l_sectBBsize);
276
-
277
- ASSERT(nbblks > 0);
278
- ASSERT(nbblks <= bp->b_length);
279
-
280
- XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
281
- xfs_buf_hold(bp);
282
- xfs_buf_lock(bp);
283
- bp->b_io_length = nbblks;
284
- bp->b_error = 0;
285
-
286
- error = xfs_bwrite(bp);
287
- if (error)
288
- xfs_buf_ioerror_alert(bp, __func__);
289
- xfs_buf_relse(bp);
290
- return error;
190
+ return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
291191 }
292192
293193 #ifdef DEBUG
....@@ -323,19 +223,17 @@
323223 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
324224 * a dirty log created in IRIX.
325225 */
326
- if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
226
+ if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
327227 xfs_warn(mp,
328228 "dirty log written in incompatible format - can't recover");
329229 xlog_header_check_dump(mp, head);
330
- XFS_ERROR_REPORT("xlog_header_check_recover(1)",
331
- XFS_ERRLEVEL_HIGH, mp);
332230 return -EFSCORRUPTED;
333
- } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
231
+ }
232
+ if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
233
+ &head->h_fs_uuid))) {
334234 xfs_warn(mp,
335235 "dirty log entry has mismatched uuid - can't recover");
336236 xlog_header_check_dump(mp, head);
337
- XFS_ERROR_REPORT("xlog_header_check_recover(2)",
338
- XFS_ERRLEVEL_HIGH, mp);
339237 return -EFSCORRUPTED;
340238 }
341239 return 0;
....@@ -358,42 +256,13 @@
358256 * by IRIX and continue.
359257 */
360258 xfs_warn(mp, "null uuid in log - IRIX style log");
361
- } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
259
+ } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
260
+ &head->h_fs_uuid))) {
362261 xfs_warn(mp, "log has mismatched uuid - can't recover");
363262 xlog_header_check_dump(mp, head);
364
- XFS_ERROR_REPORT("xlog_header_check_mount",
365
- XFS_ERRLEVEL_HIGH, mp);
366263 return -EFSCORRUPTED;
367264 }
368265 return 0;
369
-}
370
-
371
-STATIC void
372
-xlog_recover_iodone(
373
- struct xfs_buf *bp)
374
-{
375
- if (bp->b_error) {
376
- /*
377
- * We're not going to bother about retrying
378
- * this during recovery. One strike!
379
- */
380
- if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
381
- xfs_buf_ioerror_alert(bp, __func__);
382
- xfs_force_shutdown(bp->b_target->bt_mount,
383
- SHUTDOWN_META_IO_ERROR);
384
- }
385
- }
386
-
387
- /*
388
- * On v5 supers, a bli could be attached to update the metadata LSN.
389
- * Clean it up.
390
- */
391
- if (bp->b_log_item)
392
- xfs_buf_item_relse(bp);
393
- ASSERT(bp->b_log_item == NULL);
394
-
395
- bp->b_iodone = NULL;
396
- xfs_buf_ioend(bp);
397266 }
398267
399268 /*
....@@ -405,7 +274,7 @@
405274 STATIC int
406275 xlog_find_cycle_start(
407276 struct xlog *log,
408
- struct xfs_buf *bp,
277
+ char *buffer,
409278 xfs_daddr_t first_blk,
410279 xfs_daddr_t *last_blk,
411280 uint cycle)
....@@ -419,7 +288,7 @@
419288 end_blk = *last_blk;
420289 mid_blk = BLK_AVG(first_blk, end_blk);
421290 while (mid_blk != first_blk && mid_blk != end_blk) {
422
- error = xlog_bread(log, mid_blk, 1, bp, &offset);
291
+ error = xlog_bread(log, mid_blk, 1, buffer, &offset);
423292 if (error)
424293 return error;
425294 mid_cycle = xlog_get_cycle(offset);
....@@ -455,7 +324,7 @@
455324 {
456325 xfs_daddr_t i, j;
457326 uint cycle;
458
- xfs_buf_t *bp;
327
+ char *buffer;
459328 xfs_daddr_t bufblks;
460329 char *buf = NULL;
461330 int error = 0;
....@@ -469,7 +338,7 @@
469338 bufblks = 1 << ffs(nbblks);
470339 while (bufblks > log->l_logBBsize)
471340 bufblks >>= 1;
472
- while (!(bp = xlog_get_bp(log, bufblks))) {
341
+ while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
473342 bufblks >>= 1;
474343 if (bufblks < log->l_sectBBsize)
475344 return -ENOMEM;
....@@ -480,7 +349,7 @@
480349
481350 bcount = min(bufblks, (start_blk + nbblks - i));
482351
483
- error = xlog_bread(log, i, bcount, bp, &buf);
352
+ error = xlog_bread(log, i, bcount, buffer, &buf);
484353 if (error)
485354 goto out;
486355
....@@ -498,8 +367,21 @@
498367 *new_blk = -1;
499368
500369 out:
501
- xlog_put_bp(bp);
370
+ kmem_free(buffer);
502371 return error;
372
+}
373
+
374
+static inline int
375
+xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
376
+{
377
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
378
+ int h_size = be32_to_cpu(rh->h_size);
379
+
380
+ if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
381
+ h_size > XLOG_HEADER_CYCLE_SIZE)
382
+ return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
383
+ }
384
+ return 1;
503385 }
504386
505387 /*
....@@ -522,7 +404,7 @@
522404 int extra_bblks)
523405 {
524406 xfs_daddr_t i;
525
- xfs_buf_t *bp;
407
+ char *buffer;
526408 char *offset = NULL;
527409 xlog_rec_header_t *head = NULL;
528410 int error = 0;
....@@ -532,12 +414,14 @@
532414
533415 ASSERT(start_blk != 0 || *last_blk != start_blk);
534416
535
- if (!(bp = xlog_get_bp(log, num_blks))) {
536
- if (!(bp = xlog_get_bp(log, 1)))
417
+ buffer = xlog_alloc_buffer(log, num_blks);
418
+ if (!buffer) {
419
+ buffer = xlog_alloc_buffer(log, 1);
420
+ if (!buffer)
537421 return -ENOMEM;
538422 smallmem = 1;
539423 } else {
540
- error = xlog_bread(log, start_blk, num_blks, bp, &offset);
424
+ error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
541425 if (error)
542426 goto out;
543427 offset += ((num_blks - 1) << BBSHIFT);
....@@ -549,12 +433,12 @@
549433 xfs_warn(log->l_mp,
550434 "Log inconsistent (didn't find previous header)");
551435 ASSERT(0);
552
- error = -EIO;
436
+ error = -EFSCORRUPTED;
553437 goto out;
554438 }
555439
556440 if (smallmem) {
557
- error = xlog_bread(log, i, 1, bp, &offset);
441
+ error = xlog_bread(log, i, 1, buffer, &offset);
558442 if (error)
559443 goto out;
560444 }
....@@ -592,22 +476,14 @@
592476 * reset last_blk. Only when last_blk points in the middle of a log
593477 * record do we update last_blk.
594478 */
595
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
596
- uint h_size = be32_to_cpu(head->h_size);
597
-
598
- xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
599
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
600
- xhdrs++;
601
- } else {
602
- xhdrs = 1;
603
- }
479
+ xhdrs = xlog_logrec_hblks(log, head);
604480
605481 if (*last_blk - i + extra_bblks !=
606482 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
607483 *last_blk = i;
608484
609485 out:
610
- xlog_put_bp(bp);
486
+ kmem_free(buffer);
611487 return error;
612488 }
613489
....@@ -629,7 +505,7 @@
629505 struct xlog *log,
630506 xfs_daddr_t *return_head_blk)
631507 {
632
- xfs_buf_t *bp;
508
+ char *buffer;
633509 char *offset;
634510 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
635511 int num_scan_bblks;
....@@ -659,20 +535,20 @@
659535 }
660536
661537 first_blk = 0; /* get cycle # of 1st block */
662
- bp = xlog_get_bp(log, 1);
663
- if (!bp)
538
+ buffer = xlog_alloc_buffer(log, 1);
539
+ if (!buffer)
664540 return -ENOMEM;
665541
666
- error = xlog_bread(log, 0, 1, bp, &offset);
542
+ error = xlog_bread(log, 0, 1, buffer, &offset);
667543 if (error)
668
- goto bp_err;
544
+ goto out_free_buffer;
669545
670546 first_half_cycle = xlog_get_cycle(offset);
671547
672548 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
673
- error = xlog_bread(log, last_blk, 1, bp, &offset);
549
+ error = xlog_bread(log, last_blk, 1, buffer, &offset);
674550 if (error)
675
- goto bp_err;
551
+ goto out_free_buffer;
676552
677553 last_half_cycle = xlog_get_cycle(offset);
678554 ASSERT(last_half_cycle != 0);
....@@ -740,9 +616,10 @@
740616 * ^ we want to locate this spot
741617 */
742618 stop_on_cycle = last_half_cycle;
743
- if ((error = xlog_find_cycle_start(log, bp, first_blk,
744
- &head_blk, last_half_cycle)))
745
- goto bp_err;
619
+ error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
620
+ last_half_cycle);
621
+ if (error)
622
+ goto out_free_buffer;
746623 }
747624
748625 /*
....@@ -762,7 +639,7 @@
762639 if ((error = xlog_find_verify_cycle(log,
763640 start_blk, num_scan_bblks,
764641 stop_on_cycle, &new_blk)))
765
- goto bp_err;
642
+ goto out_free_buffer;
766643 if (new_blk != -1)
767644 head_blk = new_blk;
768645 } else { /* need to read 2 parts of log */
....@@ -799,7 +676,7 @@
799676 if ((error = xlog_find_verify_cycle(log, start_blk,
800677 num_scan_bblks - (int)head_blk,
801678 (stop_on_cycle - 1), &new_blk)))
802
- goto bp_err;
679
+ goto out_free_buffer;
803680 if (new_blk != -1) {
804681 head_blk = new_blk;
805682 goto validate_head;
....@@ -815,7 +692,7 @@
815692 if ((error = xlog_find_verify_cycle(log,
816693 start_blk, (int)head_blk,
817694 stop_on_cycle, &new_blk)))
818
- goto bp_err;
695
+ goto out_free_buffer;
819696 if (new_blk != -1)
820697 head_blk = new_blk;
821698 }
....@@ -834,13 +711,13 @@
834711 if (error == 1)
835712 error = -EIO;
836713 if (error)
837
- goto bp_err;
714
+ goto out_free_buffer;
838715 } else {
839716 start_blk = 0;
840717 ASSERT(head_blk <= INT_MAX);
841718 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
842719 if (error < 0)
843
- goto bp_err;
720
+ goto out_free_buffer;
844721 if (error == 1) {
845722 /* We hit the beginning of the log during our search */
846723 start_blk = log_bbnum - (num_scan_bblks - head_blk);
....@@ -853,14 +730,14 @@
853730 if (error == 1)
854731 error = -EIO;
855732 if (error)
856
- goto bp_err;
733
+ goto out_free_buffer;
857734 if (new_blk != log_bbnum)
858735 head_blk = new_blk;
859736 } else if (error)
860
- goto bp_err;
737
+ goto out_free_buffer;
861738 }
862739
863
- xlog_put_bp(bp);
740
+ kmem_free(buffer);
864741 if (head_blk == log_bbnum)
865742 *return_head_blk = 0;
866743 else
....@@ -873,9 +750,8 @@
873750 */
874751 return 0;
875752
876
- bp_err:
877
- xlog_put_bp(bp);
878
-
753
+out_free_buffer:
754
+ kmem_free(buffer);
879755 if (error)
880756 xfs_warn(log->l_mp, "failed to find log head");
881757 return error;
....@@ -895,7 +771,7 @@
895771 xfs_daddr_t head_blk,
896772 xfs_daddr_t tail_blk,
897773 int count,
898
- struct xfs_buf *bp,
774
+ char *buffer,
899775 xfs_daddr_t *rblk,
900776 struct xlog_rec_header **rhead,
901777 bool *wrapped)
....@@ -914,7 +790,7 @@
914790 */
915791 end_blk = head_blk > tail_blk ? tail_blk : 0;
916792 for (i = (int) head_blk - 1; i >= end_blk; i--) {
917
- error = xlog_bread(log, i, 1, bp, &offset);
793
+ error = xlog_bread(log, i, 1, buffer, &offset);
918794 if (error)
919795 goto out_error;
920796
....@@ -933,7 +809,7 @@
933809 */
934810 if (tail_blk >= head_blk && found != count) {
935811 for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
936
- error = xlog_bread(log, i, 1, bp, &offset);
812
+ error = xlog_bread(log, i, 1, buffer, &offset);
937813 if (error)
938814 goto out_error;
939815
....@@ -969,7 +845,7 @@
969845 xfs_daddr_t head_blk,
970846 xfs_daddr_t tail_blk,
971847 int count,
972
- struct xfs_buf *bp,
848
+ char *buffer,
973849 xfs_daddr_t *rblk,
974850 struct xlog_rec_header **rhead,
975851 bool *wrapped)
....@@ -988,7 +864,7 @@
988864 */
989865 end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
990866 for (i = (int) tail_blk; i <= end_blk; i++) {
991
- error = xlog_bread(log, i, 1, bp, &offset);
867
+ error = xlog_bread(log, i, 1, buffer, &offset);
992868 if (error)
993869 goto out_error;
994870
....@@ -1006,7 +882,7 @@
1006882 */
1007883 if (tail_blk > head_blk && found != count) {
1008884 for (i = 0; i < (int) head_blk; i++) {
1009
- error = xlog_bread(log, i, 1, bp, &offset);
885
+ error = xlog_bread(log, i, 1, buffer, &offset);
1010886 if (error)
1011887 goto out_error;
1012888
....@@ -1069,22 +945,22 @@
1069945 int hsize)
1070946 {
1071947 struct xlog_rec_header *thead;
1072
- struct xfs_buf *bp;
948
+ char *buffer;
1073949 xfs_daddr_t first_bad;
1074950 int error = 0;
1075951 bool wrapped;
1076952 xfs_daddr_t tmp_tail;
1077953 xfs_daddr_t orig_tail = *tail_blk;
1078954
1079
- bp = xlog_get_bp(log, 1);
1080
- if (!bp)
955
+ buffer = xlog_alloc_buffer(log, 1);
956
+ if (!buffer)
1081957 return -ENOMEM;
1082958
1083959 /*
1084960 * Make sure the tail points to a record (returns positive count on
1085961 * success).
1086962 */
1087
- error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
963
+ error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
1088964 &tmp_tail, &thead, &wrapped);
1089965 if (error < 0)
1090966 goto out;
....@@ -1113,8 +989,8 @@
1113989 break;
1114990
1115991 /* skip to the next record; returns positive count on success */
1116
- error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
1117
- &tmp_tail, &thead, &wrapped);
992
+ error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
993
+ buffer, &tmp_tail, &thead, &wrapped);
1118994 if (error < 0)
1119995 goto out;
1120996
....@@ -1129,7 +1005,7 @@
11291005 "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
11301006 orig_tail, *tail_blk);
11311007 out:
1132
- xlog_put_bp(bp);
1008
+ kmem_free(buffer);
11331009 return error;
11341010 }
11351011
....@@ -1151,13 +1027,13 @@
11511027 struct xlog *log,
11521028 xfs_daddr_t *head_blk, /* in/out: unverified head */
11531029 xfs_daddr_t *tail_blk, /* out: tail block */
1154
- struct xfs_buf *bp,
1030
+ char *buffer,
11551031 xfs_daddr_t *rhead_blk, /* start blk of last record */
11561032 struct xlog_rec_header **rhead, /* ptr to last record */
11571033 bool *wrapped) /* last rec. wraps phys. log */
11581034 {
11591035 struct xlog_rec_header *tmp_rhead;
1160
- struct xfs_buf *tmp_bp;
1036
+ char *tmp_buffer;
11611037 xfs_daddr_t first_bad;
11621038 xfs_daddr_t tmp_rhead_blk;
11631039 int found;
....@@ -1168,15 +1044,15 @@
11681044 * Check the head of the log for torn writes. Search backwards from the
11691045 * head until we hit the tail or the maximum number of log record I/Os
11701046 * that could have been in flight at one time. Use a temporary buffer so
1171
- * we don't trash the rhead/bp pointers from the caller.
1047
+ * we don't trash the rhead/buffer pointers from the caller.
11721048 */
1173
- tmp_bp = xlog_get_bp(log, 1);
1174
- if (!tmp_bp)
1049
+ tmp_buffer = xlog_alloc_buffer(log, 1);
1050
+ if (!tmp_buffer)
11751051 return -ENOMEM;
11761052 error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1177
- XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
1178
- &tmp_rhead, &tmp_wrapped);
1179
- xlog_put_bp(tmp_bp);
1053
+ XLOG_MAX_ICLOGS, tmp_buffer,
1054
+ &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
1055
+ kmem_free(tmp_buffer);
11801056 if (error < 0)
11811057 return error;
11821058
....@@ -1203,10 +1079,10 @@
12031079 *
12041080 * Note that xlog_find_tail() clears the blocks at the new head
12051081 * (i.e., the records with invalid CRC) if the cycle number
1206
- * matches the the current cycle.
1082
+ * matches the current cycle.
12071083 */
1208
- found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
1209
- rhead_blk, rhead, wrapped);
1084
+ found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
1085
+ buffer, rhead_blk, rhead, wrapped);
12101086 if (found < 0)
12111087 return found;
12121088 if (found == 0) /* XXX: right thing to do here? */
....@@ -1266,7 +1142,7 @@
12661142 xfs_daddr_t *tail_blk,
12671143 struct xlog_rec_header *rhead,
12681144 xfs_daddr_t rhead_blk,
1269
- struct xfs_buf *bp,
1145
+ char *buffer,
12701146 bool *clean)
12711147 {
12721148 struct xlog_op_header *op_head;
....@@ -1287,29 +1163,14 @@
12871163 * below. We won't want to clear the unmount record if there is one, so
12881164 * we pass the lsn of the unmount record rather than the block after it.
12891165 */
1290
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1291
- int h_size = be32_to_cpu(rhead->h_size);
1292
- int h_version = be32_to_cpu(rhead->h_version);
1293
-
1294
- if ((h_version & XLOG_VERSION_2) &&
1295
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1296
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1297
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
1298
- hblks++;
1299
- } else {
1300
- hblks = 1;
1301
- }
1302
- } else {
1303
- hblks = 1;
1304
- }
1305
-
1166
+ hblks = xlog_logrec_hblks(log, rhead);
13061167 after_umount_blk = xlog_wrap_logbno(log,
13071168 rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
13081169
13091170 if (*head_blk == after_umount_blk &&
13101171 be32_to_cpu(rhead->h_num_logops) == 1) {
13111172 umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
1312
- error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1173
+ error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
13131174 if (error)
13141175 return error;
13151176
....@@ -1388,7 +1249,7 @@
13881249 {
13891250 xlog_rec_header_t *rhead;
13901251 char *offset = NULL;
1391
- xfs_buf_t *bp;
1252
+ char *buffer;
13921253 int error;
13931254 xfs_daddr_t rhead_blk;
13941255 xfs_lsn_t tail_lsn;
....@@ -1402,11 +1263,11 @@
14021263 return error;
14031264 ASSERT(*head_blk < INT_MAX);
14041265
1405
- bp = xlog_get_bp(log, 1);
1406
- if (!bp)
1266
+ buffer = xlog_alloc_buffer(log, 1);
1267
+ if (!buffer)
14071268 return -ENOMEM;
14081269 if (*head_blk == 0) { /* special case */
1409
- error = xlog_bread(log, 0, 1, bp, &offset);
1270
+ error = xlog_bread(log, 0, 1, buffer, &offset);
14101271 if (error)
14111272 goto done;
14121273
....@@ -1422,13 +1283,14 @@
14221283 * block. This wraps all the way back around to the head so something is
14231284 * seriously wrong if we can't find it.
14241285 */
1425
- error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
1286
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
14261287 &rhead_blk, &rhead, &wrapped);
14271288 if (error < 0)
1428
- return error;
1289
+ goto done;
14291290 if (!error) {
14301291 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1431
- return -EIO;
1292
+ error = -EFSCORRUPTED;
1293
+ goto done;
14321294 }
14331295 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
14341296
....@@ -1443,7 +1305,7 @@
14431305 * state to determine whether recovery is necessary.
14441306 */
14451307 error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
1446
- rhead_blk, bp, &clean);
1308
+ rhead_blk, buffer, &clean);
14471309 if (error)
14481310 goto done;
14491311
....@@ -1460,7 +1322,7 @@
14601322 if (!clean) {
14611323 xfs_daddr_t orig_head = *head_blk;
14621324
1463
- error = xlog_verify_head(log, head_blk, tail_blk, bp,
1325
+ error = xlog_verify_head(log, head_blk, tail_blk, buffer,
14641326 &rhead_blk, &rhead, &wrapped);
14651327 if (error)
14661328 goto done;
....@@ -1471,7 +1333,7 @@
14711333 wrapped);
14721334 tail_lsn = atomic64_read(&log->l_tail_lsn);
14731335 error = xlog_check_unmount_rec(log, head_blk, tail_blk,
1474
- rhead, rhead_blk, bp,
1336
+ rhead, rhead_blk, buffer,
14751337 &clean);
14761338 if (error)
14771339 goto done;
....@@ -1505,11 +1367,11 @@
15051367 * But... if the -device- itself is readonly, just skip this.
15061368 * We can't recover this device anyway, so it won't matter.
15071369 */
1508
- if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1370
+ if (!xfs_readonly_buftarg(log->l_targ))
15091371 error = xlog_clear_stale_blocks(log, tail_lsn);
15101372
15111373 done:
1512
- xlog_put_bp(bp);
1374
+ kmem_free(buffer);
15131375
15141376 if (error)
15151377 xfs_warn(log->l_mp, "failed to locate log tail");
....@@ -1537,7 +1399,7 @@
15371399 struct xlog *log,
15381400 xfs_daddr_t *blk_no)
15391401 {
1540
- xfs_buf_t *bp;
1402
+ char *buffer;
15411403 char *offset;
15421404 uint first_cycle, last_cycle;
15431405 xfs_daddr_t new_blk, last_blk, start_blk;
....@@ -1547,35 +1409,36 @@
15471409 *blk_no = 0;
15481410
15491411 /* check totally zeroed log */
1550
- bp = xlog_get_bp(log, 1);
1551
- if (!bp)
1412
+ buffer = xlog_alloc_buffer(log, 1);
1413
+ if (!buffer)
15521414 return -ENOMEM;
1553
- error = xlog_bread(log, 0, 1, bp, &offset);
1415
+ error = xlog_bread(log, 0, 1, buffer, &offset);
15541416 if (error)
1555
- goto bp_err;
1417
+ goto out_free_buffer;
15561418
15571419 first_cycle = xlog_get_cycle(offset);
15581420 if (first_cycle == 0) { /* completely zeroed log */
15591421 *blk_no = 0;
1560
- xlog_put_bp(bp);
1422
+ kmem_free(buffer);
15611423 return 1;
15621424 }
15631425
15641426 /* check partially zeroed log */
1565
- error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1427
+ error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
15661428 if (error)
1567
- goto bp_err;
1429
+ goto out_free_buffer;
15681430
15691431 last_cycle = xlog_get_cycle(offset);
15701432 if (last_cycle != 0) { /* log completely written to */
1571
- xlog_put_bp(bp);
1433
+ kmem_free(buffer);
15721434 return 0;
15731435 }
15741436
15751437 /* we have a partially zeroed log */
15761438 last_blk = log_bbnum-1;
1577
- if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1578
- goto bp_err;
1439
+ error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
1440
+ if (error)
1441
+ goto out_free_buffer;
15791442
15801443 /*
15811444 * Validate the answer. Because there is no way to guarantee that
....@@ -1598,7 +1461,7 @@
15981461 */
15991462 if ((error = xlog_find_verify_cycle(log, start_blk,
16001463 (int)num_scan_bblks, 0, &new_blk)))
1601
- goto bp_err;
1464
+ goto out_free_buffer;
16021465 if (new_blk != -1)
16031466 last_blk = new_blk;
16041467
....@@ -1610,11 +1473,11 @@
16101473 if (error == 1)
16111474 error = -EIO;
16121475 if (error)
1613
- goto bp_err;
1476
+ goto out_free_buffer;
16141477
16151478 *blk_no = last_blk;
1616
-bp_err:
1617
- xlog_put_bp(bp);
1479
+out_free_buffer:
1480
+ kmem_free(buffer);
16181481 if (error)
16191482 return error;
16201483 return 1;
....@@ -1657,7 +1520,7 @@
16571520 int tail_block)
16581521 {
16591522 char *offset;
1660
- xfs_buf_t *bp;
1523
+ char *buffer;
16611524 int balign, ealign;
16621525 int sectbb = log->l_sectBBsize;
16631526 int end_block = start_block + blocks;
....@@ -1674,7 +1537,7 @@
16741537 bufblks = 1 << ffs(blocks);
16751538 while (bufblks > log->l_logBBsize)
16761539 bufblks >>= 1;
1677
- while (!(bp = xlog_get_bp(log, bufblks))) {
1540
+ while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
16781541 bufblks >>= 1;
16791542 if (bufblks < sectbb)
16801543 return -ENOMEM;
....@@ -1686,9 +1549,9 @@
16861549 */
16871550 balign = round_down(start_block, sectbb);
16881551 if (balign != start_block) {
1689
- error = xlog_bread_noalign(log, start_block, 1, bp);
1552
+ error = xlog_bread_noalign(log, start_block, 1, buffer);
16901553 if (error)
1691
- goto out_put_bp;
1554
+ goto out_free_buffer;
16921555
16931556 j = start_block - balign;
16941557 }
....@@ -1705,29 +1568,28 @@
17051568 */
17061569 ealign = round_down(end_block, sectbb);
17071570 if (j == 0 && (start_block + endcount > ealign)) {
1708
- offset = bp->b_addr + BBTOB(ealign - start_block);
1709
- error = xlog_bread_offset(log, ealign, sectbb,
1710
- bp, offset);
1571
+ error = xlog_bread_noalign(log, ealign, sectbb,
1572
+ buffer + BBTOB(ealign - start_block));
17111573 if (error)
17121574 break;
17131575
17141576 }
17151577
1716
- offset = xlog_align(log, start_block, endcount, bp);
1578
+ offset = buffer + xlog_align(log, start_block);
17171579 for (; j < endcount; j++) {
17181580 xlog_add_record(log, offset, cycle, i+j,
17191581 tail_cycle, tail_block);
17201582 offset += BBSIZE;
17211583 }
1722
- error = xlog_bwrite(log, start_block, endcount, bp);
1584
+ error = xlog_bwrite(log, start_block, endcount, buffer);
17231585 if (error)
17241586 break;
17251587 start_block += endcount;
17261588 j = 0;
17271589 }
17281590
1729
- out_put_bp:
1730
- xlog_put_bp(bp);
1591
+out_free_buffer:
1592
+ kmem_free(buffer);
17311593 return error;
17321594 }
17331595
....@@ -1777,11 +1639,10 @@
17771639 * the distance from the beginning of the log to the
17781640 * tail.
17791641 */
1780
- if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1781
- XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1782
- XFS_ERRLEVEL_LOW, log->l_mp);
1642
+ if (XFS_IS_CORRUPT(log->l_mp,
1643
+ head_block < tail_block ||
1644
+ head_block >= log->l_logBBsize))
17831645 return -EFSCORRUPTED;
1784
- }
17851646 tail_distance = tail_block + (log->l_logBBsize - head_block);
17861647 } else {
17871648 /*
....@@ -1789,11 +1650,10 @@
17891650 * so the distance from the head to the tail is just
17901651 * the tail block minus the head block.
17911652 */
1792
- if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1793
- XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1794
- XFS_ERRLEVEL_LOW, log->l_mp);
1653
+ if (XFS_IS_CORRUPT(log->l_mp,
1654
+ head_block >= tail_block ||
1655
+ head_cycle != tail_cycle + 1))
17951656 return -EFSCORRUPTED;
1796
- }
17971657 tail_distance = tail_block - head_block;
17981658 }
17991659
....@@ -1863,12 +1723,72 @@
18631723 return 0;
18641724 }
18651725
1726
+/*
1727
+ * Release the recovered intent item in the AIL that matches the given intent
1728
+ * type and intent id.
1729
+ */
1730
+void
1731
+xlog_recover_release_intent(
1732
+ struct xlog *log,
1733
+ unsigned short intent_type,
1734
+ uint64_t intent_id)
1735
+{
1736
+ struct xfs_ail_cursor cur;
1737
+ struct xfs_log_item *lip;
1738
+ struct xfs_ail *ailp = log->l_ailp;
1739
+
1740
+ spin_lock(&ailp->ail_lock);
1741
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
1742
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
1743
+ if (lip->li_type != intent_type)
1744
+ continue;
1745
+ if (!lip->li_ops->iop_match(lip, intent_id))
1746
+ continue;
1747
+
1748
+ spin_unlock(&ailp->ail_lock);
1749
+ lip->li_ops->iop_release(lip);
1750
+ spin_lock(&ailp->ail_lock);
1751
+ break;
1752
+ }
1753
+
1754
+ xfs_trans_ail_cursor_done(&cur);
1755
+ spin_unlock(&ailp->ail_lock);
1756
+}
1757
+
18661758 /******************************************************************************
18671759 *
18681760 * Log recover routines
18691761 *
18701762 ******************************************************************************
18711763 */
1764
+static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
1765
+ &xlog_buf_item_ops,
1766
+ &xlog_inode_item_ops,
1767
+ &xlog_dquot_item_ops,
1768
+ &xlog_quotaoff_item_ops,
1769
+ &xlog_icreate_item_ops,
1770
+ &xlog_efi_item_ops,
1771
+ &xlog_efd_item_ops,
1772
+ &xlog_rui_item_ops,
1773
+ &xlog_rud_item_ops,
1774
+ &xlog_cui_item_ops,
1775
+ &xlog_cud_item_ops,
1776
+ &xlog_bui_item_ops,
1777
+ &xlog_bud_item_ops,
1778
+};
1779
+
1780
+static const struct xlog_recover_item_ops *
1781
+xlog_find_item_ops(
1782
+ struct xlog_recover_item *item)
1783
+{
1784
+ unsigned int i;
1785
+
1786
+ for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
1787
+ if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
1788
+ return xlog_recover_item_ops[i];
1789
+
1790
+ return NULL;
1791
+}
18721792
18731793 /*
18741794 * Sort the log items in the transaction.
....@@ -1925,54 +1845,23 @@
19251845 struct xlog_recover *trans,
19261846 int pass)
19271847 {
1928
- xlog_recover_item_t *item, *n;
1848
+ struct xlog_recover_item *item, *n;
19291849 int error = 0;
19301850 LIST_HEAD(sort_list);
19311851 LIST_HEAD(cancel_list);
19321852 LIST_HEAD(buffer_list);
19331853 LIST_HEAD(inode_buffer_list);
1934
- LIST_HEAD(inode_list);
1854
+ LIST_HEAD(item_list);
19351855
19361856 list_splice_init(&trans->r_itemq, &sort_list);
19371857 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1938
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1858
+ enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST;
19391859
1940
- switch (ITEM_TYPE(item)) {
1941
- case XFS_LI_ICREATE:
1942
- list_move_tail(&item->ri_list, &buffer_list);
1943
- break;
1944
- case XFS_LI_BUF:
1945
- if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1946
- trace_xfs_log_recover_item_reorder_head(log,
1947
- trans, item, pass);
1948
- list_move(&item->ri_list, &cancel_list);
1949
- break;
1950
- }
1951
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
1952
- list_move(&item->ri_list, &inode_buffer_list);
1953
- break;
1954
- }
1955
- list_move_tail(&item->ri_list, &buffer_list);
1956
- break;
1957
- case XFS_LI_INODE:
1958
- case XFS_LI_DQUOT:
1959
- case XFS_LI_QUOTAOFF:
1960
- case XFS_LI_EFD:
1961
- case XFS_LI_EFI:
1962
- case XFS_LI_RUI:
1963
- case XFS_LI_RUD:
1964
- case XFS_LI_CUI:
1965
- case XFS_LI_CUD:
1966
- case XFS_LI_BUI:
1967
- case XFS_LI_BUD:
1968
- trace_xfs_log_recover_item_reorder_tail(log,
1969
- trans, item, pass);
1970
- list_move_tail(&item->ri_list, &inode_list);
1971
- break;
1972
- default:
1860
+ item->ri_ops = xlog_find_item_ops(item);
1861
+ if (!item->ri_ops) {
19731862 xfs_warn(log->l_mp,
1974
- "%s: unrecognized type of log operation",
1975
- __func__);
1863
+ "%s: unrecognized type of log operation (%d)",
1864
+ __func__, ITEM_TYPE(item));
19761865 ASSERT(0);
19771866 /*
19781867 * return the remaining items back to the transaction
....@@ -1980,16 +1869,38 @@
19801869 */
19811870 if (!list_empty(&sort_list))
19821871 list_splice_init(&sort_list, &trans->r_itemq);
1983
- error = -EIO;
1984
- goto out;
1872
+ error = -EFSCORRUPTED;
1873
+ break;
1874
+ }
1875
+
1876
+ if (item->ri_ops->reorder)
1877
+ fate = item->ri_ops->reorder(item);
1878
+
1879
+ switch (fate) {
1880
+ case XLOG_REORDER_BUFFER_LIST:
1881
+ list_move_tail(&item->ri_list, &buffer_list);
1882
+ break;
1883
+ case XLOG_REORDER_CANCEL_LIST:
1884
+ trace_xfs_log_recover_item_reorder_head(log,
1885
+ trans, item, pass);
1886
+ list_move(&item->ri_list, &cancel_list);
1887
+ break;
1888
+ case XLOG_REORDER_INODE_BUFFER_LIST:
1889
+ list_move(&item->ri_list, &inode_buffer_list);
1890
+ break;
1891
+ case XLOG_REORDER_ITEM_LIST:
1892
+ trace_xfs_log_recover_item_reorder_tail(log,
1893
+ trans, item, pass);
1894
+ list_move_tail(&item->ri_list, &item_list);
1895
+ break;
19851896 }
19861897 }
1987
-out:
1898
+
19881899 ASSERT(list_empty(&sort_list));
19891900 if (!list_empty(&buffer_list))
19901901 list_splice(&buffer_list, &trans->r_itemq);
1991
- if (!list_empty(&inode_list))
1992
- list_splice_tail(&inode_list, &trans->r_itemq);
1902
+ if (!list_empty(&item_list))
1903
+ list_splice_tail(&item_list, &trans->r_itemq);
19931904 if (!list_empty(&inode_buffer_list))
19941905 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
19951906 if (!list_empty(&cancel_list))
....@@ -1997,2154 +1908,15 @@
19971908 return error;
19981909 }
19991910
2000
-/*
2001
- * Build up the table of buf cancel records so that we don't replay
2002
- * cancelled data in the second pass. For buffer records that are
2003
- * not cancel records, there is nothing to do here so we just return.
2004
- *
2005
- * If we get a cancel record which is already in the table, this indicates
2006
- * that the buffer was cancelled multiple times. In order to ensure
2007
- * that during pass 2 we keep the record in the table until we reach its
2008
- * last occurrence in the log, we keep a reference count in the cancel
2009
- * record in the table to tell us how many times we expect to see this
2010
- * record during the second pass.
2011
- */
2012
-STATIC int
2013
-xlog_recover_buffer_pass1(
2014
- struct xlog *log,
2015
- struct xlog_recover_item *item)
2016
-{
2017
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2018
- struct list_head *bucket;
2019
- struct xfs_buf_cancel *bcp;
2020
-
2021
- /*
2022
- * If this isn't a cancel buffer item, then just return.
2023
- */
2024
- if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
2025
- trace_xfs_log_recover_buf_not_cancel(log, buf_f);
2026
- return 0;
2027
- }
2028
-
2029
- /*
2030
- * Insert an xfs_buf_cancel record into the hash table of them.
2031
- * If there is already an identical record, bump its reference count.
2032
- */
2033
- bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
2034
- list_for_each_entry(bcp, bucket, bc_list) {
2035
- if (bcp->bc_blkno == buf_f->blf_blkno &&
2036
- bcp->bc_len == buf_f->blf_len) {
2037
- bcp->bc_refcount++;
2038
- trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
2039
- return 0;
2040
- }
2041
- }
2042
-
2043
- bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
2044
- bcp->bc_blkno = buf_f->blf_blkno;
2045
- bcp->bc_len = buf_f->blf_len;
2046
- bcp->bc_refcount = 1;
2047
- list_add_tail(&bcp->bc_list, bucket);
2048
-
2049
- trace_xfs_log_recover_buf_cancel_add(log, buf_f);
2050
- return 0;
2051
-}
2052
-
2053
-/*
2054
- * Check to see whether the buffer being recovered has a corresponding
2055
- * entry in the buffer cancel record table. If it is, return the cancel
2056
- * buffer structure to the caller.
2057
- */
2058
-STATIC struct xfs_buf_cancel *
2059
-xlog_peek_buffer_cancelled(
1911
+void
1912
+xlog_buf_readahead(
20601913 struct xlog *log,
20611914 xfs_daddr_t blkno,
20621915 uint len,
2063
- unsigned short flags)
1916
+ const struct xfs_buf_ops *ops)
20641917 {
2065
- struct list_head *bucket;
2066
- struct xfs_buf_cancel *bcp;
2067
-
2068
- if (!log->l_buf_cancel_table) {
2069
- /* empty table means no cancelled buffers in the log */
2070
- ASSERT(!(flags & XFS_BLF_CANCEL));
2071
- return NULL;
2072
- }
2073
-
2074
- bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
2075
- list_for_each_entry(bcp, bucket, bc_list) {
2076
- if (bcp->bc_blkno == blkno && bcp->bc_len == len)
2077
- return bcp;
2078
- }
2079
-
2080
- /*
2081
- * We didn't find a corresponding entry in the table, so return 0 so
2082
- * that the buffer is NOT cancelled.
2083
- */
2084
- ASSERT(!(flags & XFS_BLF_CANCEL));
2085
- return NULL;
2086
-}
2087
-
2088
-/*
2089
- * If the buffer is being cancelled then return 1 so that it will be cancelled,
2090
- * otherwise return 0. If the buffer is actually a buffer cancel item
2091
- * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
2092
- * table and remove it from the table if this is the last reference.
2093
- *
2094
- * We remove the cancel record from the table when we encounter its last
2095
- * occurrence in the log so that if the same buffer is re-used again after its
2096
- * last cancellation we actually replay the changes made at that point.
2097
- */
2098
-STATIC int
2099
-xlog_check_buffer_cancelled(
2100
- struct xlog *log,
2101
- xfs_daddr_t blkno,
2102
- uint len,
2103
- unsigned short flags)
2104
-{
2105
- struct xfs_buf_cancel *bcp;
2106
-
2107
- bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
2108
- if (!bcp)
2109
- return 0;
2110
-
2111
- /*
2112
- * We've go a match, so return 1 so that the recovery of this buffer
2113
- * is cancelled. If this buffer is actually a buffer cancel log
2114
- * item, then decrement the refcount on the one in the table and
2115
- * remove it if this is the last reference.
2116
- */
2117
- if (flags & XFS_BLF_CANCEL) {
2118
- if (--bcp->bc_refcount == 0) {
2119
- list_del(&bcp->bc_list);
2120
- kmem_free(bcp);
2121
- }
2122
- }
2123
- return 1;
2124
-}
2125
-
2126
-/*
2127
- * Perform recovery for a buffer full of inodes. In these buffers, the only
2128
- * data which should be recovered is that which corresponds to the
2129
- * di_next_unlinked pointers in the on disk inode structures. The rest of the
2130
- * data for the inodes is always logged through the inodes themselves rather
2131
- * than the inode buffer and is recovered in xlog_recover_inode_pass2().
2132
- *
2133
- * The only time when buffers full of inodes are fully recovered is when the
2134
- * buffer is full of newly allocated inodes. In this case the buffer will
2135
- * not be marked as an inode buffer and so will be sent to
2136
- * xlog_recover_do_reg_buffer() below during recovery.
2137
- */
2138
-STATIC int
2139
-xlog_recover_do_inode_buffer(
2140
- struct xfs_mount *mp,
2141
- xlog_recover_item_t *item,
2142
- struct xfs_buf *bp,
2143
- xfs_buf_log_format_t *buf_f)
2144
-{
2145
- int i;
2146
- int item_index = 0;
2147
- int bit = 0;
2148
- int nbits = 0;
2149
- int reg_buf_offset = 0;
2150
- int reg_buf_bytes = 0;
2151
- int next_unlinked_offset;
2152
- int inodes_per_buf;
2153
- xfs_agino_t *logged_nextp;
2154
- xfs_agino_t *buffer_nextp;
2155
-
2156
- trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
2157
-
2158
- /*
2159
- * Post recovery validation only works properly on CRC enabled
2160
- * filesystems.
2161
- */
2162
- if (xfs_sb_version_hascrc(&mp->m_sb))
2163
- bp->b_ops = &xfs_inode_buf_ops;
2164
-
2165
- inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
2166
- for (i = 0; i < inodes_per_buf; i++) {
2167
- next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
2168
- offsetof(xfs_dinode_t, di_next_unlinked);
2169
-
2170
- while (next_unlinked_offset >=
2171
- (reg_buf_offset + reg_buf_bytes)) {
2172
- /*
2173
- * The next di_next_unlinked field is beyond
2174
- * the current logged region. Find the next
2175
- * logged region that contains or is beyond
2176
- * the current di_next_unlinked field.
2177
- */
2178
- bit += nbits;
2179
- bit = xfs_next_bit(buf_f->blf_data_map,
2180
- buf_f->blf_map_size, bit);
2181
-
2182
- /*
2183
- * If there are no more logged regions in the
2184
- * buffer, then we're done.
2185
- */
2186
- if (bit == -1)
2187
- return 0;
2188
-
2189
- nbits = xfs_contig_bits(buf_f->blf_data_map,
2190
- buf_f->blf_map_size, bit);
2191
- ASSERT(nbits > 0);
2192
- reg_buf_offset = bit << XFS_BLF_SHIFT;
2193
- reg_buf_bytes = nbits << XFS_BLF_SHIFT;
2194
- item_index++;
2195
- }
2196
-
2197
- /*
2198
- * If the current logged region starts after the current
2199
- * di_next_unlinked field, then move on to the next
2200
- * di_next_unlinked field.
2201
- */
2202
- if (next_unlinked_offset < reg_buf_offset)
2203
- continue;
2204
-
2205
- ASSERT(item->ri_buf[item_index].i_addr != NULL);
2206
- ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
2207
- ASSERT((reg_buf_offset + reg_buf_bytes) <=
2208
- BBTOB(bp->b_io_length));
2209
-
2210
- /*
2211
- * The current logged region contains a copy of the
2212
- * current di_next_unlinked field. Extract its value
2213
- * and copy it to the buffer copy.
2214
- */
2215
- logged_nextp = item->ri_buf[item_index].i_addr +
2216
- next_unlinked_offset - reg_buf_offset;
2217
- if (unlikely(*logged_nextp == 0)) {
2218
- xfs_alert(mp,
2219
- "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
2220
- "Trying to replay bad (0) inode di_next_unlinked field.",
2221
- item, bp);
2222
- XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
2223
- XFS_ERRLEVEL_LOW, mp);
2224
- return -EFSCORRUPTED;
2225
- }
2226
-
2227
- buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
2228
- *buffer_nextp = *logged_nextp;
2229
-
2230
- /*
2231
- * If necessary, recalculate the CRC in the on-disk inode. We
2232
- * have to leave the inode in a consistent state for whoever
2233
- * reads it next....
2234
- */
2235
- xfs_dinode_calc_crc(mp,
2236
- xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
2237
-
2238
- }
2239
-
2240
- return 0;
2241
-}
2242
-
2243
-/*
2244
- * V5 filesystems know the age of the buffer on disk being recovered. We can
2245
- * have newer objects on disk than we are replaying, and so for these cases we
2246
- * don't want to replay the current change as that will make the buffer contents
2247
- * temporarily invalid on disk.
2248
- *
2249
- * The magic number might not match the buffer type we are going to recover
2250
- * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
2251
- * extract the LSN of the existing object in the buffer based on it's current
2252
- * magic number. If we don't recognise the magic number in the buffer, then
2253
- * return a LSN of -1 so that the caller knows it was an unrecognised block and
2254
- * so can recover the buffer.
2255
- *
2256
- * Note: we cannot rely solely on magic number matches to determine that the
2257
- * buffer has a valid LSN - we also need to verify that it belongs to this
2258
- * filesystem, so we need to extract the object's LSN and compare it to that
2259
- * which we read from the superblock. If the UUIDs don't match, then we've got a
2260
- * stale metadata block from an old filesystem instance that we need to recover
2261
- * over the top of.
2262
- */
2263
-static xfs_lsn_t
2264
-xlog_recover_get_buf_lsn(
2265
- struct xfs_mount *mp,
2266
- struct xfs_buf *bp)
2267
-{
2268
- uint32_t magic32;
2269
- uint16_t magic16;
2270
- uint16_t magicda;
2271
- void *blk = bp->b_addr;
2272
- uuid_t *uuid;
2273
- xfs_lsn_t lsn = -1;
2274
-
2275
- /* v4 filesystems always recover immediately */
2276
- if (!xfs_sb_version_hascrc(&mp->m_sb))
2277
- goto recover_immediately;
2278
-
2279
- magic32 = be32_to_cpu(*(__be32 *)blk);
2280
- switch (magic32) {
2281
- case XFS_ABTB_CRC_MAGIC:
2282
- case XFS_ABTC_CRC_MAGIC:
2283
- case XFS_ABTB_MAGIC:
2284
- case XFS_ABTC_MAGIC:
2285
- case XFS_RMAP_CRC_MAGIC:
2286
- case XFS_REFC_CRC_MAGIC:
2287
- case XFS_IBT_CRC_MAGIC:
2288
- case XFS_IBT_MAGIC: {
2289
- struct xfs_btree_block *btb = blk;
2290
-
2291
- lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
2292
- uuid = &btb->bb_u.s.bb_uuid;
2293
- break;
2294
- }
2295
- case XFS_BMAP_CRC_MAGIC:
2296
- case XFS_BMAP_MAGIC: {
2297
- struct xfs_btree_block *btb = blk;
2298
-
2299
- lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
2300
- uuid = &btb->bb_u.l.bb_uuid;
2301
- break;
2302
- }
2303
- case XFS_AGF_MAGIC:
2304
- lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
2305
- uuid = &((struct xfs_agf *)blk)->agf_uuid;
2306
- break;
2307
- case XFS_AGFL_MAGIC:
2308
- lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
2309
- uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
2310
- break;
2311
- case XFS_AGI_MAGIC:
2312
- lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
2313
- uuid = &((struct xfs_agi *)blk)->agi_uuid;
2314
- break;
2315
- case XFS_SYMLINK_MAGIC:
2316
- lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
2317
- uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
2318
- break;
2319
- case XFS_DIR3_BLOCK_MAGIC:
2320
- case XFS_DIR3_DATA_MAGIC:
2321
- case XFS_DIR3_FREE_MAGIC:
2322
- lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
2323
- uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
2324
- break;
2325
- case XFS_ATTR3_RMT_MAGIC:
2326
- /*
2327
- * Remote attr blocks are written synchronously, rather than
2328
- * being logged. That means they do not contain a valid LSN
2329
- * (i.e. transactionally ordered) in them, and hence any time we
2330
- * see a buffer to replay over the top of a remote attribute
2331
- * block we should simply do so.
2332
- */
2333
- goto recover_immediately;
2334
- case XFS_SB_MAGIC:
2335
- /*
2336
- * superblock uuids are magic. We may or may not have a
2337
- * sb_meta_uuid on disk, but it will be set in the in-core
2338
- * superblock. We set the uuid pointer for verification
2339
- * according to the superblock feature mask to ensure we check
2340
- * the relevant UUID in the superblock.
2341
- */
2342
- lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
2343
- if (xfs_sb_version_hasmetauuid(&mp->m_sb))
2344
- uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
2345
- else
2346
- uuid = &((struct xfs_dsb *)blk)->sb_uuid;
2347
- break;
2348
- default:
2349
- break;
2350
- }
2351
-
2352
- if (lsn != (xfs_lsn_t)-1) {
2353
- if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
2354
- goto recover_immediately;
2355
- return lsn;
2356
- }
2357
-
2358
- magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
2359
- switch (magicda) {
2360
- case XFS_DIR3_LEAF1_MAGIC:
2361
- case XFS_DIR3_LEAFN_MAGIC:
2362
- case XFS_DA3_NODE_MAGIC:
2363
- lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
2364
- uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
2365
- break;
2366
- default:
2367
- break;
2368
- }
2369
-
2370
- if (lsn != (xfs_lsn_t)-1) {
2371
- if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
2372
- goto recover_immediately;
2373
- return lsn;
2374
- }
2375
-
2376
- /*
2377
- * We do individual object checks on dquot and inode buffers as they
2378
- * have their own individual LSN records. Also, we could have a stale
2379
- * buffer here, so we have to at least recognise these buffer types.
2380
- *
2381
- * A notd complexity here is inode unlinked list processing - it logs
2382
- * the inode directly in the buffer, but we don't know which inodes have
2383
- * been modified, and there is no global buffer LSN. Hence we need to
2384
- * recover all inode buffer types immediately. This problem will be
2385
- * fixed by logical logging of the unlinked list modifications.
2386
- */
2387
- magic16 = be16_to_cpu(*(__be16 *)blk);
2388
- switch (magic16) {
2389
- case XFS_DQUOT_MAGIC:
2390
- case XFS_DINODE_MAGIC:
2391
- goto recover_immediately;
2392
- default:
2393
- break;
2394
- }
2395
-
2396
- /* unknown buffer contents, recover immediately */
2397
-
2398
-recover_immediately:
2399
- return (xfs_lsn_t)-1;
2400
-
2401
-}
2402
-
2403
-/*
2404
- * Validate the recovered buffer is of the correct type and attach the
2405
- * appropriate buffer operations to them for writeback. Magic numbers are in a
2406
- * few places:
2407
- * the first 16 bits of the buffer (inode buffer, dquot buffer),
2408
- * the first 32 bits of the buffer (most blocks),
2409
- * inside a struct xfs_da_blkinfo at the start of the buffer.
2410
- */
2411
-static void
2412
-xlog_recover_validate_buf_type(
2413
- struct xfs_mount *mp,
2414
- struct xfs_buf *bp,
2415
- xfs_buf_log_format_t *buf_f,
2416
- xfs_lsn_t current_lsn)
2417
-{
2418
- struct xfs_da_blkinfo *info = bp->b_addr;
2419
- uint32_t magic32;
2420
- uint16_t magic16;
2421
- uint16_t magicda;
2422
- char *warnmsg = NULL;
2423
-
2424
- /*
2425
- * We can only do post recovery validation on items on CRC enabled
2426
- * fielsystems as we need to know when the buffer was written to be able
2427
- * to determine if we should have replayed the item. If we replay old
2428
- * metadata over a newer buffer, then it will enter a temporarily
2429
- * inconsistent state resulting in verification failures. Hence for now
2430
- * just avoid the verification stage for non-crc filesystems
2431
- */
2432
- if (!xfs_sb_version_hascrc(&mp->m_sb))
2433
- return;
2434
-
2435
- magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
2436
- magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
2437
- magicda = be16_to_cpu(info->magic);
2438
- switch (xfs_blft_from_flags(buf_f)) {
2439
- case XFS_BLFT_BTREE_BUF:
2440
- switch (magic32) {
2441
- case XFS_ABTB_CRC_MAGIC:
2442
- case XFS_ABTC_CRC_MAGIC:
2443
- case XFS_ABTB_MAGIC:
2444
- case XFS_ABTC_MAGIC:
2445
- bp->b_ops = &xfs_allocbt_buf_ops;
2446
- break;
2447
- case XFS_IBT_CRC_MAGIC:
2448
- case XFS_FIBT_CRC_MAGIC:
2449
- case XFS_IBT_MAGIC:
2450
- case XFS_FIBT_MAGIC:
2451
- bp->b_ops = &xfs_inobt_buf_ops;
2452
- break;
2453
- case XFS_BMAP_CRC_MAGIC:
2454
- case XFS_BMAP_MAGIC:
2455
- bp->b_ops = &xfs_bmbt_buf_ops;
2456
- break;
2457
- case XFS_RMAP_CRC_MAGIC:
2458
- bp->b_ops = &xfs_rmapbt_buf_ops;
2459
- break;
2460
- case XFS_REFC_CRC_MAGIC:
2461
- bp->b_ops = &xfs_refcountbt_buf_ops;
2462
- break;
2463
- default:
2464
- warnmsg = "Bad btree block magic!";
2465
- break;
2466
- }
2467
- break;
2468
- case XFS_BLFT_AGF_BUF:
2469
- if (magic32 != XFS_AGF_MAGIC) {
2470
- warnmsg = "Bad AGF block magic!";
2471
- break;
2472
- }
2473
- bp->b_ops = &xfs_agf_buf_ops;
2474
- break;
2475
- case XFS_BLFT_AGFL_BUF:
2476
- if (magic32 != XFS_AGFL_MAGIC) {
2477
- warnmsg = "Bad AGFL block magic!";
2478
- break;
2479
- }
2480
- bp->b_ops = &xfs_agfl_buf_ops;
2481
- break;
2482
- case XFS_BLFT_AGI_BUF:
2483
- if (magic32 != XFS_AGI_MAGIC) {
2484
- warnmsg = "Bad AGI block magic!";
2485
- break;
2486
- }
2487
- bp->b_ops = &xfs_agi_buf_ops;
2488
- break;
2489
- case XFS_BLFT_UDQUOT_BUF:
2490
- case XFS_BLFT_PDQUOT_BUF:
2491
- case XFS_BLFT_GDQUOT_BUF:
2492
-#ifdef CONFIG_XFS_QUOTA
2493
- if (magic16 != XFS_DQUOT_MAGIC) {
2494
- warnmsg = "Bad DQUOT block magic!";
2495
- break;
2496
- }
2497
- bp->b_ops = &xfs_dquot_buf_ops;
2498
-#else
2499
- xfs_alert(mp,
2500
- "Trying to recover dquots without QUOTA support built in!");
2501
- ASSERT(0);
2502
-#endif
2503
- break;
2504
- case XFS_BLFT_DINO_BUF:
2505
- if (magic16 != XFS_DINODE_MAGIC) {
2506
- warnmsg = "Bad INODE block magic!";
2507
- break;
2508
- }
2509
- bp->b_ops = &xfs_inode_buf_ops;
2510
- break;
2511
- case XFS_BLFT_SYMLINK_BUF:
2512
- if (magic32 != XFS_SYMLINK_MAGIC) {
2513
- warnmsg = "Bad symlink block magic!";
2514
- break;
2515
- }
2516
- bp->b_ops = &xfs_symlink_buf_ops;
2517
- break;
2518
- case XFS_BLFT_DIR_BLOCK_BUF:
2519
- if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
2520
- magic32 != XFS_DIR3_BLOCK_MAGIC) {
2521
- warnmsg = "Bad dir block magic!";
2522
- break;
2523
- }
2524
- bp->b_ops = &xfs_dir3_block_buf_ops;
2525
- break;
2526
- case XFS_BLFT_DIR_DATA_BUF:
2527
- if (magic32 != XFS_DIR2_DATA_MAGIC &&
2528
- magic32 != XFS_DIR3_DATA_MAGIC) {
2529
- warnmsg = "Bad dir data magic!";
2530
- break;
2531
- }
2532
- bp->b_ops = &xfs_dir3_data_buf_ops;
2533
- break;
2534
- case XFS_BLFT_DIR_FREE_BUF:
2535
- if (magic32 != XFS_DIR2_FREE_MAGIC &&
2536
- magic32 != XFS_DIR3_FREE_MAGIC) {
2537
- warnmsg = "Bad dir3 free magic!";
2538
- break;
2539
- }
2540
- bp->b_ops = &xfs_dir3_free_buf_ops;
2541
- break;
2542
- case XFS_BLFT_DIR_LEAF1_BUF:
2543
- if (magicda != XFS_DIR2_LEAF1_MAGIC &&
2544
- magicda != XFS_DIR3_LEAF1_MAGIC) {
2545
- warnmsg = "Bad dir leaf1 magic!";
2546
- break;
2547
- }
2548
- bp->b_ops = &xfs_dir3_leaf1_buf_ops;
2549
- break;
2550
- case XFS_BLFT_DIR_LEAFN_BUF:
2551
- if (magicda != XFS_DIR2_LEAFN_MAGIC &&
2552
- magicda != XFS_DIR3_LEAFN_MAGIC) {
2553
- warnmsg = "Bad dir leafn magic!";
2554
- break;
2555
- }
2556
- bp->b_ops = &xfs_dir3_leafn_buf_ops;
2557
- break;
2558
- case XFS_BLFT_DA_NODE_BUF:
2559
- if (magicda != XFS_DA_NODE_MAGIC &&
2560
- magicda != XFS_DA3_NODE_MAGIC) {
2561
- warnmsg = "Bad da node magic!";
2562
- break;
2563
- }
2564
- bp->b_ops = &xfs_da3_node_buf_ops;
2565
- break;
2566
- case XFS_BLFT_ATTR_LEAF_BUF:
2567
- if (magicda != XFS_ATTR_LEAF_MAGIC &&
2568
- magicda != XFS_ATTR3_LEAF_MAGIC) {
2569
- warnmsg = "Bad attr leaf magic!";
2570
- break;
2571
- }
2572
- bp->b_ops = &xfs_attr3_leaf_buf_ops;
2573
- break;
2574
- case XFS_BLFT_ATTR_RMT_BUF:
2575
- if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2576
- warnmsg = "Bad attr remote magic!";
2577
- break;
2578
- }
2579
- bp->b_ops = &xfs_attr3_rmt_buf_ops;
2580
- break;
2581
- case XFS_BLFT_SB_BUF:
2582
- if (magic32 != XFS_SB_MAGIC) {
2583
- warnmsg = "Bad SB block magic!";
2584
- break;
2585
- }
2586
- bp->b_ops = &xfs_sb_buf_ops;
2587
- break;
2588
-#ifdef CONFIG_XFS_RT
2589
- case XFS_BLFT_RTBITMAP_BUF:
2590
- case XFS_BLFT_RTSUMMARY_BUF:
2591
- /* no magic numbers for verification of RT buffers */
2592
- bp->b_ops = &xfs_rtbuf_ops;
2593
- break;
2594
-#endif /* CONFIG_XFS_RT */
2595
- default:
2596
- xfs_warn(mp, "Unknown buffer type %d!",
2597
- xfs_blft_from_flags(buf_f));
2598
- break;
2599
- }
2600
-
2601
- /*
2602
- * Nothing else to do in the case of a NULL current LSN as this means
2603
- * the buffer is more recent than the change in the log and will be
2604
- * skipped.
2605
- */
2606
- if (current_lsn == NULLCOMMITLSN)
2607
- return;
2608
-
2609
- if (warnmsg) {
2610
- xfs_warn(mp, warnmsg);
2611
- ASSERT(0);
2612
- }
2613
-
2614
- /*
2615
- * We must update the metadata LSN of the buffer as it is written out to
2616
- * ensure that older transactions never replay over this one and corrupt
2617
- * the buffer. This can occur if log recovery is interrupted at some
2618
- * point after the current transaction completes, at which point a
2619
- * subsequent mount starts recovery from the beginning.
2620
- *
2621
- * Write verifiers update the metadata LSN from log items attached to
2622
- * the buffer. Therefore, initialize a bli purely to carry the LSN to
2623
- * the verifier. We'll clean it up in our ->iodone() callback.
2624
- */
2625
- if (bp->b_ops) {
2626
- struct xfs_buf_log_item *bip;
2627
-
2628
- ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
2629
- bp->b_iodone = xlog_recover_iodone;
2630
- xfs_buf_item_init(bp, mp);
2631
- bip = bp->b_log_item;
2632
- bip->bli_item.li_lsn = current_lsn;
2633
- }
2634
-}
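
The validator above is essentially a two-level dispatch: the buffer type recorded in the log format selects a case, and the observed on-disk magic must agree before write verifier ops are attached. A compressed sketch of that shape; the demo_* types, magic values and ops are invented for illustration:

#include <stdint.h>
#include <stdio.h>

struct demo_buf_ops { const char *name; };

static const struct demo_buf_ops demo_agf_ops = { "agf" };
static const struct demo_buf_ops demo_agi_ops = { "agi" };

enum demo_blft { DEMO_BLFT_AGF, DEMO_BLFT_AGI };

#define DEMO_AGF_MAGIC	0x58414746u	/* "XAGF" */
#define DEMO_AGI_MAGIC	0x58414749u	/* "XAGI" */

/* Return the ops to attach, or NULL when the magic contradicts the type. */
static const struct demo_buf_ops *
demo_validate_buf_type(enum demo_blft type, uint32_t magic32)
{
	switch (type) {
	case DEMO_BLFT_AGF:
		return magic32 == DEMO_AGF_MAGIC ? &demo_agf_ops : NULL;
	case DEMO_BLFT_AGI:
		return magic32 == DEMO_AGI_MAGIC ? &demo_agi_ops : NULL;
	}
	return NULL;
}

int main(void)
{
	const struct demo_buf_ops *ops;

	ops = demo_validate_buf_type(DEMO_BLFT_AGF, DEMO_AGF_MAGIC);
	printf("%s\n", ops ? ops->name : "bad magic, ops left unset");
	return 0;
}
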
2635
-
2636
-/*
2637
- * Perform a 'normal' buffer recovery. Each logged region of the
2638
- * buffer should be copied over the corresponding region in the
2639
- * given buffer. The bitmap in the buf log format structure indicates
2640
- * where to place the logged data.
2641
- */
2642
-STATIC void
2643
-xlog_recover_do_reg_buffer(
2644
- struct xfs_mount *mp,
2645
- xlog_recover_item_t *item,
2646
- struct xfs_buf *bp,
2647
- xfs_buf_log_format_t *buf_f,
2648
- xfs_lsn_t current_lsn)
2649
-{
2650
- int i;
2651
- int bit;
2652
- int nbits;
2653
- xfs_failaddr_t fa;
2654
-
2655
- trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
2656
-
2657
- bit = 0;
2658
- i = 1; /* 0 is the buf format structure */
2659
- while (1) {
2660
- bit = xfs_next_bit(buf_f->blf_data_map,
2661
- buf_f->blf_map_size, bit);
2662
- if (bit == -1)
2663
- break;
2664
- nbits = xfs_contig_bits(buf_f->blf_data_map,
2665
- buf_f->blf_map_size, bit);
2666
- ASSERT(nbits > 0);
2667
- ASSERT(item->ri_buf[i].i_addr != NULL);
2668
- ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
2669
- ASSERT(BBTOB(bp->b_io_length) >=
2670
- ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
2671
-
2672
- /*
2673
- * The dirty regions logged in the buffer, even though
2674
- * contiguous, may span multiple chunks. This is because the
2675
- * dirty region may span a physical page boundary in a buffer
2676
- * and hence be split into two separate vectors for writing into
2677
- * the log. Hence we need to trim nbits back to the length of
2678
- * the current region being copied out of the log.
2679
- */
2680
- if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
2681
- nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
2682
-
2683
- /*
2684
- * Do a sanity check if this is a dquot buffer. Just checking
2685
- * the first dquot in the buffer should do. XXXThis is
2686
- * probably a good thing to do for other buf types also.
2687
- */
2688
- fa = NULL;
2689
- if (buf_f->blf_flags &
2690
- (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2691
- if (item->ri_buf[i].i_addr == NULL) {
2692
- xfs_alert(mp,
2693
- "XFS: NULL dquot in %s.", __func__);
2694
- goto next;
2695
- }
2696
- if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
2697
- xfs_alert(mp,
2698
- "XFS: dquot too small (%d) in %s.",
2699
- item->ri_buf[i].i_len, __func__);
2700
- goto next;
2701
- }
2702
- fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr,
2703
- -1, 0);
2704
- if (fa) {
2705
- xfs_alert(mp,
2706
- "dquot corrupt at %pS trying to replay into block 0x%llx",
2707
- fa, bp->b_bn);
2708
- goto next;
2709
- }
2710
- }
2711
-
2712
- memcpy(xfs_buf_offset(bp,
2713
- (uint)bit << XFS_BLF_SHIFT), /* dest */
2714
- item->ri_buf[i].i_addr, /* source */
2715
- nbits<<XFS_BLF_SHIFT); /* length */
2716
- next:
2717
- i++;
2718
- bit += nbits;
2719
- }
2720
-
2721
- /* Shouldn't be any more regions */
2722
- ASSERT(i == item->ri_total);
2723
-
2724
- xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
2725
-}
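
The bitmap walk above is the heart of region replay: find the next dirty bit, count the contiguous run, then copy that many chunks from the log region into the buffer. A standalone sketch of the same walk, assuming the 128-byte chunk granularity implied by XFS_BLF_SHIFT; next_bit() and contig_bits() are simplified stand-ins for the xfs_next_bit()/xfs_contig_bits() helpers:

#include <stdint.h>
#include <stdio.h>

#define CHUNK_SHIFT	7		/* 128-byte chunks, as XFS_BLF_SHIFT */

/* Find the next set bit at or after 'start', or -1 if none remain. */
static int next_bit(const uint32_t *map, int nwords, int start)
{
	for (int i = start; i < nwords * 32; i++)
		if (map[i / 32] & (1u << (i % 32)))
			return i;
	return -1;
}

/* Count the contiguous set bits starting at 'start'. */
static int contig_bits(const uint32_t *map, int nwords, int start)
{
	int n = 0;

	while (start + n < nwords * 32 &&
	       (map[(start + n) / 32] & (1u << ((start + n) % 32))))
		n++;
	return n;
}

int main(void)
{
	uint32_t map[1] = { 0x0000001cu };	/* chunks 2..4 are dirty */
	int bit = 0;

	while ((bit = next_bit(map, 1, bit)) != -1) {
		int nbits = contig_bits(map, 1, bit);

		printf("copy %d bytes to buffer offset %d\n",
		       nbits << CHUNK_SHIFT, bit << CHUNK_SHIFT);
		bit += nbits;
	}
	return 0;
}
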
2726
-
2727
-/*
2728
- * Perform a dquot buffer recovery.
2729
- * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2730
- * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2731
- * Else, treat it as a regular buffer and do recovery.
2732
- *
2733
- * Return false if the buffer was tossed and true if we recovered the buffer to
2734
- * indicate to the caller if the buffer needs writing.
2735
- */
2736
-STATIC bool
2737
-xlog_recover_do_dquot_buffer(
2738
- struct xfs_mount *mp,
2739
- struct xlog *log,
2740
- struct xlog_recover_item *item,
2741
- struct xfs_buf *bp,
2742
- struct xfs_buf_log_format *buf_f)
2743
-{
2744
- uint type;
2745
-
2746
- trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2747
-
2748
- /*
2749
- * Filesystems are required to send in quota flags at mount time.
2750
- */
2751
- if (!mp->m_qflags)
2752
- return false;
2753
-
2754
- type = 0;
2755
- if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2756
- type |= XFS_DQ_USER;
2757
- if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2758
- type |= XFS_DQ_PROJ;
2759
- if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2760
- type |= XFS_DQ_GROUP;
2761
- /*
2762
- * This type of quota was turned off, so ignore this buffer
2763
- */
2764
- if (log->l_quotaoffs_flag & type)
2765
- return false;
2766
-
2767
- xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
2768
- return true;
2769
-}
2770
-
2771
-/*
2772
- * This routine replays a modification made to a buffer at runtime.
2773
- * There are actually two types of buffer, regular and inode, which
2774
- * are handled differently. From inode buffers we only recover
2775
- * a specific set of data, namely
2776
- * the inode di_next_unlinked fields. This is because all other inode
2777
- * data is actually logged via inode records and any data we replay
2778
- * here which overlaps that may be stale.
2779
- *
2780
- * When meta-data buffers are freed at run time we log a buffer item
2781
- * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2782
- * of the buffer in the log should not be replayed at recovery time.
2783
- * This is so that if the blocks covered by the buffer are reused for
2784
- * file data before we crash we don't end up replaying old, freed
2785
- * meta-data into a user's file.
2786
- *
2787
- * To handle the cancellation of buffer log items, we make two passes
2788
- * over the log during recovery. During the first we build a table of
2789
- * those buffers which have been cancelled, and during the second we
2790
- * only replay those buffers which do not have corresponding cancel
2791
- * records in the table. See xlog_recover_buffer_pass[1,2] above
2792
- * for more details on the implementation of the table of cancel records.
2793
- */
2794
-STATIC int
2795
-xlog_recover_buffer_pass2(
2796
- struct xlog *log,
2797
- struct list_head *buffer_list,
2798
- struct xlog_recover_item *item,
2799
- xfs_lsn_t current_lsn)
2800
-{
2801
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2802
- xfs_mount_t *mp = log->l_mp;
2803
- xfs_buf_t *bp;
2804
- int error;
2805
- uint buf_flags;
2806
- xfs_lsn_t lsn;
2807
-
2808
- /*
2809
- * In this pass we only want to recover all the buffers which have
2810
- * not been cancelled and are not cancellation buffers themselves.
2811
- */
2812
- if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2813
- buf_f->blf_len, buf_f->blf_flags)) {
2814
- trace_xfs_log_recover_buf_cancel(log, buf_f);
2815
- return 0;
2816
- }
2817
-
2818
- trace_xfs_log_recover_buf_recover(log, buf_f);
2819
-
2820
- buf_flags = 0;
2821
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2822
- buf_flags |= XBF_UNMAPPED;
2823
-
2824
- bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2825
- buf_flags, NULL);
2826
- if (!bp)
2827
- return -ENOMEM;
2828
- error = bp->b_error;
2829
- if (error) {
2830
- xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2831
- goto out_release;
2832
- }
2833
-
2834
- /*
2835
- * Recover the buffer only if we get an LSN from it and it's less than
2836
- * the lsn of the transaction we are replaying.
2837
- *
2838
- * Note that we have to be extremely careful of readahead here.
2839
- * Readahead does not attach verifiers to the buffers, so if we do
2840
- * not actually replay anything after readahead because the LSN found
2841
- * in the buffer is more recent than the current transaction, we need
2842
- * to attach the verifier directly. Failure to do so means future
2843
- * recovery actions (e.g. EFI and unlinked list recovery) can operate
2844
- * on the buffers without a verifier attached, which can leave blocks
2845
- * on disk with the correct content but a stale
2846
- * CRC.
2847
- *
2848
- * It is safe to assume these clean buffers are currently up to date.
2849
- * If the buffer is dirtied by a later transaction being replayed, then
2850
- * the verifier will be reset to match whatever recovery turns that
2851
- * buffer into.
2852
- */
2853
- lsn = xlog_recover_get_buf_lsn(mp, bp);
2854
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2855
- trace_xfs_log_recover_buf_skip(log, buf_f);
2856
- xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
2857
- goto out_release;
2858
- }
2859
-
2860
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2861
- error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2862
- if (error)
2863
- goto out_release;
2864
- } else if (buf_f->blf_flags &
2865
- (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2866
- bool dirty;
2867
-
2868
- dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2869
- if (!dirty)
2870
- goto out_release;
2871
- } else {
2872
- xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
2873
- }
2874
-
2875
- /*
2876
- * Perform delayed write on the buffer. Asynchronous writes will be
2877
- * slower when taking into account all the buffers to be flushed.
2878
- *
2879
- * Also make sure that only inode buffers with good sizes stay in
2880
- * the buffer cache. The kernel moves inodes in buffers of 1 block
2881
- * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode
2882
- * buffers in the log can be a different size if the log was generated
2883
- * by an older kernel using unclustered inode buffers or a newer kernel
2884
- * running with a different inode cluster size. Regardless, if
2885
- * the inode buffer size isn't max(blocksize, mp->m_inode_cluster_size)
2886
- * for *our* value of mp->m_inode_cluster_size, then we need to keep
2887
- * the buffer out of the buffer cache so that the buffer won't
2888
- * overlap with future reads of those inodes.
2889
- */
2890
- if (XFS_DINODE_MAGIC ==
2891
- be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2892
- (BBTOB(bp->b_io_length) != max(log->l_mp->m_sb.sb_blocksize,
2893
- (uint32_t)log->l_mp->m_inode_cluster_size))) {
2894
- xfs_buf_stale(bp);
2895
- error = xfs_bwrite(bp);
2896
- } else {
2897
- ASSERT(bp->b_target->bt_mount == mp);
2898
- bp->b_iodone = xlog_recover_iodone;
2899
- xfs_buf_delwri_queue(bp, buffer_list);
2900
- }
2901
-
2902
-out_release:
2903
- xfs_buf_relse(bp);
2904
- return error;
2905
-}
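
The skip decision above hinges on LSN ordering. An XFS LSN packs a log cycle number into the high 32 bits and a block number into the low 32 bits; the sketch below mirrors that comparison (the macro names follow the kernel's CYCLE_LSN/BLOCK_LSN, treated here as an assumption about the packing):

#include <stdint.h>
#include <stdio.h>

typedef int64_t demo_lsn_t;

#define CYCLE_LSN(lsn)	((uint32_t)((uint64_t)(lsn) >> 32))
#define BLOCK_LSN(lsn)	((uint32_t)(lsn))

/* Return <0, 0 or >0 for a before, equal to, or after b. */
static int demo_lsn_cmp(demo_lsn_t a, demo_lsn_t b)
{
	if (CYCLE_LSN(a) != CYCLE_LSN(b))
		return CYCLE_LSN(a) < CYCLE_LSN(b) ? -1 : 1;
	if (BLOCK_LSN(a) != BLOCK_LSN(b))
		return BLOCK_LSN(a) < BLOCK_LSN(b) ? -1 : 1;
	return 0;
}

int main(void)
{
	demo_lsn_t buf_lsn = ((demo_lsn_t)7 << 32) | 1024;
	demo_lsn_t current_lsn = ((demo_lsn_t)7 << 32) | 512;

	/* Same shape as the replay test: skip unless the buffer is older. */
	if (buf_lsn && buf_lsn != -1 &&
	    demo_lsn_cmp(buf_lsn, current_lsn) >= 0)
		puts("buffer is not older than the transaction: skip replay");
	return 0;
}
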
2906
-
2907
-/*
2908
- * Inode fork owner changes
2909
- *
2910
- * If we have been told that we have to reparent the inode fork, it's because an
2911
- * extent swap operation on a CRC enabled filesystem has been done and we are
2912
- * replaying it. We need to walk the BMBT of the appropriate fork and change the
2913
- * owners of it.
2914
- *
2915
- * The complexity here is that we don't have an inode context to work with, so
2916
- * after we've replayed the inode we need to instantiate one. This is where the
2917
- * fun begins.
2918
- *
2919
- * We are in the middle of log recovery, so we can't run transactions. That
2920
- * means we cannot use cache coherent inode instantiation via xfs_iget(), as
2921
- * that will result in the corresponding iput() running the inode through
2922
- * xfs_inactive(). If we've just replayed an inode core that changes the link
2923
- * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
2924
- * transactions (bad!).
2925
- *
2926
- * So, to avoid this, we instantiate an inode directly from the inode core we've
2927
- * just recovered. We have the buffer still locked, and all we really need to
2928
- * instantiate is the inode core and the forks being modified. We can do this
2929
- * manually, then run the inode btree owner change, and then tear down the
2930
- * xfs_inode without having to run any transactions at all.
2931
- *
2932
- * Also, because we don't have a transaction context available here but
2933
- * need to gather all the buffers we modify for writeback, we pass the
2934
- * buffer_list to the operation to use instead.
2935
- */
2936
-
2937
-STATIC int
2938
-xfs_recover_inode_owner_change(
2939
- struct xfs_mount *mp,
2940
- struct xfs_dinode *dip,
2941
- struct xfs_inode_log_format *in_f,
2942
- struct list_head *buffer_list)
2943
-{
2944
- struct xfs_inode *ip;
2945
- int error;
2946
-
2947
- ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
2948
-
2949
- ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2950
- if (!ip)
2951
- return -ENOMEM;
2952
-
2953
- /* instantiate the inode */
2954
- xfs_inode_from_disk(ip, dip);
2955
- ASSERT(ip->i_d.di_version >= 3);
2956
-
2957
- error = xfs_iformat_fork(ip, dip);
2958
- if (error)
2959
- goto out_free_ip;
2960
-
2961
- if (!xfs_inode_verify_forks(ip)) {
2962
- error = -EFSCORRUPTED;
2963
- goto out_free_ip;
2964
- }
2965
-
2966
- if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
2967
- ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
2968
- error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
2969
- ip->i_ino, buffer_list);
2970
- if (error)
2971
- goto out_free_ip;
2972
- }
2973
-
2974
- if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
2975
- ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
2976
- error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
2977
- ip->i_ino, buffer_list);
2978
- if (error)
2979
- goto out_free_ip;
2980
- }
2981
-
2982
-out_free_ip:
2983
- xfs_inode_free(ip);
2984
- return error;
2985
-}
2986
-
2987
-STATIC int
2988
-xlog_recover_inode_pass2(
2989
- struct xlog *log,
2990
- struct list_head *buffer_list,
2991
- struct xlog_recover_item *item,
2992
- xfs_lsn_t current_lsn)
2993
-{
2994
- struct xfs_inode_log_format *in_f;
2995
- xfs_mount_t *mp = log->l_mp;
2996
- xfs_buf_t *bp;
2997
- xfs_dinode_t *dip;
2998
- int len;
2999
- char *src;
3000
- char *dest;
3001
- int error;
3002
- int attr_index;
3003
- uint fields;
3004
- struct xfs_log_dinode *ldip;
3005
- uint isize;
3006
- int need_free = 0;
3007
-
3008
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3009
- in_f = item->ri_buf[0].i_addr;
3010
- } else {
3011
- in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP);
3012
- need_free = 1;
3013
- error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
3014
- if (error)
3015
- goto error;
3016
- }
3017
-
3018
- /*
3019
- * Inode buffers can be freed, look out for it,
3020
- * and do not replay the inode.
3021
- */
3022
- if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
3023
- in_f->ilf_len, 0)) {
3024
- error = 0;
3025
- trace_xfs_log_recover_inode_cancel(log, in_f);
3026
- goto error;
3027
- }
3028
- trace_xfs_log_recover_inode_recover(log, in_f);
3029
-
3030
- bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
3031
- &xfs_inode_buf_ops);
3032
- if (!bp) {
3033
- error = -ENOMEM;
3034
- goto error;
3035
- }
3036
- error = bp->b_error;
3037
- if (error) {
3038
- xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
3039
- goto out_release;
3040
- }
3041
- ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
3042
- dip = xfs_buf_offset(bp, in_f->ilf_boffset);
3043
-
3044
- /*
3045
- * Make sure the place we're flushing out to really looks
3046
- * like an inode!
3047
- */
3048
- if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
3049
- xfs_alert(mp,
3050
- "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
3051
- __func__, dip, bp, in_f->ilf_ino);
3052
- XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
3053
- XFS_ERRLEVEL_LOW, mp);
3054
- error = -EFSCORRUPTED;
3055
- goto out_release;
3056
- }
3057
- ldip = item->ri_buf[1].i_addr;
3058
- if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) {
3059
- xfs_alert(mp,
3060
- "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
3061
- __func__, item, in_f->ilf_ino);
3062
- XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
3063
- XFS_ERRLEVEL_LOW, mp);
3064
- error = -EFSCORRUPTED;
3065
- goto out_release;
3066
- }
3067
-
3068
- /*
3069
- * If the inode has an LSN in it, recover the inode only if it's less
3070
- * than the lsn of the transaction we are replaying. Note: we still
3071
- * need to replay an owner change even though the inode is more recent
3072
- * than the transaction as there is no guarantee that all the btree
3073
- * blocks are more recent than this transaction, too.
3074
- */
3075
- if (dip->di_version >= 3) {
3076
- xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
3077
-
3078
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3079
- trace_xfs_log_recover_inode_skip(log, in_f);
3080
- error = 0;
3081
- goto out_owner_change;
3082
- }
3083
- }
3084
-
3085
- /*
3086
- * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
3087
- * are transactional and if ordering is necessary we can determine that
3088
- * more accurately by the LSN field in the V3 inode core. Don't trust
3089
- * the inode versions as we might be changing them here - use the
3090
- * superblock flag to determine whether we need to look at di_flushiter
3091
- * to skip replay when the on-disk inode is newer than the log one.
3092
- */
3093
- if (!xfs_sb_version_hascrc(&mp->m_sb) &&
3094
- ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
3095
- /*
3096
- * Deal with the wrap case, DI_MAX_FLUSH is less
3097
- * than smaller numbers
3098
- */
3099
- if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
3100
- ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
3101
- /* do nothing */
3102
- } else {
3103
- trace_xfs_log_recover_inode_skip(log, in_f);
3104
- error = 0;
3105
- goto out_release;
3106
- }
3107
- }
3108
-
3109
- /* Take the opportunity to reset the flush iteration count */
3110
- ldip->di_flushiter = 0;
3111
-
3112
- if (unlikely(S_ISREG(ldip->di_mode))) {
3113
- if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3114
- (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
3115
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
3116
- XFS_ERRLEVEL_LOW, mp, ldip,
3117
- sizeof(*ldip));
3118
- xfs_alert(mp,
3119
- "%s: Bad regular inode log record, rec ptr "PTR_FMT", "
3120
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
3121
- __func__, item, dip, bp, in_f->ilf_ino);
3122
- error = -EFSCORRUPTED;
3123
- goto out_release;
3124
- }
3125
- } else if (unlikely(S_ISDIR(ldip->di_mode))) {
3126
- if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3127
- (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
3128
- (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
3129
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
3130
- XFS_ERRLEVEL_LOW, mp, ldip,
3131
- sizeof(*ldip));
3132
- xfs_alert(mp,
3133
- "%s: Bad dir inode log record, rec ptr "PTR_FMT", "
3134
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
3135
- __func__, item, dip, bp, in_f->ilf_ino);
3136
- error = -EFSCORRUPTED;
3137
- goto out_release;
3138
- }
3139
- }
3140
- if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
3141
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
3142
- XFS_ERRLEVEL_LOW, mp, ldip,
3143
- sizeof(*ldip));
3144
- xfs_alert(mp,
3145
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
3146
- "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
3147
- __func__, item, dip, bp, in_f->ilf_ino,
3148
- ldip->di_nextents + ldip->di_anextents,
3149
- ldip->di_nblocks);
3150
- error = -EFSCORRUPTED;
3151
- goto out_release;
3152
- }
3153
- if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
3154
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
3155
- XFS_ERRLEVEL_LOW, mp, ldip,
3156
- sizeof(*ldip));
3157
- xfs_alert(mp,
3158
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
3159
- "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
3160
- item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
3161
- error = -EFSCORRUPTED;
3162
- goto out_release;
3163
- }
3164
- isize = xfs_log_dinode_size(ldip->di_version);
3165
- if (unlikely(item->ri_buf[1].i_len > isize)) {
3166
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
3167
- XFS_ERRLEVEL_LOW, mp, ldip,
3168
- sizeof(*ldip));
3169
- xfs_alert(mp,
3170
- "%s: Bad inode log record length %d, rec ptr "PTR_FMT,
3171
- __func__, item->ri_buf[1].i_len, item);
3172
- error = -EFSCORRUPTED;
3173
- goto out_release;
3174
- }
3175
-
3176
- /* recover the log dinode inode into the on disk inode */
3177
- xfs_log_dinode_to_disk(ldip, dip);
3178
-
3179
- fields = in_f->ilf_fields;
3180
- if (fields & XFS_ILOG_DEV)
3181
- xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
3182
-
3183
- if (in_f->ilf_size == 2)
3184
- goto out_owner_change;
3185
- len = item->ri_buf[2].i_len;
3186
- src = item->ri_buf[2].i_addr;
3187
- ASSERT(in_f->ilf_size <= 4);
3188
- ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
3189
- ASSERT(!(fields & XFS_ILOG_DFORK) ||
3190
- (len == in_f->ilf_dsize));
3191
-
3192
- switch (fields & XFS_ILOG_DFORK) {
3193
- case XFS_ILOG_DDATA:
3194
- case XFS_ILOG_DEXT:
3195
- memcpy(XFS_DFORK_DPTR(dip), src, len);
3196
- break;
3197
-
3198
- case XFS_ILOG_DBROOT:
3199
- xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
3200
- (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
3201
- XFS_DFORK_DSIZE(dip, mp));
3202
- break;
3203
-
3204
- default:
3205
- /*
3206
- * There are no data fork flags set.
3207
- */
3208
- ASSERT((fields & XFS_ILOG_DFORK) == 0);
3209
- break;
3210
- }
3211
-
3212
- /*
3213
- * If we logged any attribute data, recover it. There may or
3214
- * may not have been any other non-core data logged in this
3215
- * transaction.
3216
- */
3217
- if (in_f->ilf_fields & XFS_ILOG_AFORK) {
3218
- if (in_f->ilf_fields & XFS_ILOG_DFORK) {
3219
- attr_index = 3;
3220
- } else {
3221
- attr_index = 2;
3222
- }
3223
- len = item->ri_buf[attr_index].i_len;
3224
- src = item->ri_buf[attr_index].i_addr;
3225
- ASSERT(len == in_f->ilf_asize);
3226
-
3227
- switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
3228
- case XFS_ILOG_ADATA:
3229
- case XFS_ILOG_AEXT:
3230
- dest = XFS_DFORK_APTR(dip);
3231
- ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
3232
- memcpy(dest, src, len);
3233
- break;
3234
-
3235
- case XFS_ILOG_ABROOT:
3236
- dest = XFS_DFORK_APTR(dip);
3237
- xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
3238
- len, (xfs_bmdr_block_t*)dest,
3239
- XFS_DFORK_ASIZE(dip, mp));
3240
- break;
3241
-
3242
- default:
3243
- xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
3244
- ASSERT(0);
3245
- error = -EIO;
3246
- goto out_release;
3247
- }
3248
- }
3249
-
3250
-out_owner_change:
3251
- /* Recover the swapext owner change unless inode has been deleted */
3252
- if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
3253
- (dip->di_mode != 0))
3254
- error = xfs_recover_inode_owner_change(mp, dip, in_f,
3255
- buffer_list);
3256
- /* re-generate the checksum. */
3257
- xfs_dinode_calc_crc(log->l_mp, dip);
3258
-
3259
- ASSERT(bp->b_target->bt_mount == mp);
3260
- bp->b_iodone = xlog_recover_iodone;
3261
- xfs_buf_delwri_queue(bp, buffer_list);
3262
-
3263
-out_release:
3264
- xfs_buf_relse(bp);
3265
-error:
3266
- if (need_free)
3267
- kmem_free(in_f);
3268
- return error;
3269
-}
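
For pre-CRC filesystems the skip decision above falls back to di_flushiter, a 16-bit counter that wraps at DI_MAX_FLUSH (0xffff). A worked sketch of the wrap-aware test, mirroring the two branches in the function:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DI_MAX_FLUSH	0xffff

/*
 * Skip replay when the on-disk flush counter is ahead of the logged one,
 * unless the on-disk counter sits at the wrap point while the logged one
 * has already wrapped around to a small value.
 */
static bool skip_replay(uint16_t log_flushiter, uint16_t disk_flushiter)
{
	if (log_flushiter >= disk_flushiter)
		return false;				/* log is newer */
	if (disk_flushiter == DI_MAX_FLUSH &&
	    log_flushiter < (DI_MAX_FLUSH >> 1))
		return false;				/* counter wrapped */
	return true;
}

int main(void)
{
	printf("%d\n", skip_replay(5, 9));	/* 1: disk ahead, skip  */
	printf("%d\n", skip_replay(3, 0xffff));	/* 0: wrap case, replay */
	return 0;
}
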
3270
-
3271
-/*
3272
- * Recover QUOTAOFF records. We simply make a note of it in the xlog
3273
- * structure, so that we know not to do any dquot item or dquot buffer
3274
- * recovery of that type.
3275
- */
3276
-STATIC int
3277
-xlog_recover_quotaoff_pass1(
3278
- struct xlog *log,
3279
- struct xlog_recover_item *item)
3280
-{
3281
- xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
3282
- ASSERT(qoff_f);
3283
-
3284
- /*
3285
- * The logitem format's flag tells us if this was user quotaoff,
3286
- * group/project quotaoff or both.
3287
- */
3288
- if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
3289
- log->l_quotaoffs_flag |= XFS_DQ_USER;
3290
- if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
3291
- log->l_quotaoffs_flag |= XFS_DQ_PROJ;
3292
- if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
3293
- log->l_quotaoffs_flag |= XFS_DQ_GROUP;
3294
-
3295
- return 0;
3296
-}
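
Quotaoff handling spans both passes: pass 1 above only accumulates which quota types were turned off, and the pass 2 dquot paths then mask their type bits against that record. A toy sketch of the filter, with invented flag values standing in for the XFS_DQ_* bits:

#include <stdio.h>

/* Invented stand-ins for the XFS_DQ_USER/PROJ/GROUP type bits. */
#define DEMO_DQ_USER	0x1
#define DEMO_DQ_PROJ	0x2
#define DEMO_DQ_GROUP	0x4

int main(void)
{
	unsigned int quotaoffs_flag = 0;
	unsigned int type;

	/* Pass 1: a quotaoff record covering user + group quotas. */
	quotaoffs_flag |= DEMO_DQ_USER | DEMO_DQ_GROUP;

	/* Pass 2: a group dquot record arrives and is filtered out. */
	type = DEMO_DQ_GROUP;
	if (quotaoffs_flag & type)
		puts("quota type was turned off: ignore this record");
	else
		puts("replay this dquot record");
	return 0;
}
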
3297
-
3298
-/*
3299
- * Recover a dquot record
3300
- */
3301
-STATIC int
3302
-xlog_recover_dquot_pass2(
3303
- struct xlog *log,
3304
- struct list_head *buffer_list,
3305
- struct xlog_recover_item *item,
3306
- xfs_lsn_t current_lsn)
3307
-{
3308
- xfs_mount_t *mp = log->l_mp;
3309
- xfs_buf_t *bp;
3310
- struct xfs_disk_dquot *ddq, *recddq;
3311
- xfs_failaddr_t fa;
3312
- int error;
3313
- xfs_dq_logformat_t *dq_f;
3314
- uint type;
3315
-
3316
-
3317
- /*
3318
- * Filesystems are required to send in quota flags at mount time.
3319
- */
3320
- if (mp->m_qflags == 0)
3321
- return 0;
3322
-
3323
- recddq = item->ri_buf[1].i_addr;
3324
- if (recddq == NULL) {
3325
- xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
3326
- return -EIO;
3327
- }
3328
- if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
3329
- xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
3330
- item->ri_buf[1].i_len, __func__);
3331
- return -EIO;
3332
- }
3333
-
3334
- /*
3335
- * This type of quota was turned off, so ignore this record.
3336
- */
3337
- type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3338
- ASSERT(type);
3339
- if (log->l_quotaoffs_flag & type)
3340
- return 0;
3341
-
3342
- /*
3343
- * At this point we know that quota was _not_ turned off.
3344
- * Since the mount flags are not indicating to us otherwise, this
3345
- * must mean that quota is on, and the dquot needs to be replayed.
3346
- * Remember that we may not have fully recovered the superblock yet,
3347
- * so we can't do the usual trick of looking at the SB quota bits.
3348
- *
3349
- * The other possibility, of course, is that the quota subsystem was
3350
- * removed since the last mount - ENOSYS.
3351
- */
3352
- dq_f = item->ri_buf[0].i_addr;
3353
- ASSERT(dq_f);
3354
- fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0);
3355
- if (fa) {
3356
- xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
3357
- dq_f->qlf_id, fa);
3358
- return -EIO;
3359
- }
3360
- ASSERT(dq_f->qlf_len == 1);
3361
-
3362
- /*
3363
- * At this point we are assuming that the dquots have been allocated
3364
- * and hence the buffer has valid dquots stamped in it. It should,
3365
- * therefore, pass verifier validation. If the dquot is bad, then the
3366
- * we'll return an error here, so we don't need to specifically check
3367
- * the dquot in the buffer after the verifier has run.
3368
- */
3369
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
3370
- XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
3371
- &xfs_dquot_buf_ops);
3372
- if (error)
3373
- return error;
3374
-
3375
- ASSERT(bp);
3376
- ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
3377
-
3378
- /*
3379
- * If the dquot has an LSN in it, recover the dquot only if it's less
3380
- * than the lsn of the transaction we are replaying.
3381
- */
3382
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
3383
- struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
3384
- xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
3385
-
3386
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3387
- goto out_release;
3388
- }
3389
- }
3390
-
3391
- memcpy(ddq, recddq, item->ri_buf[1].i_len);
3392
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
3393
- xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
3394
- XFS_DQUOT_CRC_OFF);
3395
- }
3396
-
3397
- ASSERT(dq_f->qlf_size == 2);
3398
- ASSERT(bp->b_target->bt_mount == mp);
3399
- bp->b_iodone = xlog_recover_iodone;
3400
- xfs_buf_delwri_queue(bp, buffer_list);
3401
-
3402
-out_release:
3403
- xfs_buf_relse(bp);
3404
- return 0;
3405
-}
3406
-
3407
-/*
3408
- * This routine is called to create an in-core extent free intent
3409
- * item from the efi format structure which was logged on disk.
3410
- * It allocates an in-core efi, copies the extents from the format
3411
- * structure into it, and adds the efi to the AIL with the given
3412
- * LSN.
3413
- */
3414
-STATIC int
3415
-xlog_recover_efi_pass2(
3416
- struct xlog *log,
3417
- struct xlog_recover_item *item,
3418
- xfs_lsn_t lsn)
3419
-{
3420
- int error;
3421
- struct xfs_mount *mp = log->l_mp;
3422
- struct xfs_efi_log_item *efip;
3423
- struct xfs_efi_log_format *efi_formatp;
3424
-
3425
- efi_formatp = item->ri_buf[0].i_addr;
3426
-
3427
- efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
3428
- error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
3429
- if (error) {
3430
- xfs_efi_item_free(efip);
3431
- return error;
3432
- }
3433
- atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
3434
-
3435
- spin_lock(&log->l_ailp->ail_lock);
3436
- /*
3437
- * The EFI has two references. One for the EFD and one for EFI to ensure
3438
- * it makes it into the AIL. Insert the EFI into the AIL directly and
3439
- * drop the EFI reference. Note that xfs_trans_ail_update() drops the
3440
- * AIL lock.
3441
- */
3442
- xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
3443
- xfs_efi_release(efip);
3444
- return 0;
3445
-}
3446
-
3447
-
3448
-/*
3449
- * This routine is called when an EFD format structure is found in a committed
3450
- * transaction in the log. Its purpose is to cancel the corresponding EFI if it
3451
- * was still in the log. To do this it searches the AIL for the EFI with an id
3452
- * equal to that in the EFD format structure. If we find it we drop the EFD
3453
- * reference, which removes the EFI from the AIL and frees it.
3454
- */
3455
-STATIC int
3456
-xlog_recover_efd_pass2(
3457
- struct xlog *log,
3458
- struct xlog_recover_item *item)
3459
-{
3460
- xfs_efd_log_format_t *efd_formatp;
3461
- xfs_efi_log_item_t *efip = NULL;
3462
- xfs_log_item_t *lip;
3463
- uint64_t efi_id;
3464
- struct xfs_ail_cursor cur;
3465
- struct xfs_ail *ailp = log->l_ailp;
3466
-
3467
- efd_formatp = item->ri_buf[0].i_addr;
3468
- ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
3469
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
3470
- (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
3471
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
3472
- efi_id = efd_formatp->efd_efi_id;
3473
-
3474
- /*
3475
- * Search for the EFI with the id in the EFD format structure in the
3476
- * AIL.
3477
- */
3478
- spin_lock(&ailp->ail_lock);
3479
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3480
- while (lip != NULL) {
3481
- if (lip->li_type == XFS_LI_EFI) {
3482
- efip = (xfs_efi_log_item_t *)lip;
3483
- if (efip->efi_format.efi_id == efi_id) {
3484
- /*
3485
- * Drop the EFD reference to the EFI. This
3486
- * removes the EFI from the AIL and frees it.
3487
- */
3488
- spin_unlock(&ailp->ail_lock);
3489
- xfs_efi_release(efip);
3490
- spin_lock(&ailp->ail_lock);
3491
- break;
3492
- }
3493
- }
3494
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
3495
- }
3496
-
3497
- xfs_trans_ail_cursor_done(&cur);
3498
- spin_unlock(&ailp->ail_lock);
3499
-
3500
- return 0;
3501
-}
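
The EFD handler above is one instance of a pattern repeated below for RUI/RUD, CUI/CUD and BUI/BUD: walk the AIL looking for the intent item whose id matches the done item, then release it. A userspace list-walk sketch of that cancel step, with the AIL lock and cursor machinery left out:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct intent {
	uint64_t id;
	struct intent *next;
};

/*
 * Unlink and free the pending intent matching 'done_id'; the analogue
 * of dropping the EFD reference, which removes the EFI and frees it.
 */
static void cancel_intent(struct intent **head, uint64_t done_id)
{
	for (struct intent **pp = head; *pp; pp = &(*pp)->next) {
		if ((*pp)->id == done_id) {
			struct intent *found = *pp;

			*pp = found->next;
			free(found);
			return;
		}
	}
}

int main(void)
{
	struct intent *head = calloc(1, sizeof(*head));

	head->id = 42;
	cancel_intent(&head, 42);	/* intent 42 is cancelled */
	printf("pending list is %s\n", head ? "non-empty" : "empty");
	return 0;
}
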
3502
-
3503
-/*
3504
- * This routine is called to create an in-core extent rmap update
3505
- * item from the rui format structure which was logged on disk.
3506
- * It allocates an in-core rui, copies the extents from the format
3507
- * structure into it, and adds the rui to the AIL with the given
3508
- * LSN.
3509
- */
3510
-STATIC int
3511
-xlog_recover_rui_pass2(
3512
- struct xlog *log,
3513
- struct xlog_recover_item *item,
3514
- xfs_lsn_t lsn)
3515
-{
3516
- int error;
3517
- struct xfs_mount *mp = log->l_mp;
3518
- struct xfs_rui_log_item *ruip;
3519
- struct xfs_rui_log_format *rui_formatp;
3520
-
3521
- rui_formatp = item->ri_buf[0].i_addr;
3522
-
3523
- ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
3524
- error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
3525
- if (error) {
3526
- xfs_rui_item_free(ruip);
3527
- return error;
3528
- }
3529
- atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
3530
-
3531
- spin_lock(&log->l_ailp->ail_lock);
3532
- /*
3533
- * The RUI has two references. One for the RUD and one for RUI to ensure
3534
- * it makes it into the AIL. Insert the RUI into the AIL directly and
3535
- * drop the RUI reference. Note that xfs_trans_ail_update() drops the
3536
- * AIL lock.
3537
- */
3538
- xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn);
3539
- xfs_rui_release(ruip);
3540
- return 0;
3541
-}
3542
-
3543
-
3544
-/*
3545
- * This routine is called when an RUD format structure is found in a committed
3546
- * transaction in the log. Its purpose is to cancel the corresponding RUI if it
3547
- * was still in the log. To do this it searches the AIL for the RUI with an id
3548
- * equal to that in the RUD format structure. If we find it we drop the RUD
3549
- * reference, which removes the RUI from the AIL and frees it.
3550
- */
3551
-STATIC int
3552
-xlog_recover_rud_pass2(
3553
- struct xlog *log,
3554
- struct xlog_recover_item *item)
3555
-{
3556
- struct xfs_rud_log_format *rud_formatp;
3557
- struct xfs_rui_log_item *ruip = NULL;
3558
- struct xfs_log_item *lip;
3559
- uint64_t rui_id;
3560
- struct xfs_ail_cursor cur;
3561
- struct xfs_ail *ailp = log->l_ailp;
3562
-
3563
- rud_formatp = item->ri_buf[0].i_addr;
3564
- ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
3565
- rui_id = rud_formatp->rud_rui_id;
3566
-
3567
- /*
3568
- * Search for the RUI with the id in the RUD format structure in the
3569
- * AIL.
3570
- */
3571
- spin_lock(&ailp->ail_lock);
3572
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3573
- while (lip != NULL) {
3574
- if (lip->li_type == XFS_LI_RUI) {
3575
- ruip = (struct xfs_rui_log_item *)lip;
3576
- if (ruip->rui_format.rui_id == rui_id) {
3577
- /*
3578
- * Drop the RUD reference to the RUI. This
3579
- * removes the RUI from the AIL and frees it.
3580
- */
3581
- spin_unlock(&ailp->ail_lock);
3582
- xfs_rui_release(ruip);
3583
- spin_lock(&ailp->ail_lock);
3584
- break;
3585
- }
3586
- }
3587
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
3588
- }
3589
-
3590
- xfs_trans_ail_cursor_done(&cur);
3591
- spin_unlock(&ailp->ail_lock);
3592
-
3593
- return 0;
3594
-}
3595
-
3596
-/*
3597
- * Copy a CUI format buffer from the given buf into the destination
3598
- * CUI format structure. The CUI/CUD items were designed not to need any
3599
- * special alignment handling.
3600
- */
3601
-static int
3602
-xfs_cui_copy_format(
3603
- struct xfs_log_iovec *buf,
3604
- struct xfs_cui_log_format *dst_cui_fmt)
3605
-{
3606
- struct xfs_cui_log_format *src_cui_fmt;
3607
- uint len;
3608
-
3609
- src_cui_fmt = buf->i_addr;
3610
- len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
3611
-
3612
- if (buf->i_len == len) {
3613
- memcpy(dst_cui_fmt, src_cui_fmt, len);
3614
- return 0;
3615
- }
3616
- return -EFSCORRUPTED;
3617
-}
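
The copy helper above accepts the buffer only when its length exactly matches the size implied by the logged extent count. A sketch of that flexible-array length check, built around a made-up log format struct:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_log_format {
	uint32_t nextents;
	uint64_t extents[];		/* one entry per logged extent */
};

#define demo_format_sizeof(nr) \
	(sizeof(struct demo_log_format) + (nr) * sizeof(uint64_t))

/* Copy only when the buffer length matches the computed format size. */
static int demo_copy_format(const void *buf, size_t buf_len,
			    struct demo_log_format *dst)
{
	const struct demo_log_format *src = buf;

	if (buf_len != demo_format_sizeof(src->nextents))
		return -1;		/* corrupt: length mismatch */
	memcpy(dst, src, buf_len);
	return 0;
}

int main(void)
{
	size_t len = demo_format_sizeof(2);
	struct demo_log_format *src = calloc(1, len);
	struct demo_log_format *dst = calloc(1, len);

	src->nextents = 2;
	printf("%d\n", demo_copy_format(src, len, dst));	/* 0: ok   */
	printf("%d\n", demo_copy_format(src, len - 1, dst));	/* -1: bad */
	free(src);
	free(dst);
	return 0;
}
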
3618
-
3619
-/*
3620
- * This routine is called to create an in-core extent refcount update
3621
- * item from the cui format structure which was logged on disk.
3622
- * It allocates an in-core cui, copies the extents from the format
3623
- * structure into it, and adds the cui to the AIL with the given
3624
- * LSN.
3625
- */
3626
-STATIC int
3627
-xlog_recover_cui_pass2(
3628
- struct xlog *log,
3629
- struct xlog_recover_item *item,
3630
- xfs_lsn_t lsn)
3631
-{
3632
- int error;
3633
- struct xfs_mount *mp = log->l_mp;
3634
- struct xfs_cui_log_item *cuip;
3635
- struct xfs_cui_log_format *cui_formatp;
3636
-
3637
- cui_formatp = item->ri_buf[0].i_addr;
3638
-
3639
- cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
3640
- error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
3641
- if (error) {
3642
- xfs_cui_item_free(cuip);
3643
- return error;
3644
- }
3645
- atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
3646
-
3647
- spin_lock(&log->l_ailp->ail_lock);
3648
- /*
3649
- * The CUI has two references. One for the CUD and one for CUI to ensure
3650
- * it makes it into the AIL. Insert the CUI into the AIL directly and
3651
- * drop the CUI reference. Note that xfs_trans_ail_update() drops the
3652
- * AIL lock.
3653
- */
3654
- xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn);
3655
- xfs_cui_release(cuip);
3656
- return 0;
3657
-}
3658
-
3659
-
3660
-/*
3661
- * This routine is called when a CUD format structure is found in a committed
3662
- * transaction in the log. Its purpose is to cancel the corresponding CUI if it
3663
- * was still in the log. To do this it searches the AIL for the CUI with an id
3664
- * equal to that in the CUD format structure. If we find it we drop the CUD
3665
- * reference, which removes the CUI from the AIL and frees it.
3666
- */
3667
-STATIC int
3668
-xlog_recover_cud_pass2(
3669
- struct xlog *log,
3670
- struct xlog_recover_item *item)
3671
-{
3672
- struct xfs_cud_log_format *cud_formatp;
3673
- struct xfs_cui_log_item *cuip = NULL;
3674
- struct xfs_log_item *lip;
3675
- uint64_t cui_id;
3676
- struct xfs_ail_cursor cur;
3677
- struct xfs_ail *ailp = log->l_ailp;
3678
-
3679
- cud_formatp = item->ri_buf[0].i_addr;
3680
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format))
3681
- return -EFSCORRUPTED;
3682
- cui_id = cud_formatp->cud_cui_id;
3683
-
3684
- /*
3685
- * Search for the CUI with the id in the CUD format structure in the
3686
- * AIL.
3687
- */
3688
- spin_lock(&ailp->ail_lock);
3689
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3690
- while (lip != NULL) {
3691
- if (lip->li_type == XFS_LI_CUI) {
3692
- cuip = (struct xfs_cui_log_item *)lip;
3693
- if (cuip->cui_format.cui_id == cui_id) {
3694
- /*
3695
- * Drop the CUD reference to the CUI. This
3696
- * removes the CUI from the AIL and frees it.
3697
- */
3698
- spin_unlock(&ailp->ail_lock);
3699
- xfs_cui_release(cuip);
3700
- spin_lock(&ailp->ail_lock);
3701
- break;
3702
- }
3703
- }
3704
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
3705
- }
3706
-
3707
- xfs_trans_ail_cursor_done(&cur);
3708
- spin_unlock(&ailp->ail_lock);
3709
-
3710
- return 0;
3711
-}
3712
-
3713
-/*
3714
- * Copy a BUI format buffer from the given buf into the destination
3715
- * BUI format structure. The BUI/BUD items were designed not to need any
3716
- * special alignment handling.
3717
- */
3718
-static int
3719
-xfs_bui_copy_format(
3720
- struct xfs_log_iovec *buf,
3721
- struct xfs_bui_log_format *dst_bui_fmt)
3722
-{
3723
- struct xfs_bui_log_format *src_bui_fmt;
3724
- uint len;
3725
-
3726
- src_bui_fmt = buf->i_addr;
3727
- len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
3728
-
3729
- if (buf->i_len == len) {
3730
- memcpy(dst_bui_fmt, src_bui_fmt, len);
3731
- return 0;
3732
- }
3733
- return -EFSCORRUPTED;
3734
-}
3735
-
3736
-/*
3737
- * This routine is called to create an in-core extent bmap update
3738
- * item from the bui format structure which was logged on disk.
3739
- * It allocates an in-core bui, copies the extents from the format
3740
- * structure into it, and adds the bui to the AIL with the given
3741
- * LSN.
3742
- */
3743
-STATIC int
3744
-xlog_recover_bui_pass2(
3745
- struct xlog *log,
3746
- struct xlog_recover_item *item,
3747
- xfs_lsn_t lsn)
3748
-{
3749
- int error;
3750
- struct xfs_mount *mp = log->l_mp;
3751
- struct xfs_bui_log_item *buip;
3752
- struct xfs_bui_log_format *bui_formatp;
3753
-
3754
- bui_formatp = item->ri_buf[0].i_addr;
3755
-
3756
- if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
3757
- return -EFSCORRUPTED;
3758
- buip = xfs_bui_init(mp);
3759
- error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
3760
- if (error) {
3761
- xfs_bui_item_free(buip);
3762
- return error;
3763
- }
3764
- atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
3765
-
3766
- spin_lock(&log->l_ailp->ail_lock);
3767
- /*
3768
- * The BUI has two references. One for the BUD and one for the BUI to
3769
- * ensure it makes it into the AIL. Insert the BUI into the AIL directly
3770
- * and drop the BUI reference. Note that xfs_trans_ail_update() drops
3771
- * the AIL lock.
3772
- */
3773
- xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
3774
- xfs_bui_release(buip);
3775
- return 0;
3776
-}
3777
-
3778
-
3779
-/*
3780
- * This routine is called when a BUD format structure is found in a committed
3781
- * transaction in the log. Its purpose is to cancel the corresponding BUI if it
3782
- * was still in the log. To do this it searches the AIL for the BUI with an id
3783
- * equal to that in the BUD format structure. If we find it we drop the BUD
3784
- * reference, which removes the BUI from the AIL and frees it.
3785
- */
3786
-STATIC int
3787
-xlog_recover_bud_pass2(
3788
- struct xlog *log,
3789
- struct xlog_recover_item *item)
3790
-{
3791
- struct xfs_bud_log_format *bud_formatp;
3792
- struct xfs_bui_log_item *buip = NULL;
3793
- struct xfs_log_item *lip;
3794
- uint64_t bui_id;
3795
- struct xfs_ail_cursor cur;
3796
- struct xfs_ail *ailp = log->l_ailp;
3797
-
3798
- bud_formatp = item->ri_buf[0].i_addr;
3799
- if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format))
3800
- return -EFSCORRUPTED;
3801
- bui_id = bud_formatp->bud_bui_id;
3802
-
3803
- /*
3804
- * Search for the BUI with the id in the BUD format structure in the
3805
- * AIL.
3806
- */
3807
- spin_lock(&ailp->ail_lock);
3808
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3809
- while (lip != NULL) {
3810
- if (lip->li_type == XFS_LI_BUI) {
3811
- buip = (struct xfs_bui_log_item *)lip;
3812
- if (buip->bui_format.bui_id == bui_id) {
3813
- /*
3814
- * Drop the BUD reference to the BUI. This
3815
- * removes the BUI from the AIL and frees it.
3816
- */
3817
- spin_unlock(&ailp->ail_lock);
3818
- xfs_bui_release(buip);
3819
- spin_lock(&ailp->ail_lock);
3820
- break;
3821
- }
3822
- }
3823
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
3824
- }
3825
-
3826
- xfs_trans_ail_cursor_done(&cur);
3827
- spin_unlock(&ailp->ail_lock);
3828
-
3829
- return 0;
3830
-}
3831
-
3832
-/*
3833
- * This routine is called when an inode create format structure is found in a
3834
- * committed transaction in the log. Its purpose is to initialise the inodes
3835
- * being allocated on disk. This requires us to get inode cluster buffers that
3836
- * match the range to be initialised, stamped with inode templates and written
3837
- * by delayed write so that subsequent modifications will hit the cached buffer
3838
- * and only need writing out at the end of recovery.
3839
- */
3840
-STATIC int
3841
-xlog_recover_do_icreate_pass2(
3842
- struct xlog *log,
3843
- struct list_head *buffer_list,
3844
- xlog_recover_item_t *item)
3845
-{
3846
- struct xfs_mount *mp = log->l_mp;
3847
- struct xfs_icreate_log *icl;
3848
- xfs_agnumber_t agno;
3849
- xfs_agblock_t agbno;
3850
- unsigned int count;
3851
- unsigned int isize;
3852
- xfs_agblock_t length;
3853
- int blks_per_cluster;
3854
- int bb_per_cluster;
3855
- int cancel_count;
3856
- int nbufs;
3857
- int i;
3858
-
3859
- icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3860
- if (icl->icl_type != XFS_LI_ICREATE) {
3861
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3862
- return -EINVAL;
3863
- }
3864
-
3865
- if (icl->icl_size != 1) {
3866
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3867
- return -EINVAL;
3868
- }
3869
-
3870
- agno = be32_to_cpu(icl->icl_ag);
3871
- if (agno >= mp->m_sb.sb_agcount) {
3872
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3873
- return -EINVAL;
3874
- }
3875
- agbno = be32_to_cpu(icl->icl_agbno);
3876
- if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3877
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3878
- return -EINVAL;
3879
- }
3880
- isize = be32_to_cpu(icl->icl_isize);
3881
- if (isize != mp->m_sb.sb_inodesize) {
3882
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3883
- return -EINVAL;
3884
- }
3885
- count = be32_to_cpu(icl->icl_count);
3886
- if (!count) {
3887
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3888
- return -EINVAL;
3889
- }
3890
- length = be32_to_cpu(icl->icl_length);
3891
- if (!length || length >= mp->m_sb.sb_agblocks) {
3892
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3893
- return -EINVAL;
3894
- }
3895
-
3896
- /*
3897
- * The inode chunk is either full or sparse and we only support
3898
- * m_ialloc_min_blks sized sparse allocations at this time.
3899
- */
3900
- if (length != mp->m_ialloc_blks &&
3901
- length != mp->m_ialloc_min_blks) {
3902
- xfs_warn(log->l_mp,
3903
- "%s: unsupported chunk length", __FUNCTION__);
3904
- return -EINVAL;
3905
- }
3906
-
3907
- /* verify inode count is consistent with extent length */
3908
- if ((count >> mp->m_sb.sb_inopblog) != length) {
3909
- xfs_warn(log->l_mp,
3910
- "%s: inconsistent inode count and chunk length",
3911
- __FUNCTION__);
3912
- return -EINVAL;
3913
- }
3914
-
3915
- /*
3916
- * The icreate transaction can cover multiple cluster buffers and these
3917
- * buffers could have been freed and reused. Check the individual
3918
- * buffers for cancellation so we don't overwrite anything written after
3919
- * a cancellation.
3920
- */
3921
- blks_per_cluster = xfs_icluster_size_fsb(mp);
3922
- bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
3923
- nbufs = length / blks_per_cluster;
3924
- for (i = 0, cancel_count = 0; i < nbufs; i++) {
3925
- xfs_daddr_t daddr;
3926
-
3927
- daddr = XFS_AGB_TO_DADDR(mp, agno,
3928
- agbno + i * blks_per_cluster);
3929
- if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
3930
- cancel_count++;
3931
- }
3932
-
3933
- /*
3934
- * We currently only use icreate for a single allocation at a time. This
3935
- * means we should expect either all or none of the buffers to be
3936
- * cancelled. Be conservative and skip replay if at least one buffer is
3937
- * cancelled, but warn the user that something is awry if the buffers
3938
- * are not consistent.
3939
- *
3940
- * XXX: This must be refined to only skip cancelled clusters once we use
3941
- * icreate for multiple chunk allocations.
3942
- */
3943
- ASSERT(!cancel_count || cancel_count == nbufs);
3944
- if (cancel_count) {
3945
- if (cancel_count != nbufs)
3946
- xfs_warn(mp,
3947
- "WARNING: partial inode chunk cancellation, skipped icreate.");
3948
- trace_xfs_log_recover_icreate_cancel(log, icl);
3949
- return 0;
3950
- }
3951
-
3952
- trace_xfs_log_recover_icreate_recover(log, icl);
3953
- return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
3954
- length, be32_to_cpu(icl->icl_gen));
3955
-}
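
Among the sanity checks above, the inode count must agree with the chunk length: count shifted down by sb_inopblog (log2 of inodes per block) must equal the length in filesystem blocks. A worked instance, assuming 512-byte inodes in 4096-byte blocks, i.e. eight inodes per block and an inopblog of 3:

#include <stdio.h>

int main(void)
{
	unsigned int inopblog = 3;	/* log2(8 inodes per 4k block) */
	unsigned int count = 64;	/* inodes in the icreate record */
	unsigned int length = 8;	/* chunk length in fs blocks    */

	/* 64 inodes / 8 per block == 8 blocks: consistent. */
	if ((count >> inopblog) != length)
		puts("inconsistent inode count and chunk length");
	else
		puts("icreate record is self-consistent");
	return 0;
}
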
3956
-
3957
-STATIC void
3958
-xlog_recover_buffer_ra_pass2(
3959
- struct xlog *log,
3960
- struct xlog_recover_item *item)
3961
-{
3962
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
3963
- struct xfs_mount *mp = log->l_mp;
3964
-
3965
- if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
3966
- buf_f->blf_len, buf_f->blf_flags)) {
3967
- return;
3968
- }
3969
-
3970
- xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
3971
- buf_f->blf_len, NULL);
3972
-}
3973
-
3974
-STATIC void
3975
-xlog_recover_inode_ra_pass2(
3976
- struct xlog *log,
3977
- struct xlog_recover_item *item)
3978
-{
3979
- struct xfs_inode_log_format ilf_buf;
3980
- struct xfs_inode_log_format *ilfp;
3981
- struct xfs_mount *mp = log->l_mp;
3982
- int error;
3983
-
3984
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3985
- ilfp = item->ri_buf[0].i_addr;
3986
- } else {
3987
- ilfp = &ilf_buf;
3988
- memset(ilfp, 0, sizeof(*ilfp));
3989
- error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
3990
- if (error)
3991
- return;
3992
- }
3993
-
3994
- if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
3995
- return;
3996
-
3997
- xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
3998
- ilfp->ilf_len, &xfs_inode_buf_ra_ops);
3999
-}
4000
-
4001
-STATIC void
4002
-xlog_recover_dquot_ra_pass2(
4003
- struct xlog *log,
4004
- struct xlog_recover_item *item)
4005
-{
4006
- struct xfs_mount *mp = log->l_mp;
4007
- struct xfs_disk_dquot *recddq;
4008
- struct xfs_dq_logformat *dq_f;
4009
- uint type;
4010
- int len;
4011
-
4012
-
4013
- if (mp->m_qflags == 0)
4014
- return;
4015
-
4016
- recddq = item->ri_buf[1].i_addr;
4017
- if (recddq == NULL)
4018
- return;
4019
- if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
4020
- return;
4021
-
4022
- type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
4023
- ASSERT(type);
4024
- if (log->l_quotaoffs_flag & type)
4025
- return;
4026
-
4027
- dq_f = item->ri_buf[0].i_addr;
4028
- ASSERT(dq_f);
4029
- ASSERT(dq_f->qlf_len == 1);
4030
-
4031
- len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
4032
- if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
4033
- return;
4034
-
4035
- xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
4036
- &xfs_dquot_buf_ra_ops);
4037
-}
4038
-
4039
-STATIC void
4040
-xlog_recover_ra_pass2(
4041
- struct xlog *log,
4042
- struct xlog_recover_item *item)
4043
-{
4044
- switch (ITEM_TYPE(item)) {
4045
- case XFS_LI_BUF:
4046
- xlog_recover_buffer_ra_pass2(log, item);
4047
- break;
4048
- case XFS_LI_INODE:
4049
- xlog_recover_inode_ra_pass2(log, item);
4050
- break;
4051
- case XFS_LI_DQUOT:
4052
- xlog_recover_dquot_ra_pass2(log, item);
4053
- break;
4054
- case XFS_LI_EFI:
4055
- case XFS_LI_EFD:
4056
- case XFS_LI_QUOTAOFF:
4057
- case XFS_LI_RUI:
4058
- case XFS_LI_RUD:
4059
- case XFS_LI_CUI:
4060
- case XFS_LI_CUD:
4061
- case XFS_LI_BUI:
4062
- case XFS_LI_BUD:
4063
- default:
4064
- break;
4065
- }
4066
-}
4067
-
4068
-STATIC int
4069
-xlog_recover_commit_pass1(
4070
- struct xlog *log,
4071
- struct xlog_recover *trans,
4072
- struct xlog_recover_item *item)
4073
-{
4074
- trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
4075
-
4076
- switch (ITEM_TYPE(item)) {
4077
- case XFS_LI_BUF:
4078
- return xlog_recover_buffer_pass1(log, item);
4079
- case XFS_LI_QUOTAOFF:
4080
- return xlog_recover_quotaoff_pass1(log, item);
4081
- case XFS_LI_INODE:
4082
- case XFS_LI_EFI:
4083
- case XFS_LI_EFD:
4084
- case XFS_LI_DQUOT:
4085
- case XFS_LI_ICREATE:
4086
- case XFS_LI_RUI:
4087
- case XFS_LI_RUD:
4088
- case XFS_LI_CUI:
4089
- case XFS_LI_CUD:
4090
- case XFS_LI_BUI:
4091
- case XFS_LI_BUD:
4092
- /* nothing to do in pass 1 */
4093
- return 0;
4094
- default:
4095
- xfs_warn(log->l_mp, "%s: invalid item type (%d)",
4096
- __func__, ITEM_TYPE(item));
4097
- ASSERT(0);
4098
- return -EIO;
4099
- }
4100
-}
4101
-
4102
-STATIC int
4103
-xlog_recover_commit_pass2(
4104
- struct xlog *log,
4105
- struct xlog_recover *trans,
4106
- struct list_head *buffer_list,
4107
- struct xlog_recover_item *item)
4108
-{
4109
- trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
4110
-
4111
- switch (ITEM_TYPE(item)) {
4112
- case XFS_LI_BUF:
4113
- return xlog_recover_buffer_pass2(log, buffer_list, item,
4114
- trans->r_lsn);
4115
- case XFS_LI_INODE:
4116
- return xlog_recover_inode_pass2(log, buffer_list, item,
4117
- trans->r_lsn);
4118
- case XFS_LI_EFI:
4119
- return xlog_recover_efi_pass2(log, item, trans->r_lsn);
4120
- case XFS_LI_EFD:
4121
- return xlog_recover_efd_pass2(log, item);
4122
- case XFS_LI_RUI:
4123
- return xlog_recover_rui_pass2(log, item, trans->r_lsn);
4124
- case XFS_LI_RUD:
4125
- return xlog_recover_rud_pass2(log, item);
4126
- case XFS_LI_CUI:
4127
- return xlog_recover_cui_pass2(log, item, trans->r_lsn);
4128
- case XFS_LI_CUD:
4129
- return xlog_recover_cud_pass2(log, item);
4130
- case XFS_LI_BUI:
4131
- return xlog_recover_bui_pass2(log, item, trans->r_lsn);
4132
- case XFS_LI_BUD:
4133
- return xlog_recover_bud_pass2(log, item);
4134
- case XFS_LI_DQUOT:
4135
- return xlog_recover_dquot_pass2(log, buffer_list, item,
4136
- trans->r_lsn);
4137
- case XFS_LI_ICREATE:
4138
- return xlog_recover_do_icreate_pass2(log, buffer_list, item);
4139
- case XFS_LI_QUOTAOFF:
4140
- /* nothing to do in pass2 */
4141
- return 0;
4142
- default:
4143
- xfs_warn(log->l_mp, "%s: invalid item type (%d)",
4144
- __func__, ITEM_TYPE(item));
4145
- ASSERT(0);
4146
- return -EIO;
4147
- }
1918
+ if (!xlog_is_buffer_cancelled(log, blkno, len))
1919
+ xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
41481920 }
41491921
41501922 STATIC int
....@@ -4158,8 +1930,12 @@
41581930 int error = 0;
41591931
41601932 list_for_each_entry(item, item_list, ri_list) {
4161
- error = xlog_recover_commit_pass2(log, trans,
4162
- buffer_list, item);
1933
+ trace_xfs_log_recover_item_recover(log, trans, item,
1934
+ XLOG_RECOVER_PASS2);
1935
+
1936
+ if (item->ri_ops->commit_pass2)
1937
+ error = item->ri_ops->commit_pass2(log, buffer_list,
1938
+ item, trans->r_lsn);
41631939 if (error)
41641940 return error;
41651941 }
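
The replacement loop dispatches through a per-item-type ri_ops vector rather than the large switches deleted above; a NULL hook simply means the item type has nothing to do in that pass. A sketch of that table-driven shape, with invented item types and ops:

#include <stdio.h>

struct demo_item;

/* Per-item-type operations, in the spirit of the new ri_ops vector. */
struct demo_item_ops {
	int  (*commit_pass1)(struct demo_item *item);
	int  (*commit_pass2)(struct demo_item *item);
	void (*ra_pass2)(struct demo_item *item);
};

struct demo_item {
	const struct demo_item_ops *ri_ops;
	const char *name;
};

static int demo_buf_commit_pass2(struct demo_item *item)
{
	printf("pass 2 replay of %s item\n", item->name);
	return 0;
}

static const struct demo_item_ops demo_buf_item_ops = {
	.commit_pass2 = demo_buf_commit_pass2,	/* no pass 1 work */
};

int main(void)
{
	struct demo_item item = {
		.ri_ops = &demo_buf_item_ops,
		.name = "buf",
	};
	int error = 0;

	if (item.ri_ops->commit_pass1)
		error = item.ri_ops->commit_pass1(&item);
	if (!error && item.ri_ops->commit_pass2)
		error = item.ri_ops->commit_pass2(&item);
	return error;
}
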
....@@ -4196,12 +1972,16 @@
41961972 return error;
41971973
41981974 list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
1975
+ trace_xfs_log_recover_item_recover(log, trans, item, pass);
1976
+
41991977 switch (pass) {
42001978 case XLOG_RECOVER_PASS1:
4201
- error = xlog_recover_commit_pass1(log, trans, item);
1979
+ if (item->ri_ops->commit_pass1)
1980
+ error = item->ri_ops->commit_pass1(log, item);
42021981 break;
42031982 case XLOG_RECOVER_PASS2:
4204
- xlog_recover_ra_pass2(log, item);
1983
+ if (item->ri_ops->ra_pass2)
1984
+ item->ri_ops->ra_pass2(log, item);
42051985 list_move_tail(&item->ri_list, &ra_list);
42061986 items_queued++;
42071987 if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
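The two hunks above replace the per-type switch dispatch with an operations vector: each log item type now supplies ri_ops callbacks (commit_pass1, ra_pass2, commit_pass2) and the core loops merely check for NULL before calling. A minimal standalone sketch of that pattern, with simplified types (only the ri_ops name mirrors the kernel):

#include <stddef.h>
#include <stdio.h>

struct item;				/* stand-in for xlog_recover_item */

struct item_ops {
	/* A NULL slot means "nothing to do in this pass". */
	int (*commit_pass1)(struct item *ip);
	int (*commit_pass2)(struct item *ip);
};

struct item {
	const struct item_ops	*ri_ops;
	const char		*name;
};

static int buf_pass1(struct item *ip)
{
	printf("pass1 work for %s\n", ip->name);
	return 0;
}

static const struct item_ops buf_ops = { .commit_pass1 = buf_pass1 };
static const struct item_ops inode_ops = { 0 };	/* no pass 1 work */

int main(void)
{
	struct item items[] = { { &buf_ops, "buf" }, { &inode_ops, "inode" } };
	int error = 0;

	for (size_t i = 0; i < 2 && !error; i++) {
		/* Replaces the old switch (ITEM_TYPE(item)) dispatch. */
		if (items[i].ri_ops->commit_pass1)
			error = items[i].ri_ops->commit_pass1(&items[i]);
	}
	return error;
}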
....@@ -4238,9 +2018,9 @@
42382018 xlog_recover_add_item(
42392019 struct list_head *head)
42402020 {
4241
- xlog_recover_item_t *item;
2021
+ struct xlog_recover_item *item;
42422022
4243
- item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
2023
+ item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
42442024 INIT_LIST_HEAD(&item->ri_list);
42452025 list_add_tail(&item->ri_list, head);
42462026 }
....@@ -4252,7 +2032,7 @@
42522032 char *dp,
42532033 int len)
42542034 {
4255
- xlog_recover_item_t *item;
2035
+ struct xlog_recover_item *item;
42562036 char *ptr, *old_ptr;
42572037 int old_len;
42582038
....@@ -4264,7 +2044,7 @@
42642044 ASSERT(len <= sizeof(struct xfs_trans_header));
42652045 if (len > sizeof(struct xfs_trans_header)) {
42662046 xfs_warn(log->l_mp, "%s: bad header length", __func__);
4267
- return -EIO;
2047
+ return -EFSCORRUPTED;
42682048 }
42692049
42702050 xlog_recover_add_item(&trans->r_itemq);
....@@ -4275,12 +2055,15 @@
42752055 }
42762056
42772057 /* take the tail entry */
4278
- item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
2058
+ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2059
+ ri_list);
42792060
42802061 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
42812062 old_len = item->ri_buf[item->ri_cnt-1].i_len;
42822063
4283
- ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
2064
+ ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
2065
+ if (!ptr)
2066
+ return -ENOMEM;
42842067 memcpy(&ptr[old_len], dp, len);
42852068 item->ri_buf[item->ri_cnt-1].i_len += len;
42862069 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
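The kvrealloc() hunk grows the last region in place: the buffer is reallocated to old_len + len, the continuation bytes are appended at offset old_len, and allocation failure now returns -ENOMEM instead of sleeping forever. A userspace sketch of the same append step, with plain realloc() standing in for kvrealloc():

#include <errno.h>
#include <stdlib.h>
#include <string.h>

/* Append len bytes from dp to a region currently holding *lenp bytes. */
static int append_region(char **bufp, int *lenp, const char *dp, int len)
{
	char *ptr = realloc(*bufp, *lenp + len);	/* kvrealloc() above */

	if (!ptr)
		return -ENOMEM;		/* the new failure path in the hunk */
	memcpy(&ptr[*lenp], dp, len);	/* continuation data lands at old_len */
	*bufp = ptr;
	*lenp += len;
	return 0;
}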
....@@ -4309,7 +2092,7 @@
43092092 int len)
43102093 {
43112094 struct xfs_inode_log_format *in_f; /* any will do */
4312
- xlog_recover_item_t *item;
2095
+ struct xlog_recover_item *item;
43132096 char *ptr;
43142097
43152098 if (!len)
....@@ -4320,13 +2103,13 @@
43202103 xfs_warn(log->l_mp, "%s: bad header magic number",
43212104 __func__);
43222105 ASSERT(0);
4323
- return -EIO;
2106
+ return -EFSCORRUPTED;
43242107 }
43252108
43262109 if (len > sizeof(struct xfs_trans_header)) {
43272110 xfs_warn(log->l_mp, "%s: bad header length", __func__);
43282111 ASSERT(0);
4329
- return -EIO;
2112
+ return -EFSCORRUPTED;
43302113 }
43312114
43322115 /*
....@@ -4340,18 +2123,19 @@
43402123 return 0;
43412124 }
43422125
4343
- ptr = kmem_alloc(len, KM_SLEEP);
2126
+ ptr = kmem_alloc(len, 0);
43442127 memcpy(ptr, dp, len);
43452128 in_f = (struct xfs_inode_log_format *)ptr;
43462129
43472130 /* take the tail entry */
4348
- item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
2131
+ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2132
+ ri_list);
43492133 if (item->ri_total != 0 &&
43502134 item->ri_total == item->ri_cnt) {
43512135 /* tail item is in use, get a new one */
43522136 xlog_recover_add_item(&trans->r_itemq);
43532137 item = list_entry(trans->r_itemq.prev,
4354
- xlog_recover_item_t, ri_list);
2138
+ struct xlog_recover_item, ri_list);
43552139 }
43562140
43572141 if (item->ri_total == 0) { /* first region to be added */
....@@ -4362,15 +2146,24 @@
43622146 in_f->ilf_size);
43632147 ASSERT(0);
43642148 kmem_free(ptr);
4365
- return -EIO;
2149
+ return -EFSCORRUPTED;
43662150 }
43672151
43682152 item->ri_total = in_f->ilf_size;
43692153 item->ri_buf =
43702154 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
4371
- KM_SLEEP);
2155
+ 0);
43722156 }
4373
- ASSERT(item->ri_total > item->ri_cnt);
2157
+
2158
+ if (item->ri_total <= item->ri_cnt) {
2159
+ xfs_warn(log->l_mp,
2160
+ "log item region count (%d) overflowed size (%d)",
2161
+ item->ri_cnt, item->ri_total);
2162
+ ASSERT(0);
2163
+ kmem_free(ptr);
2164
+ return -EFSCORRUPTED;
2165
+ }
2166
+
43742167 /* Description region is ri_buf[0] */
43752168 item->ri_buf[item->ri_cnt].i_addr = ptr;
43762169 item->ri_buf[item->ri_cnt].i_len = len;
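This hunk promotes a debug-only ASSERT into a production check: a region count that would overflow ri_buf[] now frees the staged copy and fails with -EFSCORRUPTED instead of writing out of bounds on non-debug kernels. The shape of the check, reduced to standalone C (EFSCORRUPTED is spelled EUCLEAN here, matching the kernel's definition):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef EFSCORRUPTED
#define EFSCORRUPTED	EUCLEAN		/* the kernel's errno for corruption */
#endif

/* Validate the region count before using it as an array index. */
static int check_region_count(int ri_total, int ri_cnt, void *staged)
{
	if (ri_total <= ri_cnt) {
		fprintf(stderr,
			"log item region count (%d) overflowed size (%d)\n",
			ri_cnt, ri_total);
		free(staged);		/* don't leak the copied region */
		return -EFSCORRUPTED;	/* fail gracefully, no ASSERT needed */
	}
	return 0;
}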
....@@ -4388,7 +2181,7 @@
43882181 xlog_recover_free_trans(
43892182 struct xlog_recover *trans)
43902183 {
4391
- xlog_recover_item_t *item, *n;
2184
+ struct xlog_recover_item *item, *n;
43922185 int i;
43932186
43942187 hlist_del_init(&trans->r_list);
....@@ -4457,7 +2250,7 @@
44572250 default:
44582251 xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
44592252 ASSERT(0);
4460
- error = -EIO;
2253
+ error = -EFSCORRUPTED;
44612254 break;
44622255 }
44632256 if (error || freeit)
....@@ -4502,7 +2295,7 @@
45022295 * This is a new transaction so allocate a new recovery container to
45032296 * hold the recovery ops that will follow.
45042297 */
4505
- trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
2298
+ trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
45062299 trans->r_log_tid = tid;
45072300 trans->r_lsn = be64_to_cpu(rhead->h_lsn);
45082301 INIT_LIST_HEAD(&trans->r_itemq);
....@@ -4537,7 +2330,7 @@
45372330 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
45382331 __func__, ohead->oh_clientid);
45392332 ASSERT(0);
4540
- return -EIO;
2333
+ return -EFSCORRUPTED;
45412334 }
45422335
45432336 /*
....@@ -4547,7 +2340,7 @@
45472340 if (dp + len > end) {
45482341 xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
45492342 WARN_ON(1);
4550
- return -EIO;
2343
+ return -EFSCORRUPTED;
45512344 }
45522345
45532346 trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
....@@ -4640,214 +2433,71 @@
46402433 return 0;
46412434 }
46422435
4643
-/* Recover the EFI if necessary. */
4644
-STATIC int
4645
-xlog_recover_process_efi(
4646
- struct xfs_mount *mp,
4647
- struct xfs_ail *ailp,
4648
- struct xfs_log_item *lip)
4649
-{
4650
- struct xfs_efi_log_item *efip;
4651
- int error;
4652
-
4653
- /*
4654
- * Skip EFIs that we've already processed.
4655
- */
4656
- efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4657
- if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
4658
- return 0;
4659
-
4660
- spin_unlock(&ailp->ail_lock);
4661
- error = xfs_efi_recover(mp, efip);
4662
- spin_lock(&ailp->ail_lock);
4663
-
4664
- return error;
4665
-}
4666
-
4667
-/* Release the EFI since we're cancelling everything. */
4668
-STATIC void
4669
-xlog_recover_cancel_efi(
4670
- struct xfs_mount *mp,
4671
- struct xfs_ail *ailp,
4672
- struct xfs_log_item *lip)
4673
-{
4674
- struct xfs_efi_log_item *efip;
4675
-
4676
- efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4677
-
4678
- spin_unlock(&ailp->ail_lock);
4679
- xfs_efi_release(efip);
4680
- spin_lock(&ailp->ail_lock);
4681
-}
4682
-
4683
-/* Recover the RUI if necessary. */
4684
-STATIC int
4685
-xlog_recover_process_rui(
4686
- struct xfs_mount *mp,
4687
- struct xfs_ail *ailp,
4688
- struct xfs_log_item *lip)
4689
-{
4690
- struct xfs_rui_log_item *ruip;
4691
- int error;
4692
-
4693
- /*
4694
- * Skip RUIs that we've already processed.
4695
- */
4696
- ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
4697
- if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags))
4698
- return 0;
4699
-
4700
- spin_unlock(&ailp->ail_lock);
4701
- error = xfs_rui_recover(mp, ruip);
4702
- spin_lock(&ailp->ail_lock);
4703
-
4704
- return error;
4705
-}
4706
-
4707
-/* Release the RUI since we're cancelling everything. */
4708
-STATIC void
4709
-xlog_recover_cancel_rui(
4710
- struct xfs_mount *mp,
4711
- struct xfs_ail *ailp,
4712
- struct xfs_log_item *lip)
4713
-{
4714
- struct xfs_rui_log_item *ruip;
4715
-
4716
- ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
4717
-
4718
- spin_unlock(&ailp->ail_lock);
4719
- xfs_rui_release(ruip);
4720
- spin_lock(&ailp->ail_lock);
4721
-}
4722
-
4723
-/* Recover the CUI if necessary. */
4724
-STATIC int
4725
-xlog_recover_process_cui(
4726
- struct xfs_trans *parent_tp,
4727
- struct xfs_ail *ailp,
4728
- struct xfs_log_item *lip)
4729
-{
4730
- struct xfs_cui_log_item *cuip;
4731
- int error;
4732
-
4733
- /*
4734
- * Skip CUIs that we've already processed.
4735
- */
4736
- cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
4737
- if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
4738
- return 0;
4739
-
4740
- spin_unlock(&ailp->ail_lock);
4741
- error = xfs_cui_recover(parent_tp, cuip);
4742
- spin_lock(&ailp->ail_lock);
4743
-
4744
- return error;
4745
-}
4746
-
4747
-/* Release the CUI since we're cancelling everything. */
4748
-STATIC void
4749
-xlog_recover_cancel_cui(
4750
- struct xfs_mount *mp,
4751
- struct xfs_ail *ailp,
4752
- struct xfs_log_item *lip)
4753
-{
4754
- struct xfs_cui_log_item *cuip;
4755
-
4756
- cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
4757
-
4758
- spin_unlock(&ailp->ail_lock);
4759
- xfs_cui_release(cuip);
4760
- spin_lock(&ailp->ail_lock);
4761
-}
4762
-
4763
-/* Recover the BUI if necessary. */
4764
-STATIC int
4765
-xlog_recover_process_bui(
4766
- struct xfs_trans *parent_tp,
4767
- struct xfs_ail *ailp,
4768
- struct xfs_log_item *lip)
4769
-{
4770
- struct xfs_bui_log_item *buip;
4771
- int error;
4772
-
4773
- /*
4774
- * Skip BUIs that we've already processed.
4775
- */
4776
- buip = container_of(lip, struct xfs_bui_log_item, bui_item);
4777
- if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
4778
- return 0;
4779
-
4780
- spin_unlock(&ailp->ail_lock);
4781
- error = xfs_bui_recover(parent_tp, buip);
4782
- spin_lock(&ailp->ail_lock);
4783
-
4784
- return error;
4785
-}
4786
-
4787
-/* Release the BUI since we're cancelling everything. */
4788
-STATIC void
4789
-xlog_recover_cancel_bui(
4790
- struct xfs_mount *mp,
4791
- struct xfs_ail *ailp,
4792
- struct xfs_log_item *lip)
4793
-{
4794
- struct xfs_bui_log_item *buip;
4795
-
4796
- buip = container_of(lip, struct xfs_bui_log_item, bui_item);
4797
-
4798
- spin_unlock(&ailp->ail_lock);
4799
- xfs_bui_release(buip);
4800
- spin_lock(&ailp->ail_lock);
4801
-}
4802
-
4803
-/* Is this log item a deferred action intent? */
4804
-static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
4805
-{
4806
- switch (lip->li_type) {
4807
- case XFS_LI_EFI:
4808
- case XFS_LI_RUI:
4809
- case XFS_LI_CUI:
4810
- case XFS_LI_BUI:
4811
- return true;
4812
- default:
4813
- return false;
4814
- }
4815
-}
4816
-
48172436 /* Take all the collected deferred ops and finish them in order. */
48182437 static int
48192438 xlog_finish_defer_ops(
4820
- struct xfs_trans *parent_tp)
2439
+ struct xfs_mount *mp,
2440
+ struct list_head *capture_list)
48212441 {
4822
- struct xfs_mount *mp = parent_tp->t_mountp;
2442
+ struct xfs_defer_capture *dfc, *next;
48232443 struct xfs_trans *tp;
4824
- int64_t freeblks;
4825
- uint resblks;
4826
- int error;
2444
+ struct xfs_inode *ip;
2445
+ int error = 0;
48272446
4828
- /*
4829
- * We're finishing the defer_ops that accumulated as a result of
4830
- * recovering unfinished intent items during log recovery. We
4831
- * reserve an itruncate transaction because it is the largest
4832
- * permanent transaction type. Since we're the only user of the fs
4833
- * right now, take 93% (15/16) of the available free blocks. Use
4834
- * weird math to avoid a 64-bit division.
4835
- */
4836
- freeblks = percpu_counter_sum(&mp->m_fdblocks);
4837
- if (freeblks <= 0)
4838
- return -ENOSPC;
4839
- resblks = min_t(int64_t, UINT_MAX, freeblks);
4840
- resblks = (resblks * 15) >> 4;
4841
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
4842
- 0, XFS_TRANS_RESERVE, &tp);
4843
- if (error)
4844
- return error;
4845
- /* transfer all collected dfops to this transaction */
4846
- xfs_defer_move(tp, parent_tp);
2447
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2448
+ struct xfs_trans_res resv;
48472449
4848
- return xfs_trans_commit(tp);
2450
+ /*
2451
+ * Create a new transaction reservation from the captured
2452
+ * information. Set logcount to 1 to force the new transaction
2453
+ * to regrant every roll so that we can make forward progress
2454
+ * in recovery no matter how full the log might be.
2455
+ */
2456
+ resv.tr_logres = dfc->dfc_logres;
2457
+ resv.tr_logcount = 1;
2458
+ resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
2459
+
2460
+ error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
2461
+ dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
2462
+ if (error) {
2463
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
2464
+ return error;
2465
+ }
2466
+
2467
+ /*
2468
+ * Transfer to this new transaction all the dfops we captured
2469
+ * from recovering a single intent item.
2470
+ */
2471
+ list_del_init(&dfc->dfc_list);
2472
+ xfs_defer_ops_continue(dfc, tp, &ip);
2473
+
2474
+ error = xfs_trans_commit(tp);
2475
+ if (ip) {
2476
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
2477
+ xfs_irele(ip);
2478
+ }
2479
+ if (error)
2480
+ return error;
2481
+ }
2482
+
2483
+ ASSERT(list_empty(capture_list));
2484
+ return 0;
48492485 }
48502486
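The rewritten xlog_finish_defer_ops() no longer carves one huge reservation out of free space; each captured dfops chain is replayed under the log reservation recorded at capture time, and tr_logcount = 1 forces a regrant on every transaction roll so recovery makes progress even in a nearly full log. A reduced sketch of that walk (field names echo dfc_logres/dfc_blkres; everything else is simplified):

/* One captured dfops chain, recorded while its intent was recovered. */
struct capture {
	struct capture	*next;
	int		logres;		/* like dfc_logres */
	int		blkres;		/* like dfc_blkres */
};

/* Finish every capture in order; on error the caller aborts the rest. */
static int finish_captures(struct capture *list,
			   int (*replay_one)(int logres, int blkres))
{
	int error;

	for (struct capture *c = list; c; c = c->next) {
		/* logcount of 1 => regrant each roll, guaranteeing progress */
		error = replay_one(c->logres, c->blkres);
		if (error)
			return error;
	}
	return 0;
}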
2487
+/* Release all the captured defer ops and capture structures in this list. */
2488
+static void
2489
+xlog_abort_defer_ops(
2490
+ struct xfs_mount *mp,
2491
+ struct list_head *capture_list)
2492
+{
2493
+ struct xfs_defer_capture *dfc;
2494
+ struct xfs_defer_capture *next;
2495
+
2496
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2497
+ list_del_init(&dfc->dfc_list);
2498
+ xfs_defer_ops_release(mp, dfc);
2499
+ }
2500
+}
48512501 /*
48522502 * When this is called, all of the log intent items which did not have
48532503 * corresponding log done items should be in the AIL. What we do now
....@@ -4868,35 +2518,23 @@
48682518 xlog_recover_process_intents(
48692519 struct xlog *log)
48702520 {
4871
- struct xfs_trans *parent_tp;
2521
+ LIST_HEAD(capture_list);
48722522 struct xfs_ail_cursor cur;
48732523 struct xfs_log_item *lip;
48742524 struct xfs_ail *ailp;
4875
- int error;
2525
+ int error = 0;
48762526 #if defined(DEBUG) || defined(XFS_WARN)
48772527 xfs_lsn_t last_lsn;
48782528 #endif
48792529
4880
- /*
4881
- * The intent recovery handlers commit transactions to complete recovery
4882
- * for individual intents, but any new deferred operations that are
4883
- * queued during that process are held off until the very end. The
4884
- * purpose of this transaction is to serve as a container for deferred
4885
- * operations. Each intent recovery handler must transfer dfops here
4886
- * before its local transaction commits, and we'll finish the entire
4887
- * list below.
4888
- */
4889
- error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
4890
- if (error)
4891
- return error;
4892
-
48932530 ailp = log->l_ailp;
48942531 spin_lock(&ailp->ail_lock);
4895
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
48962532 #if defined(DEBUG) || defined(XFS_WARN)
48972533 last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
48982534 #endif
4899
- while (lip != NULL) {
2535
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2536
+ lip != NULL;
2537
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
49002538 /*
49012539 * We're done when we see something other than an intent.
49022540 * There should be no intents left in the AIL now.
....@@ -4918,35 +2556,29 @@
49182556
49192557 /*
49202558 * NOTE: If your intent processing routine can create more
4921
- * deferred ops, you /must/ attach them to the dfops in this
4922
- * routine or else those subsequent intents will get
2559
+ * deferred ops, you /must/ attach them to the capture list in
2560
+ * the recover routine or else those subsequent intents will be
49232561 * replayed in the wrong order!
49242562 */
4925
- switch (lip->li_type) {
4926
- case XFS_LI_EFI:
4927
- error = xlog_recover_process_efi(log->l_mp, ailp, lip);
4928
- break;
4929
- case XFS_LI_RUI:
4930
- error = xlog_recover_process_rui(log->l_mp, ailp, lip);
4931
- break;
4932
- case XFS_LI_CUI:
4933
- error = xlog_recover_process_cui(parent_tp, ailp, lip);
4934
- break;
4935
- case XFS_LI_BUI:
4936
- error = xlog_recover_process_bui(parent_tp, ailp, lip);
4937
- break;
4938
- }
2563
+ spin_unlock(&ailp->ail_lock);
2564
+ error = lip->li_ops->iop_recover(lip, &capture_list);
2565
+ spin_lock(&ailp->ail_lock);
49392566 if (error)
4940
- goto out;
4941
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
2567
+ break;
49422568 }
4943
-out:
2569
+
49442570 xfs_trans_ail_cursor_done(&cur);
49452571 spin_unlock(&ailp->ail_lock);
4946
- if (!error)
4947
- error = xlog_finish_defer_ops(parent_tp);
4948
- xfs_trans_cancel(parent_tp);
2572
+ if (error)
2573
+ goto err;
49492574
2575
+ error = xlog_finish_defer_ops(log->l_mp, &capture_list);
2576
+ if (error)
2577
+ goto err;
2578
+
2579
+ return 0;
2580
+err:
2581
+ xlog_abort_defer_ops(log->l_mp, &capture_list);
49502582 return error;
49512583 }
49522584
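The recover loop above and the cancel loop below share one discipline: the AIL is walked under ail_lock, but the lock is dropped around each iop_recover()/iop_release() call because those may sleep or do I/O. A compact pthread sketch of drop-call-retake; note the real code iterates through an AIL cursor precisely so the list position survives the unlocked window, whereas this sketch naively trusts ->next:

#include <pthread.h>

struct log_item {
	struct log_item	*next;
	int		(*recover)(struct log_item *lip);	/* like iop_recover */
};

static int process_intents(pthread_mutex_t *ail_lock, struct log_item *head)
{
	int error = 0;

	pthread_mutex_lock(ail_lock);
	for (struct log_item *lip = head; lip && !error; lip = lip->next) {
		/* The callback may sleep or do I/O, so run it unlocked. */
		pthread_mutex_unlock(ail_lock);
		error = lip->recover(lip);
		pthread_mutex_lock(ail_lock);
	}
	pthread_mutex_unlock(ail_lock);
	return error;
}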
....@@ -4954,12 +2586,11 @@
49542586 * A cancel occurs when the mount has failed and we're bailing out.
49552587 * Release all pending log intent items so they don't pin the AIL.
49562588 */
4957
-STATIC int
2589
+STATIC void
49582590 xlog_recover_cancel_intents(
49592591 struct xlog *log)
49602592 {
49612593 struct xfs_log_item *lip;
4962
- int error = 0;
49632594 struct xfs_ail_cursor cur;
49642595 struct xfs_ail *ailp;
49652596
....@@ -4979,27 +2610,14 @@
49792610 break;
49802611 }
49812612
4982
- switch (lip->li_type) {
4983
- case XFS_LI_EFI:
4984
- xlog_recover_cancel_efi(log->l_mp, ailp, lip);
4985
- break;
4986
- case XFS_LI_RUI:
4987
- xlog_recover_cancel_rui(log->l_mp, ailp, lip);
4988
- break;
4989
- case XFS_LI_CUI:
4990
- xlog_recover_cancel_cui(log->l_mp, ailp, lip);
4991
- break;
4992
- case XFS_LI_BUI:
4993
- xlog_recover_cancel_bui(log->l_mp, ailp, lip);
4994
- break;
4995
- }
4996
-
2613
+ spin_unlock(&ailp->ail_lock);
2614
+ lip->li_ops->iop_release(lip);
2615
+ spin_lock(&ailp->ail_lock);
49972616 lip = xfs_trans_ail_cursor_next(ailp, &cur);
49982617 }
49992618
50002619 xfs_trans_ail_cursor_done(&cur);
50012620 spin_unlock(&ailp->ail_lock);
5002
- return error;
50032621 }
50042622
50052623 /*
....@@ -5026,7 +2644,7 @@
50262644 if (error)
50272645 goto out_abort;
50282646
5029
- agi = XFS_BUF_TO_AGI(agibp);
2647
+ agi = agibp->b_addr;
50302648 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
50312649 offset = offsetof(xfs_agi_t, agi_unlinked) +
50322650 (sizeof(xfs_agino_t) * bucket);
....@@ -5066,7 +2684,7 @@
50662684 /*
50672685 * Get the on disk inode to find the next inode in the bucket.
50682686 */
5069
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
2687
+ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0);
50702688 if (error)
50712689 goto fail_iput;
50722690
....@@ -5103,16 +2721,27 @@
51032721 }
51042722
51052723 /*
5106
- * xlog_iunlink_recover
2724
+ * Recover AGI unlinked lists
51072725 *
5108
- * This is called during recovery to process any inodes which
5109
- * we unlinked but not freed when the system crashed. These
5110
- * inodes will be on the lists in the AGI blocks. What we do
5111
- * here is scan all the AGIs and fully truncate and free any
5112
- * inodes found on the lists. Each inode is removed from the
5113
- * lists when it has been fully truncated and is freed. The
5114
- * freeing of the inode and its removal from the list must be
5115
- * atomic.
2726
+ * This is called during recovery to process any inodes which we unlinked but
2727
+ * not freed when the system crashed. These inodes will be on the lists in the
2728
+ * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
2729
+ * any inodes found on the lists. Each inode is removed from the lists when it
2730
+ * has been fully truncated and is freed. The freeing of the inode and its
2731
+ * removal from the list must be atomic.
2732
+ *
2733
+ * If everything we touch in the AGI processing loop is already in memory, this
2734
+ * loop can hold the CPU for a long time. It runs without lock contention,
2735
+ * memory allocation contention, the need to wait for IO, etc., and so will run
2736
+ * until we either run out of inodes to process, run low on memory or run out
2737
+ * of log space.
2738
+ *
2739
+ * This behaviour is bad for latency on single CPU and non-preemptible kernels,
2740
+ * and can prevent other filesystem work (such as CIL pushes) from running. This
2741
+ * can lead to deadlocks if the recovery process runs out of log reservation
2742
+ * space. Hence we need to yield the CPU when there is other kernel work
2743
+ * scheduled on this CPU to ensure other scheduled work can run without undue
2744
+ * latency.
51162745 */
51172746 STATIC void
51182747 xlog_recover_process_iunlinks(
....@@ -5151,7 +2780,7 @@
51512780 * buffer reference though, so that it stays pinned in memory
51522781 * while we need the buffer.
51532782 */
5154
- agi = XFS_BUF_TO_AGI(agibp);
2783
+ agi = agibp->b_addr;
51552784 xfs_buf_unlock(agibp);
51562785
51572786 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
....@@ -5159,13 +2788,14 @@
51592788 while (agino != NULLAGINO) {
51602789 agino = xlog_recover_process_one_iunlink(mp,
51612790 agno, agino, bucket);
2791
+ cond_resched();
51622792 }
51632793 }
51642794 xfs_buf_rele(agibp);
51652795 }
51662796 }
51672797
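The new cond_resched() bounds scheduling latency when an unlinked chain is long and fully cached, exactly as the comment above explains. The loop shape, reduced to userspace with sched_yield() standing in for cond_resched():

#include <sched.h>

#define NULLAGINO	(~0u)	/* end-of-chain sentinel, as in XFS */

/* Drain one unlinked bucket; yield between inodes to cap latency. */
static void drain_bucket(unsigned int agino,
			 unsigned int (*process_one)(unsigned int agino))
{
	while (agino != NULLAGINO) {
		agino = process_one(agino);	/* may run entirely from cache */
		sched_yield();			/* cond_resched() in the hunk */
	}
}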
5168
-STATIC int
2798
+STATIC void
51692799 xlog_unpack_data(
51702800 struct xlog_rec_header *rhead,
51712801 char *dp,
....@@ -5188,8 +2818,6 @@
51882818 dp += BBSIZE;
51892819 }
51902820 }
5191
-
5192
- return 0;
51932821 }
51942822
51952823 /*
....@@ -5204,10 +2832,8 @@
52042832 int pass,
52052833 struct list_head *buffer_list)
52062834 {
5207
- int error;
52082835 __le32 old_crc = rhead->h_crc;
52092836 __le32 crc;
5210
-
52112837
52122838 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
52132839
....@@ -5243,13 +2869,13 @@
52432869 * If the filesystem is CRC enabled, this mismatch becomes a
52442870 * fatal log corruption failure.
52452871 */
5246
- if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
2872
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
2873
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
52472874 return -EFSCORRUPTED;
2875
+ }
52482876 }
52492877
5250
- error = xlog_unpack_data(rhead, dp, log);
5251
- if (error)
5252
- return error;
2878
+ xlog_unpack_data(rhead, dp, log);
52532879
52542880 return xlog_recover_process_data(log, rhash, rhead, dp, pass,
52552881 buffer_list);
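The CRC policy in this function is two-tiered: on pre-v5 (non-CRC) superblocks a mismatch only warns, while CRC-enabled filesystems treat it as fatal, and the hunk adds an XFS_ERROR_REPORT so the corruption is logged before -EFSCORRUPTED is returned. A standalone sketch of that decision:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#ifndef EFSCORRUPTED
#define EFSCORRUPTED	EUCLEAN
#endif

/* Return 0 to keep recovering, negative errno to abort recovery. */
static int check_record_crc(unsigned int expected, unsigned int found,
			    bool fs_has_crc)
{
	if (expected == found)
		return 0;
	fprintf(stderr, "log record CRC mismatch: found 0x%x, expected 0x%x\n",
		found, expected);
	/* Torn writes are tolerated on older formats; v5 calls it corruption. */
	return fs_has_crc ? -EFSCORRUPTED : 0;
}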
....@@ -5259,35 +2885,34 @@
52592885 xlog_valid_rec_header(
52602886 struct xlog *log,
52612887 struct xlog_rec_header *rhead,
5262
- xfs_daddr_t blkno)
2888
+ xfs_daddr_t blkno,
2889
+ int bufsize)
52632890 {
52642891 int hlen;
52652892
5266
- if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
5267
- XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
5268
- XFS_ERRLEVEL_LOW, log->l_mp);
2893
+ if (XFS_IS_CORRUPT(log->l_mp,
2894
+ rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
52692895 return -EFSCORRUPTED;
5270
- }
5271
- if (unlikely(
5272
- (!rhead->h_version ||
5273
- (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
2896
+ if (XFS_IS_CORRUPT(log->l_mp,
2897
+ (!rhead->h_version ||
2898
+ (be32_to_cpu(rhead->h_version) &
2899
+ (~XLOG_VERSION_OKBITS))))) {
52742900 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
52752901 __func__, be32_to_cpu(rhead->h_version));
5276
- return -EIO;
2902
+ return -EFSCORRUPTED;
52772903 }
52782904
5279
- /* LR body must have data or it wouldn't have been written */
2905
+ /*
2906
+ * LR body must have data (or it wouldn't have been written)
2907
+ * and h_len must not be greater than LR buffer size.
2908
+ */
52802909 hlen = be32_to_cpu(rhead->h_len);
5281
- if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
5282
- XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
5283
- XFS_ERRLEVEL_LOW, log->l_mp);
2910
+ if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
52842911 return -EFSCORRUPTED;
5285
- }
5286
- if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
5287
- XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
5288
- XFS_ERRLEVEL_LOW, log->l_mp);
2912
+
2913
+ if (XFS_IS_CORRUPT(log->l_mp,
2914
+ blkno > log->l_logBBsize || blkno > INT_MAX))
52892915 return -EFSCORRUPTED;
5290
- }
52912916 return 0;
52922917 }
52932918
....@@ -5311,7 +2936,7 @@
53112936 xfs_daddr_t blk_no, rblk_no;
53122937 xfs_daddr_t rhead_blk;
53132938 char *offset;
5314
- xfs_buf_t *hbp, *dbp;
2939
+ char *hbp, *dbp;
53152940 int error = 0, h_size, h_len;
53162941 int error2 = 0;
53172942 int bblks, split_bblks;
....@@ -5336,7 +2961,7 @@
53362961 * iclog header and extract the header size from it. Get a
53372962 * new hbp that is the correct size.
53382963 */
5339
- hbp = xlog_get_bp(log, 1);
2964
+ hbp = xlog_alloc_buffer(log, 1);
53402965 if (!hbp)
53412966 return -ENOMEM;
53422967
....@@ -5345,9 +2970,6 @@
53452970 goto bread_err1;
53462971
53472972 rhead = (xlog_rec_header_t *)offset;
5348
- error = xlog_valid_rec_header(log, rhead, tail_blk);
5349
- if (error)
5350
- goto bread_err1;
53512973
53522974 /*
53532975 * xfsprogs has a bug where record length is based on lsunit but
....@@ -5362,39 +2984,35 @@
53622984 */
53632985 h_size = be32_to_cpu(rhead->h_size);
53642986 h_len = be32_to_cpu(rhead->h_len);
5365
- if (h_len > h_size) {
5366
- if (h_len <= log->l_mp->m_logbsize &&
5367
- be32_to_cpu(rhead->h_num_logops) == 1) {
5368
- xfs_warn(log->l_mp,
2987
+ if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
2988
+ rhead->h_num_logops == cpu_to_be32(1)) {
2989
+ xfs_warn(log->l_mp,
53692990 "invalid iclog size (%d bytes), using lsunit (%d bytes)",
5370
- h_size, log->l_mp->m_logbsize);
5371
- h_size = log->l_mp->m_logbsize;
5372
- } else
5373
- return -EFSCORRUPTED;
2991
+ h_size, log->l_mp->m_logbsize);
2992
+ h_size = log->l_mp->m_logbsize;
53742993 }
53752994
5376
- if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
5377
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
5378
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
5379
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
5380
- hblks++;
5381
- xlog_put_bp(hbp);
5382
- hbp = xlog_get_bp(log, hblks);
5383
- } else {
5384
- hblks = 1;
2995
+ error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
2996
+ if (error)
2997
+ goto bread_err1;
2998
+
2999
+ hblks = xlog_logrec_hblks(log, rhead);
3000
+ if (hblks != 1) {
3001
+ kmem_free(hbp);
3002
+ hbp = xlog_alloc_buffer(log, hblks);
53853003 }
53863004 } else {
53873005 ASSERT(log->l_sectBBsize == 1);
53883006 hblks = 1;
5389
- hbp = xlog_get_bp(log, 1);
3007
+ hbp = xlog_alloc_buffer(log, 1);
53903008 h_size = XLOG_BIG_RECORD_BSIZE;
53913009 }
53923010
53933011 if (!hbp)
53943012 return -ENOMEM;
5395
- dbp = xlog_get_bp(log, BTOBB(h_size));
3013
+ dbp = xlog_alloc_buffer(log, BTOBB(h_size));
53963014 if (!dbp) {
5397
- xlog_put_bp(hbp);
3015
+ kmem_free(hbp);
53983016 return -ENOMEM;
53993017 }
54003018
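For v2 logs whose h_size exceeds XLOG_HEADER_CYCLE_SIZE, the record header occupies several basic blocks; the open-coded round-up removed above now lives behind xlog_logrec_hblks(). The arithmetic, as it appeared in the removed lines:

#define XLOG_HEADER_CYCLE_SIZE	(32 * 1024)	/* value from xfs_log_format.h */

/* Basic blocks occupied by a record header of h_size bytes (v2 logs). */
static int logrec_hblks(int h_size)
{
	int hblks = h_size / XLOG_HEADER_CYCLE_SIZE;

	if (h_size % XLOG_HEADER_CYCLE_SIZE)
		hblks++;			/* round up the partial chunk */
	return hblks ? hblks : 1;		/* small headers fit one block */
}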
....@@ -5409,7 +3027,7 @@
54093027 /*
54103028 * Check for header wrapping around physical end-of-log
54113029 */
5412
- offset = hbp->b_addr;
3030
+ offset = hbp;
54133031 split_hblks = 0;
54143032 wrapped_hblks = 0;
54153033 if (blk_no + hblks <= log->l_logBBsize) {
....@@ -5445,15 +3063,15 @@
54453063 * - order is important.
54463064 */
54473065 wrapped_hblks = hblks - split_hblks;
5448
- error = xlog_bread_offset(log, 0,
5449
- wrapped_hblks, hbp,
3066
+ error = xlog_bread_noalign(log, 0,
3067
+ wrapped_hblks,
54503068 offset + BBTOB(split_hblks));
54513069 if (error)
54523070 goto bread_err2;
54533071 }
54543072 rhead = (xlog_rec_header_t *)offset;
54553073 error = xlog_valid_rec_header(log, rhead,
5456
- split_hblks ? blk_no : 0);
3074
+ split_hblks ? blk_no : 0, h_size);
54573075 if (error)
54583076 goto bread_err2;
54593077
....@@ -5477,7 +3095,7 @@
54773095 } else {
54783096 /* This log record is split across the
54793097 * physical end of log */
5480
- offset = dbp->b_addr;
3098
+ offset = dbp;
54813099 split_bblks = 0;
54823100 if (blk_no != log->l_logBBsize) {
54833101 /* some data is before the physical
....@@ -5506,8 +3124,8 @@
55063124 * _first_, then the log start (LR header end)
55073125 * - order is important.
55083126 */
5509
- error = xlog_bread_offset(log, 0,
5510
- bblks - split_bblks, dbp,
3127
+ error = xlog_bread_noalign(log, 0,
3128
+ bblks - split_bblks,
55113129 offset + BBTOB(split_bblks));
55123130 if (error)
55133131 goto bread_err2;
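When a record straddles the physical end of the log, it is read in two pieces into one buffer: the blocks up to the end land at the start of the buffer, then xlog_bread_noalign(log, 0, ...) fills the remainder at offset BBTOB(split_bblks). The same split modeled with pread() on a flat file (short-read handling elided):

#define _XOPEN_SOURCE 500	/* for pread() */
#include <unistd.h>

#define BBTOB(bb)	((bb) << 9)	/* basic blocks to bytes */

/* Read nbblks starting at blk_no from a log of log_bblks blocks, wrapping. */
static int read_wrapped(int fd, long blk_no, int nbblks, long log_bblks,
			char *buf)
{
	int split = (int)(log_bblks - blk_no);	/* blocks before the end */

	if (split >= nbblks)			/* no wrap needed */
		return pread(fd, buf, BBTOB(nbblks), BBTOB(blk_no)) < 0 ? -1 : 0;
	/* end of the log first, then the log start: order is important */
	if (pread(fd, buf, BBTOB(split), BBTOB(blk_no)) < 0)
		return -1;
	if (pread(fd, buf + BBTOB(split), BBTOB(nbblks - split), 0) < 0)
		return -1;
	return 0;
}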
....@@ -5534,7 +3152,7 @@
55343152 goto bread_err2;
55353153
55363154 rhead = (xlog_rec_header_t *)offset;
5537
- error = xlog_valid_rec_header(log, rhead, blk_no);
3155
+ error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
55383156 if (error)
55393157 goto bread_err2;
55403158
....@@ -5555,9 +3173,9 @@
55553173 }
55563174
55573175 bread_err2:
5558
- xlog_put_bp(dbp);
3176
+ kmem_free(dbp);
55593177 bread_err1:
5560
- xlog_put_bp(hbp);
3178
+ kmem_free(hbp);
55613179
55623180 /*
55633181 * Submit buffers that have been added from the last record processed,
....@@ -5614,7 +3232,7 @@
56143232 */
56153233 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
56163234 sizeof(struct list_head),
5617
- KM_SLEEP);
3235
+ 0);
56183236 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
56193237 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
56203238
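Pass 1 records cancelled buffers in l_buf_cancel_table, an array of list heads hashed by block number; xlog_is_buffer_cancelled() (visible in the readahead hunk near the top of this section) consults it during pass 2. A sketch of the bucket scheme (table size here mirrors XLOG_BC_TABLE_SIZE; singly linked chains stand in for list_head):

#include <stdbool.h>
#include <stddef.h>

#define BC_TABLE_SIZE	64	/* mirrors XLOG_BC_TABLE_SIZE */

struct buf_cancel {		/* like the per-bucket cancel records */
	struct buf_cancel	*next;
	long long		blkno;
};

static struct buf_cancel *bc_table[BC_TABLE_SIZE];

static bool is_buffer_cancelled(long long blkno)
{
	/* Hash by block number, then walk the short per-bucket chain. */
	for (struct buf_cancel *bc = bc_table[blkno % BC_TABLE_SIZE];
	     bc != NULL; bc = bc->next) {
		if (bc->blkno == blkno)
			return true;
	}
	return false;
}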
....@@ -5651,14 +3269,14 @@
56513269 */
56523270 STATIC int
56533271 xlog_do_recover(
5654
- struct xlog *log,
5655
- xfs_daddr_t head_blk,
5656
- xfs_daddr_t tail_blk)
3272
+ struct xlog *log,
3273
+ xfs_daddr_t head_blk,
3274
+ xfs_daddr_t tail_blk)
56573275 {
5658
- struct xfs_mount *mp = log->l_mp;
5659
- int error;
5660
- xfs_buf_t *bp;
5661
- xfs_sb_t *sbp;
3276
+ struct xfs_mount *mp = log->l_mp;
3277
+ struct xfs_buf *bp = mp->m_sb_bp;
3278
+ struct xfs_sb *sbp = &mp->m_sb;
3279
+ int error;
56623280
56633281 trace_xfs_log_recover(log, head_blk, tail_blk);
56643282
....@@ -5672,9 +3290,8 @@
56723290 /*
56733291 * If IO errors happened during recovery, bail out.
56743292 */
5675
- if (XFS_FORCED_SHUTDOWN(mp)) {
3293
+ if (XFS_FORCED_SHUTDOWN(mp))
56763294 return -EIO;
5677
- }
56783295
56793296 /*
56803297 * We now update the tail_lsn since much of the recovery has completed
....@@ -5688,19 +3305,15 @@
56883305 xlog_assign_tail_lsn(mp);
56893306
56903307 /*
5691
- * Now that we've finished replaying all buffer and inode
5692
- * updates, re-read in the superblock and reverify it.
3308
+ * Now that we've finished replaying all buffer and inode updates,
3309
+ * re-read the superblock and reverify it.
56933310 */
5694
- bp = xfs_getsb(mp, 0);
5695
- bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
5696
- ASSERT(!(bp->b_flags & XBF_WRITE));
5697
- bp->b_flags |= XBF_READ;
5698
- bp->b_ops = &xfs_sb_buf_ops;
5699
-
5700
- error = xfs_buf_submit(bp);
3311
+ xfs_buf_lock(bp);
3312
+ xfs_buf_hold(bp);
3313
+ error = _xfs_buf_read(bp, XBF_READ);
57013314 if (error) {
57023315 if (!XFS_FORCED_SHUTDOWN(mp)) {
5703
- xfs_buf_ioerror_alert(bp, __func__);
3316
+ xfs_buf_ioerror_alert(bp, __this_address);
57043317 ASSERT(0);
57053318 }
57063319 xfs_buf_relse(bp);
....@@ -5708,8 +3321,7 @@
57083321 }
57093322
57103323 /* Convert superblock from on-disk format */
5711
- sbp = &mp->m_sb;
5712
- xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3324
+ xfs_sb_from_disk(sbp, bp->b_addr);
57133325 xfs_buf_relse(bp);
57143326
57153327 /* re-initialise in-core superblock and geometry structures */
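Rather than xfs_getsb() plus a hand-rolled buffer submit, the tail of xlog_do_recover() now locks and holds the cached superblock buffer mp->m_sb_bp, re-reads it with _xfs_buf_read(bp, XBF_READ) so the verifier runs, and decodes bp->b_addr via xfs_sb_from_disk(). A toy version of the decode step, since XFS superblocks are big-endian on disk (the struct fields are an illustrative subset):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>		/* ntohl(): on-disk fields are big-endian */

struct disk_sb { uint32_t magic; uint32_t blocksize; };	/* tiny subset */
struct incore_sb { uint32_t magic; uint32_t blocksize; };

/* Decode the just-reread buffer into the in-core superblock. */
static void sb_from_disk(struct incore_sb *to, const void *b_addr)
{
	struct disk_sb from;

	memcpy(&from, b_addr, sizeof(from));	/* b_addr may be unaligned */
	to->magic = ntohl(from.magic);
	to->blocksize = ntohl(from.blocksize);
}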
....@@ -5838,6 +3450,15 @@
58383450 int error;
58393451 error = xlog_recover_process_intents(log);
58403452 if (error) {
3453
+ /*
3454
+ * Cancel all the unprocessed intent items now so that
3455
+ * we don't leave them pinned in the AIL. This can
3456
+ * cause the AIL to livelock on the pinned item if
3457
+ * anyone tries to push the AIL (inode reclaim does
3458
+ * this) before we get around to xfs_log_mount_cancel.
3459
+ */
3460
+ xlog_recover_cancel_intents(log);
3461
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
58413462 xfs_alert(log->l_mp, "Failed to recover intents");
58423463 return error;
58433464 }
....@@ -5864,16 +3485,12 @@
58643485 return 0;
58653486 }
58663487
5867
-int
3488
+void
58683489 xlog_recover_cancel(
58693490 struct xlog *log)
58703491 {
5871
- int error = 0;
5872
-
58733492 if (log->l_flags & XLOG_RECOVERY_NEEDED)
5874
- error = xlog_recover_cancel_intents(log);
5875
-
5876
- return error;
3493
+ xlog_recover_cancel_intents(log);
58773494 }
58783495
58793496 #if defined(DEBUG)
....@@ -5886,7 +3503,6 @@
58863503 struct xlog *log)
58873504 {
58883505 xfs_mount_t *mp;
5889
- xfs_agf_t *agfp;
58903506 xfs_buf_t *agfbp;
58913507 xfs_buf_t *agibp;
58923508 xfs_agnumber_t agno;
....@@ -5906,7 +3522,8 @@
59063522 xfs_alert(mp, "%s agf read failed agno %d error %d",
59073523 __func__, agno, error);
59083524 } else {
5909
- agfp = XFS_BUF_TO_AGF(agfbp);
3525
+ struct xfs_agf *agfp = agfbp->b_addr;
3526
+
59103527 freeblks += be32_to_cpu(agfp->agf_freeblks) +
59113528 be32_to_cpu(agfp->agf_flcount);
59123529 xfs_buf_relse(agfbp);
....@@ -5917,7 +3534,7 @@
59173534 xfs_alert(mp, "%s agi read failed agno %d error %d",
59183535 __func__, agno, error);
59193536 } else {
5920
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3537
+ struct xfs_agi *agi = agibp->b_addr;
59213538
59223539 itotal += be32_to_cpu(agi->agi_count);
59233540 ifree += be32_to_cpu(agi->agi_freecount);