hc
2024-05-16 8d2a02b24d66aa359e83eebc1ed3c0f85367a1cb
kernel/fs/xfs/xfs_log_recover.c
....@@ -13,29 +13,18 @@
1313 #include "xfs_sb.h"
1414 #include "xfs_mount.h"
1515 #include "xfs_defer.h"
16
-#include "xfs_da_format.h"
17
-#include "xfs_da_btree.h"
1816 #include "xfs_inode.h"
1917 #include "xfs_trans.h"
2018 #include "xfs_log.h"
2119 #include "xfs_log_priv.h"
2220 #include "xfs_log_recover.h"
23
-#include "xfs_inode_item.h"
24
-#include "xfs_extfree_item.h"
2521 #include "xfs_trans_priv.h"
2622 #include "xfs_alloc.h"
2723 #include "xfs_ialloc.h"
28
-#include "xfs_quota.h"
29
-#include "xfs_cksum.h"
3024 #include "xfs_trace.h"
3125 #include "xfs_icache.h"
32
-#include "xfs_bmap_btree.h"
3326 #include "xfs_error.h"
34
-#include "xfs_dir2.h"
35
-#include "xfs_rmap_item.h"
3627 #include "xfs_buf_item.h"
37
-#include "xfs_refcount_item.h"
38
-#include "xfs_bmap_item.h"
3928
4029 #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
4130
....@@ -59,17 +48,6 @@
5948 struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
6049
6150 /*
62
- * This structure is used during recovery to record the buf log items which
63
- * have been canceled and should not be replayed.
64
- */
65
-struct xfs_buf_cancel {
66
- xfs_daddr_t bc_blkno;
67
- uint bc_len;
68
- int bc_refcount;
69
- struct list_head bc_list;
70
-};
71
-
72
-/*
7351 * Sector aligned buffer routines for buffer create/read/write/access
7452 */
7553
....@@ -79,7 +57,7 @@
7957 * are valid, false otherwise.
8058 */
8159 static inline bool
82
-xlog_verify_bp(
60
+xlog_verify_bno(
8361 struct xlog *log,
8462 xfs_daddr_t blk_no,
8563 int bbcount)
....@@ -92,114 +70,98 @@
9270 }
9371
9472 /*
95
- * Allocate a buffer to hold log data. The buffer needs to be able
96
- * to map to a range of nbblks basic blocks at any valid (basic
97
- * block) offset within the log.
73
+ * Allocate a buffer to hold log data. The buffer needs to be able to map to
74
+ * a range of nbblks basic blocks at any valid offset within the log.
9875 */
99
-STATIC xfs_buf_t *
100
-xlog_get_bp(
76
+static char *
77
+xlog_alloc_buffer(
10178 struct xlog *log,
10279 int nbblks)
10380 {
104
- struct xfs_buf *bp;
81
+ int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
10582
10683 /*
10784 * Pass log block 0 since we don't have an addr yet, buffer will be
10885 * verified on read.
10986 */
110
- if (!xlog_verify_bp(log, 0, nbblks)) {
87
+ if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
11188 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
11289 nbblks);
113
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
11490 return NULL;
11591 }
11692
11793 /*
118
- * We do log I/O in units of log sectors (a power-of-2
119
- * multiple of the basic block size), so we round up the
120
- * requested size to accommodate the basic blocks required
121
- * for complete log sectors.
94
+ * We do log I/O in units of log sectors (a power-of-2 multiple of the
95
+ * basic block size), so we round up the requested size to accommodate
96
+ * the basic blocks required for complete log sectors.
12297 *
123
- * In addition, the buffer may be used for a non-sector-
124
- * aligned block offset, in which case an I/O of the
125
- * requested size could extend beyond the end of the
126
- * buffer. If the requested size is only 1 basic block it
127
- * will never straddle a sector boundary, so this won't be
128
- * an issue. Nor will this be a problem if the log I/O is
129
- * done in basic blocks (sector size 1). But otherwise we
130
- * extend the buffer by one extra log sector to ensure
131
- * there's space to accommodate this possibility.
98
+ * In addition, the buffer may be used for a non-sector-aligned block
99
+ * offset, in which case an I/O of the requested size could extend
100
+ * beyond the end of the buffer. If the requested size is only 1 basic
101
+ * block it will never straddle a sector boundary, so this won't be an
102
+ * issue. Nor will this be a problem if the log I/O is done in basic
103
+ * blocks (sector size 1). But otherwise we extend the buffer by one
104
+ * extra log sector to ensure there's space to accommodate this
105
+ * possibility.
132106 */
133107 if (nbblks > 1 && log->l_sectBBsize > 1)
134108 nbblks += log->l_sectBBsize;
135109 nbblks = round_up(nbblks, log->l_sectBBsize);
136
-
137
- bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
138
- if (bp)
139
- xfs_buf_unlock(bp);
140
- return bp;
141
-}
142
-
143
-STATIC void
144
-xlog_put_bp(
145
- xfs_buf_t *bp)
146
-{
147
- xfs_buf_free(bp);
110
+ return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
148111 }
149112
150113 /*
151114 * Return the address of the start of the given block number's data
152115 * in a log buffer. The buffer covers a log sector-aligned region.
153116 */
154
-STATIC char *
117
+static inline unsigned int
155118 xlog_align(
156119 struct xlog *log,
157
- xfs_daddr_t blk_no,
158
- int nbblks,
159
- struct xfs_buf *bp)
120
+ xfs_daddr_t blk_no)
160121 {
161
- xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
162
-
163
- ASSERT(offset + nbblks <= bp->b_length);
164
- return bp->b_addr + BBTOB(offset);
122
+ return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
165123 }
166124
167
-
168
-/*
169
- * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
170
- */
171
-STATIC int
172
-xlog_bread_noalign(
173
- struct xlog *log,
174
- xfs_daddr_t blk_no,
175
- int nbblks,
176
- struct xfs_buf *bp)
125
+static int
126
+xlog_do_io(
127
+ struct xlog *log,
128
+ xfs_daddr_t blk_no,
129
+ unsigned int nbblks,
130
+ char *data,
131
+ unsigned int op)
177132 {
178
- int error;
133
+ int error;
179134
180
- if (!xlog_verify_bp(log, blk_no, nbblks)) {
135
+ if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
181136 xfs_warn(log->l_mp,
182137 "Invalid log block/length (0x%llx, 0x%x) for buffer",
183138 blk_no, nbblks);
184
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
185139 return -EFSCORRUPTED;
186140 }
187141
188142 blk_no = round_down(blk_no, log->l_sectBBsize);
189143 nbblks = round_up(nbblks, log->l_sectBBsize);
190
-
191144 ASSERT(nbblks > 0);
192
- ASSERT(nbblks <= bp->b_length);
193145
194
- XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
195
- bp->b_flags |= XBF_READ;
196
- bp->b_io_length = nbblks;
197
- bp->b_error = 0;
198
-
199
- error = xfs_buf_submit(bp);
200
- if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
201
- xfs_buf_ioerror_alert(bp, __func__);
146
+ error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
147
+ BBTOB(nbblks), data, op);
148
+ if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) {
149
+ xfs_alert(log->l_mp,
150
+ "log recovery %s I/O error at daddr 0x%llx len %d error %d",
151
+ op == REQ_OP_WRITE ? "write" : "read",
152
+ blk_no, nbblks, error);
153
+ }
202154 return error;
155
+}
156
+
157
+STATIC int
158
+xlog_bread_noalign(
159
+ struct xlog *log,
160
+ xfs_daddr_t blk_no,
161
+ int nbblks,
162
+ char *data)
163
+{
164
+ return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
203165 }
204166
205167 STATIC int
....@@ -207,87 +169,25 @@
207169 struct xlog *log,
208170 xfs_daddr_t blk_no,
209171 int nbblks,
210
- struct xfs_buf *bp,
172
+ char *data,
211173 char **offset)
212174 {
213175 int error;
214176
215
- error = xlog_bread_noalign(log, blk_no, nbblks, bp);
216
- if (error)
217
- return error;
218
-
219
- *offset = xlog_align(log, blk_no, nbblks, bp);
220
- return 0;
177
+ error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
178
+ if (!error)
179
+ *offset = data + xlog_align(log, blk_no);
180
+ return error;
221181 }
222182
223
-/*
224
- * Read at an offset into the buffer. Returns with the buffer in it's original
225
- * state regardless of the result of the read.
226
- */
227
-STATIC int
228
-xlog_bread_offset(
229
- struct xlog *log,
230
- xfs_daddr_t blk_no, /* block to read from */
231
- int nbblks, /* blocks to read */
232
- struct xfs_buf *bp,
233
- char *offset)
234
-{
235
- char *orig_offset = bp->b_addr;
236
- int orig_len = BBTOB(bp->b_length);
237
- int error, error2;
238
-
239
- error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
240
- if (error)
241
- return error;
242
-
243
- error = xlog_bread_noalign(log, blk_no, nbblks, bp);
244
-
245
- /* must reset buffer pointer even on error */
246
- error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
247
- if (error)
248
- return error;
249
- return error2;
250
-}
251
-
252
-/*
253
- * Write out the buffer at the given block for the given number of blocks.
254
- * The buffer is kept locked across the write and is returned locked.
255
- * This can only be used for synchronous log writes.
256
- */
257183 STATIC int
258184 xlog_bwrite(
259185 struct xlog *log,
260186 xfs_daddr_t blk_no,
261187 int nbblks,
262
- struct xfs_buf *bp)
188
+ char *data)
263189 {
264
- int error;
265
-
266
- if (!xlog_verify_bp(log, blk_no, nbblks)) {
267
- xfs_warn(log->l_mp,
268
- "Invalid log block/length (0x%llx, 0x%x) for buffer",
269
- blk_no, nbblks);
270
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
271
- return -EFSCORRUPTED;
272
- }
273
-
274
- blk_no = round_down(blk_no, log->l_sectBBsize);
275
- nbblks = round_up(nbblks, log->l_sectBBsize);
276
-
277
- ASSERT(nbblks > 0);
278
- ASSERT(nbblks <= bp->b_length);
279
-
280
- XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
281
- xfs_buf_hold(bp);
282
- xfs_buf_lock(bp);
283
- bp->b_io_length = nbblks;
284
- bp->b_error = 0;
285
-
286
- error = xfs_bwrite(bp);
287
- if (error)
288
- xfs_buf_ioerror_alert(bp, __func__);
289
- xfs_buf_relse(bp);
290
- return error;
190
+ return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
291191 }
292192
293193 #ifdef DEBUG
....@@ -323,19 +223,17 @@
323223 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
324224 * a dirty log created in IRIX.
325225 */
326
- if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
226
+ if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
327227 xfs_warn(mp,
328228 "dirty log written in incompatible format - can't recover");
329229 xlog_header_check_dump(mp, head);
330
- XFS_ERROR_REPORT("xlog_header_check_recover(1)",
331
- XFS_ERRLEVEL_HIGH, mp);
332230 return -EFSCORRUPTED;
333
- } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
231
+ }
232
+ if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
233
+ &head->h_fs_uuid))) {
334234 xfs_warn(mp,
335235 "dirty log entry has mismatched uuid - can't recover");
336236 xlog_header_check_dump(mp, head);
337
- XFS_ERROR_REPORT("xlog_header_check_recover(2)",
338
- XFS_ERRLEVEL_HIGH, mp);
339237 return -EFSCORRUPTED;
340238 }
341239 return 0;
....@@ -358,42 +256,13 @@
358256 * by IRIX and continue.
359257 */
360258 xfs_warn(mp, "null uuid in log - IRIX style log");
361
- } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
259
+ } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
260
+ &head->h_fs_uuid))) {
362261 xfs_warn(mp, "log has mismatched uuid - can't recover");
363262 xlog_header_check_dump(mp, head);
364
- XFS_ERROR_REPORT("xlog_header_check_mount",
365
- XFS_ERRLEVEL_HIGH, mp);
366263 return -EFSCORRUPTED;
367264 }
368265 return 0;
369
-}
370
-
371
-STATIC void
372
-xlog_recover_iodone(
373
- struct xfs_buf *bp)
374
-{
375
- if (bp->b_error) {
376
- /*
377
- * We're not going to bother about retrying
378
- * this during recovery. One strike!
379
- */
380
- if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
381
- xfs_buf_ioerror_alert(bp, __func__);
382
- xfs_force_shutdown(bp->b_target->bt_mount,
383
- SHUTDOWN_META_IO_ERROR);
384
- }
385
- }
386
-
387
- /*
388
- * On v5 supers, a bli could be attached to update the metadata LSN.
389
- * Clean it up.
390
- */
391
- if (bp->b_log_item)
392
- xfs_buf_item_relse(bp);
393
- ASSERT(bp->b_log_item == NULL);
394
-
395
- bp->b_iodone = NULL;
396
- xfs_buf_ioend(bp);
397266 }
398267
399268 /*
....@@ -405,7 +274,7 @@
405274 STATIC int
406275 xlog_find_cycle_start(
407276 struct xlog *log,
408
- struct xfs_buf *bp,
277
+ char *buffer,
409278 xfs_daddr_t first_blk,
410279 xfs_daddr_t *last_blk,
411280 uint cycle)
....@@ -419,7 +288,7 @@
419288 end_blk = *last_blk;
420289 mid_blk = BLK_AVG(first_blk, end_blk);
421290 while (mid_blk != first_blk && mid_blk != end_blk) {
422
- error = xlog_bread(log, mid_blk, 1, bp, &offset);
291
+ error = xlog_bread(log, mid_blk, 1, buffer, &offset);
423292 if (error)
424293 return error;
425294 mid_cycle = xlog_get_cycle(offset);
....@@ -455,7 +324,7 @@
455324 {
456325 xfs_daddr_t i, j;
457326 uint cycle;
458
- xfs_buf_t *bp;
327
+ char *buffer;
459328 xfs_daddr_t bufblks;
460329 char *buf = NULL;
461330 int error = 0;
....@@ -469,7 +338,7 @@
469338 bufblks = 1 << ffs(nbblks);
470339 while (bufblks > log->l_logBBsize)
471340 bufblks >>= 1;
472
- while (!(bp = xlog_get_bp(log, bufblks))) {
341
+ while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
473342 bufblks >>= 1;
474343 if (bufblks < log->l_sectBBsize)
475344 return -ENOMEM;
....@@ -480,7 +349,7 @@
480349
481350 bcount = min(bufblks, (start_blk + nbblks - i));
482351
483
- error = xlog_bread(log, i, bcount, bp, &buf);
352
+ error = xlog_bread(log, i, bcount, buffer, &buf);
484353 if (error)
485354 goto out;
486355
....@@ -498,8 +367,21 @@
498367 *new_blk = -1;
499368
500369 out:
501
- xlog_put_bp(bp);
370
+ kmem_free(buffer);
502371 return error;
372
+}
373
+
374
+static inline int
375
+xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
376
+{
377
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
378
+ int h_size = be32_to_cpu(rh->h_size);
379
+
380
+ if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
381
+ h_size > XLOG_HEADER_CYCLE_SIZE)
382
+ return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
383
+ }
384
+ return 1;
503385 }
504386
505387 /*
....@@ -522,7 +404,7 @@
522404 int extra_bblks)
523405 {
524406 xfs_daddr_t i;
525
- xfs_buf_t *bp;
407
+ char *buffer;
526408 char *offset = NULL;
527409 xlog_rec_header_t *head = NULL;
528410 int error = 0;
....@@ -532,12 +414,14 @@
532414
533415 ASSERT(start_blk != 0 || *last_blk != start_blk);
534416
535
- if (!(bp = xlog_get_bp(log, num_blks))) {
536
- if (!(bp = xlog_get_bp(log, 1)))
417
+ buffer = xlog_alloc_buffer(log, num_blks);
418
+ if (!buffer) {
419
+ buffer = xlog_alloc_buffer(log, 1);
420
+ if (!buffer)
537421 return -ENOMEM;
538422 smallmem = 1;
539423 } else {
540
- error = xlog_bread(log, start_blk, num_blks, bp, &offset);
424
+ error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
541425 if (error)
542426 goto out;
543427 offset += ((num_blks - 1) << BBSHIFT);
....@@ -549,12 +433,12 @@
549433 xfs_warn(log->l_mp,
550434 "Log inconsistent (didn't find previous header)");
551435 ASSERT(0);
552
- error = -EIO;
436
+ error = -EFSCORRUPTED;
553437 goto out;
554438 }
555439
556440 if (smallmem) {
557
- error = xlog_bread(log, i, 1, bp, &offset);
441
+ error = xlog_bread(log, i, 1, buffer, &offset);
558442 if (error)
559443 goto out;
560444 }
....@@ -592,22 +476,14 @@
592476 * reset last_blk. Only when last_blk points in the middle of a log
593477 * record do we update last_blk.
594478 */
595
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
596
- uint h_size = be32_to_cpu(head->h_size);
597
-
598
- xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
599
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
600
- xhdrs++;
601
- } else {
602
- xhdrs = 1;
603
- }
479
+ xhdrs = xlog_logrec_hblks(log, head);
604480
605481 if (*last_blk - i + extra_bblks !=
606482 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
607483 *last_blk = i;
608484
609485 out:
610
- xlog_put_bp(bp);
486
+ kmem_free(buffer);
611487 return error;
612488 }
613489
....@@ -629,7 +505,7 @@
629505 struct xlog *log,
630506 xfs_daddr_t *return_head_blk)
631507 {
632
- xfs_buf_t *bp;
508
+ char *buffer;
633509 char *offset;
634510 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
635511 int num_scan_bblks;
....@@ -659,20 +535,20 @@
659535 }
660536
661537 first_blk = 0; /* get cycle # of 1st block */
662
- bp = xlog_get_bp(log, 1);
663
- if (!bp)
538
+ buffer = xlog_alloc_buffer(log, 1);
539
+ if (!buffer)
664540 return -ENOMEM;
665541
666
- error = xlog_bread(log, 0, 1, bp, &offset);
542
+ error = xlog_bread(log, 0, 1, buffer, &offset);
667543 if (error)
668
- goto bp_err;
544
+ goto out_free_buffer;
669545
670546 first_half_cycle = xlog_get_cycle(offset);
671547
672548 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
673
- error = xlog_bread(log, last_blk, 1, bp, &offset);
549
+ error = xlog_bread(log, last_blk, 1, buffer, &offset);
674550 if (error)
675
- goto bp_err;
551
+ goto out_free_buffer;
676552
677553 last_half_cycle = xlog_get_cycle(offset);
678554 ASSERT(last_half_cycle != 0);
....@@ -740,9 +616,10 @@
740616 * ^ we want to locate this spot
741617 */
742618 stop_on_cycle = last_half_cycle;
743
- if ((error = xlog_find_cycle_start(log, bp, first_blk,
744
- &head_blk, last_half_cycle)))
745
- goto bp_err;
619
+ error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
620
+ last_half_cycle);
621
+ if (error)
622
+ goto out_free_buffer;
746623 }
747624
748625 /*
....@@ -762,7 +639,7 @@
762639 if ((error = xlog_find_verify_cycle(log,
763640 start_blk, num_scan_bblks,
764641 stop_on_cycle, &new_blk)))
765
- goto bp_err;
642
+ goto out_free_buffer;
766643 if (new_blk != -1)
767644 head_blk = new_blk;
768645 } else { /* need to read 2 parts of log */
....@@ -799,7 +676,7 @@
799676 if ((error = xlog_find_verify_cycle(log, start_blk,
800677 num_scan_bblks - (int)head_blk,
801678 (stop_on_cycle - 1), &new_blk)))
802
- goto bp_err;
679
+ goto out_free_buffer;
803680 if (new_blk != -1) {
804681 head_blk = new_blk;
805682 goto validate_head;
....@@ -815,7 +692,7 @@
815692 if ((error = xlog_find_verify_cycle(log,
816693 start_blk, (int)head_blk,
817694 stop_on_cycle, &new_blk)))
818
- goto bp_err;
695
+ goto out_free_buffer;
819696 if (new_blk != -1)
820697 head_blk = new_blk;
821698 }
....@@ -834,13 +711,13 @@
834711 if (error == 1)
835712 error = -EIO;
836713 if (error)
837
- goto bp_err;
714
+ goto out_free_buffer;
838715 } else {
839716 start_blk = 0;
840717 ASSERT(head_blk <= INT_MAX);
841718 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
842719 if (error < 0)
843
- goto bp_err;
720
+ goto out_free_buffer;
844721 if (error == 1) {
845722 /* We hit the beginning of the log during our search */
846723 start_blk = log_bbnum - (num_scan_bblks - head_blk);
....@@ -853,14 +730,14 @@
853730 if (error == 1)
854731 error = -EIO;
855732 if (error)
856
- goto bp_err;
733
+ goto out_free_buffer;
857734 if (new_blk != log_bbnum)
858735 head_blk = new_blk;
859736 } else if (error)
860
- goto bp_err;
737
+ goto out_free_buffer;
861738 }
862739
863
- xlog_put_bp(bp);
740
+ kmem_free(buffer);
864741 if (head_blk == log_bbnum)
865742 *return_head_blk = 0;
866743 else
....@@ -873,9 +750,8 @@
873750 */
874751 return 0;
875752
876
- bp_err:
877
- xlog_put_bp(bp);
878
-
753
+out_free_buffer:
754
+ kmem_free(buffer);
879755 if (error)
880756 xfs_warn(log->l_mp, "failed to find log head");
881757 return error;
....@@ -895,7 +771,7 @@
895771 xfs_daddr_t head_blk,
896772 xfs_daddr_t tail_blk,
897773 int count,
898
- struct xfs_buf *bp,
774
+ char *buffer,
899775 xfs_daddr_t *rblk,
900776 struct xlog_rec_header **rhead,
901777 bool *wrapped)
....@@ -914,7 +790,7 @@
914790 */
915791 end_blk = head_blk > tail_blk ? tail_blk : 0;
916792 for (i = (int) head_blk - 1; i >= end_blk; i--) {
917
- error = xlog_bread(log, i, 1, bp, &offset);
793
+ error = xlog_bread(log, i, 1, buffer, &offset);
918794 if (error)
919795 goto out_error;
920796
....@@ -933,7 +809,7 @@
933809 */
934810 if (tail_blk >= head_blk && found != count) {
935811 for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
936
- error = xlog_bread(log, i, 1, bp, &offset);
812
+ error = xlog_bread(log, i, 1, buffer, &offset);
937813 if (error)
938814 goto out_error;
939815
....@@ -969,7 +845,7 @@
969845 xfs_daddr_t head_blk,
970846 xfs_daddr_t tail_blk,
971847 int count,
972
- struct xfs_buf *bp,
848
+ char *buffer,
973849 xfs_daddr_t *rblk,
974850 struct xlog_rec_header **rhead,
975851 bool *wrapped)
....@@ -988,7 +864,7 @@
988864 */
989865 end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
990866 for (i = (int) tail_blk; i <= end_blk; i++) {
991
- error = xlog_bread(log, i, 1, bp, &offset);
867
+ error = xlog_bread(log, i, 1, buffer, &offset);
992868 if (error)
993869 goto out_error;
994870
....@@ -1006,7 +882,7 @@
1006882 */
1007883 if (tail_blk > head_blk && found != count) {
1008884 for (i = 0; i < (int) head_blk; i++) {
1009
- error = xlog_bread(log, i, 1, bp, &offset);
885
+ error = xlog_bread(log, i, 1, buffer, &offset);
1010886 if (error)
1011887 goto out_error;
1012888
....@@ -1069,22 +945,22 @@
1069945 int hsize)
1070946 {
1071947 struct xlog_rec_header *thead;
1072
- struct xfs_buf *bp;
948
+ char *buffer;
1073949 xfs_daddr_t first_bad;
1074950 int error = 0;
1075951 bool wrapped;
1076952 xfs_daddr_t tmp_tail;
1077953 xfs_daddr_t orig_tail = *tail_blk;
1078954
1079
- bp = xlog_get_bp(log, 1);
1080
- if (!bp)
955
+ buffer = xlog_alloc_buffer(log, 1);
956
+ if (!buffer)
1081957 return -ENOMEM;
1082958
1083959 /*
1084960 * Make sure the tail points to a record (returns positive count on
1085961 * success).
1086962 */
1087
- error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
963
+ error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
1088964 &tmp_tail, &thead, &wrapped);
1089965 if (error < 0)
1090966 goto out;
....@@ -1113,8 +989,8 @@
1113989 break;
1114990
1115991 /* skip to the next record; returns positive count on success */
1116
- error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
1117
- &tmp_tail, &thead, &wrapped);
992
+ error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
993
+ buffer, &tmp_tail, &thead, &wrapped);
1118994 if (error < 0)
1119995 goto out;
1120996
....@@ -1129,7 +1005,7 @@
11291005 "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
11301006 orig_tail, *tail_blk);
11311007 out:
1132
- xlog_put_bp(bp);
1008
+ kmem_free(buffer);
11331009 return error;
11341010 }
11351011
....@@ -1151,13 +1027,13 @@
11511027 struct xlog *log,
11521028 xfs_daddr_t *head_blk, /* in/out: unverified head */
11531029 xfs_daddr_t *tail_blk, /* out: tail block */
1154
- struct xfs_buf *bp,
1030
+ char *buffer,
11551031 xfs_daddr_t *rhead_blk, /* start blk of last record */
11561032 struct xlog_rec_header **rhead, /* ptr to last record */
11571033 bool *wrapped) /* last rec. wraps phys. log */
11581034 {
11591035 struct xlog_rec_header *tmp_rhead;
1160
- struct xfs_buf *tmp_bp;
1036
+ char *tmp_buffer;
11611037 xfs_daddr_t first_bad;
11621038 xfs_daddr_t tmp_rhead_blk;
11631039 int found;
....@@ -1168,15 +1044,15 @@
11681044 * Check the head of the log for torn writes. Search backwards from the
11691045 * head until we hit the tail or the maximum number of log record I/Os
11701046 * that could have been in flight at one time. Use a temporary buffer so
1171
- * we don't trash the rhead/bp pointers from the caller.
1047
+ * we don't trash the rhead/buffer pointers from the caller.
11721048 */
1173
- tmp_bp = xlog_get_bp(log, 1);
1174
- if (!tmp_bp)
1049
+ tmp_buffer = xlog_alloc_buffer(log, 1);
1050
+ if (!tmp_buffer)
11751051 return -ENOMEM;
11761052 error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1177
- XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
1178
- &tmp_rhead, &tmp_wrapped);
1179
- xlog_put_bp(tmp_bp);
1053
+ XLOG_MAX_ICLOGS, tmp_buffer,
1054
+ &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
1055
+ kmem_free(tmp_buffer);
11801056 if (error < 0)
11811057 return error;
11821058
....@@ -1203,10 +1079,10 @@
12031079 *
12041080 * Note that xlog_find_tail() clears the blocks at the new head
12051081 * (i.e., the records with invalid CRC) if the cycle number
1206
- * matches the the current cycle.
1082
+ * matches the current cycle.
12071083 */
1208
- found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
1209
- rhead_blk, rhead, wrapped);
1084
+ found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
1085
+ buffer, rhead_blk, rhead, wrapped);
12101086 if (found < 0)
12111087 return found;
12121088 if (found == 0) /* XXX: right thing to do here? */
....@@ -1266,7 +1142,7 @@
12661142 xfs_daddr_t *tail_blk,
12671143 struct xlog_rec_header *rhead,
12681144 xfs_daddr_t rhead_blk,
1269
- struct xfs_buf *bp,
1145
+ char *buffer,
12701146 bool *clean)
12711147 {
12721148 struct xlog_op_header *op_head;
....@@ -1287,29 +1163,14 @@
12871163 * below. We won't want to clear the unmount record if there is one, so
12881164 * we pass the lsn of the unmount record rather than the block after it.
12891165 */
1290
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1291
- int h_size = be32_to_cpu(rhead->h_size);
1292
- int h_version = be32_to_cpu(rhead->h_version);
1293
-
1294
- if ((h_version & XLOG_VERSION_2) &&
1295
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1296
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1297
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
1298
- hblks++;
1299
- } else {
1300
- hblks = 1;
1301
- }
1302
- } else {
1303
- hblks = 1;
1304
- }
1305
-
1166
+ hblks = xlog_logrec_hblks(log, rhead);
13061167 after_umount_blk = xlog_wrap_logbno(log,
13071168 rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
13081169
13091170 if (*head_blk == after_umount_blk &&
13101171 be32_to_cpu(rhead->h_num_logops) == 1) {
13111172 umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
1312
- error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1173
+ error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
13131174 if (error)
13141175 return error;
13151176
....@@ -1388,7 +1249,7 @@
13881249 {
13891250 xlog_rec_header_t *rhead;
13901251 char *offset = NULL;
1391
- xfs_buf_t *bp;
1252
+ char *buffer;
13921253 int error;
13931254 xfs_daddr_t rhead_blk;
13941255 xfs_lsn_t tail_lsn;
....@@ -1402,11 +1263,11 @@
14021263 return error;
14031264 ASSERT(*head_blk < INT_MAX);
14041265
1405
- bp = xlog_get_bp(log, 1);
1406
- if (!bp)
1266
+ buffer = xlog_alloc_buffer(log, 1);
1267
+ if (!buffer)
14071268 return -ENOMEM;
14081269 if (*head_blk == 0) { /* special case */
1409
- error = xlog_bread(log, 0, 1, bp, &offset);
1270
+ error = xlog_bread(log, 0, 1, buffer, &offset);
14101271 if (error)
14111272 goto done;
14121273
....@@ -1422,13 +1283,14 @@
14221283 * block. This wraps all the way back around to the head so something is
14231284 * seriously wrong if we can't find it.
14241285 */
1425
- error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
1286
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
14261287 &rhead_blk, &rhead, &wrapped);
14271288 if (error < 0)
1428
- return error;
1289
+ goto done;
14291290 if (!error) {
14301291 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1431
- return -EIO;
1292
+ error = -EFSCORRUPTED;
1293
+ goto done;
14321294 }
14331295 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
14341296
....@@ -1443,7 +1305,7 @@
14431305 * state to determine whether recovery is necessary.
14441306 */
14451307 error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
1446
- rhead_blk, bp, &clean);
1308
+ rhead_blk, buffer, &clean);
14471309 if (error)
14481310 goto done;
14491311
....@@ -1460,7 +1322,7 @@
14601322 if (!clean) {
14611323 xfs_daddr_t orig_head = *head_blk;
14621324
1463
- error = xlog_verify_head(log, head_blk, tail_blk, bp,
1325
+ error = xlog_verify_head(log, head_blk, tail_blk, buffer,
14641326 &rhead_blk, &rhead, &wrapped);
14651327 if (error)
14661328 goto done;
....@@ -1471,7 +1333,7 @@
14711333 wrapped);
14721334 tail_lsn = atomic64_read(&log->l_tail_lsn);
14731335 error = xlog_check_unmount_rec(log, head_blk, tail_blk,
1474
- rhead, rhead_blk, bp,
1336
+ rhead, rhead_blk, buffer,
14751337 &clean);
14761338 if (error)
14771339 goto done;
....@@ -1505,11 +1367,11 @@
15051367 * But... if the -device- itself is readonly, just skip this.
15061368 * We can't recover this device anyway, so it won't matter.
15071369 */
1508
- if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1370
+ if (!xfs_readonly_buftarg(log->l_targ))
15091371 error = xlog_clear_stale_blocks(log, tail_lsn);
15101372
15111373 done:
1512
- xlog_put_bp(bp);
1374
+ kmem_free(buffer);
15131375
15141376 if (error)
15151377 xfs_warn(log->l_mp, "failed to locate log tail");
....@@ -1537,7 +1399,7 @@
15371399 struct xlog *log,
15381400 xfs_daddr_t *blk_no)
15391401 {
1540
- xfs_buf_t *bp;
1402
+ char *buffer;
15411403 char *offset;
15421404 uint first_cycle, last_cycle;
15431405 xfs_daddr_t new_blk, last_blk, start_blk;
....@@ -1547,35 +1409,36 @@
15471409 *blk_no = 0;
15481410
15491411 /* check totally zeroed log */
1550
- bp = xlog_get_bp(log, 1);
1551
- if (!bp)
1412
+ buffer = xlog_alloc_buffer(log, 1);
1413
+ if (!buffer)
15521414 return -ENOMEM;
1553
- error = xlog_bread(log, 0, 1, bp, &offset);
1415
+ error = xlog_bread(log, 0, 1, buffer, &offset);
15541416 if (error)
1555
- goto bp_err;
1417
+ goto out_free_buffer;
15561418
15571419 first_cycle = xlog_get_cycle(offset);
15581420 if (first_cycle == 0) { /* completely zeroed log */
15591421 *blk_no = 0;
1560
- xlog_put_bp(bp);
1422
+ kmem_free(buffer);
15611423 return 1;
15621424 }
15631425
15641426 /* check partially zeroed log */
1565
- error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1427
+ error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
15661428 if (error)
1567
- goto bp_err;
1429
+ goto out_free_buffer;
15681430
15691431 last_cycle = xlog_get_cycle(offset);
15701432 if (last_cycle != 0) { /* log completely written to */
1571
- xlog_put_bp(bp);
1433
+ kmem_free(buffer);
15721434 return 0;
15731435 }
15741436
15751437 /* we have a partially zeroed log */
15761438 last_blk = log_bbnum-1;
1577
- if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1578
- goto bp_err;
1439
+ error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
1440
+ if (error)
1441
+ goto out_free_buffer;
15791442
15801443 /*
15811444 * Validate the answer. Because there is no way to guarantee that
....@@ -1598,7 +1461,7 @@
15981461 */
15991462 if ((error = xlog_find_verify_cycle(log, start_blk,
16001463 (int)num_scan_bblks, 0, &new_blk)))
1601
- goto bp_err;
1464
+ goto out_free_buffer;
16021465 if (new_blk != -1)
16031466 last_blk = new_blk;
16041467
....@@ -1610,11 +1473,11 @@
16101473 if (error == 1)
16111474 error = -EIO;
16121475 if (error)
1613
- goto bp_err;
1476
+ goto out_free_buffer;
16141477
16151478 *blk_no = last_blk;
1616
-bp_err:
1617
- xlog_put_bp(bp);
1479
+out_free_buffer:
1480
+ kmem_free(buffer);
16181481 if (error)
16191482 return error;
16201483 return 1;
....@@ -1657,7 +1520,7 @@
16571520 int tail_block)
16581521 {
16591522 char *offset;
1660
- xfs_buf_t *bp;
1523
+ char *buffer;
16611524 int balign, ealign;
16621525 int sectbb = log->l_sectBBsize;
16631526 int end_block = start_block + blocks;
....@@ -1674,7 +1537,7 @@
16741537 bufblks = 1 << ffs(blocks);
16751538 while (bufblks > log->l_logBBsize)
16761539 bufblks >>= 1;
1677
- while (!(bp = xlog_get_bp(log, bufblks))) {
1540
+ while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
16781541 bufblks >>= 1;
16791542 if (bufblks < sectbb)
16801543 return -ENOMEM;
....@@ -1686,9 +1549,9 @@
16861549 */
16871550 balign = round_down(start_block, sectbb);
16881551 if (balign != start_block) {
1689
- error = xlog_bread_noalign(log, start_block, 1, bp);
1552
+ error = xlog_bread_noalign(log, start_block, 1, buffer);
16901553 if (error)
1691
- goto out_put_bp;
1554
+ goto out_free_buffer;
16921555
16931556 j = start_block - balign;
16941557 }
....@@ -1705,29 +1568,28 @@
17051568 */
17061569 ealign = round_down(end_block, sectbb);
17071570 if (j == 0 && (start_block + endcount > ealign)) {
1708
- offset = bp->b_addr + BBTOB(ealign - start_block);
1709
- error = xlog_bread_offset(log, ealign, sectbb,
1710
- bp, offset);
1571
+ error = xlog_bread_noalign(log, ealign, sectbb,
1572
+ buffer + BBTOB(ealign - start_block));
17111573 if (error)
17121574 break;
17131575
17141576 }
17151577
1716
- offset = xlog_align(log, start_block, endcount, bp);
1578
+ offset = buffer + xlog_align(log, start_block);
17171579 for (; j < endcount; j++) {
17181580 xlog_add_record(log, offset, cycle, i+j,
17191581 tail_cycle, tail_block);
17201582 offset += BBSIZE;
17211583 }
1722
- error = xlog_bwrite(log, start_block, endcount, bp);
1584
+ error = xlog_bwrite(log, start_block, endcount, buffer);
17231585 if (error)
17241586 break;
17251587 start_block += endcount;
17261588 j = 0;
17271589 }
17281590
1729
- out_put_bp:
1730
- xlog_put_bp(bp);
1591
+out_free_buffer:
1592
+ kmem_free(buffer);
17311593 return error;
17321594 }
17331595
....@@ -1777,11 +1639,10 @@
17771639 * the distance from the beginning of the log to the
17781640 * tail.
17791641 */
1780
- if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1781
- XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1782
- XFS_ERRLEVEL_LOW, log->l_mp);
1642
+ if (XFS_IS_CORRUPT(log->l_mp,
1643
+ head_block < tail_block ||
1644
+ head_block >= log->l_logBBsize))
17831645 return -EFSCORRUPTED;
1784
- }
17851646 tail_distance = tail_block + (log->l_logBBsize - head_block);
17861647 } else {
17871648 /*
....@@ -1789,11 +1650,10 @@
17891650 * so the distance from the head to the tail is just
17901651 * the tail block minus the head block.
17911652 */
1792
- if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1793
- XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1794
- XFS_ERRLEVEL_LOW, log->l_mp);
1653
+ if (XFS_IS_CORRUPT(log->l_mp,
1654
+ head_block >= tail_block ||
1655
+ head_cycle != tail_cycle + 1))
17951656 return -EFSCORRUPTED;
1796
- }
17971657 tail_distance = tail_block - head_block;
17981658 }
17991659
....@@ -1863,12 +1723,72 @@
18631723 return 0;
18641724 }
18651725
1726
+/*
1727
+ * Release the recovered intent item in the AIL that matches the given intent
1728
+ * type and intent id.
1729
+ */
1730
+void
1731
+xlog_recover_release_intent(
1732
+ struct xlog *log,
1733
+ unsigned short intent_type,
1734
+ uint64_t intent_id)
1735
+{
1736
+ struct xfs_ail_cursor cur;
1737
+ struct xfs_log_item *lip;
1738
+ struct xfs_ail *ailp = log->l_ailp;
1739
+
1740
+ spin_lock(&ailp->ail_lock);
1741
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
1742
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
1743
+ if (lip->li_type != intent_type)
1744
+ continue;
1745
+ if (!lip->li_ops->iop_match(lip, intent_id))
1746
+ continue;
1747
+
1748
+ spin_unlock(&ailp->ail_lock);
1749
+ lip->li_ops->iop_release(lip);
1750
+ spin_lock(&ailp->ail_lock);
1751
+ break;
1752
+ }
1753
+
1754
+ xfs_trans_ail_cursor_done(&cur);
1755
+ spin_unlock(&ailp->ail_lock);
1756
+}
1757
+
18661758 /******************************************************************************
18671759 *
18681760 * Log recover routines
18691761 *
18701762 ******************************************************************************
18711763 */
1764
+static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
1765
+ &xlog_buf_item_ops,
1766
+ &xlog_inode_item_ops,
1767
+ &xlog_dquot_item_ops,
1768
+ &xlog_quotaoff_item_ops,
1769
+ &xlog_icreate_item_ops,
1770
+ &xlog_efi_item_ops,
1771
+ &xlog_efd_item_ops,
1772
+ &xlog_rui_item_ops,
1773
+ &xlog_rud_item_ops,
1774
+ &xlog_cui_item_ops,
1775
+ &xlog_cud_item_ops,
1776
+ &xlog_bui_item_ops,
1777
+ &xlog_bud_item_ops,
1778
+};
1779
+
1780
+static const struct xlog_recover_item_ops *
1781
+xlog_find_item_ops(
1782
+ struct xlog_recover_item *item)
1783
+{
1784
+ unsigned int i;
1785
+
1786
+ for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
1787
+ if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
1788
+ return xlog_recover_item_ops[i];
1789
+
1790
+ return NULL;
1791
+}
18721792
18731793 /*
18741794 * Sort the log items in the transaction.
....@@ -1925,54 +1845,23 @@
19251845 struct xlog_recover *trans,
19261846 int pass)
19271847 {
1928
- xlog_recover_item_t *item, *n;
1848
+ struct xlog_recover_item *item, *n;
19291849 int error = 0;
19301850 LIST_HEAD(sort_list);
19311851 LIST_HEAD(cancel_list);
19321852 LIST_HEAD(buffer_list);
19331853 LIST_HEAD(inode_buffer_list);
1934
- LIST_HEAD(inode_list);
1854
+ LIST_HEAD(item_list);
19351855
19361856 list_splice_init(&trans->r_itemq, &sort_list);
19371857 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1938
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1858
+ enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST;
19391859
1940
- switch (ITEM_TYPE(item)) {
1941
- case XFS_LI_ICREATE:
1942
- list_move_tail(&item->ri_list, &buffer_list);
1943
- break;
1944
- case XFS_LI_BUF:
1945
- if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1946
- trace_xfs_log_recover_item_reorder_head(log,
1947
- trans, item, pass);
1948
- list_move(&item->ri_list, &cancel_list);
1949
- break;
1950
- }
1951
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
1952
- list_move(&item->ri_list, &inode_buffer_list);
1953
- break;
1954
- }
1955
- list_move_tail(&item->ri_list, &buffer_list);
1956
- break;
1957
- case XFS_LI_INODE:
1958
- case XFS_LI_DQUOT:
1959
- case XFS_LI_QUOTAOFF:
1960
- case XFS_LI_EFD:
1961
- case XFS_LI_EFI:
1962
- case XFS_LI_RUI:
1963
- case XFS_LI_RUD:
1964
- case XFS_LI_CUI:
1965
- case XFS_LI_CUD:
1966
- case XFS_LI_BUI:
1967
- case XFS_LI_BUD:
1968
- trace_xfs_log_recover_item_reorder_tail(log,
1969
- trans, item, pass);
1970
- list_move_tail(&item->ri_list, &inode_list);
1971
- break;
1972
- default:
1860
+ item->ri_ops = xlog_find_item_ops(item);
1861
+ if (!item->ri_ops) {
19731862 xfs_warn(log->l_mp,
1974
- "%s: unrecognized type of log operation",
1975
- __func__);
1863
+ "%s: unrecognized type of log operation (%d)",
1864
+ __func__, ITEM_TYPE(item));
19761865 ASSERT(0);
19771866 /*
19781867 * return the remaining items back to the transaction
....@@ -1980,16 +1869,38 @@
19801869 */
19811870 if (!list_empty(&sort_list))
19821871 list_splice_init(&sort_list, &trans->r_itemq);
1983
- error = -EIO;
1984
- goto out;
1872
+ error = -EFSCORRUPTED;
1873
+ break;
1874
+ }
1875
+
1876
+ if (item->ri_ops->reorder)
1877
+ fate = item->ri_ops->reorder(item);
1878
+
1879
+ switch (fate) {
1880
+ case XLOG_REORDER_BUFFER_LIST:
1881
+ list_move_tail(&item->ri_list, &buffer_list);
1882
+ break;
1883
+ case XLOG_REORDER_CANCEL_LIST:
1884
+ trace_xfs_log_recover_item_reorder_head(log,
1885
+ trans, item, pass);
1886
+ list_move(&item->ri_list, &cancel_list);
1887
+ break;
1888
+ case XLOG_REORDER_INODE_BUFFER_LIST:
1889
+ list_move(&item->ri_list, &inode_buffer_list);
1890
+ break;
1891
+ case XLOG_REORDER_ITEM_LIST:
1892
+ trace_xfs_log_recover_item_reorder_tail(log,
1893
+ trans, item, pass);
1894
+ list_move_tail(&item->ri_list, &item_list);
1895
+ break;
19851896 }
19861897 }
1987
-out:
1898
+
19881899 ASSERT(list_empty(&sort_list));
19891900 if (!list_empty(&buffer_list))
19901901 list_splice(&buffer_list, &trans->r_itemq);
1991
- if (!list_empty(&inode_list))
1992
- list_splice_tail(&inode_list, &trans->r_itemq);
1902
+ if (!list_empty(&item_list))
1903
+ list_splice_tail(&item_list, &trans->r_itemq);
19931904 if (!list_empty(&inode_buffer_list))
19941905 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
19951906 if (!list_empty(&cancel_list))
....@@ -1997,2154 +1908,15 @@
19971908 return error;
19981909 }
19991910
2000
-/*
2001
- * Build up the table of buf cancel records so that we don't replay
2002
- * cancelled data in the second pass. For buffer records that are
2003
- * not cancel records, there is nothing to do here so we just return.
2004
- *
2005
- * If we get a cancel record which is already in the table, this indicates
2006
- * that the buffer was cancelled multiple times. In order to ensure
2007
- * that during pass 2 we keep the record in the table until we reach its
2008
- * last occurrence in the log, we keep a reference count in the cancel
2009
- * record in the table to tell us how many times we expect to see this
2010
- * record during the second pass.
2011
- */
2012
-STATIC int
2013
-xlog_recover_buffer_pass1(
2014
- struct xlog *log,
2015
- struct xlog_recover_item *item)
2016
-{
2017
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2018
- struct list_head *bucket;
2019
- struct xfs_buf_cancel *bcp;
2020
-
2021
- /*
2022
- * If this isn't a cancel buffer item, then just return.
2023
- */
2024
- if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
2025
- trace_xfs_log_recover_buf_not_cancel(log, buf_f);
2026
- return 0;
2027
- }
2028
-
2029
- /*
2030
- * Insert an xfs_buf_cancel record into the hash table of them.
2031
- * If there is already an identical record, bump its reference count.
2032
- */
2033
- bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
2034
- list_for_each_entry(bcp, bucket, bc_list) {
2035
- if (bcp->bc_blkno == buf_f->blf_blkno &&
2036
- bcp->bc_len == buf_f->blf_len) {
2037
- bcp->bc_refcount++;
2038
- trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
2039
- return 0;
2040
- }
2041
- }
2042
-
2043
- bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
2044
- bcp->bc_blkno = buf_f->blf_blkno;
2045
- bcp->bc_len = buf_f->blf_len;
2046
- bcp->bc_refcount = 1;
2047
- list_add_tail(&bcp->bc_list, bucket);
2048
-
2049
- trace_xfs_log_recover_buf_cancel_add(log, buf_f);
2050
- return 0;
2051
-}
2052
-
2053
-/*
2054
- * Check to see whether the buffer being recovered has a corresponding
2055
- * entry in the buffer cancel record table. If it is, return the cancel
2056
- * buffer structure to the caller.
2057
- */
2058
-STATIC struct xfs_buf_cancel *
2059
-xlog_peek_buffer_cancelled(
1911
+void
1912
+xlog_buf_readahead(
20601913 struct xlog *log,
20611914 xfs_daddr_t blkno,
20621915 uint len,
2063
- unsigned short flags)
1916
+ const struct xfs_buf_ops *ops)
20641917 {
2065
- struct list_head *bucket;
2066
- struct xfs_buf_cancel *bcp;
2067
-
2068
- if (!log->l_buf_cancel_table) {
2069
- /* empty table means no cancelled buffers in the log */
2070
- ASSERT(!(flags & XFS_BLF_CANCEL));
2071
- return NULL;
2072
- }
2073
-
2074
- bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
2075
- list_for_each_entry(bcp, bucket, bc_list) {
2076
- if (bcp->bc_blkno == blkno && bcp->bc_len == len)
2077
- return bcp;
2078
- }
2079
-
2080
- /*
2081
- * We didn't find a corresponding entry in the table, so return 0 so
2082
- * that the buffer is NOT cancelled.
2083
- */
2084
- ASSERT(!(flags & XFS_BLF_CANCEL));
2085
- return NULL;
2086
-}
2087
-
2088
-/*
2089
- * If the buffer is being cancelled then return 1 so that it will be cancelled,
2090
- * otherwise return 0. If the buffer is actually a buffer cancel item
2091
- * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
2092
- * table and remove it from the table if this is the last reference.
2093
- *
2094
- * We remove the cancel record from the table when we encounter its last
2095
- * occurrence in the log so that if the same buffer is re-used again after its
2096
- * last cancellation we actually replay the changes made at that point.
2097
- */
2098
-STATIC int
2099
-xlog_check_buffer_cancelled(
2100
- struct xlog *log,
2101
- xfs_daddr_t blkno,
2102
- uint len,
2103
- unsigned short flags)
2104
-{
2105
- struct xfs_buf_cancel *bcp;
2106
-
2107
- bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
2108
- if (!bcp)
2109
- return 0;
2110
-
2111
- /*
2112
- * We've go a match, so return 1 so that the recovery of this buffer
2113
- * is cancelled. If this buffer is actually a buffer cancel log
2114
- * item, then decrement the refcount on the one in the table and
2115
- * remove it if this is the last reference.
2116
- */
2117
- if (flags & XFS_BLF_CANCEL) {
2118
- if (--bcp->bc_refcount == 0) {
2119
- list_del(&bcp->bc_list);
2120
- kmem_free(bcp);
2121
- }
2122
- }
2123
- return 1;
2124
-}
2125
-
2126
-/*
2127
- * Perform recovery for a buffer full of inodes. In these buffers, the only
2128
- * data which should be recovered is that which corresponds to the
2129
- * di_next_unlinked pointers in the on disk inode structures. The rest of the
2130
- * data for the inodes is always logged through the inodes themselves rather
2131
- * than the inode buffer and is recovered in xlog_recover_inode_pass2().
2132
- *
2133
- * The only time when buffers full of inodes are fully recovered is when the
2134
- * buffer is full of newly allocated inodes. In this case the buffer will
2135
- * not be marked as an inode buffer and so will be sent to
2136
- * xlog_recover_do_reg_buffer() below during recovery.
2137
- */
2138
-STATIC int
2139
-xlog_recover_do_inode_buffer(
2140
- struct xfs_mount *mp,
2141
- xlog_recover_item_t *item,
2142
- struct xfs_buf *bp,
2143
- xfs_buf_log_format_t *buf_f)
2144
-{
2145
- int i;
2146
- int item_index = 0;
2147
- int bit = 0;
2148
- int nbits = 0;
2149
- int reg_buf_offset = 0;
2150
- int reg_buf_bytes = 0;
2151
- int next_unlinked_offset;
2152
- int inodes_per_buf;
2153
- xfs_agino_t *logged_nextp;
2154
- xfs_agino_t *buffer_nextp;
2155
-
2156
- trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
2157
-
2158
- /*
2159
- * Post recovery validation only works properly on CRC enabled
2160
- * filesystems.
2161
- */
2162
- if (xfs_sb_version_hascrc(&mp->m_sb))
2163
- bp->b_ops = &xfs_inode_buf_ops;
2164
-
2165
- inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
2166
- for (i = 0; i < inodes_per_buf; i++) {
2167
- next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
2168
- offsetof(xfs_dinode_t, di_next_unlinked);
2169
-
2170
- while (next_unlinked_offset >=
2171
- (reg_buf_offset + reg_buf_bytes)) {
2172
- /*
2173
- * The next di_next_unlinked field is beyond
2174
- * the current logged region. Find the next
2175
- * logged region that contains or is beyond
2176
- * the current di_next_unlinked field.
2177
- */
2178
- bit += nbits;
2179
- bit = xfs_next_bit(buf_f->blf_data_map,
2180
- buf_f->blf_map_size, bit);
2181
-
2182
- /*
2183
- * If there are no more logged regions in the
2184
- * buffer, then we're done.
2185
- */
2186
- if (bit == -1)
2187
- return 0;
2188
-
2189
- nbits = xfs_contig_bits(buf_f->blf_data_map,
2190
- buf_f->blf_map_size, bit);
2191
- ASSERT(nbits > 0);
2192
- reg_buf_offset = bit << XFS_BLF_SHIFT;
2193
- reg_buf_bytes = nbits << XFS_BLF_SHIFT;
2194
- item_index++;
2195
- }
2196
-
2197
- /*
2198
- * If the current logged region starts after the current
2199
- * di_next_unlinked field, then move on to the next
2200
- * di_next_unlinked field.
2201
- */
2202
- if (next_unlinked_offset < reg_buf_offset)
2203
- continue;
2204
-
2205
- ASSERT(item->ri_buf[item_index].i_addr != NULL);
2206
- ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
2207
- ASSERT((reg_buf_offset + reg_buf_bytes) <=
2208
- BBTOB(bp->b_io_length));
2209
-
2210
- /*
2211
- * The current logged region contains a copy of the
2212
- * current di_next_unlinked field. Extract its value
2213
- * and copy it to the buffer copy.
2214
- */
2215
- logged_nextp = item->ri_buf[item_index].i_addr +
2216
- next_unlinked_offset - reg_buf_offset;
2217
- if (unlikely(*logged_nextp == 0)) {
2218
- xfs_alert(mp,
2219
- "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
2220
- "Trying to replay bad (0) inode di_next_unlinked field.",
2221
- item, bp);
2222
- XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
2223
- XFS_ERRLEVEL_LOW, mp);
2224
- return -EFSCORRUPTED;
2225
- }
2226
-
2227
- buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
2228
- *buffer_nextp = *logged_nextp;
2229
-
2230
- /*
2231
- * If necessary, recalculate the CRC in the on-disk inode. We
2232
- * have to leave the inode in a consistent state for whoever
2233
- * reads it next....
2234
- */
2235
- xfs_dinode_calc_crc(mp,
2236
- xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
2237
-
2238
- }
2239
-
2240
- return 0;
2241
-}
2242
-
2243
-/*
2244
- * V5 filesystems know the age of the buffer on disk being recovered. We can
2245
- * have newer objects on disk than we are replaying, and so for these cases we
2246
- * don't want to replay the current change as that will make the buffer contents
2247
- * temporarily invalid on disk.
2248
- *
2249
- * The magic number might not match the buffer type we are going to recover
2250
- * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
2251
- * extract the LSN of the existing object in the buffer based on it's current
2252
- * magic number. If we don't recognise the magic number in the buffer, then
2253
- * return a LSN of -1 so that the caller knows it was an unrecognised block and
2254
- * so can recover the buffer.
2255
- *
2256
- * Note: we cannot rely solely on magic number matches to determine that the
2257
- * buffer has a valid LSN - we also need to verify that it belongs to this
2258
- * filesystem, so we need to extract the object's LSN and compare it to that
2259
- * which we read from the superblock. If the UUIDs don't match, then we've got a
2260
- * stale metadata block from an old filesystem instance that we need to recover
2261
- * over the top of.
2262
- */
2263
-static xfs_lsn_t
2264
-xlog_recover_get_buf_lsn(
2265
- struct xfs_mount *mp,
2266
- struct xfs_buf *bp)
2267
-{
2268
- uint32_t magic32;
2269
- uint16_t magic16;
2270
- uint16_t magicda;
2271
- void *blk = bp->b_addr;
2272
- uuid_t *uuid;
2273
- xfs_lsn_t lsn = -1;
2274
-
2275
- /* v4 filesystems always recover immediately */
2276
- if (!xfs_sb_version_hascrc(&mp->m_sb))
2277
- goto recover_immediately;
2278
-
2279
- magic32 = be32_to_cpu(*(__be32 *)blk);
2280
- switch (magic32) {
2281
- case XFS_ABTB_CRC_MAGIC:
2282
- case XFS_ABTC_CRC_MAGIC:
2283
- case XFS_ABTB_MAGIC:
2284
- case XFS_ABTC_MAGIC:
2285
- case XFS_RMAP_CRC_MAGIC:
2286
- case XFS_REFC_CRC_MAGIC:
2287
- case XFS_IBT_CRC_MAGIC:
2288
- case XFS_IBT_MAGIC: {
2289
- struct xfs_btree_block *btb = blk;
2290
-
2291
- lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
2292
- uuid = &btb->bb_u.s.bb_uuid;
2293
- break;
2294
- }
2295
- case XFS_BMAP_CRC_MAGIC:
2296
- case XFS_BMAP_MAGIC: {
2297
- struct xfs_btree_block *btb = blk;
2298
-
2299
- lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
2300
- uuid = &btb->bb_u.l.bb_uuid;
2301
- break;
2302
- }
2303
- case XFS_AGF_MAGIC:
2304
- lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
2305
- uuid = &((struct xfs_agf *)blk)->agf_uuid;
2306
- break;
2307
- case XFS_AGFL_MAGIC:
2308
- lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
2309
- uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
2310
- break;
2311
- case XFS_AGI_MAGIC:
2312
- lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
2313
- uuid = &((struct xfs_agi *)blk)->agi_uuid;
2314
- break;
2315
- case XFS_SYMLINK_MAGIC:
2316
- lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
2317
- uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
2318
- break;
2319
- case XFS_DIR3_BLOCK_MAGIC:
2320
- case XFS_DIR3_DATA_MAGIC:
2321
- case XFS_DIR3_FREE_MAGIC:
2322
- lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
2323
- uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
2324
- break;
2325
- case XFS_ATTR3_RMT_MAGIC:
2326
- /*
2327
- * Remote attr blocks are written synchronously, rather than
2328
- * being logged. That means they do not contain a valid LSN
2329
- * (i.e. transactionally ordered) in them, and hence any time we
2330
- * see a buffer to replay over the top of a remote attribute
2331
- * block we should simply do so.
2332
- */
2333
- goto recover_immediately;
2334
- case XFS_SB_MAGIC:
2335
- /*
2336
- * superblock uuids are magic. We may or may not have a
2337
- * sb_meta_uuid on disk, but it will be set in the in-core
2338
- * superblock. We set the uuid pointer for verification
2339
- * according to the superblock feature mask to ensure we check
2340
- * the relevant UUID in the superblock.
2341
- */
2342
- lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
2343
- if (xfs_sb_version_hasmetauuid(&mp->m_sb))
2344
- uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
2345
- else
2346
- uuid = &((struct xfs_dsb *)blk)->sb_uuid;
2347
- break;
2348
- default:
2349
- break;
2350
- }
2351
-
2352
- if (lsn != (xfs_lsn_t)-1) {
2353
- if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
2354
- goto recover_immediately;
2355
- return lsn;
2356
- }
2357
-
2358
- magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
2359
- switch (magicda) {
2360
- case XFS_DIR3_LEAF1_MAGIC:
2361
- case XFS_DIR3_LEAFN_MAGIC:
2362
- case XFS_DA3_NODE_MAGIC:
2363
- lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
2364
- uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
2365
- break;
2366
- default:
2367
- break;
2368
- }
2369
-
2370
- if (lsn != (xfs_lsn_t)-1) {
2371
- if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
2372
- goto recover_immediately;
2373
- return lsn;
2374
- }
2375
-
2376
- /*
2377
- * We do individual object checks on dquot and inode buffers as they
2378
- * have their own individual LSN records. Also, we could have a stale
2379
- * buffer here, so we have to at least recognise these buffer types.
2380
- *
2381
- * A notd complexity here is inode unlinked list processing - it logs
2382
- * the inode directly in the buffer, but we don't know which inodes have
2383
- * been modified, and there is no global buffer LSN. Hence we need to
2384
- * recover all inode buffer types immediately. This problem will be
2385
- * fixed by logical logging of the unlinked list modifications.
2386
- */
2387
- magic16 = be16_to_cpu(*(__be16 *)blk);
2388
- switch (magic16) {
2389
- case XFS_DQUOT_MAGIC:
2390
- case XFS_DINODE_MAGIC:
2391
- goto recover_immediately;
2392
- default:
2393
- break;
2394
- }
2395
-
2396
- /* unknown buffer contents, recover immediately */
2397
-
2398
-recover_immediately:
2399
- return (xfs_lsn_t)-1;
2400
-
2401
-}
2402
-
2403
-/*
2404
- * Validate the recovered buffer is of the correct type and attach the
2405
- * appropriate buffer operations to them for writeback. Magic numbers are in a
2406
- * few places:
2407
- * the first 16 bits of the buffer (inode buffer, dquot buffer),
2408
- * the first 32 bits of the buffer (most blocks),
2409
- * inside a struct xfs_da_blkinfo at the start of the buffer.
2410
- */
2411
-static void
2412
-xlog_recover_validate_buf_type(
2413
- struct xfs_mount *mp,
2414
- struct xfs_buf *bp,
2415
- xfs_buf_log_format_t *buf_f,
2416
- xfs_lsn_t current_lsn)
2417
-{
2418
- struct xfs_da_blkinfo *info = bp->b_addr;
2419
- uint32_t magic32;
2420
- uint16_t magic16;
2421
- uint16_t magicda;
2422
- char *warnmsg = NULL;
2423
-
2424
- /*
2425
- * We can only do post recovery validation on items on CRC enabled
2426
- * fielsystems as we need to know when the buffer was written to be able
2427
- * to determine if we should have replayed the item. If we replay old
2428
- * metadata over a newer buffer, then it will enter a temporarily
2429
- * inconsistent state resulting in verification failures. Hence for now
2430
- * just avoid the verification stage for non-crc filesystems
2431
- */
2432
- if (!xfs_sb_version_hascrc(&mp->m_sb))
2433
- return;
2434
-
2435
- magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
2436
- magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
2437
- magicda = be16_to_cpu(info->magic);
2438
- switch (xfs_blft_from_flags(buf_f)) {
2439
- case XFS_BLFT_BTREE_BUF:
2440
- switch (magic32) {
2441
- case XFS_ABTB_CRC_MAGIC:
2442
- case XFS_ABTC_CRC_MAGIC:
2443
- case XFS_ABTB_MAGIC:
2444
- case XFS_ABTC_MAGIC:
2445
- bp->b_ops = &xfs_allocbt_buf_ops;
2446
- break;
2447
- case XFS_IBT_CRC_MAGIC:
2448
- case XFS_FIBT_CRC_MAGIC:
2449
- case XFS_IBT_MAGIC:
2450
- case XFS_FIBT_MAGIC:
2451
- bp->b_ops = &xfs_inobt_buf_ops;
2452
- break;
2453
- case XFS_BMAP_CRC_MAGIC:
2454
- case XFS_BMAP_MAGIC:
2455
- bp->b_ops = &xfs_bmbt_buf_ops;
2456
- break;
2457
- case XFS_RMAP_CRC_MAGIC:
2458
- bp->b_ops = &xfs_rmapbt_buf_ops;
2459
- break;
2460
- case XFS_REFC_CRC_MAGIC:
2461
- bp->b_ops = &xfs_refcountbt_buf_ops;
2462
- break;
2463
- default:
2464
- warnmsg = "Bad btree block magic!";
2465
- break;
2466
- }
2467
- break;
2468
- case XFS_BLFT_AGF_BUF:
2469
- if (magic32 != XFS_AGF_MAGIC) {
2470
- warnmsg = "Bad AGF block magic!";
2471
- break;
2472
- }
2473
- bp->b_ops = &xfs_agf_buf_ops;
2474
- break;
2475
- case XFS_BLFT_AGFL_BUF:
2476
- if (magic32 != XFS_AGFL_MAGIC) {
2477
- warnmsg = "Bad AGFL block magic!";
2478
- break;
2479
- }
2480
- bp->b_ops = &xfs_agfl_buf_ops;
2481
- break;
2482
- case XFS_BLFT_AGI_BUF:
2483
- if (magic32 != XFS_AGI_MAGIC) {
2484
- warnmsg = "Bad AGI block magic!";
2485
- break;
2486
- }
2487
- bp->b_ops = &xfs_agi_buf_ops;
2488
- break;
2489
- case XFS_BLFT_UDQUOT_BUF:
2490
- case XFS_BLFT_PDQUOT_BUF:
2491
- case XFS_BLFT_GDQUOT_BUF:
2492
-#ifdef CONFIG_XFS_QUOTA
2493
- if (magic16 != XFS_DQUOT_MAGIC) {
2494
- warnmsg = "Bad DQUOT block magic!";
2495
- break;
2496
- }
2497
- bp->b_ops = &xfs_dquot_buf_ops;
2498
-#else
2499
- xfs_alert(mp,
2500
- "Trying to recover dquots without QUOTA support built in!");
2501
- ASSERT(0);
2502
-#endif
2503
- break;
2504
- case XFS_BLFT_DINO_BUF:
2505
- if (magic16 != XFS_DINODE_MAGIC) {
2506
- warnmsg = "Bad INODE block magic!";
2507
- break;
2508
- }
2509
- bp->b_ops = &xfs_inode_buf_ops;
2510
- break;
2511
- case XFS_BLFT_SYMLINK_BUF:
2512
- if (magic32 != XFS_SYMLINK_MAGIC) {
2513
- warnmsg = "Bad symlink block magic!";
2514
- break;
2515
- }
2516
- bp->b_ops = &xfs_symlink_buf_ops;
2517
- break;
2518
- case XFS_BLFT_DIR_BLOCK_BUF:
2519
- if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
2520
- magic32 != XFS_DIR3_BLOCK_MAGIC) {
2521
- warnmsg = "Bad dir block magic!";
2522
- break;
2523
- }
2524
- bp->b_ops = &xfs_dir3_block_buf_ops;
2525
- break;
2526
- case XFS_BLFT_DIR_DATA_BUF:
2527
- if (magic32 != XFS_DIR2_DATA_MAGIC &&
2528
- magic32 != XFS_DIR3_DATA_MAGIC) {
2529
- warnmsg = "Bad dir data magic!";
2530
- break;
2531
- }
2532
- bp->b_ops = &xfs_dir3_data_buf_ops;
2533
- break;
2534
- case XFS_BLFT_DIR_FREE_BUF:
2535
- if (magic32 != XFS_DIR2_FREE_MAGIC &&
2536
- magic32 != XFS_DIR3_FREE_MAGIC) {
2537
- warnmsg = "Bad dir3 free magic!";
2538
- break;
2539
- }
2540
- bp->b_ops = &xfs_dir3_free_buf_ops;
2541
- break;
2542
- case XFS_BLFT_DIR_LEAF1_BUF:
2543
- if (magicda != XFS_DIR2_LEAF1_MAGIC &&
2544
- magicda != XFS_DIR3_LEAF1_MAGIC) {
2545
- warnmsg = "Bad dir leaf1 magic!";
2546
- break;
2547
- }
2548
- bp->b_ops = &xfs_dir3_leaf1_buf_ops;
2549
- break;
2550
- case XFS_BLFT_DIR_LEAFN_BUF:
2551
- if (magicda != XFS_DIR2_LEAFN_MAGIC &&
2552
- magicda != XFS_DIR3_LEAFN_MAGIC) {
2553
- warnmsg = "Bad dir leafn magic!";
2554
- break;
2555
- }
2556
- bp->b_ops = &xfs_dir3_leafn_buf_ops;
2557
- break;
2558
- case XFS_BLFT_DA_NODE_BUF:
2559
- if (magicda != XFS_DA_NODE_MAGIC &&
2560
- magicda != XFS_DA3_NODE_MAGIC) {
2561
- warnmsg = "Bad da node magic!";
2562
- break;
2563
- }
2564
- bp->b_ops = &xfs_da3_node_buf_ops;
2565
- break;
2566
- case XFS_BLFT_ATTR_LEAF_BUF:
2567
- if (magicda != XFS_ATTR_LEAF_MAGIC &&
2568
- magicda != XFS_ATTR3_LEAF_MAGIC) {
2569
- warnmsg = "Bad attr leaf magic!";
2570
- break;
2571
- }
2572
- bp->b_ops = &xfs_attr3_leaf_buf_ops;
2573
- break;
2574
- case XFS_BLFT_ATTR_RMT_BUF:
2575
- if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2576
- warnmsg = "Bad attr remote magic!";
2577
- break;
2578
- }
2579
- bp->b_ops = &xfs_attr3_rmt_buf_ops;
2580
- break;
2581
- case XFS_BLFT_SB_BUF:
2582
- if (magic32 != XFS_SB_MAGIC) {
2583
- warnmsg = "Bad SB block magic!";
2584
- break;
2585
- }
2586
- bp->b_ops = &xfs_sb_buf_ops;
2587
- break;
2588
-#ifdef CONFIG_XFS_RT
2589
- case XFS_BLFT_RTBITMAP_BUF:
2590
- case XFS_BLFT_RTSUMMARY_BUF:
2591
- /* no magic numbers for verification of RT buffers */
2592
- bp->b_ops = &xfs_rtbuf_ops;
2593
- break;
2594
-#endif /* CONFIG_XFS_RT */
2595
- default:
2596
- xfs_warn(mp, "Unknown buffer type %d!",
2597
- xfs_blft_from_flags(buf_f));
2598
- break;
2599
- }
2600
-
2601
- /*
2602
- * Nothing else to do in the case of a NULL current LSN as this means
2603
- * the buffer is more recent than the change in the log and will be
2604
- * skipped.
2605
- */
2606
- if (current_lsn == NULLCOMMITLSN)
2607
- return;
2608
-
2609
- if (warnmsg) {
2610
- xfs_warn(mp, warnmsg);
2611
- ASSERT(0);
2612
- }
2613
-
2614
- /*
2615
- * We must update the metadata LSN of the buffer as it is written out to
2616
- * ensure that older transactions never replay over this one and corrupt
2617
- * the buffer. This can occur if log recovery is interrupted at some
2618
- * point after the current transaction completes, at which point a
2619
- * subsequent mount starts recovery from the beginning.
2620
- *
2621
- * Write verifiers update the metadata LSN from log items attached to
2622
- * the buffer. Therefore, initialize a bli purely to carry the LSN to
2623
- * the verifier. We'll clean it up in our ->iodone() callback.
2624
- */
2625
- if (bp->b_ops) {
2626
- struct xfs_buf_log_item *bip;
2627
-
2628
- ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
2629
- bp->b_iodone = xlog_recover_iodone;
2630
- xfs_buf_item_init(bp, mp);
2631
- bip = bp->b_log_item;
2632
- bip->bli_item.li_lsn = current_lsn;
2633
- }
2634
-}
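
The validator above is essentially a two-level dispatch: the buffer type recorded in the log format selects a case, and the observed on-disk magic must agree before write verifier ops are attached. A compressed sketch of that shape; the demo_* types, magic values and ops are invented for illustration:

#include <stdint.h>
#include <stdio.h>

struct demo_buf_ops { const char *name; };

static const struct demo_buf_ops demo_agf_ops = { "agf" };
static const struct demo_buf_ops demo_agi_ops = { "agi" };

enum demo_blft { DEMO_BLFT_AGF, DEMO_BLFT_AGI };

#define DEMO_AGF_MAGIC	0x58414746u	/* "XAGF" */
#define DEMO_AGI_MAGIC	0x58414749u	/* "XAGI" */

/* Return the ops to attach, or NULL when the magic contradicts the type. */
static const struct demo_buf_ops *
demo_validate_buf_type(enum demo_blft type, uint32_t magic32)
{
	switch (type) {
	case DEMO_BLFT_AGF:
		return magic32 == DEMO_AGF_MAGIC ? &demo_agf_ops : NULL;
	case DEMO_BLFT_AGI:
		return magic32 == DEMO_AGI_MAGIC ? &demo_agi_ops : NULL;
	}
	return NULL;
}

int main(void)
{
	const struct demo_buf_ops *ops;

	ops = demo_validate_buf_type(DEMO_BLFT_AGF, DEMO_AGF_MAGIC);
	printf("%s\n", ops ? ops->name : "bad magic, ops left unset");
	return 0;
}
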
2635
-
2636
-/*
2637
- * Perform a 'normal' buffer recovery. Each logged region of the
2638
- * buffer should be copied over the corresponding region in the
2639
- * given buffer. The bitmap in the buf log format structure indicates
2640
- * where to place the logged data.
2641
- */
2642
-STATIC void
2643
-xlog_recover_do_reg_buffer(
2644
- struct xfs_mount *mp,
2645
- xlog_recover_item_t *item,
2646
- struct xfs_buf *bp,
2647
- xfs_buf_log_format_t *buf_f,
2648
- xfs_lsn_t current_lsn)
2649
-{
2650
- int i;
2651
- int bit;
2652
- int nbits;
2653
- xfs_failaddr_t fa;
2654
-
2655
- trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
2656
-
2657
- bit = 0;
2658
- i = 1; /* 0 is the buf format structure */
2659
- while (1) {
2660
- bit = xfs_next_bit(buf_f->blf_data_map,
2661
- buf_f->blf_map_size, bit);
2662
- if (bit == -1)
2663
- break;
2664
- nbits = xfs_contig_bits(buf_f->blf_data_map,
2665
- buf_f->blf_map_size, bit);
2666
- ASSERT(nbits > 0);
2667
- ASSERT(item->ri_buf[i].i_addr != NULL);
2668
- ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
2669
- ASSERT(BBTOB(bp->b_io_length) >=
2670
- ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
2671
-
2672
- /*
2673
- * The dirty regions logged in the buffer, even though
2674
- * contiguous, may span multiple chunks. This is because the
2675
- * dirty region may span a physical page boundary in a buffer
2676
- * and hence be split into two separate vectors for writing into
2677
- * the log. Hence we need to trim nbits back to the length of
2678
- * the current region being copied out of the log.
2679
- */
2680
- if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
2681
- nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
2682
-
2683
- /*
2684
- * Do a sanity check if this is a dquot buffer. Just checking
2685
- * the first dquot in the buffer should do. XXXThis is
2686
- * probably a good thing to do for other buf types also.
2687
- */
2688
- fa = NULL;
2689
- if (buf_f->blf_flags &
2690
- (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2691
- if (item->ri_buf[i].i_addr == NULL) {
2692
- xfs_alert(mp,
2693
- "XFS: NULL dquot in %s.", __func__);
2694
- goto next;
2695
- }
2696
- if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
2697
- xfs_alert(mp,
2698
- "XFS: dquot too small (%d) in %s.",
2699
- item->ri_buf[i].i_len, __func__);
2700
- goto next;
2701
- }
2702
- fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr,
2703
- -1, 0);
2704
- if (fa) {
2705
- xfs_alert(mp,
2706
- "dquot corrupt at %pS trying to replay into block 0x%llx",
2707
- fa, bp->b_bn);
2708
- goto next;
2709
- }
2710
- }
2711
-
2712
- memcpy(xfs_buf_offset(bp,
2713
- (uint)bit << XFS_BLF_SHIFT), /* dest */
2714
- item->ri_buf[i].i_addr, /* source */
2715
- nbits<<XFS_BLF_SHIFT); /* length */
2716
- next:
2717
- i++;
2718
- bit += nbits;
2719
- }
2720
-
2721
- /* Shouldn't be any more regions */
2722
- ASSERT(i == item->ri_total);
2723
-
2724
- xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
2725
-}
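
The bitmap walk above is the heart of region replay: find the next dirty bit, count the contiguous run, then copy that many chunks from the log region into the buffer. A standalone sketch of the same walk, assuming the 128-byte chunk granularity implied by XFS_BLF_SHIFT; next_bit() and contig_bits() are simplified stand-ins for the xfs_next_bit()/xfs_contig_bits() helpers:

#include <stdint.h>
#include <stdio.h>

#define CHUNK_SHIFT	7		/* 128-byte chunks, as XFS_BLF_SHIFT */

/* Find the next set bit at or after 'start', or -1 if none remain. */
static int next_bit(const uint32_t *map, int nwords, int start)
{
	for (int i = start; i < nwords * 32; i++)
		if (map[i / 32] & (1u << (i % 32)))
			return i;
	return -1;
}

/* Count the contiguous set bits starting at 'start'. */
static int contig_bits(const uint32_t *map, int nwords, int start)
{
	int n = 0;

	while (start + n < nwords * 32 &&
	       (map[(start + n) / 32] & (1u << ((start + n) % 32))))
		n++;
	return n;
}

int main(void)
{
	uint32_t map[1] = { 0x0000001cu };	/* chunks 2..4 are dirty */
	int bit = 0;

	while ((bit = next_bit(map, 1, bit)) != -1) {
		int nbits = contig_bits(map, 1, bit);

		printf("copy %d bytes to buffer offset %d\n",
		       nbits << CHUNK_SHIFT, bit << CHUNK_SHIFT);
		bit += nbits;
	}
	return 0;
}
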
2726
-
2727
-/*
2728
- * Perform a dquot buffer recovery.
2729
- * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2730
- * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2731
- * Else, treat it as a regular buffer and do recovery.
2732
- *
2733
- * Return false if the buffer was tossed and true if we recovered the buffer to
2734
- * indicate to the caller if the buffer needs writing.
2735
- */
2736
-STATIC bool
2737
-xlog_recover_do_dquot_buffer(
2738
- struct xfs_mount *mp,
2739
- struct xlog *log,
2740
- struct xlog_recover_item *item,
2741
- struct xfs_buf *bp,
2742
- struct xfs_buf_log_format *buf_f)
2743
-{
2744
- uint type;
2745
-
2746
- trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2747
-
2748
- /*
2749
- * Filesystems are required to send in quota flags at mount time.
2750
- */
2751
- if (!mp->m_qflags)
2752
- return false;
2753
-
2754
- type = 0;
2755
- if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2756
- type |= XFS_DQ_USER;
2757
- if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2758
- type |= XFS_DQ_PROJ;
2759
- if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2760
- type |= XFS_DQ_GROUP;
2761
- /*
2762
- * This type of quota was turned off, so ignore this buffer
2763
- */
2764
- if (log->l_quotaoffs_flag & type)
2765
- return false;
2766
-
2767
- xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
2768
- return true;
2769
-}
2770
-
2771
-/*
2772
- * This routine replays a modification made to a buffer at runtime.
2773
- * There are actually two types of buffer, regular and inode, which
2774
- * are handled differently. From inode buffers we only recover
2775
- * a specific set of data, namely
2776
- * the inode di_next_unlinked fields. This is because all other inode
2777
- * data is actually logged via inode records and any data we replay
2778
- * here which overlaps that may be stale.
2779
- *
2780
- * When meta-data buffers are freed at run time we log a buffer item
2781
- * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2782
- * of the buffer in the log should not be replayed at recovery time.
2783
- * This is so that if the blocks covered by the buffer are reused for
2784
- * file data before we crash we don't end up replaying old, freed
2785
- * meta-data into a user's file.
2786
- *
2787
- * To handle the cancellation of buffer log items, we make two passes
2788
- * over the log during recovery. During the first we build a table of
2789
- * those buffers which have been cancelled, and during the second we
2790
- * only replay those buffers which do not have corresponding cancel
2791
- * records in the table. See xlog_recover_buffer_pass[1,2] above
2792
- * for more details on the implementation of the table of cancel records.
2793
- */
2794
-STATIC int
2795
-xlog_recover_buffer_pass2(
2796
- struct xlog *log,
2797
- struct list_head *buffer_list,
2798
- struct xlog_recover_item *item,
2799
- xfs_lsn_t current_lsn)
2800
-{
2801
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2802
- xfs_mount_t *mp = log->l_mp;
2803
- xfs_buf_t *bp;
2804
- int error;
2805
- uint buf_flags;
2806
- xfs_lsn_t lsn;
2807
-
2808
- /*
2809
- * In this pass we only want to recover all the buffers which have
2810
- * not been cancelled and are not cancellation buffers themselves.
2811
- */
2812
- if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2813
- buf_f->blf_len, buf_f->blf_flags)) {
2814
- trace_xfs_log_recover_buf_cancel(log, buf_f);
2815
- return 0;
2816
- }
2817
-
2818
- trace_xfs_log_recover_buf_recover(log, buf_f);
2819
-
2820
- buf_flags = 0;
2821
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2822
- buf_flags |= XBF_UNMAPPED;
2823
-
2824
- bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2825
- buf_flags, NULL);
2826
- if (!bp)
2827
- return -ENOMEM;
2828
- error = bp->b_error;
2829
- if (error) {
2830
- xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2831
- goto out_release;
2832
- }
2833
-
2834
- /*
2835
- * Recover the buffer only if we get an LSN from it and it's less than
2836
- * the lsn of the transaction we are replaying.
2837
- *
2838
- * Note that we have to be extremely careful of readahead here.
2839
- * Readahead does not attach verifiers to the buffers, so if we do
2840
- * not actually replay anything after readahead because the LSN found
2841
- * in the buffer is more recent than the current transaction, we need
2842
- * to attach the verifier directly. Failure to do so means future
2843
- * recovery actions (e.g. EFI and unlinked list recovery) can operate
2844
- * on the buffers without a verifier attached, which can leave blocks
2845
- * on disk with the correct content but a stale
2846
- * CRC.
2847
- *
2848
- * It is safe to assume these clean buffers are currently up to date.
2849
- * If the buffer is dirtied by a later transaction being replayed, then
2850
- * the verifier will be reset to match whatever recovery turns that
2851
- * buffer into.
2852
- */
2853
- lsn = xlog_recover_get_buf_lsn(mp, bp);
2854
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2855
- trace_xfs_log_recover_buf_skip(log, buf_f);
2856
- xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
2857
- goto out_release;
2858
- }
2859
-
2860
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2861
- error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2862
- if (error)
2863
- goto out_release;
2864
- } else if (buf_f->blf_flags &
2865
- (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2866
- bool dirty;
2867
-
2868
- dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2869
- if (!dirty)
2870
- goto out_release;
2871
- } else {
2872
- xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
2873
- }
2874
-
2875
- /*
2876
- * Perform delayed write on the buffer. Asynchronous writes will be
2877
- * slower when taking into account all the buffers to be flushed.
2878
- *
2879
- * Also make sure that only inode buffers with good sizes stay in
2880
- * the buffer cache. The kernel moves inodes in buffers of 1 block
2881
- * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode
2882
- * buffers in the log can be a different size if the log was generated
2883
- * by an older kernel using unclustered inode buffers or a newer kernel
2884
- * running with a different inode cluster size. Regardless, if
2885
- * the inode buffer size isn't max(blocksize, mp->m_inode_cluster_size)
2886
- * for *our* value of mp->m_inode_cluster_size, then we need to keep
2887
- * the buffer out of the buffer cache so that the buffer won't
2888
- * overlap with future reads of those inodes.
2889
- */
2890
- if (XFS_DINODE_MAGIC ==
2891
- be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2892
- (BBTOB(bp->b_io_length) != max(log->l_mp->m_sb.sb_blocksize,
2893
- (uint32_t)log->l_mp->m_inode_cluster_size))) {
2894
- xfs_buf_stale(bp);
2895
- error = xfs_bwrite(bp);
2896
- } else {
2897
- ASSERT(bp->b_target->bt_mount == mp);
2898
- bp->b_iodone = xlog_recover_iodone;
2899
- xfs_buf_delwri_queue(bp, buffer_list);
2900
- }
2901
-
2902
-out_release:
2903
- xfs_buf_relse(bp);
2904
- return error;
2905
-}
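
The skip decision above hinges on LSN ordering. An XFS LSN packs a log cycle number into the high 32 bits and a block number into the low 32 bits; the sketch below mirrors that comparison (the macro names follow the kernel's CYCLE_LSN/BLOCK_LSN, treated here as an assumption about the packing):

#include <stdint.h>
#include <stdio.h>

typedef int64_t demo_lsn_t;

#define CYCLE_LSN(lsn)	((uint32_t)((uint64_t)(lsn) >> 32))
#define BLOCK_LSN(lsn)	((uint32_t)(lsn))

/* Return <0, 0 or >0 for a before, equal to, or after b. */
static int demo_lsn_cmp(demo_lsn_t a, demo_lsn_t b)
{
	if (CYCLE_LSN(a) != CYCLE_LSN(b))
		return CYCLE_LSN(a) < CYCLE_LSN(b) ? -1 : 1;
	if (BLOCK_LSN(a) != BLOCK_LSN(b))
		return BLOCK_LSN(a) < BLOCK_LSN(b) ? -1 : 1;
	return 0;
}

int main(void)
{
	demo_lsn_t buf_lsn = ((demo_lsn_t)7 << 32) | 1024;
	demo_lsn_t current_lsn = ((demo_lsn_t)7 << 32) | 512;

	/* Same shape as the replay test: skip unless the buffer is older. */
	if (buf_lsn && buf_lsn != -1 &&
	    demo_lsn_cmp(buf_lsn, current_lsn) >= 0)
		puts("buffer is not older than the transaction: skip replay");
	return 0;
}
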
2906
-
2907
-/*
2908
- * Inode fork owner changes
2909
- *
2910
- * If we have been told that we have to reparent the inode fork, it's because an
2911
- * extent swap operation on a CRC enabled filesystem has been done and we are
2912
- * replaying it. We need to walk the BMBT of the appropriate fork and change the
2913
- * owners of it.
2914
- *
2915
- * The complexity here is that we don't have an inode context to work with, so
2916
- * after we've replayed the inode we need to instantiate one. This is where the
2917
- * fun begins.
2918
- *
2919
- * We are in the middle of log recovery, so we can't run transactions. That
2920
- * means we cannot use cache coherent inode instantiation via xfs_iget(), as
2921
- * that will result in the corresponding iput() running the inode through
2922
- * xfs_inactive(). If we've just replayed an inode core that changes the link
2923
- * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
2924
- * transactions (bad!).
2925
- *
2926
- * So, to avoid this, we instantiate an inode directly from the inode core we've
2927
- * just recovered. We have the buffer still locked, and all we really need to
2928
- * instantiate is the inode core and the forks being modified. We can do this
2929
- * manually, then run the inode btree owner change, and then tear down the
2930
- * xfs_inode without having to run any transactions at all.
2931
- *
2932
- * Also, because we don't have a transaction context available here but
2933
- * need to gather all the buffers we modify for writeback, we pass the
2934
- * buffer_list to the operation to use instead.
2935
- */
2936
-
2937
-STATIC int
2938
-xfs_recover_inode_owner_change(
2939
- struct xfs_mount *mp,
2940
- struct xfs_dinode *dip,
2941
- struct xfs_inode_log_format *in_f,
2942
- struct list_head *buffer_list)
2943
-{
2944
- struct xfs_inode *ip;
2945
- int error;
2946
-
2947
- ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
2948
-
2949
- ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2950
- if (!ip)
2951
- return -ENOMEM;
2952
-
2953
- /* instantiate the inode */
2954
- xfs_inode_from_disk(ip, dip);
2955
- ASSERT(ip->i_d.di_version >= 3);
2956
-
2957
- error = xfs_iformat_fork(ip, dip);
2958
- if (error)
2959
- goto out_free_ip;
2960
-
2961
- if (!xfs_inode_verify_forks(ip)) {
2962
- error = -EFSCORRUPTED;
2963
- goto out_free_ip;
2964
- }
2965
-
2966
- if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
2967
- ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
2968
- error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
2969
- ip->i_ino, buffer_list);
2970
- if (error)
2971
- goto out_free_ip;
2972
- }
2973
-
2974
- if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
2975
- ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
2976
- error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
2977
- ip->i_ino, buffer_list);
2978
- if (error)
2979
- goto out_free_ip;
2980
- }
2981
-
2982
-out_free_ip:
2983
- xfs_inode_free(ip);
2984
- return error;
2985
-}
2986
-
2987
-STATIC int
2988
-xlog_recover_inode_pass2(
2989
- struct xlog *log,
2990
- struct list_head *buffer_list,
2991
- struct xlog_recover_item *item,
2992
- xfs_lsn_t current_lsn)
2993
-{
2994
- struct xfs_inode_log_format *in_f;
2995
- xfs_mount_t *mp = log->l_mp;
2996
- xfs_buf_t *bp;
2997
- xfs_dinode_t *dip;
2998
- int len;
2999
- char *src;
3000
- char *dest;
3001
- int error;
3002
- int attr_index;
3003
- uint fields;
3004
- struct xfs_log_dinode *ldip;
3005
- uint isize;
3006
- int need_free = 0;
3007
-
3008
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3009
- in_f = item->ri_buf[0].i_addr;
3010
- } else {
3011
- in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP);
3012
- need_free = 1;
3013
- error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
3014
- if (error)
3015
- goto error;
3016
- }
3017
-
3018
- /*
3019
- * Inode buffers can be freed, look out for it,
3020
- * and do not replay the inode.
3021
- */
3022
- if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
3023
- in_f->ilf_len, 0)) {
3024
- error = 0;
3025
- trace_xfs_log_recover_inode_cancel(log, in_f);
3026
- goto error;
3027
- }
3028
- trace_xfs_log_recover_inode_recover(log, in_f);
3029
-
3030
- bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
3031
- &xfs_inode_buf_ops);
3032
- if (!bp) {
3033
- error = -ENOMEM;
3034
- goto error;
3035
- }
3036
- error = bp->b_error;
3037
- if (error) {
3038
- xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
3039
- goto out_release;
3040
- }
3041
- ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
3042
- dip = xfs_buf_offset(bp, in_f->ilf_boffset);
3043
-
3044
- /*
3045
- * Make sure the place we're flushing out to really looks
3046
- * like an inode!
3047
- */
3048
- if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
3049
- xfs_alert(mp,
3050
- "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
3051
- __func__, dip, bp, in_f->ilf_ino);
3052
- XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
3053
- XFS_ERRLEVEL_LOW, mp);
3054
- error = -EFSCORRUPTED;
3055
- goto out_release;
3056
- }
3057
- ldip = item->ri_buf[1].i_addr;
3058
- if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) {
3059
- xfs_alert(mp,
3060
- "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
3061
- __func__, item, in_f->ilf_ino);
3062
- XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
3063
- XFS_ERRLEVEL_LOW, mp);
3064
- error = -EFSCORRUPTED;
3065
- goto out_release;
3066
- }
3067
-
3068
- /*
3069
- * If the inode has an LSN in it, recover the inode only if it's less
3070
- * than the lsn of the transaction we are replaying. Note: we still
3071
- * need to replay an owner change even though the inode is more recent
3072
- * than the transaction as there is no guarantee that all the btree
3073
- * blocks are more recent than this transaction, too.
3074
- */
3075
- if (dip->di_version >= 3) {
3076
- xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
3077
-
3078
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3079
- trace_xfs_log_recover_inode_skip(log, in_f);
3080
- error = 0;
3081
- goto out_owner_change;
3082
- }
3083
- }
3084
-
3085
- /*
3086
- * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
3087
- * are transactional and if ordering is necessary we can determine that
3088
- * more accurately by the LSN field in the V3 inode core. Don't trust
3089
- * the inode versions as we might be changing them here - use the
3090
- * superblock flag to determine whether we need to look at di_flushiter
3091
- * to skip replay when the on-disk inode is newer than the log one.
3092
- */
3093
- if (!xfs_sb_version_hascrc(&mp->m_sb) &&
3094
- ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
3095
- /*
3096
- * Deal with the wrap case, DI_MAX_FLUSH is less
3097
- * than smaller numbers
3098
- */
3099
- if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
3100
- ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
3101
- /* do nothing */
3102
- } else {
3103
- trace_xfs_log_recover_inode_skip(log, in_f);
3104
- error = 0;
3105
- goto out_release;
3106
- }
3107
- }
3108
-
3109
- /* Take the opportunity to reset the flush iteration count */
3110
- ldip->di_flushiter = 0;
3111
-
3112
- if (unlikely(S_ISREG(ldip->di_mode))) {
3113
- if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3114
- (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
3115
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
3116
- XFS_ERRLEVEL_LOW, mp, ldip,
3117
- sizeof(*ldip));
3118
- xfs_alert(mp,
3119
- "%s: Bad regular inode log record, rec ptr "PTR_FMT", "
3120
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
3121
- __func__, item, dip, bp, in_f->ilf_ino);
3122
- error = -EFSCORRUPTED;
3123
- goto out_release;
3124
- }
3125
- } else if (unlikely(S_ISDIR(ldip->di_mode))) {
3126
- if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3127
- (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
3128
- (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
3129
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
3130
- XFS_ERRLEVEL_LOW, mp, ldip,
3131
- sizeof(*ldip));
3132
- xfs_alert(mp,
3133
- "%s: Bad dir inode log record, rec ptr "PTR_FMT", "
3134
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
3135
- __func__, item, dip, bp, in_f->ilf_ino);
3136
- error = -EFSCORRUPTED;
3137
- goto out_release;
3138
- }
3139
- }
3140
- if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
3141
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
3142
- XFS_ERRLEVEL_LOW, mp, ldip,
3143
- sizeof(*ldip));
3144
- xfs_alert(mp,
3145
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
3146
- "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
3147
- __func__, item, dip, bp, in_f->ilf_ino,
3148
- ldip->di_nextents + ldip->di_anextents,
3149
- ldip->di_nblocks);
3150
- error = -EFSCORRUPTED;
3151
- goto out_release;
3152
- }
3153
- if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
3154
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
3155
- XFS_ERRLEVEL_LOW, mp, ldip,
3156
- sizeof(*ldip));
3157
- xfs_alert(mp,
3158
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
3159
- "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
3160
- item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
3161
- error = -EFSCORRUPTED;
3162
- goto out_release;
3163
- }
3164
- isize = xfs_log_dinode_size(ldip->di_version);
3165
- if (unlikely(item->ri_buf[1].i_len > isize)) {
3166
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
3167
- XFS_ERRLEVEL_LOW, mp, ldip,
3168
- sizeof(*ldip));
3169
- xfs_alert(mp,
3170
- "%s: Bad inode log record length %d, rec ptr "PTR_FMT,
3171
- __func__, item->ri_buf[1].i_len, item);
3172
- error = -EFSCORRUPTED;
3173
- goto out_release;
3174
- }
3175
-
3176
- /* recover the log dinode inode into the on disk inode */
3177
- xfs_log_dinode_to_disk(ldip, dip);
3178
-
3179
- fields = in_f->ilf_fields;
3180
- if (fields & XFS_ILOG_DEV)
3181
- xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
3182
-
3183
- if (in_f->ilf_size == 2)
3184
- goto out_owner_change;
3185
- len = item->ri_buf[2].i_len;
3186
- src = item->ri_buf[2].i_addr;
3187
- ASSERT(in_f->ilf_size <= 4);
3188
- ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
3189
- ASSERT(!(fields & XFS_ILOG_DFORK) ||
3190
- (len == in_f->ilf_dsize));
3191
-
3192
- switch (fields & XFS_ILOG_DFORK) {
3193
- case XFS_ILOG_DDATA:
3194
- case XFS_ILOG_DEXT:
3195
- memcpy(XFS_DFORK_DPTR(dip), src, len);
3196
- break;
3197
-
3198
- case XFS_ILOG_DBROOT:
3199
- xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
3200
- (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
3201
- XFS_DFORK_DSIZE(dip, mp));
3202
- break;
3203
-
3204
- default:
3205
- /*
3206
- * There are no data fork flags set.
3207
- */
3208
- ASSERT((fields & XFS_ILOG_DFORK) == 0);
3209
- break;
3210
- }
3211
-
3212
- /*
3213
- * If we logged any attribute data, recover it. There may or
3214
- * may not have been any other non-core data logged in this
3215
- * transaction.
3216
- */
3217
- if (in_f->ilf_fields & XFS_ILOG_AFORK) {
3218
- if (in_f->ilf_fields & XFS_ILOG_DFORK) {
3219
- attr_index = 3;
3220
- } else {
3221
- attr_index = 2;
3222
- }
3223
- len = item->ri_buf[attr_index].i_len;
3224
- src = item->ri_buf[attr_index].i_addr;
3225
- ASSERT(len == in_f->ilf_asize);
3226
-
3227
- switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
3228
- case XFS_ILOG_ADATA:
3229
- case XFS_ILOG_AEXT:
3230
- dest = XFS_DFORK_APTR(dip);
3231
- ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
3232
- memcpy(dest, src, len);
3233
- break;
3234
-
3235
- case XFS_ILOG_ABROOT:
3236
- dest = XFS_DFORK_APTR(dip);
3237
- xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
3238
- len, (xfs_bmdr_block_t*)dest,
3239
- XFS_DFORK_ASIZE(dip, mp));
3240
- break;
3241
-
3242
- default:
3243
- xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
3244
- ASSERT(0);
3245
- error = -EIO;
3246
- goto out_release;
3247
- }
3248
- }
3249
-
3250
-out_owner_change:
3251
- /* Recover the swapext owner change unless inode has been deleted */
3252
- if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
3253
- (dip->di_mode != 0))
3254
- error = xfs_recover_inode_owner_change(mp, dip, in_f,
3255
- buffer_list);
3256
- /* re-generate the checksum. */
3257
- xfs_dinode_calc_crc(log->l_mp, dip);
3258
-
3259
- ASSERT(bp->b_target->bt_mount == mp);
3260
- bp->b_iodone = xlog_recover_iodone;
3261
- xfs_buf_delwri_queue(bp, buffer_list);
3262
-
3263
-out_release:
3264
- xfs_buf_relse(bp);
3265
-error:
3266
- if (need_free)
3267
- kmem_free(in_f);
3268
- return error;
3269
-}
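
For pre-CRC filesystems the skip decision above falls back to di_flushiter, a 16-bit counter that wraps at DI_MAX_FLUSH (0xffff). A worked sketch of the wrap-aware test, mirroring the two branches in the function:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DI_MAX_FLUSH	0xffff

/*
 * Skip replay when the on-disk flush counter is ahead of the logged one,
 * unless the on-disk counter sits at the wrap point while the logged one
 * has already wrapped around to a small value.
 */
static bool skip_replay(uint16_t log_flushiter, uint16_t disk_flushiter)
{
	if (log_flushiter >= disk_flushiter)
		return false;				/* log is newer */
	if (disk_flushiter == DI_MAX_FLUSH &&
	    log_flushiter < (DI_MAX_FLUSH >> 1))
		return false;				/* counter wrapped */
	return true;
}

int main(void)
{
	printf("%d\n", skip_replay(5, 9));	/* 1: disk ahead, skip  */
	printf("%d\n", skip_replay(3, 0xffff));	/* 0: wrap case, replay */
	return 0;
}
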
3270
-
3271
-/*
3272
- * Recover QUOTAOFF records. We simply make a note of it in the xlog
3273
- * structure, so that we know not to do any dquot item or dquot buffer
3274
- * recovery of that type.
3275
- */
3276
-STATIC int
3277
-xlog_recover_quotaoff_pass1(
3278
- struct xlog *log,
3279
- struct xlog_recover_item *item)
3280
-{
3281
- xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
3282
- ASSERT(qoff_f);
3283
-
3284
- /*
3285
- * The logitem format's flag tells us if this was user quotaoff,
3286
- * group/project quotaoff or both.
3287
- */
3288
- if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
3289
- log->l_quotaoffs_flag |= XFS_DQ_USER;
3290
- if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
3291
- log->l_quotaoffs_flag |= XFS_DQ_PROJ;
3292
- if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
3293
- log->l_quotaoffs_flag |= XFS_DQ_GROUP;
3294
-
3295
- return 0;
3296
-}
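
Quotaoff handling spans both passes: pass 1 above only accumulates which quota types were turned off, and the pass 2 dquot paths then mask their type bits against that record. A toy sketch of the filter, with invented flag values standing in for the XFS_DQ_* bits:

#include <stdio.h>

/* Invented stand-ins for the XFS_DQ_USER/PROJ/GROUP type bits. */
#define DEMO_DQ_USER	0x1
#define DEMO_DQ_PROJ	0x2
#define DEMO_DQ_GROUP	0x4

int main(void)
{
	unsigned int quotaoffs_flag = 0;
	unsigned int type;

	/* Pass 1: a quotaoff record covering user + group quotas. */
	quotaoffs_flag |= DEMO_DQ_USER | DEMO_DQ_GROUP;

	/* Pass 2: a group dquot record arrives and is filtered out. */
	type = DEMO_DQ_GROUP;
	if (quotaoffs_flag & type)
		puts("quota type was turned off: ignore this record");
	else
		puts("replay this dquot record");
	return 0;
}
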
3297
-
3298
-/*
3299
- * Recover a dquot record
3300
- */
3301
-STATIC int
3302
-xlog_recover_dquot_pass2(
3303
- struct xlog *log,
3304
- struct list_head *buffer_list,
3305
- struct xlog_recover_item *item,
3306
- xfs_lsn_t current_lsn)
3307
-{
3308
- xfs_mount_t *mp = log->l_mp;
3309
- xfs_buf_t *bp;
3310
- struct xfs_disk_dquot *ddq, *recddq;
3311
- xfs_failaddr_t fa;
3312
- int error;
3313
- xfs_dq_logformat_t *dq_f;
3314
- uint type;
3315
-
3316
-
3317
- /*
3318
- * Filesystems are required to send in quota flags at mount time.
3319
- */
3320
- if (mp->m_qflags == 0)
3321
- return 0;
3322
-
3323
- recddq = item->ri_buf[1].i_addr;
3324
- if (recddq == NULL) {
3325
- xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
3326
- return -EIO;
3327
- }
3328
- if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
3329
- xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
3330
- item->ri_buf[1].i_len, __func__);
3331
- return -EIO;
3332
- }
3333
-
3334
- /*
3335
- * This type of quota was turned off, so ignore this record.
3336
- */
3337
- type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3338
- ASSERT(type);
3339
- if (log->l_quotaoffs_flag & type)
3340
- return 0;
3341
-
3342
- /*
3343
- * At this point we know that quota was _not_ turned off.
3344
- * Since the mount flags are not indicating to us otherwise, this
3345
- * must mean that quota is on, and the dquot needs to be replayed.
3346
- * Remember that we may not have fully recovered the superblock yet,
3347
- * so we can't do the usual trick of looking at the SB quota bits.
3348
- *
3349
- * The other possibility, of course, is that the quota subsystem was
3350
- * removed since the last mount - ENOSYS.
3351
- */
3352
- dq_f = item->ri_buf[0].i_addr;
3353
- ASSERT(dq_f);
3354
- fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0);
3355
- if (fa) {
3356
- xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
3357
- dq_f->qlf_id, fa);
3358
- return -EIO;
3359
- }
3360
- ASSERT(dq_f->qlf_len == 1);
3361
-
3362
- /*
3363
- * At this point we are assuming that the dquots have been allocated
3364
- * and hence the buffer has valid dquots stamped in it. It should,
3365
- * therefore, pass verifier validation. If the dquot is bad, then the
3366
- * we'll return an error here, so we don't need to specifically check
3367
- * the dquot in the buffer after the verifier has run.
3368
- */
3369
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
3370
- XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
3371
- &xfs_dquot_buf_ops);
3372
- if (error)
3373
- return error;
3374
-
3375
- ASSERT(bp);
3376
- ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
3377
-
3378
- /*
3379
- * If the dquot has an LSN in it, recover the dquot only if it's less
3380
- * than the lsn of the transaction we are replaying.
3381
- */
3382
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
3383
- struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
3384
- xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
3385
-
3386
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3387
- goto out_release;
3388
- }
3389
- }
3390
-
3391
- memcpy(ddq, recddq, item->ri_buf[1].i_len);
3392
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
3393
- xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
3394
- XFS_DQUOT_CRC_OFF);
3395
- }
3396
-
3397
- ASSERT(dq_f->qlf_size == 2);
3398
- ASSERT(bp->b_target->bt_mount == mp);
3399
- bp->b_iodone = xlog_recover_iodone;
3400
- xfs_buf_delwri_queue(bp, buffer_list);
3401
-
3402
-out_release:
3403
- xfs_buf_relse(bp);
3404
- return 0;
3405
-}
3406
-
3407
-/*
3408
- * This routine is called to create an in-core extent free intent
3409
- * item from the efi format structure which was logged on disk.
3410
- * It allocates an in-core efi, copies the extents from the format
3411
- * structure into it, and adds the efi to the AIL with the given
3412
- * LSN.
3413
- */
3414
-STATIC int
3415
-xlog_recover_efi_pass2(
3416
- struct xlog *log,
3417
- struct xlog_recover_item *item,
3418
- xfs_lsn_t lsn)
3419
-{
3420
- int error;
3421
- struct xfs_mount *mp = log->l_mp;
3422
- struct xfs_efi_log_item *efip;
3423
- struct xfs_efi_log_format *efi_formatp;
3424
-
3425
- efi_formatp = item->ri_buf[0].i_addr;
3426
-
3427
- efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
3428
- error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
3429
- if (error) {
3430
- xfs_efi_item_free(efip);
3431
- return error;
3432
- }
3433
- atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
3434
-
3435
- spin_lock(&log->l_ailp->ail_lock);
3436
- /*
3437
- * The EFI has two references. One for the EFD and one for EFI to ensure
3438
- * it makes it into the AIL. Insert the EFI into the AIL directly and
3439
- * drop the EFI reference. Note that xfs_trans_ail_update() drops the
3440
- * AIL lock.
3441
- */
3442
- xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
3443
- xfs_efi_release(efip);
3444
- return 0;
3445
-}
3446
-
3447
-
3448
-/*
3449
- * This routine is called when an EFD format structure is found in a committed
3450
- * transaction in the log. Its purpose is to cancel the corresponding EFI if it
3451
- * was still in the log. To do this it searches the AIL for the EFI with an id
3452
- * equal to that in the EFD format structure. If we find it we drop the EFD
3453
- * reference, which removes the EFI from the AIL and frees it.
3454
- */
3455
-STATIC int
3456
-xlog_recover_efd_pass2(
3457
- struct xlog *log,
3458
- struct xlog_recover_item *item)
3459
-{
3460
- xfs_efd_log_format_t *efd_formatp;
3461
- xfs_efi_log_item_t *efip = NULL;
3462
- xfs_log_item_t *lip;
3463
- uint64_t efi_id;
3464
- struct xfs_ail_cursor cur;
3465
- struct xfs_ail *ailp = log->l_ailp;
3466
-
3467
- efd_formatp = item->ri_buf[0].i_addr;
3468
- ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
3469
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
3470
- (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
3471
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
3472
- efi_id = efd_formatp->efd_efi_id;
3473
-
3474
- /*
3475
- * Search for the EFI with the id in the EFD format structure in the
3476
- * AIL.
3477
- */
3478
- spin_lock(&ailp->ail_lock);
3479
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3480
- while (lip != NULL) {
3481
- if (lip->li_type == XFS_LI_EFI) {
3482
- efip = (xfs_efi_log_item_t *)lip;
3483
- if (efip->efi_format.efi_id == efi_id) {
3484
- /*
3485
- * Drop the EFD reference to the EFI. This
3486
- * removes the EFI from the AIL and frees it.
3487
- */
3488
- spin_unlock(&ailp->ail_lock);
3489
- xfs_efi_release(efip);
3490
- spin_lock(&ailp->ail_lock);
3491
- break;
3492
- }
3493
- }
3494
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
3495
- }
3496
-
3497
- xfs_trans_ail_cursor_done(&cur);
3498
- spin_unlock(&ailp->ail_lock);
3499
-
3500
- return 0;
3501
-}
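
The EFD handler above is one instance of a pattern repeated below for RUI/RUD, CUI/CUD and BUI/BUD: walk the AIL looking for the intent item whose id matches the done item, then release it. A userspace list-walk sketch of that cancel step, with the AIL lock and cursor machinery left out:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct intent {
	uint64_t id;
	struct intent *next;
};

/*
 * Unlink and free the pending intent matching 'done_id'; the analogue
 * of dropping the EFD reference, which removes the EFI and frees it.
 */
static void cancel_intent(struct intent **head, uint64_t done_id)
{
	for (struct intent **pp = head; *pp; pp = &(*pp)->next) {
		if ((*pp)->id == done_id) {
			struct intent *found = *pp;

			*pp = found->next;
			free(found);
			return;
		}
	}
}

int main(void)
{
	struct intent *head = calloc(1, sizeof(*head));

	head->id = 42;
	cancel_intent(&head, 42);	/* intent 42 is cancelled */
	printf("pending list is %s\n", head ? "non-empty" : "empty");
	return 0;
}
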
3502
-
3503
-/*
3504
- * This routine is called to create an in-core extent rmap update
3505
- * item from the rui format structure which was logged on disk.
3506
- * It allocates an in-core rui, copies the extents from the format
3507
- * structure into it, and adds the rui to the AIL with the given
3508
- * LSN.
3509
- */
3510
-STATIC int
3511
-xlog_recover_rui_pass2(
3512
- struct xlog *log,
3513
- struct xlog_recover_item *item,
3514
- xfs_lsn_t lsn)
3515
-{
3516
- int error;
3517
- struct xfs_mount *mp = log->l_mp;
3518
- struct xfs_rui_log_item *ruip;
3519
- struct xfs_rui_log_format *rui_formatp;
3520
-
3521
- rui_formatp = item->ri_buf[0].i_addr;
3522
-
3523
- ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
3524
- error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
3525
- if (error) {
3526
- xfs_rui_item_free(ruip);
3527
- return error;
3528
- }
3529
- atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
3530
-
3531
- spin_lock(&log->l_ailp->ail_lock);
3532
- /*
3533
- * The RUI has two references. One for the RUD and one for RUI to ensure
3534
- * it makes it into the AIL. Insert the RUI into the AIL directly and
3535
- * drop the RUI reference. Note that xfs_trans_ail_update() drops the
3536
- * AIL lock.
3537
- */
3538
- xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn);
3539
- xfs_rui_release(ruip);
3540
- return 0;
3541
-}
3542
-
3543
-
3544
-/*
3545
- * This routine is called when an RUD format structure is found in a committed
3546
- * transaction in the log. Its purpose is to cancel the corresponding RUI if it
3547
- * was still in the log. To do this it searches the AIL for the RUI with an id
3548
- * equal to that in the RUD format structure. If we find it we drop the RUD
3549
- * reference, which removes the RUI from the AIL and frees it.
3550
- */
3551
-STATIC int
3552
-xlog_recover_rud_pass2(
3553
- struct xlog *log,
3554
- struct xlog_recover_item *item)
3555
-{
3556
- struct xfs_rud_log_format *rud_formatp;
3557
- struct xfs_rui_log_item *ruip = NULL;
3558
- struct xfs_log_item *lip;
3559
- uint64_t rui_id;
3560
- struct xfs_ail_cursor cur;
3561
- struct xfs_ail *ailp = log->l_ailp;
3562
-
3563
- rud_formatp = item->ri_buf[0].i_addr;
3564
- ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
3565
- rui_id = rud_formatp->rud_rui_id;
3566
-
3567
- /*
3568
- * Search for the RUI with the id in the RUD format structure in the
3569
- * AIL.
3570
- */
3571
- spin_lock(&ailp->ail_lock);
3572
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3573
- while (lip != NULL) {
3574
- if (lip->li_type == XFS_LI_RUI) {
3575
- ruip = (struct xfs_rui_log_item *)lip;
3576
- if (ruip->rui_format.rui_id == rui_id) {
3577
- /*
3578
- * Drop the RUD reference to the RUI. This
3579
- * removes the RUI from the AIL and frees it.
3580
- */
3581
- spin_unlock(&ailp->ail_lock);
3582
- xfs_rui_release(ruip);
3583
- spin_lock(&ailp->ail_lock);
3584
- break;
3585
- }
3586
- }
3587
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
3588
- }
3589
-
3590
- xfs_trans_ail_cursor_done(&cur);
3591
- spin_unlock(&ailp->ail_lock);
3592
-
3593
- return 0;
3594
-}
3595
-
3596
-/*
3597
- * Copy a CUI format buffer from the given buf into the destination
3598
- * CUI format structure. The CUI/CUD items were designed not to need any
3599
- * special alignment handling.
3600
- */
3601
-static int
3602
-xfs_cui_copy_format(
3603
- struct xfs_log_iovec *buf,
3604
- struct xfs_cui_log_format *dst_cui_fmt)
3605
-{
3606
- struct xfs_cui_log_format *src_cui_fmt;
3607
- uint len;
3608
-
3609
- src_cui_fmt = buf->i_addr;
3610
- len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
3611
-
3612
- if (buf->i_len == len) {
3613
- memcpy(dst_cui_fmt, src_cui_fmt, len);
3614
- return 0;
3615
- }
3616
- return -EFSCORRUPTED;
3617
-}
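
The copy helper above accepts the buffer only when its length exactly matches the size implied by the logged extent count. A sketch of that flexible-array length check, built around a made-up log format struct:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_log_format {
	uint32_t nextents;
	uint64_t extents[];		/* one entry per logged extent */
};

#define demo_format_sizeof(nr) \
	(sizeof(struct demo_log_format) + (nr) * sizeof(uint64_t))

/* Copy only when the buffer length matches the computed format size. */
static int demo_copy_format(const void *buf, size_t buf_len,
			    struct demo_log_format *dst)
{
	const struct demo_log_format *src = buf;

	if (buf_len != demo_format_sizeof(src->nextents))
		return -1;		/* corrupt: length mismatch */
	memcpy(dst, src, buf_len);
	return 0;
}

int main(void)
{
	size_t len = demo_format_sizeof(2);
	struct demo_log_format *src = calloc(1, len);
	struct demo_log_format *dst = calloc(1, len);

	src->nextents = 2;
	printf("%d\n", demo_copy_format(src, len, dst));	/* 0: ok   */
	printf("%d\n", demo_copy_format(src, len - 1, dst));	/* -1: bad */
	free(src);
	free(dst);
	return 0;
}
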
3618
-
3619
-/*
3620
- * This routine is called to create an in-core extent refcount update
3621
- * item from the cui format structure which was logged on disk.
3622
- * It allocates an in-core cui, copies the extents from the format
3623
- * structure into it, and adds the cui to the AIL with the given
3624
- * LSN.
3625
- */
3626
-STATIC int
3627
-xlog_recover_cui_pass2(
3628
- struct xlog *log,
3629
- struct xlog_recover_item *item,
3630
- xfs_lsn_t lsn)
3631
-{
3632
- int error;
3633
- struct xfs_mount *mp = log->l_mp;
3634
- struct xfs_cui_log_item *cuip;
3635
- struct xfs_cui_log_format *cui_formatp;
3636
-
3637
- cui_formatp = item->ri_buf[0].i_addr;
3638
-
3639
- cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
3640
- error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
3641
- if (error) {
3642
- xfs_cui_item_free(cuip);
3643
- return error;
3644
- }
3645
- atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
3646
-
3647
- spin_lock(&log->l_ailp->ail_lock);
3648
- /*
3649
- * The CUI has two references. One for the CUD and one for CUI to ensure
3650
- * it makes it into the AIL. Insert the CUI into the AIL directly and
3651
- * drop the CUI reference. Note that xfs_trans_ail_update() drops the
3652
- * AIL lock.
3653
- */
3654
- xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn);
3655
- xfs_cui_release(cuip);
3656
- return 0;
3657
-}
3658
-
3659
-
3660
-/*
3661
- * This routine is called when a CUD format structure is found in a committed
3662
- * transaction in the log. Its purpose is to cancel the corresponding CUI if it
3663
- * was still in the log. To do this it searches the AIL for the CUI with an id
3664
- * equal to that in the CUD format structure. If we find it we drop the CUD
3665
- * reference, which removes the CUI from the AIL and frees it.
3666
- */
3667
-STATIC int
3668
-xlog_recover_cud_pass2(
3669
- struct xlog *log,
3670
- struct xlog_recover_item *item)
3671
-{
3672
- struct xfs_cud_log_format *cud_formatp;
3673
- struct xfs_cui_log_item *cuip = NULL;
3674
- struct xfs_log_item *lip;
3675
- uint64_t cui_id;
3676
- struct xfs_ail_cursor cur;
3677
- struct xfs_ail *ailp = log->l_ailp;
3678
-
3679
- cud_formatp = item->ri_buf[0].i_addr;
3680
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format))
3681
- return -EFSCORRUPTED;
3682
- cui_id = cud_formatp->cud_cui_id;
3683
-
3684
- /*
3685
- * Search for the CUI with the id in the CUD format structure in the
3686
- * AIL.
3687
- */
3688
- spin_lock(&ailp->ail_lock);
3689
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3690
- while (lip != NULL) {
3691
- if (lip->li_type == XFS_LI_CUI) {
3692
- cuip = (struct xfs_cui_log_item *)lip;
3693
- if (cuip->cui_format.cui_id == cui_id) {
3694
- /*
3695
- * Drop the CUD reference to the CUI. This
3696
- * removes the CUI from the AIL and frees it.
3697
- */
3698
- spin_unlock(&ailp->ail_lock);
3699
- xfs_cui_release(cuip);
3700
- spin_lock(&ailp->ail_lock);
3701
- break;
3702
- }
3703
- }
3704
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
3705
- }
3706
-
3707
- xfs_trans_ail_cursor_done(&cur);
3708
- spin_unlock(&ailp->ail_lock);
3709
-
3710
- return 0;
3711
-}
3712
-
3713
-/*
3714
- * Copy a BUI format buffer from the given buf into the destination
3715
- * BUI format structure. The BUI/BUD items were designed not to need any
3716
- * special alignment handling.
3717
- */
3718
-static int
3719
-xfs_bui_copy_format(
3720
- struct xfs_log_iovec *buf,
3721
- struct xfs_bui_log_format *dst_bui_fmt)
3722
-{
3723
- struct xfs_bui_log_format *src_bui_fmt;
3724
- uint len;
3725
-
3726
- src_bui_fmt = buf->i_addr;
3727
- len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
3728
-
3729
- if (buf->i_len == len) {
3730
- memcpy(dst_bui_fmt, src_bui_fmt, len);
3731
- return 0;
3732
- }
3733
- return -EFSCORRUPTED;
3734
-}
3735
-
3736
-/*
3737
- * This routine is called to create an in-core extent bmap update
3738
- * item from the bui format structure which was logged on disk.
3739
- * It allocates an in-core bui, copies the extents from the format
3740
- * structure into it, and adds the bui to the AIL with the given
3741
- * LSN.
3742
- */
3743
-STATIC int
3744
-xlog_recover_bui_pass2(
3745
- struct xlog *log,
3746
- struct xlog_recover_item *item,
3747
- xfs_lsn_t lsn)
3748
-{
3749
- int error;
3750
- struct xfs_mount *mp = log->l_mp;
3751
- struct xfs_bui_log_item *buip;
3752
- struct xfs_bui_log_format *bui_formatp;
3753
-
3754
- bui_formatp = item->ri_buf[0].i_addr;
3755
-
3756
- if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
3757
- return -EFSCORRUPTED;
3758
- buip = xfs_bui_init(mp);
3759
- error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
3760
- if (error) {
3761
- xfs_bui_item_free(buip);
3762
- return error;
3763
- }
3764
- atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
3765
-
3766
- spin_lock(&log->l_ailp->ail_lock);
3767
- /*
3768
- * The BUI has two references. One for the BUD and one for the BUI to
3769
- * ensure it makes it into the AIL. Insert the BUI into the AIL directly
3770
- * and drop the BUI reference. Note that xfs_trans_ail_update() drops
3771
- * the AIL lock.
3772
- */
3773
- xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
3774
- xfs_bui_release(buip);
3775
- return 0;
3776
-}
3777
-
3778
-
3779
-/*
3780
- * This routine is called when a BUD format structure is found in a committed
3781
- * transaction in the log. Its purpose is to cancel the corresponding BUI if it
3782
- * was still in the log. To do this it searches the AIL for the BUI with an id
3783
- * equal to that in the BUD format structure. If we find it we drop the BUD
3784
- * reference, which removes the BUI from the AIL and frees it.
3785
- */
3786
-STATIC int
3787
-xlog_recover_bud_pass2(
3788
- struct xlog *log,
3789
- struct xlog_recover_item *item)
3790
-{
3791
- struct xfs_bud_log_format *bud_formatp;
3792
- struct xfs_bui_log_item *buip = NULL;
3793
- struct xfs_log_item *lip;
3794
- uint64_t bui_id;
3795
- struct xfs_ail_cursor cur;
3796
- struct xfs_ail *ailp = log->l_ailp;
3797
-
3798
- bud_formatp = item->ri_buf[0].i_addr;
3799
- if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format))
3800
- return -EFSCORRUPTED;
3801
- bui_id = bud_formatp->bud_bui_id;
3802
-
3803
- /*
3804
- * Search for the BUI with the id in the BUD format structure in the
3805
- * AIL.
3806
- */
3807
- spin_lock(&ailp->ail_lock);
3808
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3809
- while (lip != NULL) {
3810
- if (lip->li_type == XFS_LI_BUI) {
3811
- buip = (struct xfs_bui_log_item *)lip;
3812
- if (buip->bui_format.bui_id == bui_id) {
3813
- /*
3814
- * Drop the BUD reference to the BUI. This
3815
- * removes the BUI from the AIL and frees it.
3816
- */
3817
- spin_unlock(&ailp->ail_lock);
3818
- xfs_bui_release(buip);
3819
- spin_lock(&ailp->ail_lock);
3820
- break;
3821
- }
3822
- }
3823
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
3824
- }
3825
-
3826
- xfs_trans_ail_cursor_done(&cur);
3827
- spin_unlock(&ailp->ail_lock);
3828
-
3829
- return 0;
3830
-}
3831
-
3832
-/*
3833
- * This routine is called when an inode create format structure is found in a
3834
- * committed transaction in the log. Its purpose is to initialise the inodes
3835
- * being allocated on disk. This requires us to get inode cluster buffers that
3836
- * match the range to be initialised, stamped with inode templates and written
3837
- * by delayed write so that subsequent modifications will hit the cached buffer
3838
- * and only need writing out at the end of recovery.
3839
- */
3840
-STATIC int
3841
-xlog_recover_do_icreate_pass2(
3842
- struct xlog *log,
3843
- struct list_head *buffer_list,
3844
- xlog_recover_item_t *item)
3845
-{
3846
- struct xfs_mount *mp = log->l_mp;
3847
- struct xfs_icreate_log *icl;
3848
- xfs_agnumber_t agno;
3849
- xfs_agblock_t agbno;
3850
- unsigned int count;
3851
- unsigned int isize;
3852
- xfs_agblock_t length;
3853
- int blks_per_cluster;
3854
- int bb_per_cluster;
3855
- int cancel_count;
3856
- int nbufs;
3857
- int i;
3858
-
3859
- icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3860
- if (icl->icl_type != XFS_LI_ICREATE) {
3861
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3862
- return -EINVAL;
3863
- }
3864
-
3865
- if (icl->icl_size != 1) {
3866
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3867
- return -EINVAL;
3868
- }
3869
-
3870
- agno = be32_to_cpu(icl->icl_ag);
3871
- if (agno >= mp->m_sb.sb_agcount) {
3872
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3873
- return -EINVAL;
3874
- }
3875
- agbno = be32_to_cpu(icl->icl_agbno);
3876
- if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3877
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3878
- return -EINVAL;
3879
- }
3880
- isize = be32_to_cpu(icl->icl_isize);
3881
- if (isize != mp->m_sb.sb_inodesize) {
3882
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3883
- return -EINVAL;
3884
- }
3885
- count = be32_to_cpu(icl->icl_count);
3886
- if (!count) {
3887
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3888
- return -EINVAL;
3889
- }
3890
- length = be32_to_cpu(icl->icl_length);
3891
- if (!length || length >= mp->m_sb.sb_agblocks) {
3892
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3893
- return -EINVAL;
3894
- }
3895
-
3896
- /*
3897
- * The inode chunk is either full or sparse and we only support
3898
- * m_ialloc_min_blks sized sparse allocations at this time.
3899
- */
3900
- if (length != mp->m_ialloc_blks &&
3901
- length != mp->m_ialloc_min_blks) {
3902
- xfs_warn(log->l_mp,
3903
- "%s: unsupported chunk length", __FUNCTION__);
3904
- return -EINVAL;
3905
- }
3906
-
3907
- /* verify inode count is consistent with extent length */
3908
- if ((count >> mp->m_sb.sb_inopblog) != length) {
3909
- xfs_warn(log->l_mp,
3910
- "%s: inconsistent inode count and chunk length",
3911
- __FUNCTION__);
3912
- return -EINVAL;
3913
- }
3914
-
3915
- /*
3916
- * The icreate transaction can cover multiple cluster buffers and these
3917
- * buffers could have been freed and reused. Check the individual
3918
- * buffers for cancellation so we don't overwrite anything written after
3919
- * a cancellation.
3920
- */
3921
- blks_per_cluster = xfs_icluster_size_fsb(mp);
3922
- bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
3923
- nbufs = length / blks_per_cluster;
3924
- for (i = 0, cancel_count = 0; i < nbufs; i++) {
3925
- xfs_daddr_t daddr;
3926
-
3927
- daddr = XFS_AGB_TO_DADDR(mp, agno,
3928
- agbno + i * blks_per_cluster);
3929
- if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
3930
- cancel_count++;
3931
- }
3932
-
3933
- /*
3934
- * We currently only use icreate for a single allocation at a time. This
3935
- * means we should expect either all or none of the buffers to be
3936
- * cancelled. Be conservative and skip replay if at least one buffer is
3937
- * cancelled, but warn the user that something is awry if the buffers
3938
- * are not consistent.
3939
- *
3940
- * XXX: This must be refined to only skip cancelled clusters once we use
3941
- * icreate for multiple chunk allocations.
3942
- */
3943
- ASSERT(!cancel_count || cancel_count == nbufs);
3944
- if (cancel_count) {
3945
- if (cancel_count != nbufs)
3946
- xfs_warn(mp,
3947
- "WARNING: partial inode chunk cancellation, skipped icreate.");
3948
- trace_xfs_log_recover_icreate_cancel(log, icl);
3949
- return 0;
3950
- }
3951
-
3952
- trace_xfs_log_recover_icreate_recover(log, icl);
3953
- return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
3954
- length, be32_to_cpu(icl->icl_gen));
3955
-}
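
Among the sanity checks above, the inode count must agree with the chunk length: count shifted down by sb_inopblog (log2 of inodes per block) must equal the length in filesystem blocks. A worked instance, assuming 512-byte inodes in 4096-byte blocks, i.e. eight inodes per block and an inopblog of 3:

#include <stdio.h>

int main(void)
{
	unsigned int inopblog = 3;	/* log2(8 inodes per 4k block) */
	unsigned int count = 64;	/* inodes in the icreate record */
	unsigned int length = 8;	/* chunk length in fs blocks    */

	/* 64 inodes / 8 per block == 8 blocks: consistent. */
	if ((count >> inopblog) != length)
		puts("inconsistent inode count and chunk length");
	else
		puts("icreate record is self-consistent");
	return 0;
}
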
3956
-
3957
-STATIC void
3958
-xlog_recover_buffer_ra_pass2(
3959
- struct xlog *log,
3960
- struct xlog_recover_item *item)
3961
-{
3962
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
3963
- struct xfs_mount *mp = log->l_mp;
3964
-
3965
- if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
3966
- buf_f->blf_len, buf_f->blf_flags)) {
3967
- return;
3968
- }
3969
-
3970
- xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
3971
- buf_f->blf_len, NULL);
3972
-}
3973
-
3974
-STATIC void
3975
-xlog_recover_inode_ra_pass2(
3976
- struct xlog *log,
3977
- struct xlog_recover_item *item)
3978
-{
3979
- struct xfs_inode_log_format ilf_buf;
3980
- struct xfs_inode_log_format *ilfp;
3981
- struct xfs_mount *mp = log->l_mp;
3982
- int error;
3983
-
3984
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3985
- ilfp = item->ri_buf[0].i_addr;
3986
- } else {
3987
- ilfp = &ilf_buf;
3988
- memset(ilfp, 0, sizeof(*ilfp));
3989
- error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
3990
- if (error)
3991
- return;
3992
- }
3993
-
3994
- if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
3995
- return;
3996
-
3997
- xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
3998
- ilfp->ilf_len, &xfs_inode_buf_ra_ops);
3999
-}
4000
-
4001
-STATIC void
4002
-xlog_recover_dquot_ra_pass2(
4003
- struct xlog *log,
4004
- struct xlog_recover_item *item)
4005
-{
4006
- struct xfs_mount *mp = log->l_mp;
4007
- struct xfs_disk_dquot *recddq;
4008
- struct xfs_dq_logformat *dq_f;
4009
- uint type;
4010
- int len;
4011
-
4012
-
4013
- if (mp->m_qflags == 0)
4014
- return;
4015
-
4016
- recddq = item->ri_buf[1].i_addr;
4017
- if (recddq == NULL)
4018
- return;
4019
- if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
4020
- return;
4021
-
4022
- type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
4023
- ASSERT(type);
4024
- if (log->l_quotaoffs_flag & type)
4025
- return;
4026
-
4027
- dq_f = item->ri_buf[0].i_addr;
4028
- ASSERT(dq_f);
4029
- ASSERT(dq_f->qlf_len == 1);
4030
-
4031
- len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
4032
- if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
4033
- return;
4034
-
4035
- xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
4036
- &xfs_dquot_buf_ra_ops);
4037
-}
4038
-
4039
-STATIC void
4040
-xlog_recover_ra_pass2(
4041
- struct xlog *log,
4042
- struct xlog_recover_item *item)
4043
-{
4044
- switch (ITEM_TYPE(item)) {
4045
- case XFS_LI_BUF:
4046
- xlog_recover_buffer_ra_pass2(log, item);
4047
- break;
4048
- case XFS_LI_INODE:
4049
- xlog_recover_inode_ra_pass2(log, item);
4050
- break;
4051
- case XFS_LI_DQUOT:
4052
- xlog_recover_dquot_ra_pass2(log, item);
4053
- break;
4054
- case XFS_LI_EFI:
4055
- case XFS_LI_EFD:
4056
- case XFS_LI_QUOTAOFF:
4057
- case XFS_LI_RUI:
4058
- case XFS_LI_RUD:
4059
- case XFS_LI_CUI:
4060
- case XFS_LI_CUD:
4061
- case XFS_LI_BUI:
4062
- case XFS_LI_BUD:
4063
- default:
4064
- break;
4065
- }
4066
-}
4067
-
4068
-STATIC int
4069
-xlog_recover_commit_pass1(
4070
- struct xlog *log,
4071
- struct xlog_recover *trans,
4072
- struct xlog_recover_item *item)
4073
-{
4074
- trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
4075
-
4076
- switch (ITEM_TYPE(item)) {
4077
- case XFS_LI_BUF:
4078
- return xlog_recover_buffer_pass1(log, item);
4079
- case XFS_LI_QUOTAOFF:
4080
- return xlog_recover_quotaoff_pass1(log, item);
4081
- case XFS_LI_INODE:
4082
- case XFS_LI_EFI:
4083
- case XFS_LI_EFD:
4084
- case XFS_LI_DQUOT:
4085
- case XFS_LI_ICREATE:
4086
- case XFS_LI_RUI:
4087
- case XFS_LI_RUD:
4088
- case XFS_LI_CUI:
4089
- case XFS_LI_CUD:
4090
- case XFS_LI_BUI:
4091
- case XFS_LI_BUD:
4092
- /* nothing to do in pass 1 */
4093
- return 0;
4094
- default:
4095
- xfs_warn(log->l_mp, "%s: invalid item type (%d)",
4096
- __func__, ITEM_TYPE(item));
4097
- ASSERT(0);
4098
- return -EIO;
4099
- }
4100
-}
4101
-
4102
-STATIC int
4103
-xlog_recover_commit_pass2(
4104
- struct xlog *log,
4105
- struct xlog_recover *trans,
4106
- struct list_head *buffer_list,
4107
- struct xlog_recover_item *item)
4108
-{
4109
- trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
4110
-
4111
- switch (ITEM_TYPE(item)) {
4112
- case XFS_LI_BUF:
4113
- return xlog_recover_buffer_pass2(log, buffer_list, item,
4114
- trans->r_lsn);
4115
- case XFS_LI_INODE:
4116
- return xlog_recover_inode_pass2(log, buffer_list, item,
4117
- trans->r_lsn);
4118
- case XFS_LI_EFI:
4119
- return xlog_recover_efi_pass2(log, item, trans->r_lsn);
4120
- case XFS_LI_EFD:
4121
- return xlog_recover_efd_pass2(log, item);
4122
- case XFS_LI_RUI:
4123
- return xlog_recover_rui_pass2(log, item, trans->r_lsn);
4124
- case XFS_LI_RUD:
4125
- return xlog_recover_rud_pass2(log, item);
4126
- case XFS_LI_CUI:
4127
- return xlog_recover_cui_pass2(log, item, trans->r_lsn);
4128
- case XFS_LI_CUD:
4129
- return xlog_recover_cud_pass2(log, item);
4130
- case XFS_LI_BUI:
4131
- return xlog_recover_bui_pass2(log, item, trans->r_lsn);
4132
- case XFS_LI_BUD:
4133
- return xlog_recover_bud_pass2(log, item);
4134
- case XFS_LI_DQUOT:
4135
- return xlog_recover_dquot_pass2(log, buffer_list, item,
4136
- trans->r_lsn);
4137
- case XFS_LI_ICREATE:
4138
- return xlog_recover_do_icreate_pass2(log, buffer_list, item);
4139
- case XFS_LI_QUOTAOFF:
4140
- /* nothing to do in pass2 */
4141
- return 0;
4142
- default:
4143
- xfs_warn(log->l_mp, "%s: invalid item type (%d)",
4144
- __func__, ITEM_TYPE(item));
4145
- ASSERT(0);
4146
- return -EIO;
4147
- }
1918
+ if (!xlog_is_buffer_cancelled(log, blkno, len))
1919
+ xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
41481920 }
41491921
41501922 STATIC int
....@@ -4158,8 +1930,12 @@
41581930 int error = 0;
41591931
41601932 list_for_each_entry(item, item_list, ri_list) {
4161
- error = xlog_recover_commit_pass2(log, trans,
4162
- buffer_list, item);
1933
+ trace_xfs_log_recover_item_recover(log, trans, item,
1934
+ XLOG_RECOVER_PASS2);
1935
+
1936
+ if (item->ri_ops->commit_pass2)
1937
+ error = item->ri_ops->commit_pass2(log, buffer_list,
1938
+ item, trans->r_lsn);
41631939 if (error)
41641940 return error;
41651941 }
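
The replacement loop dispatches through a per-item-type ri_ops vector rather than the large switches deleted above; a NULL hook simply means the item type has nothing to do in that pass. A sketch of that table-driven shape, with invented item types and ops:

#include <stdio.h>

struct demo_item;

/* Per-item-type operations, in the spirit of the new ri_ops vector. */
struct demo_item_ops {
	int  (*commit_pass1)(struct demo_item *item);
	int  (*commit_pass2)(struct demo_item *item);
	void (*ra_pass2)(struct demo_item *item);
};

struct demo_item {
	const struct demo_item_ops *ri_ops;
	const char *name;
};

static int demo_buf_commit_pass2(struct demo_item *item)
{
	printf("pass 2 replay of %s item\n", item->name);
	return 0;
}

static const struct demo_item_ops demo_buf_item_ops = {
	.commit_pass2 = demo_buf_commit_pass2,	/* no pass 1 work */
};

int main(void)
{
	struct demo_item item = {
		.ri_ops = &demo_buf_item_ops,
		.name = "buf",
	};
	int error = 0;

	if (item.ri_ops->commit_pass1)
		error = item.ri_ops->commit_pass1(&item);
	if (!error && item.ri_ops->commit_pass2)
		error = item.ri_ops->commit_pass2(&item);
	return error;
}
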
....@@ -4196,12 +1972,16 @@
41961972 return error;
41971973
41981974 list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
1975
+ trace_xfs_log_recover_item_recover(log, trans, item, pass);
1976
+
41991977 switch (pass) {
42001978 case XLOG_RECOVER_PASS1:
4201
- error = xlog_recover_commit_pass1(log, trans, item);
1979
+ if (item->ri_ops->commit_pass1)
1980
+ error = item->ri_ops->commit_pass1(log, item);
42021981 break;
42031982 case XLOG_RECOVER_PASS2:
4204
- xlog_recover_ra_pass2(log, item);
1983
+ if (item->ri_ops->ra_pass2)
1984
+ item->ri_ops->ra_pass2(log, item);
42051985 list_move_tail(&item->ri_list, &ra_list);
42061986 items_queued++;
42071987 if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
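The two hunks above replace the per-type switch dispatch with an operations vector: each log item type now supplies ri_ops callbacks (commit_pass1, ra_pass2, commit_pass2) and the core loops merely check for NULL before calling. A minimal standalone sketch of that pattern, with simplified types (only the ri_ops name mirrors the kernel):

#include <stddef.h>
#include <stdio.h>

struct item;				/* stand-in for xlog_recover_item */

struct item_ops {
	/* A NULL slot means "nothing to do in this pass". */
	int (*commit_pass1)(struct item *ip);
	int (*commit_pass2)(struct item *ip);
};

struct item {
	const struct item_ops	*ri_ops;
	const char		*name;
};

static int buf_pass1(struct item *ip)
{
	printf("pass1 work for %s\n", ip->name);
	return 0;
}

static const struct item_ops buf_ops = { .commit_pass1 = buf_pass1 };
static const struct item_ops inode_ops = { 0 };	/* no pass 1 work */

int main(void)
{
	struct item items[] = { { &buf_ops, "buf" }, { &inode_ops, "inode" } };
	int error = 0;

	for (size_t i = 0; i < 2 && !error; i++) {
		/* Replaces the old switch (ITEM_TYPE(item)) dispatch. */
		if (items[i].ri_ops->commit_pass1)
			error = items[i].ri_ops->commit_pass1(&items[i]);
	}
	return error;
}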
....@@ -4238,9 +2018,9 @@
42382018 xlog_recover_add_item(
42392019 struct list_head *head)
42402020 {
4241
- xlog_recover_item_t *item;
2021
+ struct xlog_recover_item *item;
42422022
4243
- item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
2023
+ item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
42442024 INIT_LIST_HEAD(&item->ri_list);
42452025 list_add_tail(&item->ri_list, head);
42462026 }
....@@ -4252,7 +2032,7 @@
42522032 char *dp,
42532033 int len)
42542034 {
4255
- xlog_recover_item_t *item;
2035
+ struct xlog_recover_item *item;
42562036 char *ptr, *old_ptr;
42572037 int old_len;
42582038
....@@ -4264,7 +2044,7 @@
42642044 ASSERT(len <= sizeof(struct xfs_trans_header));
42652045 if (len > sizeof(struct xfs_trans_header)) {
42662046 xfs_warn(log->l_mp, "%s: bad header length", __func__);
4267
- return -EIO;
2047
+ return -EFSCORRUPTED;
42682048 }
42692049
42702050 xlog_recover_add_item(&trans->r_itemq);
....@@ -4275,12 +2055,15 @@
42752055 }
42762056
42772057 /* take the tail entry */
4278
- item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
2058
+ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2059
+ ri_list);
42792060
42802061 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
42812062 old_len = item->ri_buf[item->ri_cnt-1].i_len;
42822063
4283
- ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
2064
+ ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
2065
+ if (!ptr)
2066
+ return -ENOMEM;
42842067 memcpy(&ptr[old_len], dp, len);
42852068 item->ri_buf[item->ri_cnt-1].i_len += len;
42862069 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
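The kvrealloc() hunk grows the last region in place: the buffer is reallocated to old_len + len, the continuation bytes are appended at offset old_len, and allocation failure now returns -ENOMEM instead of sleeping forever. A userspace sketch of the same append step, with plain realloc() standing in for kvrealloc():

#include <errno.h>
#include <stdlib.h>
#include <string.h>

/* Append len bytes from dp to a region currently holding *lenp bytes. */
static int append_region(char **bufp, int *lenp, const char *dp, int len)
{
	char *ptr = realloc(*bufp, *lenp + len);	/* kvrealloc() above */

	if (!ptr)
		return -ENOMEM;		/* the new failure path in the hunk */
	memcpy(&ptr[*lenp], dp, len);	/* continuation data lands at old_len */
	*bufp = ptr;
	*lenp += len;
	return 0;
}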
....@@ -4309,7 +2092,7 @@
43092092 int len)
43102093 {
43112094 struct xfs_inode_log_format *in_f; /* any will do */
4312
- xlog_recover_item_t *item;
2095
+ struct xlog_recover_item *item;
43132096 char *ptr;
43142097
43152098 if (!len)
....@@ -4320,13 +2103,13 @@
43202103 xfs_warn(log->l_mp, "%s: bad header magic number",
43212104 __func__);
43222105 ASSERT(0);
4323
- return -EIO;
2106
+ return -EFSCORRUPTED;
43242107 }
43252108
43262109 if (len > sizeof(struct xfs_trans_header)) {
43272110 xfs_warn(log->l_mp, "%s: bad header length", __func__);
43282111 ASSERT(0);
4329
- return -EIO;
2112
+ return -EFSCORRUPTED;
43302113 }
43312114
43322115 /*
....@@ -4340,18 +2123,19 @@
43402123 return 0;
43412124 }
43422125
4343
- ptr = kmem_alloc(len, KM_SLEEP);
2126
+ ptr = kmem_alloc(len, 0);
43442127 memcpy(ptr, dp, len);
43452128 in_f = (struct xfs_inode_log_format *)ptr;
43462129
43472130 /* take the tail entry */
4348
- item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
2131
+ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2132
+ ri_list);
43492133 if (item->ri_total != 0 &&
43502134 item->ri_total == item->ri_cnt) {
43512135 /* tail item is in use, get a new one */
43522136 xlog_recover_add_item(&trans->r_itemq);
43532137 item = list_entry(trans->r_itemq.prev,
4354
- xlog_recover_item_t, ri_list);
2138
+ struct xlog_recover_item, ri_list);
43552139 }
43562140
43572141 if (item->ri_total == 0) { /* first region to be added */
....@@ -4362,15 +2146,24 @@
43622146 in_f->ilf_size);
43632147 ASSERT(0);
43642148 kmem_free(ptr);
4365
- return -EIO;
2149
+ return -EFSCORRUPTED;
43662150 }
43672151
43682152 item->ri_total = in_f->ilf_size;
43692153 item->ri_buf =
43702154 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
4371
- KM_SLEEP);
2155
+ 0);
43722156 }
4373
- ASSERT(item->ri_total > item->ri_cnt);
2157
+
2158
+ if (item->ri_total <= item->ri_cnt) {
2159
+ xfs_warn(log->l_mp,
2160
+ "log item region count (%d) overflowed size (%d)",
2161
+ item->ri_cnt, item->ri_total);
2162
+ ASSERT(0);
2163
+ kmem_free(ptr);
2164
+ return -EFSCORRUPTED;
2165
+ }
2166
+
43742167 /* Description region is ri_buf[0] */
43752168 item->ri_buf[item->ri_cnt].i_addr = ptr;
43762169 item->ri_buf[item->ri_cnt].i_len = len;
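This hunk promotes a debug-only ASSERT into a production check: a region count that would overflow ri_buf[] now frees the staged copy and fails with -EFSCORRUPTED instead of writing out of bounds on non-debug kernels. The shape of the check, reduced to standalone C (EFSCORRUPTED is spelled EUCLEAN here, matching the kernel's definition):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef EFSCORRUPTED
#define EFSCORRUPTED	EUCLEAN		/* the kernel's errno for corruption */
#endif

/* Validate the region count before using it as an array index. */
static int check_region_count(int ri_total, int ri_cnt, void *staged)
{
	if (ri_total <= ri_cnt) {
		fprintf(stderr,
			"log item region count (%d) overflowed size (%d)\n",
			ri_cnt, ri_total);
		free(staged);		/* don't leak the copied region */
		return -EFSCORRUPTED;	/* fail gracefully, no ASSERT needed */
	}
	return 0;
}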
....@@ -4388,7 +2181,7 @@
43882181 xlog_recover_free_trans(
43892182 struct xlog_recover *trans)
43902183 {
4391
- xlog_recover_item_t *item, *n;
2184
+ struct xlog_recover_item *item, *n;
43922185 int i;
43932186
43942187 hlist_del_init(&trans->r_list);
....@@ -4457,7 +2250,7 @@
44572250 default:
44582251 xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
44592252 ASSERT(0);
4460
- error = -EIO;
2253
+ error = -EFSCORRUPTED;
44612254 break;
44622255 }
44632256 if (error || freeit)
....@@ -4502,7 +2295,7 @@
45022295 * This is a new transaction so allocate a new recovery container to
45032296 * hold the recovery ops that will follow.
45042297 */
4505
- trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
2298
+ trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
45062299 trans->r_log_tid = tid;
45072300 trans->r_lsn = be64_to_cpu(rhead->h_lsn);
45082301 INIT_LIST_HEAD(&trans->r_itemq);
....@@ -4537,7 +2330,7 @@
45372330 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
45382331 __func__, ohead->oh_clientid);
45392332 ASSERT(0);
4540
- return -EIO;
2333
+ return -EFSCORRUPTED;
45412334 }
45422335
45432336 /*
....@@ -4547,7 +2340,7 @@
45472340 if (dp + len > end) {
45482341 xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
45492342 WARN_ON(1);
4550
- return -EIO;
2343
+ return -EFSCORRUPTED;
45512344 }
45522345
45532346 trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
....@@ -4640,214 +2433,71 @@
46402433 return 0;
46412434 }
46422435
4643
-/* Recover the EFI if necessary. */
4644
-STATIC int
4645
-xlog_recover_process_efi(
4646
- struct xfs_mount *mp,
4647
- struct xfs_ail *ailp,
4648
- struct xfs_log_item *lip)
4649
-{
4650
- struct xfs_efi_log_item *efip;
4651
- int error;
4652
-
4653
- /*
4654
- * Skip EFIs that we've already processed.
4655
- */
4656
- efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4657
- if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
4658
- return 0;
4659
-
4660
- spin_unlock(&ailp->ail_lock);
4661
- error = xfs_efi_recover(mp, efip);
4662
- spin_lock(&ailp->ail_lock);
4663
-
4664
- return error;
4665
-}
4666
-
4667
-/* Release the EFI since we're cancelling everything. */
4668
-STATIC void
4669
-xlog_recover_cancel_efi(
4670
- struct xfs_mount *mp,
4671
- struct xfs_ail *ailp,
4672
- struct xfs_log_item *lip)
4673
-{
4674
- struct xfs_efi_log_item *efip;
4675
-
4676
- efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4677
-
4678
- spin_unlock(&ailp->ail_lock);
4679
- xfs_efi_release(efip);
4680
- spin_lock(&ailp->ail_lock);
4681
-}
4682
-
4683
-/* Recover the RUI if necessary. */
4684
-STATIC int
4685
-xlog_recover_process_rui(
4686
- struct xfs_mount *mp,
4687
- struct xfs_ail *ailp,
4688
- struct xfs_log_item *lip)
4689
-{
4690
- struct xfs_rui_log_item *ruip;
4691
- int error;
4692
-
4693
- /*
4694
- * Skip RUIs that we've already processed.
4695
- */
4696
- ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
4697
- if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags))
4698
- return 0;
4699
-
4700
- spin_unlock(&ailp->ail_lock);
4701
- error = xfs_rui_recover(mp, ruip);
4702
- spin_lock(&ailp->ail_lock);
4703
-
4704
- return error;
4705
-}
4706
-
4707
-/* Release the RUI since we're cancelling everything. */
4708
-STATIC void
4709
-xlog_recover_cancel_rui(
4710
- struct xfs_mount *mp,
4711
- struct xfs_ail *ailp,
4712
- struct xfs_log_item *lip)
4713
-{
4714
- struct xfs_rui_log_item *ruip;
4715
-
4716
- ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
4717
-
4718
- spin_unlock(&ailp->ail_lock);
4719
- xfs_rui_release(ruip);
4720
- spin_lock(&ailp->ail_lock);
4721
-}
4722
-
4723
-/* Recover the CUI if necessary. */
4724
-STATIC int
4725
-xlog_recover_process_cui(
4726
- struct xfs_trans *parent_tp,
4727
- struct xfs_ail *ailp,
4728
- struct xfs_log_item *lip)
4729
-{
4730
- struct xfs_cui_log_item *cuip;
4731
- int error;
4732
-
4733
- /*
4734
- * Skip CUIs that we've already processed.
4735
- */
4736
- cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
4737
- if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
4738
- return 0;
4739
-
4740
- spin_unlock(&ailp->ail_lock);
4741
- error = xfs_cui_recover(parent_tp, cuip);
4742
- spin_lock(&ailp->ail_lock);
4743
-
4744
- return error;
4745
-}
4746
-
4747
-/* Release the CUI since we're cancelling everything. */
4748
-STATIC void
4749
-xlog_recover_cancel_cui(
4750
- struct xfs_mount *mp,
4751
- struct xfs_ail *ailp,
4752
- struct xfs_log_item *lip)
4753
-{
4754
- struct xfs_cui_log_item *cuip;
4755
-
4756
- cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
4757
-
4758
- spin_unlock(&ailp->ail_lock);
4759
- xfs_cui_release(cuip);
4760
- spin_lock(&ailp->ail_lock);
4761
-}
4762
-
4763
-/* Recover the BUI if necessary. */
4764
-STATIC int
4765
-xlog_recover_process_bui(
4766
- struct xfs_trans *parent_tp,
4767
- struct xfs_ail *ailp,
4768
- struct xfs_log_item *lip)
4769
-{
4770
- struct xfs_bui_log_item *buip;
4771
- int error;
4772
-
4773
- /*
4774
- * Skip BUIs that we've already processed.
4775
- */
4776
- buip = container_of(lip, struct xfs_bui_log_item, bui_item);
4777
- if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
4778
- return 0;
4779
-
4780
- spin_unlock(&ailp->ail_lock);
4781
- error = xfs_bui_recover(parent_tp, buip);
4782
- spin_lock(&ailp->ail_lock);
4783
-
4784
- return error;
4785
-}
4786
-
4787
-/* Release the BUI since we're cancelling everything. */
4788
-STATIC void
4789
-xlog_recover_cancel_bui(
4790
- struct xfs_mount *mp,
4791
- struct xfs_ail *ailp,
4792
- struct xfs_log_item *lip)
4793
-{
4794
- struct xfs_bui_log_item *buip;
4795
-
4796
- buip = container_of(lip, struct xfs_bui_log_item, bui_item);
4797
-
4798
- spin_unlock(&ailp->ail_lock);
4799
- xfs_bui_release(buip);
4800
- spin_lock(&ailp->ail_lock);
4801
-}
4802
-
4803
-/* Is this log item a deferred action intent? */
4804
-static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
4805
-{
4806
- switch (lip->li_type) {
4807
- case XFS_LI_EFI:
4808
- case XFS_LI_RUI:
4809
- case XFS_LI_CUI:
4810
- case XFS_LI_BUI:
4811
- return true;
4812
- default:
4813
- return false;
4814
- }
4815
-}
4816
-
48172436 /* Take all the collected deferred ops and finish them in order. */
48182437 static int
48192438 xlog_finish_defer_ops(
4820
- struct xfs_trans *parent_tp)
2439
+ struct xfs_mount *mp,
2440
+ struct list_head *capture_list)
48212441 {
4822
- struct xfs_mount *mp = parent_tp->t_mountp;
2442
+ struct xfs_defer_capture *dfc, *next;
48232443 struct xfs_trans *tp;
4824
- int64_t freeblks;
4825
- uint resblks;
4826
- int error;
2444
+ struct xfs_inode *ip;
2445
+ int error = 0;
48272446
4828
- /*
4829
- * We're finishing the defer_ops that accumulated as a result of
4830
- * recovering unfinished intent items during log recovery. We
4831
- * reserve an itruncate transaction because it is the largest
4832
- * permanent transaction type. Since we're the only user of the fs
4833
- * right now, take 93% (15/16) of the available free blocks. Use
4834
- * weird math to avoid a 64-bit division.
4835
- */
4836
- freeblks = percpu_counter_sum(&mp->m_fdblocks);
4837
- if (freeblks <= 0)
4838
- return -ENOSPC;
4839
- resblks = min_t(int64_t, UINT_MAX, freeblks);
4840
- resblks = (resblks * 15) >> 4;
4841
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
4842
- 0, XFS_TRANS_RESERVE, &tp);
4843
- if (error)
4844
- return error;
4845
- /* transfer all collected dfops to this transaction */
4846
- xfs_defer_move(tp, parent_tp);
2447
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2448
+ struct xfs_trans_res resv;
48472449
4848
- return xfs_trans_commit(tp);
2450
+ /*
2451
+ * Create a new transaction reservation from the captured
2452
+ * information. Set logcount to 1 to force the new transaction
2453
+ * to regrant every roll so that we can make forward progress
2454
+ * in recovery no matter how full the log might be.
2455
+ */
2456
+ resv.tr_logres = dfc->dfc_logres;
2457
+ resv.tr_logcount = 1;
2458
+ resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
2459
+
2460
+ error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
2461
+ dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
2462
+ if (error) {
2463
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
2464
+ return error;
2465
+ }
2466
+
2467
+ /*
2468
+ * Transfer to this new transaction all the dfops we captured
2469
+ * from recovering a single intent item.
2470
+ */
2471
+ list_del_init(&dfc->dfc_list);
2472
+ xfs_defer_ops_continue(dfc, tp, &ip);
2473
+
2474
+ error = xfs_trans_commit(tp);
2475
+ if (ip) {
2476
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
2477
+ xfs_irele(ip);
2478
+ }
2479
+ if (error)
2480
+ return error;
2481
+ }
2482
+
2483
+ ASSERT(list_empty(capture_list));
2484
+ return 0;
48492485 }
48502486
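The rewritten xlog_finish_defer_ops() no longer carves one huge reservation out of free space; each captured dfops chain is replayed under the log reservation recorded at capture time, and tr_logcount = 1 forces a regrant on every transaction roll so recovery makes progress even in a nearly full log. A reduced sketch of that walk (field names echo dfc_logres/dfc_blkres; everything else is simplified):

/* One captured dfops chain, recorded while its intent was recovered. */
struct capture {
	struct capture	*next;
	int		logres;		/* like dfc_logres */
	int		blkres;		/* like dfc_blkres */
};

/* Finish every capture in order; on error the caller aborts the rest. */
static int finish_captures(struct capture *list,
			   int (*replay_one)(int logres, int blkres))
{
	int error;

	for (struct capture *c = list; c; c = c->next) {
		/* logcount of 1 => regrant each roll, guaranteeing progress */
		error = replay_one(c->logres, c->blkres);
		if (error)
			return error;
	}
	return 0;
}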
2487
+/* Release all the captured defer ops and capture structures in this list. */
2488
+static void
2489
+xlog_abort_defer_ops(
2490
+ struct xfs_mount *mp,
2491
+ struct list_head *capture_list)
2492
+{
2493
+ struct xfs_defer_capture *dfc;
2494
+ struct xfs_defer_capture *next;
2495
+
2496
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2497
+ list_del_init(&dfc->dfc_list);
2498
+ xfs_defer_ops_release(mp, dfc);
2499
+ }
2500
+}
48512501 /*
48522502 * When this is called, all of the log intent items which did not have
48532503 * corresponding log done items should be in the AIL. What we do now
....@@ -4868,35 +2518,23 @@
48682518 xlog_recover_process_intents(
48692519 struct xlog *log)
48702520 {
4871
- struct xfs_trans *parent_tp;
2521
+ LIST_HEAD(capture_list);
48722522 struct xfs_ail_cursor cur;
48732523 struct xfs_log_item *lip;
48742524 struct xfs_ail *ailp;
4875
- int error;
2525
+ int error = 0;
48762526 #if defined(DEBUG) || defined(XFS_WARN)
48772527 xfs_lsn_t last_lsn;
48782528 #endif
48792529
4880
- /*
4881
- * The intent recovery handlers commit transactions to complete recovery
4882
- * for individual intents, but any new deferred operations that are
4883
- * queued during that process are held off until the very end. The
4884
- * purpose of this transaction is to serve as a container for deferred
4885
- * operations. Each intent recovery handler must transfer dfops here
4886
- * before its local transaction commits, and we'll finish the entire
4887
- * list below.
4888
- */
4889
- error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
4890
- if (error)
4891
- return error;
4892
-
48932530 ailp = log->l_ailp;
48942531 spin_lock(&ailp->ail_lock);
4895
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
48962532 #if defined(DEBUG) || defined(XFS_WARN)
48972533 last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
48982534 #endif
4899
- while (lip != NULL) {
2535
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2536
+ lip != NULL;
2537
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
49002538 /*
49012539 * We're done when we see something other than an intent.
49022540 * There should be no intents left in the AIL now.
....@@ -4918,35 +2556,29 @@
49182556
49192557 /*
49202558 * NOTE: If your intent processing routine can create more
4921
- * deferred ops, you /must/ attach them to the dfops in this
4922
- * routine or else those subsequent intents will get
2559
+ * deferred ops, you /must/ attach them to the capture list in
2560
+ * the recover routine or else those subsequent intents will be
49232561 * replayed in the wrong order!
49242562 */
4925
- switch (lip->li_type) {
4926
- case XFS_LI_EFI:
4927
- error = xlog_recover_process_efi(log->l_mp, ailp, lip);
4928
- break;
4929
- case XFS_LI_RUI:
4930
- error = xlog_recover_process_rui(log->l_mp, ailp, lip);
4931
- break;
4932
- case XFS_LI_CUI:
4933
- error = xlog_recover_process_cui(parent_tp, ailp, lip);
4934
- break;
4935
- case XFS_LI_BUI:
4936
- error = xlog_recover_process_bui(parent_tp, ailp, lip);
4937
- break;
4938
- }
2563
+ spin_unlock(&ailp->ail_lock);
2564
+ error = lip->li_ops->iop_recover(lip, &capture_list);
2565
+ spin_lock(&ailp->ail_lock);
49392566 if (error)
4940
- goto out;
4941
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
2567
+ break;
49422568 }
4943
-out:
2569
+
49442570 xfs_trans_ail_cursor_done(&cur);
49452571 spin_unlock(&ailp->ail_lock);
4946
- if (!error)
4947
- error = xlog_finish_defer_ops(parent_tp);
4948
- xfs_trans_cancel(parent_tp);
2572
+ if (error)
2573
+ goto err;
49492574
2575
+ error = xlog_finish_defer_ops(log->l_mp, &capture_list);
2576
+ if (error)
2577
+ goto err;
2578
+
2579
+ return 0;
2580
+err:
2581
+ xlog_abort_defer_ops(log->l_mp, &capture_list);
49502582 return error;
49512583 }
49522584
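The recover loop above and the cancel loop below share one discipline: the AIL is walked under ail_lock, but the lock is dropped around each iop_recover()/iop_release() call because those may sleep or do I/O. A compact pthread sketch of drop-call-retake; note the real code iterates through an AIL cursor precisely so the list position survives the unlocked window, whereas this sketch naively trusts ->next:

#include <pthread.h>

struct log_item {
	struct log_item	*next;
	int		(*recover)(struct log_item *lip);	/* like iop_recover */
};

static int process_intents(pthread_mutex_t *ail_lock, struct log_item *head)
{
	int error = 0;

	pthread_mutex_lock(ail_lock);
	for (struct log_item *lip = head; lip && !error; lip = lip->next) {
		/* The callback may sleep or do I/O, so run it unlocked. */
		pthread_mutex_unlock(ail_lock);
		error = lip->recover(lip);
		pthread_mutex_lock(ail_lock);
	}
	pthread_mutex_unlock(ail_lock);
	return error;
}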
....@@ -4954,12 +2586,11 @@
49542586 * A cancel occurs when the mount has failed and we're bailing out.
49552587 * Release all pending log intent items so they don't pin the AIL.
49562588 */
4957
-STATIC int
2589
+STATIC void
49582590 xlog_recover_cancel_intents(
49592591 struct xlog *log)
49602592 {
49612593 struct xfs_log_item *lip;
4962
- int error = 0;
49632594 struct xfs_ail_cursor cur;
49642595 struct xfs_ail *ailp;
49652596
....@@ -4979,27 +2610,14 @@
49792610 break;
49802611 }
49812612
4982
- switch (lip->li_type) {
4983
- case XFS_LI_EFI:
4984
- xlog_recover_cancel_efi(log->l_mp, ailp, lip);
4985
- break;
4986
- case XFS_LI_RUI:
4987
- xlog_recover_cancel_rui(log->l_mp, ailp, lip);
4988
- break;
4989
- case XFS_LI_CUI:
4990
- xlog_recover_cancel_cui(log->l_mp, ailp, lip);
4991
- break;
4992
- case XFS_LI_BUI:
4993
- xlog_recover_cancel_bui(log->l_mp, ailp, lip);
4994
- break;
4995
- }
4996
-
2613
+ spin_unlock(&ailp->ail_lock);
2614
+ lip->li_ops->iop_release(lip);
2615
+ spin_lock(&ailp->ail_lock);
49972616 lip = xfs_trans_ail_cursor_next(ailp, &cur);
49982617 }
49992618
50002619 xfs_trans_ail_cursor_done(&cur);
50012620 spin_unlock(&ailp->ail_lock);
5002
- return error;
50032621 }
50042622
50052623 /*
....@@ -5026,7 +2644,7 @@
50262644 if (error)
50272645 goto out_abort;
50282646
5029
- agi = XFS_BUF_TO_AGI(agibp);
2647
+ agi = agibp->b_addr;
50302648 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
50312649 offset = offsetof(xfs_agi_t, agi_unlinked) +
50322650 (sizeof(xfs_agino_t) * bucket);
....@@ -5066,7 +2684,7 @@
50662684 /*
50672685 * Get the on disk inode to find the next inode in the bucket.
50682686 */
5069
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
2687
+ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0);
50702688 if (error)
50712689 goto fail_iput;
50722690
....@@ -5103,16 +2721,27 @@
51032721 }
51042722
51052723 /*
5106
- * xlog_iunlink_recover
2724
+ * Recover AGI unlinked lists
51072725 *
5108
- * This is called during recovery to process any inodes which
5109
- * we unlinked but not freed when the system crashed. These
5110
- * inodes will be on the lists in the AGI blocks. What we do
5111
- * here is scan all the AGIs and fully truncate and free any
5112
- * inodes found on the lists. Each inode is removed from the
5113
- * lists when it has been fully truncated and is freed. The
5114
- * freeing of the inode and its removal from the list must be
5115
- * atomic.
2726
+ * This is called during recovery to process any inodes which we unlinked but
2727
+ * not freed when the system crashed. These inodes will be on the lists in the
2728
+ * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
2729
+ * any inodes found on the lists. Each inode is removed from the lists when it
2730
+ * has been fully truncated and is freed. The freeing of the inode and its
2731
+ * removal from the list must be atomic.
2732
+ *
2733
+ * If everything we touch in the AGI processing loop is already in memory, this
2734
+ * loop can hold the CPU for a long time. It runs without lock contention,
2735
+ * memory allocation contention, the need to wait for IO, etc., and so will run
2736
+ * until we either run out of inodes to process, run low on memory or run out
2737
+ * of log space.
2738
+ *
2739
+ * This behaviour is bad for latency on single CPU and non-preemptible kernels,
2740
+ * and can prevent other filesystem work (such as CIL pushes) from running. This
2741
+ * can lead to deadlocks if the recovery process runs out of log reservation
2742
+ * space. Hence we need to yield the CPU when there is other kernel work
2743
+ * scheduled on this CPU to ensure other scheduled work can run without undue
2744
+ * latency.
51162745 */
51172746 STATIC void
51182747 xlog_recover_process_iunlinks(
....@@ -5151,7 +2780,7 @@
51512780 * buffer reference though, so that it stays pinned in memory
51522781 * while we need the buffer.
51532782 */
5154
- agi = XFS_BUF_TO_AGI(agibp);
2783
+ agi = agibp->b_addr;
51552784 xfs_buf_unlock(agibp);
51562785
51572786 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
....@@ -5159,13 +2788,14 @@
51592788 while (agino != NULLAGINO) {
51602789 agino = xlog_recover_process_one_iunlink(mp,
51612790 agno, agino, bucket);
2791
+ cond_resched();
51622792 }
51632793 }
51642794 xfs_buf_rele(agibp);
51652795 }
51662796 }
51672797
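The new cond_resched() bounds scheduling latency when an unlinked chain is long and fully cached, exactly as the comment above explains. The loop shape, reduced to userspace with sched_yield() standing in for cond_resched():

#include <sched.h>

#define NULLAGINO	(~0u)	/* end-of-chain sentinel, as in XFS */

/* Drain one unlinked bucket; yield between inodes to cap latency. */
static void drain_bucket(unsigned int agino,
			 unsigned int (*process_one)(unsigned int agino))
{
	while (agino != NULLAGINO) {
		agino = process_one(agino);	/* may run entirely from cache */
		sched_yield();			/* cond_resched() in the hunk */
	}
}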
5168
-STATIC int
2798
+STATIC void
51692799 xlog_unpack_data(
51702800 struct xlog_rec_header *rhead,
51712801 char *dp,
....@@ -5188,8 +2818,6 @@
51882818 dp += BBSIZE;
51892819 }
51902820 }
5191
-
5192
- return 0;
51932821 }
51942822
51952823 /*
....@@ -5204,10 +2832,8 @@
52042832 int pass,
52052833 struct list_head *buffer_list)
52062834 {
5207
- int error;
52082835 __le32 old_crc = rhead->h_crc;
52092836 __le32 crc;
5210
-
52112837
52122838 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
52132839
....@@ -5243,13 +2869,13 @@
52432869 * If the filesystem is CRC enabled, this mismatch becomes a
52442870 * fatal log corruption failure.
52452871 */
5246
- if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
2872
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
2873
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
52472874 return -EFSCORRUPTED;
2875
+ }
52482876 }
52492877
5250
- error = xlog_unpack_data(rhead, dp, log);
5251
- if (error)
5252
- return error;
2878
+ xlog_unpack_data(rhead, dp, log);
52532879
52542880 return xlog_recover_process_data(log, rhash, rhead, dp, pass,
52552881 buffer_list);
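The CRC policy in this function is two-tiered: on pre-v5 (non-CRC) superblocks a mismatch only warns, while CRC-enabled filesystems treat it as fatal, and the hunk adds an XFS_ERROR_REPORT so the corruption is logged before -EFSCORRUPTED is returned. A standalone sketch of that decision:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#ifndef EFSCORRUPTED
#define EFSCORRUPTED	EUCLEAN
#endif

/* Return 0 to keep recovering, negative errno to abort recovery. */
static int check_record_crc(unsigned int expected, unsigned int found,
			    bool fs_has_crc)
{
	if (expected == found)
		return 0;
	fprintf(stderr, "log record CRC mismatch: found 0x%x, expected 0x%x\n",
		found, expected);
	/* Torn writes are tolerated on older formats; v5 calls it corruption. */
	return fs_has_crc ? -EFSCORRUPTED : 0;
}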
....@@ -5259,35 +2885,34 @@
52592885 xlog_valid_rec_header(
52602886 struct xlog *log,
52612887 struct xlog_rec_header *rhead,
5262
- xfs_daddr_t blkno)
2888
+ xfs_daddr_t blkno,
2889
+ int bufsize)
52632890 {
52642891 int hlen;
52652892
5266
- if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
5267
- XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
5268
- XFS_ERRLEVEL_LOW, log->l_mp);
2893
+ if (XFS_IS_CORRUPT(log->l_mp,
2894
+ rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
52692895 return -EFSCORRUPTED;
5270
- }
5271
- if (unlikely(
5272
- (!rhead->h_version ||
5273
- (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
2896
+ if (XFS_IS_CORRUPT(log->l_mp,
2897
+ (!rhead->h_version ||
2898
+ (be32_to_cpu(rhead->h_version) &
2899
+ (~XLOG_VERSION_OKBITS))))) {
52742900 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
52752901 __func__, be32_to_cpu(rhead->h_version));
5276
- return -EIO;
2902
+ return -EFSCORRUPTED;
52772903 }
52782904
5279
- /* LR body must have data or it wouldn't have been written */
2905
+ /*
2906
+ * LR body must have data (or it wouldn't have been written)
2907
+ * and h_len must not be greater than LR buffer size.
2908
+ */
52802909 hlen = be32_to_cpu(rhead->h_len);
5281
- if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
5282
- XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
5283
- XFS_ERRLEVEL_LOW, log->l_mp);
2910
+ if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
52842911 return -EFSCORRUPTED;
5285
- }
5286
- if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
5287
- XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
5288
- XFS_ERRLEVEL_LOW, log->l_mp);
2912
+
2913
+ if (XFS_IS_CORRUPT(log->l_mp,
2914
+ blkno > log->l_logBBsize || blkno > INT_MAX))
52892915 return -EFSCORRUPTED;
5290
- }
52912916 return 0;
52922917 }
52932918
....@@ -5311,7 +2936,7 @@
53112936 xfs_daddr_t blk_no, rblk_no;
53122937 xfs_daddr_t rhead_blk;
53132938 char *offset;
5314
- xfs_buf_t *hbp, *dbp;
2939
+ char *hbp, *dbp;
53152940 int error = 0, h_size, h_len;
53162941 int error2 = 0;
53172942 int bblks, split_bblks;
....@@ -5336,7 +2961,7 @@
53362961 * iclog header and extract the header size from it. Get a
53372962 * new hbp that is the correct size.
53382963 */
5339
- hbp = xlog_get_bp(log, 1);
2964
+ hbp = xlog_alloc_buffer(log, 1);
53402965 if (!hbp)
53412966 return -ENOMEM;
53422967
....@@ -5345,9 +2970,6 @@
53452970 goto bread_err1;
53462971
53472972 rhead = (xlog_rec_header_t *)offset;
5348
- error = xlog_valid_rec_header(log, rhead, tail_blk);
5349
- if (error)
5350
- goto bread_err1;
53512973
53522974 /*
53532975 * xfsprogs has a bug where record length is based on lsunit but
....@@ -5362,39 +2984,35 @@
53622984 */
53632985 h_size = be32_to_cpu(rhead->h_size);
53642986 h_len = be32_to_cpu(rhead->h_len);
5365
- if (h_len > h_size) {
5366
- if (h_len <= log->l_mp->m_logbsize &&
5367
- be32_to_cpu(rhead->h_num_logops) == 1) {
5368
- xfs_warn(log->l_mp,
2987
+ if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
2988
+ rhead->h_num_logops == cpu_to_be32(1)) {
2989
+ xfs_warn(log->l_mp,
53692990 "invalid iclog size (%d bytes), using lsunit (%d bytes)",
5370
- h_size, log->l_mp->m_logbsize);
5371
- h_size = log->l_mp->m_logbsize;
5372
- } else
5373
- return -EFSCORRUPTED;
2991
+ h_size, log->l_mp->m_logbsize);
2992
+ h_size = log->l_mp->m_logbsize;
53742993 }
53752994
5376
- if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
5377
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
5378
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
5379
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
5380
- hblks++;
5381
- xlog_put_bp(hbp);
5382
- hbp = xlog_get_bp(log, hblks);
5383
- } else {
5384
- hblks = 1;
2995
+ error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
2996
+ if (error)
2997
+ goto bread_err1;
2998
+
2999
+ hblks = xlog_logrec_hblks(log, rhead);
3000
+ if (hblks != 1) {
3001
+ kmem_free(hbp);
3002
+ hbp = xlog_alloc_buffer(log, hblks);
53853003 }
53863004 } else {
53873005 ASSERT(log->l_sectBBsize == 1);
53883006 hblks = 1;
5389
- hbp = xlog_get_bp(log, 1);
3007
+ hbp = xlog_alloc_buffer(log, 1);
53903008 h_size = XLOG_BIG_RECORD_BSIZE;
53913009 }
53923010
53933011 if (!hbp)
53943012 return -ENOMEM;
5395
- dbp = xlog_get_bp(log, BTOBB(h_size));
3013
+ dbp = xlog_alloc_buffer(log, BTOBB(h_size));
53963014 if (!dbp) {
5397
- xlog_put_bp(hbp);
3015
+ kmem_free(hbp);
53983016 return -ENOMEM;
53993017 }
54003018
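For v2 logs whose h_size exceeds XLOG_HEADER_CYCLE_SIZE, the record header occupies several basic blocks; the open-coded round-up removed above now lives behind xlog_logrec_hblks(). The arithmetic, as it appeared in the removed lines:

#define XLOG_HEADER_CYCLE_SIZE	(32 * 1024)	/* value from xfs_log_format.h */

/* Basic blocks occupied by a record header of h_size bytes (v2 logs). */
static int logrec_hblks(int h_size)
{
	int hblks = h_size / XLOG_HEADER_CYCLE_SIZE;

	if (h_size % XLOG_HEADER_CYCLE_SIZE)
		hblks++;			/* round up the partial chunk */
	return hblks ? hblks : 1;		/* small headers fit one block */
}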
....@@ -5409,7 +3027,7 @@
54093027 /*
54103028 * Check for header wrapping around physical end-of-log
54113029 */
5412
- offset = hbp->b_addr;
3030
+ offset = hbp;
54133031 split_hblks = 0;
54143032 wrapped_hblks = 0;
54153033 if (blk_no + hblks <= log->l_logBBsize) {
....@@ -5445,15 +3063,15 @@
54453063 * - order is important.
54463064 */
54473065 wrapped_hblks = hblks - split_hblks;
5448
- error = xlog_bread_offset(log, 0,
5449
- wrapped_hblks, hbp,
3066
+ error = xlog_bread_noalign(log, 0,
3067
+ wrapped_hblks,
54503068 offset + BBTOB(split_hblks));
54513069 if (error)
54523070 goto bread_err2;
54533071 }
54543072 rhead = (xlog_rec_header_t *)offset;
54553073 error = xlog_valid_rec_header(log, rhead,
5456
- split_hblks ? blk_no : 0);
3074
+ split_hblks ? blk_no : 0, h_size);
54573075 if (error)
54583076 goto bread_err2;
54593077
....@@ -5477,7 +3095,7 @@
54773095 } else {
54783096 /* This log record is split across the
54793097 * physical end of log */
5480
- offset = dbp->b_addr;
3098
+ offset = dbp;
54813099 split_bblks = 0;
54823100 if (blk_no != log->l_logBBsize) {
54833101 /* some data is before the physical
....@@ -5506,8 +3124,8 @@
55063124 * _first_, then the log start (LR header end)
55073125 * - order is important.
55083126 */
5509
- error = xlog_bread_offset(log, 0,
5510
- bblks - split_bblks, dbp,
3127
+ error = xlog_bread_noalign(log, 0,
3128
+ bblks - split_bblks,
55113129 offset + BBTOB(split_bblks));
55123130 if (error)
55133131 goto bread_err2;
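When a record straddles the physical end of the log, it is read in two pieces into one buffer: the blocks up to the end land at the start of the buffer, then xlog_bread_noalign(log, 0, ...) fills the remainder at offset BBTOB(split_bblks). The same split modeled with pread() on a flat file (short-read handling elided):

#define _XOPEN_SOURCE 500	/* for pread() */
#include <unistd.h>

#define BBTOB(bb)	((bb) << 9)	/* basic blocks to bytes */

/* Read nbblks starting at blk_no from a log of log_bblks blocks, wrapping. */
static int read_wrapped(int fd, long blk_no, int nbblks, long log_bblks,
			char *buf)
{
	int split = (int)(log_bblks - blk_no);	/* blocks before the end */

	if (split >= nbblks)			/* no wrap needed */
		return pread(fd, buf, BBTOB(nbblks), BBTOB(blk_no)) < 0 ? -1 : 0;
	/* end of the log first, then the log start: order is important */
	if (pread(fd, buf, BBTOB(split), BBTOB(blk_no)) < 0)
		return -1;
	if (pread(fd, buf + BBTOB(split), BBTOB(nbblks - split), 0) < 0)
		return -1;
	return 0;
}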
....@@ -5534,7 +3152,7 @@
55343152 goto bread_err2;
55353153
55363154 rhead = (xlog_rec_header_t *)offset;
5537
- error = xlog_valid_rec_header(log, rhead, blk_no);
3155
+ error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
55383156 if (error)
55393157 goto bread_err2;
55403158
....@@ -5555,9 +3173,9 @@
55553173 }
55563174
55573175 bread_err2:
5558
- xlog_put_bp(dbp);
3176
+ kmem_free(dbp);
55593177 bread_err1:
5560
- xlog_put_bp(hbp);
3178
+ kmem_free(hbp);
55613179
55623180 /*
55633181 * Submit buffers that have been added from the last record processed,
....@@ -5614,7 +3232,7 @@
56143232 */
56153233 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
56163234 sizeof(struct list_head),
5617
- KM_SLEEP);
3235
+ 0);
56183236 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
56193237 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
56203238
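Pass 1 records cancelled buffers in l_buf_cancel_table, an array of list heads hashed by block number; xlog_is_buffer_cancelled() (visible in the readahead hunk near the top of this section) consults it during pass 2. A sketch of the bucket scheme (table size here mirrors XLOG_BC_TABLE_SIZE; singly linked chains stand in for list_head):

#include <stdbool.h>
#include <stddef.h>

#define BC_TABLE_SIZE	64	/* mirrors XLOG_BC_TABLE_SIZE */

struct buf_cancel {		/* like the per-bucket cancel records */
	struct buf_cancel	*next;
	long long		blkno;
};

static struct buf_cancel *bc_table[BC_TABLE_SIZE];

static bool is_buffer_cancelled(long long blkno)
{
	/* Hash by block number, then walk the short per-bucket chain. */
	for (struct buf_cancel *bc = bc_table[blkno % BC_TABLE_SIZE];
	     bc != NULL; bc = bc->next) {
		if (bc->blkno == blkno)
			return true;
	}
	return false;
}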
....@@ -5651,14 +3269,14 @@
56513269 */
56523270 STATIC int
56533271 xlog_do_recover(
5654
- struct xlog *log,
5655
- xfs_daddr_t head_blk,
5656
- xfs_daddr_t tail_blk)
3272
+ struct xlog *log,
3273
+ xfs_daddr_t head_blk,
3274
+ xfs_daddr_t tail_blk)
56573275 {
5658
- struct xfs_mount *mp = log->l_mp;
5659
- int error;
5660
- xfs_buf_t *bp;
5661
- xfs_sb_t *sbp;
3276
+ struct xfs_mount *mp = log->l_mp;
3277
+ struct xfs_buf *bp = mp->m_sb_bp;
3278
+ struct xfs_sb *sbp = &mp->m_sb;
3279
+ int error;
56623280
56633281 trace_xfs_log_recover(log, head_blk, tail_blk);
56643282
....@@ -5672,9 +3290,8 @@
56723290 /*
56733291 * If IO errors happened during recovery, bail out.
56743292 */
5675
- if (XFS_FORCED_SHUTDOWN(mp)) {
3293
+ if (XFS_FORCED_SHUTDOWN(mp))
56763294 return -EIO;
5677
- }
56783295
56793296 /*
56803297 * We now update the tail_lsn since much of the recovery has completed
....@@ -5688,19 +3305,15 @@
56883305 xlog_assign_tail_lsn(mp);
56893306
56903307 /*
5691
- * Now that we've finished replaying all buffer and inode
5692
- * updates, re-read in the superblock and reverify it.
3308
+ * Now that we've finished replaying all buffer and inode updates,
3309
+ * re-read the superblock and reverify it.
56933310 */
5694
- bp = xfs_getsb(mp, 0);
5695
- bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
5696
- ASSERT(!(bp->b_flags & XBF_WRITE));
5697
- bp->b_flags |= XBF_READ;
5698
- bp->b_ops = &xfs_sb_buf_ops;
5699
-
5700
- error = xfs_buf_submit(bp);
3311
+ xfs_buf_lock(bp);
3312
+ xfs_buf_hold(bp);
3313
+ error = _xfs_buf_read(bp, XBF_READ);
57013314 if (error) {
57023315 if (!XFS_FORCED_SHUTDOWN(mp)) {
5703
- xfs_buf_ioerror_alert(bp, __func__);
3316
+ xfs_buf_ioerror_alert(bp, __this_address);
57043317 ASSERT(0);
57053318 }
57063319 xfs_buf_relse(bp);
....@@ -5708,8 +3321,7 @@
57083321 }
57093322
57103323 /* Convert superblock from on-disk format */
5711
- sbp = &mp->m_sb;
5712
- xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3324
+ xfs_sb_from_disk(sbp, bp->b_addr);
57133325 xfs_buf_relse(bp);
57143326
57153327 /* re-initialise in-core superblock and geometry structures */
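Rather than xfs_getsb() plus a hand-rolled buffer submit, the tail of xlog_do_recover() now locks and holds the cached superblock buffer mp->m_sb_bp, re-reads it with _xfs_buf_read(bp, XBF_READ) so the verifier runs, and decodes bp->b_addr via xfs_sb_from_disk(). A toy version of the decode step, since XFS superblocks are big-endian on disk (the struct fields are an illustrative subset):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>		/* ntohl(): on-disk fields are big-endian */

struct disk_sb { uint32_t magic; uint32_t blocksize; };	/* tiny subset */
struct incore_sb { uint32_t magic; uint32_t blocksize; };

/* Decode the just-reread buffer into the in-core superblock. */
static void sb_from_disk(struct incore_sb *to, const void *b_addr)
{
	struct disk_sb from;

	memcpy(&from, b_addr, sizeof(from));	/* b_addr may be unaligned */
	to->magic = ntohl(from.magic);
	to->blocksize = ntohl(from.blocksize);
}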
....@@ -5838,6 +3450,15 @@
58383450 int error;
58393451 error = xlog_recover_process_intents(log);
58403452 if (error) {
3453
+ /*
3454
+ * Cancel all the unprocessed intent items now so that
3455
+ * we don't leave them pinned in the AIL. This can
3456
+ * cause the AIL to livelock on the pinned item if
3457
+ * anyone tries to push the AIL (inode reclaim does
3458
+ * this) before we get around to xfs_log_mount_cancel.
3459
+ */
3460
+ xlog_recover_cancel_intents(log);
3461
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
58413462 xfs_alert(log->l_mp, "Failed to recover intents");
58423463 return error;
58433464 }
....@@ -5864,16 +3485,12 @@
58643485 return 0;
58653486 }
58663487
5867
-int
3488
+void
58683489 xlog_recover_cancel(
58693490 struct xlog *log)
58703491 {
5871
- int error = 0;
5872
-
58733492 if (log->l_flags & XLOG_RECOVERY_NEEDED)
5874
- error = xlog_recover_cancel_intents(log);
5875
-
5876
- return error;
3493
+ xlog_recover_cancel_intents(log);
58773494 }
58783495
58793496 #if defined(DEBUG)
....@@ -5886,7 +3503,6 @@
58863503 struct xlog *log)
58873504 {
58883505 xfs_mount_t *mp;
5889
- xfs_agf_t *agfp;
58903506 xfs_buf_t *agfbp;
58913507 xfs_buf_t *agibp;
58923508 xfs_agnumber_t agno;
....@@ -5906,7 +3522,8 @@
59063522 xfs_alert(mp, "%s agf read failed agno %d error %d",
59073523 __func__, agno, error);
59083524 } else {
5909
- agfp = XFS_BUF_TO_AGF(agfbp);
3525
+ struct xfs_agf *agfp = agfbp->b_addr;
3526
+
59103527 freeblks += be32_to_cpu(agfp->agf_freeblks) +
59113528 be32_to_cpu(agfp->agf_flcount);
59123529 xfs_buf_relse(agfbp);
....@@ -5917,7 +3534,7 @@
59173534 xfs_alert(mp, "%s agi read failed agno %d error %d",
59183535 __func__, agno, error);
59193536 } else {
5920
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3537
+ struct xfs_agi *agi = agibp->b_addr;
59213538
59223539 itotal += be32_to_cpu(agi->agi_count);
59233540 ifree += be32_to_cpu(agi->agi_freecount);