...
 struct xlog;
 struct xlog_ticket;
 struct xfs_mount;
-struct xfs_log_callback;

 /*
  * Flags for log structure
...
 /*
  * In core log state
  */
-#define XLOG_STATE_ACTIVE    0x0001 /* Current IC log being written to */
-#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
-#define XLOG_STATE_SYNCING   0x0004 /* This IC log is syncing */
-#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
-#define XLOG_STATE_DO_CALLBACK \
-        0x0010 /* Process callback functions */
-#define XLOG_STATE_CALLBACK  0x0020 /* Callback functions now */
-#define XLOG_STATE_DIRTY     0x0040 /* Dirty IC log, not ready for ACTIVE status*/
-#define XLOG_STATE_IOERROR   0x0080 /* IO error happened in sync'ing log */
-#define XLOG_STATE_IOABORT   0x0100 /* force abort on I/O completion (debug) */
-#define XLOG_STATE_ALL       0x7FFF /* All possible valid flags */
-#define XLOG_STATE_NOTUSED   0x8000 /* This IC log not being used */
+enum xlog_iclog_state {
+        XLOG_STATE_ACTIVE,      /* Current IC log being written to */
+        XLOG_STATE_WANT_SYNC,   /* Want to sync this iclog; no more writes */
+        XLOG_STATE_SYNCING,     /* This IC log is syncing */
+        XLOG_STATE_DONE_SYNC,   /* Done syncing to disk */
+        XLOG_STATE_CALLBACK,    /* Callback functions now */
+        XLOG_STATE_DIRTY,       /* Dirty IC log, not ready for ACTIVE status */
+        XLOG_STATE_IOERROR,     /* IO error happened in sync'ing log */
+};

 /*
- * Flags to log ticket
+ * Log ticket flags
  */
-#define XLOG_TIC_INITED         0x1     /* has been initialized */
-#define XLOG_TIC_PERM_RESERV    0x2     /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV    0x1     /* permanent reservation */

 #define XLOG_TIC_FLAGS \
-        { XLOG_TIC_INITED,      "XLOG_TIC_INITED" }, \
         { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }

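With the state flags collapsed into enum xlog_iclog_state, an iclog is in exactly one state at a time, so callers compare ic_state directly (or switch on it) rather than testing bits. A minimal illustration of the difference; example_iclog_is_active() is a hypothetical helper, not code from this patch:

static bool
example_iclog_is_active(struct xlog_in_core *iclog)
{
        /* flag-based code would have tested: iclog->ic_state & XLOG_STATE_ACTIVE */
        return iclog->ic_state == XLOG_STATE_ACTIVE;
}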
 /*
...
  * the iclog.
  * - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
  * - ic_next is the pointer to the next iclog in the ring.
- * - ic_bp is a pointer to the buffer used to write this incore log to disk.
  * - ic_log is a pointer back to the global log structure.
- * - ic_callback is a linked list of callback function/argument pairs to be
- *   called after an iclog finishes writing.
- * - ic_size is the full size of the header plus data.
+ * - ic_size is the full size of the log buffer, minus the cycle headers.
  * - ic_offset is the current number of bytes written to in this iclog.
  * - ic_refcnt is bumped when someone is writing to the log.
  * - ic_state is the state of the iclog.
...
  * structure cacheline aligned. The following fields can be contended on
  * by independent processes:
  *
- *      - ic_callback_*
+ *      - ic_callbacks
  *      - ic_refcnt
  *      - fields protected by the global l_icloglock
  *
...
         wait_queue_head_t       ic_write_wait;
         struct xlog_in_core     *ic_next;
         struct xlog_in_core     *ic_prev;
-        struct xfs_buf          *ic_bp;
         struct xlog             *ic_log;
-        int                     ic_size;
-        int                     ic_offset;
-        int                     ic_bwritecnt;
-        unsigned short          ic_state;
+        u32                     ic_size;
+        u32                     ic_offset;
+        enum xlog_iclog_state   ic_state;
         char                    *ic_datap;      /* pointer to iclog data */

         /* Callback structures need their own cacheline */
         spinlock_t              ic_callback_lock ____cacheline_aligned_in_smp;
-        struct xfs_log_callback *ic_callback;
-        struct xfs_log_callback **ic_callback_tail;
+        struct list_head        ic_callbacks;

         /* reference counts need their own cacheline */
         atomic_t                ic_refcnt ____cacheline_aligned_in_smp;
         xlog_in_core_2_t        *ic_data;
 #define ic_header ic_data->hic_header
+#ifdef DEBUG
+        bool                    ic_fail_crc : 1;
+#endif
+        struct semaphore        ic_sema;
+        struct work_struct      ic_end_io_work;
+        struct bio              ic_bio;
+        struct bio_vec          ic_bvec[];
 } xlog_in_core_t;

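The iclog now owns its I/O directly: an embedded struct bio plus a flexible ic_bvec[] array replaces the old xfs_buf, with completion handled via ic_end_io_work and writes serialised by ic_sema. Because ic_bvec[] is a flexible array member, the iclog allocation has to reserve one bio_vec per page of iclog data. A minimal sketch of that sizing, assuming the XFS kmem_zalloc()/KM_MAYFAIL wrappers of this era; example_iclog_alloc() is hypothetical, and the real allocation lives in xlog_alloc_log():

static struct xlog_in_core *
example_iclog_alloc(struct xlog *log)
{
        /* one bio_vec for each page of iclog data */
        size_t bvec_size = DIV_ROUND_UP(log->l_iclog_size, PAGE_SIZE) *
                                sizeof(struct bio_vec);

        return kmem_zalloc(sizeof(struct xlog_in_core) + bvec_size, KM_MAYFAIL);
}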
 /*
...

 struct xfs_cil_ctx {
         struct xfs_cil          *cil;
-        xfs_lsn_t               sequence;       /* chkpt sequence # */
+        xfs_csn_t               sequence;       /* chkpt sequence # */
         xfs_lsn_t               start_lsn;      /* first LSN of chkpt commit */
         xfs_lsn_t               commit_lsn;     /* chkpt commit record lsn */
         struct xlog_ticket      *ticket;        /* chkpt ticket */
...
         int                     space_used;     /* aggregate size of regions */
         struct list_head        busy_extents;   /* busy extents in chkpt */
         struct xfs_log_vec      *lv_chain;      /* logvecs being pushed */
-        struct xfs_log_callback log_cb;         /* completion callback hook. */
+        struct list_head        iclog_entry;
         struct list_head        committing;     /* ctx committing list */
         struct work_struct      discard_endio_work;
 };
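Checkpoint sequence numbers now get their own type, xfs_csn_t, so CIL sequences are no longer conflated with on-disk LSNs. A sketch of how a caller records the sequence handed back by xlog_cil_commit() and later forces the log up to that checkpoint, using only prototypes declared later in this header (illustrative only, not the actual transaction commit path; error handling omitted):

static void
example_commit_and_force(struct xlog *log, struct xfs_trans *tp)
{
        xfs_csn_t       seq;

        xlog_cil_commit(log, tp, &seq, false);  /* regrant == false */
        xlog_cil_force_seq(log, seq);           /* push the CIL up to that checkpoint */
}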
...
         struct xfs_cil_ctx      *xc_ctx;

         spinlock_t              xc_push_lock ____cacheline_aligned_in_smp;
-        xfs_lsn_t               xc_push_seq;
+        xfs_csn_t               xc_push_seq;
         struct list_head        xc_committing;
         wait_queue_head_t       xc_commit_wait;
-        xfs_lsn_t               xc_current_sequence;
+        xfs_csn_t               xc_current_sequence;
         struct work_struct      xc_push_work;
+        wait_queue_head_t       xc_push_wait;   /* background push throttle */
 } ____cacheline_aligned_in_smp;

 /*
...
  * tries to keep 25% of the log free, so we need to keep below that limit or we
  * risk running out of free log space to start any new transactions.
  *
- * In order to keep background CIL push efficient, we will set a lower
- * threshold at which background pushing is attempted without blocking current
- * transaction commits. A separate, higher bound defines when CIL pushes are
- * enforced to ensure we stay within our maximum checkpoint size bounds.
- * threshold, yet give us plenty of space for aggregation on large logs.
+ * In order to keep background CIL push efficient, we only need to ensure the
+ * CIL is large enough to maintain sufficient in-memory relogging to avoid
+ * repeated physical writes of frequently modified metadata. If we allow the CIL
+ * to grow to a substantial fraction of the log, then we may be pinning hundreds
+ * of megabytes of metadata in memory until the CIL flushes. This can cause
+ * issues when we are running low on memory - pinned memory cannot be reclaimed,
+ * and the CIL consumes a lot of memory. Hence we need to set an upper physical
+ * size limit for the CIL that limits the maximum amount of memory pinned by the
+ * CIL but does not limit performance by reducing relogging efficiency
+ * significantly.
+ *
+ * As such, the CIL push threshold ends up being the smaller of two thresholds:
+ * - a threshold large enough that it allows CIL to be pushed and progress to be
+ *   made without excessive blocking of incoming transaction commits. This is
+ *   defined to be 12.5% of the log space - half the 25% push threshold of the
+ *   AIL.
+ * - small enough that it doesn't pin excessive amounts of memory but maintains
+ *   close to peak relogging efficiency. This is defined to be 16x the iclog
+ *   buffer window (32MB) as measurements have shown this to be roughly the
+ *   point of diminishing performance increases under highly concurrent
+ *   modification workloads.
+ *
+ * To prevent the CIL from overflowing upper commit size bounds, we introduce a
+ * new threshold at which we block committing transactions until the background
+ * CIL commit commences and switches to a new context. While this is not a hard
+ * limit, it forces the process committing a transaction to the CIL to block and
+ * yield the CPU, giving the CIL push work a chance to be scheduled and start
+ * work. This prevents a process running lots of transactions from overfilling
+ * the CIL because it is not yielding the CPU. We set the blocking limit at
+ * twice the background push space threshold so we keep in line with the AIL
+ * push thresholds.
+ *
+ * Note: this is not a -hard- limit as blocking is applied after the transaction
+ * is inserted into the CIL and the push has been triggered. It is largely a
+ * throttling mechanism that allows the CIL push to be scheduled and run. A hard
+ * limit will be difficult to implement without introducing global serialisation
+ * in the CIL commit fast path, and it's not at all clear that we actually need
+ * such hard limits given the ~7 years we've run without a hard limit before
+ * finding the first situation where a checkpoint size overflow actually
+ * occurred. Hence the simple throttle, and an ASSERT check to tell us that
+ * we've overrun the max size.
  */
-#define XLOG_CIL_SPACE_LIMIT(log)       (log->l_logsize >> 3)
+#define XLOG_CIL_SPACE_LIMIT(log)       \
+        min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
+
+#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log)      \
+        (XLOG_CIL_SPACE_LIMIT(log) * 2)

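To make the two macros concrete: on a 2GB v2 log with the maximum 8 x 256kB iclogs, the iclog buffer window is 2MB, so XLOG_CIL_SPACE_LIMIT() = min(2GB >> 3, 16 x 2MB) = 32MB and XLOG_CIL_BLOCKING_SPACE_LIMIT() = 64MB. A simplified sketch of how the background push path is intended to consult them (illustrative only; example_cil_push_background() is not the actual xlog_cil_push_background() code):

static void
example_cil_push_background(struct xlog *log)
{
        struct xfs_cil  *cil = log->l_cilp;

        spin_lock(&cil->xc_push_lock);
        if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
                /* below the background threshold: keep aggregating in memory */
                spin_unlock(&cil->xc_push_lock);
                return;
        }

        /* over the background threshold: a push would be triggered here */

        if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
                /* throttle the committer until the push work switches contexts */
                xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
                return;
        }
        spin_unlock(&cil->xc_push_lock);
}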
 /*
  * ticket grant locks, queues and accounting have their own cachelines
...
         struct xfs_mount        *l_mp;          /* mount point */
         struct xfs_ail          *l_ailp;        /* AIL log is working with */
         struct xfs_cil          *l_cilp;        /* CIL log is working with */
-        struct xfs_buf          *l_xbuf;        /* extra buffer for log
-                                                 * wrapping */
         struct xfs_buftarg      *l_targ;        /* buftarg of log */
+        struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */
         struct delayed_work     l_work;         /* background flush work */
         uint                    l_flags;
         uint                    l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
...
         int                     l_iclog_heads;  /* # of iclog header sectors */
         uint                    l_sectBBsize;   /* sector size in BBs (2^n) */
         int                     l_iclog_size;   /* size of log in bytes */
-        int                     l_iclog_size_log; /* log power size of log */
         int                     l_iclog_bufs;   /* number of iclog buffers */
         xfs_daddr_t             l_logBBstart;   /* start block of log */
         int                     l_logsize;      /* size of log in bytes */
...
         /* The following fields are used for debugging; need to hold icloglock */
 #ifdef DEBUG
         void                    *l_iclog_bak[XLOG_MAX_ICLOGS];
-        /* log record crc error injection factor */
-        uint32_t                l_badcrc_factor;
 #endif
         /* log recovery lsn tracking (for buffer submission) */
         xfs_lsn_t               l_recovery_lsn;
...
 #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
         ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))

-#define XLOG_FORCED_SHUTDOWN(log)       ((log)->l_flags & XLOG_IO_ERROR)
+#define XLOG_FORCED_SHUTDOWN(log) \
+        (unlikely((log)->l_flags & XLOG_IO_ERROR))

 /* common routines */
 extern int
...
 extern int
 xlog_recover_finish(
         struct xlog *log);
-extern int
+extern void
 xlog_recover_cancel(struct xlog *);

 extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
...
         int             unit_bytes,
         int             count,
         char            client,
-        bool            permanent,
-        xfs_km_flags_t  alloc_flags);
-
+        bool            permanent);

 static inline void
 xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
...

 void    xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
 void    xlog_print_trans(struct xfs_trans *);
-int
-xlog_write(
-        struct xlog             *log,
-        struct xfs_log_vec      *log_vector,
-        struct xlog_ticket      *tic,
-        xfs_lsn_t               *start_lsn,
-        struct xlog_in_core     **commit_iclog,
-        uint                    flags);
+int     xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
+                struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+                struct xlog_in_core **commit_iclog, uint flags,
+                bool need_start_rec);
+int     xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
+                struct xlog_in_core **iclog, xfs_lsn_t *lsn);
+void    xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
+void    xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
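The long-hand xlog_write() prototype is condensed and gains a need_start_rec argument, while writing the commit record and releasing or regranting a ticket's reservation are now exposed as xlog_commit_record(), xfs_log_ticket_ungrant() and xfs_log_ticket_regrant(). A rough sketch of how a caller might string them together for a checkpoint (illustrative only, not the actual CIL push code; the precise flags/need_start_rec semantics live in xfs_log.c, and error handling is omitted):

static int
example_write_checkpoint(struct xlog *log, struct xfs_log_vec *lv_chain,
                struct xlog_ticket *ticket)
{
        struct xlog_in_core     *commit_iclog = NULL;
        xfs_lsn_t               start_lsn, commit_lsn;
        int                     error;

        /* write the vector chain, asking for a start record up front */
        error = xlog_write(log, lv_chain, ticket, &start_lsn, NULL, 0, true);
        if (!error)
                error = xlog_commit_record(log, ticket, &commit_iclog,
                                &commit_lsn);

        /* the ticket's grant space is released once the writes are issued */
        xfs_log_ticket_ungrant(log, ticket);
        return error;
}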

 /*
  * When we crack an atomic LSN, we sample it first so that the value will not
...
 void    xlog_cil_init_post_recovery(struct xlog *log);
 void    xlog_cil_destroy(struct xlog *log);
 bool    xlog_cil_empty(struct xlog *log);
+void    xlog_cil_commit(struct xlog *log, struct xfs_trans *tp,
+                xfs_csn_t *commit_seq, bool regrant);

 /*
  * CIL force routines
  */
-xfs_lsn_t
-xlog_cil_force_lsn(
-        struct xlog *log,
-        xfs_lsn_t sequence);
+xfs_lsn_t       xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);

 static inline void
 xlog_cil_force(struct xlog *log)
 {
-        xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+        xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence);
 }
-
-/*
- * Unmount record type is used as a pseudo transaction type for the ticket.
- * It's value must be outside the range of XFS_TRANS_* values.
- */
-#define XLOG_UNMOUNT_REC_TYPE   (-1U)

 /*
  * Wrapper function for waiting on a wait queue serialised against wakeups
  * by a spinlock. This matches the semantics of all the wait queues used in the
  * log code.
  */
-static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+static inline void
+xlog_wait(
+        struct wait_queue_head  *wq,
+        struct spinlock         *lock)
+                __releases(lock)
 {
         DECLARE_WAITQUEUE(wait, current);

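The reflowed xlog_wait() prototype also gains a __releases(lock) annotation, documenting for sparse that the function drops the spinlock before sleeping (its body continues beyond this excerpt). Callers therefore take the lock, check their condition, and let xlog_wait() release it. A rough sketch of that caller pattern; example_wait_for_iclog_io() is hypothetical, not code from this patch:

static void
example_wait_for_iclog_io(struct xlog *log, struct xlog_in_core *iclog)
{
        spin_lock(&log->l_icloglock);
        if (iclog->ic_state == XLOG_STATE_SYNCING) {
                /* xlog_wait() drops l_icloglock before scheduling */
                xlog_wait(&iclog->ic_write_wait, &log->l_icloglock);
        } else {
                spin_unlock(&log->l_icloglock);
        }
        /* l_icloglock is not held here on either path */
}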