  ..    ..
  10    10  struct xlog;
  11    11  struct xlog_ticket;
  12    12  struct xfs_mount;
  13        -struct xfs_log_callback;
  14    13
  15    14  /*
  16    15  * Flags for log structure
  ..    ..
  41    40  /*
  42    41  * In core log state
  43    42  */
  44        -#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */
  45        -#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
  46        -#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */
  47        -#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
  48        -#define XLOG_STATE_DO_CALLBACK \
  49        - 0x0010 /* Process callback functions */
  50        -#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
  51        -#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
  52        -#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
  53        -#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */
  54        -#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
  55        -#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
        43  +enum xlog_iclog_state {
        44  + XLOG_STATE_ACTIVE, /* Current IC log being written to */
        45  + XLOG_STATE_WANT_SYNC, /* Want to sync this iclog; no more writes */
        46  + XLOG_STATE_SYNCING, /* This IC log is syncing */
        47  + XLOG_STATE_DONE_SYNC, /* Done syncing to disk */
        48  + XLOG_STATE_CALLBACK, /* Callback functions now */
        49  + XLOG_STATE_DIRTY, /* Dirty IC log, not ready for ACTIVE status */
        50  + XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */
        51  +};
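Since ic_state is now an enum rather than a set of OR-able flag words, state tests in callers become straight equality comparisons instead of bitmask tests. A minimal sketch of the difference, using a hypothetical helper name that is not part of this patch:

```c
/*
 * Sketch only, not from this patch: with ic_state converted to
 * enum xlog_iclog_state, callers compare against a single state
 * rather than masking flag bits.
 */
static inline bool
xlog_iclog_is_active(struct xlog_in_core *iclog)
{
        /* previously: return iclog->ic_state & XLOG_STATE_ACTIVE; */
        return iclog->ic_state == XLOG_STATE_ACTIVE;
}
```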
  56    52
  57    53  /*
  58        - * Flags to log ticket
        54  + * Log ticket flags
  59    55  */
  60        -#define XLOG_TIC_INITED 0x1 /* has been initialized */
  61        -#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
        56  +#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */
  62    57
  63    58  #define XLOG_TIC_FLAGS \
  64        - { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
  65    59  { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
  66    60
  67    61  /*
  ..    ..
 179   173  * the iclog.
 180   174  * - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
 181   175  * - ic_next is the pointer to the next iclog in the ring.
 182        - * - ic_bp is a pointer to the buffer used to write this incore log to disk.
 183   176  * - ic_log is a pointer back to the global log structure.
 184        - * - ic_callback is a linked list of callback function/argument pairs to be
 185        - * called after an iclog finishes writing.
 186        - * - ic_size is the full size of the header plus data.
       177  + * - ic_size is the full size of the log buffer, minus the cycle headers.
 187   178  * - ic_offset is the current number of bytes written to in this iclog.
 188   179  * - ic_refcnt is bumped when someone is writing to the log.
 189   180  * - ic_state is the state of the iclog.
  ..    ..
 193   184  * structure cacheline aligned. The following fields can be contended on
 194   185  * by independent processes:
 195   186  *
 196        - * - ic_callback_*
       187  + * - ic_callbacks
 197   188  * - ic_refcnt
 198   189  * - fields protected by the global l_icloglock
 199   190  *
  ..    ..
 206   197  wait_queue_head_t ic_write_wait;
 207   198  struct xlog_in_core *ic_next;
 208   199  struct xlog_in_core *ic_prev;
 209        - struct xfs_buf *ic_bp;
 210   200  struct xlog *ic_log;
 211        - int ic_size;
 212        - int ic_offset;
 213        - int ic_bwritecnt;
 214        - unsigned short ic_state;
       201  + u32 ic_size;
       202  + u32 ic_offset;
       203  + enum xlog_iclog_state ic_state;
 215   204  char *ic_datap; /* pointer to iclog data */
 216   205
 217   206  /* Callback structures need their own cacheline */
 218   207  spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
 219        - struct xfs_log_callback *ic_callback;
 220        - struct xfs_log_callback **ic_callback_tail;
       208  + struct list_head ic_callbacks;
 221   209
 222   210  /* reference counts need their own cacheline */
 223   211  atomic_t ic_refcnt ____cacheline_aligned_in_smp;
 224   212  xlog_in_core_2_t *ic_data;
 225   213  #define ic_header ic_data->hic_header
       214  +#ifdef DEBUG
       215  + bool ic_fail_crc : 1;
       216  +#endif
       217  + struct semaphore ic_sema;
       218  + struct work_struct ic_end_io_work;
       219  + struct bio ic_bio;
       220  + struct bio_vec ic_bvec[];
 226   221  } xlog_in_core_t;
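With the hand-rolled ic_callback/ic_callback_tail pair replaced by a standard list_head, completion work is queued on an iclog by adding a list entry under ic_callback_lock (the xfs_cil_ctx gains a matching iclog_entry member further down in this patch). A minimal sketch of the attach side, using a hypothetical helper name rather than the patch's own code:

```c
/*
 * Sketch only (hypothetical helper): queue a completion entry on an
 * iclog now that ic_callbacks is a standard list_head. The entry is
 * processed and removed when the iclog I/O completes.
 */
static inline void
xlog_iclog_add_callback(struct xlog_in_core *iclog, struct list_head *entry)
{
        spin_lock(&iclog->ic_callback_lock);
        list_add_tail(entry, &iclog->ic_callbacks);
        spin_unlock(&iclog->ic_callback_lock);
}
```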
 227   222
 228   223  /*
  ..    ..
 235   230
 236   231  struct xfs_cil_ctx {
 237   232  struct xfs_cil *cil;
 238        - xfs_lsn_t sequence; /* chkpt sequence # */
       233  + xfs_csn_t sequence; /* chkpt sequence # */
 239   234  xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
 240   235  xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
 241   236  struct xlog_ticket *ticket; /* chkpt ticket */
  ..    ..
 243   238  int space_used; /* aggregate size of regions */
 244   239  struct list_head busy_extents; /* busy extents in chkpt */
 245   240  struct xfs_log_vec *lv_chain; /* logvecs being pushed */
 246        - struct xfs_log_callback log_cb; /* completion callback hook. */
       241  + struct list_head iclog_entry;
 247   242  struct list_head committing; /* ctx committing list */
 248   243  struct work_struct discard_endio_work;
 249   244  };
  ..    ..
 273   268  struct xfs_cil_ctx *xc_ctx;
 274   269
 275   270  spinlock_t xc_push_lock ____cacheline_aligned_in_smp;
 276        - xfs_lsn_t xc_push_seq;
       271  + xfs_csn_t xc_push_seq;
 277   272  struct list_head xc_committing;
 278   273  wait_queue_head_t xc_commit_wait;
 279        - xfs_lsn_t xc_current_sequence;
       274  + xfs_csn_t xc_current_sequence;
 280   275  struct work_struct xc_push_work;
       276  + wait_queue_head_t xc_push_wait; /* background push throttle */
 281   277  } ____cacheline_aligned_in_smp;
 282   278
 283   279  /*
  ..    ..
 321   317  * tries to keep 25% of the log free, so we need to keep below that limit or we
 322   318  * risk running out of free log space to start any new transactions.
 323   319  *
 324        - * In order to keep background CIL push efficient, we will set a lower
 325        - * threshold at which background pushing is attempted without blocking current
 326        - * transaction commits. A separate, higher bound defines when CIL pushes are
 327        - * enforced to ensure we stay within our maximum checkpoint size bounds.
 328        - * threshold, yet give us plenty of space for aggregation on large logs.
       320  + * In order to keep background CIL push efficient, we only need to ensure the
       321  + * CIL is large enough to maintain sufficient in-memory relogging to avoid
       322  + * repeated physical writes of frequently modified metadata. If we allow the CIL
       323  + * to grow to a substantial fraction of the log, then we may be pinning hundreds
       324  + * of megabytes of metadata in memory until the CIL flushes. This can cause
       325  + * issues when we are running low on memory - pinned memory cannot be reclaimed,
       326  + * and the CIL consumes a lot of memory. Hence we need to set an upper physical
       327  + * size limit for the CIL that limits the maximum amount of memory pinned by the
       328  + * CIL but does not limit performance by reducing relogging efficiency
       329  + * significantly.
       330  + *
       331  + * As such, the CIL push threshold ends up being the smaller of two thresholds:
       332  + * - a threshold large enough that it allows CIL to be pushed and progress to be
       333  + * made without excessive blocking of incoming transaction commits. This is
       334  + * defined to be 12.5% of the log space - half the 25% push threshold of the
       335  + * AIL.
       336  + * - small enough that it doesn't pin excessive amounts of memory but maintains
       337  + * close to peak relogging efficiency. This is defined to be 16x the iclog
       338  + * buffer window (32MB) as measurements have shown this to be roughly the
       339  + * point of diminishing performance increases under highly concurrent
       340  + * modification workloads.
       341  + *
       342  + * To prevent the CIL from overflowing upper commit size bounds, we introduce a
       343  + * new threshold at which we block committing transactions until the background
       344  + * CIL commit commences and switches to a new context. While this is not a hard
       345  + * limit, it forces the process committing a transaction to the CIL to block and
       346  + * yield the CPU, giving the CIL push work a chance to be scheduled and start
       347  + * work. This prevents a process running lots of transactions from overfilling
       348  + * the CIL because it is not yielding the CPU. We set the blocking limit at
       349  + * twice the background push space threshold so we keep in line with the AIL
       350  + * push thresholds.
       351  + *
       352  + * Note: this is not a -hard- limit as blocking is applied after the transaction
       353  + * is inserted into the CIL and the push has been triggered. It is largely a
       354  + * throttling mechanism that allows the CIL push to be scheduled and run. A hard
       355  + * limit will be difficult to implement without introducing global serialisation
       356  + * in the CIL commit fast path, and it's not at all clear that we actually need
       357  + * such hard limits given the ~7 years we've run without a hard limit before
       358  + * finding the first situation where a checkpoint size overflow actually
       359  + * occurred. Hence the simple throttle, and an ASSERT check to tell us that
       360  + * we've overrun the max size.
 329   361  */
 330        -#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
       362  +#define XLOG_CIL_SPACE_LIMIT(log) \
       363  + min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
       364  +
       365  +#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \
       366  + (XLOG_CIL_SPACE_LIMIT(log) * 2)
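To make the two thresholds concrete, here is the macro arithmetic with the constants folded in, assuming a v2 log where BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4 evaluates to the 32MB window described in the comment above:

```c
/*
 * Worked example (illustrative only):
 *
 *   2GB log:   XLOG_CIL_SPACE_LIMIT          = min(2GB >> 3, 32MB)  = 32MB
 *              XLOG_CIL_BLOCKING_SPACE_LIMIT = 2 * 32MB             = 64MB
 *
 *   64MB log:  XLOG_CIL_SPACE_LIMIT          = min(64MB >> 3, 32MB) = 8MB
 *              XLOG_CIL_BLOCKING_SPACE_LIMIT = 2 * 8MB              = 16MB
 */
```

So small logs remain bounded by the 12.5% rule, while large logs are capped by the fixed 32MB window and begin throttling commits at 64MB.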
 331   367
 332   368  /*
 333   369  * ticket grant locks, queues and accounting have their own cachlines
  ..    ..
 350   386  struct xfs_mount *l_mp; /* mount point */
 351   387  struct xfs_ail *l_ailp; /* AIL log is working with */
 352   388  struct xfs_cil *l_cilp; /* CIL log is working with */
 353        - struct xfs_buf *l_xbuf; /* extra buffer for log
 354        - * wrapping */
 355   389  struct xfs_buftarg *l_targ; /* buftarg of log */
       390  + struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */
 356   391  struct delayed_work l_work; /* background flush work */
 357   392  uint l_flags;
 358   393  uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
  ..    ..
 361   396  int l_iclog_heads; /* # of iclog header sectors */
 362   397  uint l_sectBBsize; /* sector size in BBs (2^n) */
 363   398  int l_iclog_size; /* size of log in bytes */
 364        - int l_iclog_size_log; /* log power size of log */
 365   399  int l_iclog_bufs; /* number of iclog buffers */
 366   400  xfs_daddr_t l_logBBstart; /* start block of log */
 367   401  int l_logsize; /* size of log in bytes */
  ..    ..
 399   433  /* The following field are used for debugging; need to hold icloglock */
 400   434  #ifdef DEBUG
 401   435  void *l_iclog_bak[XLOG_MAX_ICLOGS];
 402        - /* log record crc error injection factor */
 403        - uint32_t l_badcrc_factor;
 404   436  #endif
 405   437  /* log recovery lsn tracking (for buffer submission */
 406   438  xfs_lsn_t l_recovery_lsn;
  ..    ..
 409   441  #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
 410   442  ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
 411   443
 412        -#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
       444  +#define XLOG_FORCED_SHUTDOWN(log) \
       445  + (unlikely((log)->l_flags & XLOG_IO_ERROR))
 413   446
 414   447  /* common routines */
 415   448  extern int
  ..    ..
 418   451  extern int
 419   452  xlog_recover_finish(
 420   453  struct xlog *log);
 421        -extern int
       454  +extern void
 422   455  xlog_recover_cancel(struct xlog *);
 423   456
 424   457  extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
  ..    ..
 431   464  int unit_bytes,
 432   465  int count,
 433   466  char client,
 434        - bool permanent,
 435        - xfs_km_flags_t alloc_flags);
 436        -
       467  + bool permanent);
 437   468
 438   469  static inline void
 439   470  xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
  ..    ..
 445   476
 446   477  void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
 447   478  void xlog_print_trans(struct xfs_trans *);
 448        -int
 449        -xlog_write(
 450        - struct xlog *log,
 451        - struct xfs_log_vec *log_vector,
 452        - struct xlog_ticket *tic,
 453        - xfs_lsn_t *start_lsn,
 454        - struct xlog_in_core **commit_iclog,
 455        - uint flags);
       479  +int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
       480  + struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
       481  + struct xlog_in_core **commit_iclog, uint flags,
       482  + bool need_start_rec);
       483  +int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
       484  + struct xlog_in_core **iclog, xfs_lsn_t *lsn);
       485  +void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
       486  +void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
 456   487
 457   488  /*
 458   489  * When we crack an atomic LSN, we sample it first so that the value will not
  ..    ..
 516   547  void xlog_cil_init_post_recovery(struct xlog *log);
 517   548  void xlog_cil_destroy(struct xlog *log);
 518   549  bool xlog_cil_empty(struct xlog *log);
       550  +void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp,
       551  + xfs_csn_t *commit_seq, bool regrant);
 519   552
 520   553  /*
 521   554  * CIL force routines
 522   555  */
 523        -xfs_lsn_t
 524        -xlog_cil_force_lsn(
 525        - struct xlog *log,
 526        - xfs_lsn_t sequence);
       556  +xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);
 527   557
 528   558  static inline void
 529   559  xlog_cil_force(struct xlog *log)
 530   560  {
 531        - xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
       561  + xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence);
 532   562  }
 533        -
 534        -/*
 535        - * Unmount record type is used as a pseudo transaction type for the ticket.
 536        - * It's value must be outside the range of XFS_TRANS_* values.
 537        - */
 538        -#define XLOG_UNMOUNT_REC_TYPE (-1U)
 539   563
 540   564  /*
 541   565  * Wrapper function for waiting on a wait queue serialised against wakeups
 542   566  * by a spinlock. This matches the semantics of all the wait queues used in the
 543   567  * log code.
 544   568  */
 545        -static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
       569  +static inline void
       570  +xlog_wait(
       571  + struct wait_queue_head *wq,
       572  + struct spinlock *lock)
       573  + __releases(lock)
 546   574  {
 547   575  DECLARE_WAITQUEUE(wait, current);
 548   576