From 6778948f9de86c3cfaf36725a7c87dcff9ba247f Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Mon, 11 Dec 2023 08:20:59 +0000
Subject: [PATCH] kernel_5.10 no rt

---
 kernel/fs/xfs/xfs_log_priv.h | 166 ++++++++++++++++++++++++++++++++-----------------------
 1 files changed, 97 insertions(+), 69 deletions(-)

diff --git a/kernel/fs/xfs/xfs_log_priv.h b/kernel/fs/xfs/xfs_log_priv.h
index b5f82cb..42cd160 100644
--- a/kernel/fs/xfs/xfs_log_priv.h
+++ b/kernel/fs/xfs/xfs_log_priv.h
@@ -10,7 +10,6 @@
 struct xlog;
 struct xlog_ticket;
 struct xfs_mount;
-struct xfs_log_callback;
 /*
  * Flags for log structure
  */
@@ -41,27 +40,22 @@
 /*
  * In core log state
  */
-#define XLOG_STATE_ACTIVE    0x0001 /* Current IC log being written to */
-#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
-#define XLOG_STATE_SYNCING   0x0004 /* This IC log is syncing */
-#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
-#define XLOG_STATE_DO_CALLBACK \
-	0x0010 /* Process callback functions */
-#define XLOG_STATE_CALLBACK  0x0020 /* Callback functions now */
-#define XLOG_STATE_DIRTY     0x0040 /* Dirty IC log, not ready for ACTIVE status*/
-#define XLOG_STATE_IOERROR   0x0080 /* IO error happened in sync'ing log */
-#define XLOG_STATE_IOABORT   0x0100 /* force abort on I/O completion (debug) */
-#define XLOG_STATE_ALL       0x7FFF /* All possible valid flags */
-#define XLOG_STATE_NOTUSED   0x8000 /* This IC log not being used */
+enum xlog_iclog_state {
+	XLOG_STATE_ACTIVE,	/* Current IC log being written to */
+	XLOG_STATE_WANT_SYNC,	/* Want to sync this iclog; no more writes */
+	XLOG_STATE_SYNCING,	/* This IC log is syncing */
+	XLOG_STATE_DONE_SYNC,	/* Done syncing to disk */
+	XLOG_STATE_CALLBACK,	/* Callback functions now */
+	XLOG_STATE_DIRTY,	/* Dirty IC log, not ready for ACTIVE status */
+	XLOG_STATE_IOERROR,	/* IO error happened in sync'ing log */
+};
 
 /*
- * Flags to log ticket
+ * Log ticket flags
  */
-#define XLOG_TIC_INITED		0x1	/* has been initialized */
-#define XLOG_TIC_PERM_RESERV	0x2	/* permanent reservation */
+#define XLOG_TIC_PERM_RESERV	0x1	/* permanent reservation */
 
 #define XLOG_TIC_FLAGS \
-	{ XLOG_TIC_INITED,	"XLOG_TIC_INITED" }, \
 	{ XLOG_TIC_PERM_RESERV,	"XLOG_TIC_PERM_RESERV" }
 
 /*
@@ -179,11 +173,8 @@
  *	the iclog.
  * - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
  * - ic_next is the pointer to the next iclog in the ring.
- * - ic_bp is a pointer to the buffer used to write this incore log to disk.
  * - ic_log is a pointer back to the global log structure.
- * - ic_callback is a linked list of callback function/argument pairs to be
- *	called after an iclog finishes writing.
- * - ic_size is the full size of the header plus data.
+ * - ic_size is the full size of the log buffer, minus the cycle headers.
  * - ic_offset is the current number of bytes written to in this iclog.
  * - ic_refcnt is bumped when someone is writing to the log.
  * - ic_state is the state of the iclog.
@@ -193,7 +184,7 @@
  * structure cacheline aligned. The following fields can be contended on
  * by independent processes:
  *
- *	- ic_callback_*
+ *	- ic_callbacks
  *	- ic_refcnt
  *	- fields protected by the global l_icloglock
  *
@@ -206,23 +197,27 @@
 	wait_queue_head_t	ic_write_wait;
 	struct xlog_in_core	*ic_next;
 	struct xlog_in_core	*ic_prev;
-	struct xfs_buf		*ic_bp;
 	struct xlog		*ic_log;
-	int			ic_size;
-	int			ic_offset;
-	int			ic_bwritecnt;
-	unsigned short		ic_state;
+	u32			ic_size;
+	u32			ic_offset;
+	enum xlog_iclog_state	ic_state;
 	char			*ic_datap;	/* pointer to iclog data */
 
 	/* Callback structures need their own cacheline */
 	spinlock_t		ic_callback_lock ____cacheline_aligned_in_smp;
-	struct xfs_log_callback	*ic_callback;
-	struct xfs_log_callback	**ic_callback_tail;
+	struct list_head	ic_callbacks;
 
 	/* reference counts need their own cacheline */
 	atomic_t		ic_refcnt ____cacheline_aligned_in_smp;
 	xlog_in_core_2_t	*ic_data;
 #define ic_header	ic_data->hic_header
+#ifdef DEBUG
+	bool			ic_fail_crc : 1;
+#endif
+	struct semaphore	ic_sema;
+	struct work_struct	ic_end_io_work;
+	struct bio		ic_bio;
+	struct bio_vec		ic_bvec[];
 } xlog_in_core_t;
 
 /*
@@ -235,7 +230,7 @@
 struct xfs_cil_ctx {
 	struct xfs_cil		*cil;
-	xfs_lsn_t		sequence;	/* chkpt sequence # */
+	xfs_csn_t		sequence;	/* chkpt sequence # */
 	xfs_lsn_t		start_lsn;	/* first LSN of chkpt commit */
 	xfs_lsn_t		commit_lsn;	/* chkpt commit record lsn */
 	struct xlog_ticket	*ticket;	/* chkpt ticket */
@@ -243,7 +238,7 @@
 	int			space_used;	/* aggregate size of regions */
 	struct list_head	busy_extents;	/* busy extents in chkpt */
 	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
-	struct xfs_log_callback	log_cb;		/* completion callback hook. */
+	struct list_head	iclog_entry;
 	struct list_head	committing;	/* ctx committing list */
 	struct work_struct	discard_endio_work;
 };
@@ -273,11 +268,12 @@
 	struct xfs_cil_ctx	*xc_ctx;
 
 	spinlock_t		xc_push_lock ____cacheline_aligned_in_smp;
-	xfs_lsn_t		xc_push_seq;
+	xfs_csn_t		xc_push_seq;
 	struct list_head	xc_committing;
 	wait_queue_head_t	xc_commit_wait;
-	xfs_lsn_t		xc_current_sequence;
+	xfs_csn_t		xc_current_sequence;
 	struct work_struct	xc_push_work;
+	wait_queue_head_t	xc_push_wait;	/* background push throttle */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -321,13 +317,53 @@
  * tries to keep 25% of the log free, so we need to keep below that limit or we
  * risk running out of free log space to start any new transactions.
  *
- * In order to keep background CIL push efficient, we will set a lower
- * threshold at which background pushing is attempted without blocking current
- * transaction commits. A separate, higher bound defines when CIL pushes are
- * enforced to ensure we stay within our maximum checkpoint size bounds.
- * threshold, yet give us plenty of space for aggregation on large logs.
+ * In order to keep background CIL push efficient, we only need to ensure the
+ * CIL is large enough to maintain sufficient in-memory relogging to avoid
+ * repeated physical writes of frequently modified metadata. If we allow the CIL
+ * to grow to a substantial fraction of the log, then we may be pinning hundreds
+ * of megabytes of metadata in memory until the CIL flushes. This can cause
+ * issues when we are running low on memory - pinned memory cannot be reclaimed,
+ * and the CIL consumes a lot of memory. Hence we need to set an upper physical
+ * size limit for the CIL that limits the maximum amount of memory pinned by the
+ * CIL but does not limit performance by reducing relogging efficiency
+ * significantly.
+ *
+ * As such, the CIL push threshold ends up being the smaller of two thresholds:
+ * - a threshold large enough that it allows CIL to be pushed and progress to be
+ *   made without excessive blocking of incoming transaction commits. This is
+ *   defined to be 12.5% of the log space - half the 25% push threshold of the
+ *   AIL.
+ * - small enough that it doesn't pin excessive amounts of memory but maintains
+ *   close to peak relogging efficiency. This is defined to be 16x the iclog
+ *   buffer window (32MB) as measurements have shown this to be roughly the
+ *   point of diminishing performance increases under highly concurrent
+ *   modification workloads.
+ *
+ * To prevent the CIL from overflowing upper commit size bounds, we introduce a
+ * new threshold at which we block committing transactions until the background
+ * CIL commit commences and switches to a new context. While this is not a hard
+ * limit, it forces the process committing a transaction to the CIL to block and
+ * yield the CPU, giving the CIL push work a chance to be scheduled and start
+ * work. This prevents a process running lots of transactions from overfilling
+ * the CIL because it is not yielding the CPU. We set the blocking limit at
+ * twice the background push space threshold so we keep in line with the AIL
+ * push thresholds.
+ *
+ * Note: this is not a -hard- limit as blocking is applied after the transaction
+ * is inserted into the CIL and the push has been triggered. It is largely a
+ * throttling mechanism that allows the CIL push to be scheduled and run. A hard
+ * limit will be difficult to implement without introducing global serialisation
+ * in the CIL commit fast path, and it's not at all clear that we actually need
+ * such hard limits given the ~7 years we've run without a hard limit before
+ * finding the first situation where a checkpoint size overflow actually
+ * occurred. Hence the simple throttle, and an ASSERT check to tell us that
+ * we've overrun the max size.
  */
-#define XLOG_CIL_SPACE_LIMIT(log)	(log->l_logsize >> 3)
+#define XLOG_CIL_SPACE_LIMIT(log)	\
+	min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
+
+#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log)	\
+	(XLOG_CIL_SPACE_LIMIT(log) * 2)
 
 /*
  * ticket grant locks, queues and accounting have their own cachlines
  */
@@ -350,9 +386,8 @@
 	struct xfs_mount	*l_mp;		/* mount point */
 	struct xfs_ail		*l_ailp;	/* AIL log is working with */
 	struct xfs_cil		*l_cilp;	/* CIL log is working with */
-	struct xfs_buf		*l_xbuf;	/* extra buffer for log
-						 * wrapping */
 	struct xfs_buftarg	*l_targ;	/* buftarg of log */
+	struct workqueue_struct	*l_ioend_workqueue; /* for I/O completions */
 	struct delayed_work	l_work;		/* background flush work */
 	uint			l_flags;
 	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
@@ -361,7 +396,6 @@
 	int			l_iclog_heads;	/* # of iclog header sectors */
 	uint			l_sectBBsize;	/* sector size in BBs (2^n) */
 	int			l_iclog_size;	/* size of log in bytes */
-	int			l_iclog_size_log; /* log power size of log */
 	int			l_iclog_bufs;	/* number of iclog buffers */
 	xfs_daddr_t		l_logBBstart;	/* start block of log */
 	int			l_logsize;	/* size of log in bytes */
@@ -399,8 +433,6 @@
 	/* The following field are used for debugging; need to hold icloglock */
 #ifdef DEBUG
 	void			*l_iclog_bak[XLOG_MAX_ICLOGS];
-	/* log record crc error injection factor */
-	uint32_t		l_badcrc_factor;
 #endif
 	/* log recovery lsn tracking (for buffer submission */
 	xfs_lsn_t		l_recovery_lsn;
@@ -409,7 +441,8 @@
 #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
 	((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
 
-#define XLOG_FORCED_SHUTDOWN(log)	((log)->l_flags & XLOG_IO_ERROR)
+#define XLOG_FORCED_SHUTDOWN(log) \
+	(unlikely((log)->l_flags & XLOG_IO_ERROR))
 
 /* common routines */
 extern int
@@ -418,7 +451,7 @@ extern int
 xlog_recover_finish(
 	struct xlog *log);
-extern int
+extern void
 xlog_recover_cancel(struct xlog *);
 
 extern __le32	 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
@@ -431,9 +464,7 @@
 	int		unit_bytes,
 	int		count,
 	char		client,
-	bool		permanent,
-	xfs_km_flags_t	alloc_flags);
-
+	bool		permanent);
 
 static inline void
 xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
@@ -445,14 +476,14 @@
 void	xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
 void	xlog_print_trans(struct xfs_trans *);
-int
-xlog_write(
-	struct xlog		*log,
-	struct xfs_log_vec	*log_vector,
-	struct xlog_ticket	*tic,
-	xfs_lsn_t		*start_lsn,
-	struct xlog_in_core	**commit_iclog,
-	uint			flags);
+int	xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
+		struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+		struct xlog_in_core **commit_iclog, uint flags,
+		bool need_start_rec);
+int	xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
+		struct xlog_in_core **iclog, xfs_lsn_t *lsn);
+void	xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
+void	xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
 
 /*
  * When we crack an atomic LSN, we sample it first so that the value will not
@@ -516,33 +547,30 @@
 void	xlog_cil_init_post_recovery(struct xlog *log);
 void	xlog_cil_destroy(struct xlog *log);
 bool	xlog_cil_empty(struct xlog *log);
+void	xlog_cil_commit(struct xlog *log, struct xfs_trans *tp,
+			xfs_csn_t *commit_seq, bool regrant);
 
 /*
  * CIL force routines
  */
-xfs_lsn_t
-xlog_cil_force_lsn(
-	struct xlog *log,
-	xfs_lsn_t sequence);
+xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);
 
 static inline void
 xlog_cil_force(struct xlog *log)
 {
-	xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+	xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence);
 }
 
-
-/*
- * Unmount record type is used as a pseudo transaction type for the ticket.
- * It's value must be outside the range of XFS_TRANS_* values.
- */
-#define XLOG_UNMOUNT_REC_TYPE	(-1U)
 
 /*
  * Wrapper function for waiting on a wait queue serialised against wakeups
  * by a spinlock. This matches the semantics of all the wait queues used in the
  * log code.
  */
-static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+static inline void
+xlog_wait(
	struct wait_queue_head	*wq,
+	struct spinlock		*lock)
+		__releases(lock)
 {
 	DECLARE_WAITQUEUE(wait, current);
-- 
Gitblit v1.6.2
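
For reference, the arithmetic behind the two CIL thresholds described in the comment above can be sketched outside the kernel. The sketch below is a stand-alone user-space approximation, not code from this patch: it assumes the 32MB cap that the comment derives from "16x the iclog buffer window", and the helper names cil_space_limit()/cil_blocking_limit() are illustrative stand-ins for the XLOG_CIL_SPACE_LIMIT() and XLOG_CIL_BLOCKING_SPACE_LIMIT() macros.

#include <stdio.h>

#define MB		(1024L * 1024L)

/* assumed cap: 16 x 2MB iclog buffer window = 32MB, per the comment above */
#define CIL_MAX_SPACE	(32L * MB)

/* background push threshold: 12.5% of the log, capped at the assumed 32MB */
static long cil_space_limit(long logsize)
{
	long limit = logsize >> 3;

	return limit < CIL_MAX_SPACE ? limit : CIL_MAX_SPACE;
}

/* blocking (throttle) threshold: twice the background push threshold */
static long cil_blocking_limit(long logsize)
{
	return cil_space_limit(logsize) * 2;
}

int main(void)
{
	long sizes[] = { 64L * MB, 512L * MB, 1024L * MB };

	for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("log %4ldMB -> push at %2ldMB, throttle at %2ldMB\n",
		       sizes[i] / MB, cil_space_limit(sizes[i]) / MB,
		       cil_blocking_limit(sizes[i]) / MB);
	return 0;
}

With a 64MB log the push threshold works out to 8MB and the throttle to 16MB; past a 256MB log the 12.5% figure exceeds the assumed 32MB cap, so the thresholds pin at 32MB and 64MB, which is the behaviour the min_t() in the new XLOG_CIL_SPACE_LIMIT() macro encodes.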