2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/fs/xfs/xfs_log_priv.h
@@ -10,7 +10,6 @@
 struct xlog;
 struct xlog_ticket;
 struct xfs_mount;
-struct xfs_log_callback;

 /*
  * Flags for log structure
@@ -41,27 +40,22 @@
 /*
  * In core log state
  */
-#define XLOG_STATE_ACTIVE    0x0001 /* Current IC log being written to */
-#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
-#define XLOG_STATE_SYNCING   0x0004 /* This IC log is syncing */
-#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
-#define XLOG_STATE_DO_CALLBACK \
-                             0x0010 /* Process callback functions */
-#define XLOG_STATE_CALLBACK  0x0020 /* Callback functions now */
-#define XLOG_STATE_DIRTY     0x0040 /* Dirty IC log, not ready for ACTIVE status*/
-#define XLOG_STATE_IOERROR   0x0080 /* IO error happened in sync'ing log */
-#define XLOG_STATE_IOABORT   0x0100 /* force abort on I/O completion (debug) */
-#define XLOG_STATE_ALL       0x7FFF /* All possible valid flags */
-#define XLOG_STATE_NOTUSED   0x8000 /* This IC log not being used */
+enum xlog_iclog_state {
+        XLOG_STATE_ACTIVE,      /* Current IC log being written to */
+        XLOG_STATE_WANT_SYNC,   /* Want to sync this iclog; no more writes */
+        XLOG_STATE_SYNCING,     /* This IC log is syncing */
+        XLOG_STATE_DONE_SYNC,   /* Done syncing to disk */
+        XLOG_STATE_CALLBACK,    /* Callback functions now */
+        XLOG_STATE_DIRTY,       /* Dirty IC log, not ready for ACTIVE status */
+        XLOG_STATE_IOERROR,     /* IO error happened in sync'ing log */
+};

 /*
- * Flags to log ticket
+ * Log ticket flags
  */
-#define XLOG_TIC_INITED      0x1    /* has been initialized */
-#define XLOG_TIC_PERM_RESERV 0x2    /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV 0x1    /* permanent reservation */

 #define XLOG_TIC_FLAGS \
-        { XLOG_TIC_INITED,      "XLOG_TIC_INITED" }, \
         { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }

 /*
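The hunk above replaces the XLOG_STATE_* bit flags with enum xlog_iclog_state, so an iclog is in exactly one state and callers compare values instead of testing bits. A minimal sketch of what a state check looks like after the conversion; xlog_iclog_usable() is a hypothetical helper for illustration, not part of this patch:

static inline bool
xlog_iclog_usable(struct xlog_in_core *iclog)
{
        /* with the enum, state checks are equality tests, not bit masks */
        return iclog->ic_state == XLOG_STATE_ACTIVE ||
               iclog->ic_state == XLOG_STATE_WANT_SYNC;
}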
@@ -179,11 +173,8 @@
  *      the iclog.
  * - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
  * - ic_next is the pointer to the next iclog in the ring.
- * - ic_bp is a pointer to the buffer used to write this incore log to disk.
  * - ic_log is a pointer back to the global log structure.
- * - ic_callback is a linked list of callback function/argument pairs to be
- *      called after an iclog finishes writing.
- * - ic_size is the full size of the header plus data.
+ * - ic_size is the full size of the log buffer, minus the cycle headers.
  * - ic_offset is the current number of bytes written to in this iclog.
  * - ic_refcnt is bumped when someone is writing to the log.
  * - ic_state is the state of the iclog.
@@ -193,7 +184,7 @@
  * structure cacheline aligned. The following fields can be contended on
  * by independent processes:
  *
- *      - ic_callback_*
+ *      - ic_callbacks
  *      - ic_refcnt
  *      - fields protected by the global l_icloglock
  *
@@ -206,23 +197,27 @@
         wait_queue_head_t       ic_write_wait;
         struct xlog_in_core     *ic_next;
         struct xlog_in_core     *ic_prev;
-        struct xfs_buf          *ic_bp;
         struct xlog             *ic_log;
-        int                     ic_size;
-        int                     ic_offset;
-        int                     ic_bwritecnt;
-        unsigned short          ic_state;
+        u32                     ic_size;
+        u32                     ic_offset;
+        enum xlog_iclog_state   ic_state;
         char                    *ic_datap;      /* pointer to iclog data */

         /* Callback structures need their own cacheline */
         spinlock_t              ic_callback_lock ____cacheline_aligned_in_smp;
-        struct xfs_log_callback *ic_callback;
-        struct xfs_log_callback **ic_callback_tail;
+        struct list_head        ic_callbacks;

         /* reference counts need their own cacheline */
         atomic_t                ic_refcnt ____cacheline_aligned_in_smp;
         xlog_in_core_2_t        *ic_data;
 #define ic_header ic_data->hic_header
+#ifdef DEBUG
+        bool                    ic_fail_crc : 1;
+#endif
+        struct semaphore        ic_sema;
+        struct work_struct      ic_end_io_work;
+        struct bio              ic_bio;
+        struct bio_vec          ic_bvec[];
 } xlog_in_core_t;

 /*
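The struct above now embeds a semaphore, an end-I/O work item, a struct bio and a trailing flexible array of bio_vecs in place of the old ic_bp buffer, so each iclog allocation has to reserve room for its bio_vec table. A sketch of how such an allocation is typically sized, assuming a local nr_pages count; this is illustrative, not the patch's actual allocation code:

static struct xlog_in_core *
xlog_alloc_iclog_sketch(unsigned int nr_pages)
{
        struct xlog_in_core *iclog;

        /* struct_size() accounts for the flexible ic_bvec[] array */
        iclog = kzalloc(struct_size(iclog, ic_bvec, nr_pages), GFP_KERNEL);
        if (!iclog)
                return NULL;
        sema_init(&iclog->ic_sema, 1);  /* serialises reuse of the iclog buffer */
        INIT_LIST_HEAD(&iclog->ic_callbacks);
        return iclog;
}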
@@ -235,7 +230,7 @@

 struct xfs_cil_ctx {
         struct xfs_cil          *cil;
-        xfs_lsn_t               sequence;       /* chkpt sequence # */
+        xfs_csn_t               sequence;       /* chkpt sequence # */
         xfs_lsn_t               start_lsn;      /* first LSN of chkpt commit */
         xfs_lsn_t               commit_lsn;     /* chkpt commit record lsn */
         struct xlog_ticket      *ticket;        /* chkpt ticket */
@@ -243,7 +238,7 @@
         int                     space_used;     /* aggregate size of regions */
         struct list_head        busy_extents;   /* busy extents in chkpt */
         struct xfs_log_vec      *lv_chain;      /* logvecs being pushed */
-        struct xfs_log_callback log_cb;         /* completion callback hook. */
+        struct list_head        iclog_entry;
         struct list_head        committing;     /* ctx committing list */
         struct work_struct      discard_endio_work;
 };
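With xfs_log_callback gone, the checkpoint context carries a plain list_head (iclog_entry) and the iclog keeps a matching ic_callbacks list, so queuing a context for completion processing reduces to list manipulation under ic_callback_lock. A hedged sketch of that pattern; the surrounding code is illustrative rather than the committed implementation:

/* attach a checkpoint context to the iclog it committed into */
spin_lock(&iclog->ic_callback_lock);
list_add_tail(&ctx->iclog_entry, &iclog->ic_callbacks);
spin_unlock(&iclog->ic_callback_lock);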
@@ -273,11 +268,12 @@
         struct xfs_cil_ctx      *xc_ctx;

         spinlock_t              xc_push_lock ____cacheline_aligned_in_smp;
-        xfs_lsn_t               xc_push_seq;
+        xfs_csn_t               xc_push_seq;
         struct list_head        xc_committing;
         wait_queue_head_t       xc_commit_wait;
-        xfs_lsn_t               xc_current_sequence;
+        xfs_csn_t               xc_current_sequence;
         struct work_struct      xc_push_work;
+        wait_queue_head_t       xc_push_wait;   /* background push throttle */
 } ____cacheline_aligned_in_smp;

 /*
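The new xc_push_wait queue backs the commit throttle described in the comment added further down: once the current CIL context passes the blocking space limit, committers sleep here until the background push swaps in a new context. A sketch of that pattern using only fields from this header; the control flow is illustrative, not the committed code:

spin_lock(&cil->xc_push_lock);
if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
        /* xlog_wait() drops xc_push_lock while we sleep */
        ASSERT(cil->xc_ctx->space_used < log->l_logsize);
        xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
} else {
        spin_unlock(&cil->xc_push_lock);
}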
@@ -321,13 +317,53 @@
  * tries to keep 25% of the log free, so we need to keep below that limit or we
  * risk running out of free log space to start any new transactions.
  *
- * In order to keep background CIL push efficient, we will set a lower
- * threshold at which background pushing is attempted without blocking current
- * transaction commits. A separate, higher bound defines when CIL pushes are
- * enforced to ensure we stay within our maximum checkpoint size bounds.
- * threshold, yet give us plenty of space for aggregation on large logs.
+ * In order to keep background CIL push efficient, we only need to ensure the
+ * CIL is large enough to maintain sufficient in-memory relogging to avoid
+ * repeated physical writes of frequently modified metadata. If we allow the CIL
+ * to grow to a substantial fraction of the log, then we may be pinning hundreds
+ * of megabytes of metadata in memory until the CIL flushes. This can cause
+ * issues when we are running low on memory - pinned memory cannot be reclaimed,
+ * and the CIL consumes a lot of memory. Hence we need to set an upper physical
+ * size limit for the CIL that limits the maximum amount of memory pinned by the
+ * CIL but does not limit performance by reducing relogging efficiency
+ * significantly.
+ *
+ * As such, the CIL push threshold ends up being the smaller of two thresholds:
+ * - a threshold large enough that it allows CIL to be pushed and progress to be
+ *   made without excessive blocking of incoming transaction commits. This is
+ *   defined to be 12.5% of the log space - half the 25% push threshold of the
+ *   AIL.
+ * - small enough that it doesn't pin excessive amounts of memory but maintains
+ *   close to peak relogging efficiency. This is defined to be 16x the iclog
+ *   buffer window (32MB) as measurements have shown this to be roughly the
+ *   point of diminishing performance increases under highly concurrent
+ *   modification workloads.
+ *
+ * To prevent the CIL from overflowing upper commit size bounds, we introduce a
+ * new threshold at which we block committing transactions until the background
+ * CIL commit commences and switches to a new context. While this is not a hard
+ * limit, it forces the process committing a transaction to the CIL to block and
+ * yield the CPU, giving the CIL push work a chance to be scheduled and start
+ * work. This prevents a process running lots of transactions from overfilling
+ * the CIL because it is not yielding the CPU. We set the blocking limit at
+ * twice the background push space threshold so we keep in line with the AIL
+ * push thresholds.
+ *
+ * Note: this is not a -hard- limit as blocking is applied after the transaction
+ * is inserted into the CIL and the push has been triggered. It is largely a
+ * throttling mechanism that allows the CIL push to be scheduled and run. A hard
+ * limit will be difficult to implement without introducing global serialisation
+ * in the CIL commit fast path, and it's not at all clear that we actually need
+ * such hard limits given the ~7 years we've run without a hard limit before
+ * finding the first situation where a checkpoint size overflow actually
+ * occurred. Hence the simple throttle, and an ASSERT check to tell us that
+ * we've overrun the max size.
  */
-#define XLOG_CIL_SPACE_LIMIT(log)       (log->l_logsize >> 3)
+#define XLOG_CIL_SPACE_LIMIT(log) \
+        min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
+
+#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \
+        (XLOG_CIL_SPACE_LIMIT(log) * 2)

 /*
  * ticket grant locks, queues and accounting have their own cachlines
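A small worked example of how the two macros above combine, assuming a 2GB log and the 32MB value the comment gives for 16x the iclog buffer window; this is plain user-space C for illustration, not the kernel macros themselves:

#include <stdio.h>

int main(void)
{
        long log_size  = 2048L << 20;   /* assume a 2GB log */
        long relog_cap = 32L << 20;     /* 16x iclog buffer window ~= 32MB */
        long push      = log_size >> 3; /* 12.5% of the log = 256MB */

        if (relog_cap < push)           /* min(): the 32MB cap wins here */
                push = relog_cap;

        printf("background push at %ldMB, commits throttle at %ldMB\n",
               push >> 20, (push * 2) >> 20);   /* prints 32MB and 64MB */
        return 0;
}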
@@ -350,9 +386,8 @@
         struct xfs_mount        *l_mp;          /* mount point */
         struct xfs_ail          *l_ailp;        /* AIL log is working with */
         struct xfs_cil          *l_cilp;        /* CIL log is working with */
-        struct xfs_buf          *l_xbuf;        /* extra buffer for log
-                                                 * wrapping */
         struct xfs_buftarg      *l_targ;        /* buftarg of log */
+        struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */
         struct delayed_work     l_work;         /* background flush work */
         uint                    l_flags;
         uint                    l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
@@ -361,7 +396,6 @@
         int                     l_iclog_heads;  /* # of iclog header sectors */
         uint                    l_sectBBsize;   /* sector size in BBs (2^n) */
         int                     l_iclog_size;   /* size of log in bytes */
-        int                     l_iclog_size_log; /* log power size of log */
         int                     l_iclog_bufs;   /* number of iclog buffers */
         xfs_daddr_t             l_logBBstart;   /* start block of log */
         int                     l_logsize;      /* size of log in bytes */
@@ -399,8 +433,6 @@
         /* The following field are used for debugging; need to hold icloglock */
 #ifdef DEBUG
         void                    *l_iclog_bak[XLOG_MAX_ICLOGS];
-        /* log record crc error injection factor */
-        uint32_t                l_badcrc_factor;
 #endif
         /* log recovery lsn tracking (for buffer submission */
         xfs_lsn_t               l_recovery_lsn;
@@ -409,7 +441,8 @@
 #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
         ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))

-#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
+#define XLOG_FORCED_SHUTDOWN(log) \
+        (unlikely((log)->l_flags & XLOG_IO_ERROR))

 /* common routines */
 extern int
@@ -418,7 +451,7 @@
 extern int
 xlog_recover_finish(
         struct xlog *log);
-extern int
+extern void
 xlog_recover_cancel(struct xlog *);

 extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
@@ -431,9 +464,7 @@
         int             unit_bytes,
         int             count,
         char            client,
-        bool            permanent,
-        xfs_km_flags_t  alloc_flags);
-
+        bool            permanent);

 static inline void
 xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
@@ -445,14 +476,14 @@

 void    xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
 void    xlog_print_trans(struct xfs_trans *);
-int
-xlog_write(
-        struct xlog             *log,
-        struct xfs_log_vec      *log_vector,
-        struct xlog_ticket      *tic,
-        xfs_lsn_t               *start_lsn,
-        struct xlog_in_core     **commit_iclog,
-        uint                    flags);
+int     xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
+                struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+                struct xlog_in_core **commit_iclog, uint flags,
+                bool need_start_rec);
+int     xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
+                struct xlog_in_core **iclog, xfs_lsn_t *lsn);
+void    xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
+void    xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);

 /*
  * When we crack an atomic LSN, we sample it first so that the value will not
@@ -516,33 +547,30 @@
 void    xlog_cil_init_post_recovery(struct xlog *log);
 void    xlog_cil_destroy(struct xlog *log);
 bool    xlog_cil_empty(struct xlog *log);
+void    xlog_cil_commit(struct xlog *log, struct xfs_trans *tp,
+                xfs_csn_t *commit_seq, bool regrant);

 /*
  * CIL force routines
  */
-xfs_lsn_t
-xlog_cil_force_lsn(
-        struct xlog *log,
-        xfs_lsn_t sequence);
+xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);

 static inline void
 xlog_cil_force(struct xlog *log)
 {
-        xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+        xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence);
 }
-
-/*
- * Unmount record type is used as a pseudo transaction type for the ticket.
- * It's value must be outside the range of XFS_TRANS_* values.
- */
-#define XLOG_UNMOUNT_REC_TYPE (-1U)

 /*
  * Wrapper function for waiting on a wait queue serialised against wakeups
  * by a spinlock. This matches the semantics of all the wait queues used in the
  * log code.
  */
-static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+static inline void
+xlog_wait(
+        struct wait_queue_head  *wq,
+        struct spinlock         *lock)
+                __releases(lock)
 {
         DECLARE_WAITQUEUE(wait, current);
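The reworked xlog_wait() prototype above gains a __releases(lock) annotation, documenting that the helper drops the caller's spinlock before scheduling. A sketch of the caller-side pattern, assuming an iclog state check purely for illustration:

spin_lock(&log->l_icloglock);
if (iclog->ic_state == XLOG_STATE_SYNCING) {
        /* xlog_wait() releases l_icloglock before sleeping */
        xlog_wait(&iclog->ic_write_wait, &log->l_icloglock);
        /* we wake up without the lock held; re-take it if more work follows */
} else {
        spin_unlock(&log->l_icloglock);
}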