hc
2024-05-10 37f49e37ab4cb5d0bc4c60eb5c6d4dd57db767bb
kernel/fs/jbd2/commit.c
....@@ -184,23 +184,51 @@
184184 /*
185185 * write the filemap data using writepage() address_space_operations.
186186 * We don't do block allocation here even for delalloc. We don't
187
- * use writepages() because with dealyed allocation we may be doing
187
+ * use writepages() because with delayed allocation we may be doing
188188 * block allocation in writepages().
189189 */
190
-static int journal_submit_inode_data_buffers(struct address_space *mapping,
191
- loff_t dirty_start, loff_t dirty_end)
190
+int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
192191 {
193
- int ret;
192
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
194193 struct writeback_control wbc = {
195194 .sync_mode = WB_SYNC_ALL,
196195 .nr_to_write = mapping->nrpages * 2,
197
- .range_start = dirty_start,
198
- .range_end = dirty_end,
196
+ .range_start = jinode->i_dirty_start,
197
+ .range_end = jinode->i_dirty_end,
199198 };
200199
201
- ret = generic_writepages(mapping, &wbc);
202
- return ret;
200
+ /*
201
+ * submit the inode data buffers. We use writepage
202
+ * instead of writepages, because writepages can do
203
+ * block allocation with delalloc. We need to write
204
+ * only allocated blocks here.
205
+ */
206
+ return generic_writepages(mapping, &wbc);
203207 }
208
+
209
+/*
+ * Send all the data buffers related to an inode.
+ *
+ * @jinode: journal inode whose dirty data range should be submitted;
+ *          may be NULL.
+ *
+ * Returns 0 without doing anything when @jinode is NULL or does not have
+ * JI_WRITE_DATA set in i_flags. Otherwise emits the
+ * jbd2_submit_inode_data trace event for the VFS inode and returns the
+ * result of jbd2_journal_submit_inode_data_buffers(jinode).
+ *
+ * NOTE(review): jinode->i_vfs_inode is passed on without a NULL check
+ * here, while jbd2_wait_inode_data() does check it - confirm whether
+ * callers guarantee it is set.
+ */
210
+int jbd2_submit_inode_data(struct jbd2_inode *jinode)
211
+{
212
+
213
+ if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
214
+ return 0;
215
+
216
+ trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
217
+ return jbd2_journal_submit_inode_data_buffers(jinode);
218
+
219
+}
220
+EXPORT_SYMBOL(jbd2_submit_inode_data);
221
+/*
+ * Wait for writeback of the dirty data range of an inode.
+ *
+ * @journal: journal the inode belongs to (not referenced in the body).
+ * @jinode:  journal inode to wait on; may be NULL.
+ *
+ * Returns 0 without waiting when @jinode is NULL, when JI_WAIT_DATA is
+ * not set in its i_flags, or when it has no VFS inode or no mapping.
+ * Otherwise returns the result of filemap_fdatawait_range_keep_errors()
+ * over the range [i_dirty_start, i_dirty_end] of the inode's mapping.
+ */
222
+int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
223
+{
224
+ if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
225
+ !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
226
+ return 0;
227
+ return filemap_fdatawait_range_keep_errors(
228
+ jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
229
+ jinode->i_dirty_end);
230
+}
231
+EXPORT_SYMBOL(jbd2_wait_inode_data);
204232
205233 /*
206234 * Submit all the data buffers of inode associated with the transaction to
....@@ -215,29 +243,20 @@
215243 {
216244 struct jbd2_inode *jinode;
217245 int err, ret = 0;
218
- struct address_space *mapping;
219246
220247 spin_lock(&journal->j_list_lock);
221248 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
222
- loff_t dirty_start = jinode->i_dirty_start;
223
- loff_t dirty_end = jinode->i_dirty_end;
224
-
225249 if (!(jinode->i_flags & JI_WRITE_DATA))
226250 continue;
227
- mapping = jinode->i_vfs_inode->i_mapping;
228251 jinode->i_flags |= JI_COMMIT_RUNNING;
229252 spin_unlock(&journal->j_list_lock);
230
- /*
231
- * submit the inode data buffers. We use writepage
232
- * instead of writepages. Because writepages can do
233
- * block allocation with delalloc. We need to write
234
- * only allocated blocks here.
235
- */
253
+ /* submit the inode data buffers. */
236254 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
237
- err = journal_submit_inode_data_buffers(mapping, dirty_start,
238
- dirty_end);
239
- if (!ret)
240
- ret = err;
255
+ if (journal->j_submit_inode_data_buffers) {
256
+ err = journal->j_submit_inode_data_buffers(jinode);
257
+ if (!ret)
258
+ ret = err;
259
+ }
241260 spin_lock(&journal->j_list_lock);
242261 J_ASSERT(jinode->i_transaction == commit_transaction);
243262 jinode->i_flags &= ~JI_COMMIT_RUNNING;
....@@ -246,6 +265,15 @@
246265 }
247266 spin_unlock(&journal->j_list_lock);
248267 return ret;
268
+}
269
+/*
+ * Wait for the data writeout of one journal inode to finish.
+ *
+ * @jinode: journal inode to wait on. It is dereferenced unconditionally,
+ *          so it and its i_vfs_inode must be valid.
+ *
+ * Returns the result of filemap_fdatawait_range_keep_errors() over the
+ * range [i_dirty_start, i_dirty_end] of the inode's mapping.
+ *
+ * NOTE(review): presumably meant to be installed as the journal's
+ * j_finish_inode_data_buffers callback - confirm against the
+ * filesystem's journal setup code.
+ */
270
+int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
271
+{
272
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
273
+
274
+ return filemap_fdatawait_range_keep_errors(mapping,
275
+ jinode->i_dirty_start,
276
+ jinode->i_dirty_end);
249277 }
250278
251279 /*
....@@ -262,18 +290,16 @@
262290 /* For locking, see the comment in journal_submit_data_buffers() */
263291 spin_lock(&journal->j_list_lock);
264292 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
265
- loff_t dirty_start = jinode->i_dirty_start;
266
- loff_t dirty_end = jinode->i_dirty_end;
267
-
268293 if (!(jinode->i_flags & JI_WAIT_DATA))
269294 continue;
270295 jinode->i_flags |= JI_COMMIT_RUNNING;
271296 spin_unlock(&journal->j_list_lock);
272
- err = filemap_fdatawait_range_keep_errors(
273
- jinode->i_vfs_inode->i_mapping, dirty_start,
274
- dirty_end);
275
- if (!ret)
276
- ret = err;
297
+ /* wait for the inode data buffers writeout. */
298
+ if (journal->j_finish_inode_data_buffers) {
299
+ err = journal->j_finish_inode_data_buffers(jinode);
300
+ if (!ret)
301
+ ret = err;
302
+ }
277303 spin_lock(&journal->j_list_lock);
278304 jinode->i_flags &= ~JI_COMMIT_RUNNING;
279305 smp_mb();
....@@ -413,6 +439,29 @@
413439 J_ASSERT(journal->j_running_transaction != NULL);
414440 J_ASSERT(journal->j_committing_transaction == NULL);
415441
442
+ write_lock(&journal->j_state_lock);
443
+ journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
444
+ while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
445
+ DEFINE_WAIT(wait);
446
+
447
+ prepare_to_wait(&journal->j_fc_wait, &wait,
448
+ TASK_UNINTERRUPTIBLE);
449
+ write_unlock(&journal->j_state_lock);
450
+ schedule();
451
+ write_lock(&journal->j_state_lock);
452
+ finish_wait(&journal->j_fc_wait, &wait);
453
+ /*
454
+ * TODO: by blocking fast commits here, we are increasing
455
+ * fsync() latency slightly. Strictly speaking, we don't need
456
+ * to block fast commits until the transaction enters T_FLUSH
457
+ * state. So an optimization is possible where we block new fast
458
+ * commits here and wait for existing ones to complete
459
+ * just before we enter T_FLUSH. That way, the existing fast
460
+ * commits and this full commit can proceed in parallel.
461
+ */
462
+ }
463
+ write_unlock(&journal->j_state_lock);
464
+
416465 commit_transaction = journal->j_running_transaction;
417466
418467 trace_jbd2_start_commit(journal, commit_transaction);
....@@ -420,6 +469,7 @@
420469 commit_transaction->t_tid);
421470
422471 write_lock(&journal->j_state_lock);
472
+ journal->j_fc_off = 0;
423473 J_ASSERT(commit_transaction->t_state == T_RUNNING);
424474 commit_transaction->t_state = T_LOCKED;
425475
....@@ -450,6 +500,7 @@
450500 finish_wait(&journal->j_wait_updates, &wait);
451501 }
452502 spin_unlock(&commit_transaction->t_handle_lock);
503
+ commit_transaction->t_state = T_SWITCH;
453504
454505 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
455506 journal->j_max_transaction_buffers);
....@@ -469,6 +520,8 @@
469520 * has reserved. This is consistent with the existing behaviour
470521 * that multiple jbd2_journal_get_write_access() calls to the same
471522 * buffer are perfectly permissible.
523
+ * We use journal->j_state_lock here to serialize processing of
524
+ * t_reserved_list with eviction of buffers from journal_unmap_buffer().
472525 */
473526 while (commit_transaction->t_reserved_list) {
474527 jh = commit_transaction->t_reserved_list;
....@@ -480,14 +533,15 @@
480533 if (jh->b_committed_data) {
481534 struct buffer_head *bh = jh2bh(jh);
482535
483
- jbd_lock_bh_state(bh);
536
+ spin_lock(&jh->b_state_lock);
484537 jbd2_free(jh->b_committed_data, bh->b_size);
485538 jh->b_committed_data = NULL;
486
- jbd_unlock_bh_state(bh);
539
+ spin_unlock(&jh->b_state_lock);
487540 }
488541 jbd2_journal_refile_buffer(journal, jh);
489542 }
490543
544
+ write_unlock(&journal->j_state_lock);
491545 /*
492546 * Now try to drop any written-back buffers from the journal's
493547 * checkpoint lists. We do this *before* commit because it potentially
....@@ -510,6 +564,7 @@
510564 */
511565 jbd2_journal_switch_revoke_table(journal);
512566
567
+ write_lock(&journal->j_state_lock);
513568 /*
514569 * Reserved credits cannot be claimed anymore, free them
515570 */
....@@ -526,7 +581,7 @@
526581 journal->j_running_transaction = NULL;
527582 start_time = ktime_get();
528583 commit_transaction->t_log_start = journal->j_head;
529
- wake_up(&journal->j_wait_transaction_locked);
584
+ wake_up_all(&journal->j_wait_transaction_locked);
530585 write_unlock(&journal->j_state_lock);
531586
532587 jbd_debug(3, "JBD2: commit phase 2a\n");
....@@ -557,8 +612,7 @@
557612 stats.run.rs_logging = jiffies;
558613 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
559614 stats.run.rs_logging);
560
- stats.run.rs_blocks =
561
- atomic_read(&commit_transaction->t_outstanding_credits);
615
+ stats.run.rs_blocks = commit_transaction->t_nr_buffers;
562616 stats.run.rs_blocks_logged = 0;
563617
564618 J_ASSERT(commit_transaction->t_nr_buffers <=
....@@ -639,8 +693,7 @@
639693
640694 /*
641695 * start_this_handle() uses t_outstanding_credits to determine
642
- * the free space in the log, but this counter is changed
643
- * by jbd2_journal_next_log_block() also.
696
+ * the free space in the log.
644697 */
645698 atomic_dec(&commit_transaction->t_outstanding_credits);
646699
....@@ -759,7 +812,7 @@
759812 if (first_block < journal->j_tail)
760813 freed += journal->j_last - journal->j_first;
761814 /* Update tail only if we free significant amount of space */
762
- if (freed < journal->j_maxlen / 4)
815
+ if (freed < jbd2_journal_get_max_txn_bufs(journal))
763816 update_tail = 0;
764817 }
765818 J_ASSERT(commit_transaction->t_state == T_COMMIT);
....@@ -774,7 +827,7 @@
774827 if (commit_transaction->t_need_data_flush &&
775828 (journal->j_fs_dev != journal->j_dev) &&
776829 (journal->j_flags & JBD2_BARRIER))
777
- blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
830
+ blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
778831
779832 /* Done it all: now write the commit record asynchronously. */
780833 if (jbd2_has_feature_async_commit(journal)) {
....@@ -881,11 +934,14 @@
881934 stats.run.rs_blocks_logged++;
882935 if (jbd2_has_feature_async_commit(journal) &&
883936 journal->j_flags & JBD2_BARRIER) {
884
- blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
937
+ blkdev_issue_flush(journal->j_dev, GFP_NOFS);
885938 }
886939
887940 if (err)
888941 jbd2_journal_abort(journal, err);
942
+
943
+ WARN_ON_ONCE(
944
+ atomic_read(&commit_transaction->t_outstanding_credits) < 0);
889945
890946 /*
891947 * Now disk caches for filesystem device are flushed so we are safe to
....@@ -917,6 +973,7 @@
917973 transaction_t *cp_transaction;
918974 struct buffer_head *bh;
919975 int try_to_free = 0;
976
+ bool drop_ref;
920977
921978 jh = commit_transaction->t_forget;
922979 spin_unlock(&journal->j_list_lock);
....@@ -926,7 +983,7 @@
926983 * done with it.
927984 */
928985 get_bh(bh);
929
- jbd_lock_bh_state(bh);
986
+ spin_lock(&jh->b_state_lock);
930987 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
931988
932989 /*
....@@ -1026,8 +1083,10 @@
10261083 try_to_free = 1;
10271084 }
10281085 JBUFFER_TRACE(jh, "refile or unfile buffer");
1029
- __jbd2_journal_refile_buffer(jh);
1030
- jbd_unlock_bh_state(bh);
1086
+ drop_ref = __jbd2_journal_refile_buffer(jh);
1087
+ spin_unlock(&jh->b_state_lock);
1088
+ if (drop_ref)
1089
+ jbd2_journal_put_journal_head(jh);
10311090 if (try_to_free)
10321091 release_buffer_page(bh); /* Drops bh reference */
10331092 else
....@@ -1112,12 +1171,16 @@
11121171
11131172 if (journal->j_commit_callback)
11141173 journal->j_commit_callback(journal, commit_transaction);
1174
+ if (journal->j_fc_cleanup_callback)
1175
+ journal->j_fc_cleanup_callback(journal, 1);
11151176
11161177 trace_jbd2_end_commit(journal, commit_transaction);
11171178 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
11181179 journal->j_commit_sequence, journal->j_tail_sequence);
11191180
11201181 write_lock(&journal->j_state_lock);
1182
+ journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
1183
+ journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
11211184 spin_lock(&journal->j_list_lock);
11221185 commit_transaction->t_state = T_FINISHED;
11231186 /* Check if the transaction can be dropped now that we are finished */
....@@ -1129,6 +1192,7 @@
11291192 spin_unlock(&journal->j_list_lock);
11301193 write_unlock(&journal->j_state_lock);
11311194 wake_up(&journal->j_wait_done_commit);
1195
+ wake_up(&journal->j_fc_wait);
11321196
11331197 /*
11341198 * Calculate overall stats