| .. | .. |
|---|
| 184 | 184 | /* |
|---|
| 185 | 185 | * write the filemap data using writepage() address_space_operations. |
|---|
| 186 | 186 | * We don't do block allocation here even for delalloc. We don't |
|---|
| 187 | | - * use writepages() because with dealyed allocation we may be doing |
|---|
| 187 | + * use writepages() because with delayed allocation we may be doing |
|---|
| 188 | 188 | * block allocation in writepages(). |
|---|
| 189 | 189 | */ |
|---|
| 190 | | -static int journal_submit_inode_data_buffers(struct address_space *mapping, |
|---|
| 191 | | - loff_t dirty_start, loff_t dirty_end) |
|---|
| 190 | +int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) |
|---|
| 192 | 191 | { |
|---|
| 193 | | - int ret; |
|---|
| 192 | + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; |
|---|
| 194 | 193 | struct writeback_control wbc = { |
|---|
| 195 | 194 | .sync_mode = WB_SYNC_ALL, |
|---|
| 196 | 195 | .nr_to_write = mapping->nrpages * 2, |
|---|
| 197 | | - .range_start = dirty_start, |
|---|
| 198 | | - .range_end = dirty_end, |
|---|
| 196 | + .range_start = jinode->i_dirty_start, |
|---|
| 197 | + .range_end = jinode->i_dirty_end, |
|---|
| 199 | 198 | }; |
|---|
| 200 | 199 | |
|---|
| 201 | | - ret = generic_writepages(mapping, &wbc); |
|---|
| 202 | | - return ret; |
|---|
| 200 | + /* |
|---|
| 201 | + * submit the inode data buffers. We use writepage |
|---|
| 202 | + * instead of writepages, because writepages can do |
|---|
| 203 | + * block allocation with delalloc. We need to write |
|---|
| 204 | + * only allocated blocks here. |
|---|
| 205 | + */ |
|---|
| 206 | + return generic_writepages(mapping, &wbc); |
|---|
| 203 | 207 | } |
|---|
| 208 | + |
|---|
| 209 | +/* Send all the data buffers related to an inode */ |
|---|
| 210 | +int jbd2_submit_inode_data(struct jbd2_inode *jinode) |
|---|
| 211 | +{ |
|---|
| 212 | + |
|---|
| 213 | + if (!jinode || !(jinode->i_flags & JI_WRITE_DATA)) |
|---|
| 214 | + return 0; |
|---|
| 215 | + |
|---|
| 216 | + trace_jbd2_submit_inode_data(jinode->i_vfs_inode); |
|---|
| 217 | + return jbd2_journal_submit_inode_data_buffers(jinode); |
|---|
| 218 | + |
|---|
| 219 | +} |
|---|
| 220 | +EXPORT_SYMBOL(jbd2_submit_inode_data); |
|---|
| 221 | + |
|---|
| 222 | +int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode) |
|---|
| 223 | +{ |
|---|
| 224 | + if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) || |
|---|
| 225 | + !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping) |
|---|
| 226 | + return 0; |
|---|
| 227 | + return filemap_fdatawait_range_keep_errors( |
|---|
| 228 | + jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start, |
|---|
| 229 | + jinode->i_dirty_end); |
|---|
| 230 | +} |
|---|
| 231 | +EXPORT_SYMBOL(jbd2_wait_inode_data); |
|---|
| 204 | 232 | |
|---|
| 205 | 233 | /* |
|---|
| 206 | 234 | * Submit all the data buffers of inode associated with the transaction to |
|---|
| .. | .. |
|---|
| 215 | 243 | { |
|---|
| 216 | 244 | struct jbd2_inode *jinode; |
|---|
| 217 | 245 | int err, ret = 0; |
|---|
| 218 | | - struct address_space *mapping; |
|---|
| 219 | 246 | |
|---|
| 220 | 247 | spin_lock(&journal->j_list_lock); |
|---|
| 221 | 248 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { |
|---|
| 222 | | - loff_t dirty_start = jinode->i_dirty_start; |
|---|
| 223 | | - loff_t dirty_end = jinode->i_dirty_end; |
|---|
| 224 | | - |
|---|
| 225 | 249 | if (!(jinode->i_flags & JI_WRITE_DATA)) |
|---|
| 226 | 250 | continue; |
|---|
| 227 | | - mapping = jinode->i_vfs_inode->i_mapping; |
|---|
| 228 | 251 | jinode->i_flags |= JI_COMMIT_RUNNING; |
|---|
| 229 | 252 | spin_unlock(&journal->j_list_lock); |
|---|
| 230 | | - /* |
|---|
| 231 | | - * submit the inode data buffers. We use writepage |
|---|
| 232 | | - * instead of writepages. Because writepages can do |
|---|
| 233 | | - * block allocation with delalloc. We need to write |
|---|
| 234 | | - * only allocated blocks here. |
|---|
| 235 | | - */ |
|---|
| 253 | + /* submit the inode data buffers. */ |
|---|
| 236 | 254 | trace_jbd2_submit_inode_data(jinode->i_vfs_inode); |
|---|
| 237 | | - err = journal_submit_inode_data_buffers(mapping, dirty_start, |
|---|
| 238 | | - dirty_end); |
|---|
| 239 | | - if (!ret) |
|---|
| 240 | | - ret = err; |
|---|
| 255 | + if (journal->j_submit_inode_data_buffers) { |
|---|
| 256 | + err = journal->j_submit_inode_data_buffers(jinode); |
|---|
| 257 | + if (!ret) |
|---|
| 258 | + ret = err; |
|---|
| 259 | + } |
|---|
| 241 | 260 | spin_lock(&journal->j_list_lock); |
|---|
| 242 | 261 | J_ASSERT(jinode->i_transaction == commit_transaction); |
|---|
| 243 | 262 | jinode->i_flags &= ~JI_COMMIT_RUNNING; |
|---|
| .. | .. |
|---|
| 246 | 265 | } |
|---|
| 247 | 266 | spin_unlock(&journal->j_list_lock); |
|---|
| 248 | 267 | return ret; |
|---|
| 268 | +} |
|---|
| 269 | + |
|---|
| 270 | +int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) |
|---|
| 271 | +{ |
|---|
| 272 | + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; |
|---|
| 273 | + |
|---|
| 274 | + return filemap_fdatawait_range_keep_errors(mapping, |
|---|
| 275 | + jinode->i_dirty_start, |
|---|
| 276 | + jinode->i_dirty_end); |
|---|
| 249 | 277 | } |
|---|
| 250 | 278 | |
|---|
| 251 | 279 | /* |
|---|
| .. | .. |
|---|
| 262 | 290 | /* For locking, see the comment in journal_submit_data_buffers() */ |
|---|
| 263 | 291 | spin_lock(&journal->j_list_lock); |
|---|
| 264 | 292 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { |
|---|
| 265 | | - loff_t dirty_start = jinode->i_dirty_start; |
|---|
| 266 | | - loff_t dirty_end = jinode->i_dirty_end; |
|---|
| 267 | | - |
|---|
| 268 | 293 | if (!(jinode->i_flags & JI_WAIT_DATA)) |
|---|
| 269 | 294 | continue; |
|---|
| 270 | 295 | jinode->i_flags |= JI_COMMIT_RUNNING; |
|---|
| 271 | 296 | spin_unlock(&journal->j_list_lock); |
|---|
| 272 | | - err = filemap_fdatawait_range_keep_errors( |
|---|
| 273 | | - jinode->i_vfs_inode->i_mapping, dirty_start, |
|---|
| 274 | | - dirty_end); |
|---|
| 275 | | - if (!ret) |
|---|
| 276 | | - ret = err; |
|---|
| 297 | + /* wait for the inode data buffers writeout. */ |
|---|
| 298 | + if (journal->j_finish_inode_data_buffers) { |
|---|
| 299 | + err = journal->j_finish_inode_data_buffers(jinode); |
|---|
| 300 | + if (!ret) |
|---|
| 301 | + ret = err; |
|---|
| 302 | + } |
|---|
| 277 | 303 | spin_lock(&journal->j_list_lock); |
|---|
| 278 | 304 | jinode->i_flags &= ~JI_COMMIT_RUNNING; |
|---|
| 279 | 305 | smp_mb(); |
|---|
| .. | .. |
|---|
| 413 | 439 | J_ASSERT(journal->j_running_transaction != NULL); |
|---|
| 414 | 440 | J_ASSERT(journal->j_committing_transaction == NULL); |
|---|
| 415 | 441 | |
|---|
| 442 | + write_lock(&journal->j_state_lock); |
|---|
| 443 | + journal->j_flags |= JBD2_FULL_COMMIT_ONGOING; |
|---|
| 444 | + while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) { |
|---|
| 445 | + DEFINE_WAIT(wait); |
|---|
| 446 | + |
|---|
| 447 | + prepare_to_wait(&journal->j_fc_wait, &wait, |
|---|
| 448 | + TASK_UNINTERRUPTIBLE); |
|---|
| 449 | + write_unlock(&journal->j_state_lock); |
|---|
| 450 | + schedule(); |
|---|
| 451 | + write_lock(&journal->j_state_lock); |
|---|
| 452 | + finish_wait(&journal->j_fc_wait, &wait); |
|---|
| 453 | + /* |
|---|
| 454 | + * TODO: by blocking fast commits here, we are increasing |
|---|
| 455 | + * fsync() latency slightly. Strictly speaking, we don't need |
|---|
| 456 | + * to block fast commits until the transaction enters T_FLUSH |
|---|
| 457 | + * state. So an optimization is possible where we block new fast |
|---|
| 458 | + * commits here and wait for existing ones to complete |
|---|
| 459 | + * just before we enter T_FLUSH. That way, the existing fast |
|---|
| 460 | + * commits and this full commit can proceed in parallel. |
|---|
| 461 | + */ |
|---|
| 462 | + } |
|---|
| 463 | + write_unlock(&journal->j_state_lock); |
|---|
| 464 | + |
|---|
| 416 | 465 | commit_transaction = journal->j_running_transaction; |
|---|
| 417 | 466 | |
|---|
| 418 | 467 | trace_jbd2_start_commit(journal, commit_transaction); |
|---|
| .. | .. |
|---|
| 420 | 469 | commit_transaction->t_tid); |
|---|
| 421 | 470 | |
|---|
| 422 | 471 | write_lock(&journal->j_state_lock); |
|---|
| 472 | + journal->j_fc_off = 0; |
|---|
| 423 | 473 | J_ASSERT(commit_transaction->t_state == T_RUNNING); |
|---|
| 424 | 474 | commit_transaction->t_state = T_LOCKED; |
|---|
| 425 | 475 | |
|---|
| .. | .. |
|---|
| 450 | 500 | finish_wait(&journal->j_wait_updates, &wait); |
|---|
| 451 | 501 | } |
|---|
| 452 | 502 | spin_unlock(&commit_transaction->t_handle_lock); |
|---|
| 503 | + commit_transaction->t_state = T_SWITCH; |
|---|
| 453 | 504 | |
|---|
| 454 | 505 | J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= |
|---|
| 455 | 506 | journal->j_max_transaction_buffers); |
|---|
| .. | .. |
|---|
| 469 | 520 | * has reserved. This is consistent with the existing behaviour |
|---|
| 470 | 521 | * that multiple jbd2_journal_get_write_access() calls to the same |
|---|
| 471 | 522 | * buffer are perfectly permissible. |
|---|
| 523 | + * We use journal->j_state_lock here to serialize processing of |
|---|
| 524 | + * t_reserved_list with eviction of buffers from journal_unmap_buffer(). |
|---|
| 472 | 525 | */ |
|---|
| 473 | 526 | while (commit_transaction->t_reserved_list) { |
|---|
| 474 | 527 | jh = commit_transaction->t_reserved_list; |
|---|
| .. | .. |
|---|
| 480 | 533 | if (jh->b_committed_data) { |
|---|
| 481 | 534 | struct buffer_head *bh = jh2bh(jh); |
|---|
| 482 | 535 | |
|---|
| 483 | | - jbd_lock_bh_state(bh); |
|---|
| 536 | + spin_lock(&jh->b_state_lock); |
|---|
| 484 | 537 | jbd2_free(jh->b_committed_data, bh->b_size); |
|---|
| 485 | 538 | jh->b_committed_data = NULL; |
|---|
| 486 | | - jbd_unlock_bh_state(bh); |
|---|
| 539 | + spin_unlock(&jh->b_state_lock); |
|---|
| 487 | 540 | } |
|---|
| 488 | 541 | jbd2_journal_refile_buffer(journal, jh); |
|---|
| 489 | 542 | } |
|---|
| 490 | 543 | |
|---|
| 544 | + write_unlock(&journal->j_state_lock); |
|---|
| 491 | 545 | /* |
|---|
| 492 | 546 | * Now try to drop any written-back buffers from the journal's |
|---|
| 493 | 547 | * checkpoint lists. We do this *before* commit because it potentially |
|---|
| .. | .. |
|---|
| 510 | 564 | */ |
|---|
| 511 | 565 | jbd2_journal_switch_revoke_table(journal); |
|---|
| 512 | 566 | |
|---|
| 567 | + write_lock(&journal->j_state_lock); |
|---|
| 513 | 568 | /* |
|---|
| 514 | 569 | * Reserved credits cannot be claimed anymore, free them |
|---|
| 515 | 570 | */ |
|---|
| .. | .. |
|---|
| 526 | 581 | journal->j_running_transaction = NULL; |
|---|
| 527 | 582 | start_time = ktime_get(); |
|---|
| 528 | 583 | commit_transaction->t_log_start = journal->j_head; |
|---|
| 529 | | - wake_up(&journal->j_wait_transaction_locked); |
|---|
| 584 | + wake_up_all(&journal->j_wait_transaction_locked); |
|---|
| 530 | 585 | write_unlock(&journal->j_state_lock); |
|---|
| 531 | 586 | |
|---|
| 532 | 587 | jbd_debug(3, "JBD2: commit phase 2a\n"); |
|---|
| .. | .. |
|---|
| 557 | 612 | stats.run.rs_logging = jiffies; |
|---|
| 558 | 613 | stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, |
|---|
| 559 | 614 | stats.run.rs_logging); |
|---|
| 560 | | - stats.run.rs_blocks = |
|---|
| 561 | | - atomic_read(&commit_transaction->t_outstanding_credits); |
|---|
| 615 | + stats.run.rs_blocks = commit_transaction->t_nr_buffers; |
|---|
| 562 | 616 | stats.run.rs_blocks_logged = 0; |
|---|
| 563 | 617 | |
|---|
| 564 | 618 | J_ASSERT(commit_transaction->t_nr_buffers <= |
|---|
| .. | .. |
|---|
| 639 | 693 | |
|---|
| 640 | 694 | /* |
|---|
| 641 | 695 | * start_this_handle() uses t_outstanding_credits to determine |
|---|
| 642 | | - * the free space in the log, but this counter is changed |
|---|
| 643 | | - * by jbd2_journal_next_log_block() also. |
|---|
| 696 | + * the free space in the log. |
|---|
| 644 | 697 | */ |
|---|
| 645 | 698 | atomic_dec(&commit_transaction->t_outstanding_credits); |
|---|
| 646 | 699 | |
|---|
| .. | .. |
|---|
| 759 | 812 | if (first_block < journal->j_tail) |
|---|
| 760 | 813 | freed += journal->j_last - journal->j_first; |
|---|
| 761 | 814 | /* Update tail only if we free significant amount of space */ |
|---|
| 762 | | - if (freed < journal->j_maxlen / 4) |
|---|
| 815 | + if (freed < jbd2_journal_get_max_txn_bufs(journal)) |
|---|
| 763 | 816 | update_tail = 0; |
|---|
| 764 | 817 | } |
|---|
| 765 | 818 | J_ASSERT(commit_transaction->t_state == T_COMMIT); |
|---|
| .. | .. |
|---|
| 774 | 827 | if (commit_transaction->t_need_data_flush && |
|---|
| 775 | 828 | (journal->j_fs_dev != journal->j_dev) && |
|---|
| 776 | 829 | (journal->j_flags & JBD2_BARRIER)) |
|---|
| 777 | | - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); |
|---|
| 830 | + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); |
|---|
| 778 | 831 | |
|---|
| 779 | 832 | /* Done it all: now write the commit record asynchronously. */ |
|---|
| 780 | 833 | if (jbd2_has_feature_async_commit(journal)) { |
|---|
| .. | .. |
|---|
| 881 | 934 | stats.run.rs_blocks_logged++; |
|---|
| 882 | 935 | if (jbd2_has_feature_async_commit(journal) && |
|---|
| 883 | 936 | journal->j_flags & JBD2_BARRIER) { |
|---|
| 884 | | - blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); |
|---|
| 937 | + blkdev_issue_flush(journal->j_dev, GFP_NOFS); |
|---|
| 885 | 938 | } |
|---|
| 886 | 939 | |
|---|
| 887 | 940 | if (err) |
|---|
| 888 | 941 | jbd2_journal_abort(journal, err); |
|---|
| 942 | + |
|---|
| 943 | + WARN_ON_ONCE( |
|---|
| 944 | + atomic_read(&commit_transaction->t_outstanding_credits) < 0); |
|---|
| 889 | 945 | |
|---|
| 890 | 946 | /* |
|---|
| 891 | 947 | * Now disk caches for filesystem device are flushed so we are safe to |
|---|
| .. | .. |
|---|
| 917 | 973 | transaction_t *cp_transaction; |
|---|
| 918 | 974 | struct buffer_head *bh; |
|---|
| 919 | 975 | int try_to_free = 0; |
|---|
| 976 | + bool drop_ref; |
|---|
| 920 | 977 | |
|---|
| 921 | 978 | jh = commit_transaction->t_forget; |
|---|
| 922 | 979 | spin_unlock(&journal->j_list_lock); |
|---|
| .. | .. |
|---|
| 926 | 983 | * done with it. |
|---|
| 927 | 984 | */ |
|---|
| 928 | 985 | get_bh(bh); |
|---|
| 929 | | - jbd_lock_bh_state(bh); |
|---|
| 986 | + spin_lock(&jh->b_state_lock); |
|---|
| 930 | 987 | J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); |
|---|
| 931 | 988 | |
|---|
| 932 | 989 | /* |
|---|
| .. | .. |
|---|
| 1026 | 1083 | try_to_free = 1; |
|---|
| 1027 | 1084 | } |
|---|
| 1028 | 1085 | JBUFFER_TRACE(jh, "refile or unfile buffer"); |
|---|
| 1029 | | - __jbd2_journal_refile_buffer(jh); |
|---|
| 1030 | | - jbd_unlock_bh_state(bh); |
|---|
| 1086 | + drop_ref = __jbd2_journal_refile_buffer(jh); |
|---|
| 1087 | + spin_unlock(&jh->b_state_lock); |
|---|
| 1088 | + if (drop_ref) |
|---|
| 1089 | + jbd2_journal_put_journal_head(jh); |
|---|
| 1031 | 1090 | if (try_to_free) |
|---|
| 1032 | 1091 | release_buffer_page(bh); /* Drops bh reference */ |
|---|
| 1033 | 1092 | else |
|---|
| .. | .. |
|---|
| 1112 | 1171 | |
|---|
| 1113 | 1172 | if (journal->j_commit_callback) |
|---|
| 1114 | 1173 | journal->j_commit_callback(journal, commit_transaction); |
|---|
| 1174 | + if (journal->j_fc_cleanup_callback) |
|---|
| 1175 | + journal->j_fc_cleanup_callback(journal, 1); |
|---|
| 1115 | 1176 | |
|---|
| 1116 | 1177 | trace_jbd2_end_commit(journal, commit_transaction); |
|---|
| 1117 | 1178 | jbd_debug(1, "JBD2: commit %d complete, head %d\n", |
|---|
| 1118 | 1179 | journal->j_commit_sequence, journal->j_tail_sequence); |
|---|
| 1119 | 1180 | |
|---|
| 1120 | 1181 | write_lock(&journal->j_state_lock); |
|---|
| 1182 | + journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING; |
|---|
| 1183 | + journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; |
|---|
| 1121 | 1184 | spin_lock(&journal->j_list_lock); |
|---|
| 1122 | 1185 | commit_transaction->t_state = T_FINISHED; |
|---|
| 1123 | 1186 | /* Check if the transaction can be dropped now that we are finished */ |
|---|
| .. | .. |
|---|
| 1129 | 1192 | spin_unlock(&journal->j_list_lock); |
|---|
| 1130 | 1193 | write_unlock(&journal->j_state_lock); |
|---|
| 1131 | 1194 | wake_up(&journal->j_wait_done_commit); |
|---|
| 1195 | + wake_up(&journal->j_fc_wait); |
|---|
| 1132 | 1196 | |
|---|
| 1133 | 1197 | /* |
|---|
| 1134 | 1198 | * Calculate overall stats |
|---|