2024-05-10 23fa18eaa71266feff7ba8d83022d9e1cc83c65a
kernel/fs/ext4/inode.c
@@ -49,8 +49,6 @@
 #include <trace/events/ext4.h>
 #include <trace/events/android_fs.h>

-#define MPAGE_DA_EXTENT_TAIL 0x01
-
 static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
                              struct ext4_inode_info *ei)
 {
@@ -104,8 +102,8 @@
        return provided == calculated;
 }

-static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
-                               struct ext4_inode_info *ei)
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+                        struct ext4_inode_info *ei)
 {
        __u32 csum;

@@ -165,32 +163,6 @@
 }

 /*
- * Restart the transaction associated with *handle. This does a commit,
- * so before we call here everything must be consistently dirtied against
- * this transaction.
- */
-int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
-                               int nblocks)
-{
-       int ret;
-
-       /*
-        * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
-        * moment, get_block can be called only for blocks inside i_size since
-        * page cache has been already dropped and writes are blocked by
-        * i_mutex. So we can safely drop the i_data_sem here.
-        */
-       BUG_ON(EXT4_JOURNAL(inode) == NULL);
-       jbd_debug(2, "restarting handle %p\n", handle);
-       up_write(&EXT4_I(inode)->i_data_sem);
-       ret = ext4_journal_restart(handle, nblocks);
-       down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
-
-       return ret;
-}
-
-/*
  * Called at the last iput() if i_nlink is zero.
  */
 void ext4_evict_inode(struct inode *inode)
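
Note: nothing in this file replaces ext4_truncate_restart_trans() directly; in mainline trees of this vintage the truncate paths instead extend the running handle in place. A rough sketch of that pattern, assuming the ext4_journal_ensure_credits() helper from ext4_jbd2.h (an assumption; it is not part of this diff):

    /*
     * Assumed helper: returns 0 if the handle already has enough credits,
     * 1 if the transaction had to be restarted, and < 0 on error.
     */
    ret = ext4_journal_ensure_credits(handle, credits, revoke_credits);
    if (ret < 0)
            return ret;
    if (ret > 0)    /* transaction restarted: rescan state tied to it */
            ext4_discard_preallocations(inode, 0);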
@@ -208,6 +180,8 @@

        trace_ext4_evict_inode(inode);

+       if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
+               ext4_evict_ea_inode(inode);
        if (inode->i_nlink) {
                /*
                 * When journalling data dirty buffers are tracked only in the
@@ -249,6 +223,16 @@
        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages_final(&inode->i_data);
+
+       /*
+        * For inodes with journalled data, transaction commit could have
+        * dirtied the inode. And for inodes with dioread_nolock, unwritten
+        * extents converting worker could merge extents and also have dirtied
+        * the inode. Flush worker is ignoring it because of I_FREEING flag but
+        * we still need to remove the inode from the writeback lists.
+        */
+       if (!list_empty_careful(&inode->i_io_list))
+               inode_io_list_del(inode);

        /*
         * Protect us against freezing - iput() caller didn't have to have any
@@ -305,9 +289,9 @@
        if (inode->i_blocks) {
                err = ext4_truncate(inode);
                if (err) {
-                       ext4_error(inode->i_sb,
-                                  "couldn't truncate inode %lu (err %d)",
-                                  inode->i_ino, err);
+                       ext4_error_err(inode->i_sb, -err,
+                                      "couldn't truncate inode %lu (err %d)",
+                                      inode->i_ino, err);
                        goto stop_handle;
                }
        }
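
ext4_error_err() used above is not defined in this file; it behaves like ext4_error() but additionally records a positive errno in the superblock error fields. A hedged sketch of its contract, assuming the mainline definition of this era:

    /* assumed interface, per mainline ext4.h (illustrative only) */
    void ext4_error_err(struct super_block *sb, int err, /* positive errno */
                        const char *fmt, ...);
    /* like ext4_error(), plus saving err into the on-disk superblock
     * (s_first_error_errcode / s_last_error_errcode); hence the -err at
     * the call site above, where err holds a negative return value. */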
@@ -355,6 +339,14 @@
        ext4_xattr_inode_array_free(ea_inode_array);
        return;
 no_delete:
+       /*
+        * Check whether somewhere else accidentally dirtied the evicting
+        * inode; that could cause inode use-after-free issues later.
+        */
+       WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));
+
+       if (!list_empty(&EXT4_I(inode)->i_fc_list))
+               ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
 }

@@ -410,8 +402,8 @@
         * inode's preallocations.
         */
        if ((ei->i_reserved_data_blocks == 0) &&
-           (atomic_read(&inode->i_writecount) == 0))
-               ext4_discard_preallocations(inode);
+           !inode_is_open_for_write(inode))
+               ext4_discard_preallocations(inode, 0);
 }

 static int __check_block_validity(struct inode *inode, const char *func,
@@ -437,7 +429,7 @@
 {
        int ret;

-       if (IS_ENCRYPTED(inode))
+       if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
                return fscrypt_zeroout_range(inode, lblk, pblk, len);

        ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
@@ -469,11 +461,9 @@
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-               retval = ext4_ext_map_blocks(handle, inode, map, flags &
-                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+               retval = ext4_ext_map_blocks(handle, inode, map, 0);
        } else {
-               retval = ext4_ind_map_blocks(handle, inode, map, flags &
-                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+               retval = ext4_ind_map_blocks(handle, inode, map, 0);
        }
        up_read((&EXT4_I(inode)->i_data_sem));

@@ -530,9 +520,8 @@
 #endif

        map->m_flags = 0;
-       ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
-                 "logical block %lu\n", inode->i_ino, flags, map->m_len,
-                 (unsigned long) map->m_lblk);
+       ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
+                 flags, map->m_len, (unsigned long) map->m_lblk);

        /*
         * ext4_map_blocks returns an int, and m_len is an unsigned int
@@ -545,7 +534,8 @@
                return -EFSCORRUPTED;

        /* Lookup extent status tree firstly */
-       if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+       if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
+           ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
                        map->m_pblk = ext4_es_pblock(&es) +
                                        map->m_lblk - es.es_lblk;
@@ -563,7 +553,7 @@
                        map->m_len = retval;
                        retval = 0;
                } else {
-                       BUG_ON(1);
+                       BUG();
                }
 #ifdef ES_AGGRESSIVE_TEST
                ext4_map_blocks_es_recheck(handle, inode, map,
@@ -578,11 +568,9 @@
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-               retval = ext4_ext_map_blocks(handle, inode, map, flags &
-                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+               retval = ext4_ext_map_blocks(handle, inode, map, 0);
        } else {
-               retval = ext4_ind_map_blocks(handle, inode, map, flags &
-                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+               retval = ext4_ind_map_blocks(handle, inode, map, 0);
        }
        if (retval > 0) {
                unsigned int status;
@@ -599,8 +587,8 @@
                        EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                    !(status & EXTENT_STATUS_WRITTEN) &&
-                   ext4_find_delalloc_range(inode, map->m_lblk,
-                                            map->m_lblk + map->m_len - 1))
+                   ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+                                      map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
                ret = ext4_es_insert_extent(inode, map->m_lblk,
                                            map->m_len, map->m_pblk, status);
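
ext4_find_delalloc_range() is gone; the delalloc test is now spelled with the generic extent-status scan plus a predicate. Assuming the mainline prototypes from extents_status.h (not shown in this diff):

    bool ext4_es_scan_range(struct inode *inode,
                            int (*matching_fn)(struct extent_status *es),
                            ext4_lblk_t lblk, ext4_lblk_t end);

    /* true iff any extent overlapping [lblk, end] satisfies matching_fn;
     * paired with ext4_es_is_delayed() it reproduces the old helper. */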
@@ -667,16 +655,6 @@
                         */
                        ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
                }
-
-               /*
-                * Update reserved blocks/metadata blocks after successful
-                * block allocation which had been deferred till now. We don't
-                * support fallocate for non extent files. So we can update
-                * reserve space here.
-                */
-               if ((retval > 0) &&
-                       (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
-                       ext4_da_update_reserve_space(inode, retval, 1);
        }

        if (retval > 0) {
@@ -700,8 +678,6 @@
        if (flags & EXT4_GET_BLOCKS_ZERO &&
            map->m_flags & EXT4_MAP_MAPPED &&
            map->m_flags & EXT4_MAP_NEW) {
-               clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
-                                  map->m_len);
                ret = ext4_issue_zeroout(inode, map->m_lblk,
                                         map->m_pblk, map->m_len);
                if (ret) {
@@ -715,7 +691,7 @@
                 * extent status tree.
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
-                   ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+                   ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                        if (ext4_es_is_written(&es))
                                goto out_sem;
                }
@@ -723,8 +699,8 @@
                        EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                    !(status & EXTENT_STATUS_WRITTEN) &&
-                   ext4_find_delalloc_range(inode, map->m_lblk,
-                                            map->m_lblk + map->m_len - 1))
+                   ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+                                      map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
                ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                            map->m_pblk, status);
@@ -765,6 +741,12 @@
                        return ret;
                }
        }
+       if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
+                               map->m_flags & EXT4_MAP_MAPPED))
+               ext4_fc_track_range(handle, inode, map->m_lblk,
+                                       map->m_lblk + map->m_len - 1);
+       if (retval < 0)
+               ext_debug(inode, "failed with err %d\n", retval);
        return retval;
 }

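
The fast-commit calls above come from the then-new fast commit machinery. The tracking hook's shape, matching how the hunk uses it (the declaration itself lives in ext4.h, outside this diff):

    /* assumed declaration: record [start, end] as modified so a fast
     * commit can replay just this range instead of a full journal commit */
    void ext4_fc_track_range(handle_t *handle, struct inode *inode,
                             ext4_lblk_t start, ext4_lblk_t end);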
@@ -847,136 +829,6 @@
 #define DIO_MAX_BLOCKS 4096

 /*
- * Get blocks function for the cases that need to start a transaction -
- * generally difference cases of direct IO and DAX IO. It also handles retries
- * in case of ENOSPC.
- */
-static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
-                               struct buffer_head *bh_result, int flags)
-{
-       int dio_credits;
-       handle_t *handle;
-       int retries = 0;
-       int ret;
-
-       /* Trim mapping request to maximum we can map at once for DIO */
-       if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
-               bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
-       dio_credits = ext4_chunk_trans_blocks(inode,
-                                             bh_result->b_size >> inode->i_blkbits);
-retry:
-       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
-
-       ret = _ext4_get_block(inode, iblock, bh_result, flags);
-       ext4_journal_stop(handle);
-
-       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-       return ret;
-}
-
-/* Get block function for DIO reads and writes to inodes without extents */
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-                      struct buffer_head *bh, int create)
-{
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       if (!create)
-               return _ext4_get_block(inode, iblock, bh, 0);
-       return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
-}
-
-/*
- * Get block function for AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete.
- */
-static int ext4_dio_get_block_unwritten_async(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-       /*
-        * When doing DIO using unwritten extents, we need io_end to convert
-        * unwritten extents to written on IO completion. We allocate io_end
-        * once we spot unwritten extent and store it in b_private. Generic
-        * DIO code keeps b_private set and furthermore passes the value to
-        * our completion callback in 'private' argument.
-        */
-       if (!ret && buffer_unwritten(bh_result)) {
-               if (!bh_result->b_private) {
-                       ext4_io_end_t *io_end;
-
-                       io_end = ext4_init_io_end(inode, GFP_KERNEL);
-                       if (!io_end)
-                               return -ENOMEM;
-                       bh_result->b_private = io_end;
-                       ext4_set_io_unwritten_flag(inode, io_end);
-               }
-               set_buffer_defer_completion(bh_result);
-       }
-
-       return ret;
-}
-
-/*
- * Get block function for non-AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete by ext4_direct_IO_write().
- */
-static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-       /*
-        * Mark inode as having pending DIO writes to unwritten extents.
-        * ext4_direct_IO_write() checks this flag and converts extents to
-        * written.
-        */
-       if (!ret && buffer_unwritten(bh_result))
-               ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-
-       return ret;
-}
-
-static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
-               struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
-                  inode->i_ino, create);
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = _ext4_get_block(inode, iblock, bh_result, 0);
-       /*
-        * Blocks should have been preallocated! ext4_file_write_iter() checks
-        * that.
-        */
-       WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-
-       return ret;
-}
-
-
-/*
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -987,7 +839,8 @@
        int create = map_flags & EXT4_GET_BLOCKS_CREATE;
        int err;

-       J_ASSERT(handle != NULL || create == 0);
+       J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+                   || handle != NULL || create == 0);

        map.m_lblk = block;
        map.m_len = 1;
@@ -1003,7 +856,8 @@
                return ERR_PTR(-ENOMEM);
        if (map.m_flags & EXT4_MAP_NEW) {
                J_ASSERT(create != 0);
-               J_ASSERT(handle != NULL);
+               J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+                        || (handle != NULL));

                /*
                 * Now that we do not always journal data, we should
@@ -1040,18 +894,20 @@
                               ext4_lblk_t block, int map_flags)
 {
        struct buffer_head *bh;
+       int ret;

        bh = ext4_getblk(handle, inode, block, map_flags);
        if (IS_ERR(bh))
                return bh;
-       if (!bh || buffer_uptodate(bh))
+       if (!bh || ext4_buffer_uptodate(bh))
                return bh;
-       ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
-       wait_on_buffer(bh);
-       if (buffer_uptodate(bh))
-               return bh;
-       put_bh(bh);
-       return ERR_PTR(-EIO);
+
+       ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
+       if (ret) {
+               put_bh(bh);
+               return ERR_PTR(ret);
+       }
+       return bh;
 }

 /* Read a contiguous batch of blocks. */
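
Two small helpers replace the open-coded ll_rw_block() sequence here: ext4_buffer_uptodate() treats a prior write I/O error as not-uptodate so the block gets re-read instead of served stale, and ext4_read_bh_lock() locks the buffer and issues the read, waiting only if asked. A sketch assuming the mainline definitions (they live in ext4.h and super.c, outside this diff):

    static inline bool ext4_buffer_uptodate(struct buffer_head *bh)
    {
            /* a failed write intentionally leaves the buffer !uptodate */
            if (buffer_write_io_error(bh))
                    clear_buffer_uptodate(bh);
            return buffer_uptodate(bh);
    }

    int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait)
    {
            lock_buffer(bh);
            if (!wait) {
                    ext4_read_bh_nowait(bh, op_flags, NULL);
                    return 0;
            }
            /* submits if needed, waits, returns 0 or -EIO */
            return ext4_read_bh(bh, op_flags, NULL);
    }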
@@ -1071,9 +927,8 @@

        for (i = 0; i < bh_count; i++)
                /* Note that NULL bhs[i] is valid because of holes. */
-               if (bhs[i] && !buffer_uptodate(bhs[i]))
-                       ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
-                                   &bhs[i]);
+               if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
+                       ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);

        if (!wait)
                return 0;
@@ -1190,8 +1045,9 @@
        int err = 0;
        unsigned blocksize = inode->i_sb->s_blocksize;
        unsigned bbits;
-       struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
-       bool decrypt = false;
+       struct buffer_head *bh, *head, *wait[2];
+       int nr_wait = 0;
+       int i;

        BUG_ON(!PageLocked(page));
        BUG_ON(from > PAGE_SIZE);
@@ -1222,7 +1078,6 @@
                        if (err)
                                break;
                        if (buffer_new(bh)) {
-                               clean_bdev_bh_alias(bh);
                                if (PageUptodate(page)) {
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
@@ -1243,23 +1098,33 @@
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                    (block_start < from || block_end > to)) {
-                       ll_rw_block(REQ_OP_READ, 0, 1, &bh);
-                       *wait_bh++ = bh;
-                       decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
+                       ext4_read_bh_lock(bh, 0, false);
+                       wait[nr_wait++] = bh;
                }
        }
        /*
         * If we issued read requests, let them complete.
         */
-       while (wait_bh > wait) {
-               wait_on_buffer(*--wait_bh);
-               if (!buffer_uptodate(*wait_bh))
+       for (i = 0; i < nr_wait; i++) {
+               wait_on_buffer(wait[i]);
+               if (!buffer_uptodate(wait[i]))
                        err = -EIO;
        }
-       if (unlikely(err))
+       if (unlikely(err)) {
                page_zero_new_buffers(page, from, to);
-       else if (decrypt)
-               err = fscrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0);
+       } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
+               for (i = 0; i < nr_wait; i++) {
+                       int err2;
+
+                       err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+                                                               bh_offset(wait[i]));
+                       if (err2) {
+                               clear_buffer_uptodate(wait[i]);
+                               err = err2;
+                       }
+               }
+       }
+
        return err;
 }
 #endif
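
The decryption change is what lets this path survive blocksize < PAGE_SIZE: only the blocks that were actually read get decrypted, at their offset within the page. Contrasting the two calls (fscrypt API of this era; bh is one of the waited-on buffer heads):

    /* old: decrypt the whole page; only valid if every block was read */
    err = fscrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0);

    /* new: decrypt one block at bh_offset(bh) bytes into the page */
    err = fscrypt_decrypt_pagecache_blocks(page, blocksize, bh_offset(bh));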
@@ -1319,6 +1184,13 @@
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
+       /*
+        * The same as page allocation, we prealloc buffer heads before
+        * starting the handle.
+        */
+       if (!page_has_buffers(page))
+               create_empty_buffers(page, inode->i_sb->s_blocksize, 0);
+
        unlock_page(page);

 retry_journal:
@@ -1433,7 +1305,8 @@

        trace_android_fs_datawrite_end(inode, pos, len);
        trace_ext4_write_end(inode, pos, len, copied);
-       if (inline_data) {
+       if (inline_data &&
+           ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                ret = ext4_write_inline_data_end(inode, pos, len,
                                                 copied, page);
                if (ret < 0) {
@@ -1442,6 +1315,7 @@
                        goto errout;
                }
                copied = ret;
+               ret = 0;
        } else
                copied = block_write_end(file, mapping, pos,
                                         len, copied, page, fsdata);
@@ -1466,15 +1340,16 @@
         * filesystems.
         */
        if (i_size_changed || inline_data)
-               ext4_mark_inode_dirty(handle, inode);
+               ret = ext4_mark_inode_dirty(handle, inode);

+errout:
        if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
                 */
                ext4_orphan_add(handle, inode);
-errout:
+
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
@@ -1558,6 +1433,7 @@
                        goto errout;
                }
                copied = ret;
+               ret = 0;
        } else if (unlikely(copied < len) && !PageUptodate(page)) {
                copied = 0;
                ext4_journalled_zero_new_buffers(handle, page, from, to);
@@ -1587,6 +1463,7 @@
                ret = ret2;
        }

+errout:
        if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
@@ -1594,7 +1471,6 @@
                 */
                ext4_orphan_add(handle, inode);

-errout:
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
@@ -1643,7 +1519,7 @@
        return 0;       /* success */
 }

-static void ext4_da_release_space(struct inode *inode, int to_free)
+void ext4_da_release_space(struct inode *inode, int to_free)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1678,64 +1554,6 @@
        dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
 }

-static void ext4_da_page_release_reservation(struct page *page,
-                                            unsigned int offset,
-                                            unsigned int length)
-{
-       int to_release = 0, contiguous_blks = 0;
-       struct buffer_head *head, *bh;
-       unsigned int curr_off = 0;
-       struct inode *inode = page->mapping->host;
-       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       unsigned int stop = offset + length;
-       int num_clusters;
-       ext4_fsblk_t lblk;
-
-       BUG_ON(stop > PAGE_SIZE || stop < length);
-
-       head = page_buffers(page);
-       bh = head;
-       do {
-               unsigned int next_off = curr_off + bh->b_size;
-
-               if (next_off > stop)
-                       break;
-
-               if ((offset <= curr_off) && (buffer_delay(bh))) {
-                       to_release++;
-                       contiguous_blks++;
-                       clear_buffer_delay(bh);
-               } else if (contiguous_blks) {
-                       lblk = page->index <<
-                              (PAGE_SHIFT - inode->i_blkbits);
-                       lblk += (curr_off >> inode->i_blkbits) -
-                               contiguous_blks;
-                       ext4_es_remove_extent(inode, lblk, contiguous_blks);
-                       contiguous_blks = 0;
-               }
-               curr_off = next_off;
-       } while ((bh = bh->b_this_page) != head);
-
-       if (contiguous_blks) {
-               lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
-               lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
-               ext4_es_remove_extent(inode, lblk, contiguous_blks);
-       }
-
-       /* If we have released all the blocks belonging to a cluster, then we
-        * need to release the reserved space for that cluster. */
-       num_clusters = EXT4_NUM_B2C(sbi, to_release);
-       while (num_clusters > 0) {
-               lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
-                       ((num_clusters - 1) << sbi->s_cluster_bits);
-               if (sbi->s_cluster_ratio == 1 ||
-                   !ext4_find_delalloc_cluster(inode, lblk))
-                       ext4_da_release_space(inode, 1);
-
-               num_clusters--;
-       }
-}
-
 /*
  * Delayed allocation stuff
  */
@@ -1755,6 +1573,7 @@
        struct ext4_map_blocks map;
        struct ext4_io_submit io_submit;        /* IO submission data */
        unsigned int do_map:1;
+       unsigned int scanned_until_end:1;
 };

@@ -1770,13 +1589,21 @@
        if (mpd->first_page >= mpd->next_page)
                return;

+       mpd->scanned_until_end = 0;
        index = mpd->first_page;
        end   = mpd->next_page - 1;
        if (invalidate) {
                ext4_lblk_t start, last;
                start = index << (PAGE_SHIFT - inode->i_blkbits);
                last = end << (PAGE_SHIFT - inode->i_blkbits);
+
+               /*
+                * avoid racing with extent status tree scans made by
+                * ext4_insert_delayed_block()
+                */
+               down_write(&EXT4_I(inode)->i_data_sem);
                ext4_es_remove_extent(inode, start, last - start + 1);
+               up_write(&EXT4_I(inode)->i_data_sem);
        }

        pagevec_init(&pvec);
@@ -1829,6 +1656,70 @@
 }

 /*
+ * ext4_insert_delayed_block - adds a delayed block to the extents status
+ *                             tree, incrementing the reserved cluster/block
+ *                             count or making a pending reservation
+ *                             where needed
+ *
+ * @inode - file containing the newly added block
+ * @lblk - logical block to be added
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       int ret;
+       bool allocated = false;
+       bool reserved = false;
+
+       /*
+        * If the cluster containing lblk is shared with a delayed,
+        * written, or unwritten extent in a bigalloc file system, it's
+        * already been accounted for and does not need to be reserved.
+        * A pending reservation must be made for the cluster if it's
+        * shared with a written or unwritten extent and doesn't already
+        * have one. Written and unwritten extents can be purged from the
+        * extents status tree if the system is under memory pressure, so
+        * it's necessary to examine the extent tree if a search of the
+        * extents status tree doesn't get a match.
+        */
+       if (sbi->s_cluster_ratio == 1) {
+               ret = ext4_da_reserve_space(inode);
+               if (ret != 0)   /* ENOSPC */
+                       goto errout;
+               reserved = true;
+       } else {   /* bigalloc */
+               if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
+                       if (!ext4_es_scan_clu(inode,
+                                             &ext4_es_is_mapped, lblk)) {
+                               ret = ext4_clu_mapped(inode,
+                                                     EXT4_B2C(sbi, lblk));
+                               if (ret < 0)
+                                       goto errout;
+                               if (ret == 0) {
+                                       ret = ext4_da_reserve_space(inode);
+                                       if (ret != 0)   /* ENOSPC */
+                                               goto errout;
+                                       reserved = true;
+                               } else {
+                                       allocated = true;
+                               }
+                       } else {
+                               allocated = true;
+                       }
+               }
+       }
+
+       ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+       if (ret && reserved)
+               ext4_da_release_space(inode, 1);
+
+errout:
+       return ret;
+}
+
+/*
  * This function grabs code from the very beginning of
  * ext4_map_blocks, but assumes that the caller is from delayed write
  * time. This function looks up the requested blocks and sets the
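
The helper funnels into ext4_es_insert_delayed_block(), whose third argument is what the reserved/allocated bookkeeping above computes. Assuming the mainline prototype from extents_status.h (not shown in this diff):

    int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
                                     bool allocated);

    /* allocated == false: the cluster got a fresh reservation above, so
     * the new delayed extent is counted against reserved clusters;
     * allocated == true: the cluster is already backed by a mapping, so
     * only the delayed-extent record is inserted, nothing re-reserved. */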
@@ -1851,12 +1742,11 @@
        invalid_block = ~0;

        map->m_flags = 0;
-       ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
-                 "logical block %lu\n", inode->i_ino, map->m_len,
+       ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
                  (unsigned long) map->m_lblk);

        /* Lookup extent status tree firstly */
-       if (ext4_es_lookup_extent(inode, iblock, &es)) {
+       if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
                if (ext4_es_is_hole(&es)) {
                        retval = 0;
                        down_read(&EXT4_I(inode)->i_data_sem);
@@ -1884,7 +1774,7 @@
                else if (ext4_es_is_unwritten(&es))
                        map->m_flags |= EXT4_MAP_UNWRITTEN;
                else
-                       BUG_ON(1);
+                       BUG();

 #ifdef ES_AGGRESSIVE_TEST
                ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
@@ -1907,28 +1797,14 @@
 add_delayed:
        if (retval == 0) {
                int ret;
+
                /*
                 * XXX: __block_prepare_write() unmaps passed block,
                 * is it OK?
                 */
-               /*
-                * If the block was allocated from previously allocated cluster,
-                * then we don't need to reserve it again. However we still need
-                * to reserve metadata for every block we're going to write.
-                */
-               if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
-                   !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
-                       ret = ext4_da_reserve_space(inode);
-                       if (ret) {
-                               /* not enough space to reserve */
-                               retval = ret;
-                               goto out_unlock;
-                       }
-               }

-               ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
-                                           ~0, EXTENT_STATUS_DELAYED);
-               if (ret) {
+               ret = ext4_insert_delayed_block(inode, map->m_lblk);
+               if (ret != 0) {
                        retval = ret;
                        goto out_unlock;
                }
@@ -2088,6 +1964,9 @@
        }
        if (ret == 0)
                ret = err;
+       err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
+       if (ret == 0)
+               ret = err;
        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
        err = ext4_journal_stop(handle);
        if (!ret)
@@ -2169,6 +2048,15 @@
                len = size & ~PAGE_MASK;
        else
                len = PAGE_SIZE;
+
+       /* Should never happen but for bugs in other kernel subsystems */
+       if (!page_has_buffers(page)) {
+               ext4_warning_inode(inode,
+                  "page %lu does not have buffers attached", page->index);
+               ClearPageDirty(page);
+               unlock_page(page);
+               return 0;
+       }

        page_bufs = page_buffers(page);
        /*
@@ -2262,7 +2150,7 @@
        return err;
 }

-#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))

 /*
  * mballoc gives us at most this number of blocks...
@@ -2372,7 +2260,84 @@
                if (err < 0)
                        return err;
        }
-       return lblk < blocks;
+       if (lblk >= blocks) {
+               mpd->scanned_until_end = 1;
+               return 0;
+       }
+       return 1;
+}
+
+/*
+ * mpage_process_page - update page buffers corresponding to changed extent and
+ *                      may submit fully mapped page for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ * @m_lblk - logical block mapping.
+ * @m_pblk - corresponding physical mapping.
+ * @map_bh - determines on return whether this page requires any further
+ *           mapping or not.
+ * Scan given page buffers corresponding to changed extent and update buffer
+ * state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits.
+ * If the given page is not fully mapped, we update @map to the next extent in
+ * the given page that needs mapping & return @map_bh as true.
+ */
+static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+                             ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+                             bool *map_bh)
+{
+       struct buffer_head *head, *bh;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       ext4_lblk_t lblk = *m_lblk;
+       ext4_fsblk_t pblock = *m_pblk;
+       int err = 0;
+       int blkbits = mpd->inode->i_blkbits;
+       ssize_t io_end_size = 0;
+       struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+
+       bh = head = page_buffers(page);
+       do {
+               if (lblk < mpd->map.m_lblk)
+                       continue;
+               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+                       /*
+                        * Buffer after end of mapped extent.
+                        * Find next buffer in the page to map.
+                        */
+                       mpd->map.m_len = 0;
+                       mpd->map.m_flags = 0;
+                       io_end_vec->size += io_end_size;
+                       io_end_size = 0;
+
+                       err = mpage_process_page_bufs(mpd, head, bh, lblk);
+                       if (err > 0)
+                               err = 0;
+                       if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+                               io_end_vec = ext4_alloc_io_end_vec(io_end);
+                               if (IS_ERR(io_end_vec)) {
+                                       err = PTR_ERR(io_end_vec);
+                                       goto out;
+                               }
+                               io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
+                       }
+                       *map_bh = true;
+                       goto out;
+               }
+               if (buffer_delay(bh)) {
+                       clear_buffer_delay(bh);
+                       bh->b_blocknr = pblock++;
+               }
+               clear_buffer_unwritten(bh);
+               io_end_size += (1 << blkbits);
+       } while (lblk++, (bh = bh->b_this_page) != head);
+
+       io_end_vec->size += io_end_size;
+       io_end_size = 0;
+       *map_bh = false;
+out:
+       *m_lblk = lblk;
+       *m_pblk = pblock;
+       return err;
 }

 /*
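
The contract the following hunks rely on: mpage_process_page() advances *m_lblk and *m_pblk past whatever it mapped, and sets *map_bh when the page still contains buffers needing a mapping, so the caller knows whether to submit the page or go map another extent first. The caller-side shape, restated from the next hunk:

    err = mpage_process_page(mpd, page, &lblk, &pblock, &map_bh);
    if (err < 0 || map_bh)
            goto out;       /* error, or page needs further mapping */
    err = mpage_submit_page(mpd, page);     /* fully mapped: submit it */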
@@ -2394,12 +2359,12 @@
        struct pagevec pvec;
        int nr_pages, i;
        struct inode *inode = mpd->inode;
-       struct buffer_head *head, *bh;
        int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
        pgoff_t start, end;
        ext4_lblk_t lblk;
-       sector_t pblock;
+       ext4_fsblk_t pblock;
        int err;
+       bool map_bh = false;

        start = mpd->map.m_lblk >> bpp_bits;
        end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2415,50 +2380,19 @@
        for (i = 0; i < nr_pages; i++) {
                struct page *page = pvec.pages[i];

-               bh = head = page_buffers(page);
-               do {
-                       if (lblk < mpd->map.m_lblk)
-                               continue;
-                       if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
-                               /*
-                                * Buffer after end of mapped extent.
-                                * Find next buffer in the page to map.
-                                */
-                               mpd->map.m_len = 0;
-                               mpd->map.m_flags = 0;
-                               /*
-                                * FIXME: If dioread_nolock supports
-                                * blocksize < pagesize, we need to make
-                                * sure we add size mapped so far to
-                                * io_end->size as the following call
-                                * can submit the page for IO.
-                                */
-                               err = mpage_process_page_bufs(mpd, head,
-                                                             bh, lblk);
-                               pagevec_release(&pvec);
-                               if (err > 0)
-                                       err = 0;
-                               return err;
-                       }
-                       if (buffer_delay(bh)) {
-                               clear_buffer_delay(bh);
-                               bh->b_blocknr = pblock++;
-                       }
-                       clear_buffer_unwritten(bh);
-               } while (lblk++, (bh = bh->b_this_page) != head);
-
+               err = mpage_process_page(mpd, page, &lblk, &pblock,
+                                        &map_bh);
                /*
-                * FIXME: This is going to break if dioread_nolock
-                * supports blocksize < pagesize as we will try to
-                * convert potentially unmapped parts of inode.
+                * If map_bh is true, means page may require further bh
+                * mapping, or maybe the page was submitted for IO.
+                * So we return to call further extent mapping.
                 */
-               mpd->io_submit.io_end->size += PAGE_SIZE;
+               if (err < 0 || map_bh)
+                       goto out;
                /* Page fully mapped - let IO run! */
                err = mpage_submit_page(mpd, page);
-               if (err < 0) {
-                       pagevec_release(&pvec);
-                       return err;
-               }
+               if (err < 0)
+                       goto out;
        }
        pagevec_release(&pvec);
 }
@@ -2466,6 +2400,9 @@
        mpd->map.m_len = 0;
        mpd->map.m_flags = 0;
        return 0;
+out:
+       pagevec_release(&pvec);
+       return err;
 }

@@ -2497,7 +2434,7 @@
        dioread_nolock = ext4_should_dioread_nolock(inode);
        if (dioread_nolock)
                get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
-       if (map->m_flags & (1 << BH_Delay))
+       if (map->m_flags & BIT(BH_Delay))
                get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

        err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
@@ -2513,10 +2450,6 @@
        }

        BUG_ON(map->m_len == 0);
-       if (map->m_flags & EXT4_MAP_NEW) {
-               clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
-                                  map->m_len);
-       }
        return 0;
 }

@@ -2549,16 +2482,20 @@
        int err;
        loff_t disksize;
        int progress = 0;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       struct ext4_io_end_vec *io_end_vec;

-       mpd->io_submit.io_end->offset =
-                               ((loff_t)map->m_lblk) << inode->i_blkbits;
+       io_end_vec = ext4_alloc_io_end_vec(io_end);
+       if (IS_ERR(io_end_vec))
+               return PTR_ERR(io_end_vec);
+       io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
        do {
                err = mpage_map_one_extent(handle, mpd);
                if (err < 0) {
                        struct super_block *sb = inode->i_sb;

                        if (ext4_forced_shutdown(EXT4_SB(sb)) ||
-                           EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+                           ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
                                goto invalidate_dirty_pages;
                        /*
                         * Let the upper layers retry transient errors.
@@ -2615,10 +2552,11 @@
                        EXT4_I(inode)->i_disksize = disksize;
                        up_write(&EXT4_I(inode)->i_data_sem);
                        err2 = ext4_mark_inode_dirty(handle, inode);
-                       if (err2)
-                               ext4_error(inode->i_sb,
-                                          "Failed to mark inode %lu dirty",
-                                          inode->i_ino);
+                       if (err2) {
+                               ext4_error_err(inode->i_sb, -err2,
+                                              "Failed to mark inode %lu dirty",
+                                              inode->i_ino);
+                       }
                        if (!err)
                                err = err2;
                }
@@ -2666,7 +2604,7 @@
        long left = mpd->wbc->nr_to_write;
        pgoff_t index = mpd->first_page;
        pgoff_t end = mpd->last_page;
-       int tag;
+       xa_mark_t tag;
        int i, err = 0;
        int blkbits = mpd->inode->i_blkbits;
        ext4_lblk_t lblk;
@@ -2684,7 +2622,7 @@
                nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
                                                    tag);
                if (nr_pages == 0)
-                       goto out;
+                       break;

                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
@@ -2723,6 +2661,22 @@
                        wait_on_page_writeback(page);
                        BUG_ON(PageWriteback(page));

+                       /*
+                        * Should never happen but for buggy code in
+                        * other subsystems that call
+                        * set_page_dirty() without properly warning
+                        * the file system first. See [1] for more
+                        * information.
+                        *
+                        * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
+                        */
+                       if (!page_has_buffers(page)) {
+                               ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
+                               ClearPageDirty(page);
+                               unlock_page(page);
+                               continue;
+                       }
+
                        if (mpd->map.m_len == 0)
                                mpd->first_page = page->index;
                        mpd->next_page = page->index + 1;
@@ -2739,6 +2693,7 @@
                pagevec_release(&pvec);
                cond_resched();
        }
+       mpd->scanned_until_end = 1;
        return 0;
 out:
        pagevec_release(&pvec);
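
scanned_until_end is the replacement for the local done flag removed further down: mpage_prepare_extent_to_map() sets it once the scan truly reached the end of the range, and ext4_writepages() loops on it instead of second-guessing first_page/last_page. The resulting loop, condensed from the later hunks of this patch:

    while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
            ret = mpage_prepare_extent_to_map(&mpd);  /* may set the flag */
            if (!ret && mpd.map.m_len)
                    ret = mpage_map_and_submit_extent(handle, &mpd,
                                                      &give_up_on_write);
            if (ret)
                    break;
    }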
@@ -2757,7 +2712,6 @@
        struct inode *inode = mapping->host;
        int needed_blocks, rsv_blocks = 0, ret = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
-       bool done;
        struct blk_plug plug;
        bool give_up_on_write = false;

@@ -2791,18 +2745,9 @@
         * the stack trace.
         */
        if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
-                    sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+                    ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
                ret = -EROFS;
                goto out_writepages;
-       }
-
-       if (ext4_should_dioread_nolock(inode)) {
-               /*
-                * We may need to convert up to one extent per block in
-                * the page and we may dirty the inode.
-                */
-               rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
-                               PAGE_SIZE >> inode->i_blkbits);
        }

        /*
@@ -2821,6 +2766,15 @@
                                        EXT4_STATE_MAY_INLINE_DATA));
                ext4_destroy_inline_data(handle, inode);
                ext4_journal_stop(handle);
+       }
+
+       if (ext4_should_dioread_nolock(inode)) {
+               /*
+                * We may need to convert up to one extent per block in
+                * the page and we may dirty the inode.
+                */
+               rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+                                               PAGE_SIZE >> inode->i_blkbits);
        }

        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
@@ -2843,7 +2797,6 @@
 retry:
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
-       done = false;
        blk_start_plug(&plug);

        /*
@@ -2853,22 +2806,23 @@
         * started.
         */
        mpd.do_map = 0;
+       mpd.scanned_until_end = 0;
        mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
        if (!mpd.io_submit.io_end) {
                ret = -ENOMEM;
                goto unplug;
        }
        ret = mpage_prepare_extent_to_map(&mpd);
+       /* Unlock pages we didn't use */
+       mpage_release_unused_pages(&mpd, false);
        /* Submit prepared bio */
        ext4_io_submit(&mpd.io_submit);
        ext4_put_io_end_defer(mpd.io_submit.io_end);
        mpd.io_submit.io_end = NULL;
-       /* Unlock pages we didn't use */
-       mpage_release_unused_pages(&mpd, false);
        if (ret < 0)
                goto unplug;

-       while (!done && mpd.first_page <= mpd.last_page) {
+       while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
                /* For each extent of pages we use new io_end */
                mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
                if (!mpd.io_submit.io_end) {
@@ -2903,26 +2857,15 @@

                trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
                ret = mpage_prepare_extent_to_map(&mpd);
-               if (!ret) {
-                       if (mpd.map.m_len)
-                               ret = mpage_map_and_submit_extent(handle, &mpd,
+               if (!ret && mpd.map.m_len)
+                       ret = mpage_map_and_submit_extent(handle, &mpd,
                                        &give_up_on_write);
-                       else {
-                               /*
-                                * We scanned the whole range (or exhausted
-                                * nr_to_write), submitted what was mapped and
-                                * didn't find anything needing mapping. We are
-                                * done.
-                                */
-                               done = true;
-                       }
-               }
                /*
                 * Caution: If the handle is synchronous,
                 * ext4_journal_stop() can wait for transaction commit
                 * to finish which may depend on writeback of pages to
                 * complete or on page lock to be released. In that
-                * case, we have to wait until after after we have
+                * case, we have to wait until after we have
                 * submitted all the IO, released page locks we hold,
                 * and dropped io_end reference (for extent conversion
                 * to be able to complete) before stopping the handle.
@@ -2932,10 +2875,11 @@
                        handle = NULL;
                        mpd.do_map = 0;
                }
-               /* Submit prepared bio */
-               ext4_io_submit(&mpd.io_submit);
                /* Unlock pages we didn't use */
                mpage_release_unused_pages(&mpd, give_up_on_write);
+               /* Submit prepared bio */
+               ext4_io_submit(&mpd.io_submit);
+
                /*
                 * Drop our io_end reference we got from init. We have
                 * to be careful and use deferred io_end finishing if
@@ -3002,7 +2946,7 @@
        percpu_down_read(&sbi->s_writepages_rwsem);
        trace_ext4_writepages(inode, wbc);

-       ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
+       ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
        trace_ext4_writepages_result(inode, wbc, ret,
                                     nr_to_write - wbc->nr_to_write);
        percpu_up_read(&sbi->s_writepages_rwsem);
@@ -3212,58 +3156,42 @@
        end = start + copied - 1;

        /*
-        * generic_write_end() will run mark_inode_dirty() if i_size
-        * changes. So let's piggyback the i_disksize mark_inode_dirty
-        * into that.
+        * Since we are holding inode lock, we are sure i_disksize <=
+        * i_size. We also know that if i_disksize < i_size, there are
+        * delalloc writes pending in the range upto i_size. If the end of
+        * the current write is <= i_size, there's no need to touch
+        * i_disksize since writeback will push i_disksize upto i_size
+        * eventually. If the end of the current write is > i_size and
+        * inside an allocated block (ext4_da_should_update_i_disksize()
+        * check), we need to update i_disksize here as neither
+        * ext4_writepage() nor certain ext4_writepages() paths not
+        * allocating blocks update i_disksize.
+        *
+        * Note that we defer inode dirtying to generic_write_end() /
+        * ext4_da_write_inline_data_end().
         */
        new_i_size = pos + copied;
-       if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+       if (copied && new_i_size > inode->i_size) {
                if (ext4_has_inline_data(inode) ||
-                   ext4_da_should_update_i_disksize(page, end)) {
+                   ext4_da_should_update_i_disksize(page, end))
                        ext4_update_i_disksize(inode, new_i_size);
-                       /* We need to mark inode dirty even if
-                        * new_i_size is less that inode->i_size
-                        * bu greater than i_disksize.(hint delalloc)
-                        */
-                       ext4_mark_inode_dirty(handle, inode);
-               }
        }

        if (write_mode != CONVERT_INLINE_DATA &&
            ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
            ext4_has_inline_data(inode))
-               ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+               ret = ext4_da_write_inline_data_end(inode, pos, len, copied,
                                                     page);
        else
-               ret2 = generic_write_end(file, mapping, pos, len, copied,
+               ret = generic_write_end(file, mapping, pos, len, copied,
                                         page, fsdata);

-       copied = ret2;
-       if (ret2 < 0)
-               ret = ret2;
+       copied = ret;
        ret2 = ext4_journal_stop(handle);
-       if (!ret)
+       if (unlikely(ret2 && !ret))
                ret = ret2;

        return ret ? ret : copied;
-}
-
-static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
-                                  unsigned int length)
-{
-       /*
-        * Drop reserved blocks
-        */
-       BUG_ON(!PageLocked(page));
-       if (!page_has_buffers(page))
-               goto out;
-
-       ext4_da_page_release_reservation(page, offset, length);
-
-out:
-       ext4_invalidatepage(page, offset, length);
-
-       return;
 }

 /*
....@@ -3328,13 +3256,15 @@
33283256 {
33293257 struct inode *inode = mapping->host;
33303258 journal_t *journal;
3259
+ sector_t ret = 0;
33313260 int err;
33323261
3262
+ inode_lock_shared(inode);
33333263 /*
33343264 * We can get here for an inline file via the FIBMAP ioctl
33353265 */
33363266 if (ext4_has_inline_data(inode))
3337
- return 0;
3267
+ goto out;
33383268
33393269 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
33403270 test_opt(inode->i_sb, DELALLOC)) {
....@@ -3373,10 +3303,14 @@
33733303 jbd2_journal_unlock_updates(journal);
33743304
33753305 if (err)
3376
- return 0;
3306
+ goto out;
33773307 }
33783308
3379
- return generic_block_bmap(mapping, block, ext4_get_block);
3309
+ ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
3310
+
3311
+out:
3312
+ inode_unlock_shared(inode);
3313
+ return ret;
33803314 }
33813315
33823316 static int ext4_readpage(struct file *file, struct page *page)
....@@ -3390,23 +3324,20 @@
33903324 ret = ext4_readpage_inline(inode, page);
33913325
33923326 if (ret == -EAGAIN)
3393
- return ext4_mpage_readpages(page->mapping, NULL, page, 1,
3394
- false);
3327
+ return ext4_mpage_readpages(inode, NULL, page);
33953328
33963329 return ret;
33973330 }
33983331
3399
-static int
3400
-ext4_readpages(struct file *file, struct address_space *mapping,
3401
- struct list_head *pages, unsigned nr_pages)
3332
+static void ext4_readahead(struct readahead_control *rac)
34023333 {
3403
- struct inode *inode = mapping->host;
3334
+ struct inode *inode = rac->mapping->host;
34043335
3405
- /* If the file has inline data, no need to do readpages. */
3336
+ /* If the file has inline data, no need to do readahead. */
34063337 if (ext4_has_inline_data(inode))
3407
- return 0;
3338
+ return;
34083339
3409
- return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true);
3340
+ ext4_mpage_readpages(inode, rac, NULL);
34103341 }
34113342
34123343 static void ext4_invalidatepage(struct page *page, unsigned int offset,
....@@ -3455,7 +3386,7 @@
34553386 if (PageChecked(page))
34563387 return 0;
34573388 if (journal)
3458
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
3389
+ return jbd2_journal_try_to_free_buffers(journal, page);
34593390 else
34603391 return try_to_free_buffers(page);
34613392 }
....@@ -3464,216 +3395,215 @@
34643395 {
34653396 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
34663397
3467
- if (journal)
3468
- return !jbd2_transaction_committed(journal,
3469
- EXT4_I(inode)->i_datasync_tid);
3398
+ if (journal) {
3399
+ if (jbd2_transaction_committed(journal,
3400
+ EXT4_I(inode)->i_datasync_tid))
3401
+ return false;
3402
+ if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
3403
+ return !list_empty(&EXT4_I(inode)->i_fc_list);
3404
+ return true;
3405
+ }
3406
+
34703407 /* Any metadata buffers to write? */
34713408 if (!list_empty(&inode->i_mapping->private_list))
34723409 return true;
34733410 return inode->i_state & I_DIRTY_DATASYNC;
34743411 }
34753412
3476
-static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
3477
- unsigned flags, struct iomap *iomap)
3413
+static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
3414
+ struct ext4_map_blocks *map, loff_t offset,
3415
+ loff_t length)
34783416 {
3479
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3480
- unsigned int blkbits = inode->i_blkbits;
3481
- unsigned long first_block, last_block;
3482
- struct ext4_map_blocks map;
3483
- bool delalloc = false;
3484
- int ret;
3485
-
3486
- if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
3487
- return -EINVAL;
3488
- first_block = offset >> blkbits;
3489
- last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
3490
- EXT4_MAX_LOGICAL_BLOCK);
3491
-
3492
- if (flags & IOMAP_REPORT) {
3493
- if (ext4_has_inline_data(inode)) {
3494
- ret = ext4_inline_data_iomap(inode, iomap);
3495
- if (ret != -EAGAIN) {
3496
- if (ret == 0 && offset >= iomap->length)
3497
- ret = -ENOENT;
3498
- return ret;
3499
- }
3500
- }
3501
- } else {
3502
- if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
3503
- return -ERANGE;
3504
- }
3505
-
3506
- map.m_lblk = first_block;
3507
- map.m_len = last_block - first_block + 1;
3508
-
3509
- if (flags & IOMAP_REPORT) {
3510
- ret = ext4_map_blocks(NULL, inode, &map, 0);
3511
- if (ret < 0)
3512
- return ret;
3513
-
3514
- if (ret == 0) {
3515
- ext4_lblk_t end = map.m_lblk + map.m_len - 1;
3516
- struct extent_status es;
3517
-
3518
- ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es);
3519
-
3520
- if (!es.es_len || es.es_lblk > end) {
3521
- /* entire range is a hole */
3522
- } else if (es.es_lblk > map.m_lblk) {
3523
- /* range starts with a hole */
3524
- map.m_len = es.es_lblk - map.m_lblk;
3525
- } else {
3526
- ext4_lblk_t offs = 0;
3527
-
3528
- if (es.es_lblk < map.m_lblk)
3529
- offs = map.m_lblk - es.es_lblk;
3530
- map.m_lblk = es.es_lblk + offs;
3531
- map.m_len = es.es_len - offs;
3532
- delalloc = true;
3533
- }
3534
- }
3535
- } else if (flags & IOMAP_WRITE) {
3536
- int dio_credits;
3537
- handle_t *handle;
3538
- int retries = 0;
3539
-
3540
- /* Trim mapping request to maximum we can map at once for DIO */
3541
- if (map.m_len > DIO_MAX_BLOCKS)
3542
- map.m_len = DIO_MAX_BLOCKS;
3543
- dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
3544
-retry:
3545
- /*
3546
- * Either we allocate blocks and then we don't get unwritten
3547
- * extent so we have reserved enough credits, or the blocks
3548
- * are already allocated and unwritten and in that case
3549
- * extent conversion fits in the credits as well.
3550
- */
3551
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
3552
- dio_credits);
3553
- if (IS_ERR(handle))
3554
- return PTR_ERR(handle);
3555
-
3556
- ret = ext4_map_blocks(handle, inode, &map,
3557
- EXT4_GET_BLOCKS_CREATE_ZERO);
3558
- if (ret < 0) {
3559
- ext4_journal_stop(handle);
3560
- if (ret == -ENOSPC &&
3561
- ext4_should_retry_alloc(inode->i_sb, &retries))
3562
- goto retry;
3563
- return ret;
3564
- }
3565
-
3566
- /*
3567
- * If we added blocks beyond i_size, we need to make sure they
3568
- * will get truncated if we crash before updating i_size in
3569
- * ext4_iomap_end(). For faults we don't need to do that (and
3570
- * even cannot because for orphan list operations inode_lock is
3571
- * required) - if we happen to instantiate block beyond i_size,
3572
- * it is because we race with truncate which has already added
3573
- * the inode to the orphan list.
3574
- */
3575
- if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
3576
- (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
3577
- int err;
3578
-
3579
- err = ext4_orphan_add(handle, inode);
3580
- if (err < 0) {
3581
- ext4_journal_stop(handle);
3582
- return err;
3583
- }
3584
- }
3585
- ext4_journal_stop(handle);
3586
- } else {
3587
- ret = ext4_map_blocks(NULL, inode, &map, 0);
3588
- if (ret < 0)
3589
- return ret;
3590
- }
3417
+ u8 blkbits = inode->i_blkbits;
35913418
35923419 /*
35933420 * Writes that span EOF might trigger an I/O size update on completion,
3594
- * so consider them to be dirty for the purposes of O_DSYNC, even if
3595
- * there is no other metadata changes being made or are pending here.
3421
+ * so consider them to be dirty for the purpose of O_DSYNC, even if
3422
+ * there is no other metadata changes being made or are pending.
35963423 */
35973424 iomap->flags = 0;
35983425 if (ext4_inode_datasync_dirty(inode) ||
35993426 offset + length > i_size_read(inode))
36003427 iomap->flags |= IOMAP_F_DIRTY;
3601
- iomap->bdev = inode->i_sb->s_bdev;
3602
- iomap->dax_dev = sbi->s_daxdev;
3603
- iomap->offset = (u64)first_block << blkbits;
3604
- iomap->length = (u64)map.m_len << blkbits;
36053428
3606
- if (ret == 0) {
3607
- iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
3608
- iomap->addr = IOMAP_NULL_ADDR;
3609
- } else {
3610
- if (map.m_flags & EXT4_MAP_MAPPED) {
3611
- iomap->type = IOMAP_MAPPED;
3612
- } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
3613
- iomap->type = IOMAP_UNWRITTEN;
3614
- } else {
3615
- WARN_ON_ONCE(1);
3616
- return -EIO;
3617
- }
3618
- iomap->addr = (u64)map.m_pblk << blkbits;
3619
- }
3620
-
3621
- if (map.m_flags & EXT4_MAP_NEW)
3429
+ if (map->m_flags & EXT4_MAP_NEW)
36223430 iomap->flags |= IOMAP_F_NEW;
36233431
3432
+ iomap->bdev = inode->i_sb->s_bdev;
3433
+ iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
3434
+ iomap->offset = (u64) map->m_lblk << blkbits;
3435
+ iomap->length = (u64) map->m_len << blkbits;
3436
+
3437
+ if ((map->m_flags & EXT4_MAP_MAPPED) &&
3438
+ !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3439
+ iomap->flags |= IOMAP_F_MERGED;
3440
+
3441
+ /*
3442
+ * Flags passed to ext4_map_blocks() for direct I/O writes can result
3443
+ * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
3444
+ * set. In order for any allocated unwritten extents to be converted
3445
+ * into written extents correctly within the ->end_io() handler, we
3446
+ * need to ensure that the iomap->type is set appropriately. Hence, the
3447
+ * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
3448
+ * been set first.
3449
+ */
3450
+ if (map->m_flags & EXT4_MAP_UNWRITTEN) {
3451
+ iomap->type = IOMAP_UNWRITTEN;
3452
+ iomap->addr = (u64) map->m_pblk << blkbits;
3453
+ } else if (map->m_flags & EXT4_MAP_MAPPED) {
3454
+ iomap->type = IOMAP_MAPPED;
3455
+ iomap->addr = (u64) map->m_pblk << blkbits;
3456
+ } else {
3457
+ iomap->type = IOMAP_HOLE;
3458
+ iomap->addr = IOMAP_NULL_ADDR;
3459
+ }
3460
+}
3461
+
+static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
+			    unsigned int flags)
+{
+	handle_t *handle;
+	u8 blkbits = inode->i_blkbits;
+	int ret, dio_credits, m_flags = 0, retries = 0;
+
+	/*
+	 * Trim the mapping request to the maximum value that we can map at
+	 * once for direct I/O.
+	 */
+	if (map->m_len > DIO_MAX_BLOCKS)
+		map->m_len = DIO_MAX_BLOCKS;
+	dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+retry:
+	/*
+	 * Either we allocate blocks and then don't get an unwritten extent, so
+	 * in that case we have reserved enough credits. Or, the blocks are
+	 * already allocated and unwritten. In that case, the extent conversion
+	 * fits into the credits as well.
+	 */
+	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	/*
+	 * DAX and direct I/O are the only two operations that are currently
+	 * supported with IOMAP_WRITE.
+	 */
+	WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+	if (IS_DAX(inode))
+		m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+	/*
+	 * We use i_size instead of i_disksize here because delalloc writeback
+	 * can complete at any point during the I/O and subsequently push the
+	 * i_disksize out to i_size. This could be beyond where direct I/O is
+	 * happening and thus expose allocated blocks to direct I/O reads.
+	 */
+	else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
+		m_flags = EXT4_GET_BLOCKS_CREATE;
+	else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
+
+	ret = ext4_map_blocks(handle, inode, map, m_flags);
+
+	/*
+	 * We cannot fill holes in indirect tree based inodes as that could
+	 * expose stale data in the case of a crash. Use the magic error code
+	 * to fallback to buffered I/O.
+	 */
+	if (!m_flags && !ret)
+		ret = -ENOTBLK;
+
+	ext4_journal_stop(handle);
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+	return ret;
+}
+
+
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
+{
+	int ret;
+	struct ext4_map_blocks map;
+	u8 blkbits = inode->i_blkbits;
+
+	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+		return -ERANGE;
+
+	/*
+	 * Calculate the first and last logical blocks respectively.
+	 */
+	map.m_lblk = offset >> blkbits;
+	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+	if (flags & IOMAP_WRITE) {
+		/*
+		 * We check here if the blocks are already allocated, then we
+		 * don't need to start a journal txn and we can directly return
+		 * the mapping information. This could boost performance
+		 * especially in multi-threaded overwrite requests.
+		 */
+		if (offset + length <= i_size_read(inode)) {
+			ret = ext4_map_blocks(NULL, inode, &map, 0);
+			if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
+				goto out;
+		}
+		ret = ext4_iomap_alloc(inode, &map, flags);
+	} else {
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+	}
+
+	if (ret < 0)
+		return ret;
+out:
+
+	/*
+	 * When inline encryption is enabled, sometimes I/O to an encrypted file
+	 * has to be broken up to guarantee DUN contiguity. Handle this by
+	 * limiting the length of the mapping returned.
+	 */
+	map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+
+	ext4_set_iomap(inode, iomap, &map, offset, length);
+
 	return 0;
+}
+
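The m_lblk/m_len computation in ext4_iomap_begin() rounds an arbitrary byte range out to whole blocks and is easy to get off by one. A stand-alone worked example with 4 KiB blocks (blkbits = 12, the common ext4 default; the EXT4_MAX_LOGICAL_BLOCK clamp is omitted here):

#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;		/* 4 KiB blocks */
	long long offset = 5000, length = 10000;

	/* First block covered, then last block, inclusive. */
	unsigned long long m_lblk = offset >> blkbits;
	unsigned long long m_len =
		((offset + length - 1) >> blkbits) - m_lblk + 1;

	/* Bytes 5000..14999 touch blocks 1..3, so the length is 3. */
	printf("m_lblk=%llu m_len=%llu\n", m_lblk, m_len);
	return 0;
}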
+static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
+		loff_t length, unsigned flags, struct iomap *iomap,
+		struct iomap *srcmap)
+{
+	int ret;
+
+	/*
+	 * Even for writes we don't need to allocate blocks, so just pretend
+	 * we are reading to save overhead of starting a transaction.
+	 */
+	flags &= ~IOMAP_WRITE;
+	ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
+	WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED);
+	return ret;
 }
 
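Both branches of ext4_iomap_begin() are easy to exercise from user space with O_DIRECT: the first pwrite() below lands on unallocated blocks and goes through ext4_iomap_alloc(), while the second is a pure overwrite of already-written blocks and is served without starting a transaction (overwrites are also the case ext4_iomap_overwrite_begin() is meant for in this series). A hedged sketch with most error handling trimmed, assuming 4096 bytes satisfies the device's alignment requirement:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC | O_DIRECT, 0644);

	/* O_DIRECT needs a block-aligned buffer, offset and length. */
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;

	memset(buf, 1, 4096);
	pwrite(fd, buf, 4096, 0);	/* beyond EOF: allocation path */
	fsync(fd);			/* commit, convert unwritten extent */

	memset(buf, 2, 4096);
	pwrite(fd, buf, 4096, 0);	/* in-place overwrite fast path */

	free(buf);
	return close(fd);
}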
 static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
			  ssize_t written, unsigned flags, struct iomap *iomap)
 {
-	int ret = 0;
-	handle_t *handle;
-	int blkbits = inode->i_blkbits;
-	bool truncate = false;
-
-	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
-		return 0;
-
-	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto orphan_del;
-	}
-	if (ext4_update_inode_size(inode, offset + written))
-		ext4_mark_inode_dirty(handle, inode);
 	/*
-	 * We may need to truncate allocated but not written blocks beyond EOF.
+	 * Check to see whether an error occurred while writing out the data to
+	 * the allocated blocks. If so, return the magic error code so that we
+	 * fallback to buffered I/O and attempt to complete the remainder of
+	 * the I/O. Any blocks that may have been allocated in preparation for
+	 * the direct I/O will be reused during buffered I/O.
 	 */
-	if (iomap->offset + iomap->length >
-	    ALIGN(inode->i_size, 1 << blkbits)) {
-		ext4_lblk_t written_blk, end_blk;
+	if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+		return -ENOTBLK;
 
-		written_blk = (offset + written) >> blkbits;
-		end_blk = (offset + length) >> blkbits;
-		if (written_blk < end_blk && ext4_can_truncate(inode))
-			truncate = true;
-	}
-	/*
-	 * Remove inode from orphan list if we were extending a inode and
-	 * everything went fine.
-	 */
-	if (!truncate && inode->i_nlink &&
-	    !list_empty(&EXT4_I(inode)->i_orphan))
-		ext4_orphan_del(handle, inode);
-	ext4_journal_stop(handle);
-	if (truncate) {
-		ext4_truncate_failed_write(inode);
-orphan_del:
-		/*
-		 * If truncate failed early the inode might still be on the
-		 * orphan list; we need to make sure the inode is removed from
-		 * the orphan list in that case.
-		 */
-		if (inode->i_nlink)
-			ext4_orphan_del(NULL, inode);
-	}
-	return ret;
+	return 0;
 }
 
 const struct iomap_ops ext4_iomap_ops = {
@@ -3681,310 +3611,94 @@
	.iomap_end		= ext4_iomap_end,
 };
 
-static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-			   ssize_t size, void *private)
+const struct iomap_ops ext4_iomap_overwrite_ops = {
+	.iomap_begin		= ext4_iomap_overwrite_begin,
+	.iomap_end		= ext4_iomap_end,
+};
+
+static bool ext4_iomap_is_delalloc(struct inode *inode,
+				   struct ext4_map_blocks *map)
 {
-	ext4_io_end_t *io_end = private;
+	struct extent_status es;
+	ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
 
-	/* if not async direct IO just return */
-	if (!io_end)
-		return 0;
+	ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+				  map->m_lblk, end, &es);
 
-	ext_debug("ext4_end_io_dio(): io_end 0x%p "
-		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
-		  io_end, io_end->inode->i_ino, iocb, offset, size);
+	if (!es.es_len || es.es_lblk > end)
+		return false;
+
+	if (es.es_lblk > map->m_lblk) {
+		map->m_len = es.es_lblk - map->m_lblk;
+		return false;
+	}
+
+	offset = map->m_lblk - es.es_lblk;
+	map->m_len = es.es_len - offset;
+
+	return true;
+}
+
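The trimming in ext4_iomap_is_delalloc() is subtle: a delayed extent that begins inside the queried range does not make the whole range delalloc; it shortens the reported mapping so the non-delayed head is returned first. A stand-alone model of both cases (stand-in types, not the kernel's extent-status API):

#include <stdbool.h>
#include <stdio.h>

struct es { unsigned lblk, len; };	/* a found delayed extent */

/* Mirrors the trim logic: true only if the map start is delayed. */
static bool is_delalloc(unsigned m_lblk, unsigned *m_len, struct es es)
{
	unsigned end = m_lblk + *m_len - 1;

	if (!es.len || es.lblk > end)
		return false;
	if (es.lblk > m_lblk) {			/* delayed extent starts later: */
		*m_len = es.lblk - m_lblk;	/* report the head only */
		return false;
	}
	*m_len = es.len - (m_lblk - es.lblk);	/* inside a delayed extent */
	return true;
}

int main(void)
{
	unsigned len = 20;
	struct es e = { 15, 8 };	/* delayed blocks 15..22 */

	/* Query blocks 10..29: head 10..14 is not delayed, len trims to 5. */
	printf("%d len=%u\n", is_delalloc(10, &len, e), len);
	return 0;
}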
+static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+				   loff_t length, unsigned int flags,
+				   struct iomap *iomap, struct iomap *srcmap)
+{
+	int ret;
+	bool delalloc = false;
+	struct ext4_map_blocks map;
+	u8 blkbits = inode->i_blkbits;
+
+	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+		return -EINVAL;
+
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_inline_data_iomap(inode, iomap);
+		if (ret != -EAGAIN) {
+			if (ret == 0 && offset >= iomap->length)
+				ret = -ENOENT;
+			return ret;
+		}
+	}
 
 	/*
-	 * Error during AIO DIO. We cannot convert unwritten extents as the
-	 * data was not written. Just clear the unwritten flag and drop io_end.
+	 * Calculate the first and last logical block respectively.
	 */
-	if (size <= 0) {
-		ext4_clear_io_unwritten_flag(io_end);
-		size = 0;
+	map.m_lblk = offset >> blkbits;
+	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+	/*
+	 * Fiemap callers may call for offset beyond s_bitmap_maxbytes.
+	 * So handle it here itself instead of querying ext4_map_blocks().
+	 * Since ext4_map_blocks() will warn about it and will return
+	 * -EIO error.
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+		if (offset >= sbi->s_bitmap_maxbytes) {
+			map.m_flags = 0;
+			goto set_iomap;
+		}
 	}
-	io_end->offset = offset;
-	io_end->size = size;
-	ext4_put_io_end(io_end);
+
+	ret = ext4_map_blocks(NULL, inode, &map, 0);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		delalloc = ext4_iomap_is_delalloc(inode, &map);
+
+set_iomap:
+	ext4_set_iomap(inode, iomap, &map, offset, length);
+	if (delalloc && iomap->type == IOMAP_HOLE)
+		iomap->type = IOMAP_DELALLOC;
 
 	return 0;
 }
 
-/*
- * Handling of direct IO writes.
- *
- * For ext4 extent files, ext4 will do direct-io write even to holes,
- * preallocated extents, and those write extend the file, no need to
- * fall back to buffered IO.
- *
- * For holes, we fallocate those blocks, mark them as unwritten
- * If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as unwritten.
- *
- * The unwritten extents will be converted to written when DIO is completed.
- * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the conversion
- * when async direct IO completed.
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- */
-static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	ssize_t ret;
-	loff_t offset = iocb->ki_pos;
-	size_t count = iov_iter_count(iter);
-	int overwrite = 0;
-	get_block_t *get_block_func = NULL;
-	int dio_flags = 0;
-	loff_t final_size = offset + count;
-	int orphan = 0;
-	handle_t *handle;
-
-	if (final_size > inode->i_size || final_size > ei->i_disksize) {
-		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto out;
-		}
-		ret = ext4_orphan_add(handle, inode);
-		if (ret) {
-			ext4_journal_stop(handle);
-			goto out;
-		}
-		orphan = 1;
-		ext4_update_i_disksize(inode, inode->i_size);
-		ext4_journal_stop(handle);
-	}
-
-	BUG_ON(iocb->private == NULL);
-
-	/*
-	 * Make all waiters for direct IO properly wait also for extent
-	 * conversion. This also disallows race between truncate() and
-	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
-	 */
-	inode_dio_begin(inode);
-
-	/* If we do a overwrite dio, i_mutex locking can be released */
-	overwrite = *((int *)iocb->private);
-
-	if (overwrite)
-		inode_unlock(inode);
-
-	/*
-	 * For extent mapped files we could direct write to holes and fallocate.
-	 *
-	 * Allocated blocks to fill the hole are marked as unwritten to prevent
-	 * parallel buffered read to expose the stale data before DIO complete
-	 * the data IO.
-	 *
-	 * As to previously fallocated extents, ext4 get_block will just simply
-	 * mark the buffer mapped but still keep the extents unwritten.
-	 *
-	 * For non AIO case, we will convert those unwritten extents to written
-	 * after return back from blockdev_direct_IO. That way we save us from
-	 * allocating io_end structure and also the overhead of offloading
-	 * the extent convertion to a workqueue.
-	 *
-	 * For async DIO, the conversion needs to be deferred when the
-	 * IO is completed. The ext4 end_io callback function will be
-	 * called to take care of the conversion work. Here for async
-	 * case, we allocate an io_end structure to hook to the iocb.
-	 */
-	iocb->private = NULL;
-	if (overwrite)
-		get_block_func = ext4_dio_get_block_overwrite;
-	else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
-		 round_down(offset, i_blocksize(inode)) >= inode->i_size) {
-		get_block_func = ext4_dio_get_block;
-		dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
-	} else if (is_sync_kiocb(iocb)) {
-		get_block_func = ext4_dio_get_block_unwritten_sync;
-		dio_flags = DIO_LOCKING;
-	} else {
-		get_block_func = ext4_dio_get_block_unwritten_async;
-		dio_flags = DIO_LOCKING;
-	}
-	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				   get_block_func, ext4_end_io_dio, NULL,
-				   dio_flags);
-
-	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
-						EXT4_STATE_DIO_UNWRITTEN)) {
-		int err;
-		/*
-		 * for non AIO case, since the IO is already
-		 * completed, we could do the conversion right here
-		 */
-		err = ext4_convert_unwritten_extents(NULL, inode,
-						     offset, ret);
-		if (err < 0)
-			ret = err;
-		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-	}
-
-	inode_dio_end(inode);
-	/* take i_mutex locking again if we do a ovewrite dio */
-	if (overwrite)
-		inode_lock(inode);
-
-	if (ret < 0 && final_size > inode->i_size)
-		ext4_truncate_failed_write(inode);
-
-	/* Handle extending of i_size after direct IO write */
-	if (orphan) {
-		int err;
-
-		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle)) {
-			/*
-			 * We wrote the data but cannot extend
-			 * i_size. Bail out. In async io case, we do
-			 * not return error here because we have
-			 * already submmitted the corresponding
-			 * bio. Returning error here makes the caller
-			 * think that this IO is done and failed
-			 * resulting in race with bio's completion
-			 * handler.
-			 */
-			if (!ret)
-				ret = PTR_ERR(handle);
-			if (inode->i_nlink)
-				ext4_orphan_del(NULL, inode);
-
-			goto out;
-		}
-		if (inode->i_nlink)
-			ext4_orphan_del(handle, inode);
-		if (ret > 0) {
-			loff_t end = offset + ret;
-			if (end > inode->i_size || end > ei->i_disksize) {
-				ext4_update_i_disksize(inode, end);
-				if (end > inode->i_size)
-					i_size_write(inode, end);
-				/*
-				 * We're going to return a positive `ret'
-				 * here due to non-zero-length I/O, so there's
-				 * no way of reporting error returns from
-				 * ext4_mark_inode_dirty() to userspace. So
-				 * ignore it.
-				 */
-				ext4_mark_inode_dirty(handle, inode);
-			}
-		}
-		err = ext4_journal_stop(handle);
-		if (ret == 0)
-			ret = err;
-	}
-out:
-	return ret;
-}
-
-static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct address_space *mapping = iocb->ki_filp->f_mapping;
-	struct inode *inode = mapping->host;
-	size_t count = iov_iter_count(iter);
-	ssize_t ret;
-	loff_t offset = iocb->ki_pos;
-	loff_t size = i_size_read(inode);
-
-	if (offset >= size)
-		return 0;
-
-	/*
-	 * Shared inode_lock is enough for us - it protects against concurrent
-	 * writes & truncates and since we take care of writing back page cache,
-	 * we are protected against page writeback as well.
-	 */
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!inode_trylock_shared(inode))
-			return -EAGAIN;
-	} else {
-		inode_lock_shared(inode);
-	}
-
-	ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
-					   iocb->ki_pos + count - 1);
-	if (ret)
-		goto out_unlock;
-	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-				   iter, ext4_dio_get_block, NULL, NULL, 0);
-out_unlock:
-	inode_unlock_shared(inode);
-	return ret;
-}
-
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	size_t count = iov_iter_count(iter);
-	loff_t offset = iocb->ki_pos;
-	ssize_t ret;
-	int rw = iov_iter_rw(iter);
-
-	if (!fscrypt_dio_supported(iocb, iter))
-		return 0;
-
-	if (fsverity_active(inode))
-		return 0;
-
-	/*
-	 * If we are doing data journalling we don't support O_DIRECT
-	 */
-	if (ext4_should_journal_data(inode))
-		return 0;
-
-	/* Let buffer I/O handle the inline data case. */
-	if (ext4_has_inline_data(inode))
-		return 0;
-
-	if (trace_android_fs_dataread_start_enabled() &&
-	    (rw == READ)) {
-		char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
-
-		path = android_fstrace_get_pathname(pathbuf,
-						    MAX_TRACE_PATHBUF_LEN,
-						    inode);
-		trace_android_fs_dataread_start(inode, offset, count,
-						current->pid, path,
-						current->comm);
-	}
-	if (trace_android_fs_datawrite_start_enabled() &&
-	    (rw == WRITE)) {
-		char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
-
-		path = android_fstrace_get_pathname(pathbuf,
-						    MAX_TRACE_PATHBUF_LEN,
-						    inode);
-		trace_android_fs_datawrite_start(inode, offset, count,
-						 current->pid, path,
-						 current->comm);
-	}
-	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-	if (iov_iter_rw(iter) == READ)
-		ret = ext4_direct_IO_read(iocb, iter);
-	else
-		ret = ext4_direct_IO_write(iocb, iter);
-	trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
-
-	if (trace_android_fs_dataread_start_enabled() &&
-	    (rw == READ))
-		trace_android_fs_dataread_end(inode, offset, count);
-	if (trace_android_fs_datawrite_start_enabled() &&
-	    (rw == WRITE))
-		trace_android_fs_datawrite_end(inode, offset, count);
-
-	return ret;
-}
+const struct iomap_ops ext4_iomap_report_ops = {
+	.iomap_begin = ext4_iomap_begin_report,
+};
 
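ext4_iomap_report_ops backs the reporting paths (SEEK_HOLE/SEEK_DATA and fiemap in this series), which is where IOMAP_DELALLOC matters: dirty data that so far exists only in the page cache must still be reported as data. A runnable user-space probe:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	off_t hole;

	if (fd < 0)
		return 1;
	ftruncate(fd, 1 << 20);	/* 1 MiB sparse file, all hole */
	pwrite(fd, "x", 1, 0);	/* delalloc data in block 0, not yet on disk */

	/* Reported as data thanks to IOMAP_DELALLOC; the first hole
	 * typically starts at the block size (4096 here). */
	hole = lseek(fd, 0, SEEK_HOLE);
	printf("first hole at %lld\n", (long long)hole);
	return close(fd);
}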
 /*
  * Pages can be marked dirty completely asynchronously from ext4's journalling
@@ -4012,9 +3726,16 @@
	return __set_page_dirty_buffers(page);
 }
 
+static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
+				struct file *file, sector_t *span)
+{
+	return iomap_swapfile_activate(sis, file, span,
+				       &ext4_iomap_report_ops);
+}
+
 static const struct address_space_operations ext4_aops = {
	.readpage		= ext4_readpage,
-	.readpages		= ext4_readpages,
+	.readahead		= ext4_readahead,
	.writepage		= ext4_writepage,
	.writepages		= ext4_writepages,
	.write_begin		= ext4_write_begin,
@@ -4023,15 +3744,16 @@
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
-	.direct_IO		= ext4_direct_IO,
+	.direct_IO		= noop_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
+	.swap_activate		= ext4_iomap_swap_activate,
 };
 
 static const struct address_space_operations ext4_journalled_aops = {
	.readpage		= ext4_readpage,
-	.readpages		= ext4_readpages,
+	.readahead		= ext4_readahead,
	.writepage		= ext4_writepage,
	.writepages		= ext4_writepages,
	.write_begin		= ext4_write_begin,
@@ -4040,26 +3762,28 @@
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_journalled_invalidatepage,
	.releasepage		= ext4_releasepage,
-	.direct_IO		= ext4_direct_IO,
+	.direct_IO		= noop_direct_IO,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
+	.swap_activate		= ext4_iomap_swap_activate,
 };
 
 static const struct address_space_operations ext4_da_aops = {
	.readpage		= ext4_readpage,
-	.readpages		= ext4_readpages,
+	.readahead		= ext4_readahead,
	.writepage		= ext4_writepage,
	.writepages		= ext4_writepages,
	.write_begin		= ext4_da_write_begin,
	.write_end		= ext4_da_write_end,
	.set_page_dirty		= ext4_set_page_dirty,
	.bmap			= ext4_bmap,
-	.invalidatepage		= ext4_da_invalidatepage,
+	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
-	.direct_IO		= ext4_direct_IO,
+	.direct_IO		= noop_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
+	.swap_activate		= ext4_iomap_swap_activate,
 };
 
 static const struct address_space_operations ext4_dax_aops = {
@@ -4068,6 +3792,7 @@
	.set_page_dirty		= noop_set_page_dirty,
	.bmap			= ext4_bmap,
	.invalidatepage		= noop_invalidatepage,
+	.swap_activate		= ext4_iomap_swap_activate,
 };
 
 void ext4_set_aops(struct inode *inode)
@@ -4141,18 +3866,18 @@
		set_buffer_uptodate(bh);
 
	if (!buffer_uptodate(bh)) {
-		err = -EIO;
-		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
-		wait_on_buffer(bh);
-		/* Uhhuh. Read error. Complain and punt. */
-		if (!buffer_uptodate(bh))
+		err = ext4_read_bh_lock(bh, 0, true);
+		if (err)
			goto unlock;
		if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
			/* We expect the key to be set. */
			BUG_ON(!fscrypt_has_encryption_key(inode));
-			BUG_ON(blocksize != PAGE_SIZE);
-			WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks(
-						page, PAGE_SIZE, 0));
+			err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+							       bh_offset(bh));
+			if (err) {
+				clear_buffer_uptodate(bh);
+				goto unlock;
+			}
		}
	}
	if (ext4_should_journal_data(inode)) {
@@ -4185,7 +3910,7 @@
  * starting from file offset 'from'. The range to be zero'd must
  * be contained with in one block. If the specified range exceeds
  * the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
+ * that corresponds to 'from'
  */
 static int ext4_block_zero_page_range(handle_t *handle,
		struct address_space *mapping, loff_t from, loff_t length)
@@ -4292,6 +4017,8 @@
			     loff_t len)
 {
	handle_t *handle;
+	int ret;
+
	loff_t size = i_size_read(inode);
 
	WARN_ON(!inode_is_locked(inode));
@@ -4305,10 +4032,10 @@
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	ext4_update_i_disksize(inode, size);
-	ext4_mark_inode_dirty(handle, inode);
+	ret = ext4_mark_inode_dirty(handle, inode);
	ext4_journal_stop(handle);
 
-	return 0;
+	return ret;
 }
 
 static void ext4_wait_dax_page(struct ext4_inode_info *ei)
@@ -4352,29 +4079,19 @@
  * Returns: 0 on success or negative on failure
  */
 
-int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 {
+	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	ext4_lblk_t first_block, stop_block;
	struct address_space *mapping = inode->i_mapping;
-	loff_t first_block_offset, last_block_offset;
+	loff_t first_block_offset, last_block_offset, max_length;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	handle_t *handle;
	unsigned int credits;
-	int ret = 0;
-
-	if (!S_ISREG(inode->i_mode))
-		return -EOPNOTSUPP;
+	int ret = 0, ret2 = 0;
 
	trace_ext4_punch_hole(inode, offset, length, 0);
-
-	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
-	if (ext4_has_inline_data(inode)) {
-		down_write(&EXT4_I(inode)->i_mmap_sem);
-		ret = ext4_convert_inline_data(inode);
-		up_write(&EXT4_I(inode)->i_mmap_sem);
-		if (ret)
-			return ret;
-	}
 
	/*
	 * Write out all dirty pages to avoid race conditions
@@ -4403,6 +4120,14 @@
			   offset;
	}
 
+	/*
+	 * For punch hole the length + offset needs to be within one block
+	 * before last range. Adjust the length if it goes beyond that limit.
+	 */
+	max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
+	if (offset + length > max_length)
+		length = max_length - offset;
+
	if (offset & (sb->s_blocksize - 1) ||
	    (offset + length) & (sb->s_blocksize - 1)) {
		/*
@@ -4417,6 +4142,10 @@
 
	/* Wait all existing dio workers, newcomers will block on i_mutex */
	inode_dio_wait(inode);
+
+	ret = file_modified(file);
+	if (ret)
+		goto out_mutex;
 
	/*
	 * Prevent page faults from reinstantiating pages we have released from
@@ -4464,7 +4193,7 @@
	if (stop_block > first_block) {
 
		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_preallocations(inode);
+		ext4_discard_preallocations(inode, 0);
 
		ret = ext4_es_remove_extent(inode, first_block,
					    stop_block - first_block);
@@ -4482,11 +4211,14 @@
 
		up_write(&EXT4_I(inode)->i_data_sem);
	}
+	ext4_fc_track_range(handle, inode, first_block, stop_block);
	if (IS_SYNC(inode))
		ext4_handle_sync(handle);
 
	inode->i_mtime = inode->i_ctime = current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
+	ret2 = ext4_mark_inode_dirty(handle, inode);
+	if (unlikely(ret2))
+		ret = ret2;
	if (ret >= 0)
		ext4_update_inode_fsync_trans(handle, inode, 1);
 out_stop:
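ext4_punch_hole() is reached from user space through fallocate(2) with FALLOC_FL_PUNCH_HOLE, which must be combined with FALLOC_FL_KEEP_SIZE. A runnable example that keeps offset and length block-aligned, so the partial-block zeroing path is skipped:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	ftruncate(fd, 64 * 4096);	/* 256 KiB file */

	/* Deallocate blocks 16..31 without changing the file size. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      16 * 4096, 16 * 4096))
		perror("fallocate");

	return close(fd);
}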
@@ -4555,7 +4287,7 @@
 {
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned int credits;
-	int err = 0;
+	int err = 0, err2;
	handle_t *handle;
	struct address_space *mapping = inode->i_mapping;
 
@@ -4569,9 +4301,7 @@
	trace_ext4_truncate_enter(inode);
 
	if (!ext4_can_truncate(inode))
-		return 0;
-
-	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+		goto out_trace;
 
	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
@@ -4580,16 +4310,15 @@
		int has_inline = 1;
 
		err = ext4_inline_data_truncate(inode, &has_inline);
-		if (err)
-			return err;
-		if (has_inline)
-			return 0;
+		if (err || has_inline)
+			goto out_trace;
	}
 
	/* If we zero-out tail of the page, we have to create jinode for jbd2 */
	if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
-		if (ext4_inode_attach_jinode(inode) < 0)
-			return 0;
+		err = ext4_inode_attach_jinode(inode);
+		if (err)
+			goto out_trace;
	}
 
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4598,8 +4327,10 @@
		credits = ext4_blocks_for_truncate(inode);
 
	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out_trace;
+	}
 
	if (inode->i_size & (inode->i_sb->s_blocksize - 1))
		ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4619,7 +4350,7 @@
 
	down_write(&EXT4_I(inode)->i_data_sem);
 
-	ext4_discard_preallocations(inode);
+	ext4_discard_preallocations(inode, 0);
 
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		err = ext4_ext_truncate(handle, inode);
@@ -4645,9 +4376,12 @@
		ext4_orphan_del(handle, inode);
 
	inode->i_mtime = inode->i_ctime = current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
+	err2 = ext4_mark_inode_dirty(handle, inode);
+	if (unlikely(err2 && !err))
+		err = err2;
	ext4_journal_stop(handle);
 
+out_trace:
	trace_ext4_truncate_exit(inode);
	return err;
 }
@@ -4658,21 +4392,22 @@
  * data in memory that is needed to recreate the on-disk version of this
  * inode.
  */
-static int __ext4_get_inode_loc(struct inode *inode,
-				struct ext4_iloc *iloc, int in_mem)
+static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
+				struct ext4_iloc *iloc, int in_mem,
+				ext4_fsblk_t *ret_block)
 {
	struct ext4_group_desc *gdp;
	struct buffer_head *bh;
-	struct super_block *sb = inode->i_sb;
	ext4_fsblk_t block;
+	struct blk_plug plug;
	int inodes_per_block, inode_offset;
 
	iloc->bh = NULL;
-	if (inode->i_ino < EXT4_ROOT_INO ||
-	    inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+	if (ino < EXT4_ROOT_INO ||
+	    ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
		return -EFSCORRUPTED;
 
-	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
	if (!gdp)
		return -EIO;
@@ -4681,27 +4416,28 @@
	 * Figure out the offset within the block group inode table
	 */
	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
-	inode_offset = ((inode->i_ino - 1) %
+	inode_offset = ((ino - 1) %
			EXT4_INODES_PER_GROUP(sb));
-	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+	block = ext4_inode_table(sb, gdp);
+	if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
+	    (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
+		ext4_error(sb, "Invalid inode table block %llu in "
+			   "block_group %u", block, iloc->block_group);
+		return -EFSCORRUPTED;
+	}
+	block += (inode_offset / inodes_per_block);
 
	bh = sb_getblk(sb, block);
	if (unlikely(!bh))
		return -ENOMEM;
+	if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
+		goto simulate_eio;
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
 
-		/*
-		 * If the buffer has the write error flag, we have failed
-		 * to write out another inode in the same block. In this
-		 * case, we don't have to read the block because we may
-		 * read the old inode data successfully.
-		 */
-		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
-			set_buffer_uptodate(bh);
-
-		if (buffer_uptodate(bh)) {
+		if (ext4_buffer_uptodate(bh)) {
			/* someone brought it uptodate while we waited */
			unlock_buffer(bh);
			goto has_buffer;
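The (group, block, offset) arithmetic above deserves a worked example. Assuming an illustrative geometry of 8192 inodes per group, 256-byte inodes and 4 KiB blocks (so 16 inodes per block):

#include <stdio.h>

int main(void)
{
	unsigned long ino = 12345;	/* inode numbers are 1-based */
	unsigned long inodes_per_group = 8192;
	unsigned long inode_size = 256, block_size = 4096;
	unsigned long inodes_per_block = block_size / inode_size;

	unsigned long group = (ino - 1) / inodes_per_group;
	unsigned long index = (ino - 1) % inodes_per_group;
	unsigned long block_in_table = index / inodes_per_block;
	unsigned long offset = (index % inodes_per_block) * inode_size;

	/* Prints: group=1 block_in_table=259 offset=2048 */
	printf("group=%lu block_in_table=%lu offset=%lu\n",
	       group, block_in_table, offset);
	return 0;
}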
@@ -4753,6 +4489,7 @@
	 * If we need to do any I/O, try to pre-readahead extra
	 * blocks from the inode table.
	 */
+	blk_start_plug(&plug);
	if (EXT4_SB(sb)->s_inode_readahead_blks) {
		ext4_fsblk_t b, end, table;
		unsigned num;
@@ -4771,7 +4508,7 @@
		if (end > table)
			end = table;
		while (b <= end)
-			sb_breadahead_unmovable(sb, b++);
+			ext4_sb_breadahead_unmovable(sb, b++);
	}
 
	/*
@@ -4779,14 +4516,14 @@
	 * has in-inode xattrs, or we don't have this inode in memory.
	 * Read the block from disk.
	 */
-	trace_ext4_load_inode(inode);
-	get_bh(bh);
-	bh->b_end_io = end_buffer_read_sync;
-	submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+	trace_ext4_load_inode(sb, ino);
+	ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
+	blk_finish_plug(&plug);
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh)) {
-		EXT4_ERROR_INODE_BLOCK(inode, block,
-				       "unable to read itable block");
+	simulate_eio:
+		if (ret_block)
+			*ret_block = block;
		brelse(bh);
		return -EIO;
	}
@@ -4796,16 +4533,50 @@
	return 0;
 }
 
-int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
+static int __ext4_get_inode_loc_noinmem(struct inode *inode,
+					struct ext4_iloc *iloc)
 {
-	/* We have all inode data except xattrs in memory here. */
-	return __ext4_get_inode_loc(inode, iloc,
-		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
+	ext4_fsblk_t err_blk = 0;
+	int ret;
+
+	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0,
+					&err_blk);
+
+	if (ret == -EIO)
+		ext4_error_inode_block(inode, err_blk, EIO,
+					"unable to read itable block");
+
+	return ret;
 }
 
-static bool ext4_should_use_dax(struct inode *inode)
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 {
-	if (!test_opt(inode->i_sb, DAX))
+	ext4_fsblk_t err_blk = 0;
+	int ret;
+
+	/* We have all inode data except xattrs in memory here. */
+	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc,
+		!ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk);
+
+	if (ret == -EIO)
+		ext4_error_inode_block(inode, err_blk, EIO,
+					"unable to read itable block");
+
+	return ret;
+}
+
+
+int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+			  struct ext4_iloc *iloc)
+{
+	return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL);
+}
+
+static bool ext4_should_enable_dax(struct inode *inode)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	if (test_opt2(inode->i_sb, DAX_NEVER))
		return false;
	if (!S_ISREG(inode->i_mode))
		return false;
@@ -4817,13 +4588,20 @@
		return false;
	if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
		return false;
-	return true;
+	if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
+		return false;
+	if (test_opt(inode->i_sb, DAX_ALWAYS))
+		return true;
+
+	return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
 }
 
-void ext4_set_inode_flags(struct inode *inode)
+void ext4_set_inode_flags(struct inode *inode, bool init)
 {
	unsigned int flags = EXT4_I(inode)->i_flags;
	unsigned int new_fl = 0;
+
+	WARN_ON_ONCE(IS_DAX(inode) && init);
 
	if (flags & EXT4_SYNC_FL)
		new_fl |= S_SYNC;
@@ -4835,8 +4613,13 @@
		new_fl |= S_NOATIME;
	if (flags & EXT4_DIRSYNC_FL)
		new_fl |= S_DIRSYNC;
-	if (ext4_should_use_dax(inode))
+
+	/* Because of the way inode_set_flags() works we must preserve S_DAX
+	 * here if already set. */
+	new_fl |= (inode->i_flags & S_DAX);
+	if (init && ext4_should_enable_dax(inode))
		new_fl |= S_DAX;
+
	if (flags & EXT4_ENCRYPT_FL)
		new_fl |= S_ENCRYPTED;
	if (flags & EXT4_CASEFOLD_FL)
@@ -4877,11 +4660,15 @@
		__le32 *magic = (void *)raw_inode +
				EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
 
-		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
-		    EXT4_INODE_SIZE(inode->i_sb) &&
+		if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
		    *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+			int err;
+
			ext4_set_inode_state(inode, EXT4_STATE_XATTR);
-			return ext4_find_inline_data_nolock(inode);
+			err = ext4_find_inline_data_nolock(inode);
+			if (!err && ext4_has_inline_data(inode))
+				ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+			return err;
		} else
			EXT4_I(inode)->i_inline_off = 0;
	return 0;
@@ -4915,6 +4702,24 @@
	return inode_peek_iversion(inode);
 }
 
+static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
+
+{
+	if (flags & EXT4_IGET_EA_INODE) {
+		if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+			return "missing EA_INODE flag";
+		if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+		    EXT4_I(inode)->i_file_acl)
+			return "ea_inode with extended attributes";
+	} else {
+		if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+			return "unexpected EA_INODE flag";
+	}
+	if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
+		return "unexpected bad inode w/o EXT4_IGET_BAD";
+	return NULL;
+}
+
 struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
			  ext4_iget_flags flags, const char *function,
			  unsigned int line)
@@ -4923,6 +4728,7 @@
	struct ext4_inode *raw_inode;
	struct ext4_inode_info *ei;
	struct inode *inode;
+	const char *err_str;
	journal_t *journal = EXT4_SB(sb)->s_journal;
	long ret;
	loff_t size;
@@ -4937,7 +4743,7 @@
	    (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
		if (flags & EXT4_IGET_HANDLE)
			return ERR_PTR(-ESTALE);
-		__ext4_error(sb, function, line,
+		__ext4_error(sb, function, line, EFSCORRUPTED, 0,
			     "inode #%lu: comm %s: iget: illegal inode #",
			     ino, current->comm);
		return ERR_PTR(-EFSCORRUPTED);
@@ -4946,23 +4752,22 @@
	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW))
+	if (!(inode->i_state & I_NEW)) {
+		if ((err_str = check_igot_inode(inode, flags)) != NULL) {
+			ext4_error_inode(inode, function, line, 0, err_str);
+			iput(inode);
+			return ERR_PTR(-EFSCORRUPTED);
+		}
		return inode;
+	}
 
	ei = EXT4_I(inode);
	iloc.bh = NULL;
 
-	ret = __ext4_get_inode_loc(inode, &iloc, 0);
+	ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
	if (ret < 0)
		goto bad_inode;
	raw_inode = ext4_raw_inode(&iloc);
-
-	if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) {
-		ext4_error_inode(inode, function, line, 0,
-				 "iget: root inode unallocated");
-		ret = -EFSCORRUPTED;
-		goto bad_inode;
-	}
 
	if ((flags & EXT4_IGET_HANDLE) &&
	    (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
@@ -4998,9 +4803,11 @@
				       sizeof(gen));
	}
 
-	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
-		ext4_error_inode(inode, function, line, 0,
-				 "iget: checksum invalid");
+	if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
+	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
+	    (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
+		ext4_error_inode_err(inode, function, line, 0,
+				EFSBADCRC, "iget: checksum invalid");
		ret = -EFSBADCRC;
		goto bad_inode;
	}
@@ -5034,11 +4841,16 @@
	 * NeilBrown 1999oct15
	 */
	if (inode->i_nlink == 0) {
-		if ((inode->i_mode == 0 ||
+		if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL ||
		     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
		    ino != EXT4_BOOT_LOADER_INO) {
-			/* this inode is deleted */
-			ret = -ESTALE;
+			/* this inode is deleted or unallocated */
+			if (flags & EXT4_IGET_SPECIAL) {
+				ext4_error_inode(inode, function, line, 0,
+						 "iget: special inode unallocated");
+				ret = -EFSCORRUPTED;
+			} else
+				ret = -ESTALE;
			goto bad_inode;
		}
		/* The only unlinked inodes we let through here have
@@ -5049,7 +4861,7 @@
		 * not initialized on a new filesystem. */
	}
	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
-	ext4_set_inode_flags(inode);
+	ext4_set_inode_flags(inode, true);
	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
	if (ext4_has_feature_64bit(sb))
@@ -5088,6 +4900,7 @@
	for (block = 0; block < EXT4_N_BLOCKS; block++)
		ei->i_data[block] = raw_inode->i_block[block];
	INIT_LIST_HEAD(&ei->i_orphan);
+	ext4_fc_init_inode(&ei->vfs_inode);
 
	/*
	 * Set transaction id's of transactions that have to be committed
@@ -5153,9 +4966,10 @@
		goto bad_inode;
	} else if (!ext4_has_inline_data(inode)) {
		/* validate the block references in the inode */
-		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-		   (S_ISLNK(inode->i_mode) &&
-		    !ext4_inode_is_fast_symlink(inode))) {
+		if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+		   (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		   (S_ISLNK(inode->i_mode) &&
+		    !ext4_inode_is_fast_symlink(inode)))) {
			if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
				ret = ext4_ext_check_inode(inode);
			else
@@ -5212,10 +5026,15 @@
		goto bad_inode;
	}
	if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
-		EXT4_ERROR_INODE(inode,
+		ext4_error_inode(inode, function, line, 0,
				 "casefold flag without casefold feature");
-	brelse(iloc.bh);
+	if ((err_str = check_igot_inode(inode, flags)) != NULL) {
+		ext4_error_inode(inode, function, line, 0, err_str);
+		ret = -EFSCORRUPTED;
+		goto bad_inode;
+	}
 
+	brelse(iloc.bh);
	unlock_new_inode(inode);
	return inode;
 
@@ -5264,21 +5083,22 @@
	return 0;
 }
 
-struct other_inode {
-	unsigned long orig_ino;
-	struct ext4_inode *raw_inode;
-};
-
-static int other_inode_match(struct inode * inode, unsigned long ino,
-			     void *data)
+static void __ext4_update_other_inode_time(struct super_block *sb,
+					   unsigned long orig_ino,
+					   unsigned long ino,
+					   struct ext4_inode *raw_inode)
 {
-	struct other_inode *oi = (struct other_inode *) data;
+	struct inode *inode;
 
-	if ((inode->i_ino != ino) ||
-	    (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+	inode = find_inode_by_ino_rcu(sb, ino);
+	if (!inode)
+		return;
+
+	if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
			       I_DIRTY_INODE)) ||
	    ((inode->i_state & I_DIRTY_TIME) == 0))
-		return 0;
+		return;
+
	spin_lock(&inode->i_lock);
	if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
				I_DIRTY_INODE)) == 0) &&
@@ -5289,16 +5109,15 @@
		spin_unlock(&inode->i_lock);
 
		spin_lock(&ei->i_raw_lock);
-		EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
-		EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
-		EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
-		ext4_inode_csum_set(inode, oi->raw_inode, ei);
+		EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+		EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+		EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+		ext4_inode_csum_set(inode, raw_inode, ei);
		spin_unlock(&ei->i_raw_lock);
-		trace_ext4_other_inode_update_time(inode, oi->orig_ino);
-		return -1;
+		trace_ext4_other_inode_update_time(inode, orig_ino);
+		return;
	}
	spin_unlock(&inode->i_lock);
-	return -1;
 }
 
 /*
@@ -5308,24 +5127,24 @@
 static void ext4_update_other_inodes_time(struct super_block *sb,
					  unsigned long orig_ino, char *buf)
 {
-	struct other_inode oi;
	unsigned long ino;
	int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
	int inode_size = EXT4_INODE_SIZE(sb);
 
-	oi.orig_ino = orig_ino;
	/*
	 * Calculate the first inode in the inode table block. Inode
	 * numbers are one-based. That is, the first inode in a block
	 * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
	 */
	ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
+	rcu_read_lock();
	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
		if (ino == orig_ino)
			continue;
-		oi.raw_inode = (struct ext4_inode *) buf;
-		(void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+		__ext4_update_other_inode_time(sb, orig_ino, ino,
					       (struct ext4_inode *)buf);
	}
+	rcu_read_unlock();
 }
 
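The masking trick that finds the first inode sharing an inode-table block relies on inodes_per_block being a power of two. A stand-alone check of the comment's claim:

#include <stdio.h>

int main(void)
{
	unsigned long inodes_per_block = 16;	/* 4 KiB blocks, 256 B inodes */
	unsigned long orig_ino = 12345;

	/* First inode in the same itable block; inode numbers are 1-based. */
	unsigned long ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;

	/* 12345 lives in the block holding inodes 12337..12352. */
	printf("first ino in block: %lu\n", ino);
	return 0;
}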
53315150 /*
....@@ -5535,12 +5354,12 @@
55355354 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
55365355 return 0;
55375356
5538
- err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
5357
+ err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
55395358 EXT4_I(inode)->i_sync_tid);
55405359 } else {
55415360 struct ext4_iloc iloc;
55425361
5543
- err = __ext4_get_inode_loc(inode, &iloc, 0);
5362
+ err = __ext4_get_inode_loc_noinmem(inode, &iloc);
55445363 if (err)
55455364 return err;
55465365 /*
....@@ -5550,8 +5369,8 @@
55505369 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
55515370 sync_dirty_buffer(iloc.bh);
55525371 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5553
- EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
5554
- "IO error syncing inode");
5372
+ ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
5373
+ "IO error syncing inode");
55555374 err = -EIO;
55565375 }
55575376 brelse(iloc.bh);
....@@ -5664,6 +5483,7 @@
56645483 if (error)
56655484 return error;
56665485 }
5486
+
56675487 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
56685488 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
56695489 handle_t *handle;
....@@ -5697,37 +5517,58 @@
56975517 inode->i_gid = attr->ia_gid;
56985518 error = ext4_mark_inode_dirty(handle, inode);
56995519 ext4_journal_stop(handle);
5520
+ if (unlikely(error)) {
5521
+ return error;
5522
+ }
57005523 }
57015524
57025525 if (attr->ia_valid & ATTR_SIZE) {
57035526 handle_t *handle;
57045527 loff_t oldsize = inode->i_size;
5705
- int shrink = (attr->ia_size <= inode->i_size);
5528
+ loff_t old_disksize;
5529
+ int shrink = (attr->ia_size < inode->i_size);
57065530
57075531 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
57085532 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
57095533
5710
- if (attr->ia_size > sbi->s_bitmap_maxbytes)
5534
+ if (attr->ia_size > sbi->s_bitmap_maxbytes) {
57115535 return -EFBIG;
5536
+ }
57125537 }
5713
- if (!S_ISREG(inode->i_mode))
5538
+ if (!S_ISREG(inode->i_mode)) {
57145539 return -EINVAL;
5540
+ }
57155541
57165542 if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
57175543 inode_inc_iversion(inode);
57185544
5719
- if (ext4_should_order_data(inode) &&
5720
- (attr->ia_size < inode->i_size)) {
5721
- error = ext4_begin_ordered_truncate(inode,
5545
+ if (shrink) {
5546
+ if (ext4_should_order_data(inode)) {
5547
+ error = ext4_begin_ordered_truncate(inode,
57225548 attr->ia_size);
5723
- if (error)
5724
- goto err_out;
5549
+ if (error)
5550
+ goto err_out;
5551
+ }
5552
+ /*
5553
+ * Blocks are going to be removed from the inode. Wait
5554
+ * for dio in flight.
5555
+ */
5556
+ inode_dio_wait(inode);
57255557 }
5558
+
5559
+ down_write(&EXT4_I(inode)->i_mmap_sem);
5560
+
5561
+ rc = ext4_break_layouts(inode);
5562
+ if (rc) {
5563
+ up_write(&EXT4_I(inode)->i_mmap_sem);
5564
+ goto err_out;
5565
+ }
5566
+
57265567 if (attr->ia_size != inode->i_size) {
57275568 handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
57285569 if (IS_ERR(handle)) {
57295570 error = PTR_ERR(handle);
5730
- goto err_out;
5571
+ goto out_mmap_sem;
57315572 }
57325573 if (ext4_handle_valid(handle) && shrink) {
57335574 error = ext4_orphan_add(handle, inode);
@@ -5741,7 +5582,22 @@
				inode->i_mtime = current_time(inode);
				inode->i_ctime = inode->i_mtime;
			}
+
+			if (shrink)
+				ext4_fc_track_range(handle, inode,
+					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+					inode->i_sb->s_blocksize_bits,
+					EXT_MAX_BLOCKS - 1);
+			else
+				ext4_fc_track_range(
+					handle, inode,
+					(oldsize > 0 ? oldsize - 1 : oldsize) >>
+					inode->i_sb->s_blocksize_bits,
+					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+					inode->i_sb->s_blocksize_bits);
+
			down_write(&EXT4_I(inode)->i_data_sem);
+			old_disksize = EXT4_I(inode)->i_disksize;
			EXT4_I(inode)->i_disksize = attr->ia_size;
			rc = ext4_mark_inode_dirty(handle, inode);
			if (!error)
@@ -5753,32 +5609,18 @@
			 */
			if (!error)
				i_size_write(inode, attr->ia_size);
+			else
+				EXT4_I(inode)->i_disksize = old_disksize;
			up_write(&EXT4_I(inode)->i_data_sem);
			ext4_journal_stop(handle);
-			if (error) {
-				if (orphan && inode->i_nlink)
-					ext4_orphan_del(NULL, inode);
-				goto err_out;
+			if (error)
+				goto out_mmap_sem;
+			if (!shrink) {
+				pagecache_isize_extended(inode, oldsize,
+							 inode->i_size);
+			} else if (ext4_should_journal_data(inode)) {
+				ext4_wait_for_tail_page_commit(inode);
			}
-		}
-		if (!shrink) {
-			pagecache_isize_extended(inode, oldsize, inode->i_size);
-		} else {
-			/*
-			 * Blocks are going to be removed from the inode. Wait
-			 * for dio in flight.
-			 */
-			inode_dio_wait(inode);
-		}
-		if (orphan && ext4_should_journal_data(inode))
-			ext4_wait_for_tail_page_commit(inode);
-		down_write(&EXT4_I(inode)->i_mmap_sem);
-
-		rc = ext4_break_layouts(inode);
-		if (rc) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
-			error = rc;
-			goto err_out;
		}
 
		/*
@@ -5786,11 +5628,16 @@
		 * in data=journal mode to make pages freeable.
		 */
		truncate_pagecache(inode, inode->i_size);
-		if (shrink) {
+		/*
+		 * Call ext4_truncate() even if i_size didn't change to
+		 * truncate possible preallocated blocks.
+		 */
+		if (attr->ia_size <= oldsize) {
			rc = ext4_truncate(inode);
			if (rc)
				error = rc;
		}
+out_mmap_sem:
		up_write(&EXT4_I(inode)->i_mmap_sem);
	}
 
@@ -5810,7 +5657,8 @@
		rc = posix_acl_chmod(inode, inode->i_mode);
 
 err_out:
-	ext4_std_error(inode->i_sb, error);
+	if (error)
+		ext4_std_error(inode->i_sb, error);
	if (!error)
		error = rc;
	return error;
@@ -5824,7 +5672,8 @@
	struct ext4_inode *raw_inode;
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned int flags;
-	if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
+	if ((request_mask & STATX_BTIME) &&
+	    EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
		stat->result_mask |= STATX_BTIME;
		stat->btime.tv_sec = ei->i_crtime.tv_sec;
		stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
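With this change the inode birth time is only copied out when the caller actually asked for it. From user space that request is made via statx(2); a runnable example (glibc 2.28 or later exposes the wrapper):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	if (statx(AT_FDCWD, "testfile", 0, STATX_BTIME, &stx))
		return 1;

	/* STATX_BTIME is only set in stx_mask if the fs provided it. */
	if (stx.stx_mask & STATX_BTIME)
		printf("btime: %lld\n", (long long)stx.stx_btime.tv_sec);
	else
		printf("btime not available\n");
	return 0;
}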
@@ -5993,7 +5842,14 @@
		put_bh(iloc->bh);
		return -EIO;
	}
-	if (IS_I_VERSION(inode))
+	ext4_fc_track_inode(handle, inode);
+
+	/*
+	 * ea_inodes are using i_version for storing reference count, don't
+	 * mess with it
+	 */
+	if (IS_I_VERSION(inode) &&
+	    !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
		inode_inc_iversion(inode);
 
	/* the do_update_inode consumes one bh->b_count */
@@ -6070,6 +5926,14 @@
		return 0;
	}
 
+	/*
+	 * We may need to allocate external xattr block so we need quotas
+	 * initialized. Here we can be called with various locks held so we
+	 * cannot afford to initialize quotas ourselves. So just bail.
+	 */
+	if (dquot_initialize_needed(inode))
+		return -EAGAIN;
+
	/* try to expand with EAs present */
	error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
					   raw_inode, handle);
@@ -6107,9 +5971,8 @@
	 * If this is felt to be critical, then e2fsck should be run to
	 * force a large enough s_min_extra_isize.
	 */
-	if (ext4_handle_valid(handle) &&
-	    jbd2_journal_extend(handle,
-				EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0)
+	if (ext4_journal_extend(handle,
+				EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
		return -ENOSPC;
 
	if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
@@ -6178,7 +6041,8 @@
  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
  * we start and wait on commits.
  */
-int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
+int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
+			    const char *func, unsigned int line)
 {
	struct ext4_iloc iloc;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -6188,13 +6052,18 @@
	trace_ext4_mark_inode_dirty(inode, _RET_IP_);
	err = ext4_reserve_inode_write(handle, inode, &iloc);
	if (err)
-		return err;
+		goto out;
 
	if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
		ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize,
					       iloc, handle);
 
-	return ext4_mark_iloc_dirty(handle, inode, &iloc);
+	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+out:
+	if (unlikely(err))
+		ext4_error_inode_err(inode, func, line, 0, err,
+					"mark_inode_dirty error");
+	return err;
 }
 
 /*
@@ -6231,36 +6100,6 @@
 out:
	return;
 }
-
-#if 0
-/*
- * Bind an inode's backing buffer_head into this transaction, to prevent
- * it from being flushed to disk early. Unlike
- * ext4_reserve_inode_write, this leaves behind no bh reference and
- * returns no iloc structure, so the caller needs to repeat the iloc
- * lookup to mark the inode dirty later.
- */
-static int ext4_pin_inode(handle_t *handle, struct inode *inode)
-{
-	struct ext4_iloc iloc;
-
-	int err = 0;
-	if (handle) {
-		err = ext4_get_inode_loc(inode, &iloc);
-		if (!err) {
-			BUFFER_TRACE(iloc.bh, "get_write_access");
-			err = jbd2_journal_get_write_access(handle, iloc.bh);
-			if (!err)
-				err = ext4_handle_dirty_metadata(handle,
-								 NULL,
-								 iloc.bh);
-			brelse(iloc.bh);
-		}
-	}
-	ext4_std_error(inode->i_sb, err);
-	return err;
-}
-#endif
 
 int ext4_change_inode_journal_flag(struct inode *inode, int val)
62666105 {
....@@ -6341,6 +6180,8 @@
63416180 if (IS_ERR(handle))
63426181 return PTR_ERR(handle);
63436182
6183
+ ext4_fc_mark_ineligible(inode->i_sb,
6184
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
63446185 err = ext4_mark_inode_dirty(handle, inode);
63456186 ext4_handle_sync(handle);
63466187 ext4_journal_stop(handle);
....@@ -6354,13 +6195,14 @@
63546195 return !buffer_mapped(bh);
63556196 }
63566197
6357
-int ext4_page_mkwrite(struct vm_fault *vmf)
6198
+vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
63586199 {
63596200 struct vm_area_struct *vma = vmf->vma;
63606201 struct page *page = vmf->page;
63616202 loff_t size;
63626203 unsigned long len;
6363
- int ret;
6204
+ int err;
6205
+ vm_fault_t ret;
63646206 struct file *file = vma->vm_file;
63656207 struct inode *inode = file_inode(file);
63666208 struct address_space *mapping = inode->i_mapping;
....@@ -6376,18 +6218,26 @@
63766218
63776219 down_read(&EXT4_I(inode)->i_mmap_sem);
63786220
6379
- ret = ext4_convert_inline_data(inode);
6380
- if (ret)
6221
+ err = ext4_convert_inline_data(inode);
6222
+ if (err)
63816223 goto out_ret;
6224
+
6225
+ /*
6226
+ * On data journalling we skip straight to the transaction handle:
6227
+ * there's no delalloc; page truncated will be checked later; the
6228
+ * early return w/ all buffers mapped (calculates size/len) can't
6229
+ * be used; and there's no dioread_nolock, so only ext4_get_block.
6230
+ */
6231
+ if (ext4_should_journal_data(inode))
6232
+ goto retry_alloc;
63826233
63836234 /* Delalloc case is easy... */
63846235 if (test_opt(inode->i_sb, DELALLOC) &&
6385
- !ext4_should_journal_data(inode) &&
63866236 !ext4_nonda_switch(inode->i_sb)) {
63876237 do {
6388
- ret = block_page_mkwrite(vma, vmf,
6238
+ err = block_page_mkwrite(vma, vmf,
63896239 ext4_da_get_block_prep);
6390
- } while (ret == -ENOSPC &&
6240
+ } while (err == -ENOSPC &&
63916241 ext4_should_retry_alloc(inode->i_sb, &retries));
63926242 goto out_ret;
63936243 }
....@@ -6408,6 +6258,9 @@
64086258 /*
64096259 * Return if we have all the buffers mapped. This avoids the need to do
64106260 * journal_start/journal_stop which can block and take a long time
6261
+ *
6262
+ * This cannot be done for data journalling, as we have to add the
6263
+ * inode to the transaction's list to writeprotect pages on commit.
64116264 */
64126265 if (page_has_buffers(page)) {
64136266 if (!ext4_walk_page_buffers(NULL, page_buffers(page),
....@@ -6432,36 +6285,67 @@
64326285 ret = VM_FAULT_SIGBUS;
64336286 goto out;
64346287 }
6435
- ret = block_page_mkwrite(vma, vmf, get_block);
6436
- if (!ret && ext4_should_journal_data(inode)) {
6437
- if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
6438
- PAGE_SIZE, NULL, do_journal_get_write_access)) {
6439
- unlock_page(page);
6440
- ret = VM_FAULT_SIGBUS;
6441
- ext4_journal_stop(handle);
6442
- goto out;
6288
+ /*
6289
+ * Data journalling can't use block_page_mkwrite() because it
6290
+ * will set_buffer_dirty() before do_journal_get_write_access()
6291
+ * thus might hit warning messages for dirty metadata buffers.
6292
+ */
6293
+ if (!ext4_should_journal_data(inode)) {
6294
+ err = block_page_mkwrite(vma, vmf, get_block);
6295
+ } else {
6296
+ lock_page(page);
6297
+ size = i_size_read(inode);
6298
+ /* Page got truncated from under us? */
6299
+ if (page->mapping != mapping || page_offset(page) > size) {
6300
+ ret = VM_FAULT_NOPAGE;
6301
+ goto out_error;
64436302 }
6444
- ext4_set_inode_state(inode, EXT4_STATE_JDATA);
6303
+
6304
+ if (page->index == size >> PAGE_SHIFT)
6305
+ len = size & ~PAGE_MASK;
6306
+ else
6307
+ len = PAGE_SIZE;
6308
+
6309
+ err = __block_write_begin(page, 0, len, ext4_get_block);
6310
+ if (!err) {
6311
+ ret = VM_FAULT_SIGBUS;
6312
+ if (ext4_walk_page_buffers(handle, page_buffers(page),
6313
+ 0, len, NULL, do_journal_get_write_access))
6314
+ goto out_error;
6315
+ if (ext4_walk_page_buffers(handle, page_buffers(page),
6316
+ 0, len, NULL, write_end_fn))
6317
+ goto out_error;
6318
+ if (ext4_jbd2_inode_add_write(handle, inode,
6319
+ page_offset(page), len))
6320
+ goto out_error;
6321
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
6322
+ } else {
6323
+ unlock_page(page);
6324
+ }
64456325 }
64466326 ext4_journal_stop(handle);
6447
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
6327
+ if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
64486328 goto retry_alloc;
64496329 out_ret:
6450
- ret = block_page_mkwrite_return(ret);
6330
+ ret = block_page_mkwrite_return(err);
64516331 out:
64526332 up_read(&EXT4_I(inode)->i_mmap_sem);
64536333 sb_end_pagefault(inode->i_sb);
64546334 return ret;
6335
+out_error:
6336
+ unlock_page(page);
6337
+ ext4_journal_stop(handle);
6338
+ goto out;
64556339 }
64566340
6457
-int ext4_filemap_fault(struct vm_fault *vmf)
6341
+vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
64586342 {
64596343 struct inode *inode = file_inode(vmf->vma->vm_file);
6460
- int err;
6344
+ vm_fault_t ret;
64616345
64626346 down_read(&EXT4_I(inode)->i_mmap_sem);
6463
- err = filemap_fault(vmf);
6347
+ ret = filemap_fault(vmf);
64646348 up_read(&EXT4_I(inode)->i_mmap_sem);
64656349
6466
- return err;
6350
+ return ret;
64676351 }