2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/fs/ext4/inode.c
@@ -49,8 +49,6 @@
 #include <trace/events/ext4.h>
 #include <trace/events/android_fs.h>
 
-#define MPAGE_DA_EXTENT_TAIL 0x01
-
 static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
                              struct ext4_inode_info *ei)
 {
@@ -104,8 +102,8 @@
         return provided == calculated;
 }
 
-static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
-                                struct ext4_inode_info *ei)
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+                         struct ext4_inode_info *ei)
 {
         __u32 csum;
 
@@ -162,32 +160,6 @@
         }
         return S_ISLNK(inode->i_mode) && inode->i_size &&
                (inode->i_size < EXT4_N_BLOCKS * 4);
-}
-
-/*
- * Restart the transaction associated with *handle. This does a commit,
- * so before we call here everything must be consistently dirtied against
- * this transaction.
- */
-int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
-                                int nblocks)
-{
-        int ret;
-
-        /*
-         * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
-         * moment, get_block can be called only for blocks inside i_size since
-         * page cache has been already dropped and writes are blocked by
-         * i_mutex. So we can safely drop the i_data_sem here.
-         */
-        BUG_ON(EXT4_JOURNAL(inode) == NULL);
-        jbd_debug(2, "restarting handle %p\n", handle);
-        up_write(&EXT4_I(inode)->i_data_sem);
-        ret = ext4_journal_restart(handle, nblocks);
-        down_write(&EXT4_I(inode)->i_data_sem);
-        ext4_discard_preallocations(inode);
-
-        return ret;
 }
 
 /*
@@ -251,6 +223,16 @@
         truncate_inode_pages_final(&inode->i_data);
 
         /*
+         * For inodes with journalled data, transaction commit could have
+         * dirtied the inode. Flush worker is ignoring it because of I_FREEING
+         * flag but we still need to remove the inode from the writeback lists.
+         */
+        if (!list_empty_careful(&inode->i_io_list)) {
+                WARN_ON_ONCE(!ext4_should_journal_data(inode));
+                inode_io_list_del(inode);
+        }
+
+        /*
          * Protect us against freezing - iput() caller didn't have to have any
          * protection against it. When we are in a running transaction though,
         * we are already protected against freezing and we cannot grab further
@@ -305,9 +287,9 @@
         if (inode->i_blocks) {
                 err = ext4_truncate(inode);
                 if (err) {
-                        ext4_error(inode->i_sb,
-                                   "couldn't truncate inode %lu (err %d)",
-                                   inode->i_ino, err);
+                        ext4_error_err(inode->i_sb, -err,
+                                       "couldn't truncate inode %lu (err %d)",
+                                       inode->i_ino, err);
                         goto stop_handle;
                 }
         }
@@ -355,6 +337,8 @@
         ext4_xattr_inode_array_free(ea_inode_array);
         return;
 no_delete:
+        if (!list_empty(&EXT4_I(inode)->i_fc_list))
+                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
         ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
 }
 
@@ -410,8 +394,8 @@
          * inode's preallocations.
          */
         if ((ei->i_reserved_data_blocks == 0) &&
-            (atomic_read(&inode->i_writecount) == 0))
-                ext4_discard_preallocations(inode);
+            !inode_is_open_for_write(inode))
+                ext4_discard_preallocations(inode, 0);
 }
 
 static int __check_block_validity(struct inode *inode, const char *func,
@@ -437,7 +421,7 @@
 {
         int ret;
 
-        if (IS_ENCRYPTED(inode))
+        if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
                 return fscrypt_zeroout_range(inode, lblk, pblk, len);
 
         ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
@@ -469,11 +453,9 @@
          */
         down_read(&EXT4_I(inode)->i_data_sem);
         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-                retval = ext4_ext_map_blocks(handle, inode, map, flags &
-                                             EXT4_GET_BLOCKS_KEEP_SIZE);
+                retval = ext4_ext_map_blocks(handle, inode, map, 0);
         } else {
-                retval = ext4_ind_map_blocks(handle, inode, map, flags &
-                                             EXT4_GET_BLOCKS_KEEP_SIZE);
+                retval = ext4_ind_map_blocks(handle, inode, map, 0);
         }
         up_read((&EXT4_I(inode)->i_data_sem));
 
@@ -530,9 +512,8 @@
 #endif
 
         map->m_flags = 0;
-        ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
-                  "logical block %lu\n", inode->i_ino, flags, map->m_len,
-                  (unsigned long) map->m_lblk);
+        ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
+                  flags, map->m_len, (unsigned long) map->m_lblk);
 
         /*
          * ext4_map_blocks returns an int, and m_len is an unsigned int
@@ -545,7 +526,8 @@
                 return -EFSCORRUPTED;
 
         /* Lookup extent status tree firstly */
-        if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+        if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
+            ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
                         map->m_pblk = ext4_es_pblock(&es) +
                                         map->m_lblk - es.es_lblk;
@@ -563,7 +545,7 @@
                         map->m_len = retval;
                         retval = 0;
                 } else {
-                        BUG_ON(1);
+                        BUG();
                 }
 #ifdef ES_AGGRESSIVE_TEST
                 ext4_map_blocks_es_recheck(handle, inode, map,
@@ -578,11 +560,9 @@
          */
         down_read(&EXT4_I(inode)->i_data_sem);
         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-                retval = ext4_ext_map_blocks(handle, inode, map, flags &
-                                             EXT4_GET_BLOCKS_KEEP_SIZE);
+                retval = ext4_ext_map_blocks(handle, inode, map, 0);
         } else {
-                retval = ext4_ind_map_blocks(handle, inode, map, flags &
-                                             EXT4_GET_BLOCKS_KEEP_SIZE);
+                retval = ext4_ind_map_blocks(handle, inode, map, 0);
         }
         if (retval > 0) {
                 unsigned int status;
@@ -599,8 +579,8 @@
                                 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                 if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                     !(status & EXTENT_STATUS_WRITTEN) &&
-                    ext4_find_delalloc_range(inode, map->m_lblk,
-                                             map->m_lblk + map->m_len - 1))
+                    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+                                       map->m_lblk + map->m_len - 1))
                         status |= EXTENT_STATUS_DELAYED;
                 ret = ext4_es_insert_extent(inode, map->m_lblk,
                                             map->m_len, map->m_pblk, status);
@@ -700,8 +680,6 @@
         if (flags & EXT4_GET_BLOCKS_ZERO &&
             map->m_flags & EXT4_MAP_MAPPED &&
             map->m_flags & EXT4_MAP_NEW) {
-                clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
-                                   map->m_len);
                 ret = ext4_issue_zeroout(inode, map->m_lblk,
                                          map->m_pblk, map->m_len);
                 if (ret) {
@@ -715,7 +693,7 @@
                  * extent status tree.
                  */
                 if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
-                    ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+                    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                         if (ext4_es_is_written(&es))
                                 goto out_sem;
                 }
@@ -723,8 +701,8 @@
                                 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                 if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                     !(status & EXTENT_STATUS_WRITTEN) &&
-                    ext4_find_delalloc_range(inode, map->m_lblk,
-                                             map->m_lblk + map->m_len - 1))
+                    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+                                       map->m_lblk + map->m_len - 1))
                         status |= EXTENT_STATUS_DELAYED;
                 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                             map->m_pblk, status);
@@ -765,6 +743,12 @@
                         return ret;
                 }
         }
+        if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
+                                map->m_flags & EXT4_MAP_MAPPED))
+                ext4_fc_track_range(handle, inode, map->m_lblk,
+                                        map->m_lblk + map->m_len - 1);
+        if (retval < 0)
+                ext_debug(inode, "failed with err %d\n", retval);
         return retval;
 }
 
@@ -847,136 +831,6 @@
 #define DIO_MAX_BLOCKS 4096
 
 /*
- * Get blocks function for the cases that need to start a transaction -
- * generally difference cases of direct IO and DAX IO. It also handles retries
- * in case of ENOSPC.
- */
-static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
-                                struct buffer_head *bh_result, int flags)
-{
-        int dio_credits;
-        handle_t *handle;
-        int retries = 0;
-        int ret;
-
-        /* Trim mapping request to maximum we can map at once for DIO */
-        if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
-                bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
-        dio_credits = ext4_chunk_trans_blocks(inode,
-                                      bh_result->b_size >> inode->i_blkbits);
-retry:
-        handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
-        if (IS_ERR(handle))
-                return PTR_ERR(handle);
-
-        ret = _ext4_get_block(inode, iblock, bh_result, flags);
-        ext4_journal_stop(handle);
-
-        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-                goto retry;
-        return ret;
-}
-
-/* Get block function for DIO reads and writes to inodes without extents */
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-                       struct buffer_head *bh, int create)
-{
-        /* We don't expect handle for direct IO */
-        WARN_ON_ONCE(ext4_journal_current_handle());
-
-        if (!create)
-                return _ext4_get_block(inode, iblock, bh, 0);
-        return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
-}
-
-/*
- * Get block function for AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete.
- */
-static int ext4_dio_get_block_unwritten_async(struct inode *inode,
-                sector_t iblock, struct buffer_head *bh_result, int create)
-{
-        int ret;
-
-        /* We don't expect handle for direct IO */
-        WARN_ON_ONCE(ext4_journal_current_handle());
-
-        ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                   EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-        /*
-         * When doing DIO using unwritten extents, we need io_end to convert
-         * unwritten extents to written on IO completion. We allocate io_end
-         * once we spot unwritten extent and store it in b_private. Generic
-         * DIO code keeps b_private set and furthermore passes the value to
-         * our completion callback in 'private' argument.
-         */
-        if (!ret && buffer_unwritten(bh_result)) {
-                if (!bh_result->b_private) {
-                        ext4_io_end_t *io_end;
-
-                        io_end = ext4_init_io_end(inode, GFP_KERNEL);
-                        if (!io_end)
-                                return -ENOMEM;
-                        bh_result->b_private = io_end;
-                        ext4_set_io_unwritten_flag(inode, io_end);
-                }
-                set_buffer_defer_completion(bh_result);
-        }
-
-        return ret;
-}
-
-/*
- * Get block function for non-AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete by ext4_direct_IO_write().
- */
-static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
-                sector_t iblock, struct buffer_head *bh_result, int create)
-{
-        int ret;
-
-        /* We don't expect handle for direct IO */
-        WARN_ON_ONCE(ext4_journal_current_handle());
-
-        ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                   EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-        /*
-         * Mark inode as having pending DIO writes to unwritten extents.
-         * ext4_direct_IO_write() checks this flag and converts extents to
-         * written.
-         */
-        if (!ret && buffer_unwritten(bh_result))
-                ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-
-        return ret;
-}
-
-static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
-                struct buffer_head *bh_result, int create)
-{
-        int ret;
-
-        ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
-                   inode->i_ino, create);
-        /* We don't expect handle for direct IO */
-        WARN_ON_ONCE(ext4_journal_current_handle());
-
-        ret = _ext4_get_block(inode, iblock, bh_result, 0);
-        /*
-         * Blocks should have been preallocated! ext4_file_write_iter() checks
-         * that.
-         */
-        WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-
-        return ret;
-}
-
-
-/*
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -987,7 +841,8 @@
         int create = map_flags & EXT4_GET_BLOCKS_CREATE;
         int err;
 
-        J_ASSERT(handle != NULL || create == 0);
+        J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+                    || handle != NULL || create == 0);
 
         map.m_lblk = block;
         map.m_len = 1;
@@ -1003,7 +858,8 @@
                 return ERR_PTR(-ENOMEM);
         if (map.m_flags & EXT4_MAP_NEW) {
                 J_ASSERT(create != 0);
-                J_ASSERT(handle != NULL);
+                J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+                         || (handle != NULL));
 
                 /*
                  * Now that we do not always journal data, we should
@@ -1040,18 +896,20 @@
                                ext4_lblk_t block, int map_flags)
 {
         struct buffer_head *bh;
+        int ret;
 
         bh = ext4_getblk(handle, inode, block, map_flags);
         if (IS_ERR(bh))
                 return bh;
-        if (!bh || buffer_uptodate(bh))
+        if (!bh || ext4_buffer_uptodate(bh))
                 return bh;
-        ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
-        wait_on_buffer(bh);
-        if (buffer_uptodate(bh))
-                return bh;
-        put_bh(bh);
-        return ERR_PTR(-EIO);
+
+        ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
+        if (ret) {
+                put_bh(bh);
+                return ERR_PTR(ret);
+        }
+        return bh;
 }
 
 /* Read a contiguous batch of blocks. */
@@ -1071,9 +929,8 @@
 
         for (i = 0; i < bh_count; i++)
                 /* Note that NULL bhs[i] is valid because of holes. */
-                if (bhs[i] && !buffer_uptodate(bhs[i]))
-                        ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
-                                    &bhs[i]);
+                if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
+                        ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);
 
         if (!wait)
                 return 0;
@@ -1190,8 +1047,9 @@
         int err = 0;
         unsigned blocksize = inode->i_sb->s_blocksize;
         unsigned bbits;
-        struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
-        bool decrypt = false;
+        struct buffer_head *bh, *head, *wait[2];
+        int nr_wait = 0;
+        int i;
 
         BUG_ON(!PageLocked(page));
         BUG_ON(from > PAGE_SIZE);
@@ -1222,7 +1080,6 @@
                 if (err)
                         break;
                 if (buffer_new(bh)) {
-                        clean_bdev_bh_alias(bh);
                         if (PageUptodate(page)) {
                                 clear_buffer_new(bh);
                                 set_buffer_uptodate(bh);
@@ -1243,23 +1100,33 @@
                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                     !buffer_unwritten(bh) &&
                     (block_start < from || block_end > to)) {
-                        ll_rw_block(REQ_OP_READ, 0, 1, &bh);
-                        *wait_bh++ = bh;
-                        decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
+                        ext4_read_bh_lock(bh, 0, false);
+                        wait[nr_wait++] = bh;
                 }
         }
         /*
          * If we issued read requests, let them complete.
          */
-        while (wait_bh > wait) {
-                wait_on_buffer(*--wait_bh);
-                if (!buffer_uptodate(*wait_bh))
+        for (i = 0; i < nr_wait; i++) {
+                wait_on_buffer(wait[i]);
+                if (!buffer_uptodate(wait[i]))
                         err = -EIO;
         }
-        if (unlikely(err))
+        if (unlikely(err)) {
                 page_zero_new_buffers(page, from, to);
-        else if (decrypt)
-                err = fscrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0);
+        } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
+                for (i = 0; i < nr_wait; i++) {
+                        int err2;
+
+                        err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+                                                                bh_offset(wait[i]));
+                        if (err2) {
+                                clear_buffer_uptodate(wait[i]);
+                                err = err2;
+                        }
+                }
+        }
+
         return err;
 }
 #endif
@@ -1319,6 +1186,13 @@
         page = grab_cache_page_write_begin(mapping, index, flags);
         if (!page)
                 return -ENOMEM;
+        /*
+         * The same as page allocation, we prealloc buffer heads before
+         * starting the handle.
+         */
+        if (!page_has_buffers(page))
+                create_empty_buffers(page, inode->i_sb->s_blocksize, 0);
+
         unlock_page(page);
 
 retry_journal:
@@ -1442,6 +1316,7 @@
                         goto errout;
                 }
                 copied = ret;
+                ret = 0;
         } else
                 copied = block_write_end(file, mapping, pos,
                                          len, copied, page, fsdata);
@@ -1466,15 +1341,16 @@
          * filesystems.
          */
         if (i_size_changed || inline_data)
-                ext4_mark_inode_dirty(handle, inode);
+                ret = ext4_mark_inode_dirty(handle, inode);
 
+errout:
         if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                 /* if we have allocated more blocks and copied
                  * less. We will have blocks allocated outside
                  * inode->i_size. So truncate them
                  */
                 ext4_orphan_add(handle, inode);
-errout:
+
         ret2 = ext4_journal_stop(handle);
         if (!ret)
                 ret = ret2;
@@ -1558,6 +1434,7 @@
                         goto errout;
                 }
                 copied = ret;
+                ret = 0;
         } else if (unlikely(copied < len) && !PageUptodate(page)) {
                 copied = 0;
                 ext4_journalled_zero_new_buffers(handle, page, from, to);
@@ -1587,6 +1464,7 @@
                         ret = ret2;
         }
 
+errout:
         if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                 /* if we have allocated more blocks and copied
                  * less. We will have blocks allocated outside
@@ -1594,7 +1472,6 @@
                  */
                 ext4_orphan_add(handle, inode);
 
-errout:
         ret2 = ext4_journal_stop(handle);
         if (!ret)
                 ret = ret2;
@@ -1643,7 +1520,7 @@
         return 0;       /* success */
 }
 
-static void ext4_da_release_space(struct inode *inode, int to_free)
+void ext4_da_release_space(struct inode *inode, int to_free)
 {
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
         struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1678,64 +1555,6 @@
         dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
 }
 
-static void ext4_da_page_release_reservation(struct page *page,
-                                             unsigned int offset,
-                                             unsigned int length)
-{
-        int to_release = 0, contiguous_blks = 0;
-        struct buffer_head *head, *bh;
-        unsigned int curr_off = 0;
-        struct inode *inode = page->mapping->host;
-        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        unsigned int stop = offset + length;
-        int num_clusters;
-        ext4_fsblk_t lblk;
-
-        BUG_ON(stop > PAGE_SIZE || stop < length);
-
-        head = page_buffers(page);
-        bh = head;
-        do {
-                unsigned int next_off = curr_off + bh->b_size;
-
-                if (next_off > stop)
-                        break;
-
-                if ((offset <= curr_off) && (buffer_delay(bh))) {
-                        to_release++;
-                        contiguous_blks++;
-                        clear_buffer_delay(bh);
-                } else if (contiguous_blks) {
-                        lblk = page->index <<
-                               (PAGE_SHIFT - inode->i_blkbits);
-                        lblk += (curr_off >> inode->i_blkbits) -
-                                contiguous_blks;
-                        ext4_es_remove_extent(inode, lblk, contiguous_blks);
-                        contiguous_blks = 0;
-                }
-                curr_off = next_off;
-        } while ((bh = bh->b_this_page) != head);
-
-        if (contiguous_blks) {
-                lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
-                lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
-                ext4_es_remove_extent(inode, lblk, contiguous_blks);
-        }
-
-        /* If we have released all the blocks belonging to a cluster, then we
-         * need to release the reserved space for that cluster. */
-        num_clusters = EXT4_NUM_B2C(sbi, to_release);
-        while (num_clusters > 0) {
-                lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
-                        ((num_clusters - 1) << sbi->s_cluster_bits);
-                if (sbi->s_cluster_ratio == 1 ||
-                    !ext4_find_delalloc_cluster(inode, lblk))
-                        ext4_da_release_space(inode, 1);
-
-                num_clusters--;
-        }
-}
-
 /*
  * Delayed allocation stuff
  */
@@ -1755,6 +1574,7 @@
         struct ext4_map_blocks map;
         struct ext4_io_submit io_submit;        /* IO submission data */
         unsigned int do_map:1;
+        unsigned int scanned_until_end:1;
 };
 
 static void mpage_release_unused_pages(struct mpage_da_data *mpd,
@@ -1770,13 +1590,21 @@
         if (mpd->first_page >= mpd->next_page)
                 return;
 
+        mpd->scanned_until_end = 0;
         index = mpd->first_page;
         end   = mpd->next_page - 1;
         if (invalidate) {
                 ext4_lblk_t start, last;
                 start = index << (PAGE_SHIFT - inode->i_blkbits);
                 last = end << (PAGE_SHIFT - inode->i_blkbits);
+
+                /*
+                 * avoid racing with extent status tree scans made by
+                 * ext4_insert_delayed_block()
+                 */
+                down_write(&EXT4_I(inode)->i_data_sem);
                 ext4_es_remove_extent(inode, start, last - start + 1);
+                up_write(&EXT4_I(inode)->i_data_sem);
         }
 
         pagevec_init(&pvec);
@@ -1829,6 +1657,70 @@
 }
 
 /*
+ * ext4_insert_delayed_block - adds a delayed block to the extents status
+ *                             tree, incrementing the reserved cluster/block
+ *                             count or making a pending reservation
+ *                             where needed
+ *
+ * @inode - file containing the newly added block
+ * @lblk - logical block to be added
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+        int ret;
+        bool allocated = false;
+        bool reserved = false;
+
+        /*
+         * If the cluster containing lblk is shared with a delayed,
+         * written, or unwritten extent in a bigalloc file system, it's
+         * already been accounted for and does not need to be reserved.
+         * A pending reservation must be made for the cluster if it's
+         * shared with a written or unwritten extent and doesn't already
+         * have one. Written and unwritten extents can be purged from the
+         * extents status tree if the system is under memory pressure, so
+         * it's necessary to examine the extent tree if a search of the
+         * extents status tree doesn't get a match.
+         */
+        if (sbi->s_cluster_ratio == 1) {
+                ret = ext4_da_reserve_space(inode);
+                if (ret != 0)   /* ENOSPC */
+                        goto errout;
+                reserved = true;
+        } else {   /* bigalloc */
+                if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
+                        if (!ext4_es_scan_clu(inode,
+                                              &ext4_es_is_mapped, lblk)) {
+                                ret = ext4_clu_mapped(inode,
+                                                      EXT4_B2C(sbi, lblk));
+                                if (ret < 0)
+                                        goto errout;
+                                if (ret == 0) {
+                                        ret = ext4_da_reserve_space(inode);
+                                        if (ret != 0)   /* ENOSPC */
+                                                goto errout;
+                                        reserved = true;
+                                } else {
+                                        allocated = true;
+                                }
+                        } else {
+                                allocated = true;
+                        }
+                }
+        }
+
+        ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+        if (ret && reserved)
+                ext4_da_release_space(inode, 1);
+
+errout:
+        return ret;
+}
+
+/*
  * This function is grabs code from the very beginning of
  * ext4_map_blocks, but assumes that the caller is from delayed write
  * time. This function looks up the requested blocks and sets the
@@ -1851,12 +1743,11 @@
                 invalid_block = ~0;
 
         map->m_flags = 0;
-        ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
-                  "logical block %lu\n", inode->i_ino, map->m_len,
+        ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
                   (unsigned long) map->m_lblk);
 
         /* Lookup extent status tree firstly */
-        if (ext4_es_lookup_extent(inode, iblock, &es)) {
+        if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
                 if (ext4_es_is_hole(&es)) {
                         retval = 0;
                         down_read(&EXT4_I(inode)->i_data_sem);
@@ -1884,7 +1775,7 @@
                 else if (ext4_es_is_unwritten(&es))
                         map->m_flags |= EXT4_MAP_UNWRITTEN;
                 else
-                        BUG_ON(1);
+                        BUG();
 
 #ifdef ES_AGGRESSIVE_TEST
                 ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
@@ -1907,28 +1798,14 @@
 add_delayed:
         if (retval == 0) {
                 int ret;
+
                 /*
                  * XXX: __block_prepare_write() unmaps passed block,
                  * is it OK?
                  */
-                /*
-                 * If the block was allocated from previously allocated cluster,
-                 * then we don't need to reserve it again. However we still need
-                 * to reserve metadata for every block we're going to write.
-                 */
-                if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
-                    !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
-                        ret = ext4_da_reserve_space(inode);
-                        if (ret) {
-                                /* not enough space to reserve */
-                                retval = ret;
-                                goto out_unlock;
-                        }
-                }
 
-                ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
-                                            ~0, EXTENT_STATUS_DELAYED);
-                if (ret) {
+                ret = ext4_insert_delayed_block(inode, map->m_lblk);
+                if (ret != 0) {
                         retval = ret;
                         goto out_unlock;
                 }
@@ -2088,6 +1965,9 @@
         }
         if (ret == 0)
                 ret = err;
+        err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
+        if (ret == 0)
+                ret = err;
         EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
         err = ext4_journal_stop(handle);
         if (!ret)
@@ -2169,6 +2049,15 @@
                 len = size & ~PAGE_MASK;
         else
                 len = PAGE_SIZE;
+
+        /* Should never happen but for bugs in other kernel subsystems */
+        if (!page_has_buffers(page)) {
+                ext4_warning_inode(inode,
+                   "page %lu does not have buffers attached", page->index);
+                ClearPageDirty(page);
+                unlock_page(page);
+                return 0;
+        }
 
         page_bufs = page_buffers(page);
         /*
@@ -2262,7 +2151,7 @@
         return err;
 }
 
-#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))
 
 /*
  * mballoc gives us at most this number of blocks...
@@ -2372,7 +2261,84 @@
                 if (err < 0)
                         return err;
         }
-        return lblk < blocks;
+        if (lblk >= blocks) {
+                mpd->scanned_until_end = 1;
+                return 0;
+        }
+        return 1;
+}
+
+/*
+ * mpage_process_page - update page buffers corresponding to changed extent and
+ *                     may submit fully mapped page for IO
+ *
+ * @mpd         - description of extent to map, on return next extent to map
+ * @m_lblk      - logical block mapping.
+ * @m_pblk      - corresponding physical mapping.
+ * @map_bh      - determines on return whether this page requires any further
+ *                mapping or not.
+ * Scan given page buffers corresponding to changed extent and update buffer
+ * state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits.
+ * If the given page is not fully mapped, we update @map to the next extent in
+ * the given page that needs mapping & return @map_bh as true.
+ */
+static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+                              ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+                              bool *map_bh)
+{
+        struct buffer_head *head, *bh;
+        ext4_io_end_t *io_end = mpd->io_submit.io_end;
+        ext4_lblk_t lblk = *m_lblk;
+        ext4_fsblk_t pblock = *m_pblk;
+        int err = 0;
+        int blkbits = mpd->inode->i_blkbits;
+        ssize_t io_end_size = 0;
+        struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+
+        bh = head = page_buffers(page);
+        do {
+                if (lblk < mpd->map.m_lblk)
+                        continue;
+                if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+                        /*
+                         * Buffer after end of mapped extent.
+                         * Find next buffer in the page to map.
+                         */
+                        mpd->map.m_len = 0;
+                        mpd->map.m_flags = 0;
+                        io_end_vec->size += io_end_size;
+                        io_end_size = 0;
+
+                        err = mpage_process_page_bufs(mpd, head, bh, lblk);
+                        if (err > 0)
+                                err = 0;
+                        if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+                                io_end_vec = ext4_alloc_io_end_vec(io_end);
+                                if (IS_ERR(io_end_vec)) {
+                                        err = PTR_ERR(io_end_vec);
+                                        goto out;
+                                }
+                                io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
+                        }
+                        *map_bh = true;
+                        goto out;
+                }
+                if (buffer_delay(bh)) {
+                        clear_buffer_delay(bh);
+                        bh->b_blocknr = pblock++;
+                }
+                clear_buffer_unwritten(bh);
+                io_end_size += (1 << blkbits);
+        } while (lblk++, (bh = bh->b_this_page) != head);
+
+        io_end_vec->size += io_end_size;
+        io_end_size = 0;
+        *map_bh = false;
+out:
+        *m_lblk = lblk;
+        *m_pblk = pblock;
+        return err;
 }
 
 /*
@@ -2394,12 +2360,12 @@
         struct pagevec pvec;
         int nr_pages, i;
         struct inode *inode = mpd->inode;
-        struct buffer_head *head, *bh;
         int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
         pgoff_t start, end;
         ext4_lblk_t lblk;
-        sector_t pblock;
+        ext4_fsblk_t pblock;
         int err;
+        bool map_bh = false;
 
         start = mpd->map.m_lblk >> bpp_bits;
         end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2415,50 +2381,19 @@
                 for (i = 0; i < nr_pages; i++) {
                         struct page *page = pvec.pages[i];
 
-                        bh = head = page_buffers(page);
-                        do {
-                                if (lblk < mpd->map.m_lblk)
-                                        continue;
-                                if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
-                                        /*
-                                         * Buffer after end of mapped extent.
-                                         * Find next buffer in the page to map.
-                                         */
-                                        mpd->map.m_len = 0;
-                                        mpd->map.m_flags = 0;
-                                        /*
-                                         * FIXME: If dioread_nolock supports
-                                         * blocksize < pagesize, we need to make
-                                         * sure we add size mapped so far to
-                                         * io_end->size as the following call
-                                         * can submit the page for IO.
-                                         */
-                                        err = mpage_process_page_bufs(mpd, head,
-                                                                      bh, lblk);
-                                        pagevec_release(&pvec);
-                                        if (err > 0)
-                                                err = 0;
-                                        return err;
-                                }
-                                if (buffer_delay(bh)) {
-                                        clear_buffer_delay(bh);
-                                        bh->b_blocknr = pblock++;
-                                }
-                                clear_buffer_unwritten(bh);
-                        } while (lblk++, (bh = bh->b_this_page) != head);
-
+                        err = mpage_process_page(mpd, page, &lblk, &pblock,
+                                                 &map_bh);
                         /*
-                         * FIXME: This is going to break if dioread_nolock
-                         * supports blocksize < pagesize as we will try to
-                         * convert potentially unmapped parts of inode.
+                         * If map_bh is true, means page may require further bh
+                         * mapping, or maybe the page was submitted for IO.
+                         * So we return to call further extent mapping.
                          */
-                        mpd->io_submit.io_end->size += PAGE_SIZE;
+                        if (err < 0 || map_bh)
+                                goto out;
                         /* Page fully mapped - let IO run! */
                         err = mpage_submit_page(mpd, page);
-                        if (err < 0) {
-                                pagevec_release(&pvec);
-                                return err;
-                        }
+                        if (err < 0)
+                                goto out;
                 }
                 pagevec_release(&pvec);
         }
@@ -2466,6 +2401,9 @@
         mpd->map.m_len = 0;
         mpd->map.m_flags = 0;
         return 0;
+out:
+        pagevec_release(&pvec);
+        return err;
 }
 
 static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
@@ -2497,7 +2435,7 @@
         dioread_nolock = ext4_should_dioread_nolock(inode);
         if (dioread_nolock)
                 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
-        if (map->m_flags & (1 << BH_Delay))
+        if (map->m_flags & BIT(BH_Delay))
                 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 
         err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
@@ -2513,10 +2451,6 @@
         }
 
         BUG_ON(map->m_len == 0);
-        if (map->m_flags & EXT4_MAP_NEW) {
-                clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
-                                   map->m_len);
-        }
         return 0;
 }
 
@@ -2549,16 +2483,20 @@
         int err;
         loff_t disksize;
         int progress = 0;
+        ext4_io_end_t *io_end = mpd->io_submit.io_end;
+        struct ext4_io_end_vec *io_end_vec;
 
-        mpd->io_submit.io_end->offset =
-                                ((loff_t)map->m_lblk) << inode->i_blkbits;
+        io_end_vec = ext4_alloc_io_end_vec(io_end);
+        if (IS_ERR(io_end_vec))
+                return PTR_ERR(io_end_vec);
+        io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
         do {
                 err = mpage_map_one_extent(handle, mpd);
                 if (err < 0) {
                         struct super_block *sb = inode->i_sb;
 
                         if (ext4_forced_shutdown(EXT4_SB(sb)) ||
-                            EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+                            ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
                                 goto invalidate_dirty_pages;
                         /*
                          * Let the uper layers retry transient errors.
@@ -2615,10 +2553,11 @@
                 EXT4_I(inode)->i_disksize = disksize;
                 up_write(&EXT4_I(inode)->i_data_sem);
                 err2 = ext4_mark_inode_dirty(handle, inode);
-                if (err2)
-                        ext4_error(inode->i_sb,
-                                   "Failed to mark inode %lu dirty",
-                                   inode->i_ino);
+                if (err2) {
+                        ext4_error_err(inode->i_sb, -err2,
+                                       "Failed to mark inode %lu dirty",
+                                       inode->i_ino);
+                }
                 if (!err)
                         err = err2;
         }
@@ -2666,7 +2605,7 @@
         long left = mpd->wbc->nr_to_write;
         pgoff_t index = mpd->first_page;
         pgoff_t end = mpd->last_page;
-        int tag;
+        xa_mark_t tag;
         int i, err = 0;
         int blkbits = mpd->inode->i_blkbits;
         ext4_lblk_t lblk;
@@ -2684,7 +2623,7 @@
                 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
                                 tag);
                 if (nr_pages == 0)
-                        goto out;
+                        break;
 
                 for (i = 0; i < nr_pages; i++) {
                         struct page *page = pvec.pages[i];
@@ -2723,6 +2662,22 @@
                         wait_on_page_writeback(page);
                         BUG_ON(PageWriteback(page));
 
+                        /*
+                         * Should never happen but for buggy code in
+                         * other subsystems that call
+                         * set_page_dirty() without properly warning
+                         * the file system first.  See [1] for more
+                         * information.
+                         *
+                         * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
+                         */
+                        if (!page_has_buffers(page)) {
+                                ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
+                                ClearPageDirty(page);
+                                unlock_page(page);
+                                continue;
+                        }
+
                         if (mpd->map.m_len == 0)
                                 mpd->first_page = page->index;
                         mpd->next_page = page->index + 1;
@@ -2739,6 +2694,7 @@
                 pagevec_release(&pvec);
                 cond_resched();
         }
+        mpd->scanned_until_end = 1;
         return 0;
 out:
         pagevec_release(&pvec);
@@ -2757,7 +2713,6 @@
         struct inode *inode = mapping->host;
         int needed_blocks, rsv_blocks = 0, ret = 0;
         struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
-        bool done;
         struct blk_plug plug;
         bool give_up_on_write = false;
 
@@ -2791,18 +2746,9 @@
          * the stack trace.
         */
         if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
-                     sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+                     ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
                 ret = -EROFS;
                 goto out_writepages;
-        }
-
-        if (ext4_should_dioread_nolock(inode)) {
-                /*
-                 * We may need to convert up to one extent per block in
-                 * the page and we may dirty the inode.
-                 */
-                rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
-                                                PAGE_SIZE >> inode->i_blkbits);
         }
 
         /*
@@ -2821,6 +2767,15 @@
                                 EXT4_STATE_MAY_INLINE_DATA));
                 ext4_destroy_inline_data(handle, inode);
                 ext4_journal_stop(handle);
+        }
+
+        if (ext4_should_dioread_nolock(inode)) {
+                /*
+                 * We may need to convert up to one extent per block in
+                 * the page and we may dirty the inode.
+                 */
+                rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+                                                PAGE_SIZE >> inode->i_blkbits);
         }
 
         if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
@@ -2843,7 +2798,6 @@
 retry:
         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                 tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
-        done = false;
         blk_start_plug(&plug);
 
@@ -2853,22 +2807,23 @@
          * started.
          */
         mpd.do_map = 0;
+        mpd.scanned_until_end = 0;
         mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
         if (!mpd.io_submit.io_end) {
                 ret = -ENOMEM;
                 goto unplug;
         }
         ret = mpage_prepare_extent_to_map(&mpd);
+        /* Unlock pages we didn't use */
+        mpage_release_unused_pages(&mpd, false);
         /* Submit prepared bio */
         ext4_io_submit(&mpd.io_submit);
         ext4_put_io_end_defer(mpd.io_submit.io_end);
         mpd.io_submit.io_end = NULL;
-        /* Unlock pages we didn't use */
-        mpage_release_unused_pages(&mpd, false);
         if (ret < 0)
                 goto unplug;
 
-        while (!done && mpd.first_page <= mpd.last_page) {
+        while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
                 /* For each extent of pages we use new io_end */
                 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
                 if (!mpd.io_submit.io_end) {
@@ -2903,26 +2858,15 @@
 
                 trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
                 ret = mpage_prepare_extent_to_map(&mpd);
-                if (!ret) {
-                        if (mpd.map.m_len)
-                                ret = mpage_map_and_submit_extent(handle, &mpd,
+                if (!ret && mpd.map.m_len)
+                        ret = mpage_map_and_submit_extent(handle, &mpd,
                                         &give_up_on_write);
-                        else {
-                                /*
-                                 * We scanned the whole range (or exhausted
-                                 * nr_to_write), submitted what was mapped and
-                                 * didn't find anything needing mapping. We are
-                                 * done.
-                                 */
-                                done = true;
-                        }
-                }
                 /*
                  * Caution: If the handle is synchronous,
                  * ext4_journal_stop() can wait for transaction commit
                  * to finish which may depend on writeback of pages to
                  * complete or on page lock to be released. In that
-                 * case, we have to wait until after after we have
+                 * case, we have to wait until after we have
                  * submitted all the IO, released page locks we hold,
                  * and dropped io_end reference (for extent conversion
                  * to be able to complete) before stopping the handle.
@@ -2932,10 +2876,11 @@
                         handle = NULL;
                         mpd.do_map = 0;
                 }
-                /* Submit prepared bio */
-                ext4_io_submit(&mpd.io_submit);
                 /* Unlock pages we didn't use */
                 mpage_release_unused_pages(&mpd, give_up_on_write);
+                /* Submit prepared bio */
+                ext4_io_submit(&mpd.io_submit);
+
                 /*
                  * Drop our io_end reference we got from init. We have
                  * to be careful and use deferred io_end finishing if
@@ -3002,7 +2947,7 @@
         percpu_down_read(&sbi->s_writepages_rwsem);
         trace_ext4_writepages(inode, wbc);
 
-        ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
+        ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
         trace_ext4_writepages_result(inode, wbc, ret,
                                      nr_to_write - wbc->nr_to_write);
         percpu_up_read(&sbi->s_writepages_rwsem);
@@ -3212,58 +3157,42 @@
         end = start + copied - 1;
 
         /*
-         * generic_write_end() will run mark_inode_dirty() if i_size
-         * changes. So let's piggyback the i_disksize mark_inode_dirty
-         * into that.
+         * Since we are holding inode lock, we are sure i_disksize <=
+         * i_size. We also know that if i_disksize < i_size, there are
+         * delalloc writes pending in the range upto i_size. If the end of
+         * the current write is <= i_size, there's no need to touch
+         * i_disksize since writeback will push i_disksize upto i_size
+         * eventually. If the end of the current write is > i_size and
+         * inside an allocated block (ext4_da_should_update_i_disksize()
+         * check), we need to update i_disksize here as neither
+         * ext4_writepage() nor certain ext4_writepages() paths not
+         * allocating blocks update i_disksize.
+         *
+         * Note that we defer inode dirtying to generic_write_end() /
+         * ext4_da_write_inline_data_end().
          */
         new_i_size = pos + copied;
-        if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+        if (copied && new_i_size > inode->i_size) {
                 if (ext4_has_inline_data(inode) ||
-                    ext4_da_should_update_i_disksize(page, end)) {
+                    ext4_da_should_update_i_disksize(page, end))
                         ext4_update_i_disksize(inode, new_i_size);
-                        /* We need to mark inode dirty even if
-                         * new_i_size is less that inode->i_size
-                         * bu greater than i_disksize.(hint delalloc)
-                         */
-                        ext4_mark_inode_dirty(handle, inode);
-                }
         }
 
         if (write_mode != CONVERT_INLINE_DATA &&
             ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
             ext4_has_inline_data(inode))
-                ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+                ret = ext4_da_write_inline_data_end(inode, pos, len, copied,
                                                      page);
         else
-                ret2 = generic_write_end(file, mapping, pos, len, copied,
+                ret = generic_write_end(file, mapping, pos, len, copied,
                                          page, fsdata);
 
-        copied = ret2;
-        if (ret2 < 0)
-                ret = ret2;
+        copied = ret;
         ret2 = ext4_journal_stop(handle);
-        if (!ret)
+        if (unlikely(ret2 && !ret))
                 ret = ret2;
 
         return ret ? ret : copied;
-}
-
-static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
-                                   unsigned int length)
-{
-        /*
-         * Drop reserved blocks
-         */
-        BUG_ON(!PageLocked(page));
-        if (!page_has_buffers(page))
-                goto out;
-
-        ext4_da_page_release_reservation(page, offset, length);
-
-out:
-        ext4_invalidatepage(page, offset, length);
-
-        return;
 }
 
 /*
@@ -3328,13 +3257,15 @@
 {
         struct inode *inode = mapping->host;
         journal_t *journal;
+        sector_t ret = 0;
         int err;
 
+        inode_lock_shared(inode);
         /*
          * We can get here for an inline file via the FIBMAP ioctl
          */
         if (ext4_has_inline_data(inode))
-                return 0;
+                goto out;
 
         if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
             test_opt(inode->i_sb, DELALLOC)) {
@@ -3373,10 +3304,14 @@
                 jbd2_journal_unlock_updates(journal);
 
                 if (err)
-                        return 0;
+                        goto out;
         }
 
-        return generic_block_bmap(mapping, block, ext4_get_block);
+        ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
+
+out:
+        inode_unlock_shared(inode);
+        return ret;
 }
 
 static int ext4_readpage(struct file *file, struct page *page)
@@ -3390,23 +3325,20 @@
                 ret = ext4_readpage_inline(inode, page);
 
         if (ret == -EAGAIN)
-                return ext4_mpage_readpages(page->mapping, NULL, page, 1,
-                                                false);
+                return ext4_mpage_readpages(inode, NULL, page);
 
         return ret;
 }
 
-static int
-ext4_readpages(struct file *file, struct address_space *mapping,
-                struct list_head *pages, unsigned nr_pages)
+static void ext4_readahead(struct readahead_control *rac)
 {
-        struct inode *inode = mapping->host;
+        struct inode *inode = rac->mapping->host;
 
-        /* If the file has inline data, no need to do readpages. */
+        /* If the file has inline data, no need to do readahead. */
         if (ext4_has_inline_data(inode))
-                return 0;
+                return;
 
-        return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true);
+        ext4_mpage_readpages(inode, rac, NULL);
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned int offset,
@@ -3455,7 +3387,7 @@
         if (PageChecked(page))
                 return 0;
         if (journal)
-                return jbd2_journal_try_to_free_buffers(journal, page, wait);
+                return jbd2_journal_try_to_free_buffers(journal, page);
         else
                 return try_to_free_buffers(page);
 }
 
....@@ -3464,216 +3396,215 @@
34643396 {
34653397 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
34663398
3467
- if (journal)
3468
- return !jbd2_transaction_committed(journal,
3469
- EXT4_I(inode)->i_datasync_tid);
3399
+ if (journal) {
3400
+ if (jbd2_transaction_committed(journal,
3401
+ EXT4_I(inode)->i_datasync_tid))
3402
+ return false;
3403
+ if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
3404
+ return !list_empty(&EXT4_I(inode)->i_fc_list);
3405
+ return true;
3406
+ }
3407
+
34703408 /* Any metadata buffers to write? */
34713409 if (!list_empty(&inode->i_mapping->private_list))
34723410 return true;
34733411 return inode->i_state & I_DIRTY_DATASYNC;
34743412 }
34753413
3476
-static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
3477
- unsigned flags, struct iomap *iomap)
3414
+static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
3415
+ struct ext4_map_blocks *map, loff_t offset,
3416
+ loff_t length)
34783417 {
3479
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3480
- unsigned int blkbits = inode->i_blkbits;
3481
- unsigned long first_block, last_block;
3482
- struct ext4_map_blocks map;
3483
- bool delalloc = false;
3484
- int ret;
3485
-
3486
- if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
3487
- return -EINVAL;
3488
- first_block = offset >> blkbits;
3489
- last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
3490
- EXT4_MAX_LOGICAL_BLOCK);
3491
-
3492
- if (flags & IOMAP_REPORT) {
3493
- if (ext4_has_inline_data(inode)) {
3494
- ret = ext4_inline_data_iomap(inode, iomap);
3495
- if (ret != -EAGAIN) {
3496
- if (ret == 0 && offset >= iomap->length)
3497
- ret = -ENOENT;
3498
- return ret;
3499
- }
3500
- }
3501
- } else {
3502
- if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
3503
- return -ERANGE;
3504
- }
3505
-
3506
- map.m_lblk = first_block;
3507
- map.m_len = last_block - first_block + 1;
3508
-
3509
- if (flags & IOMAP_REPORT) {
3510
- ret = ext4_map_blocks(NULL, inode, &map, 0);
3511
- if (ret < 0)
3512
- return ret;
3513
-
3514
- if (ret == 0) {
3515
- ext4_lblk_t end = map.m_lblk + map.m_len - 1;
3516
- struct extent_status es;
3517
-
3518
- ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es);
3519
-
3520
- if (!es.es_len || es.es_lblk > end) {
3521
- /* entire range is a hole */
3522
- } else if (es.es_lblk > map.m_lblk) {
3523
- /* range starts with a hole */
3524
- map.m_len = es.es_lblk - map.m_lblk;
3525
- } else {
3526
- ext4_lblk_t offs = 0;
3527
-
3528
- if (es.es_lblk < map.m_lblk)
3529
- offs = map.m_lblk - es.es_lblk;
3530
- map.m_lblk = es.es_lblk + offs;
3531
- map.m_len = es.es_len - offs;
3532
- delalloc = true;
3533
- }
3534
- }
3535
- } else if (flags & IOMAP_WRITE) {
3536
- int dio_credits;
3537
- handle_t *handle;
3538
- int retries = 0;
3539
-
3540
- /* Trim mapping request to maximum we can map at once for DIO */
3541
- if (map.m_len > DIO_MAX_BLOCKS)
3542
- map.m_len = DIO_MAX_BLOCKS;
3543
- dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
3544
-retry:
3545
- /*
3546
- * Either we allocate blocks and then we don't get unwritten
3547
- * extent so we have reserved enough credits, or the blocks
3548
- * are already allocated and unwritten and in that case
3549
- * extent conversion fits in the credits as well.
3550
- */
3551
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
3552
- dio_credits);
3553
- if (IS_ERR(handle))
3554
- return PTR_ERR(handle);
3555
-
3556
- ret = ext4_map_blocks(handle, inode, &map,
3557
- EXT4_GET_BLOCKS_CREATE_ZERO);
3558
- if (ret < 0) {
3559
- ext4_journal_stop(handle);
3560
- if (ret == -ENOSPC &&
3561
- ext4_should_retry_alloc(inode->i_sb, &retries))
3562
- goto retry;
3563
- return ret;
3564
- }
3565
-
3566
- /*
3567
- * If we added blocks beyond i_size, we need to make sure they
3568
- * will get truncated if we crash before updating i_size in
3569
- * ext4_iomap_end(). For faults we don't need to do that (and
3570
- * even cannot because for orphan list operations inode_lock is
3571
- * required) - if we happen to instantiate block beyond i_size,
3572
- * it is because we race with truncate which has already added
3573
- * the inode to the orphan list.
3574
- */
3575
- if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
3576
- (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
3577
- int err;
3578
-
3579
- err = ext4_orphan_add(handle, inode);
3580
- if (err < 0) {
3581
- ext4_journal_stop(handle);
3582
- return err;
3583
- }
3584
- }
3585
- ext4_journal_stop(handle);
3586
- } else {
3587
- ret = ext4_map_blocks(NULL, inode, &map, 0);
3588
- if (ret < 0)
3589
- return ret;
3590
- }
3418
+ u8 blkbits = inode->i_blkbits;
35913419
35923420 /*
35933421 * Writes that span EOF might trigger an I/O size update on completion,
3594
- * so consider them to be dirty for the purposes of O_DSYNC, even if
3595
- * there is no other metadata changes being made or are pending here.
3422
+ * so consider them to be dirty for the purpose of O_DSYNC, even if
3423
+ * there is no other metadata changes being made or are pending.
35963424 */
35973425 iomap->flags = 0;
35983426 if (ext4_inode_datasync_dirty(inode) ||
35993427 offset + length > i_size_read(inode))
36003428 iomap->flags |= IOMAP_F_DIRTY;
3601
- iomap->bdev = inode->i_sb->s_bdev;
3602
- iomap->dax_dev = sbi->s_daxdev;
3603
- iomap->offset = (u64)first_block << blkbits;
3604
- iomap->length = (u64)map.m_len << blkbits;
36053429
3606
- if (ret == 0) {
3607
- iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
3608
- iomap->addr = IOMAP_NULL_ADDR;
3609
- } else {
3610
- if (map.m_flags & EXT4_MAP_MAPPED) {
3611
- iomap->type = IOMAP_MAPPED;
3612
- } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
3613
- iomap->type = IOMAP_UNWRITTEN;
3614
- } else {
3615
- WARN_ON_ONCE(1);
3616
- return -EIO;
3617
- }
3618
- iomap->addr = (u64)map.m_pblk << blkbits;
3619
- }
3620
-
3621
- if (map.m_flags & EXT4_MAP_NEW)
3430
+ if (map->m_flags & EXT4_MAP_NEW)
36223431 iomap->flags |= IOMAP_F_NEW;
36233432
3433
+ iomap->bdev = inode->i_sb->s_bdev;
3434
+ iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
3435
+ iomap->offset = (u64) map->m_lblk << blkbits;
3436
+ iomap->length = (u64) map->m_len << blkbits;
3437
+
3438
+ if ((map->m_flags & EXT4_MAP_MAPPED) &&
3439
+ !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3440
+ iomap->flags |= IOMAP_F_MERGED;
3441
+
3442
+ /*
3443
+ * Flags passed to ext4_map_blocks() for direct I/O writes can result
3444
+ * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
3445
+ * set. In order for any allocated unwritten extents to be converted
3446
+ * into written extents correctly within the ->end_io() handler, we
3447
+ * need to ensure that the iomap->type is set appropriately. Hence, the
3448
+ * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
3449
+ * been set first.
3450
+ */
3451
+ if (map->m_flags & EXT4_MAP_UNWRITTEN) {
3452
+ iomap->type = IOMAP_UNWRITTEN;
3453
+ iomap->addr = (u64) map->m_pblk << blkbits;
3454
+ } else if (map->m_flags & EXT4_MAP_MAPPED) {
3455
+ iomap->type = IOMAP_MAPPED;
3456
+ iomap->addr = (u64) map->m_pblk << blkbits;
3457
+ } else {
3458
+ iomap->type = IOMAP_HOLE;
3459
+ iomap->addr = IOMAP_NULL_ADDR;
3460
+ }
3461
+}
3462
+
3463
+static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
3464
+ unsigned int flags)
3465
+{
3466
+ handle_t *handle;
3467
+ u8 blkbits = inode->i_blkbits;
3468
+ int ret, dio_credits, m_flags = 0, retries = 0;
3469
+
3470
+ /*
3471
+ * Trim the mapping request to the maximum value that we can map at
3472
+ * once for direct I/O.
3473
+ */
3474
+ if (map->m_len > DIO_MAX_BLOCKS)
3475
+ map->m_len = DIO_MAX_BLOCKS;
3476
+ dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
3477
+
3478
+retry:
3479
+ /*
3480
+ * Either we allocate blocks and then don't get an unwritten extent, so
3481
+ * in that case we have reserved enough credits. Or, the blocks are
3482
+ * already allocated and unwritten. In that case, the extent conversion
3483
+ * fits into the credits as well.
3484
+ */
3485
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
3486
+ if (IS_ERR(handle))
3487
+ return PTR_ERR(handle);
3488
+
3489
+ /*
3490
+ * DAX and direct I/O are the only two operations that are currently
3491
+ * supported with IOMAP_WRITE.
3492
+ */
3493
+ WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
3494
+ if (IS_DAX(inode))
3495
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
3496
+ /*
3497
+ * We use i_size instead of i_disksize here because delalloc writeback
3498
+ * can complete at any point during the I/O and subsequently push the
3499
+ * i_disksize out to i_size. This could be beyond where direct I/O is
3500
+ * happening and thus expose allocated blocks to direct I/O reads.
3501
+ */
3502
+ else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
3503
+ m_flags = EXT4_GET_BLOCKS_CREATE;
3504
+ else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3505
+ m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3506
+
3507
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
3508
+
3509
+ /*
3510
+ * We cannot fill holes in indirect tree based inodes as that could
3511
+ * expose stale data in the case of a crash. Use the magic error code
3512
+ * to fallback to buffered I/O.
3513
+ */
3514
+ if (!m_flags && !ret)
3515
+ ret = -ENOTBLK;
3516
+
3517
+ ext4_journal_stop(handle);
3518
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3519
+ goto retry;
3520
+
3521
+ return ret;
3522
+}
3523
+
3524
+
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
+{
+	int ret;
+	struct ext4_map_blocks map;
+	u8 blkbits = inode->i_blkbits;
+
+	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+		return -ERANGE;
+
+	/*
+	 * Calculate the first and last logical blocks respectively.
+	 */
+	map.m_lblk = offset >> blkbits;
+	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+	if (flags & IOMAP_WRITE) {
+		/*
+		 * We check here if the blocks are already allocated, then we
+		 * don't need to start a journal txn and we can directly return
+		 * the mapping information. This could boost performance
+		 * especially in multi-threaded overwrite requests.
+		 */
+		if (offset + length <= i_size_read(inode)) {
+			ret = ext4_map_blocks(NULL, inode, &map, 0);
+			if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
+				goto out;
+		}
+		ret = ext4_iomap_alloc(inode, &map, flags);
+	} else {
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+	}
+
+	if (ret < 0)
+		return ret;
+out:
+
+	/*
+	 * When inline encryption is enabled, sometimes I/O to an encrypted file
+	 * has to be broken up to guarantee DUN contiguity. Handle this by
+	 * limiting the length of the mapping returned.
+	 */
+	map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+
+	ext4_set_iomap(inode, iomap, &map, offset, length);
+
 	return 0;
+}
+
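A worked instance of the block-range arithmetic in ext4_iomap_begin(), assuming 4 KiB blocks (blkbits == 12); the numbers are illustrative only:

/*
 *   offset = 6144, length = 10000
 *   map.m_lblk = 6144 >> 12           = 1
 *   last byte  = 6144 + 10000 - 1     = 16143
 *   last block = 16143 >> 12          = 3
 *   map.m_len  = 3 - 1 + 1            = 3 blocks
 */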
+static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
+		loff_t length, unsigned flags, struct iomap *iomap,
+		struct iomap *srcmap)
+{
+	int ret;
+
+	/*
+	 * Even for writes we don't need to allocate blocks, so just pretend
+	 * we are reading to save overhead of starting a transaction.
+	 */
+	flags &= ~IOMAP_WRITE;
+	ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
+	WARN_ON_ONCE(iomap->type != IOMAP_MAPPED);
+	return ret;
 }
 
 static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 			  ssize_t written, unsigned flags, struct iomap *iomap)
 {
-	int ret = 0;
-	handle_t *handle;
-	int blkbits = inode->i_blkbits;
-	bool truncate = false;
-
-	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
-		return 0;
-
-	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto orphan_del;
-	}
-	if (ext4_update_inode_size(inode, offset + written))
-		ext4_mark_inode_dirty(handle, inode);
 	/*
-	 * We may need to truncate allocated but not written blocks beyond EOF.
+	 * Check to see whether an error occurred while writing out the data to
+	 * the allocated blocks. If so, return the magic error code so that we
+	 * fallback to buffered I/O and attempt to complete the remainder of
+	 * the I/O. Any blocks that may have been allocated in preparation for
+	 * the direct I/O will be reused during buffered I/O.
 	 */
-	if (iomap->offset + iomap->length >
-	    ALIGN(inode->i_size, 1 << blkbits)) {
-		ext4_lblk_t written_blk, end_blk;
+	if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+		return -ENOTBLK;
 
-		written_blk = (offset + written) >> blkbits;
-		end_blk = (offset + length) >> blkbits;
-		if (written_blk < end_blk && ext4_can_truncate(inode))
-			truncate = true;
-	}
-	/*
-	 * Remove inode from orphan list if we were extending a inode and
-	 * everything went fine.
-	 */
-	if (!truncate && inode->i_nlink &&
-	    !list_empty(&EXT4_I(inode)->i_orphan))
-		ext4_orphan_del(handle, inode);
-	ext4_journal_stop(handle);
-	if (truncate) {
-		ext4_truncate_failed_write(inode);
-orphan_del:
-		/*
-		 * If truncate failed early the inode might still be on the
-		 * orphan list; we need to make sure the inode is removed from
-		 * the orphan list in that case.
-		 */
-		if (inode->i_nlink)
-			ext4_orphan_del(NULL, inode);
-	}
-	return ret;
+	return 0;
 }
 
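The -ENOTBLK convention only works if the caller catches it. A sketch of the expected caller-side shape follows; ext4_write_checked(), ext4_do_direct_write() and ext4_do_buffered_write() are hypothetical placeholder names, not functions added by this hunk:

static ssize_t ext4_write_checked(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;

	ret = ext4_do_direct_write(iocb, from);	/* hypothetical DIO path */
	if (ret == -ENOTBLK)
		/* blocks allocated for the DIO attempt are reused here */
		ret = ext4_do_buffered_write(iocb, from); /* hypothetical */
	return ret;
}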
 const struct iomap_ops ext4_iomap_ops = {
@@ -3681,310 +3612,94 @@
 	.iomap_end = ext4_iomap_end,
 };
 
-static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-			   ssize_t size, void *private)
+const struct iomap_ops ext4_iomap_overwrite_ops = {
+	.iomap_begin = ext4_iomap_overwrite_begin,
+	.iomap_end = ext4_iomap_end,
+};
+
+static bool ext4_iomap_is_delalloc(struct inode *inode,
+				   struct ext4_map_blocks *map)
 {
-	ext4_io_end_t *io_end = private;
+	struct extent_status es;
+	ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
 
-	/* if not async direct IO just return */
-	if (!io_end)
-		return 0;
+	ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+				  map->m_lblk, end, &es);
 
-	ext_debug("ext4_end_io_dio(): io_end 0x%p "
-		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
-		  io_end, io_end->inode->i_ino, iocb, offset, size);
+	if (!es.es_len || es.es_lblk > end)
+		return false;
+
+	if (es.es_lblk > map->m_lblk) {
+		map->m_len = es.es_lblk - map->m_lblk;
+		return false;
+	}
+
+	offset = map->m_lblk - es.es_lblk;
+	map->m_len = es.es_len - offset;
+
+	return true;
+}
+
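Two worked cases for ext4_iomap_is_delalloc(), assuming a query for blocks 10..19 (m_lblk = 10, m_len = 10); illustrative numbers only:

/*
 * 1) Delayed extent at 14..17: es_lblk (14) > m_lblk (10), so the result
 *    is trimmed to the gap in front of it, m_len = 14 - 10 = 4, and the
 *    function returns false (blocks 10..13 are not delalloc).
 * 2) Delayed extent at 8..17 (es_len = 10): offset = 10 - 8 = 2,
 *    m_len = 10 - 2 = 8, return true (blocks 10..17 are delalloc).
 */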
+static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+				   loff_t length, unsigned int flags,
+				   struct iomap *iomap, struct iomap *srcmap)
+{
+	int ret;
+	bool delalloc = false;
+	struct ext4_map_blocks map;
+	u8 blkbits = inode->i_blkbits;
+
+	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+		return -EINVAL;
+
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_inline_data_iomap(inode, iomap);
+		if (ret != -EAGAIN) {
+			if (ret == 0 && offset >= iomap->length)
+				ret = -ENOENT;
+			return ret;
+		}
+	}
 
 	/*
-	 * Error during AIO DIO. We cannot convert unwritten extents as the
-	 * data was not written. Just clear the unwritten flag and drop io_end.
+	 * Calculate the first and last logical block respectively.
 	 */
-	if (size <= 0) {
-		ext4_clear_io_unwritten_flag(io_end);
-		size = 0;
+	map.m_lblk = offset >> blkbits;
+	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+	/*
+	 * Fiemap callers may call for offset beyond s_bitmap_maxbytes.
+	 * So handle it here itself instead of querying ext4_map_blocks().
+	 * Since ext4_map_blocks() will warn about it and will return
+	 * -EIO error.
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+		if (offset >= sbi->s_bitmap_maxbytes) {
+			map.m_flags = 0;
+			goto set_iomap;
+		}
 	}
-	io_end->offset = offset;
-	io_end->size = size;
-	ext4_put_io_end(io_end);
+
+	ret = ext4_map_blocks(NULL, inode, &map, 0);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		delalloc = ext4_iomap_is_delalloc(inode, &map);
+
+set_iomap:
+	ext4_set_iomap(inode, iomap, &map, offset, length);
+	if (delalloc && iomap->type == IOMAP_HOLE)
+		iomap->type = IOMAP_DELALLOC;
 
 	return 0;
 }
 
-/*
- * Handling of direct IO writes.
- *
- * For ext4 extent files, ext4 will do direct-io write even to holes,
- * preallocated extents, and those write extend the file, no need to
- * fall back to buffered IO.
- *
- * For holes, we fallocate those blocks, mark them as unwritten
- * If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as unwritten.
- *
- * The unwritten extents will be converted to written when DIO is completed.
- * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the conversion
- * when async direct IO completed.
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- */
-static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	ssize_t ret;
-	loff_t offset = iocb->ki_pos;
-	size_t count = iov_iter_count(iter);
-	int overwrite = 0;
-	get_block_t *get_block_func = NULL;
-	int dio_flags = 0;
-	loff_t final_size = offset + count;
-	int orphan = 0;
-	handle_t *handle;
-
-	if (final_size > inode->i_size || final_size > ei->i_disksize) {
-		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto out;
-		}
-		ret = ext4_orphan_add(handle, inode);
-		if (ret) {
-			ext4_journal_stop(handle);
-			goto out;
-		}
-		orphan = 1;
-		ext4_update_i_disksize(inode, inode->i_size);
-		ext4_journal_stop(handle);
-	}
-
-	BUG_ON(iocb->private == NULL);
-
-	/*
-	 * Make all waiters for direct IO properly wait also for extent
-	 * conversion. This also disallows race between truncate() and
-	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
-	 */
-	inode_dio_begin(inode);
-
-	/* If we do a overwrite dio, i_mutex locking can be released */
-	overwrite = *((int *)iocb->private);
-
-	if (overwrite)
-		inode_unlock(inode);
-
-	/*
-	 * For extent mapped files we could direct write to holes and fallocate.
-	 *
-	 * Allocated blocks to fill the hole are marked as unwritten to prevent
-	 * parallel buffered read to expose the stale data before DIO complete
-	 * the data IO.
-	 *
-	 * As to previously fallocated extents, ext4 get_block will just simply
-	 * mark the buffer mapped but still keep the extents unwritten.
-	 *
-	 * For non AIO case, we will convert those unwritten extents to written
-	 * after return back from blockdev_direct_IO. That way we save us from
-	 * allocating io_end structure and also the overhead of offloading
-	 * the extent convertion to a workqueue.
-	 *
-	 * For async DIO, the conversion needs to be deferred when the
-	 * IO is completed. The ext4 end_io callback function will be
-	 * called to take care of the conversion work. Here for async
-	 * case, we allocate an io_end structure to hook to the iocb.
-	 */
-	iocb->private = NULL;
-	if (overwrite)
-		get_block_func = ext4_dio_get_block_overwrite;
-	else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
-		 round_down(offset, i_blocksize(inode)) >= inode->i_size) {
-		get_block_func = ext4_dio_get_block;
-		dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
-	} else if (is_sync_kiocb(iocb)) {
-		get_block_func = ext4_dio_get_block_unwritten_sync;
-		dio_flags = DIO_LOCKING;
-	} else {
-		get_block_func = ext4_dio_get_block_unwritten_async;
-		dio_flags = DIO_LOCKING;
-	}
-	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				   get_block_func, ext4_end_io_dio, NULL,
-				   dio_flags);
-
-	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
-						EXT4_STATE_DIO_UNWRITTEN)) {
-		int err;
-		/*
-		 * for non AIO case, since the IO is already
-		 * completed, we could do the conversion right here
-		 */
-		err = ext4_convert_unwritten_extents(NULL, inode,
-						     offset, ret);
-		if (err < 0)
-			ret = err;
-		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-	}
-
-	inode_dio_end(inode);
-	/* take i_mutex locking again if we do a ovewrite dio */
-	if (overwrite)
-		inode_lock(inode);
-
-	if (ret < 0 && final_size > inode->i_size)
-		ext4_truncate_failed_write(inode);
-
-	/* Handle extending of i_size after direct IO write */
-	if (orphan) {
-		int err;
-
-		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle)) {
-			/*
-			 * We wrote the data but cannot extend
-			 * i_size. Bail out. In async io case, we do
-			 * not return error here because we have
-			 * already submmitted the corresponding
-			 * bio. Returning error here makes the caller
-			 * think that this IO is done and failed
-			 * resulting in race with bio's completion
-			 * handler.
-			 */
-			if (!ret)
-				ret = PTR_ERR(handle);
-			if (inode->i_nlink)
-				ext4_orphan_del(NULL, inode);
-
-			goto out;
-		}
-		if (inode->i_nlink)
-			ext4_orphan_del(handle, inode);
-		if (ret > 0) {
-			loff_t end = offset + ret;
-			if (end > inode->i_size || end > ei->i_disksize) {
-				ext4_update_i_disksize(inode, end);
-				if (end > inode->i_size)
-					i_size_write(inode, end);
-				/*
-				 * We're going to return a positive `ret'
-				 * here due to non-zero-length I/O, so there's
-				 * no way of reporting error returns from
-				 * ext4_mark_inode_dirty() to userspace. So
-				 * ignore it.
-				 */
-				ext4_mark_inode_dirty(handle, inode);
-			}
-		}
-		err = ext4_journal_stop(handle);
-		if (ret == 0)
-			ret = err;
-	}
-out:
-	return ret;
-}
-
-static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct address_space *mapping = iocb->ki_filp->f_mapping;
-	struct inode *inode = mapping->host;
-	size_t count = iov_iter_count(iter);
-	ssize_t ret;
-	loff_t offset = iocb->ki_pos;
-	loff_t size = i_size_read(inode);
-
-	if (offset >= size)
-		return 0;
-
-	/*
-	 * Shared inode_lock is enough for us - it protects against concurrent
-	 * writes & truncates and since we take care of writing back page cache,
-	 * we are protected against page writeback as well.
-	 */
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!inode_trylock_shared(inode))
-			return -EAGAIN;
-	} else {
-		inode_lock_shared(inode);
-	}
-
-	ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
-					   iocb->ki_pos + count - 1);
-	if (ret)
-		goto out_unlock;
-	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-				   iter, ext4_dio_get_block, NULL, NULL, 0);
-out_unlock:
-	inode_unlock_shared(inode);
-	return ret;
-}
-
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	size_t count = iov_iter_count(iter);
-	loff_t offset = iocb->ki_pos;
-	ssize_t ret;
-	int rw = iov_iter_rw(iter);
-
-	if (!fscrypt_dio_supported(iocb, iter))
-		return 0;
-
-	if (fsverity_active(inode))
-		return 0;
-
-	/*
-	 * If we are doing data journalling we don't support O_DIRECT
-	 */
-	if (ext4_should_journal_data(inode))
-		return 0;
-
-	/* Let buffer I/O handle the inline data case. */
-	if (ext4_has_inline_data(inode))
-		return 0;
-
-	if (trace_android_fs_dataread_start_enabled() &&
-	    (rw == READ)) {
-		char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
-
-		path = android_fstrace_get_pathname(pathbuf,
-						    MAX_TRACE_PATHBUF_LEN,
-						    inode);
-		trace_android_fs_dataread_start(inode, offset, count,
-						current->pid, path,
-						current->comm);
-	}
-	if (trace_android_fs_datawrite_start_enabled() &&
-	    (rw == WRITE)) {
-		char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
-
-		path = android_fstrace_get_pathname(pathbuf,
-						    MAX_TRACE_PATHBUF_LEN,
-						    inode);
-		trace_android_fs_datawrite_start(inode, offset, count,
-						 current->pid, path,
-						 current->comm);
-	}
-	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-	if (iov_iter_rw(iter) == READ)
-		ret = ext4_direct_IO_read(iocb, iter);
-	else
-		ret = ext4_direct_IO_write(iocb, iter);
-	trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
-
-	if (trace_android_fs_dataread_start_enabled() &&
-	    (rw == READ))
-		trace_android_fs_dataread_end(inode, offset, count);
-	if (trace_android_fs_datawrite_start_enabled() &&
-	    (rw == WRITE))
-		trace_android_fs_datawrite_end(inode, offset, count);
-
-	return ret;
-}
+const struct iomap_ops ext4_iomap_report_ops = {
+	.iomap_begin = ext4_iomap_begin_report,
+};
 
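The report ops only supply ->iomap_begin, which is all the read-only consumers need. Presumed usage elsewhere in the tree (illustrative, not part of this hunk):

/*
 *	error = iomap_fiemap(inode, fieinfo, start, len,
 *			     &ext4_iomap_report_ops);
 *	offset = iomap_seek_hole(inode, offset, &ext4_iomap_report_ops);
 */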
 /*
  * Pages can be marked dirty completely asynchronously from ext4's journalling
@@ -4012,9 +3727,16 @@
 	return __set_page_dirty_buffers(page);
 }
 
+static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
+				struct file *file, sector_t *span)
+{
+	return iomap_swapfile_activate(sis, file, span,
+				       &ext4_iomap_report_ops);
+}
+
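A note on the design choice above (based on generic iomap behavior, not restated in this patch):

/*
 * swapon() must see a fully allocated, stable mapping, so activation
 * walks the file with the read-only report ops; holes, delalloc and
 * unwritten extents make iomap_swapfile_activate() fail rather than
 * letting the kernel swap to unallocated space.
 */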
 static const struct address_space_operations ext4_aops = {
 	.readpage = ext4_readpage,
-	.readpages = ext4_readpages,
+	.readahead = ext4_readahead,
 	.writepage = ext4_writepage,
 	.writepages = ext4_writepages,
 	.write_begin = ext4_write_begin,
@@ -4023,15 +3745,16 @@
 	.bmap = ext4_bmap,
 	.invalidatepage = ext4_invalidatepage,
 	.releasepage = ext4_releasepage,
-	.direct_IO = ext4_direct_IO,
+	.direct_IO = noop_direct_IO,
 	.migratepage = buffer_migrate_page,
 	.is_partially_uptodate = block_is_partially_uptodate,
 	.error_remove_page = generic_error_remove_page,
+	.swap_activate = ext4_iomap_swap_activate,
 };
 
 static const struct address_space_operations ext4_journalled_aops = {
 	.readpage = ext4_readpage,
-	.readpages = ext4_readpages,
+	.readahead = ext4_readahead,
 	.writepage = ext4_writepage,
 	.writepages = ext4_writepages,
 	.write_begin = ext4_write_begin,
@@ -4040,26 +3763,28 @@
 	.bmap = ext4_bmap,
 	.invalidatepage = ext4_journalled_invalidatepage,
 	.releasepage = ext4_releasepage,
-	.direct_IO = ext4_direct_IO,
+	.direct_IO = noop_direct_IO,
 	.is_partially_uptodate = block_is_partially_uptodate,
 	.error_remove_page = generic_error_remove_page,
+	.swap_activate = ext4_iomap_swap_activate,
 };
 
 static const struct address_space_operations ext4_da_aops = {
 	.readpage = ext4_readpage,
-	.readpages = ext4_readpages,
+	.readahead = ext4_readahead,
 	.writepage = ext4_writepage,
 	.writepages = ext4_writepages,
 	.write_begin = ext4_da_write_begin,
 	.write_end = ext4_da_write_end,
 	.set_page_dirty = ext4_set_page_dirty,
 	.bmap = ext4_bmap,
-	.invalidatepage = ext4_da_invalidatepage,
+	.invalidatepage = ext4_invalidatepage,
 	.releasepage = ext4_releasepage,
-	.direct_IO = ext4_direct_IO,
+	.direct_IO = noop_direct_IO,
 	.migratepage = buffer_migrate_page,
 	.is_partially_uptodate = block_is_partially_uptodate,
 	.error_remove_page = generic_error_remove_page,
+	.swap_activate = ext4_iomap_swap_activate,
 };
 
 static const struct address_space_operations ext4_dax_aops = {
@@ -4068,6 +3793,7 @@
 	.set_page_dirty = noop_set_page_dirty,
 	.bmap = ext4_bmap,
 	.invalidatepage = noop_invalidatepage,
+	.swap_activate = ext4_iomap_swap_activate,
 };
 
 void ext4_set_aops(struct inode *inode)
@@ -4141,18 +3867,18 @@
 			set_buffer_uptodate(bh);
 
 	if (!buffer_uptodate(bh)) {
-		err = -EIO;
-		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
-		wait_on_buffer(bh);
-		/* Uhhuh. Read error. Complain and punt. */
-		if (!buffer_uptodate(bh))
+		err = ext4_read_bh_lock(bh, 0, true);
+		if (err)
 			goto unlock;
 		if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
 			/* We expect the key to be set. */
 			BUG_ON(!fscrypt_has_encryption_key(inode));
-			BUG_ON(blocksize != PAGE_SIZE);
-			WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks(
-						page, PAGE_SIZE, 0));
+			err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+							       bh_offset(bh));
+			if (err) {
+				clear_buffer_uptodate(bh);
+				goto unlock;
+			}
 		}
 	}
 	if (ext4_should_journal_data(inode)) {
@@ -4292,6 +4018,8 @@
 				      loff_t len)
 {
 	handle_t *handle;
+	int ret;
+
 	loff_t size = i_size_read(inode);
 
 	WARN_ON(!inode_is_locked(inode));
@@ -4305,10 +4033,10 @@
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ext4_update_i_disksize(inode, size);
-	ext4_mark_inode_dirty(handle, inode);
+	ret = ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
 
-	return 0;
+	return ret;
 }
 
 static void ext4_wait_dax_page(struct ext4_inode_info *ei)
@@ -4352,29 +4080,19 @@
  * Returns: 0 on success or negative on failure
  */
 
-int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 {
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t first_block, stop_block;
 	struct address_space *mapping = inode->i_mapping;
-	loff_t first_block_offset, last_block_offset;
+	loff_t first_block_offset, last_block_offset, max_length;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	handle_t *handle;
 	unsigned int credits;
-	int ret = 0;
-
-	if (!S_ISREG(inode->i_mode))
-		return -EOPNOTSUPP;
+	int ret = 0, ret2 = 0;
 
 	trace_ext4_punch_hole(inode, offset, length, 0);
-
-	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
-	if (ext4_has_inline_data(inode)) {
-		down_write(&EXT4_I(inode)->i_mmap_sem);
-		ret = ext4_convert_inline_data(inode);
-		up_write(&EXT4_I(inode)->i_mmap_sem);
-		if (ret)
-			return ret;
-	}
 
 	/*
 	 * Write out all dirty pages to avoid race conditions
@@ -4403,6 +4121,14 @@
 			   offset;
 	}
 
+	/*
+	 * For punch hole the length + offset needs to be within one block
+	 * before last range. Adjust the length if it goes beyond that limit.
+	 */
+	max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
+	if (offset + length > max_length)
+		length = max_length - offset;
+
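Worked example of the clamp above (illustrative numbers: 4 KiB blocks, s_bitmap_maxbytes = 2^32 bytes):

/*
 *   max_length = 2^32 - 4096
 *   punching offset = 2^32 - 8192 with length = 16384 overshoots, so
 *   length is clamped to max_length - offset = 4096: the hole may end
 *   no closer than one block below the bitmap-addressable limit.
 */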
 	if (offset & (sb->s_blocksize - 1) ||
 	    (offset + length) & (sb->s_blocksize - 1)) {
 		/*
@@ -4417,6 +4143,10 @@
 
 	/* Wait all existing dio workers, newcomers will block on i_mutex */
 	inode_dio_wait(inode);
+
+	ret = file_modified(file);
+	if (ret)
+		goto out_mutex;
 
 	/*
 	 * Prevent page faults from reinstantiating pages we have released from
@@ -4464,7 +4194,7 @@
 	if (stop_block > first_block) {
 
 		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_preallocations(inode);
+		ext4_discard_preallocations(inode, 0);
 
 		ret = ext4_es_remove_extent(inode, first_block,
 					    stop_block - first_block);
@@ -4482,11 +4212,14 @@
 
 		up_write(&EXT4_I(inode)->i_data_sem);
 	}
+	ext4_fc_track_range(handle, inode, first_block, stop_block);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
 
 	inode->i_mtime = inode->i_ctime = current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
+	ret2 = ext4_mark_inode_dirty(handle, inode);
+	if (unlikely(ret2))
+		ret = ret2;
 	if (ret >= 0)
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 out_stop:
@@ -4555,7 +4288,7 @@
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int credits;
-	int err = 0;
+	int err = 0, err2;
 	handle_t *handle;
 	struct address_space *mapping = inode->i_mapping;
 
@@ -4569,9 +4302,7 @@
 	trace_ext4_truncate_enter(inode);
 
 	if (!ext4_can_truncate(inode))
-		return 0;
-
-	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+		goto out_trace;
 
 	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
@@ -4580,16 +4311,14 @@
 		int has_inline = 1;
 
 		err = ext4_inline_data_truncate(inode, &has_inline);
-		if (err)
-			return err;
-		if (has_inline)
-			return 0;
+		if (err || has_inline)
+			goto out_trace;
 	}
 
 	/* If we zero-out tail of the page, we have to create jinode for jbd2 */
 	if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
 		if (ext4_inode_attach_jinode(inode) < 0)
-			return 0;
+			goto out_trace;
 	}
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4598,8 +4327,10 @@
 		credits = ext4_blocks_for_truncate(inode);
 
 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out_trace;
+	}
 
 	if (inode->i_size & (inode->i_sb->s_blocksize - 1))
 		ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4619,7 +4350,7 @@
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 
-	ext4_discard_preallocations(inode);
+	ext4_discard_preallocations(inode, 0);
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		err = ext4_ext_truncate(handle, inode);
@@ -4645,9 +4376,12 @@
 		ext4_orphan_del(handle, inode);
 
 	inode->i_mtime = inode->i_ctime = current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
+	err2 = ext4_mark_inode_dirty(handle, inode);
+	if (unlikely(err2 && !err))
+		err = err2;
 	ext4_journal_stop(handle);
 
+out_trace:
 	trace_ext4_truncate_exit(inode);
 	return err;
 }
@@ -4658,21 +4392,22 @@
  * data in memory that is needed to recreate the on-disk version of this
  * inode.
  */
-static int __ext4_get_inode_loc(struct inode *inode,
-				struct ext4_iloc *iloc, int in_mem)
+static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
+				struct ext4_iloc *iloc, int in_mem,
+				ext4_fsblk_t *ret_block)
 {
 	struct ext4_group_desc *gdp;
 	struct buffer_head *bh;
-	struct super_block *sb = inode->i_sb;
 	ext4_fsblk_t block;
+	struct blk_plug plug;
 	int inodes_per_block, inode_offset;
 
 	iloc->bh = NULL;
-	if (inode->i_ino < EXT4_ROOT_INO ||
-	    inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+	if (ino < EXT4_ROOT_INO ||
+	    ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
 		return -EFSCORRUPTED;
 
-	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
 	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
 	if (!gdp)
 		return -EIO;
@@ -4681,7 +4416,7 @@
 	 * Figure out the offset within the block group inode table
 	 */
 	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
-	inode_offset = ((inode->i_ino - 1) %
+	inode_offset = ((ino - 1) %
 			EXT4_INODES_PER_GROUP(sb));
 	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
 	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
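A worked example of the inode-table arithmetic above (illustrative: 4 KiB blocks, 256-byte inodes, 8192 inodes per group, so inodes_per_block = 16):

/*
 *   ino = 35:
 *   block_group  = (35 - 1) / 8192        = 0
 *   inode_offset = (35 - 1) % 8192        = 34
 *   block        = inode_table + 34 / 16  = inode_table + 2
 *   iloc->offset = (34 % 16) * 256        = 512 bytes into that block
 */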
@@ -4689,19 +4424,12 @@
 	bh = sb_getblk(sb, block);
 	if (unlikely(!bh))
 		return -ENOMEM;
+	if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
+		goto simulate_eio;
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
 
-		/*
-		 * If the buffer has the write error flag, we have failed
-		 * to write out another inode in the same block. In this
-		 * case, we don't have to read the block because we may
-		 * read the old inode data successfully.
-		 */
-		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
-			set_buffer_uptodate(bh);
-
-		if (buffer_uptodate(bh)) {
+		if (ext4_buffer_uptodate(bh)) {
 			/* someone brought it uptodate while we waited */
 			unlock_buffer(bh);
 			goto has_buffer;
@@ -4753,6 +4481,7 @@
 		 * If we need to do any I/O, try to pre-readahead extra
 		 * blocks from the inode table.
 		 */
+		blk_start_plug(&plug);
 		if (EXT4_SB(sb)->s_inode_readahead_blks) {
 			ext4_fsblk_t b, end, table;
 			unsigned num;
@@ -4771,7 +4500,7 @@
 			if (end > table)
 				end = table;
 			while (b <= end)
-				sb_breadahead_unmovable(sb, b++);
+				ext4_sb_breadahead_unmovable(sb, b++);
 		}
 
 		/*
@@ -4779,14 +4508,14 @@
 		 * has in-inode xattrs, or we don't have this inode in memory.
 		 * Read the block from disk.
 		 */
-		trace_ext4_load_inode(inode);
-		get_bh(bh);
-		bh->b_end_io = end_buffer_read_sync;
-		submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+		trace_ext4_load_inode(sb, ino);
+		ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
+		blk_finish_plug(&plug);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
-			EXT4_ERROR_INODE_BLOCK(inode, block,
-					       "unable to read itable block");
+		simulate_eio:
+			if (ret_block)
+				*ret_block = block;
 			brelse(bh);
 			return -EIO;
 		}
@@ -4796,16 +4525,50 @@
 	return 0;
 }
 
-int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
+static int __ext4_get_inode_loc_noinmem(struct inode *inode,
+					struct ext4_iloc *iloc)
 {
-	/* We have all inode data except xattrs in memory here. */
-	return __ext4_get_inode_loc(inode, iloc,
-		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
+	ext4_fsblk_t err_blk = 0;
+	int ret;
+
+	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0,
+					&err_blk);
+
+	if (ret == -EIO)
+		ext4_error_inode_block(inode, err_blk, EIO,
+					"unable to read itable block");
+
+	return ret;
 }
 
-static bool ext4_should_use_dax(struct inode *inode)
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 {
-	if (!test_opt(inode->i_sb, DAX))
+	ext4_fsblk_t err_blk = 0;
+	int ret;
+
+	/* We have all inode data except xattrs in memory here. */
+	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc,
+		!ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk);
+
+	if (ret == -EIO)
+		ext4_error_inode_block(inode, err_blk, EIO,
+					"unable to read itable block");
+
+	return ret;
+}
+
+
+int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+			  struct ext4_iloc *iloc)
+{
+	return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL);
+}
+
+static bool ext4_should_enable_dax(struct inode *inode)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	if (test_opt2(inode->i_sb, DAX_NEVER))
 		return false;
 	if (!S_ISREG(inode->i_mode))
 		return false;
@@ -4817,13 +4580,20 @@
 		return false;
 	if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
 		return false;
-	return true;
+	if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
+		return false;
+	if (test_opt(inode->i_sb, DAX_ALWAYS))
+		return true;
+
+	return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
 }
 
-void ext4_set_inode_flags(struct inode *inode)
+void ext4_set_inode_flags(struct inode *inode, bool init)
 {
 	unsigned int flags = EXT4_I(inode)->i_flags;
 	unsigned int new_fl = 0;
+
+	WARN_ON_ONCE(IS_DAX(inode) && init);
 
 	if (flags & EXT4_SYNC_FL)
 		new_fl |= S_SYNC;
@@ -4835,8 +4605,13 @@
 		new_fl |= S_NOATIME;
 	if (flags & EXT4_DIRSYNC_FL)
 		new_fl |= S_DIRSYNC;
-	if (ext4_should_use_dax(inode))
+
+	/* Because of the way inode_set_flags() works we must preserve S_DAX
+	 * here if already set. */
+	new_fl |= (inode->i_flags & S_DAX);
+	if (init && ext4_should_enable_dax(inode))
 		new_fl |= S_DAX;
+
 	if (flags & EXT4_ENCRYPT_FL)
 		new_fl |= S_ENCRYPTED;
 	if (flags & EXT4_CASEFOLD_FL)
@@ -4877,8 +4652,7 @@
 		__le32 *magic = (void *)raw_inode +
 				EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
 
-		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
-		    EXT4_INODE_SIZE(inode->i_sb) &&
+		if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
 		    *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
 			ext4_set_inode_state(inode, EXT4_STATE_XATTR);
 			return ext4_find_inline_data_nolock(inode);
@@ -4937,7 +4711,7 @@
 	    (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
 		if (flags & EXT4_IGET_HANDLE)
 			return ERR_PTR(-ESTALE);
-		__ext4_error(sb, function, line,
+		__ext4_error(sb, function, line, EFSCORRUPTED, 0,
 			     "inode #%lu: comm %s: iget: illegal inode #",
 			     ino, current->comm);
 		return ERR_PTR(-EFSCORRUPTED);
@@ -4952,7 +4726,7 @@
 	ei = EXT4_I(inode);
 	iloc.bh = NULL;
 
-	ret = __ext4_get_inode_loc(inode, &iloc, 0);
+	ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
 	if (ret < 0)
 		goto bad_inode;
 	raw_inode = ext4_raw_inode(&iloc);
@@ -4998,9 +4772,11 @@
 					       sizeof(gen));
 	}
 
-	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
-		ext4_error_inode(inode, function, line, 0,
-				 "iget: checksum invalid");
+	if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
+	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
+	    (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
+		ext4_error_inode_err(inode, function, line, 0,
+				EFSBADCRC, "iget: checksum invalid");
 		ret = -EFSBADCRC;
 		goto bad_inode;
 	}
@@ -5049,7 +4825,7 @@
 	 * not initialized on a new filesystem. */
 	}
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
-	ext4_set_inode_flags(inode);
+	ext4_set_inode_flags(inode, true);
 	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
 	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
 	if (ext4_has_feature_64bit(sb))
@@ -5088,6 +4864,7 @@
 	for (block = 0; block < EXT4_N_BLOCKS; block++)
 		ei->i_data[block] = raw_inode->i_block[block];
 	INIT_LIST_HEAD(&ei->i_orphan);
+	ext4_fc_init_inode(&ei->vfs_inode);
 
 	/*
 	 * Set transaction id's of transactions that have to be committed
@@ -5153,9 +4930,10 @@
 		goto bad_inode;
 	} else if (!ext4_has_inline_data(inode)) {
 		/* validate the block references in the inode */
-		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-		    (S_ISLNK(inode->i_mode) &&
-		     !ext4_inode_is_fast_symlink(inode))) {
+		if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+		    (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		    (S_ISLNK(inode->i_mode) &&
+		     !ext4_inode_is_fast_symlink(inode)))) {
 			if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 				ret = ext4_ext_check_inode(inode);
 			else
@@ -5212,7 +4990,7 @@
 		goto bad_inode;
 	}
 	if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
-		EXT4_ERROR_INODE(inode,
+		ext4_error_inode(inode, function, line, 0,
 			 "casefold flag without casefold feature");
 	brelse(iloc.bh);
 
@@ -5264,21 +5042,22 @@
 	return 0;
 }
 
-struct other_inode {
-	unsigned long		orig_ino;
-	struct ext4_inode	*raw_inode;
-};
-
-static int other_inode_match(struct inode * inode, unsigned long ino,
-			     void *data)
+static void __ext4_update_other_inode_time(struct super_block *sb,
+					   unsigned long orig_ino,
+					   unsigned long ino,
+					   struct ext4_inode *raw_inode)
 {
-	struct other_inode *oi = (struct other_inode *) data;
+	struct inode *inode;
 
-	if ((inode->i_ino != ino) ||
-	    (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+	inode = find_inode_by_ino_rcu(sb, ino);
+	if (!inode)
+		return;
+
+	if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
 			       I_DIRTY_INODE)) ||
 	    ((inode->i_state & I_DIRTY_TIME) == 0))
-		return 0;
+		return;
+
 	spin_lock(&inode->i_lock);
 	if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
 				I_DIRTY_INODE)) == 0) &&
@@ -5289,16 +5068,15 @@
 		spin_unlock(&inode->i_lock);
 
 		spin_lock(&ei->i_raw_lock);
-		EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
-		EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
-		EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
-		ext4_inode_csum_set(inode, oi->raw_inode, ei);
+		EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+		EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+		EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+		ext4_inode_csum_set(inode, raw_inode, ei);
 		spin_unlock(&ei->i_raw_lock);
-		trace_ext4_other_inode_update_time(inode, oi->orig_ino);
-		return -1;
+		trace_ext4_other_inode_update_time(inode, orig_ino);
+		return;
 	}
 	spin_unlock(&inode->i_lock);
-	return -1;
 }
 
 /*
@@ -5308,24 +5086,24 @@
 static void ext4_update_other_inodes_time(struct super_block *sb,
 					  unsigned long orig_ino, char *buf)
 {
-	struct other_inode oi;
 	unsigned long ino;
 	int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
 	int inode_size = EXT4_INODE_SIZE(sb);
 
-	oi.orig_ino = orig_ino;
 	/*
 	 * Calculate the first inode in the inode table block.  Inode
 	 * numbers are one-based.  That is, the first inode in a block
 	 * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
 	 */
 	ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
+	rcu_read_lock();
 	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
 		if (ino == orig_ino)
 			continue;
-		oi.raw_inode = (struct ext4_inode *) buf;
-		(void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+		__ext4_update_other_inode_time(sb, orig_ino, ino,
+					       (struct ext4_inode *)buf);
 	}
+	rcu_read_unlock();
 }
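Example of the rounding performed before the loop (illustrative, 16 inodes per block):

/*
 *   orig_ino = 35: ino = ((35 - 1) & ~15) + 1 = 33, so the buffer holds
 *   inodes 33..48 and the loop visits the 15 other in-core inodes that
 *   share the block with the one being written.
 */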
 
 /*
@@ -5535,12 +5313,12 @@
 		if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
 			return 0;
 
-		err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+		err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
 						EXT4_I(inode)->i_sync_tid);
 	} else {
 		struct ext4_iloc iloc;
 
-		err = __ext4_get_inode_loc(inode, &iloc, 0);
+		err = __ext4_get_inode_loc_noinmem(inode, &iloc);
 		if (err)
 			return err;
 		/*
@@ -5550,8 +5328,8 @@
 		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
-			EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
-					       "IO error syncing inode");
+			ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
+					       "IO error syncing inode");
 			err = -EIO;
 		}
 		brelse(iloc.bh);
@@ -5664,6 +5442,7 @@
 		if (error)
 			return error;
 	}
+	ext4_fc_start_update(inode);
 	if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
 	    (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
 		handle_t *handle;
@@ -5687,6 +5466,7 @@
 
 		if (error) {
 			ext4_journal_stop(handle);
+			ext4_fc_stop_update(inode);
 			return error;
 		}
 		/* Update corresponding info in inode so that everything is in
@@ -5697,37 +5477,61 @@
 		inode->i_gid = attr->ia_gid;
 		error = ext4_mark_inode_dirty(handle, inode);
 		ext4_journal_stop(handle);
+		if (unlikely(error)) {
+			ext4_fc_stop_update(inode);
+			return error;
+		}
 	}
 
 	if (attr->ia_valid & ATTR_SIZE) {
 		handle_t *handle;
 		loff_t oldsize = inode->i_size;
-		int shrink = (attr->ia_size <= inode->i_size);
+		loff_t old_disksize;
+		int shrink = (attr->ia_size < inode->i_size);
 
 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
-			if (attr->ia_size > sbi->s_bitmap_maxbytes)
+			if (attr->ia_size > sbi->s_bitmap_maxbytes) {
+				ext4_fc_stop_update(inode);
 				return -EFBIG;
+			}
 		}
-		if (!S_ISREG(inode->i_mode))
+		if (!S_ISREG(inode->i_mode)) {
+			ext4_fc_stop_update(inode);
 			return -EINVAL;
+		}
 
 		if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
 			inode_inc_iversion(inode);
 
-		if (ext4_should_order_data(inode) &&
-		    (attr->ia_size < inode->i_size)) {
-			error = ext4_begin_ordered_truncate(inode,
+		if (shrink) {
+			if (ext4_should_order_data(inode)) {
+				error = ext4_begin_ordered_truncate(inode,
 							    attr->ia_size);
-			if (error)
-				goto err_out;
+				if (error)
+					goto err_out;
+			}
+			/*
+			 * Blocks are going to be removed from the inode. Wait
+			 * for dio in flight.
+			 */
+			inode_dio_wait(inode);
 		}
+
+		down_write(&EXT4_I(inode)->i_mmap_sem);
+
+		rc = ext4_break_layouts(inode);
+		if (rc) {
+			up_write(&EXT4_I(inode)->i_mmap_sem);
+			goto err_out;
+		}
+
 		if (attr->ia_size != inode->i_size) {
 			handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 			if (IS_ERR(handle)) {
 				error = PTR_ERR(handle);
-				goto err_out;
+				goto out_mmap_sem;
 			}
 			if (ext4_handle_valid(handle) && shrink) {
 				error = ext4_orphan_add(handle, inode);
@@ -5741,7 +5545,22 @@
 				inode->i_mtime = current_time(inode);
 				inode->i_ctime = inode->i_mtime;
 			}
+
+			if (shrink)
+				ext4_fc_track_range(handle, inode,
+					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+					inode->i_sb->s_blocksize_bits,
+					EXT_MAX_BLOCKS - 1);
+			else
+				ext4_fc_track_range(
+					handle, inode,
+					(oldsize > 0 ? oldsize - 1 : oldsize) >>
+					inode->i_sb->s_blocksize_bits,
+					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+					inode->i_sb->s_blocksize_bits);
+
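Worked example of the fast-commit ranges above (illustrative, s_blocksize_bits = 12):

/*
 *   shrink to ia_size = 5000:  track [4999 >> 12, EXT_MAX_BLOCKS - 1]
 *                              = [1, EXT_MAX_BLOCKS - 1]
 *   extend 5000 -> 20000:      track [4999 >> 12, 19999 >> 12] = [1, 4]
 *   i.e. a shrink invalidates everything from the new EOF block onward,
 *   while an extend only covers the blocks between old and new EOF.
 */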
 			down_write(&EXT4_I(inode)->i_data_sem);
+			old_disksize = EXT4_I(inode)->i_disksize;
 			EXT4_I(inode)->i_disksize = attr->ia_size;
 			rc = ext4_mark_inode_dirty(handle, inode);
 			if (!error)
@@ -5753,32 +5572,18 @@
 			 */
 			if (!error)
 				i_size_write(inode, attr->ia_size);
+			else
+				EXT4_I(inode)->i_disksize = old_disksize;
 			up_write(&EXT4_I(inode)->i_data_sem);
 			ext4_journal_stop(handle);
-			if (error) {
-				if (orphan && inode->i_nlink)
-					ext4_orphan_del(NULL, inode);
-				goto err_out;
+			if (error)
+				goto out_mmap_sem;
+			if (!shrink) {
+				pagecache_isize_extended(inode, oldsize,
+							 inode->i_size);
+			} else if (ext4_should_journal_data(inode)) {
+				ext4_wait_for_tail_page_commit(inode);
 			}
-		}
-		if (!shrink) {
-			pagecache_isize_extended(inode, oldsize, inode->i_size);
-		} else {
-			/*
-			 * Blocks are going to be removed from the inode. Wait
-			 * for dio in flight.
-			 */
-			inode_dio_wait(inode);
-		}
-		if (orphan && ext4_should_journal_data(inode))
-			ext4_wait_for_tail_page_commit(inode);
-		down_write(&EXT4_I(inode)->i_mmap_sem);
-
-		rc = ext4_break_layouts(inode);
-		if (rc) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
-			error = rc;
-			goto err_out;
 		}
 
 		/*
@@ -5786,11 +5591,16 @@
 		 * in data=journal mode to make pages freeable.
 		 */
 		truncate_pagecache(inode, inode->i_size);
-		if (shrink) {
+		/*
+		 * Call ext4_truncate() even if i_size didn't change to
+		 * truncate possible preallocated blocks.
+		 */
+		if (attr->ia_size <= oldsize) {
			rc = ext4_truncate(inode);
 			if (rc)
 				error = rc;
 		}
+out_mmap_sem:
 		up_write(&EXT4_I(inode)->i_mmap_sem);
 	}
 
....@@ -5824,7 +5636,8 @@
58245636 struct ext4_inode_info *ei = EXT4_I(inode);
58255637 unsigned int flags;
58265638
5827
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
5639
+ if ((request_mask & STATX_BTIME) &&
5640
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
58285641 stat->result_mask |= STATX_BTIME;
58295642 stat->btime.tv_sec = ei->i_crtime.tv_sec;
58305643 stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
....@@ -5993,7 +5806,14 @@
59935806 put_bh(iloc->bh);
59945807 return -EIO;
59955808 }
5996
- if (IS_I_VERSION(inode))
5809
+ ext4_fc_track_inode(handle, inode);
5810
+
5811
+ /*
5812
+ * ea_inodes are using i_version for storing reference count, don't
5813
+ * mess with it
5814
+ */
5815
+ if (IS_I_VERSION(inode) &&
5816
+ !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
59975817 inode_inc_iversion(inode);
59985818
59995819 /* the do_update_inode consumes one bh->b_count */
@@ -6107,9 +5927,8 @@
 	 * If this is felt to be critical, then e2fsck should be run to
 	 * force a large enough s_min_extra_isize.
 	 */
-	if (ext4_handle_valid(handle) &&
-	    jbd2_journal_extend(handle,
-				EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0)
+	if (ext4_journal_extend(handle,
+				EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
 		return -ENOSPC;
 
 	if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
@@ -6178,7 +5997,8 @@
  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
  * we start and wait on commits.
 */
-int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
+int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
+			    const char *func, unsigned int line)
 {
 	struct ext4_iloc iloc;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -6188,13 +6008,18 @@
 	trace_ext4_mark_inode_dirty(inode, _RET_IP_);
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	if (err)
-		return err;
+		goto out;
 
 	if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
 		ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize,
 					       iloc, handle);
 
-	return ext4_mark_iloc_dirty(handle, inode, &iloc);
+	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+out:
+	if (unlikely(err))
+		ext4_error_inode_err(inode, func, line, 0, err,
+					"mark_inode_dirty error");
+	return err;
 }
 
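The rename to __ext4_mark_inode_dirty() implies a caller-facing wrapper that supplies the call site, presumably provided in ext4.h by the same series (an assumption; the header is not part of this hunk):

#define ext4_mark_inode_dirty(handle, inode) \
	__ext4_mark_inode_dirty((handle), (inode), __func__, __LINE__)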
 /*
@@ -6231,36 +6056,6 @@
 out:
 	return;
 }
-
-#if 0
-/*
- * Bind an inode's backing buffer_head into this transaction, to prevent
- * it from being flushed to disk early.  Unlike
- * ext4_reserve_inode_write, this leaves behind no bh reference and
- * returns no iloc structure, so the caller needs to repeat the iloc
- * lookup to mark the inode dirty later.
- */
-static int ext4_pin_inode(handle_t *handle, struct inode *inode)
-{
-	struct ext4_iloc iloc;
-
-	int err = 0;
-	if (handle) {
-		err = ext4_get_inode_loc(inode, &iloc);
-		if (!err) {
-			BUFFER_TRACE(iloc.bh, "get_write_access");
-			err = jbd2_journal_get_write_access(handle, iloc.bh);
-			if (!err)
-				err = ext4_handle_dirty_metadata(handle,
-								 NULL,
-								 iloc.bh);
-			brelse(iloc.bh);
-		}
-	}
-	ext4_std_error(inode->i_sb, err);
-	return err;
-}
-#endif
 
 int ext4_change_inode_journal_flag(struct inode *inode, int val)
 {
@@ -6341,6 +6136,8 @@
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
+	ext4_fc_mark_ineligible(inode->i_sb,
+		EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
 	err = ext4_mark_inode_dirty(handle, inode);
 	ext4_handle_sync(handle);
 	ext4_journal_stop(handle);
@@ -6354,13 +6151,14 @@
 	return !buffer_mapped(bh);
 }
 
-int ext4_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page = vmf->page;
 	loff_t size;
 	unsigned long len;
-	int ret;
+	int err;
+	vm_fault_t ret;
 	struct file *file = vma->vm_file;
 	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
@@ -6376,18 +6174,26 @@
 
 	down_read(&EXT4_I(inode)->i_mmap_sem);
 
-	ret = ext4_convert_inline_data(inode);
-	if (ret)
+	err = ext4_convert_inline_data(inode);
+	if (err)
 		goto out_ret;
+
+	/*
+	 * On data journalling we skip straight to the transaction handle:
+	 * there's no delalloc; page truncated will be checked later; the
+	 * early return w/ all buffers mapped (calculates size/len) can't
+	 * be used; and there's no dioread_nolock, so only ext4_get_block.
+	 */
+	if (ext4_should_journal_data(inode))
+		goto retry_alloc;
 
 	/* Delalloc case is easy... */
 	if (test_opt(inode->i_sb, DELALLOC) &&
-	    !ext4_should_journal_data(inode) &&
 	    !ext4_nonda_switch(inode->i_sb)) {
 		do {
-			ret = block_page_mkwrite(vma, vmf,
+			err = block_page_mkwrite(vma, vmf,
 						   ext4_da_get_block_prep);
-		} while (ret == -ENOSPC &&
+		} while (err == -ENOSPC &&
 			 ext4_should_retry_alloc(inode->i_sb, &retries));
 		goto out_ret;
 	}
@@ -6408,6 +6214,9 @@
 	/*
 	 * Return if we have all the buffers mapped. This avoids the need to do
 	 * journal_start/journal_stop which can block and take a long time
+	 *
+	 * This cannot be done for data journalling, as we have to add the
+	 * inode to the transaction's list to writeprotect pages on commit.
 	 */
 	if (page_has_buffers(page)) {
 		if (!ext4_walk_page_buffers(NULL, page_buffers(page),
@@ -6432,36 +6241,67 @@
 		ret = VM_FAULT_SIGBUS;
 		goto out;
 	}
-	ret = block_page_mkwrite(vma, vmf, get_block);
-	if (!ret && ext4_should_journal_data(inode)) {
-		if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
-			  PAGE_SIZE, NULL, do_journal_get_write_access)) {
-			unlock_page(page);
-			ret = VM_FAULT_SIGBUS;
-			ext4_journal_stop(handle);
-			goto out;
+	/*
+	 * Data journalling can't use block_page_mkwrite() because it
+	 * will set_buffer_dirty() before do_journal_get_write_access()
+	 * thus might hit warning messages for dirty metadata buffers.
+	 */
+	if (!ext4_should_journal_data(inode)) {
+		err = block_page_mkwrite(vma, vmf, get_block);
+	} else {
+		lock_page(page);
+		size = i_size_read(inode);
+		/* Page got truncated from under us? */
+		if (page->mapping != mapping || page_offset(page) > size) {
+			ret = VM_FAULT_NOPAGE;
+			goto out_error;
 		}
-		ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+
+		if (page->index == size >> PAGE_SHIFT)
+			len = size & ~PAGE_MASK;
+		else
+			len = PAGE_SIZE;
+
+		err = __block_write_begin(page, 0, len, ext4_get_block);
+		if (!err) {
+			ret = VM_FAULT_SIGBUS;
+			if (ext4_walk_page_buffers(handle, page_buffers(page),
+					0, len, NULL, do_journal_get_write_access))
+				goto out_error;
+			if (ext4_walk_page_buffers(handle, page_buffers(page),
+					0, len, NULL, write_end_fn))
+				goto out_error;
+			if (ext4_jbd2_inode_add_write(handle, inode,
+						      page_offset(page), len))
+				goto out_error;
+			ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+		} else {
+			unlock_page(page);
+		}
 	}
 	ext4_journal_stop(handle);
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+	if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry_alloc;
 out_ret:
-	ret = block_page_mkwrite_return(ret);
+	ret = block_page_mkwrite_return(err);
 out:
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
+out_error:
+	unlock_page(page);
+	ext4_journal_stop(handle);
+	goto out;
 }
 
-int ext4_filemap_fault(struct vm_fault *vmf)
+vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
 {
 	struct inode *inode = file_inode(vmf->vma->vm_file);
-	int err;
+	vm_fault_t ret;
 
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	err = filemap_fault(vmf);
+	ret = filemap_fault(vmf);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 
-	return err;
+	return ret;
 }
64676307 }