2024-05-10 23fa18eaa71266feff7ba8d83022d9e1cc83c65a
kernel/fs/ext4/inode.c
@@ -49,8 +49,6 @@
 #include <trace/events/ext4.h>
 #include <trace/events/android_fs.h>

-#define MPAGE_DA_EXTENT_TAIL 0x01
-
 static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
                              struct ext4_inode_info *ei)
 {
@@ -104,8 +102,8 @@
        return provided == calculated;
 }

-static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
-                               struct ext4_inode_info *ei)
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+                        struct ext4_inode_info *ei)
 {
        __u32 csum;

@@ -165,32 +163,6 @@
 }

 /*
- * Restart the transaction associated with *handle. This does a commit,
- * so before we call here everything must be consistently dirtied against
- * this transaction.
- */
-int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
-                               int nblocks)
-{
-       int ret;
-
-       /*
-        * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
-        * moment, get_block can be called only for blocks inside i_size since
-        * page cache has been already dropped and writes are blocked by
-        * i_mutex. So we can safely drop the i_data_sem here.
-        */
-       BUG_ON(EXT4_JOURNAL(inode) == NULL);
-       jbd_debug(2, "restarting handle %p\n", handle);
-       up_write(&EXT4_I(inode)->i_data_sem);
-       ret = ext4_journal_restart(handle, nblocks);
-       down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
-
-       return ret;
-}
-
-/*
  * Called at the last iput() if i_nlink is zero.
  */
 void ext4_evict_inode(struct inode *inode)
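
Note: nothing in this file replaces ext4_truncate_restart_trans() directly; in mainline trees of this vintage the truncate paths instead extend the running handle in place. A rough sketch of that pattern, assuming the ext4_journal_ensure_credits() helper from ext4_jbd2.h (an assumption; it is not part of this diff):

    /*
     * Assumed helper: returns 0 if the handle already has enough credits,
     * 1 if the transaction had to be restarted, and < 0 on error.
     */
    ret = ext4_journal_ensure_credits(handle, credits, revoke_credits);
    if (ret < 0)
            return ret;
    if (ret > 0)    /* transaction restarted: rescan state tied to it */
            ext4_discard_preallocations(inode, 0);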
@@ -208,6 +180,8 @@

        trace_ext4_evict_inode(inode);

+       if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
+               ext4_evict_ea_inode(inode);
        if (inode->i_nlink) {
                /*
                 * When journalling data dirty buffers are tracked only in the
@@ -249,6 +223,16 @@
        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages_final(&inode->i_data);
+
+       /*
+        * For inodes with journalled data, transaction commit could have
+        * dirtied the inode. And for inodes with dioread_nolock, unwritten
+        * extents converting worker could merge extents and also have dirtied
+        * the inode. Flush worker is ignoring it because of I_FREEING flag but
+        * we still need to remove the inode from the writeback lists.
+        */
+       if (!list_empty_careful(&inode->i_io_list))
+               inode_io_list_del(inode);

        /*
         * Protect us against freezing - iput() caller didn't have to have any
@@ -305,9 +289,9 @@
        if (inode->i_blocks) {
                err = ext4_truncate(inode);
                if (err) {
-                       ext4_error(inode->i_sb,
-                                  "couldn't truncate inode %lu (err %d)",
-                                  inode->i_ino, err);
+                       ext4_error_err(inode->i_sb, -err,
+                                      "couldn't truncate inode %lu (err %d)",
+                                      inode->i_ino, err);
                        goto stop_handle;
                }
        }
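
ext4_error_err() used above is not defined in this file; it behaves like ext4_error() but additionally records a positive errno in the superblock error fields. A hedged sketch of its contract, assuming the mainline definition of this era:

    /* assumed interface, per mainline ext4.h (illustrative only) */
    void ext4_error_err(struct super_block *sb, int err, /* positive errno */
                        const char *fmt, ...);
    /* like ext4_error(), plus saving err into the on-disk superblock
     * (s_first_error_errcode / s_last_error_errcode); hence the -err at
     * the call site above, where err holds a negative return value. */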
@@ -355,6 +339,14 @@
        ext4_xattr_inode_array_free(ea_inode_array);
        return;
 no_delete:
+       /*
+        * Check whether somewhere else accidentally dirtied the evicting
+        * inode; that could cause inode use-after-free issues later.
+        */
+       WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));
+
+       if (!list_empty(&EXT4_I(inode)->i_fc_list))
+               ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
 }

@@ -410,8 +402,8 @@
         * inode's preallocations.
         */
        if ((ei->i_reserved_data_blocks == 0) &&
-           (atomic_read(&inode->i_writecount) == 0))
-               ext4_discard_preallocations(inode);
+           !inode_is_open_for_write(inode))
+               ext4_discard_preallocations(inode, 0);
 }

 static int __check_block_validity(struct inode *inode, const char *func,
@@ -437,7 +429,7 @@
 {
        int ret;

-       if (IS_ENCRYPTED(inode))
+       if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
                return fscrypt_zeroout_range(inode, lblk, pblk, len);

        ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
@@ -469,11 +461,9 @@
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-               retval = ext4_ext_map_blocks(handle, inode, map, flags &
-                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+               retval = ext4_ext_map_blocks(handle, inode, map, 0);
        } else {
-               retval = ext4_ind_map_blocks(handle, inode, map, flags &
-                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+               retval = ext4_ind_map_blocks(handle, inode, map, 0);
        }
        up_read((&EXT4_I(inode)->i_data_sem));

@@ -530,9 +520,8 @@
 #endif

        map->m_flags = 0;
-       ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
-                 "logical block %lu\n", inode->i_ino, flags, map->m_len,
-                 (unsigned long) map->m_lblk);
+       ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
+                 flags, map->m_len, (unsigned long) map->m_lblk);

        /*
         * ext4_map_blocks returns an int, and m_len is an unsigned int
@@ -545,7 +534,8 @@
                return -EFSCORRUPTED;

        /* Lookup extent status tree firstly */
-       if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+       if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
+           ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
                        map->m_pblk = ext4_es_pblock(&es) +
                                        map->m_lblk - es.es_lblk;
@@ -563,7 +553,7 @@
                        map->m_len = retval;
                        retval = 0;
                } else {
-                       BUG_ON(1);
+                       BUG();
                }
 #ifdef ES_AGGRESSIVE_TEST
                ext4_map_blocks_es_recheck(handle, inode, map,
@@ -578,11 +568,9 @@
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-               retval = ext4_ext_map_blocks(handle, inode, map, flags &
-                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+               retval = ext4_ext_map_blocks(handle, inode, map, 0);
        } else {
-               retval = ext4_ind_map_blocks(handle, inode, map, flags &
-                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+               retval = ext4_ind_map_blocks(handle, inode, map, 0);
        }
        if (retval > 0) {
                unsigned int status;
@@ -599,8 +587,8 @@
                        EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                    !(status & EXTENT_STATUS_WRITTEN) &&
-                   ext4_find_delalloc_range(inode, map->m_lblk,
-                                            map->m_lblk + map->m_len - 1))
+                   ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+                                      map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
                ret = ext4_es_insert_extent(inode, map->m_lblk,
                                            map->m_len, map->m_pblk, status);
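
ext4_find_delalloc_range() is gone; the delalloc test is now spelled with the generic extent-status scan plus a predicate. Assuming the mainline prototypes from extents_status.h (not shown in this diff):

    bool ext4_es_scan_range(struct inode *inode,
                            int (*matching_fn)(struct extent_status *es),
                            ext4_lblk_t lblk, ext4_lblk_t end);

    /* true iff any extent overlapping [lblk, end] satisfies matching_fn;
     * paired with ext4_es_is_delayed() it reproduces the old helper. */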
@@ -667,16 +655,6 @@
                         */
                        ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
                }
-
-               /*
-                * Update reserved blocks/metadata blocks after successful
-                * block allocation which had been deferred till now. We don't
-                * support fallocate for non extent files. So we can update
-                * reserve space here.
-                */
-               if ((retval > 0) &&
-                       (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
-                       ext4_da_update_reserve_space(inode, retval, 1);
        }

        if (retval > 0) {
@@ -700,8 +678,6 @@
        if (flags & EXT4_GET_BLOCKS_ZERO &&
            map->m_flags & EXT4_MAP_MAPPED &&
            map->m_flags & EXT4_MAP_NEW) {
-               clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
-                                  map->m_len);
                ret = ext4_issue_zeroout(inode, map->m_lblk,
                                         map->m_pblk, map->m_len);
                if (ret) {
@@ -715,7 +691,7 @@
                 * extent status tree.
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
-                   ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+                   ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                        if (ext4_es_is_written(&es))
                                goto out_sem;
                }
@@ -723,8 +699,8 @@
                        EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                    !(status & EXTENT_STATUS_WRITTEN) &&
-                   ext4_find_delalloc_range(inode, map->m_lblk,
-                                            map->m_lblk + map->m_len - 1))
+                   ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+                                      map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
                ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                            map->m_pblk, status);
@@ -765,6 +741,12 @@
                        return ret;
                }
        }
+       if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
+                               map->m_flags & EXT4_MAP_MAPPED))
+               ext4_fc_track_range(handle, inode, map->m_lblk,
+                                       map->m_lblk + map->m_len - 1);
+       if (retval < 0)
+               ext_debug(inode, "failed with err %d\n", retval);
        return retval;
 }

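
The fast-commit calls above come from the then-new fast commit machinery. The tracking hook's shape, matching how the hunk uses it (the declaration itself lives in ext4.h, outside this diff):

    /* assumed declaration: record [start, end] as modified so a fast
     * commit can replay just this range instead of a full journal commit */
    void ext4_fc_track_range(handle_t *handle, struct inode *inode,
                             ext4_lblk_t start, ext4_lblk_t end);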
@@ -847,136 +829,6 @@
 #define DIO_MAX_BLOCKS 4096

 /*
- * Get blocks function for the cases that need to start a transaction -
- * generally difference cases of direct IO and DAX IO. It also handles retries
- * in case of ENOSPC.
- */
-static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
-                               struct buffer_head *bh_result, int flags)
-{
-       int dio_credits;
-       handle_t *handle;
-       int retries = 0;
-       int ret;
-
-       /* Trim mapping request to maximum we can map at once for DIO */
-       if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
-               bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
-       dio_credits = ext4_chunk_trans_blocks(inode,
-                                             bh_result->b_size >> inode->i_blkbits);
-retry:
-       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
-
-       ret = _ext4_get_block(inode, iblock, bh_result, flags);
-       ext4_journal_stop(handle);
-
-       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-       return ret;
-}
-
-/* Get block function for DIO reads and writes to inodes without extents */
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-                      struct buffer_head *bh, int create)
-{
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       if (!create)
-               return _ext4_get_block(inode, iblock, bh, 0);
-       return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
-}
-
-/*
- * Get block function for AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete.
- */
-static int ext4_dio_get_block_unwritten_async(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-       /*
-        * When doing DIO using unwritten extents, we need io_end to convert
-        * unwritten extents to written on IO completion. We allocate io_end
-        * once we spot unwritten extent and store it in b_private. Generic
-        * DIO code keeps b_private set and furthermore passes the value to
-        * our completion callback in 'private' argument.
-        */
-       if (!ret && buffer_unwritten(bh_result)) {
-               if (!bh_result->b_private) {
-                       ext4_io_end_t *io_end;
-
-                       io_end = ext4_init_io_end(inode, GFP_KERNEL);
-                       if (!io_end)
-                               return -ENOMEM;
-                       bh_result->b_private = io_end;
-                       ext4_set_io_unwritten_flag(inode, io_end);
-               }
-               set_buffer_defer_completion(bh_result);
-       }
-
-       return ret;
-}
-
-/*
- * Get block function for non-AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete by ext4_direct_IO_write().
- */
-static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-       /*
-        * Mark inode as having pending DIO writes to unwritten extents.
-        * ext4_direct_IO_write() checks this flag and converts extents to
-        * written.
-        */
-       if (!ret && buffer_unwritten(bh_result))
-               ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-
-       return ret;
-}
-
-static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
-               struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
-                  inode->i_ino, create);
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = _ext4_get_block(inode, iblock, bh_result, 0);
-       /*
-        * Blocks should have been preallocated! ext4_file_write_iter() checks
-        * that.
-        */
-       WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-
-       return ret;
-}
-
-
-/*
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -987,7 +839,8 @@
        int create = map_flags & EXT4_GET_BLOCKS_CREATE;
        int err;

-       J_ASSERT(handle != NULL || create == 0);
+       J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+                   || handle != NULL || create == 0);

        map.m_lblk = block;
        map.m_len = 1;
@@ -1003,7 +856,8 @@
                return ERR_PTR(-ENOMEM);
        if (map.m_flags & EXT4_MAP_NEW) {
                J_ASSERT(create != 0);
-               J_ASSERT(handle != NULL);
+               J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+                        || (handle != NULL));

                /*
                 * Now that we do not always journal data, we should
@@ -1040,18 +894,20 @@
                               ext4_lblk_t block, int map_flags)
 {
        struct buffer_head *bh;
+       int ret;

        bh = ext4_getblk(handle, inode, block, map_flags);
        if (IS_ERR(bh))
                return bh;
-       if (!bh || buffer_uptodate(bh))
+       if (!bh || ext4_buffer_uptodate(bh))
                return bh;
-       ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
-       wait_on_buffer(bh);
-       if (buffer_uptodate(bh))
-               return bh;
-       put_bh(bh);
-       return ERR_PTR(-EIO);
+
+       ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
+       if (ret) {
+               put_bh(bh);
+               return ERR_PTR(ret);
+       }
+       return bh;
 }

 /* Read a contiguous batch of blocks. */
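
Two small helpers replace the open-coded ll_rw_block() sequence here: ext4_buffer_uptodate() treats a prior write I/O error as not-uptodate so the block gets re-read instead of served stale, and ext4_read_bh_lock() locks the buffer and issues the read, waiting only if asked. A sketch assuming the mainline definitions (they live in ext4.h and super.c, outside this diff):

    static inline bool ext4_buffer_uptodate(struct buffer_head *bh)
    {
            /* a failed write intentionally leaves the buffer !uptodate */
            if (buffer_write_io_error(bh))
                    clear_buffer_uptodate(bh);
            return buffer_uptodate(bh);
    }

    int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait)
    {
            lock_buffer(bh);
            if (!wait) {
                    ext4_read_bh_nowait(bh, op_flags, NULL);
                    return 0;
            }
            /* submits if needed, waits, returns 0 or -EIO */
            return ext4_read_bh(bh, op_flags, NULL);
    }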
@@ -1071,9 +927,8 @@

        for (i = 0; i < bh_count; i++)
                /* Note that NULL bhs[i] is valid because of holes. */
-               if (bhs[i] && !buffer_uptodate(bhs[i]))
-                       ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
-                                   &bhs[i]);
+               if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
+                       ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);

        if (!wait)
                return 0;
@@ -1190,8 +1045,9 @@
        int err = 0;
        unsigned blocksize = inode->i_sb->s_blocksize;
        unsigned bbits;
-       struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
-       bool decrypt = false;
+       struct buffer_head *bh, *head, *wait[2];
+       int nr_wait = 0;
+       int i;

        BUG_ON(!PageLocked(page));
        BUG_ON(from > PAGE_SIZE);
@@ -1222,7 +1078,6 @@
                        if (err)
                                break;
                        if (buffer_new(bh)) {
-                               clean_bdev_bh_alias(bh);
                                if (PageUptodate(page)) {
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
@@ -1243,23 +1098,33 @@
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                    (block_start < from || block_end > to)) {
-                       ll_rw_block(REQ_OP_READ, 0, 1, &bh);
-                       *wait_bh++ = bh;
-                       decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
+                       ext4_read_bh_lock(bh, 0, false);
+                       wait[nr_wait++] = bh;
                }
        }
        /*
         * If we issued read requests, let them complete.
         */
-       while (wait_bh > wait) {
-               wait_on_buffer(*--wait_bh);
-               if (!buffer_uptodate(*wait_bh))
+       for (i = 0; i < nr_wait; i++) {
+               wait_on_buffer(wait[i]);
+               if (!buffer_uptodate(wait[i]))
                        err = -EIO;
        }
-       if (unlikely(err))
+       if (unlikely(err)) {
                page_zero_new_buffers(page, from, to);
-       else if (decrypt)
-               err = fscrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0);
+       } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
+               for (i = 0; i < nr_wait; i++) {
+                       int err2;
+
+                       err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+                                                               bh_offset(wait[i]));
+                       if (err2) {
+                               clear_buffer_uptodate(wait[i]);
+                               err = err2;
+                       }
+               }
+       }
+
        return err;
 }
 #endif
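
The decryption change is what lets this path survive blocksize < PAGE_SIZE: only the blocks that were actually read get decrypted, at their offset within the page. Contrasting the two calls (fscrypt API of this era; bh is one of the waited-on buffer heads):

    /* old: decrypt the whole page; only valid if every block was read */
    err = fscrypt_decrypt_pagecache_blocks(page, PAGE_SIZE, 0);

    /* new: decrypt one block at bh_offset(bh) bytes into the page */
    err = fscrypt_decrypt_pagecache_blocks(page, blocksize, bh_offset(bh));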
@@ -1319,6 +1184,13 @@
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
+       /*
+        * The same as page allocation, we prealloc buffer heads before
+        * starting the handle.
+        */
+       if (!page_has_buffers(page))
+               create_empty_buffers(page, inode->i_sb->s_blocksize, 0);
+
        unlock_page(page);

 retry_journal:
@@ -1433,7 +1305,8 @@

        trace_android_fs_datawrite_end(inode, pos, len);
        trace_ext4_write_end(inode, pos, len, copied);
-       if (inline_data) {
+       if (inline_data &&
+           ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                ret = ext4_write_inline_data_end(inode, pos, len,
                                                 copied, page);
                if (ret < 0) {
@@ -1442,6 +1315,7 @@
                        goto errout;
                }
                copied = ret;
+               ret = 0;
        } else
                copied = block_write_end(file, mapping, pos,
                                         len, copied, page, fsdata);
@@ -1466,15 +1340,16 @@
         * filesystems.
         */
        if (i_size_changed || inline_data)
-               ext4_mark_inode_dirty(handle, inode);
+               ret = ext4_mark_inode_dirty(handle, inode);

+errout:
        if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
                 */
                ext4_orphan_add(handle, inode);
-errout:
+
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
@@ -1558,6 +1433,7 @@
                        goto errout;
                }
                copied = ret;
+               ret = 0;
        } else if (unlikely(copied < len) && !PageUptodate(page)) {
                copied = 0;
                ext4_journalled_zero_new_buffers(handle, page, from, to);
@@ -1587,6 +1463,7 @@
                ret = ret2;
        }

+errout:
        if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
@@ -1594,7 +1471,6 @@
                 */
                ext4_orphan_add(handle, inode);

-errout:
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
@@ -1643,7 +1519,7 @@
        return 0;       /* success */
 }

-static void ext4_da_release_space(struct inode *inode, int to_free)
+void ext4_da_release_space(struct inode *inode, int to_free)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1678,64 +1554,6 @@
        dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
 }

-static void ext4_da_page_release_reservation(struct page *page,
-                                            unsigned int offset,
-                                            unsigned int length)
-{
-       int to_release = 0, contiguous_blks = 0;
-       struct buffer_head *head, *bh;
-       unsigned int curr_off = 0;
-       struct inode *inode = page->mapping->host;
-       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       unsigned int stop = offset + length;
-       int num_clusters;
-       ext4_fsblk_t lblk;
-
-       BUG_ON(stop > PAGE_SIZE || stop < length);
-
-       head = page_buffers(page);
-       bh = head;
-       do {
-               unsigned int next_off = curr_off + bh->b_size;
-
-               if (next_off > stop)
-                       break;
-
-               if ((offset <= curr_off) && (buffer_delay(bh))) {
-                       to_release++;
-                       contiguous_blks++;
-                       clear_buffer_delay(bh);
-               } else if (contiguous_blks) {
-                       lblk = page->index <<
-                              (PAGE_SHIFT - inode->i_blkbits);
-                       lblk += (curr_off >> inode->i_blkbits) -
-                               contiguous_blks;
-                       ext4_es_remove_extent(inode, lblk, contiguous_blks);
-                       contiguous_blks = 0;
-               }
-               curr_off = next_off;
-       } while ((bh = bh->b_this_page) != head);
-
-       if (contiguous_blks) {
-               lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
-               lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
-               ext4_es_remove_extent(inode, lblk, contiguous_blks);
-       }
-
-       /* If we have released all the blocks belonging to a cluster, then we
-        * need to release the reserved space for that cluster. */
-       num_clusters = EXT4_NUM_B2C(sbi, to_release);
-       while (num_clusters > 0) {
-               lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
-                       ((num_clusters - 1) << sbi->s_cluster_bits);
-               if (sbi->s_cluster_ratio == 1 ||
-                   !ext4_find_delalloc_cluster(inode, lblk))
-                       ext4_da_release_space(inode, 1);
-
-               num_clusters--;
-       }
-}
-
 /*
  * Delayed allocation stuff
  */
@@ -1755,6 +1573,7 @@
        struct ext4_map_blocks map;
        struct ext4_io_submit io_submit;        /* IO submission data */
        unsigned int do_map:1;
+       unsigned int scanned_until_end:1;
 };

@@ -1770,13 +1589,21 @@
        if (mpd->first_page >= mpd->next_page)
                return;

+       mpd->scanned_until_end = 0;
        index = mpd->first_page;
        end   = mpd->next_page - 1;
        if (invalidate) {
                ext4_lblk_t start, last;
                start = index << (PAGE_SHIFT - inode->i_blkbits);
                last = end << (PAGE_SHIFT - inode->i_blkbits);
+
+               /*
+                * avoid racing with extent status tree scans made by
+                * ext4_insert_delayed_block()
+                */
+               down_write(&EXT4_I(inode)->i_data_sem);
                ext4_es_remove_extent(inode, start, last - start + 1);
+               up_write(&EXT4_I(inode)->i_data_sem);
        }

        pagevec_init(&pvec);
@@ -1829,6 +1656,70 @@
 }

 /*
+ * ext4_insert_delayed_block - adds a delayed block to the extents status
+ *                             tree, incrementing the reserved cluster/block
+ *                             count or making a pending reservation
+ *                             where needed
+ *
+ * @inode - file containing the newly added block
+ * @lblk - logical block to be added
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       int ret;
+       bool allocated = false;
+       bool reserved = false;
+
+       /*
+        * If the cluster containing lblk is shared with a delayed,
+        * written, or unwritten extent in a bigalloc file system, it's
+        * already been accounted for and does not need to be reserved.
+        * A pending reservation must be made for the cluster if it's
+        * shared with a written or unwritten extent and doesn't already
+        * have one. Written and unwritten extents can be purged from the
+        * extents status tree if the system is under memory pressure, so
+        * it's necessary to examine the extent tree if a search of the
+        * extents status tree doesn't get a match.
+        */
+       if (sbi->s_cluster_ratio == 1) {
+               ret = ext4_da_reserve_space(inode);
+               if (ret != 0)   /* ENOSPC */
+                       goto errout;
+               reserved = true;
+       } else {   /* bigalloc */
+               if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
+                       if (!ext4_es_scan_clu(inode,
+                                             &ext4_es_is_mapped, lblk)) {
+                               ret = ext4_clu_mapped(inode,
+                                                     EXT4_B2C(sbi, lblk));
+                               if (ret < 0)
+                                       goto errout;
+                               if (ret == 0) {
+                                       ret = ext4_da_reserve_space(inode);
+                                       if (ret != 0)   /* ENOSPC */
+                                               goto errout;
+                                       reserved = true;
+                               } else {
+                                       allocated = true;
+                               }
+                       } else {
+                               allocated = true;
+                       }
+               }
+       }
+
+       ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+       if (ret && reserved)
+               ext4_da_release_space(inode, 1);
+
+errout:
+       return ret;
+}
+
+/*
  * This function grabs code from the very beginning of
  * ext4_map_blocks, but assumes that the caller is from delayed write
  * time. This function looks up the requested blocks and sets the
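
The helper funnels into ext4_es_insert_delayed_block(), whose third argument is what the reserved/allocated bookkeeping above computes. Assuming the mainline prototype from extents_status.h (not shown in this diff):

    int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
                                     bool allocated);

    /* allocated == false: the cluster got a fresh reservation above, so
     * the new delayed extent is counted against reserved clusters;
     * allocated == true: the cluster is already backed by a mapping, so
     * only the delayed-extent record is inserted, nothing re-reserved. */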
@@ -1851,12 +1742,11 @@
        invalid_block = ~0;

        map->m_flags = 0;
-       ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
-                 "logical block %lu\n", inode->i_ino, map->m_len,
+       ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
                  (unsigned long) map->m_lblk);

        /* Lookup extent status tree firstly */
-       if (ext4_es_lookup_extent(inode, iblock, &es)) {
+       if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
                if (ext4_es_is_hole(&es)) {
                        retval = 0;
                        down_read(&EXT4_I(inode)->i_data_sem);
@@ -1884,7 +1774,7 @@
                else if (ext4_es_is_unwritten(&es))
                        map->m_flags |= EXT4_MAP_UNWRITTEN;
                else
-                       BUG_ON(1);
+                       BUG();

 #ifdef ES_AGGRESSIVE_TEST
                ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
@@ -1907,28 +1797,14 @@
 add_delayed:
        if (retval == 0) {
                int ret;
+
                /*
                 * XXX: __block_prepare_write() unmaps passed block,
                 * is it OK?
                 */
-               /*
-                * If the block was allocated from previously allocated cluster,
-                * then we don't need to reserve it again. However we still need
-                * to reserve metadata for every block we're going to write.
-                */
-               if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
-                   !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
-                       ret = ext4_da_reserve_space(inode);
-                       if (ret) {
-                               /* not enough space to reserve */
-                               retval = ret;
-                               goto out_unlock;
-                       }
-               }

-               ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
-                                           ~0, EXTENT_STATUS_DELAYED);
-               if (ret) {
+               ret = ext4_insert_delayed_block(inode, map->m_lblk);
+               if (ret != 0) {
                        retval = ret;
                        goto out_unlock;
                }
@@ -2088,6 +1964,9 @@
        }
        if (ret == 0)
                ret = err;
+       err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
+       if (ret == 0)
+               ret = err;
        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
        err = ext4_journal_stop(handle);
        if (!ret)
@@ -2169,6 +2048,15 @@
                len = size & ~PAGE_MASK;
        else
                len = PAGE_SIZE;
+
+       /* Should never happen but for bugs in other kernel subsystems */
+       if (!page_has_buffers(page)) {
+               ext4_warning_inode(inode,
+                  "page %lu does not have buffers attached", page->index);
+               ClearPageDirty(page);
+               unlock_page(page);
+               return 0;
+       }

        page_bufs = page_buffers(page);
        /*
@@ -2262,7 +2150,7 @@
        return err;
 }

-#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))

 /*
  * mballoc gives us at most this number of blocks...
@@ -2372,7 +2260,84 @@
                if (err < 0)
                        return err;
        }
-       return lblk < blocks;
+       if (lblk >= blocks) {
+               mpd->scanned_until_end = 1;
+               return 0;
+       }
+       return 1;
+}
+
+/*
+ * mpage_process_page - update page buffers corresponding to changed extent and
+ *                      may submit fully mapped page for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ * @m_lblk - logical block mapping.
+ * @m_pblk - corresponding physical mapping.
+ * @map_bh - determines on return whether this page requires any further
+ *           mapping or not.
+ * Scan given page buffers corresponding to changed extent and update buffer
+ * state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits.
+ * If the given page is not fully mapped, we update @map to the next extent in
+ * the given page that needs mapping & return @map_bh as true.
+ */
+static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+                             ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+                             bool *map_bh)
+{
+       struct buffer_head *head, *bh;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       ext4_lblk_t lblk = *m_lblk;
+       ext4_fsblk_t pblock = *m_pblk;
+       int err = 0;
+       int blkbits = mpd->inode->i_blkbits;
+       ssize_t io_end_size = 0;
+       struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+
+       bh = head = page_buffers(page);
+       do {
+               if (lblk < mpd->map.m_lblk)
+                       continue;
+               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+                       /*
+                        * Buffer after end of mapped extent.
+                        * Find next buffer in the page to map.
+                        */
+                       mpd->map.m_len = 0;
+                       mpd->map.m_flags = 0;
+                       io_end_vec->size += io_end_size;
+                       io_end_size = 0;
+
+                       err = mpage_process_page_bufs(mpd, head, bh, lblk);
+                       if (err > 0)
+                               err = 0;
+                       if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+                               io_end_vec = ext4_alloc_io_end_vec(io_end);
+                               if (IS_ERR(io_end_vec)) {
+                                       err = PTR_ERR(io_end_vec);
+                                       goto out;
+                               }
+                               io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
+                       }
+                       *map_bh = true;
+                       goto out;
+               }
+               if (buffer_delay(bh)) {
+                       clear_buffer_delay(bh);
+                       bh->b_blocknr = pblock++;
+               }
+               clear_buffer_unwritten(bh);
+               io_end_size += (1 << blkbits);
+       } while (lblk++, (bh = bh->b_this_page) != head);
+
+       io_end_vec->size += io_end_size;
+       io_end_size = 0;
+       *map_bh = false;
+out:
+       *m_lblk = lblk;
+       *m_pblk = pblock;
+       return err;
 }

 /*
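
The contract the following hunks rely on: mpage_process_page() advances *m_lblk and *m_pblk past whatever it mapped, and sets *map_bh when the page still contains buffers needing a mapping, so the caller knows whether to submit the page or go map another extent first. The caller-side shape, restated from the next hunk:

    err = mpage_process_page(mpd, page, &lblk, &pblock, &map_bh);
    if (err < 0 || map_bh)
            goto out;       /* error, or page needs further mapping */
    err = mpage_submit_page(mpd, page);     /* fully mapped: submit it */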
@@ -2394,12 +2359,12 @@
        struct pagevec pvec;
        int nr_pages, i;
        struct inode *inode = mpd->inode;
-       struct buffer_head *head, *bh;
        int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
        pgoff_t start, end;
        ext4_lblk_t lblk;
-       sector_t pblock;
+       ext4_fsblk_t pblock;
        int err;
+       bool map_bh = false;

        start = mpd->map.m_lblk >> bpp_bits;
        end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2415,50 +2380,19 @@
        for (i = 0; i < nr_pages; i++) {
                struct page *page = pvec.pages[i];

-               bh = head = page_buffers(page);
-               do {
-                       if (lblk < mpd->map.m_lblk)
-                               continue;
-                       if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
-                               /*
-                                * Buffer after end of mapped extent.
-                                * Find next buffer in the page to map.
-                                */
-                               mpd->map.m_len = 0;
-                               mpd->map.m_flags = 0;
-                               /*
-                                * FIXME: If dioread_nolock supports
-                                * blocksize < pagesize, we need to make
-                                * sure we add size mapped so far to
-                                * io_end->size as the following call
-                                * can submit the page for IO.
-                                */
-                               err = mpage_process_page_bufs(mpd, head,
-                                                             bh, lblk);
-                               pagevec_release(&pvec);
-                               if (err > 0)
-                                       err = 0;
-                               return err;
-                       }
-                       if (buffer_delay(bh)) {
-                               clear_buffer_delay(bh);
-                               bh->b_blocknr = pblock++;
-                       }
-                       clear_buffer_unwritten(bh);
-               } while (lblk++, (bh = bh->b_this_page) != head);
-
+               err = mpage_process_page(mpd, page, &lblk, &pblock,
+                                        &map_bh);
                /*
-                * FIXME: This is going to break if dioread_nolock
-                * supports blocksize < pagesize as we will try to
-                * convert potentially unmapped parts of inode.
+                * If map_bh is true, means page may require further bh
+                * mapping, or maybe the page was submitted for IO.
+                * So we return to call further extent mapping.
                 */
-               mpd->io_submit.io_end->size += PAGE_SIZE;
+               if (err < 0 || map_bh)
+                       goto out;
                /* Page fully mapped - let IO run! */
                err = mpage_submit_page(mpd, page);
-               if (err < 0) {
-                       pagevec_release(&pvec);
-                       return err;
-               }
+               if (err < 0)
+                       goto out;
        }
        pagevec_release(&pvec);
 }
@@ -2466,6 +2400,9 @@
        mpd->map.m_len = 0;
        mpd->map.m_flags = 0;
        return 0;
+out:
+       pagevec_release(&pvec);
+       return err;
 }

@@ -2497,7 +2434,7 @@
        dioread_nolock = ext4_should_dioread_nolock(inode);
        if (dioread_nolock)
                get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
-       if (map->m_flags & (1 << BH_Delay))
+       if (map->m_flags & BIT(BH_Delay))
                get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

        err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
@@ -2513,10 +2450,6 @@
        }

        BUG_ON(map->m_len == 0);
-       if (map->m_flags & EXT4_MAP_NEW) {
-               clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
-                                  map->m_len);
-       }
        return 0;
 }

@@ -2549,16 +2482,20 @@
        int err;
        loff_t disksize;
        int progress = 0;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       struct ext4_io_end_vec *io_end_vec;

-       mpd->io_submit.io_end->offset =
-                               ((loff_t)map->m_lblk) << inode->i_blkbits;
+       io_end_vec = ext4_alloc_io_end_vec(io_end);
+       if (IS_ERR(io_end_vec))
+               return PTR_ERR(io_end_vec);
+       io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
        do {
                err = mpage_map_one_extent(handle, mpd);
                if (err < 0) {
                        struct super_block *sb = inode->i_sb;

                        if (ext4_forced_shutdown(EXT4_SB(sb)) ||
-                           EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+                           ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
                                goto invalidate_dirty_pages;
                        /*
                         * Let the upper layers retry transient errors.
@@ -2615,10 +2552,11 @@
                        EXT4_I(inode)->i_disksize = disksize;
                        up_write(&EXT4_I(inode)->i_data_sem);
                        err2 = ext4_mark_inode_dirty(handle, inode);
-                       if (err2)
-                               ext4_error(inode->i_sb,
-                                          "Failed to mark inode %lu dirty",
-                                          inode->i_ino);
+                       if (err2) {
+                               ext4_error_err(inode->i_sb, -err2,
+                                              "Failed to mark inode %lu dirty",
+                                              inode->i_ino);
+                       }
                        if (!err)
                                err = err2;
                }
@@ -2666,7 +2604,7 @@
        long left = mpd->wbc->nr_to_write;
        pgoff_t index = mpd->first_page;
        pgoff_t end = mpd->last_page;
-       int tag;
+       xa_mark_t tag;
        int i, err = 0;
        int blkbits = mpd->inode->i_blkbits;
        ext4_lblk_t lblk;
@@ -2684,7 +2622,7 @@
                nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
                                                    tag);
                if (nr_pages == 0)
-                       goto out;
+                       break;

                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
@@ -2723,6 +2661,22 @@
                        wait_on_page_writeback(page);
                        BUG_ON(PageWriteback(page));

+                       /*
+                        * Should never happen but for buggy code in
+                        * other subsystems that call
+                        * set_page_dirty() without properly warning
+                        * the file system first. See [1] for more
+                        * information.
+                        *
+                        * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
+                        */
+                       if (!page_has_buffers(page)) {
+                               ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
+                               ClearPageDirty(page);
+                               unlock_page(page);
+                               continue;
+                       }
+
                        if (mpd->map.m_len == 0)
                                mpd->first_page = page->index;
                        mpd->next_page = page->index + 1;
@@ -2739,6 +2693,7 @@
                pagevec_release(&pvec);
                cond_resched();
        }
+       mpd->scanned_until_end = 1;
        return 0;
 out:
        pagevec_release(&pvec);
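
scanned_until_end is the replacement for the local done flag removed further down: mpage_prepare_extent_to_map() sets it once the scan truly reached the end of the range, and ext4_writepages() loops on it instead of second-guessing first_page/last_page. The resulting loop, condensed from the later hunks of this patch:

    while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
            ret = mpage_prepare_extent_to_map(&mpd);  /* may set the flag */
            if (!ret && mpd.map.m_len)
                    ret = mpage_map_and_submit_extent(handle, &mpd,
                                                      &give_up_on_write);
            if (ret)
                    break;
    }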
@@ -2757,7 +2712,6 @@
        struct inode *inode = mapping->host;
        int needed_blocks, rsv_blocks = 0, ret = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
-       bool done;
        struct blk_plug plug;
        bool give_up_on_write = false;

@@ -2791,18 +2745,9 @@
         * the stack trace.
         */
        if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
-                    sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+                    ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
                ret = -EROFS;
                goto out_writepages;
-       }
-
-       if (ext4_should_dioread_nolock(inode)) {
-               /*
-                * We may need to convert up to one extent per block in
-                * the page and we may dirty the inode.
-                */
-               rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
-                               PAGE_SIZE >> inode->i_blkbits);
        }

        /*
@@ -2821,6 +2766,15 @@
                                        EXT4_STATE_MAY_INLINE_DATA));
                ext4_destroy_inline_data(handle, inode);
                ext4_journal_stop(handle);
+       }
+
+       if (ext4_should_dioread_nolock(inode)) {
+               /*
+                * We may need to convert up to one extent per block in
+                * the page and we may dirty the inode.
+                */
+               rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+                                               PAGE_SIZE >> inode->i_blkbits);
        }

        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
@@ -2843,7 +2797,6 @@
 retry:
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
-       done = false;
        blk_start_plug(&plug);

        /*
@@ -2853,22 +2806,23 @@
         * started.
         */
        mpd.do_map = 0;
+       mpd.scanned_until_end = 0;
        mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
        if (!mpd.io_submit.io_end) {
                ret = -ENOMEM;
                goto unplug;
        }
        ret = mpage_prepare_extent_to_map(&mpd);
+       /* Unlock pages we didn't use */
+       mpage_release_unused_pages(&mpd, false);
        /* Submit prepared bio */
        ext4_io_submit(&mpd.io_submit);
        ext4_put_io_end_defer(mpd.io_submit.io_end);
        mpd.io_submit.io_end = NULL;
-       /* Unlock pages we didn't use */
-       mpage_release_unused_pages(&mpd, false);
        if (ret < 0)
                goto unplug;

-       while (!done && mpd.first_page <= mpd.last_page) {
+       while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
                /* For each extent of pages we use new io_end */
                mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
                if (!mpd.io_submit.io_end) {
@@ -2903,26 +2857,15 @@

                trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
                ret = mpage_prepare_extent_to_map(&mpd);
-               if (!ret) {
-                       if (mpd.map.m_len)
-                               ret = mpage_map_and_submit_extent(handle, &mpd,
+               if (!ret && mpd.map.m_len)
+                       ret = mpage_map_and_submit_extent(handle, &mpd,
                                        &give_up_on_write);
-                       else {
-                               /*
-                                * We scanned the whole range (or exhausted
-                                * nr_to_write), submitted what was mapped and
-                                * didn't find anything needing mapping. We are
-                                * done.
-                                */
-                               done = true;
-                       }
-               }
                /*
                 * Caution: If the handle is synchronous,
                 * ext4_journal_stop() can wait for transaction commit
                 * to finish which may depend on writeback of pages to
                 * complete or on page lock to be released. In that
-                * case, we have to wait until after after we have
+                * case, we have to wait until after we have
                 * submitted all the IO, released page locks we hold,
                 * and dropped io_end reference (for extent conversion
                 * to be able to complete) before stopping the handle.
@@ -2932,10 +2875,11 @@
                        handle = NULL;
                        mpd.do_map = 0;
                }
-               /* Submit prepared bio */
-               ext4_io_submit(&mpd.io_submit);
                /* Unlock pages we didn't use */
                mpage_release_unused_pages(&mpd, give_up_on_write);
+               /* Submit prepared bio */
+               ext4_io_submit(&mpd.io_submit);
+
                /*
                 * Drop our io_end reference we got from init. We have
                 * to be careful and use deferred io_end finishing if
@@ -3002,7 +2946,7 @@
        percpu_down_read(&sbi->s_writepages_rwsem);
        trace_ext4_writepages(inode, wbc);

-       ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
+       ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
        trace_ext4_writepages_result(inode, wbc, ret,
                                     nr_to_write - wbc->nr_to_write);
        percpu_up_read(&sbi->s_writepages_rwsem);
@@ -3212,58 +3156,42 @@
        end = start + copied - 1;

        /*
-        * generic_write_end() will run mark_inode_dirty() if i_size
-        * changes. So let's piggyback the i_disksize mark_inode_dirty
-        * into that.
+        * Since we are holding inode lock, we are sure i_disksize <=
+        * i_size. We also know that if i_disksize < i_size, there are
+        * delalloc writes pending in the range upto i_size. If the end of
+        * the current write is <= i_size, there's no need to touch
+        * i_disksize since writeback will push i_disksize upto i_size
+        * eventually. If the end of the current write is > i_size and
+        * inside an allocated block (ext4_da_should_update_i_disksize()
+        * check), we need to update i_disksize here as neither
+        * ext4_writepage() nor certain ext4_writepages() paths not
+        * allocating blocks update i_disksize.
+        *
+        * Note that we defer inode dirtying to generic_write_end() /
+        * ext4_da_write_inline_data_end().
         */
        new_i_size = pos + copied;
-       if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+       if (copied && new_i_size > inode->i_size) {
                if (ext4_has_inline_data(inode) ||
-                   ext4_da_should_update_i_disksize(page, end)) {
+                   ext4_da_should_update_i_disksize(page, end))
                        ext4_update_i_disksize(inode, new_i_size);
-                       /* We need to mark inode dirty even if
-                        * new_i_size is less that inode->i_size
-                        * bu greater than i_disksize.(hint delalloc)
-                        */
-                       ext4_mark_inode_dirty(handle, inode);
-               }
        }

        if (write_mode != CONVERT_INLINE_DATA &&
            ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
            ext4_has_inline_data(inode))
-               ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+               ret = ext4_da_write_inline_data_end(inode, pos, len, copied,
                                                     page);
        else
-               ret2 = generic_write_end(file, mapping, pos, len, copied,
+               ret = generic_write_end(file, mapping, pos, len, copied,
                                         page, fsdata);

-       copied = ret2;
-       if (ret2 < 0)
-               ret = ret2;
+       copied = ret;
        ret2 = ext4_journal_stop(handle);
-       if (!ret)
+       if (unlikely(ret2 && !ret))
                ret = ret2;

        return ret ? ret : copied;
-}
-
-static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
-                                  unsigned int length)
-{
-       /*
-        * Drop reserved blocks
-        */
-       BUG_ON(!PageLocked(page));
-       if (!page_has_buffers(page))
-               goto out;
-
-       ext4_da_page_release_reservation(page, offset, length);
-
-out:
-       ext4_invalidatepage(page, offset, length);
-
-       return;
 }

 /*
....@@ -3328,13 +3256,15 @@
33283256 {
33293257 struct inode *inode = mapping->host;
33303258 journal_t *journal;
3259
+ sector_t ret = 0;
33313260 int err;
33323261
3262
+ inode_lock_shared(inode);
33333263 /*
33343264 * We can get here for an inline file via the FIBMAP ioctl
33353265 */
33363266 if (ext4_has_inline_data(inode))
3337
- return 0;
3267
+ goto out;
33383268
33393269 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
33403270 test_opt(inode->i_sb, DELALLOC)) {
....@@ -3373,10 +3303,14 @@
33733303 jbd2_journal_unlock_updates(journal);
33743304
33753305 if (err)
3376
- return 0;
3306
+ goto out;
33773307 }
33783308
3379
- return generic_block_bmap(mapping, block, ext4_get_block);
3309
+ ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
3310
+
3311
+out:
3312
+ inode_unlock_shared(inode);
3313
+ return ret;
33803314 }
33813315
33823316 static int ext4_readpage(struct file *file, struct page *page)
....@@ -3390,23 +3324,20 @@
33903324 ret = ext4_readpage_inline(inode, page);
33913325
33923326 if (ret == -EAGAIN)
3393
- return ext4_mpage_readpages(page->mapping, NULL, page, 1,
3394
- false);
3327
+ return ext4_mpage_readpages(inode, NULL, page);
33953328
33963329 return ret;
33973330 }
33983331
3399
-static int
3400
-ext4_readpages(struct file *file, struct address_space *mapping,
3401
- struct list_head *pages, unsigned nr_pages)
3332
+static void ext4_readahead(struct readahead_control *rac)
34023333 {
3403
- struct inode *inode = mapping->host;
3334
+ struct inode *inode = rac->mapping->host;
34043335
3405
- /* If the file has inline data, no need to do readpages. */
3336
+ /* If the file has inline data, no need to do readahead. */
34063337 if (ext4_has_inline_data(inode))
3407
- return 0;
3338
+ return;
34083339
3409
- return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true);
3340
+ ext4_mpage_readpages(inode, rac, NULL);
34103341 }
34113342
34123343 static void ext4_invalidatepage(struct page *page, unsigned int offset,
....@@ -3455,7 +3386,7 @@
34553386 if (PageChecked(page))
34563387 return 0;
34573388 if (journal)
3458
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
3389
+ return jbd2_journal_try_to_free_buffers(journal, page);
34593390 else
34603391 return try_to_free_buffers(page);
34613392 }
....@@ -3464,216 +3395,215 @@
34643395 {
34653396 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
34663397
3467
- if (journal)
3468
- return !jbd2_transaction_committed(journal,
3469
- EXT4_I(inode)->i_datasync_tid);
3398
+ if (journal) {
3399
+ if (jbd2_transaction_committed(journal,
3400
+ EXT4_I(inode)->i_datasync_tid))
3401
+ return false;
3402
+ if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
3403
+ return !list_empty(&EXT4_I(inode)->i_fc_list);
3404
+ return true;
3405
+ }
3406
+
34703407 /* Any metadata buffers to write? */
34713408 if (!list_empty(&inode->i_mapping->private_list))
34723409 return true;
34733410 return inode->i_state & I_DIRTY_DATASYNC;
34743411 }
34753412
3476
-static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
3477
- unsigned flags, struct iomap *iomap)
3413
+static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
3414
+ struct ext4_map_blocks *map, loff_t offset,
3415
+ loff_t length)
34783416 {
3479
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3480
- unsigned int blkbits = inode->i_blkbits;
3481
- unsigned long first_block, last_block;
3482
- struct ext4_map_blocks map;
3483
- bool delalloc = false;
3484
- int ret;
3485
-
3486
- if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
3487
- return -EINVAL;
3488
- first_block = offset >> blkbits;
3489
- last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
3490
- EXT4_MAX_LOGICAL_BLOCK);
3491
-
3492
- if (flags & IOMAP_REPORT) {
3493
- if (ext4_has_inline_data(inode)) {
3494
- ret = ext4_inline_data_iomap(inode, iomap);
3495
- if (ret != -EAGAIN) {
3496
- if (ret == 0 && offset >= iomap->length)
3497
- ret = -ENOENT;
3498
- return ret;
3499
- }
3500
- }
3501
- } else {
3502
- if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
3503
- return -ERANGE;
3504
- }
3505
-
3506
- map.m_lblk = first_block;
3507
- map.m_len = last_block - first_block + 1;
3508
-
3509
- if (flags & IOMAP_REPORT) {
3510
- ret = ext4_map_blocks(NULL, inode, &map, 0);
3511
- if (ret < 0)
3512
- return ret;
3513
-
3514
- if (ret == 0) {
3515
- ext4_lblk_t end = map.m_lblk + map.m_len - 1;
3516
- struct extent_status es;
3517
-
3518
- ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es);
3519
-
3520
- if (!es.es_len || es.es_lblk > end) {
3521
- /* entire range is a hole */
3522
- } else if (es.es_lblk > map.m_lblk) {
3523
- /* range starts with a hole */
3524
- map.m_len = es.es_lblk - map.m_lblk;
3525
- } else {
3526
- ext4_lblk_t offs = 0;
3527
-
3528
- if (es.es_lblk < map.m_lblk)
3529
- offs = map.m_lblk - es.es_lblk;
3530
- map.m_lblk = es.es_lblk + offs;
3531
- map.m_len = es.es_len - offs;
3532
- delalloc = true;
3533
- }
3534
- }
3535
- } else if (flags & IOMAP_WRITE) {
3536
- int dio_credits;
3537
- handle_t *handle;
3538
- int retries = 0;
3539
-
3540
- /* Trim mapping request to maximum we can map at once for DIO */
3541
- if (map.m_len > DIO_MAX_BLOCKS)
3542
- map.m_len = DIO_MAX_BLOCKS;
3543
- dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
3544
-retry:
3545
- /*
3546
- * Either we allocate blocks and then we don't get unwritten
3547
- * extent so we have reserved enough credits, or the blocks
3548
- * are already allocated and unwritten and in that case
3549
- * extent conversion fits in the credits as well.
3550
- */
3551
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
3552
- dio_credits);
3553
- if (IS_ERR(handle))
3554
- return PTR_ERR(handle);
3555
-
3556
- ret = ext4_map_blocks(handle, inode, &map,
3557
- EXT4_GET_BLOCKS_CREATE_ZERO);
3558
- if (ret < 0) {
3559
- ext4_journal_stop(handle);
3560
- if (ret == -ENOSPC &&
3561
- ext4_should_retry_alloc(inode->i_sb, &retries))
3562
- goto retry;
3563
- return ret;
3564
- }
3565
-
3566
- /*
3567
- * If we added blocks beyond i_size, we need to make sure they
3568
- * will get truncated if we crash before updating i_size in
3569
- * ext4_iomap_end(). For faults we don't need to do that (and
3570
- * even cannot because for orphan list operations inode_lock is
3571
- * required) - if we happen to instantiate block beyond i_size,
3572
- * it is because we race with truncate which has already added
3573
- * the inode to the orphan list.
3574
- */
3575
- if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
3576
- (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
3577
- int err;
3578
-
3579
- err = ext4_orphan_add(handle, inode);
3580
- if (err < 0) {
3581
- ext4_journal_stop(handle);
3582
- return err;
3583
- }
3584
- }
3585
- ext4_journal_stop(handle);
3586
- } else {
3587
- ret = ext4_map_blocks(NULL, inode, &map, 0);
3588
- if (ret < 0)
3589
- return ret;
3590
- }
3417
+ u8 blkbits = inode->i_blkbits;
35913418
35923419 /*
35933420 * Writes that span EOF might trigger an I/O size update on completion,
3594
- * so consider them to be dirty for the purposes of O_DSYNC, even if
3595
- * there is no other metadata changes being made or are pending here.
3421
+ * so consider them to be dirty for the purpose of O_DSYNC, even if
3422
+ * there is no other metadata changes being made or are pending.
35963423 */
35973424 iomap->flags = 0;
35983425 if (ext4_inode_datasync_dirty(inode) ||
35993426 offset + length > i_size_read(inode))
36003427 iomap->flags |= IOMAP_F_DIRTY;
3601
- iomap->bdev = inode->i_sb->s_bdev;
3602
- iomap->dax_dev = sbi->s_daxdev;
3603
- iomap->offset = (u64)first_block << blkbits;
3604
- iomap->length = (u64)map.m_len << blkbits;
36053428
3606
- if (ret == 0) {
3607
- iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
3608
- iomap->addr = IOMAP_NULL_ADDR;
3609
- } else {
3610
- if (map.m_flags & EXT4_MAP_MAPPED) {
3611
- iomap->type = IOMAP_MAPPED;
3612
- } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
3613
- iomap->type = IOMAP_UNWRITTEN;
3614
- } else {
3615
- WARN_ON_ONCE(1);
3616
- return -EIO;
3617
- }
3618
- iomap->addr = (u64)map.m_pblk << blkbits;
3619
- }
3620
-
3621
- if (map.m_flags & EXT4_MAP_NEW)
3429
+ if (map->m_flags & EXT4_MAP_NEW)
36223430 iomap->flags |= IOMAP_F_NEW;
36233431
3432
+ iomap->bdev = inode->i_sb->s_bdev;
3433
+ iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
3434
+ iomap->offset = (u64) map->m_lblk << blkbits;
3435
+ iomap->length = (u64) map->m_len << blkbits;
3436
+
3437
+ if ((map->m_flags & EXT4_MAP_MAPPED) &&
3438
+ !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3439
+ iomap->flags |= IOMAP_F_MERGED;
3440
+
3441
+ /*
3442
+ * Flags passed to ext4_map_blocks() for direct I/O writes can result
3443
+ * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
3444
+ * set. In order for any allocated unwritten extents to be converted
3445
+ * into written extents correctly within the ->end_io() handler, we
3446
+ * need to ensure that the iomap->type is set appropriately. Hence, the
3447
+ * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
3448
+ * been set first.
3449
+ */
3450
+ if (map->m_flags & EXT4_MAP_UNWRITTEN) {
3451
+ iomap->type = IOMAP_UNWRITTEN;
3452
+ iomap->addr = (u64) map->m_pblk << blkbits;
3453
+ } else if (map->m_flags & EXT4_MAP_MAPPED) {
3454
+ iomap->type = IOMAP_MAPPED;
3455
+ iomap->addr = (u64) map->m_pblk << blkbits;
3456
+ } else {
3457
+ iomap->type = IOMAP_HOLE;
3458
+ iomap->addr = IOMAP_NULL_ADDR;
3459
+ }
3460
+}
3461
+
+static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
+			    unsigned int flags)
+{
+	handle_t *handle;
+	u8 blkbits = inode->i_blkbits;
+	int ret, dio_credits, m_flags = 0, retries = 0;
+
+	/*
+	 * Trim the mapping request to the maximum value that we can map at
+	 * once for direct I/O.
+	 */
+	if (map->m_len > DIO_MAX_BLOCKS)
+		map->m_len = DIO_MAX_BLOCKS;
+	dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+retry:
+	/*
+	 * Either we allocate blocks and then don't get an unwritten extent, so
+	 * in that case we have reserved enough credits. Or, the blocks are
+	 * already allocated and unwritten. In that case, the extent conversion
+	 * fits into the credits as well.
+	 */
+	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	/*
+	 * DAX and direct I/O are the only two operations that are currently
+	 * supported with IOMAP_WRITE.
+	 */
+	WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+	if (IS_DAX(inode))
+		m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+	/*
+	 * We use i_size instead of i_disksize here because delalloc writeback
+	 * can complete at any point during the I/O and subsequently push the
+	 * i_disksize out to i_size. This could be beyond where direct I/O is
+	 * happening and thus expose allocated blocks to direct I/O reads.
+	 */
+	else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
+		m_flags = EXT4_GET_BLOCKS_CREATE;
+	else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
+
+	ret = ext4_map_blocks(handle, inode, map, m_flags);
+
+	/*
+	 * We cannot fill holes in indirect tree based inodes as that could
+	 * expose stale data in the case of a crash. Use the magic error code
+	 * to fallback to buffered I/O.
+	 */
+	if (!m_flags && !ret)
+		ret = -ENOTBLK;
+
+	ext4_journal_stop(handle);
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+	return ret;
+}
+
+
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
+{
+	int ret;
+	struct ext4_map_blocks map;
+	u8 blkbits = inode->i_blkbits;
+
+	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+		return -ERANGE;
+
+	/*
+	 * Calculate the first and last logical blocks respectively.
+	 */
+	map.m_lblk = offset >> blkbits;
+	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+	if (flags & IOMAP_WRITE) {
+		/*
+		 * We check here if the blocks are already allocated, then we
+		 * don't need to start a journal txn and we can directly return
+		 * the mapping information. This could boost performance
+		 * especially in multi-threaded overwrite requests.
+		 */
+		if (offset + length <= i_size_read(inode)) {
+			ret = ext4_map_blocks(NULL, inode, &map, 0);
+			if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
+				goto out;
+		}
+		ret = ext4_iomap_alloc(inode, &map, flags);
+	} else {
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+	}
+
+	if (ret < 0)
+		return ret;
+out:
+
+	/*
+	 * When inline encryption is enabled, sometimes I/O to an encrypted file
+	 * has to be broken up to guarantee DUN contiguity. Handle this by
+	 * limiting the length of the mapping returned.
+	 */
+	map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+
+	ext4_set_iomap(inode, iomap, &map, offset, length);
+
 	return 0;
+}
+
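The m_lblk/m_len computation in ext4_iomap_begin() rounds an arbitrary byte range out to whole blocks and is easy to get off by one. A stand-alone worked example with 4 KiB blocks (blkbits = 12, the common ext4 default; the EXT4_MAX_LOGICAL_BLOCK clamp is omitted here):

#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;		/* 4 KiB blocks */
	long long offset = 5000, length = 10000;

	/* First block covered, then last block, inclusive. */
	unsigned long long m_lblk = offset >> blkbits;
	unsigned long long m_len =
		((offset + length - 1) >> blkbits) - m_lblk + 1;

	/* Bytes 5000..14999 touch blocks 1..3, so the length is 3. */
	printf("m_lblk=%llu m_len=%llu\n", m_lblk, m_len);
	return 0;
}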
+static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
+		loff_t length, unsigned flags, struct iomap *iomap,
+		struct iomap *srcmap)
+{
+	int ret;
+
+	/*
+	 * Even for writes we don't need to allocate blocks, so just pretend
+	 * we are reading to save overhead of starting a transaction.
+	 */
+	flags &= ~IOMAP_WRITE;
+	ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
+	WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED);
+	return ret;
 }
 
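Both branches of ext4_iomap_begin() are easy to exercise from user space with O_DIRECT: the first pwrite() below lands on unallocated blocks and goes through ext4_iomap_alloc(), while the second is a pure overwrite of already-written blocks and is served without starting a transaction (overwrites are also the case ext4_iomap_overwrite_begin() is meant for in this series). A hedged sketch with most error handling trimmed, assuming 4096 bytes satisfies the device's alignment requirement:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC | O_DIRECT, 0644);

	/* O_DIRECT needs a block-aligned buffer, offset and length. */
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;

	memset(buf, 1, 4096);
	pwrite(fd, buf, 4096, 0);	/* beyond EOF: allocation path */
	fsync(fd);			/* commit, convert unwritten extent */

	memset(buf, 2, 4096);
	pwrite(fd, buf, 4096, 0);	/* in-place overwrite fast path */

	free(buf);
	return close(fd);
}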
 static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
			  ssize_t written, unsigned flags, struct iomap *iomap)
 {
-	int ret = 0;
-	handle_t *handle;
-	int blkbits = inode->i_blkbits;
-	bool truncate = false;
-
-	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
-		return 0;
-
-	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto orphan_del;
-	}
-	if (ext4_update_inode_size(inode, offset + written))
-		ext4_mark_inode_dirty(handle, inode);
 	/*
-	 * We may need to truncate allocated but not written blocks beyond EOF.
+	 * Check to see whether an error occurred while writing out the data to
+	 * the allocated blocks. If so, return the magic error code so that we
+	 * fallback to buffered I/O and attempt to complete the remainder of
+	 * the I/O. Any blocks that may have been allocated in preparation for
+	 * the direct I/O will be reused during buffered I/O.
 	 */
-	if (iomap->offset + iomap->length >
-	    ALIGN(inode->i_size, 1 << blkbits)) {
-		ext4_lblk_t written_blk, end_blk;
+	if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+		return -ENOTBLK;
 
-		written_blk = (offset + written) >> blkbits;
-		end_blk = (offset + length) >> blkbits;
-		if (written_blk < end_blk && ext4_can_truncate(inode))
-			truncate = true;
-	}
-	/*
-	 * Remove inode from orphan list if we were extending a inode and
-	 * everything went fine.
-	 */
-	if (!truncate && inode->i_nlink &&
-	    !list_empty(&EXT4_I(inode)->i_orphan))
-		ext4_orphan_del(handle, inode);
-	ext4_journal_stop(handle);
-	if (truncate) {
-		ext4_truncate_failed_write(inode);
-orphan_del:
-		/*
-		 * If truncate failed early the inode might still be on the
-		 * orphan list; we need to make sure the inode is removed from
-		 * the orphan list in that case.
-		 */
-		if (inode->i_nlink)
-			ext4_orphan_del(NULL, inode);
-	}
-	return ret;
+	return 0;
 }
 
 const struct iomap_ops ext4_iomap_ops = {
@@ -3681,310 +3611,94 @@
	.iomap_end		= ext4_iomap_end,
 };
 
-static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-			   ssize_t size, void *private)
+const struct iomap_ops ext4_iomap_overwrite_ops = {
+	.iomap_begin		= ext4_iomap_overwrite_begin,
+	.iomap_end		= ext4_iomap_end,
+};
+
+static bool ext4_iomap_is_delalloc(struct inode *inode,
+				   struct ext4_map_blocks *map)
 {
-	ext4_io_end_t *io_end = private;
+	struct extent_status es;
+	ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
 
-	/* if not async direct IO just return */
-	if (!io_end)
-		return 0;
+	ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+				  map->m_lblk, end, &es);
 
-	ext_debug("ext4_end_io_dio(): io_end 0x%p "
-		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
-		  io_end, io_end->inode->i_ino, iocb, offset, size);
+	if (!es.es_len || es.es_lblk > end)
+		return false;
+
+	if (es.es_lblk > map->m_lblk) {
+		map->m_len = es.es_lblk - map->m_lblk;
+		return false;
+	}
+
+	offset = map->m_lblk - es.es_lblk;
+	map->m_len = es.es_len - offset;
+
+	return true;
+}
+
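The trimming in ext4_iomap_is_delalloc() is subtle: a delayed extent that begins inside the queried range does not make the whole range delalloc; it shortens the reported mapping so the non-delayed head is returned first. A stand-alone model of both cases (stand-in types, not the kernel's extent-status API):

#include <stdbool.h>
#include <stdio.h>

struct es { unsigned lblk, len; };	/* a found delayed extent */

/* Mirrors the trim logic: true only if the map start is delayed. */
static bool is_delalloc(unsigned m_lblk, unsigned *m_len, struct es es)
{
	unsigned end = m_lblk + *m_len - 1;

	if (!es.len || es.lblk > end)
		return false;
	if (es.lblk > m_lblk) {			/* delayed extent starts later: */
		*m_len = es.lblk - m_lblk;	/* report the head only */
		return false;
	}
	*m_len = es.len - (m_lblk - es.lblk);	/* inside a delayed extent */
	return true;
}

int main(void)
{
	unsigned len = 20;
	struct es e = { 15, 8 };	/* delayed blocks 15..22 */

	/* Query blocks 10..29: head 10..14 is not delayed, len trims to 5. */
	printf("%d len=%u\n", is_delalloc(10, &len, e), len);
	return 0;
}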
+static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+				   loff_t length, unsigned int flags,
+				   struct iomap *iomap, struct iomap *srcmap)
+{
+	int ret;
+	bool delalloc = false;
+	struct ext4_map_blocks map;
+	u8 blkbits = inode->i_blkbits;
+
+	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+		return -EINVAL;
+
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_inline_data_iomap(inode, iomap);
+		if (ret != -EAGAIN) {
+			if (ret == 0 && offset >= iomap->length)
+				ret = -ENOENT;
+			return ret;
+		}
+	}
 
 	/*
-	 * Error during AIO DIO. We cannot convert unwritten extents as the
-	 * data was not written. Just clear the unwritten flag and drop io_end.
+	 * Calculate the first and last logical block respectively.
	 */
-	if (size <= 0) {
-		ext4_clear_io_unwritten_flag(io_end);
-		size = 0;
+	map.m_lblk = offset >> blkbits;
+	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+	/*
+	 * Fiemap callers may call for offset beyond s_bitmap_maxbytes.
+	 * So handle it here itself instead of querying ext4_map_blocks().
+	 * Since ext4_map_blocks() will warn about it and will return
+	 * -EIO error.
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+		if (offset >= sbi->s_bitmap_maxbytes) {
+			map.m_flags = 0;
+			goto set_iomap;
+		}
 	}
-	io_end->offset = offset;
-	io_end->size = size;
-	ext4_put_io_end(io_end);
+
+	ret = ext4_map_blocks(NULL, inode, &map, 0);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		delalloc = ext4_iomap_is_delalloc(inode, &map);
+
+set_iomap:
+	ext4_set_iomap(inode, iomap, &map, offset, length);
+	if (delalloc && iomap->type == IOMAP_HOLE)
+		iomap->type = IOMAP_DELALLOC;
 
 	return 0;
 }
 
-/*
- * Handling of direct IO writes.
- *
- * For ext4 extent files, ext4 will do direct-io write even to holes,
- * preallocated extents, and those write extend the file, no need to
- * fall back to buffered IO.
- *
- * For holes, we fallocate those blocks, mark them as unwritten
- * If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as unwritten.
- *
- * The unwritten extents will be converted to written when DIO is completed.
- * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the conversion
- * when async direct IO completed.
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- */
-static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	ssize_t ret;
-	loff_t offset = iocb->ki_pos;
-	size_t count = iov_iter_count(iter);
-	int overwrite = 0;
-	get_block_t *get_block_func = NULL;
-	int dio_flags = 0;
-	loff_t final_size = offset + count;
-	int orphan = 0;
-	handle_t *handle;
-
-	if (final_size > inode->i_size || final_size > ei->i_disksize) {
-		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto out;
-		}
-		ret = ext4_orphan_add(handle, inode);
-		if (ret) {
-			ext4_journal_stop(handle);
-			goto out;
-		}
-		orphan = 1;
-		ext4_update_i_disksize(inode, inode->i_size);
-		ext4_journal_stop(handle);
-	}
-
-	BUG_ON(iocb->private == NULL);
-
-	/*
-	 * Make all waiters for direct IO properly wait also for extent
-	 * conversion. This also disallows race between truncate() and
-	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
-	 */
-	inode_dio_begin(inode);
-
-	/* If we do a overwrite dio, i_mutex locking can be released */
-	overwrite = *((int *)iocb->private);
-
-	if (overwrite)
-		inode_unlock(inode);
-
-	/*
-	 * For extent mapped files we could direct write to holes and fallocate.
-	 *
-	 * Allocated blocks to fill the hole are marked as unwritten to prevent
-	 * parallel buffered read to expose the stale data before DIO complete
-	 * the data IO.
-	 *
-	 * As to previously fallocated extents, ext4 get_block will just simply
-	 * mark the buffer mapped but still keep the extents unwritten.
-	 *
-	 * For non AIO case, we will convert those unwritten extents to written
-	 * after return back from blockdev_direct_IO. That way we save us from
-	 * allocating io_end structure and also the overhead of offloading
-	 * the extent convertion to a workqueue.
-	 *
-	 * For async DIO, the conversion needs to be deferred when the
-	 * IO is completed. The ext4 end_io callback function will be
-	 * called to take care of the conversion work. Here for async
-	 * case, we allocate an io_end structure to hook to the iocb.
-	 */
-	iocb->private = NULL;
-	if (overwrite)
-		get_block_func = ext4_dio_get_block_overwrite;
-	else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
-		 round_down(offset, i_blocksize(inode)) >= inode->i_size) {
-		get_block_func = ext4_dio_get_block;
-		dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
-	} else if (is_sync_kiocb(iocb)) {
-		get_block_func = ext4_dio_get_block_unwritten_sync;
-		dio_flags = DIO_LOCKING;
-	} else {
-		get_block_func = ext4_dio_get_block_unwritten_async;
-		dio_flags = DIO_LOCKING;
-	}
-	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				   get_block_func, ext4_end_io_dio, NULL,
-				   dio_flags);
-
-	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
-						EXT4_STATE_DIO_UNWRITTEN)) {
-		int err;
-		/*
-		 * for non AIO case, since the IO is already
-		 * completed, we could do the conversion right here
-		 */
-		err = ext4_convert_unwritten_extents(NULL, inode,
-						     offset, ret);
-		if (err < 0)
-			ret = err;
-		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-	}
-
-	inode_dio_end(inode);
-	/* take i_mutex locking again if we do a ovewrite dio */
-	if (overwrite)
-		inode_lock(inode);
-
-	if (ret < 0 && final_size > inode->i_size)
-		ext4_truncate_failed_write(inode);
-
-	/* Handle extending of i_size after direct IO write */
-	if (orphan) {
-		int err;
-
-		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle)) {
-			/*
-			 * We wrote the data but cannot extend
-			 * i_size. Bail out. In async io case, we do
-			 * not return error here because we have
-			 * already submmitted the corresponding
-			 * bio. Returning error here makes the caller
-			 * think that this IO is done and failed
-			 * resulting in race with bio's completion
-			 * handler.
-			 */
-			if (!ret)
-				ret = PTR_ERR(handle);
-			if (inode->i_nlink)
-				ext4_orphan_del(NULL, inode);
-
-			goto out;
-		}
-		if (inode->i_nlink)
-			ext4_orphan_del(handle, inode);
-		if (ret > 0) {
-			loff_t end = offset + ret;
-			if (end > inode->i_size || end > ei->i_disksize) {
-				ext4_update_i_disksize(inode, end);
-				if (end > inode->i_size)
-					i_size_write(inode, end);
-				/*
-				 * We're going to return a positive `ret'
-				 * here due to non-zero-length I/O, so there's
-				 * no way of reporting error returns from
-				 * ext4_mark_inode_dirty() to userspace. So
-				 * ignore it.
-				 */
-				ext4_mark_inode_dirty(handle, inode);
-			}
-		}
-		err = ext4_journal_stop(handle);
-		if (ret == 0)
-			ret = err;
-	}
-out:
-	return ret;
-}
-
-static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct address_space *mapping = iocb->ki_filp->f_mapping;
-	struct inode *inode = mapping->host;
-	size_t count = iov_iter_count(iter);
-	ssize_t ret;
-	loff_t offset = iocb->ki_pos;
-	loff_t size = i_size_read(inode);
-
-	if (offset >= size)
-		return 0;
-
-	/*
-	 * Shared inode_lock is enough for us - it protects against concurrent
-	 * writes & truncates and since we take care of writing back page cache,
-	 * we are protected against page writeback as well.
-	 */
-	if (iocb->ki_flags & IOCB_NOWAIT) {
-		if (!inode_trylock_shared(inode))
-			return -EAGAIN;
-	} else {
-		inode_lock_shared(inode);
-	}
-
-	ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
-					   iocb->ki_pos + count - 1);
-	if (ret)
-		goto out_unlock;
-	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-				   iter, ext4_dio_get_block, NULL, NULL, 0);
-out_unlock:
-	inode_unlock_shared(inode);
-	return ret;
-}
-
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	size_t count = iov_iter_count(iter);
-	loff_t offset = iocb->ki_pos;
-	ssize_t ret;
-	int rw = iov_iter_rw(iter);
-
-	if (!fscrypt_dio_supported(iocb, iter))
-		return 0;
-
-	if (fsverity_active(inode))
-		return 0;
-
-	/*
-	 * If we are doing data journalling we don't support O_DIRECT
-	 */
-	if (ext4_should_journal_data(inode))
-		return 0;
-
-	/* Let buffer I/O handle the inline data case. */
-	if (ext4_has_inline_data(inode))
-		return 0;
-
-	if (trace_android_fs_dataread_start_enabled() &&
-	    (rw == READ)) {
-		char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
-
-		path = android_fstrace_get_pathname(pathbuf,
-						    MAX_TRACE_PATHBUF_LEN,
-						    inode);
-		trace_android_fs_dataread_start(inode, offset, count,
-						current->pid, path,
-						current->comm);
-	}
-	if (trace_android_fs_datawrite_start_enabled() &&
-	    (rw == WRITE)) {
-		char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
-
-		path = android_fstrace_get_pathname(pathbuf,
-						    MAX_TRACE_PATHBUF_LEN,
-						    inode);
-		trace_android_fs_datawrite_start(inode, offset, count,
-						 current->pid, path,
-						 current->comm);
-	}
-	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-	if (iov_iter_rw(iter) == READ)
-		ret = ext4_direct_IO_read(iocb, iter);
-	else
-		ret = ext4_direct_IO_write(iocb, iter);
-	trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
-
-	if (trace_android_fs_dataread_start_enabled() &&
-	    (rw == READ))
-		trace_android_fs_dataread_end(inode, offset, count);
-	if (trace_android_fs_datawrite_start_enabled() &&
-	    (rw == WRITE))
-		trace_android_fs_datawrite_end(inode, offset, count);
-
-	return ret;
-}
+const struct iomap_ops ext4_iomap_report_ops = {
+	.iomap_begin = ext4_iomap_begin_report,
+};
 
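ext4_iomap_report_ops backs the reporting paths (SEEK_HOLE/SEEK_DATA and fiemap in this series), which is where IOMAP_DELALLOC matters: dirty data that so far exists only in the page cache must still be reported as data. A runnable user-space probe:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	off_t hole;

	if (fd < 0)
		return 1;
	ftruncate(fd, 1 << 20);	/* 1 MiB sparse file, all hole */
	pwrite(fd, "x", 1, 0);	/* delalloc data in block 0, not yet on disk */

	/* Reported as data thanks to IOMAP_DELALLOC; the first hole
	 * typically starts at the block size (4096 here). */
	hole = lseek(fd, 0, SEEK_HOLE);
	printf("first hole at %lld\n", (long long)hole);
	return close(fd);
}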
 /*
  * Pages can be marked dirty completely asynchronously from ext4's journalling
@@ -4012,9 +3726,16 @@
	return __set_page_dirty_buffers(page);
 }
 
+static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
+				struct file *file, sector_t *span)
+{
+	return iomap_swapfile_activate(sis, file, span,
+				       &ext4_iomap_report_ops);
+}
+
 static const struct address_space_operations ext4_aops = {
	.readpage		= ext4_readpage,
-	.readpages		= ext4_readpages,
+	.readahead		= ext4_readahead,
	.writepage		= ext4_writepage,
	.writepages		= ext4_writepages,
	.write_begin		= ext4_write_begin,
@@ -4023,15 +3744,16 @@
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
-	.direct_IO		= ext4_direct_IO,
+	.direct_IO		= noop_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
+	.swap_activate		= ext4_iomap_swap_activate,
 };
 
 static const struct address_space_operations ext4_journalled_aops = {
	.readpage		= ext4_readpage,
-	.readpages		= ext4_readpages,
+	.readahead		= ext4_readahead,
	.writepage		= ext4_writepage,
	.writepages		= ext4_writepages,
	.write_begin		= ext4_write_begin,
@@ -4040,26 +3762,28 @@
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_journalled_invalidatepage,
	.releasepage		= ext4_releasepage,
-	.direct_IO		= ext4_direct_IO,
+	.direct_IO		= noop_direct_IO,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
+	.swap_activate		= ext4_iomap_swap_activate,
 };
 
 static const struct address_space_operations ext4_da_aops = {
	.readpage		= ext4_readpage,
-	.readpages		= ext4_readpages,
+	.readahead		= ext4_readahead,
	.writepage		= ext4_writepage,
	.writepages		= ext4_writepages,
	.write_begin		= ext4_da_write_begin,
	.write_end		= ext4_da_write_end,
	.set_page_dirty		= ext4_set_page_dirty,
	.bmap			= ext4_bmap,
-	.invalidatepage		= ext4_da_invalidatepage,
+	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
-	.direct_IO		= ext4_direct_IO,
+	.direct_IO		= noop_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
+	.swap_activate		= ext4_iomap_swap_activate,
 };
 
 static const struct address_space_operations ext4_dax_aops = {
@@ -4068,6 +3792,7 @@
	.set_page_dirty		= noop_set_page_dirty,
	.bmap			= ext4_bmap,
	.invalidatepage		= noop_invalidatepage,
+	.swap_activate		= ext4_iomap_swap_activate,
 };
 
 void ext4_set_aops(struct inode *inode)
@@ -4141,18 +3866,18 @@
		set_buffer_uptodate(bh);
 
	if (!buffer_uptodate(bh)) {
-		err = -EIO;
-		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
-		wait_on_buffer(bh);
-		/* Uhhuh. Read error. Complain and punt. */
-		if (!buffer_uptodate(bh))
+		err = ext4_read_bh_lock(bh, 0, true);
+		if (err)
			goto unlock;
		if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
			/* We expect the key to be set. */
			BUG_ON(!fscrypt_has_encryption_key(inode));
-			BUG_ON(blocksize != PAGE_SIZE);
-			WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks(
-						page, PAGE_SIZE, 0));
+			err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+							       bh_offset(bh));
+			if (err) {
+				clear_buffer_uptodate(bh);
+				goto unlock;
+			}
		}
	}
	if (ext4_should_journal_data(inode)) {
@@ -4185,7 +3910,7 @@
  * starting from file offset 'from'. The range to be zero'd must
  * be contained with in one block. If the specified range exceeds
  * the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
+ * that corresponds to 'from'
  */
 static int ext4_block_zero_page_range(handle_t *handle,
		struct address_space *mapping, loff_t from, loff_t length)
@@ -4292,6 +4017,8 @@
			     loff_t len)
 {
	handle_t *handle;
+	int ret;
+
	loff_t size = i_size_read(inode);
 
	WARN_ON(!inode_is_locked(inode));
@@ -4305,10 +4032,10 @@
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	ext4_update_i_disksize(inode, size);
-	ext4_mark_inode_dirty(handle, inode);
+	ret = ext4_mark_inode_dirty(handle, inode);
	ext4_journal_stop(handle);
 
-	return 0;
+	return ret;
 }
 
 static void ext4_wait_dax_page(struct ext4_inode_info *ei)
@@ -4352,29 +4079,19 @@
  * Returns: 0 on success or negative on failure
  */
 
-int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
 {
+	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	ext4_lblk_t first_block, stop_block;
	struct address_space *mapping = inode->i_mapping;
-	loff_t first_block_offset, last_block_offset;
+	loff_t first_block_offset, last_block_offset, max_length;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	handle_t *handle;
	unsigned int credits;
-	int ret = 0;
-
-	if (!S_ISREG(inode->i_mode))
-		return -EOPNOTSUPP;
+	int ret = 0, ret2 = 0;
 
	trace_ext4_punch_hole(inode, offset, length, 0);
-
-	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
-	if (ext4_has_inline_data(inode)) {
-		down_write(&EXT4_I(inode)->i_mmap_sem);
-		ret = ext4_convert_inline_data(inode);
-		up_write(&EXT4_I(inode)->i_mmap_sem);
-		if (ret)
-			return ret;
-	}
 
	/*
	 * Write out all dirty pages to avoid race conditions
@@ -4403,6 +4120,14 @@
			   offset;
	}
 
+	/*
+	 * For punch hole the length + offset needs to be within one block
+	 * before last range. Adjust the length if it goes beyond that limit.
+	 */
+	max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
+	if (offset + length > max_length)
+		length = max_length - offset;
+
	if (offset & (sb->s_blocksize - 1) ||
	    (offset + length) & (sb->s_blocksize - 1)) {
		/*
@@ -4417,6 +4142,10 @@
 
	/* Wait all existing dio workers, newcomers will block on i_mutex */
	inode_dio_wait(inode);
+
+	ret = file_modified(file);
+	if (ret)
+		goto out_mutex;
 
	/*
	 * Prevent page faults from reinstantiating pages we have released from
@@ -4464,7 +4193,7 @@
	if (stop_block > first_block) {
 
		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_preallocations(inode);
+		ext4_discard_preallocations(inode, 0);
 
		ret = ext4_es_remove_extent(inode, first_block,
					    stop_block - first_block);
@@ -4482,11 +4211,14 @@
 
		up_write(&EXT4_I(inode)->i_data_sem);
	}
+	ext4_fc_track_range(handle, inode, first_block, stop_block);
	if (IS_SYNC(inode))
		ext4_handle_sync(handle);
 
	inode->i_mtime = inode->i_ctime = current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
+	ret2 = ext4_mark_inode_dirty(handle, inode);
+	if (unlikely(ret2))
+		ret = ret2;
	if (ret >= 0)
		ext4_update_inode_fsync_trans(handle, inode, 1);
 out_stop:
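ext4_punch_hole() is reached from user space through fallocate(2) with FALLOC_FL_PUNCH_HOLE, which must be combined with FALLOC_FL_KEEP_SIZE. A runnable example that keeps offset and length block-aligned, so the partial-block zeroing path is skipped:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	ftruncate(fd, 64 * 4096);	/* 256 KiB file */

	/* Deallocate blocks 16..31 without changing the file size. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      16 * 4096, 16 * 4096))
		perror("fallocate");

	return close(fd);
}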
@@ -4555,7 +4287,7 @@
 {
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned int credits;
-	int err = 0;
+	int err = 0, err2;
	handle_t *handle;
	struct address_space *mapping = inode->i_mapping;
 
@@ -4569,9 +4301,7 @@
	trace_ext4_truncate_enter(inode);
 
	if (!ext4_can_truncate(inode))
-		return 0;
-
-	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+		goto out_trace;
 
	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
@@ -4580,16 +4310,15 @@
		int has_inline = 1;
 
		err = ext4_inline_data_truncate(inode, &has_inline);
-		if (err)
-			return err;
-		if (has_inline)
-			return 0;
+		if (err || has_inline)
+			goto out_trace;
	}
 
	/* If we zero-out tail of the page, we have to create jinode for jbd2 */
	if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
-		if (ext4_inode_attach_jinode(inode) < 0)
-			return 0;
+		err = ext4_inode_attach_jinode(inode);
+		if (err)
+			goto out_trace;
	}
 
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4598,8 +4327,10 @@
		credits = ext4_blocks_for_truncate(inode);
 
	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out_trace;
+	}
 
	if (inode->i_size & (inode->i_sb->s_blocksize - 1))
		ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4619,7 +4350,7 @@
 
	down_write(&EXT4_I(inode)->i_data_sem);
 
-	ext4_discard_preallocations(inode);
+	ext4_discard_preallocations(inode, 0);
 
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		err = ext4_ext_truncate(handle, inode);
@@ -4645,9 +4376,12 @@
		ext4_orphan_del(handle, inode);
 
	inode->i_mtime = inode->i_ctime = current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
+	err2 = ext4_mark_inode_dirty(handle, inode);
+	if (unlikely(err2 && !err))
+		err = err2;
	ext4_journal_stop(handle);
 
+out_trace:
	trace_ext4_truncate_exit(inode);
	return err;
 }
@@ -4658,21 +4392,22 @@
  * data in memory that is needed to recreate the on-disk version of this
  * inode.
  */
-static int __ext4_get_inode_loc(struct inode *inode,
-				struct ext4_iloc *iloc, int in_mem)
+static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
+				struct ext4_iloc *iloc, int in_mem,
+				ext4_fsblk_t *ret_block)
 {
	struct ext4_group_desc *gdp;
	struct buffer_head *bh;
-	struct super_block *sb = inode->i_sb;
	ext4_fsblk_t block;
+	struct blk_plug plug;
	int inodes_per_block, inode_offset;
 
	iloc->bh = NULL;
-	if (inode->i_ino < EXT4_ROOT_INO ||
-	    inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+	if (ino < EXT4_ROOT_INO ||
+	    ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
		return -EFSCORRUPTED;
 
-	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
	if (!gdp)
		return -EIO;
@@ -4681,27 +4416,28 @@
	 * Figure out the offset within the block group inode table
	 */
	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
-	inode_offset = ((inode->i_ino - 1) %
+	inode_offset = ((ino - 1) %
			EXT4_INODES_PER_GROUP(sb));
-	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+	block = ext4_inode_table(sb, gdp);
+	if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
+	    (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
+		ext4_error(sb, "Invalid inode table block %llu in "
+			   "block_group %u", block, iloc->block_group);
+		return -EFSCORRUPTED;
+	}
+	block += (inode_offset / inodes_per_block);
 
	bh = sb_getblk(sb, block);
	if (unlikely(!bh))
		return -ENOMEM;
+	if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
+		goto simulate_eio;
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
 
-		/*
-		 * If the buffer has the write error flag, we have failed
-		 * to write out another inode in the same block. In this
-		 * case, we don't have to read the block because we may
-		 * read the old inode data successfully.
-		 */
-		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
-			set_buffer_uptodate(bh);
-
-		if (buffer_uptodate(bh)) {
+		if (ext4_buffer_uptodate(bh)) {
			/* someone brought it uptodate while we waited */
			unlock_buffer(bh);
			goto has_buffer;
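The (group, block, offset) arithmetic above deserves a worked example. Assuming an illustrative geometry of 8192 inodes per group, 256-byte inodes and 4 KiB blocks (so 16 inodes per block):

#include <stdio.h>

int main(void)
{
	unsigned long ino = 12345;	/* inode numbers are 1-based */
	unsigned long inodes_per_group = 8192;
	unsigned long inode_size = 256, block_size = 4096;
	unsigned long inodes_per_block = block_size / inode_size;

	unsigned long group = (ino - 1) / inodes_per_group;
	unsigned long index = (ino - 1) % inodes_per_group;
	unsigned long block_in_table = index / inodes_per_block;
	unsigned long offset = (index % inodes_per_block) * inode_size;

	/* Prints: group=1 block_in_table=259 offset=2048 */
	printf("group=%lu block_in_table=%lu offset=%lu\n",
	       group, block_in_table, offset);
	return 0;
}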
@@ -4753,6 +4489,7 @@
	 * If we need to do any I/O, try to pre-readahead extra
	 * blocks from the inode table.
	 */
+	blk_start_plug(&plug);
	if (EXT4_SB(sb)->s_inode_readahead_blks) {
		ext4_fsblk_t b, end, table;
		unsigned num;
@@ -4771,7 +4508,7 @@
		if (end > table)
			end = table;
		while (b <= end)
-			sb_breadahead_unmovable(sb, b++);
+			ext4_sb_breadahead_unmovable(sb, b++);
	}
 
	/*
@@ -4779,14 +4516,14 @@
	 * has in-inode xattrs, or we don't have this inode in memory.
	 * Read the block from disk.
	 */
-	trace_ext4_load_inode(inode);
-	get_bh(bh);
-	bh->b_end_io = end_buffer_read_sync;
-	submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+	trace_ext4_load_inode(sb, ino);
+	ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
+	blk_finish_plug(&plug);
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh)) {
-		EXT4_ERROR_INODE_BLOCK(inode, block,
-				       "unable to read itable block");
+	simulate_eio:
+		if (ret_block)
+			*ret_block = block;
		brelse(bh);
		return -EIO;
	}
@@ -4796,16 +4533,50 @@
	return 0;
 }
 
-int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
+static int __ext4_get_inode_loc_noinmem(struct inode *inode,
+					struct ext4_iloc *iloc)
 {
-	/* We have all inode data except xattrs in memory here. */
-	return __ext4_get_inode_loc(inode, iloc,
-		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
+	ext4_fsblk_t err_blk = 0;
+	int ret;
+
+	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0,
+					&err_blk);
+
+	if (ret == -EIO)
+		ext4_error_inode_block(inode, err_blk, EIO,
+					"unable to read itable block");
+
+	return ret;
 }
 
-static bool ext4_should_use_dax(struct inode *inode)
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 {
-	if (!test_opt(inode->i_sb, DAX))
+	ext4_fsblk_t err_blk = 0;
+	int ret;
+
+	/* We have all inode data except xattrs in memory here. */
+	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc,
+		!ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk);
+
+	if (ret == -EIO)
+		ext4_error_inode_block(inode, err_blk, EIO,
+					"unable to read itable block");
+
+	return ret;
+}
+
+
+int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+			  struct ext4_iloc *iloc)
+{
+	return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL);
+}
+
+static bool ext4_should_enable_dax(struct inode *inode)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	if (test_opt2(inode->i_sb, DAX_NEVER))
		return false;
	if (!S_ISREG(inode->i_mode))
		return false;
@@ -4817,13 +4588,20 @@
		return false;
	if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
		return false;
-	return true;
+	if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
+		return false;
+	if (test_opt(inode->i_sb, DAX_ALWAYS))
+		return true;
+
+	return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
 }
 
-void ext4_set_inode_flags(struct inode *inode)
+void ext4_set_inode_flags(struct inode *inode, bool init)
 {
	unsigned int flags = EXT4_I(inode)->i_flags;
	unsigned int new_fl = 0;
+
+	WARN_ON_ONCE(IS_DAX(inode) && init);
 
	if (flags & EXT4_SYNC_FL)
		new_fl |= S_SYNC;
@@ -4835,8 +4613,13 @@
		new_fl |= S_NOATIME;
	if (flags & EXT4_DIRSYNC_FL)
		new_fl |= S_DIRSYNC;
-	if (ext4_should_use_dax(inode))
+
+	/* Because of the way inode_set_flags() works we must preserve S_DAX
+	 * here if already set. */
+	new_fl |= (inode->i_flags & S_DAX);
+	if (init && ext4_should_enable_dax(inode))
		new_fl |= S_DAX;
+
	if (flags & EXT4_ENCRYPT_FL)
		new_fl |= S_ENCRYPTED;
	if (flags & EXT4_CASEFOLD_FL)
@@ -4877,11 +4660,15 @@
		__le32 *magic = (void *)raw_inode +
				EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
 
-		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
-		    EXT4_INODE_SIZE(inode->i_sb) &&
+		if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
		    *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+			int err;
+
			ext4_set_inode_state(inode, EXT4_STATE_XATTR);
-			return ext4_find_inline_data_nolock(inode);
+			err = ext4_find_inline_data_nolock(inode);
+			if (!err && ext4_has_inline_data(inode))
+				ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+			return err;
		} else
			EXT4_I(inode)->i_inline_off = 0;
	return 0;
@@ -4915,6 +4702,24 @@
	return inode_peek_iversion(inode);
 }
 
+static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
+
+{
+	if (flags & EXT4_IGET_EA_INODE) {
+		if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+			return "missing EA_INODE flag";
+		if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+		    EXT4_I(inode)->i_file_acl)
+			return "ea_inode with extended attributes";
+	} else {
+		if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+			return "unexpected EA_INODE flag";
+	}
+	if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
+		return "unexpected bad inode w/o EXT4_IGET_BAD";
+	return NULL;
+}
+
 struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
			  ext4_iget_flags flags, const char *function,
			  unsigned int line)
@@ -4923,6 +4728,7 @@
	struct ext4_inode *raw_inode;
	struct ext4_inode_info *ei;
	struct inode *inode;
+	const char *err_str;
	journal_t *journal = EXT4_SB(sb)->s_journal;
	long ret;
	loff_t size;
@@ -4937,7 +4743,7 @@
	    (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
		if (flags & EXT4_IGET_HANDLE)
			return ERR_PTR(-ESTALE);
-		__ext4_error(sb, function, line,
+		__ext4_error(sb, function, line, EFSCORRUPTED, 0,
			     "inode #%lu: comm %s: iget: illegal inode #",
			     ino, current->comm);
		return ERR_PTR(-EFSCORRUPTED);
@@ -4946,23 +4752,22 @@
	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW))
+	if (!(inode->i_state & I_NEW)) {
+		if ((err_str = check_igot_inode(inode, flags)) != NULL) {
+			ext4_error_inode(inode, function, line, 0, err_str);
+			iput(inode);
+			return ERR_PTR(-EFSCORRUPTED);
+		}
		return inode;
+	}
 
	ei = EXT4_I(inode);
	iloc.bh = NULL;
 
-	ret = __ext4_get_inode_loc(inode, &iloc, 0);
+	ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
	if (ret < 0)
		goto bad_inode;
	raw_inode = ext4_raw_inode(&iloc);
-
-	if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) {
-		ext4_error_inode(inode, function, line, 0,
-				 "iget: root inode unallocated");
-		ret = -EFSCORRUPTED;
-		goto bad_inode;
-	}
 
	if ((flags & EXT4_IGET_HANDLE) &&
	    (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
@@ -4998,9 +4803,11 @@
				       sizeof(gen));
	}
 
-	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
-		ext4_error_inode(inode, function, line, 0,
-				 "iget: checksum invalid");
+	if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
+	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
+	    (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
+		ext4_error_inode_err(inode, function, line, 0,
+				EFSBADCRC, "iget: checksum invalid");
		ret = -EFSBADCRC;
		goto bad_inode;
	}
@@ -5034,11 +4841,16 @@
	 * NeilBrown 1999oct15
	 */
	if (inode->i_nlink == 0) {
-		if ((inode->i_mode == 0 ||
+		if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL ||
		     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
		    ino != EXT4_BOOT_LOADER_INO) {
-			/* this inode is deleted */
-			ret = -ESTALE;
+			/* this inode is deleted or unallocated */
+			if (flags & EXT4_IGET_SPECIAL) {
+				ext4_error_inode(inode, function, line, 0,
+						 "iget: special inode unallocated");
+				ret = -EFSCORRUPTED;
+			} else
+				ret = -ESTALE;
			goto bad_inode;
		}
		/* The only unlinked inodes we let through here have
@@ -5049,7 +4861,7 @@
		 * not initialized on a new filesystem. */
	}
	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
-	ext4_set_inode_flags(inode);
+	ext4_set_inode_flags(inode, true);
	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
	if (ext4_has_feature_64bit(sb))
@@ -5088,6 +4900,7 @@
	for (block = 0; block < EXT4_N_BLOCKS; block++)
		ei->i_data[block] = raw_inode->i_block[block];
	INIT_LIST_HEAD(&ei->i_orphan);
+	ext4_fc_init_inode(&ei->vfs_inode);
 
	/*
	 * Set transaction id's of transactions that have to be committed
@@ -5153,9 +4966,10 @@
		goto bad_inode;
	} else if (!ext4_has_inline_data(inode)) {
		/* validate the block references in the inode */
-		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-		   (S_ISLNK(inode->i_mode) &&
-		    !ext4_inode_is_fast_symlink(inode))) {
+		if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+		   (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		   (S_ISLNK(inode->i_mode) &&
+		    !ext4_inode_is_fast_symlink(inode)))) {
			if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
				ret = ext4_ext_check_inode(inode);
			else
@@ -5212,10 +5026,15 @@
		goto bad_inode;
	}
	if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
-		EXT4_ERROR_INODE(inode,
+		ext4_error_inode(inode, function, line, 0,
				 "casefold flag without casefold feature");
-	brelse(iloc.bh);
+	if ((err_str = check_igot_inode(inode, flags)) != NULL) {
+		ext4_error_inode(inode, function, line, 0, err_str);
+		ret = -EFSCORRUPTED;
+		goto bad_inode;
+	}
 
+	brelse(iloc.bh);
	unlock_new_inode(inode);
	return inode;
 
@@ -5264,21 +5083,22 @@
	return 0;
 }
 
-struct other_inode {
-	unsigned long orig_ino;
-	struct ext4_inode *raw_inode;
-};
-
-static int other_inode_match(struct inode * inode, unsigned long ino,
-			     void *data)
+static void __ext4_update_other_inode_time(struct super_block *sb,
+					   unsigned long orig_ino,
+					   unsigned long ino,
+					   struct ext4_inode *raw_inode)
 {
-	struct other_inode *oi = (struct other_inode *) data;
+	struct inode *inode;
 
-	if ((inode->i_ino != ino) ||
-	    (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+	inode = find_inode_by_ino_rcu(sb, ino);
+	if (!inode)
+		return;
+
+	if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
			       I_DIRTY_INODE)) ||
	    ((inode->i_state & I_DIRTY_TIME) == 0))
-		return 0;
+		return;
+
	spin_lock(&inode->i_lock);
	if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
				I_DIRTY_INODE)) == 0) &&
@@ -5289,16 +5109,15 @@
		spin_unlock(&inode->i_lock);
 
		spin_lock(&ei->i_raw_lock);
-		EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
-		EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
-		EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
-		ext4_inode_csum_set(inode, oi->raw_inode, ei);
+		EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+		EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+		EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+		ext4_inode_csum_set(inode, raw_inode, ei);
		spin_unlock(&ei->i_raw_lock);
-		trace_ext4_other_inode_update_time(inode, oi->orig_ino);
-		return -1;
+		trace_ext4_other_inode_update_time(inode, orig_ino);
+		return;
	}
	spin_unlock(&inode->i_lock);
-	return -1;
 }
 
 /*
@@ -5308,24 +5127,24 @@
 static void ext4_update_other_inodes_time(struct super_block *sb,
					  unsigned long orig_ino, char *buf)
 {
-	struct other_inode oi;
	unsigned long ino;
	int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
	int inode_size = EXT4_INODE_SIZE(sb);
 
-	oi.orig_ino = orig_ino;
	/*
	 * Calculate the first inode in the inode table block. Inode
	 * numbers are one-based. That is, the first inode in a block
	 * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
	 */
	ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
+	rcu_read_lock();
	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
		if (ino == orig_ino)
			continue;
-		oi.raw_inode = (struct ext4_inode *) buf;
-		(void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+		__ext4_update_other_inode_time(sb, orig_ino, ino,
					       (struct ext4_inode *)buf);
	}
+	rcu_read_unlock();
 }
 
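The masking trick that finds the first inode sharing an inode-table block relies on inodes_per_block being a power of two. A stand-alone check of the comment's claim:

#include <stdio.h>

int main(void)
{
	unsigned long inodes_per_block = 16;	/* 4 KiB blocks, 256 B inodes */
	unsigned long orig_ino = 12345;

	/* First inode in the same itable block; inode numbers are 1-based. */
	unsigned long ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;

	/* 12345 lives in the block holding inodes 12337..12352. */
	printf("first ino in block: %lu\n", ino);
	return 0;
}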
53315150 /*
....@@ -5535,12 +5354,12 @@
55355354 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
55365355 return 0;
55375356
5538
- err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
5357
+ err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
55395358 EXT4_I(inode)->i_sync_tid);
55405359 } else {
55415360 struct ext4_iloc iloc;
55425361
5543
- err = __ext4_get_inode_loc(inode, &iloc, 0);
5362
+ err = __ext4_get_inode_loc_noinmem(inode, &iloc);
55445363 if (err)
55455364 return err;
55465365 /*
....@@ -5550,8 +5369,8 @@
55505369 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
55515370 sync_dirty_buffer(iloc.bh);
55525371 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5553
- EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
5554
- "IO error syncing inode");
5372
+ ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
5373
+ "IO error syncing inode");
55555374 err = -EIO;
55565375 }
55575376 brelse(iloc.bh);
....@@ -5664,6 +5483,7 @@
56645483 if (error)
56655484 return error;
56665485 }
5486
+
56675487 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
56685488 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
56695489 handle_t *handle;
....@@ -5697,37 +5517,58 @@
56975517 inode->i_gid = attr->ia_gid;
56985518 error = ext4_mark_inode_dirty(handle, inode);
56995519 ext4_journal_stop(handle);
5520
+ if (unlikely(error)) {
5521
+ return error;
5522
+ }
57005523 }
57015524
57025525 if (attr->ia_valid & ATTR_SIZE) {
57035526 handle_t *handle;
57045527 loff_t oldsize = inode->i_size;
5705
- int shrink = (attr->ia_size <= inode->i_size);
5528
+ loff_t old_disksize;
5529
+ int shrink = (attr->ia_size < inode->i_size);
57065530
57075531 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
57085532 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
57095533
5710
- if (attr->ia_size > sbi->s_bitmap_maxbytes)
5534
+ if (attr->ia_size > sbi->s_bitmap_maxbytes) {
57115535 return -EFBIG;
5536
+ }
57125537 }
5713
- if (!S_ISREG(inode->i_mode))
5538
+ if (!S_ISREG(inode->i_mode)) {
57145539 return -EINVAL;
5540
+ }
57155541
57165542 if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
57175543 inode_inc_iversion(inode);
57185544
5719
- if (ext4_should_order_data(inode) &&
5720
- (attr->ia_size < inode->i_size)) {
5721
- error = ext4_begin_ordered_truncate(inode,
5545
+ if (shrink) {
5546
+ if (ext4_should_order_data(inode)) {
5547
+ error = ext4_begin_ordered_truncate(inode,
57225548 attr->ia_size);
5723
- if (error)
5724
- goto err_out;
5549
+ if (error)
5550
+ goto err_out;
5551
+ }
5552
+ /*
5553
+ * Blocks are going to be removed from the inode. Wait
5554
+ * for dio in flight.
5555
+ */
5556
+ inode_dio_wait(inode);
57255557 }
5558
+
5559
+ down_write(&EXT4_I(inode)->i_mmap_sem);
5560
+
5561
+ rc = ext4_break_layouts(inode);
5562
+ if (rc) {
5563
+ up_write(&EXT4_I(inode)->i_mmap_sem);
5564
+ goto err_out;
5565
+ }
5566
+
57265567 if (attr->ia_size != inode->i_size) {
57275568 handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
57285569 if (IS_ERR(handle)) {
57295570 error = PTR_ERR(handle);
5730
- goto err_out;
5571
+ goto out_mmap_sem;
57315572 }
57325573 if (ext4_handle_valid(handle) && shrink) {
57335574 error = ext4_orphan_add(handle, inode);
@@ -5741,7 +5582,22 @@
				inode->i_mtime = current_time(inode);
				inode->i_ctime = inode->i_mtime;
			}
+
+			if (shrink)
+				ext4_fc_track_range(handle, inode,
+					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+					inode->i_sb->s_blocksize_bits,
+					EXT_MAX_BLOCKS - 1);
+			else
+				ext4_fc_track_range(
+					handle, inode,
+					(oldsize > 0 ? oldsize - 1 : oldsize) >>
+					inode->i_sb->s_blocksize_bits,
+					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+					inode->i_sb->s_blocksize_bits);
+
			down_write(&EXT4_I(inode)->i_data_sem);
+			old_disksize = EXT4_I(inode)->i_disksize;
			EXT4_I(inode)->i_disksize = attr->ia_size;
			rc = ext4_mark_inode_dirty(handle, inode);
			if (!error)
@@ -5753,32 +5609,18 @@
			 */
			if (!error)
				i_size_write(inode, attr->ia_size);
+			else
+				EXT4_I(inode)->i_disksize = old_disksize;
			up_write(&EXT4_I(inode)->i_data_sem);
			ext4_journal_stop(handle);
-			if (error) {
-				if (orphan && inode->i_nlink)
-					ext4_orphan_del(NULL, inode);
-				goto err_out;
+			if (error)
+				goto out_mmap_sem;
+			if (!shrink) {
+				pagecache_isize_extended(inode, oldsize,
+							 inode->i_size);
+			} else if (ext4_should_journal_data(inode)) {
+				ext4_wait_for_tail_page_commit(inode);
			}
-		}
-		if (!shrink) {
-			pagecache_isize_extended(inode, oldsize, inode->i_size);
-		} else {
-			/*
-			 * Blocks are going to be removed from the inode. Wait
-			 * for dio in flight.
-			 */
-			inode_dio_wait(inode);
-		}
-		if (orphan && ext4_should_journal_data(inode))
-			ext4_wait_for_tail_page_commit(inode);
-		down_write(&EXT4_I(inode)->i_mmap_sem);
-
-		rc = ext4_break_layouts(inode);
-		if (rc) {
-			up_write(&EXT4_I(inode)->i_mmap_sem);
-			error = rc;
-			goto err_out;
		}
 
		/*
@@ -5786,11 +5628,16 @@
		 * in data=journal mode to make pages freeable.
		 */
		truncate_pagecache(inode, inode->i_size);
-		if (shrink) {
+		/*
+		 * Call ext4_truncate() even if i_size didn't change to
+		 * truncate possible preallocated blocks.
+		 */
+		if (attr->ia_size <= oldsize) {
			rc = ext4_truncate(inode);
			if (rc)
				error = rc;
		}
+out_mmap_sem:
		up_write(&EXT4_I(inode)->i_mmap_sem);
	}
 
@@ -5810,7 +5657,8 @@
		rc = posix_acl_chmod(inode, inode->i_mode);
 
 err_out:
-	ext4_std_error(inode->i_sb, error);
+	if (error)
+		ext4_std_error(inode->i_sb, error);
	if (!error)
		error = rc;
	return error;
@@ -5824,7 +5672,8 @@
	struct ext4_inode *raw_inode;
	struct ext4_inode_info *ei = EXT4_I(inode);
	unsigned int flags;
-	if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
+	if ((request_mask & STATX_BTIME) &&
+	    EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
		stat->result_mask |= STATX_BTIME;
		stat->btime.tv_sec = ei->i_crtime.tv_sec;
		stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
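With this change the inode birth time is only copied out when the caller actually asked for it. From user space that request is made via statx(2); a runnable example (glibc 2.28 or later exposes the wrapper):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	if (statx(AT_FDCWD, "testfile", 0, STATX_BTIME, &stx))
		return 1;

	/* STATX_BTIME is only set in stx_mask if the fs provided it. */
	if (stx.stx_mask & STATX_BTIME)
		printf("btime: %lld\n", (long long)stx.stx_btime.tv_sec);
	else
		printf("btime not available\n");
	return 0;
}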
@@ -5993,7 +5842,14 @@
		put_bh(iloc->bh);
		return -EIO;
	}
-	if (IS_I_VERSION(inode))
+	ext4_fc_track_inode(handle, inode);
+
+	/*
+	 * ea_inodes are using i_version for storing reference count, don't
+	 * mess with it
+	 */
+	if (IS_I_VERSION(inode) &&
+	    !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
		inode_inc_iversion(inode);
 
	/* the do_update_inode consumes one bh->b_count */
@@ -6070,6 +5926,14 @@
		return 0;
	}
 
+	/*
+	 * We may need to allocate external xattr block so we need quotas
+	 * initialized. Here we can be called with various locks held so we
+	 * cannot afford to initialize quotas ourselves. So just bail.
+	 */
+	if (dquot_initialize_needed(inode))
+		return -EAGAIN;
+
	/* try to expand with EAs present */
	error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
					   raw_inode, handle);
@@ -6107,9 +5971,8 @@
	 * If this is felt to be critical, then e2fsck should be run to
	 * force a large enough s_min_extra_isize.
	 */
-	if (ext4_handle_valid(handle) &&
-	    jbd2_journal_extend(handle,
-				EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0)
+	if (ext4_journal_extend(handle,
+				EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
		return -ENOSPC;
 
	if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
@@ -6178,7 +6041,8 @@
  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
  * we start and wait on commits.
  */
-int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
+int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
+			    const char *func, unsigned int line)
 {
	struct ext4_iloc iloc;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -6188,13 +6052,18 @@
	trace_ext4_mark_inode_dirty(inode, _RET_IP_);
	err = ext4_reserve_inode_write(handle, inode, &iloc);
	if (err)
-		return err;
+		goto out;
 
	if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
		ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize,
					       iloc, handle);
 
-	return ext4_mark_iloc_dirty(handle, inode, &iloc);
+	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+out:
+	if (unlikely(err))
+		ext4_error_inode_err(inode, func, line, 0, err,
+					"mark_inode_dirty error");
+	return err;
 }
 
 /*
@@ -6231,36 +6100,6 @@
 out:
	return;
 }
-
-#if 0
-/*
- * Bind an inode's backing buffer_head into this transaction, to prevent
- * it from being flushed to disk early. Unlike
- * ext4_reserve_inode_write, this leaves behind no bh reference and
- * returns no iloc structure, so the caller needs to repeat the iloc
- * lookup to mark the inode dirty later.
- */
-static int ext4_pin_inode(handle_t *handle, struct inode *inode)
-{
-	struct ext4_iloc iloc;
-
-	int err = 0;
-	if (handle) {
-		err = ext4_get_inode_loc(inode, &iloc);
-		if (!err) {
-			BUFFER_TRACE(iloc.bh, "get_write_access");
-			err = jbd2_journal_get_write_access(handle, iloc.bh);
-			if (!err)
-				err = ext4_handle_dirty_metadata(handle,
-								 NULL,
-								 iloc.bh);
-			brelse(iloc.bh);
-		}
-	}
-	ext4_std_error(inode->i_sb, err);
-	return err;
-}
-#endif
 
 int ext4_change_inode_journal_flag(struct inode *inode, int val)
62666105 {
....@@ -6341,6 +6180,8 @@
63416180 if (IS_ERR(handle))
63426181 return PTR_ERR(handle);
63436182
6183
+ ext4_fc_mark_ineligible(inode->i_sb,
6184
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
63446185 err = ext4_mark_inode_dirty(handle, inode);
63456186 ext4_handle_sync(handle);
63466187 ext4_journal_stop(handle);
....@@ -6354,13 +6195,14 @@
63546195 return !buffer_mapped(bh);
63556196 }
63566197
6357
-int ext4_page_mkwrite(struct vm_fault *vmf)
6198
+vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
63586199 {
63596200 struct vm_area_struct *vma = vmf->vma;
63606201 struct page *page = vmf->page;
63616202 loff_t size;
63626203 unsigned long len;
6363
- int ret;
6204
+ int err;
6205
+ vm_fault_t ret;
63646206 struct file *file = vma->vm_file;
63656207 struct inode *inode = file_inode(file);
63666208 struct address_space *mapping = inode->i_mapping;
....@@ -6376,18 +6218,26 @@
63766218
63776219 down_read(&EXT4_I(inode)->i_mmap_sem);
63786220
6379
- ret = ext4_convert_inline_data(inode);
6380
- if (ret)
6221
+ err = ext4_convert_inline_data(inode);
6222
+ if (err)
63816223 goto out_ret;
6224
+
6225
+ /*
6226
+ * On data journalling we skip straight to the transaction handle:
6227
+ * there's no delalloc; page truncated will be checked later; the
6228
+ * early return w/ all buffers mapped (calculates size/len) can't
6229
+ * be used; and there's no dioread_nolock, so only ext4_get_block.
6230
+ */
6231
+ if (ext4_should_journal_data(inode))
6232
+ goto retry_alloc;
63826233
63836234 /* Delalloc case is easy... */
63846235 if (test_opt(inode->i_sb, DELALLOC) &&
6385
- !ext4_should_journal_data(inode) &&
63866236 !ext4_nonda_switch(inode->i_sb)) {
63876237 do {
6388
- ret = block_page_mkwrite(vma, vmf,
6238
+ err = block_page_mkwrite(vma, vmf,
63896239 ext4_da_get_block_prep);
6390
- } while (ret == -ENOSPC &&
6240
+ } while (err == -ENOSPC &&
63916241 ext4_should_retry_alloc(inode->i_sb, &retries));
63926242 goto out_ret;
63936243 }
....@@ -6408,6 +6258,9 @@
64086258 /*
64096259 * Return if we have all the buffers mapped. This avoids the need to do
64106260 * journal_start/journal_stop which can block and take a long time
6261
+ *
6262
+ * This cannot be done for data journalling, as we have to add the
6263
+ * inode to the transaction's list to writeprotect pages on commit.
64116264 */
64126265 if (page_has_buffers(page)) {
64136266 if (!ext4_walk_page_buffers(NULL, page_buffers(page),
....@@ -6432,36 +6285,67 @@
64326285 ret = VM_FAULT_SIGBUS;
64336286 goto out;
64346287 }
6435
- ret = block_page_mkwrite(vma, vmf, get_block);
6436
- if (!ret && ext4_should_journal_data(inode)) {
6437
- if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
6438
- PAGE_SIZE, NULL, do_journal_get_write_access)) {
6439
- unlock_page(page);
6440
- ret = VM_FAULT_SIGBUS;
6441
- ext4_journal_stop(handle);
6442
- goto out;
6288
+ /*
6289
+ * Data journalling can't use block_page_mkwrite() because it
6290
+ * will set_buffer_dirty() before do_journal_get_write_access()
6291
+ * thus might hit warning messages for dirty metadata buffers.
6292
+ */
6293
+ if (!ext4_should_journal_data(inode)) {
6294
+ err = block_page_mkwrite(vma, vmf, get_block);
6295
+ } else {
6296
+ lock_page(page);
6297
+ size = i_size_read(inode);
6298
+ /* Page got truncated from under us? */
6299
+ if (page->mapping != mapping || page_offset(page) > size) {
6300
+ ret = VM_FAULT_NOPAGE;
6301
+ goto out_error;
64436302 }
6444
- ext4_set_inode_state(inode, EXT4_STATE_JDATA);
6303
+
6304
+ if (page->index == size >> PAGE_SHIFT)
6305
+ len = size & ~PAGE_MASK;
6306
+ else
6307
+ len = PAGE_SIZE;
6308
+
6309
+ err = __block_write_begin(page, 0, len, ext4_get_block);
6310
+ if (!err) {
6311
+ ret = VM_FAULT_SIGBUS;
6312
+ if (ext4_walk_page_buffers(handle, page_buffers(page),
6313
+ 0, len, NULL, do_journal_get_write_access))
6314
+ goto out_error;
6315
+ if (ext4_walk_page_buffers(handle, page_buffers(page),
6316
+ 0, len, NULL, write_end_fn))
6317
+ goto out_error;
6318
+ if (ext4_jbd2_inode_add_write(handle, inode,
6319
+ page_offset(page), len))
6320
+ goto out_error;
6321
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
6322
+ } else {
6323
+ unlock_page(page);
6324
+ }
64456325 }
64466326 ext4_journal_stop(handle);
6447
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
6327
+ if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
64486328 goto retry_alloc;
64496329 out_ret:
6450
- ret = block_page_mkwrite_return(ret);
6330
+ ret = block_page_mkwrite_return(err);
64516331 out:
64526332 up_read(&EXT4_I(inode)->i_mmap_sem);
64536333 sb_end_pagefault(inode->i_sb);
64546334 return ret;
6335
+out_error:
6336
+ unlock_page(page);
6337
+ ext4_journal_stop(handle);
6338
+ goto out;
64556339 }
64566340
6457
-int ext4_filemap_fault(struct vm_fault *vmf)
6341
+vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
64586342 {
64596343 struct inode *inode = file_inode(vmf->vma->vm_file);
6460
- int err;
6344
+ vm_fault_t ret;
64616345
64626346 down_read(&EXT4_I(inode)->i_mmap_sem);
6463
- err = filemap_fault(vmf);
6347
+ ret = filemap_fault(vmf);
64646348 up_read(&EXT4_I(inode)->i_mmap_sem);
64656349
6466
- return err;
6350
+ return ret;
64676351 }