~hc/RK356X_SDK_RELEASE.git

..	..	@@ -10,14 +10,11 @@
10	10	#include "xfs_log_format.h"
11	11	#include "xfs_trans_resv.h"
12	12	#include "xfs_mount.h"
13		-#include "xfs_da_format.h"
14		-#include "xfs_da_btree.h"
15	13	#include "xfs_inode.h"
16	14	#include "xfs_trans.h"
17	15	#include "xfs_inode_item.h"
18	16	#include "xfs_bmap.h"
19	17	#include "xfs_bmap_util.h"
20		-#include "xfs_error.h"
21	18	#include "xfs_dir2.h"
22	19	#include "xfs_dir2_priv.h"
23	20	#include "xfs_ioctl.h"
..	..	@@ -28,13 +25,45 @@
28	25	#include "xfs_iomap.h"
29	26	#include "xfs_reflink.h"
30	27
31		-#include <linux/dcache.h>
32	28	#include <linux/falloc.h>
33		-#include <linux/pagevec.h>
34	29	#include <linux/backing-dev.h>
35	30	#include <linux/mman.h>
	31	+#include <linux/fadvise.h>
36	32
37	33	static const struct vm_operations_struct xfs_file_vm_ops;
	34	+
	35	+/*
	36	+ * Decide if the given file range is aligned to the size of the fundamental
	37	+ * allocation unit for the file.
	38	+ */
	39	+static bool
	40	+xfs_is_falloc_aligned(
	41	+ struct xfs_inode *ip,
	42	+ loff_t pos,
	43	+ long long int len)
	44	+{
	45	+ struct xfs_mount *mp = ip->i_mount;
	46	+ uint64_t mask;
	47	+
	48	+ if (XFS_IS_REALTIME_INODE(ip)) {
	49	+ if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
	50	+ u64 rextbytes;
	51	+ u32 mod;
	52	+
	53	+ rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
	54	+ div_u64_rem(pos, rextbytes, &mod);
	55	+ if (mod)
	56	+ return false;
	57	+ div_u64_rem(len, rextbytes, &mod);
	58	+ return mod == 0;
	59	+ }
	60	+ mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
	61	+ } else {
	62	+ mask = mp->m_sb.sb_blocksize - 1;
	63	+ }
	64	+
	65	+ return !((pos | len) & mask);
	66	+}
38	67
39	68	int
40	69	xfs_update_prealloc_flags(
..	..	@@ -84,19 +113,57 @@
84	113	int datasync)
85	114	{
86	115	struct xfs_inode *ip = XFS_I(file->f_mapping->host);
87		- struct xfs_mount *mp = ip->i_mount;
88		- xfs_lsn_t lsn = 0;
89	116
90	117	trace_xfs_dir_fsync(ip);
	118	+ return xfs_log_force_inode(ip);
	119	+}
	120	+
	121	+static xfs_csn_t
	122	+xfs_fsync_seq(
	123	+ struct xfs_inode *ip,
	124	+ bool datasync)
	125	+{
	126	+ if (!xfs_ipincount(ip))
	127	+ return 0;
	128	+ if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
	129	+ return 0;
	130	+ return ip->i_itemp->ili_commit_seq;
	131	+}
	132	+
	133	+/*
	134	+ * All metadata updates are logged, which means that we just have to flush the
	135	+ * log up to the latest LSN that touched the inode.
	136	+ *
	137	+ * If we have concurrent fsync/fdatasync() calls, we need them to all block on
	138	+ * the log force before we clear the ili_fsync_fields field. This ensures that
	139	+ * we don't get a racing sync operation that does not wait for the metadata to
	140	+ * hit the journal before returning. If we race with clearing ili_fsync_fields,
	141	+ * then all that will happen is the log force will do nothing as the lsn will
	142	+ * already be on disk. We can't race with setting ili_fsync_fields because that
	143	+ * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
	144	+ * shared until after the ili_fsync_fields is cleared.
	145	+ */
	146	+static int
	147	+xfs_fsync_flush_log(
	148	+ struct xfs_inode *ip,
	149	+ bool datasync,
	150	+ int *log_flushed)
	151	+{
	152	+ int error = 0;
	153	+ xfs_csn_t seq;
91	154
92	155	xfs_ilock(ip, XFS_ILOCK_SHARED);
93		- if (xfs_ipincount(ip))
94		- lsn = ip->i_itemp->ili_last_lsn;
95		- xfs_iunlock(ip, XFS_ILOCK_SHARED);
	156	+ seq = xfs_fsync_seq(ip, datasync);
	157	+ if (seq) {
	158	+ error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
	159	+ log_flushed);
96	160
97		- if (!lsn)
98		- return 0;
99		- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
	161	+ spin_lock(&ip->i_itemp->ili_lock);
	162	+ ip->i_itemp->ili_fsync_fields = 0;
	163	+ spin_unlock(&ip->i_itemp->ili_lock);
	164	+ }
	165	+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
	166	+ return error;
100	167	}
101	168
102	169	STATIC int
..	..	@@ -106,12 +173,10 @@
106	173	loff_t end,
107	174	int datasync)
108	175	{
109		- struct inode *inode = file->f_mapping->host;
110		- struct xfs_inode *ip = XFS_I(inode);
	176	+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
111	177	struct xfs_mount *mp = ip->i_mount;
112	178	int error = 0;
113	179	int log_flushed = 0;
114		- xfs_lsn_t lsn = 0;
115	180
116	181	trace_xfs_file_fsync(ip);
117	182
..	..	@@ -135,31 +200,7 @@
135	200	else if (mp->m_logdev_targp != mp->m_ddev_targp)
136	201	xfs_blkdev_issue_flush(mp->m_ddev_targp);
137	202
138		- /*
139		- * All metadata updates are logged, which means that we just have to
140		- * flush the log up to the latest LSN that touched the inode. If we have
141		- * concurrent fsync/fdatasync() calls, we need them to all block on the
142		- * log force before we clear the ili_fsync_fields field. This ensures
143		- * that we don't get a racing sync operation that does not wait for the
144		- * metadata to hit the journal before returning. If we race with
145		- * clearing the ili_fsync_fields, then all that will happen is the log
146		- * force will do nothing as the lsn will already be on disk. We can't
147		- * race with setting ili_fsync_fields because that is done under
148		- * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
149		- * until after the ili_fsync_fields is cleared.
150		- */
151		- xfs_ilock(ip, XFS_ILOCK_SHARED);
152		- if (xfs_ipincount(ip)) {
153		- if (!datasync ||
154		- (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
155		- lsn = ip->i_itemp->ili_last_lsn;
156		- }
157		-
158		- if (lsn) {
159		- error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
160		- ip->i_itemp->ili_fsync_fields = 0;
161		- }
162		- xfs_iunlock(ip, XFS_ILOCK_SHARED);
	203	+ error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
163	204
164	205	/*
165	206	* If we only have a single device, and the log force about was
..	..	@@ -191,8 +232,14 @@
191	232
192	233	file_accessed(iocb->ki_filp);
193	234
194		- xfs_ilock(ip, XFS_IOLOCK_SHARED);
195		- ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
	235	+ if (iocb->ki_flags & IOCB_NOWAIT) {
	236	+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
	237	+ return -EAGAIN;
	238	+ } else {
	239	+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
	240	+ }
	241	+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
	242	+ is_sync_kiocb(iocb));
196	243	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
197	244
198	245	return ret;
..	..	@@ -219,7 +266,7 @@
219	266	xfs_ilock(ip, XFS_IOLOCK_SHARED);
220	267	}
221	268
222		- ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
	269	+ ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
223	270	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
224	271
225	272	file_accessed(iocb->ki_filp);
..	..	@@ -355,7 +402,7 @@
355	402
356	403	trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
357	404	error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
358		- NULL, &xfs_iomap_ops);
	405	+ NULL, &xfs_buffered_write_iomap_ops);
359	406	if (error)
360	407	return error;
361	408	} else
..	..	@@ -367,40 +414,30 @@
367	414	* lock above. Eventually we should look into a way to avoid
368	415	* the pointless lock roundtrip.
369	416	*/
370		- if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
371		- error = file_update_time(file);
372		- if (error)
373		- return error;
374		- }
375		-
376		- /*
377		- * If we're writing the file then make sure to clear the setuid and
378		- * setgid bits if the process is not being run by root. This keeps
379		- * people from modifying setuid and setgid binaries.
380		- */
381		- if (!IS_NOSEC(inode))
382		- return file_remove_privs(file);
383		- return 0;
	417	+ return file_modified(file);
384	418	}
385	419
386	420	static int
387	421	xfs_dio_write_end_io(
388	422	struct kiocb *iocb,
389	423	ssize_t size,
	424	+ int error,
390	425	unsigned flags)
391	426	{
392	427	struct inode *inode = file_inode(iocb->ki_filp);
393	428	struct xfs_inode *ip = XFS_I(inode);
394	429	loff_t offset = iocb->ki_pos;
395		- int error = 0;
	430	+ unsigned int nofs_flag;
396	431
397	432	trace_xfs_end_io_direct_write(ip, offset, size);
398	433
399	434	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
400	435	return -EIO;
401	436
402		- if (size <= 0)
403		- return size;
	437	+ if (error)
	438	+ return error;
	439	+ if (!size)
	440	+ return 0;
404	441
405	442	/*
406	443	* Capture amount written on completion as we can't reliably account
..	..	@@ -408,10 +445,17 @@
408	445	*/
409	446	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
410	447
	448	+ /*
	449	+ * We can allocate memory here while doing writeback on behalf of
	450	+ * memory reclaim. To avoid memory allocation deadlocks set the
	451	+ * task-wide nofs context for the following operations.
	452	+ */
	453	+ nofs_flag = memalloc_nofs_save();
	454	+
411	455	if (flags & IOMAP_DIO_COW) {
412	456	error = xfs_reflink_end_cow(ip, offset, size);
413	457	if (error)
414		- return error;
	458	+ goto out;
415	459	}
416	460
417	461	/*
..	..	@@ -420,8 +464,10 @@
420	464	* earlier allows a racing dio read to find unwritten extents before
421	465	* they are converted.
422	466	*/
423		- if (flags & IOMAP_DIO_UNWRITTEN)
424		- return xfs_iomap_write_unwritten(ip, offset, size, true);
	467	+ if (flags & IOMAP_DIO_UNWRITTEN) {
	468	+ error = xfs_iomap_write_unwritten(ip, offset, size, true);
	469	+ goto out;
	470	+ }
425	471
426	472	/*
427	473	* We need to update the in-core inode size here so that we don't end up
..	..	@@ -443,8 +489,14 @@
443	489	spin_unlock(&ip->i_flags_lock);
444	490	}
445	491
	492	+out:
	493	+ memalloc_nofs_restore(nofs_flag);
446	494	return error;
447	495	}
	496	+
	497	+static const struct iomap_dio_ops xfs_dio_write_ops = {
	498	+ .end_io = xfs_dio_write_end_io,
	499	+};
448	500
449	501	/*
450	502	* xfs_file_dio_aio_write - handle direct IO writes
..	..	@@ -485,8 +537,7 @@
485	537	int unaligned_io = 0;
486	538	int iolock;
487	539	size_t count = iov_iter_count(from);
488		- struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
489		- mp->m_rtdev_targp : mp->m_ddev_targp;
	540	+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
490	541
491	542	/* DIO must be aligned to device logical sector size */
492	543	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
..	..	@@ -507,9 +558,9 @@
507	558	* We can't properly handle unaligned direct I/O to reflink
508	559	* files yet, as we can't unshare a partial block.
509	560	*/
510		- if (xfs_is_reflink_inode(ip)) {
	561	+ if (xfs_is_cow_inode(ip)) {
511	562	trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
512		- return -EREMCHG;
	563	+ return -ENOTBLK;
513	564	}
514	565	iolock = XFS_IOLOCK_EXCL;
515	566	} else {
..	..	@@ -546,21 +597,19 @@
546	597	}
547	598
548	599	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
549		- ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
550		-
551	600	/*
552		- * If unaligned, this is the only IO in-flight. If it has not yet
553		- * completed, wait on it before we release the iolock to prevent
554		- * subsequent overlapping IO.
	601	+ * If unaligned, this is the only IO in-flight. Wait on it before we
	602	+ * release the iolock to prevent subsequent overlapping IO.
555	603	*/
556		- if (ret == -EIOCBQUEUED && unaligned_io)
557		- inode_dio_wait(inode);
	604	+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
	605	+ &xfs_dio_write_ops,
	606	+ is_sync_kiocb(iocb) || unaligned_io);
558	607	out:
559	608	xfs_iunlock(ip, iolock);
560	609
561	610	/*
562		- * No fallback to buffered IO on errors for XFS, direct IO will either
563		- * complete fully or fail.
	611	+ * No fallback to buffered IO after short writes for XFS, direct I/O
	612	+ * will either complete fully or return an error.
564	613	*/
565	614	ASSERT(ret < 0 || ret == count);
566	615	return ret;
..	..	@@ -593,7 +642,7 @@
593	642	count = iov_iter_count(from);
594	643
595	644	trace_xfs_file_dax_write(ip, count, pos);
596		- ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
	645	+ ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
597	646	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
598	647	i_size_write(inode, iocb->ki_pos);
599	648	error = xfs_setfilesize(ip, pos, ret);
..	..	@@ -640,7 +689,8 @@
640	689	current->backing_dev_info = inode_to_bdi(inode);
641	690
642	691	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
643		- ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
	692	+ ret = iomap_file_buffered_write(iocb, from,
	693	+ &xfs_buffered_write_iomap_ops);
644	694	if (likely(ret >= 0))
645	695	iocb->ki_pos += ret;
646	696
..	..	@@ -719,7 +769,7 @@
719	769	* allow an operation to fall back to buffered mode.
720	770	*/
721	771	ret = xfs_file_dio_aio_write(iocb, from);
722		- if (ret != -EREMCHG)
	772	+ if (ret != -ENOTBLK)
723	773	return ret;
724	774	}
725	775
..	..	@@ -817,14 +867,42 @@
817	867	if (error)
818	868	goto out_unlock;
819	869
	870	+ /*
	871	+ * Must wait for all AIO to complete before we continue as AIO can
	872	+ * change the file size on completion without holding any locks we
	873	+ * currently hold. We must do this first because AIO can update both
	874	+ * the on disk and in memory inode sizes, and the operations that follow
	875	+ * require the in-memory size to be fully up-to-date.
	876	+ */
	877	+ inode_dio_wait(inode);
	878	+
	879	+ /*
	880	+ * Now AIO and DIO has drained we flush and (if necessary) invalidate
	881	+ * the cached range over the first operation we are about to run.
	882	+ *
	883	+ * We care about zero and collapse here because they both run a hole
	884	+ * punch over the range first. Because that can zero data, and the range
	885	+ * of invalidation for the shift operations is much larger, we still do
	886	+ * the required flush for collapse in xfs_prepare_shift().
	887	+ *
	888	+ * Insert has the same range requirements as collapse, and we extend the
	889	+ * file first which can zero data. Hence insert has the same
	890	+ * flush/invalidate requirements as collapse and so they are both
	891	+ * handled at the right time by xfs_prepare_shift().
	892	+ */
	893	+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
	894	+ FALLOC_FL_COLLAPSE_RANGE)) {
	895	+ error = xfs_flush_unmap_range(ip, offset, len);
	896	+ if (error)
	897	+ goto out_unlock;
	898	+ }
	899	+
820	900	if (mode & FALLOC_FL_PUNCH_HOLE) {
821	901	error = xfs_free_file_space(ip, offset, len);
822	902	if (error)
823	903	goto out_unlock;
824	904	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
825		- unsigned int blksize_mask = i_blocksize(inode) - 1;
826		-
827		- if (offset & blksize_mask || len & blksize_mask) {
	905	+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
828	906	error = -EINVAL;
829	907	goto out_unlock;
830	908	}
..	..	@@ -844,10 +922,9 @@
844	922	if (error)
845	923	goto out_unlock;
846	924	} else if (mode & FALLOC_FL_INSERT_RANGE) {
847		- unsigned int blksize_mask = i_blocksize(inode) - 1;
848	925	loff_t isize = i_size_read(inode);
849	926
850		- if (offset & blksize_mask || len & blksize_mask) {
	927	+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
851	928	error = -EINVAL;
852	929	goto out_unlock;
853	930	}
..	..	@@ -879,19 +956,48 @@
879	956	goto out_unlock;
880	957	}
881	958
882		- if (mode & FALLOC_FL_ZERO_RANGE)
883		- error = xfs_zero_file_space(ip, offset, len);
884		- else {
885		- if (mode & FALLOC_FL_UNSHARE_RANGE) {
886		- error = xfs_reflink_unshare(ip, offset, len);
887		- if (error)
888		- goto out_unlock;
	959	+ if (mode & FALLOC_FL_ZERO_RANGE) {
	960	+ /*
	961	+ * Punch a hole and prealloc the range. We use a hole
	962	+ * punch rather than unwritten extent conversion for two
	963	+ * reasons:
	964	+ *
	965	+ * 1.) Hole punch handles partial block zeroing for us.
	966	+ * 2.) If prealloc returns ENOSPC, the file range is
	967	+ * still zero-valued by virtue of the hole punch.
	968	+ */
	969	+ unsigned int blksize = i_blocksize(inode);
	970	+
	971	+ trace_xfs_zero_file_space(ip);
	972	+
	973	+ error = xfs_free_file_space(ip, offset, len);
	974	+ if (error)
	975	+ goto out_unlock;
	976	+
	977	+ len = round_up(offset + len, blksize) -
	978	+ round_down(offset, blksize);
	979	+ offset = round_down(offset, blksize);
	980	+ } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
	981	+ error = xfs_reflink_unshare(ip, offset, len);
	982	+ if (error)
	983	+ goto out_unlock;
	984	+ } else {
	985	+ /*
	986	+ * If always_cow mode we can't use preallocations and
	987	+ * thus should not create them.
	988	+ */
	989	+ if (xfs_is_always_cow_inode(ip)) {
	990	+ error = -EOPNOTSUPP;
	991	+ goto out_unlock;
889	992	}
	993	+ }
	994	+
	995	+ if (!xfs_is_always_cow_inode(ip)) {
890	996	error = xfs_alloc_file_space(ip, offset, len,
891	997	XFS_BMAPI_PREALLOC);
	998	+ if (error)
	999	+ goto out_unlock;
892	1000	}
893		- if (error)
894		- goto out_unlock;
895	1001	}
896	1002
897	1003	if (file->f_flags & O_DSYNC)
..	..	@@ -927,27 +1033,109 @@
927	1033	}
928	1034
929	1035	STATIC int
930		-xfs_file_clone_range(
931		- struct file *file_in,
932		- loff_t pos_in,
933		- struct file *file_out,
934		- loff_t pos_out,
935		- u64 len)
	1036	+xfs_file_fadvise(
	1037	+ struct file *file,
	1038	+ loff_t start,
	1039	+ loff_t end,
	1040	+ int advice)
936	1041	{
937		- return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
938		- len, false);
	1042	+ struct xfs_inode *ip = XFS_I(file_inode(file));
	1043	+ int ret;
	1044	+ int lockflags = 0;
	1045	+
	1046	+ /*
	1047	+ * Operations creating pages in page cache need protection from hole
	1048	+ * punching and similar ops
	1049	+ */
	1050	+ if (advice == POSIX_FADV_WILLNEED) {
	1051	+ lockflags = XFS_IOLOCK_SHARED;
	1052	+ xfs_ilock(ip, lockflags);
	1053	+ }
	1054	+ ret = generic_fadvise(file, start, end, advice);
	1055	+ if (lockflags)
	1056	+ xfs_iunlock(ip, lockflags);
	1057	+ return ret;
939	1058	}
940	1059
941		-STATIC int
942		-xfs_file_dedupe_range(
943		- struct file *file_in,
944		- loff_t pos_in,
945		- struct file *file_out,
946		- loff_t pos_out,
947		- u64 len)
	1060	+/* Does this file, inode, or mount want synchronous writes? */
	1061	+static inline bool xfs_file_sync_writes(struct file *filp)
948	1062	{
949		- return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
950		- len, true);
	1063	+ struct xfs_inode *ip = XFS_I(file_inode(filp));
	1064	+
	1065	+ if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC)
	1066	+ return true;
	1067	+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
	1068	+ return true;
	1069	+ if (IS_SYNC(file_inode(filp)))
	1070	+ return true;
	1071	+
	1072	+ return false;
	1073	+}
	1074	+
	1075	+STATIC loff_t
	1076	+xfs_file_remap_range(
	1077	+ struct file *file_in,
	1078	+ loff_t pos_in,
	1079	+ struct file *file_out,
	1080	+ loff_t pos_out,
	1081	+ loff_t len,
	1082	+ unsigned int remap_flags)
	1083	+{
	1084	+ struct inode *inode_in = file_inode(file_in);
	1085	+ struct xfs_inode *src = XFS_I(inode_in);
	1086	+ struct inode *inode_out = file_inode(file_out);
	1087	+ struct xfs_inode *dest = XFS_I(inode_out);
	1088	+ struct xfs_mount *mp = src->i_mount;
	1089	+ loff_t remapped = 0;
	1090	+ xfs_extlen_t cowextsize;
	1091	+ int ret;
	1092	+
	1093	+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
	1094	+ return -EINVAL;
	1095	+
	1096	+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
	1097	+ return -EOPNOTSUPP;
	1098	+
	1099	+ if (XFS_FORCED_SHUTDOWN(mp))
	1100	+ return -EIO;
	1101	+
	1102	+ /* Prepare and then clone file data. */
	1103	+ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
	1104	+ &len, remap_flags);
	1105	+ if (ret || len == 0)
	1106	+ return ret;
	1107	+
	1108	+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
	1109	+
	1110	+ ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
	1111	+ &remapped);
	1112	+ if (ret)
	1113	+ goto out_unlock;
	1114	+
	1115	+ /*
	1116	+ * Carry the cowextsize hint from src to dest if we're sharing the
	1117	+ * entire source file to the entire destination file, the source file
	1118	+ * has a cowextsize hint, and the destination file does not.
	1119	+ */
	1120	+ cowextsize = 0;
	1121	+ if (pos_in == 0 && len == i_size_read(inode_in) &&
	1122	+ (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	1123	+ pos_out == 0 && len >= i_size_read(inode_out) &&
	1124	+ !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
	1125	+ cowextsize = src->i_d.di_cowextsize;
	1126	+
	1127	+ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
	1128	+ remap_flags);
	1129	+ if (ret)
	1130	+ goto out_unlock;
	1131	+
	1132	+ if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
	1133	+ xfs_log_force_inode(dest);
	1134	+out_unlock:
	1135	+ xfs_iunlock2_io_mmap(src, dest);
	1136	+ if (ret)
	1137	+ trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	1138	+ return remapped > 0 ? remapped : ret;
951	1139	}
952	1140
953	1141	STATIC int
..	..	@@ -959,7 +1147,7 @@
959	1147	return -EFBIG;
960	1148	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
961	1149	return -EIO;
962		- file->f_mode |= FMODE_NOWAIT;
	1150	+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
963	1151	return 0;
964	1152	}
965	1153
..	..	@@ -981,8 +1169,8 @@
981	1169	* certain to have the next operation be a read there.
982	1170	*/
983	1171	mode = xfs_ilock_data_map_shared(ip);
984		- if (ip->i_d.di_nextents > 0)
985		- error = xfs_dir3_data_readahead(ip, 0, -1);
	1172	+ if (ip->i_df.if_nextents > 0)
	1173	+ error = xfs_dir3_data_readahead(ip, 0, 0);
986	1174	xfs_iunlock(ip, mode);
987	1175	return error;
988	1176	}
..	..	@@ -1036,10 +1224,10 @@
1036	1224	default:
1037	1225	return generic_file_llseek(file, offset, whence);
1038	1226	case SEEK_HOLE:
1039		- offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
	1227	+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
1040	1228	break;
1041	1229	case SEEK_DATA:
1042		- offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
	1230	+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
1043	1231	break;
1044	1232	}
1045	1233
..	..	@@ -1052,7 +1240,7 @@
1052	1240	* Locking for serialisation of IO during page faults. This results in a lock
1053	1241	* ordering of:
1054	1242	*
1055		- * mmap_sem (MM)
	1243	+ * mmap_lock (MM)
1056	1244	* sb_start_pagefault(vfs, freeze)
1057	1245	* i_mmaplock (XFS - truncate serialisation)
1058	1246	* page_lock (MM)
..	..	@@ -1079,12 +1267,16 @@
1079	1267	if (IS_DAX(inode)) {
1080	1268	pfn_t pfn;
1081	1269
1082		- ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
	1270	+ ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
	1271	+ (write_fault && !vmf->cow_page) ?
	1272	+ &xfs_direct_write_iomap_ops :
	1273	+ &xfs_read_iomap_ops);
1083	1274	if (ret & VM_FAULT_NEEDDSYNC)
1084	1275	ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1085	1276	} else {
1086	1277	if (write_fault)
1087		- ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
	1278	+ ret = iomap_page_mkwrite(vmf,
	1279	+ &xfs_buffered_write_iomap_ops);
1088	1280	else
1089	1281	ret = filemap_fault(vmf);
1090	1282	}
..	..	@@ -1146,29 +1338,47 @@
1146	1338	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
1147	1339	}
1148	1340
	1341	+static vm_fault_t
	1342	+xfs_filemap_map_pages(
	1343	+ struct vm_fault *vmf,
	1344	+ pgoff_t start_pgoff,
	1345	+ pgoff_t end_pgoff)
	1346	+{
	1347	+ struct inode *inode = file_inode(vmf->vma->vm_file);
	1348	+ vm_fault_t ret;
	1349	+
	1350	+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	1351	+ ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
	1352	+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	1353	+ return ret;
	1354	+}
	1355	+
1149	1356	static const struct vm_operations_struct xfs_file_vm_ops = {
1150	1357	.fault = xfs_filemap_fault,
1151	1358	.huge_fault = xfs_filemap_huge_fault,
1152		- .map_pages = filemap_map_pages,
	1359	+ .map_pages = xfs_filemap_map_pages,
1153	1360	.page_mkwrite = xfs_filemap_page_mkwrite,
1154	1361	.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
1155	1362	};
1156	1363
1157	1364	STATIC int
1158	1365	xfs_file_mmap(
1159		- struct file *filp,
1160		- struct vm_area_struct *vma)
	1366	+ struct file *file,
	1367	+ struct vm_area_struct *vma)
1161	1368	{
	1369	+ struct inode *inode = file_inode(file);
	1370	+ struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
	1371	+
1162	1372	/*
1163		- * We don't support synchronous mappings for non-DAX files. At least
1164		- * until someone comes with a sensible use case.
	1373	+ * We don't support synchronous mappings for non-DAX files and
	1374	+ * for DAX files if underneath dax_device is not synchronous.
1165	1375	*/
1166		- if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
	1376	+ if (!daxdev_mapping_supported(vma, target->bt_daxdev))
1167	1377	return -EOPNOTSUPP;
1168	1378
1169		- file_accessed(filp);
	1379	+ file_accessed(file);
1170	1380	vma->vm_ops = &xfs_file_vm_ops;
1171		- if (IS_DAX(file_inode(filp)))
	1381	+ if (IS_DAX(inode))
1172	1382	vma->vm_flags |= VM_HUGEPAGE;
1173	1383	return 0;
1174	1384	}
..	..	@@ -1179,6 +1389,7 @@
1179	1389	.write_iter = xfs_file_write_iter,
1180	1390	.splice_read = generic_file_splice_read,
1181	1391	.splice_write = iter_file_splice_write,
	1392	+ .iopoll = iomap_dio_iopoll,
1182	1393	.unlocked_ioctl = xfs_file_ioctl,
1183	1394	#ifdef CONFIG_COMPAT
1184	1395	.compat_ioctl = xfs_file_compat_ioctl,
..	..	@@ -1190,8 +1401,8 @@
1190	1401	.fsync = xfs_file_fsync,
1191	1402	.get_unmapped_area = thp_get_unmapped_area,
1192	1403	.fallocate = xfs_file_fallocate,
1193		- .clone_file_range = xfs_file_clone_range,
1194		- .dedupe_file_range = xfs_file_dedupe_range,
	1404	+ .fadvise = xfs_file_fadvise,
	1405	+ .remap_file_range = xfs_file_remap_range,
1195	1406	};
1196	1407
1197	1408	const struct file_operations xfs_dir_file_operations = {