hc
2024-09-20 cf4ce59b3b70238352c7f1729f0f7223214828ad
kernel/fs/xfs/xfs_file.c
....@@ -10,14 +10,11 @@
1010 #include "xfs_log_format.h"
1111 #include "xfs_trans_resv.h"
1212 #include "xfs_mount.h"
13
-#include "xfs_da_format.h"
14
-#include "xfs_da_btree.h"
1513 #include "xfs_inode.h"
1614 #include "xfs_trans.h"
1715 #include "xfs_inode_item.h"
1816 #include "xfs_bmap.h"
1917 #include "xfs_bmap_util.h"
20
-#include "xfs_error.h"
2118 #include "xfs_dir2.h"
2219 #include "xfs_dir2_priv.h"
2320 #include "xfs_ioctl.h"
....@@ -28,13 +25,45 @@
2825 #include "xfs_iomap.h"
2926 #include "xfs_reflink.h"
3027
31
-#include <linux/dcache.h>
3228 #include <linux/falloc.h>
33
-#include <linux/pagevec.h>
3429 #include <linux/backing-dev.h>
3530 #include <linux/mman.h>
31
+#include <linux/fadvise.h>
3632
3733 static const struct vm_operations_struct xfs_file_vm_ops;
34
+
35
+/*
36
+ * Decide if the given file range is aligned to the size of the fundamental
37
+ * allocation unit for the file.
38
+ */
39
+static bool
40
+xfs_is_falloc_aligned(
41
+ struct xfs_inode *ip,
42
+ loff_t pos,
43
+ long long int len)
44
+{
45
+ struct xfs_mount *mp = ip->i_mount;
46
+ uint64_t mask;
47
+
48
+ if (XFS_IS_REALTIME_INODE(ip)) {
49
+ if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
50
+ u64 rextbytes;
51
+ u32 mod;
52
+
53
+ rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
54
+ div_u64_rem(pos, rextbytes, &mod);
55
+ if (mod)
56
+ return false;
57
+ div_u64_rem(len, rextbytes, &mod);
58
+ return mod == 0;
59
+ }
60
+ mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
61
+ } else {
62
+ mask = mp->m_sb.sb_blocksize - 1;
63
+ }
64
+
65
+ return !((pos | len) & mask);
66
+}
3867
3968 int
4069 xfs_update_prealloc_flags(
....@@ -65,8 +94,6 @@
6594 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
6695
6796 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
68
- if (flags & XFS_PREALLOC_SYNC)
69
- xfs_trans_set_sync(tp);
7097 return xfs_trans_commit(tp);
7198 }
7299
....@@ -84,19 +111,57 @@
84111 int datasync)
85112 {
86113 struct xfs_inode *ip = XFS_I(file->f_mapping->host);
87
- struct xfs_mount *mp = ip->i_mount;
88
- xfs_lsn_t lsn = 0;
89114
90115 trace_xfs_dir_fsync(ip);
116
+ return xfs_log_force_inode(ip);
117
+}
118
+
119
+static xfs_csn_t
120
+xfs_fsync_seq(
121
+ struct xfs_inode *ip,
122
+ bool datasync)
123
+{
124
+ if (!xfs_ipincount(ip))
125
+ return 0;
126
+ if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
127
+ return 0;
128
+ return ip->i_itemp->ili_commit_seq;
129
+}
130
+
131
+/*
132
+ * All metadata updates are logged, which means that we just have to flush the
133
+ * log up to the latest LSN that touched the inode.
134
+ *
135
+ * If we have concurrent fsync/fdatasync() calls, we need them to all block on
136
+ * the log force before we clear the ili_fsync_fields field. This ensures that
137
+ * we don't get a racing sync operation that does not wait for the metadata to
138
+ * hit the journal before returning. If we race with clearing ili_fsync_fields,
139
+ * then all that will happen is the log force will do nothing as the lsn will
140
+ * already be on disk. We can't race with setting ili_fsync_fields because that
141
+ * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
142
+ * shared until after the ili_fsync_fields is cleared.
143
+ */
144
+static int
145
+xfs_fsync_flush_log(
146
+ struct xfs_inode *ip,
147
+ bool datasync,
148
+ int *log_flushed)
149
+{
150
+ int error = 0;
151
+ xfs_csn_t seq;
91152
92153 xfs_ilock(ip, XFS_ILOCK_SHARED);
93
- if (xfs_ipincount(ip))
94
- lsn = ip->i_itemp->ili_last_lsn;
95
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
154
+ seq = xfs_fsync_seq(ip, datasync);
155
+ if (seq) {
156
+ error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
157
+ log_flushed);
96158
97
- if (!lsn)
98
- return 0;
99
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
159
+ spin_lock(&ip->i_itemp->ili_lock);
160
+ ip->i_itemp->ili_fsync_fields = 0;
161
+ spin_unlock(&ip->i_itemp->ili_lock);
162
+ }
163
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
164
+ return error;
100165 }
101166
102167 STATIC int
....@@ -106,12 +171,10 @@
106171 loff_t end,
107172 int datasync)
108173 {
109
- struct inode *inode = file->f_mapping->host;
110
- struct xfs_inode *ip = XFS_I(inode);
174
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
111175 struct xfs_mount *mp = ip->i_mount;
112176 int error = 0;
113177 int log_flushed = 0;
114
- xfs_lsn_t lsn = 0;
115178
116179 trace_xfs_file_fsync(ip);
117180
....@@ -135,31 +198,7 @@
135198 else if (mp->m_logdev_targp != mp->m_ddev_targp)
136199 xfs_blkdev_issue_flush(mp->m_ddev_targp);
137200
138
- /*
139
- * All metadata updates are logged, which means that we just have to
140
- * flush the log up to the latest LSN that touched the inode. If we have
141
- * concurrent fsync/fdatasync() calls, we need them to all block on the
142
- * log force before we clear the ili_fsync_fields field. This ensures
143
- * that we don't get a racing sync operation that does not wait for the
144
- * metadata to hit the journal before returning. If we race with
145
- * clearing the ili_fsync_fields, then all that will happen is the log
146
- * force will do nothing as the lsn will already be on disk. We can't
147
- * race with setting ili_fsync_fields because that is done under
148
- * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
149
- * until after the ili_fsync_fields is cleared.
150
- */
151
- xfs_ilock(ip, XFS_ILOCK_SHARED);
152
- if (xfs_ipincount(ip)) {
153
- if (!datasync ||
154
- (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
155
- lsn = ip->i_itemp->ili_last_lsn;
156
- }
157
-
158
- if (lsn) {
159
- error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
160
- ip->i_itemp->ili_fsync_fields = 0;
161
- }
162
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
201
+ error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
163202
164203 /*
165204 * If we only have a single device, and the log force above was
....@@ -191,8 +230,14 @@
191230
192231 file_accessed(iocb->ki_filp);
193232
194
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
195
- ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
233
+ if (iocb->ki_flags & IOCB_NOWAIT) {
234
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
235
+ return -EAGAIN;
236
+ } else {
237
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
238
+ }
239
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
240
+ is_sync_kiocb(iocb));
196241 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
197242
198243 return ret;
....@@ -219,7 +264,7 @@
219264 xfs_ilock(ip, XFS_IOLOCK_SHARED);
220265 }
221266
222
- ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
267
+ ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
223268 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
224269
225270 file_accessed(iocb->ki_filp);
....@@ -355,7 +400,7 @@
355400
356401 trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
357402 error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
358
- NULL, &xfs_iomap_ops);
403
+ NULL, &xfs_buffered_write_iomap_ops);
359404 if (error)
360405 return error;
361406 } else
....@@ -367,40 +412,30 @@
367412 * lock above. Eventually we should look into a way to avoid
368413 * the pointless lock roundtrip.
369414 */
370
- if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
371
- error = file_update_time(file);
372
- if (error)
373
- return error;
374
- }
375
-
376
- /*
377
- * If we're writing the file then make sure to clear the setuid and
378
- * setgid bits if the process is not being run by root. This keeps
379
- * people from modifying setuid and setgid binaries.
380
- */
381
- if (!IS_NOSEC(inode))
382
- return file_remove_privs(file);
383
- return 0;
415
+ return file_modified(file);
384416 }
385417
386418 static int
387419 xfs_dio_write_end_io(
388420 struct kiocb *iocb,
389421 ssize_t size,
422
+ int error,
390423 unsigned flags)
391424 {
392425 struct inode *inode = file_inode(iocb->ki_filp);
393426 struct xfs_inode *ip = XFS_I(inode);
394427 loff_t offset = iocb->ki_pos;
395
- int error = 0;
428
+ unsigned int nofs_flag;
396429
397430 trace_xfs_end_io_direct_write(ip, offset, size);
398431
399432 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
400433 return -EIO;
401434
402
- if (size <= 0)
403
- return size;
435
+ if (error)
436
+ return error;
437
+ if (!size)
438
+ return 0;
404439
405440 /*
406441 * Capture amount written on completion as we can't reliably account
....@@ -408,10 +443,17 @@
408443 */
409444 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
410445
446
+ /*
447
+ * We can allocate memory here while doing writeback on behalf of
448
+ * memory reclaim. To avoid memory allocation deadlocks set the
449
+ * task-wide nofs context for the following operations.
450
+ */
451
+ nofs_flag = memalloc_nofs_save();
452
+
411453 if (flags & IOMAP_DIO_COW) {
412454 error = xfs_reflink_end_cow(ip, offset, size);
413455 if (error)
414
- return error;
456
+ goto out;
415457 }
416458
417459 /*
....@@ -420,8 +462,10 @@
420462 * earlier allows a racing dio read to find unwritten extents before
421463 * they are converted.
422464 */
423
- if (flags & IOMAP_DIO_UNWRITTEN)
424
- return xfs_iomap_write_unwritten(ip, offset, size, true);
465
+ if (flags & IOMAP_DIO_UNWRITTEN) {
466
+ error = xfs_iomap_write_unwritten(ip, offset, size, true);
467
+ goto out;
468
+ }
425469
426470 /*
427471 * We need to update the in-core inode size here so that we don't end up
....@@ -443,8 +487,14 @@
443487 spin_unlock(&ip->i_flags_lock);
444488 }
445489
490
+out:
491
+ memalloc_nofs_restore(nofs_flag);
446492 return error;
447493 }
494
+
495
+static const struct iomap_dio_ops xfs_dio_write_ops = {
496
+ .end_io = xfs_dio_write_end_io,
497
+};
448498
449499 /*
450500 * xfs_file_dio_aio_write - handle direct IO writes
....@@ -485,8 +535,7 @@
485535 int unaligned_io = 0;
486536 int iolock;
487537 size_t count = iov_iter_count(from);
488
- struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
489
- mp->m_rtdev_targp : mp->m_ddev_targp;
538
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
490539
491540 /* DIO must be aligned to device logical sector size */
492541 if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
....@@ -507,9 +556,9 @@
507556 * We can't properly handle unaligned direct I/O to reflink
508557 * files yet, as we can't unshare a partial block.
509558 */
510
- if (xfs_is_reflink_inode(ip)) {
559
+ if (xfs_is_cow_inode(ip)) {
511560 trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
512
- return -EREMCHG;
561
+ return -ENOTBLK;
513562 }
514563 iolock = XFS_IOLOCK_EXCL;
515564 } else {
....@@ -546,21 +595,19 @@
546595 }
547596
548597 trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
549
- ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
550
-
551598 /*
552
- * If unaligned, this is the only IO in-flight. If it has not yet
553
- * completed, wait on it before we release the iolock to prevent
554
- * subsequent overlapping IO.
599
+ * If unaligned, this is the only IO in-flight. Wait on it before we
600
+ * release the iolock to prevent subsequent overlapping IO.
555601 */
556
- if (ret == -EIOCBQUEUED && unaligned_io)
557
- inode_dio_wait(inode);
602
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
603
+ &xfs_dio_write_ops,
604
+ is_sync_kiocb(iocb) || unaligned_io);
558605 out:
559606 xfs_iunlock(ip, iolock);
560607
561608 /*
562
- * No fallback to buffered IO on errors for XFS, direct IO will either
563
- * complete fully or fail.
609
+ * No fallback to buffered IO after short writes for XFS, direct I/O
610
+ * will either complete fully or return an error.
564611 */
565612 ASSERT(ret < 0 || ret == count);
566613 return ret;
....@@ -593,7 +640,7 @@
593640 count = iov_iter_count(from);
594641
595642 trace_xfs_file_dax_write(ip, count, pos);
596
- ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
643
+ ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
597644 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
598645 i_size_write(inode, iocb->ki_pos);
599646 error = xfs_setfilesize(ip, pos, ret);
....@@ -640,7 +687,8 @@
640687 current->backing_dev_info = inode_to_bdi(inode);
641688
642689 trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
643
- ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
690
+ ret = iomap_file_buffered_write(iocb, from,
691
+ &xfs_buffered_write_iomap_ops);
644692 if (likely(ret >= 0))
645693 iocb->ki_pos += ret;
646694
....@@ -719,7 +767,7 @@
719767 * allow an operation to fall back to buffered mode.
720768 */
721769 ret = xfs_file_dio_aio_write(iocb, from);
722
- if (ret != -EREMCHG)
770
+ if (ret != -ENOTBLK)
723771 return ret;
724772 }
725773
....@@ -802,7 +850,6 @@
802850 struct inode *inode = file_inode(file);
803851 struct xfs_inode *ip = XFS_I(inode);
804852 long error;
805
- enum xfs_prealloc_flags flags = 0;
806853 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
807854 loff_t new_size = 0;
808855 bool do_file_insert = false;
....@@ -817,14 +864,46 @@
817864 if (error)
818865 goto out_unlock;
819866
867
+ /*
868
+ * Must wait for all AIO to complete before we continue as AIO can
869
+ * change the file size on completion without holding any locks we
870
+ * currently hold. We must do this first because AIO can update both
871
+ * the on disk and in memory inode sizes, and the operations that follow
872
+ * require the in-memory size to be fully up-to-date.
873
+ */
874
+ inode_dio_wait(inode);
875
+
876
+ /*
877
+ * Now AIO and DIO have drained, we flush and (if necessary) invalidate
878
+ * the cached range over the first operation we are about to run.
879
+ *
880
+ * We care about zero and collapse here because they both run a hole
881
+ * punch over the range first. Because that can zero data, and the range
882
+ * of invalidation for the shift operations is much larger, we still do
883
+ * the required flush for collapse in xfs_prepare_shift().
884
+ *
885
+ * Insert has the same range requirements as collapse, and we extend the
886
+ * file first which can zero data. Hence insert has the same
887
+ * flush/invalidate requirements as collapse and so they are both
888
+ * handled at the right time by xfs_prepare_shift().
889
+ */
890
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
891
+ FALLOC_FL_COLLAPSE_RANGE)) {
892
+ error = xfs_flush_unmap_range(ip, offset, len);
893
+ if (error)
894
+ goto out_unlock;
895
+ }
896
+
897
+ error = file_modified(file);
898
+ if (error)
899
+ goto out_unlock;
900
+
820901 if (mode & FALLOC_FL_PUNCH_HOLE) {
821902 error = xfs_free_file_space(ip, offset, len);
822903 if (error)
823904 goto out_unlock;
824905 } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
825
- unsigned int blksize_mask = i_blocksize(inode) - 1;
826
-
827
- if (offset & blksize_mask || len & blksize_mask) {
906
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
828907 error = -EINVAL;
829908 goto out_unlock;
830909 }
....@@ -844,10 +923,9 @@
844923 if (error)
845924 goto out_unlock;
846925 } else if (mode & FALLOC_FL_INSERT_RANGE) {
847
- unsigned int blksize_mask = i_blocksize(inode) - 1;
848926 loff_t isize = i_size_read(inode);
849927
850
- if (offset & blksize_mask || len & blksize_mask) {
928
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
851929 error = -EINVAL;
852930 goto out_unlock;
853931 }
....@@ -869,8 +947,6 @@
869947 }
870948 do_file_insert = true;
871949 } else {
872
- flags |= XFS_PREALLOC_SET;
873
-
874950 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
875951 offset + len > i_size_read(inode)) {
876952 new_size = offset + len;
....@@ -879,27 +955,49 @@
879955 goto out_unlock;
880956 }
881957
882
- if (mode & FALLOC_FL_ZERO_RANGE)
883
- error = xfs_zero_file_space(ip, offset, len);
884
- else {
885
- if (mode & FALLOC_FL_UNSHARE_RANGE) {
886
- error = xfs_reflink_unshare(ip, offset, len);
887
- if (error)
888
- goto out_unlock;
958
+ if (mode & FALLOC_FL_ZERO_RANGE) {
959
+ /*
960
+ * Punch a hole and prealloc the range. We use a hole
961
+ * punch rather than unwritten extent conversion for two
962
+ * reasons:
963
+ *
964
+ * 1.) Hole punch handles partial block zeroing for us.
965
+ * 2.) If prealloc returns ENOSPC, the file range is
966
+ * still zero-valued by virtue of the hole punch.
967
+ */
968
+ unsigned int blksize = i_blocksize(inode);
969
+
970
+ trace_xfs_zero_file_space(ip);
971
+
972
+ error = xfs_free_file_space(ip, offset, len);
973
+ if (error)
974
+ goto out_unlock;
975
+
976
+ len = round_up(offset + len, blksize) -
977
+ round_down(offset, blksize);
978
+ offset = round_down(offset, blksize);
979
+ } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
980
+ error = xfs_reflink_unshare(ip, offset, len);
981
+ if (error)
982
+ goto out_unlock;
983
+ } else {
984
+ /*
985
+ * In always_cow mode we can't use preallocations and
986
+ * thus should not create them.
987
+ */
988
+ if (xfs_is_always_cow_inode(ip)) {
989
+ error = -EOPNOTSUPP;
990
+ goto out_unlock;
889991 }
992
+ }
993
+
994
+ if (!xfs_is_always_cow_inode(ip)) {
890995 error = xfs_alloc_file_space(ip, offset, len,
891996 XFS_BMAPI_PREALLOC);
997
+ if (error)
998
+ goto out_unlock;
892999 }
893
- if (error)
894
- goto out_unlock;
8951000 }
896
-
897
- if (file->f_flags & O_DSYNC)
898
- flags |= XFS_PREALLOC_SYNC;
899
-
900
- error = xfs_update_prealloc_flags(ip, flags);
901
- if (error)
902
- goto out_unlock;
9031001
9041002 /* Change file size if needed */
9051003 if (new_size) {
....@@ -918,8 +1016,14 @@
9181016 * leave shifted extents past EOF and hence losing access to
9191017 * the data that is contained within them.
9201018 */
921
- if (do_file_insert)
1019
+ if (do_file_insert) {
9221020 error = xfs_insert_file_space(ip, offset, len);
1021
+ if (error)
1022
+ goto out_unlock;
1023
+ }
1024
+
1025
+ if (file->f_flags & O_DSYNC)
1026
+ error = xfs_log_force_inode(ip);
9231027
9241028 out_unlock:
9251029 xfs_iunlock(ip, iolock);
....@@ -927,27 +1031,109 @@
9271031 }
9281032
9291033 STATIC int
930
-xfs_file_clone_range(
931
- struct file *file_in,
932
- loff_t pos_in,
933
- struct file *file_out,
934
- loff_t pos_out,
935
- u64 len)
1034
+xfs_file_fadvise(
1035
+ struct file *file,
1036
+ loff_t start,
1037
+ loff_t end,
1038
+ int advice)
9361039 {
937
- return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
938
- len, false);
1040
+ struct xfs_inode *ip = XFS_I(file_inode(file));
1041
+ int ret;
1042
+ int lockflags = 0;
1043
+
1044
+ /*
1045
+ * Operations creating pages in page cache need protection from hole
1046
+ * punching and similar ops
1047
+ */
1048
+ if (advice == POSIX_FADV_WILLNEED) {
1049
+ lockflags = XFS_IOLOCK_SHARED;
1050
+ xfs_ilock(ip, lockflags);
1051
+ }
1052
+ ret = generic_fadvise(file, start, end, advice);
1053
+ if (lockflags)
1054
+ xfs_iunlock(ip, lockflags);
1055
+ return ret;
9391056 }
9401057
941
-STATIC int
942
-xfs_file_dedupe_range(
943
- struct file *file_in,
944
- loff_t pos_in,
945
- struct file *file_out,
946
- loff_t pos_out,
947
- u64 len)
1058
+/* Does this file, inode, or mount want synchronous writes? */
1059
+static inline bool xfs_file_sync_writes(struct file *filp)
9481060 {
949
- return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
950
- len, true);
1061
+ struct xfs_inode *ip = XFS_I(file_inode(filp));
1062
+
1063
+ if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC)
1064
+ return true;
1065
+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
1066
+ return true;
1067
+ if (IS_SYNC(file_inode(filp)))
1068
+ return true;
1069
+
1070
+ return false;
1071
+}
1072
+
1073
+STATIC loff_t
1074
+xfs_file_remap_range(
1075
+ struct file *file_in,
1076
+ loff_t pos_in,
1077
+ struct file *file_out,
1078
+ loff_t pos_out,
1079
+ loff_t len,
1080
+ unsigned int remap_flags)
1081
+{
1082
+ struct inode *inode_in = file_inode(file_in);
1083
+ struct xfs_inode *src = XFS_I(inode_in);
1084
+ struct inode *inode_out = file_inode(file_out);
1085
+ struct xfs_inode *dest = XFS_I(inode_out);
1086
+ struct xfs_mount *mp = src->i_mount;
1087
+ loff_t remapped = 0;
1088
+ xfs_extlen_t cowextsize;
1089
+ int ret;
1090
+
1091
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1092
+ return -EINVAL;
1093
+
1094
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
1095
+ return -EOPNOTSUPP;
1096
+
1097
+ if (XFS_FORCED_SHUTDOWN(mp))
1098
+ return -EIO;
1099
+
1100
+ /* Prepare and then clone file data. */
1101
+ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1102
+ &len, remap_flags);
1103
+ if (ret || len == 0)
1104
+ return ret;
1105
+
1106
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1107
+
1108
+ ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1109
+ &remapped);
1110
+ if (ret)
1111
+ goto out_unlock;
1112
+
1113
+ /*
1114
+ * Carry the cowextsize hint from src to dest if we're sharing the
1115
+ * entire source file to the entire destination file, the source file
1116
+ * has a cowextsize hint, and the destination file does not.
1117
+ */
1118
+ cowextsize = 0;
1119
+ if (pos_in == 0 && len == i_size_read(inode_in) &&
1120
+ (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1121
+ pos_out == 0 && len >= i_size_read(inode_out) &&
1122
+ !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
1123
+ cowextsize = src->i_d.di_cowextsize;
1124
+
1125
+ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1126
+ remap_flags);
1127
+ if (ret)
1128
+ goto out_unlock;
1129
+
1130
+ if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
1131
+ xfs_log_force_inode(dest);
1132
+out_unlock:
1133
+ xfs_iunlock2_io_mmap(src, dest);
1134
+ if (ret)
1135
+ trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1136
+ return remapped > 0 ? remapped : ret;
9511137 }
9521138
9531139 STATIC int
....@@ -959,7 +1145,7 @@
9591145 return -EFBIG;
9601146 if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
9611147 return -EIO;
962
- file->f_mode |= FMODE_NOWAIT;
1148
+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
9631149 return 0;
9641150 }
9651151
....@@ -981,8 +1167,8 @@
9811167 * certain to have the next operation be a read there.
9821168 */
9831169 mode = xfs_ilock_data_map_shared(ip);
984
- if (ip->i_d.di_nextents > 0)
985
- error = xfs_dir3_data_readahead(ip, 0, -1);
1170
+ if (ip->i_df.if_nextents > 0)
1171
+ error = xfs_dir3_data_readahead(ip, 0, 0);
9861172 xfs_iunlock(ip, mode);
9871173 return error;
9881174 }
....@@ -1036,10 +1222,10 @@
10361222 default:
10371223 return generic_file_llseek(file, offset, whence);
10381224 case SEEK_HOLE:
1039
- offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
1225
+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
10401226 break;
10411227 case SEEK_DATA:
1042
- offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
1228
+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
10431229 break;
10441230 }
10451231
....@@ -1052,7 +1238,7 @@
10521238 * Locking for serialisation of IO during page faults. This results in a lock
10531239 * ordering of:
10541240 *
1055
- * mmap_sem (MM)
1241
+ * mmap_lock (MM)
10561242 * sb_start_pagefault(vfs, freeze)
10571243 * i_mmaplock (XFS - truncate serialisation)
10581244 * page_lock (MM)
....@@ -1079,12 +1265,16 @@
10791265 if (IS_DAX(inode)) {
10801266 pfn_t pfn;
10811267
1082
- ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
1268
+ ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
1269
+ (write_fault && !vmf->cow_page) ?
1270
+ &xfs_direct_write_iomap_ops :
1271
+ &xfs_read_iomap_ops);
10831272 if (ret & VM_FAULT_NEEDDSYNC)
10841273 ret = dax_finish_sync_fault(vmf, pe_size, pfn);
10851274 } else {
10861275 if (write_fault)
1087
- ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
1276
+ ret = iomap_page_mkwrite(vmf,
1277
+ &xfs_buffered_write_iomap_ops);
10881278 else
10891279 ret = filemap_fault(vmf);
10901280 }
....@@ -1146,29 +1336,47 @@
11461336 return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
11471337 }
11481338
1339
+static vm_fault_t
1340
+xfs_filemap_map_pages(
1341
+ struct vm_fault *vmf,
1342
+ pgoff_t start_pgoff,
1343
+ pgoff_t end_pgoff)
1344
+{
1345
+ struct inode *inode = file_inode(vmf->vma->vm_file);
1346
+ vm_fault_t ret;
1347
+
1348
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1349
+ ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
1350
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1351
+ return ret;
1352
+}
1353
+
11491354 static const struct vm_operations_struct xfs_file_vm_ops = {
11501355 .fault = xfs_filemap_fault,
11511356 .huge_fault = xfs_filemap_huge_fault,
1152
- .map_pages = filemap_map_pages,
1357
+ .map_pages = xfs_filemap_map_pages,
11531358 .page_mkwrite = xfs_filemap_page_mkwrite,
11541359 .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
11551360 };
11561361
11571362 STATIC int
11581363 xfs_file_mmap(
1159
- struct file *filp,
1160
- struct vm_area_struct *vma)
1364
+ struct file *file,
1365
+ struct vm_area_struct *vma)
11611366 {
1367
+ struct inode *inode = file_inode(file);
1368
+ struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
1369
+
11621370 /*
1163
- * We don't support synchronous mappings for non-DAX files. At least
1164
- * until someone comes with a sensible use case.
1371
+ * We don't support synchronous mappings for non-DAX files and
1372
+ * for DAX files if the underlying dax_device is not synchronous.
11651373 */
1166
- if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
1374
+ if (!daxdev_mapping_supported(vma, target->bt_daxdev))
11671375 return -EOPNOTSUPP;
11681376
1169
- file_accessed(filp);
1377
+ file_accessed(file);
11701378 vma->vm_ops = &xfs_file_vm_ops;
1171
- if (IS_DAX(file_inode(filp)))
1379
+ if (IS_DAX(inode))
11721380 vma->vm_flags |= VM_HUGEPAGE;
11731381 return 0;
11741382 }
....@@ -1179,6 +1387,7 @@
11791387 .write_iter = xfs_file_write_iter,
11801388 .splice_read = generic_file_splice_read,
11811389 .splice_write = iter_file_splice_write,
1390
+ .iopoll = iomap_dio_iopoll,
11821391 .unlocked_ioctl = xfs_file_ioctl,
11831392 #ifdef CONFIG_COMPAT
11841393 .compat_ioctl = xfs_file_compat_ioctl,
....@@ -1190,8 +1399,8 @@
11901399 .fsync = xfs_file_fsync,
11911400 .get_unmapped_area = thp_get_unmapped_area,
11921401 .fallocate = xfs_file_fallocate,
1193
- .clone_file_range = xfs_file_clone_range,
1194
- .dedupe_file_range = xfs_file_dedupe_range,
1402
+ .fadvise = xfs_file_fadvise,
1403
+ .remap_file_range = xfs_file_remap_range,
11951404 };
11961405
11971406 const struct file_operations xfs_dir_file_operations = {