.. | .. |
---|
10 | 10 | #include "xfs_log_format.h" |
---|
11 | 11 | #include "xfs_trans_resv.h" |
---|
12 | 12 | #include "xfs_mount.h" |
---|
13 | | -#include "xfs_da_format.h" |
---|
14 | | -#include "xfs_da_btree.h" |
---|
15 | 13 | #include "xfs_inode.h" |
---|
16 | 14 | #include "xfs_trans.h" |
---|
17 | 15 | #include "xfs_inode_item.h" |
---|
18 | 16 | #include "xfs_bmap.h" |
---|
19 | 17 | #include "xfs_bmap_util.h" |
---|
20 | | -#include "xfs_error.h" |
---|
21 | 18 | #include "xfs_dir2.h" |
---|
22 | 19 | #include "xfs_dir2_priv.h" |
---|
23 | 20 | #include "xfs_ioctl.h" |
---|
.. | .. |
---|
28 | 25 | #include "xfs_iomap.h" |
---|
29 | 26 | #include "xfs_reflink.h" |
---|
30 | 27 | |
---|
31 | | -#include <linux/dcache.h> |
---|
32 | 28 | #include <linux/falloc.h> |
---|
33 | | -#include <linux/pagevec.h> |
---|
34 | 29 | #include <linux/backing-dev.h> |
---|
35 | 30 | #include <linux/mman.h> |
---|
| 31 | +#include <linux/fadvise.h> |
---|
36 | 32 | |
---|
37 | 33 | static const struct vm_operations_struct xfs_file_vm_ops; |
---|
| 34 | + |
---|
| 35 | +/* |
---|
| 36 | + * Decide if the given file range is aligned to the size of the fundamental |
---|
| 37 | + * allocation unit for the file. |
---|
| 38 | + */ |
---|
| 39 | +static bool |
---|
| 40 | +xfs_is_falloc_aligned( |
---|
| 41 | + struct xfs_inode *ip, |
---|
| 42 | + loff_t pos, |
---|
| 43 | + long long int len) |
---|
| 44 | +{ |
---|
| 45 | + struct xfs_mount *mp = ip->i_mount; |
---|
| 46 | + uint64_t mask; |
---|
| 47 | + |
---|
| 48 | + if (XFS_IS_REALTIME_INODE(ip)) { |
---|
| 49 | + if (!is_power_of_2(mp->m_sb.sb_rextsize)) { |
---|
| 50 | + u64 rextbytes; |
---|
| 51 | + u32 mod; |
---|
| 52 | + |
---|
| 53 | + rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); |
---|
| 54 | + div_u64_rem(pos, rextbytes, &mod); |
---|
| 55 | + if (mod) |
---|
| 56 | + return false; |
---|
| 57 | + div_u64_rem(len, rextbytes, &mod); |
---|
| 58 | + return mod == 0; |
---|
| 59 | + } |
---|
| 60 | + mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1; |
---|
| 61 | + } else { |
---|
| 62 | + mask = mp->m_sb.sb_blocksize - 1; |
---|
| 63 | + } |
---|
| 64 | + |
---|
| 65 | + return !((pos | len) & mask); |
---|
| 66 | +} |
---|
38 | 67 | |
---|
39 | 68 | int |
---|
40 | 69 | xfs_update_prealloc_flags( |
---|
.. | .. |
---|
65 | 94 | ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; |
---|
66 | 95 | |
---|
67 | 96 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
---|
68 | | - if (flags & XFS_PREALLOC_SYNC) |
---|
69 | | - xfs_trans_set_sync(tp); |
---|
70 | 97 | return xfs_trans_commit(tp); |
---|
71 | 98 | } |
---|
72 | 99 | |
---|
.. | .. |
---|
84 | 111 | int datasync) |
---|
85 | 112 | { |
---|
86 | 113 | struct xfs_inode *ip = XFS_I(file->f_mapping->host); |
---|
87 | | - struct xfs_mount *mp = ip->i_mount; |
---|
88 | | - xfs_lsn_t lsn = 0; |
---|
89 | 114 | |
---|
90 | 115 | trace_xfs_dir_fsync(ip); |
---|
| 116 | + return xfs_log_force_inode(ip); |
---|
| 117 | +} |
---|
| 118 | + |
---|
| 119 | +static xfs_csn_t |
---|
| 120 | +xfs_fsync_seq( |
---|
| 121 | + struct xfs_inode *ip, |
---|
| 122 | + bool datasync) |
---|
| 123 | +{ |
---|
| 124 | + if (!xfs_ipincount(ip)) |
---|
| 125 | + return 0; |
---|
| 126 | + if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) |
---|
| 127 | + return 0; |
---|
| 128 | + return ip->i_itemp->ili_commit_seq; |
---|
| 129 | +} |
---|
| 130 | + |
---|
| 131 | +/* |
---|
| 132 | + * All metadata updates are logged, which means that we just have to flush the |
---|
| 133 | + * log up to the latest LSN that touched the inode. |
---|
| 134 | + * |
---|
| 135 | + * If we have concurrent fsync/fdatasync() calls, we need them to all block on |
---|
| 136 | + * the log force before we clear the ili_fsync_fields field. This ensures that |
---|
| 137 | + * we don't get a racing sync operation that does not wait for the metadata to |
---|
| 138 | + * hit the journal before returning. If we race with clearing ili_fsync_fields, |
---|
| 139 | + * then all that will happen is the log force will do nothing as the lsn will |
---|
| 140 | + * already be on disk. We can't race with setting ili_fsync_fields because that |
---|
| 141 | + * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock |
---|
| 142 | + * shared until after the ili_fsync_fields is cleared. |
---|
| 143 | + */ |
---|
| 144 | +static int |
---|
| 145 | +xfs_fsync_flush_log( |
---|
| 146 | + struct xfs_inode *ip, |
---|
| 147 | + bool datasync, |
---|
| 148 | + int *log_flushed) |
---|
| 149 | +{ |
---|
| 150 | + int error = 0; |
---|
| 151 | + xfs_csn_t seq; |
---|
91 | 152 | |
---|
92 | 153 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
---|
93 | | - if (xfs_ipincount(ip)) |
---|
94 | | - lsn = ip->i_itemp->ili_last_lsn; |
---|
95 | | - xfs_iunlock(ip, XFS_ILOCK_SHARED); |
---|
| 154 | + seq = xfs_fsync_seq(ip, datasync); |
---|
| 155 | + if (seq) { |
---|
| 156 | + error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, |
---|
| 157 | + log_flushed); |
---|
96 | 158 | |
---|
97 | | - if (!lsn) |
---|
98 | | - return 0; |
---|
99 | | - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); |
---|
| 159 | + spin_lock(&ip->i_itemp->ili_lock); |
---|
| 160 | + ip->i_itemp->ili_fsync_fields = 0; |
---|
| 161 | + spin_unlock(&ip->i_itemp->ili_lock); |
---|
| 162 | + } |
---|
| 163 | + xfs_iunlock(ip, XFS_ILOCK_SHARED); |
---|
| 164 | + return error; |
---|
100 | 165 | } |
---|
101 | 166 | |
---|
102 | 167 | STATIC int |
---|
.. | .. |
---|
106 | 171 | loff_t end, |
---|
107 | 172 | int datasync) |
---|
108 | 173 | { |
---|
109 | | - struct inode *inode = file->f_mapping->host; |
---|
110 | | - struct xfs_inode *ip = XFS_I(inode); |
---|
| 174 | + struct xfs_inode *ip = XFS_I(file->f_mapping->host); |
---|
111 | 175 | struct xfs_mount *mp = ip->i_mount; |
---|
112 | 176 | int error = 0; |
---|
113 | 177 | int log_flushed = 0; |
---|
114 | | - xfs_lsn_t lsn = 0; |
---|
115 | 178 | |
---|
116 | 179 | trace_xfs_file_fsync(ip); |
---|
117 | 180 | |
---|
.. | .. |
---|
135 | 198 | else if (mp->m_logdev_targp != mp->m_ddev_targp) |
---|
136 | 199 | xfs_blkdev_issue_flush(mp->m_ddev_targp); |
---|
137 | 200 | |
---|
138 | | - /* |
---|
139 | | - * All metadata updates are logged, which means that we just have to |
---|
140 | | - * flush the log up to the latest LSN that touched the inode. If we have |
---|
141 | | - * concurrent fsync/fdatasync() calls, we need them to all block on the |
---|
142 | | - * log force before we clear the ili_fsync_fields field. This ensures |
---|
143 | | - * that we don't get a racing sync operation that does not wait for the |
---|
144 | | - * metadata to hit the journal before returning. If we race with |
---|
145 | | - * clearing the ili_fsync_fields, then all that will happen is the log |
---|
146 | | - * force will do nothing as the lsn will already be on disk. We can't |
---|
147 | | - * race with setting ili_fsync_fields because that is done under |
---|
148 | | - * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared |
---|
149 | | - * until after the ili_fsync_fields is cleared. |
---|
150 | | - */ |
---|
151 | | - xfs_ilock(ip, XFS_ILOCK_SHARED); |
---|
152 | | - if (xfs_ipincount(ip)) { |
---|
153 | | - if (!datasync || |
---|
154 | | - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) |
---|
155 | | - lsn = ip->i_itemp->ili_last_lsn; |
---|
156 | | - } |
---|
157 | | - |
---|
158 | | - if (lsn) { |
---|
159 | | - error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); |
---|
160 | | - ip->i_itemp->ili_fsync_fields = 0; |
---|
161 | | - } |
---|
162 | | - xfs_iunlock(ip, XFS_ILOCK_SHARED); |
---|
| 201 | + error = xfs_fsync_flush_log(ip, datasync, &log_flushed); |
---|
163 | 202 | |
---|
164 | 203 | /* |
---|
165 | 204 | * If we only have a single device, and the log force above was |
---|
.. | .. |
---|
191 | 230 | |
---|
192 | 231 | file_accessed(iocb->ki_filp); |
---|
193 | 232 | |
---|
194 | | - xfs_ilock(ip, XFS_IOLOCK_SHARED); |
---|
195 | | - ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); |
---|
| 233 | + if (iocb->ki_flags & IOCB_NOWAIT) { |
---|
| 234 | + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) |
---|
| 235 | + return -EAGAIN; |
---|
| 236 | + } else { |
---|
| 237 | + xfs_ilock(ip, XFS_IOLOCK_SHARED); |
---|
| 238 | + } |
---|
| 239 | + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, |
---|
| 240 | + is_sync_kiocb(iocb)); |
---|
196 | 241 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
---|
197 | 242 | |
---|
198 | 243 | return ret; |
---|
.. | .. |
---|
219 | 264 | xfs_ilock(ip, XFS_IOLOCK_SHARED); |
---|
220 | 265 | } |
---|
221 | 266 | |
---|
222 | | - ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); |
---|
| 267 | + ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops); |
---|
223 | 268 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
---|
224 | 269 | |
---|
225 | 270 | file_accessed(iocb->ki_filp); |
---|
.. | .. |
---|
355 | 400 | |
---|
356 | 401 | trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); |
---|
357 | 402 | error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, |
---|
358 | | - NULL, &xfs_iomap_ops); |
---|
| 403 | + NULL, &xfs_buffered_write_iomap_ops); |
---|
359 | 404 | if (error) |
---|
360 | 405 | return error; |
---|
361 | 406 | } else |
---|
.. | .. |
---|
367 | 412 | * lock above. Eventually we should look into a way to avoid |
---|
368 | 413 | * the pointless lock roundtrip. |
---|
369 | 414 | */ |
---|
370 | | - if (likely(!(file->f_mode & FMODE_NOCMTIME))) { |
---|
371 | | - error = file_update_time(file); |
---|
372 | | - if (error) |
---|
373 | | - return error; |
---|
374 | | - } |
---|
375 | | - |
---|
376 | | - /* |
---|
377 | | - * If we're writing the file then make sure to clear the setuid and |
---|
378 | | - * setgid bits if the process is not being run by root. This keeps |
---|
379 | | - * people from modifying setuid and setgid binaries. |
---|
380 | | - */ |
---|
381 | | - if (!IS_NOSEC(inode)) |
---|
382 | | - return file_remove_privs(file); |
---|
383 | | - return 0; |
---|
| 415 | + return file_modified(file); |
---|
384 | 416 | } |
---|
385 | 417 | |
---|
386 | 418 | static int |
---|
387 | 419 | xfs_dio_write_end_io( |
---|
388 | 420 | struct kiocb *iocb, |
---|
389 | 421 | ssize_t size, |
---|
| 422 | + int error, |
---|
390 | 423 | unsigned flags) |
---|
391 | 424 | { |
---|
392 | 425 | struct inode *inode = file_inode(iocb->ki_filp); |
---|
393 | 426 | struct xfs_inode *ip = XFS_I(inode); |
---|
394 | 427 | loff_t offset = iocb->ki_pos; |
---|
395 | | - int error = 0; |
---|
| 428 | + unsigned int nofs_flag; |
---|
396 | 429 | |
---|
397 | 430 | trace_xfs_end_io_direct_write(ip, offset, size); |
---|
398 | 431 | |
---|
399 | 432 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) |
---|
400 | 433 | return -EIO; |
---|
401 | 434 | |
---|
402 | | - if (size <= 0) |
---|
403 | | - return size; |
---|
| 435 | + if (error) |
---|
| 436 | + return error; |
---|
| 437 | + if (!size) |
---|
| 438 | + return 0; |
---|
404 | 439 | |
---|
405 | 440 | /* |
---|
406 | 441 | * Capture amount written on completion as we can't reliably account |
---|
.. | .. |
---|
408 | 443 | */ |
---|
409 | 444 | XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); |
---|
410 | 445 | |
---|
| 446 | + /* |
---|
| 447 | + * We can allocate memory here while doing writeback on behalf of |
---|
| 448 | + * memory reclaim. To avoid memory allocation deadlocks set the |
---|
| 449 | + * task-wide nofs context for the following operations. |
---|
| 450 | + */ |
---|
| 451 | + nofs_flag = memalloc_nofs_save(); |
---|
| 452 | + |
---|
411 | 453 | if (flags & IOMAP_DIO_COW) { |
---|
412 | 454 | error = xfs_reflink_end_cow(ip, offset, size); |
---|
413 | 455 | if (error) |
---|
414 | | - return error; |
---|
| 456 | + goto out; |
---|
415 | 457 | } |
---|
416 | 458 | |
---|
417 | 459 | /* |
---|
.. | .. |
---|
420 | 462 | * earlier allows a racing dio read to find unwritten extents before |
---|
421 | 463 | * they are converted. |
---|
422 | 464 | */ |
---|
423 | | - if (flags & IOMAP_DIO_UNWRITTEN) |
---|
424 | | - return xfs_iomap_write_unwritten(ip, offset, size, true); |
---|
| 465 | + if (flags & IOMAP_DIO_UNWRITTEN) { |
---|
| 466 | + error = xfs_iomap_write_unwritten(ip, offset, size, true); |
---|
| 467 | + goto out; |
---|
| 468 | + } |
---|
425 | 469 | |
---|
426 | 470 | /* |
---|
427 | 471 | * We need to update the in-core inode size here so that we don't end up |
---|
.. | .. |
---|
443 | 487 | spin_unlock(&ip->i_flags_lock); |
---|
444 | 488 | } |
---|
445 | 489 | |
---|
| 490 | +out: |
---|
| 491 | + memalloc_nofs_restore(nofs_flag); |
---|
446 | 492 | return error; |
---|
447 | 493 | } |
---|
| 494 | + |
---|
| 495 | +static const struct iomap_dio_ops xfs_dio_write_ops = { |
---|
| 496 | + .end_io = xfs_dio_write_end_io, |
---|
| 497 | +}; |
---|
448 | 498 | |
---|
449 | 499 | /* |
---|
450 | 500 | * xfs_file_dio_aio_write - handle direct IO writes |
---|
.. | .. |
---|
485 | 535 | int unaligned_io = 0; |
---|
486 | 536 | int iolock; |
---|
487 | 537 | size_t count = iov_iter_count(from); |
---|
488 | | - struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? |
---|
489 | | - mp->m_rtdev_targp : mp->m_ddev_targp; |
---|
| 538 | + struct xfs_buftarg *target = xfs_inode_buftarg(ip); |
---|
490 | 539 | |
---|
491 | 540 | /* DIO must be aligned to device logical sector size */ |
---|
492 | 541 | if ((iocb->ki_pos | count) & target->bt_logical_sectormask) |
---|
.. | .. |
---|
507 | 556 | * We can't properly handle unaligned direct I/O to reflink |
---|
508 | 557 | * files yet, as we can't unshare a partial block. |
---|
509 | 558 | */ |
---|
510 | | - if (xfs_is_reflink_inode(ip)) { |
---|
| 559 | + if (xfs_is_cow_inode(ip)) { |
---|
511 | 560 | trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); |
---|
512 | | - return -EREMCHG; |
---|
| 561 | + return -ENOTBLK; |
---|
513 | 562 | } |
---|
514 | 563 | iolock = XFS_IOLOCK_EXCL; |
---|
515 | 564 | } else { |
---|
.. | .. |
---|
546 | 595 | } |
---|
547 | 596 | |
---|
548 | 597 | trace_xfs_file_direct_write(ip, count, iocb->ki_pos); |
---|
549 | | - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); |
---|
550 | | - |
---|
551 | 598 | /* |
---|
552 | | - * If unaligned, this is the only IO in-flight. If it has not yet |
---|
553 | | - * completed, wait on it before we release the iolock to prevent |
---|
554 | | - * subsequent overlapping IO. |
---|
| 599 | + * If unaligned, this is the only IO in-flight. Wait on it before we |
---|
| 600 | + * release the iolock to prevent subsequent overlapping IO. |
---|
555 | 601 | */ |
---|
556 | | - if (ret == -EIOCBQUEUED && unaligned_io) |
---|
557 | | - inode_dio_wait(inode); |
---|
| 602 | + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, |
---|
| 603 | + &xfs_dio_write_ops, |
---|
| 604 | + is_sync_kiocb(iocb) || unaligned_io); |
---|
558 | 605 | out: |
---|
559 | 606 | xfs_iunlock(ip, iolock); |
---|
560 | 607 | |
---|
561 | 608 | /* |
---|
562 | | - * No fallback to buffered IO on errors for XFS, direct IO will either |
---|
563 | | - * complete fully or fail. |
---|
| 609 | + * No fallback to buffered IO after short writes for XFS, direct I/O |
---|
| 610 | + * will either complete fully or return an error. |
---|
564 | 611 | */ |
---|
565 | 612 | ASSERT(ret < 0 || ret == count); |
---|
566 | 613 | return ret; |
---|
.. | .. |
---|
593 | 640 | count = iov_iter_count(from); |
---|
594 | 641 | |
---|
595 | 642 | trace_xfs_file_dax_write(ip, count, pos); |
---|
596 | | - ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); |
---|
| 643 | + ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops); |
---|
597 | 644 | if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { |
---|
598 | 645 | i_size_write(inode, iocb->ki_pos); |
---|
599 | 646 | error = xfs_setfilesize(ip, pos, ret); |
---|
.. | .. |
---|
640 | 687 | current->backing_dev_info = inode_to_bdi(inode); |
---|
641 | 688 | |
---|
642 | 689 | trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); |
---|
643 | | - ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); |
---|
| 690 | + ret = iomap_file_buffered_write(iocb, from, |
---|
| 691 | + &xfs_buffered_write_iomap_ops); |
---|
644 | 692 | if (likely(ret >= 0)) |
---|
645 | 693 | iocb->ki_pos += ret; |
---|
646 | 694 | |
---|
.. | .. |
---|
719 | 767 | * allow an operation to fall back to buffered mode. |
---|
720 | 768 | */ |
---|
721 | 769 | ret = xfs_file_dio_aio_write(iocb, from); |
---|
722 | | - if (ret != -EREMCHG) |
---|
| 770 | + if (ret != -ENOTBLK) |
---|
723 | 771 | return ret; |
---|
724 | 772 | } |
---|
725 | 773 | |
---|
.. | .. |
---|
802 | 850 | struct inode *inode = file_inode(file); |
---|
803 | 851 | struct xfs_inode *ip = XFS_I(inode); |
---|
804 | 852 | long error; |
---|
805 | | - enum xfs_prealloc_flags flags = 0; |
---|
806 | 853 | uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; |
---|
807 | 854 | loff_t new_size = 0; |
---|
808 | 855 | bool do_file_insert = false; |
---|
.. | .. |
---|
817 | 864 | if (error) |
---|
818 | 865 | goto out_unlock; |
---|
819 | 866 | |
---|
| 867 | + /* |
---|
| 868 | + * Must wait for all AIO to complete before we continue as AIO can |
---|
| 869 | + * change the file size on completion without holding any locks we |
---|
| 870 | + * currently hold. We must do this first because AIO can update both |
---|
| 871 | + * the on disk and in memory inode sizes, and the operations that follow |
---|
| 872 | + * require the in-memory size to be fully up-to-date. |
---|
| 873 | + */ |
---|
| 874 | + inode_dio_wait(inode); |
---|
| 875 | + |
---|
| 876 | + /* |
---|
| 877 | + * Now that AIO and DIO have drained, we flush and (if necessary) invalidate |
---|
| 878 | + * the cached range over the first operation we are about to run. |
---|
| 879 | + * |
---|
| 880 | + * We care about zero and collapse here because they both run a hole |
---|
| 881 | + * punch over the range first. Because that can zero data, and the range |
---|
| 882 | + * of invalidation for the shift operations is much larger, we still do |
---|
| 883 | + * the required flush for collapse in xfs_prepare_shift(). |
---|
| 884 | + * |
---|
| 885 | + * Insert has the same range requirements as collapse, and we extend the |
---|
| 886 | + * file first which can zero data. Hence insert has the same |
---|
| 887 | + * flush/invalidate requirements as collapse and so they are both |
---|
| 888 | + * handled at the right time by xfs_prepare_shift(). |
---|
| 889 | + */ |
---|
| 890 | + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | |
---|
| 891 | + FALLOC_FL_COLLAPSE_RANGE)) { |
---|
| 892 | + error = xfs_flush_unmap_range(ip, offset, len); |
---|
| 893 | + if (error) |
---|
| 894 | + goto out_unlock; |
---|
| 895 | + } |
---|
| 896 | + |
---|
| 897 | + error = file_modified(file); |
---|
| 898 | + if (error) |
---|
| 899 | + goto out_unlock; |
---|
| 900 | + |
---|
820 | 901 | if (mode & FALLOC_FL_PUNCH_HOLE) { |
---|
821 | 902 | error = xfs_free_file_space(ip, offset, len); |
---|
822 | 903 | if (error) |
---|
823 | 904 | goto out_unlock; |
---|
824 | 905 | } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { |
---|
825 | | - unsigned int blksize_mask = i_blocksize(inode) - 1; |
---|
826 | | - |
---|
827 | | - if (offset & blksize_mask || len & blksize_mask) { |
---|
| 906 | + if (!xfs_is_falloc_aligned(ip, offset, len)) { |
---|
828 | 907 | error = -EINVAL; |
---|
829 | 908 | goto out_unlock; |
---|
830 | 909 | } |
---|
.. | .. |
---|
844 | 923 | if (error) |
---|
845 | 924 | goto out_unlock; |
---|
846 | 925 | } else if (mode & FALLOC_FL_INSERT_RANGE) { |
---|
847 | | - unsigned int blksize_mask = i_blocksize(inode) - 1; |
---|
848 | 926 | loff_t isize = i_size_read(inode); |
---|
849 | 927 | |
---|
850 | | - if (offset & blksize_mask || len & blksize_mask) { |
---|
| 928 | + if (!xfs_is_falloc_aligned(ip, offset, len)) { |
---|
851 | 929 | error = -EINVAL; |
---|
852 | 930 | goto out_unlock; |
---|
853 | 931 | } |
---|
.. | .. |
---|
869 | 947 | } |
---|
870 | 948 | do_file_insert = true; |
---|
871 | 949 | } else { |
---|
872 | | - flags |= XFS_PREALLOC_SET; |
---|
873 | | - |
---|
874 | 950 | if (!(mode & FALLOC_FL_KEEP_SIZE) && |
---|
875 | 951 | offset + len > i_size_read(inode)) { |
---|
876 | 952 | new_size = offset + len; |
---|
.. | .. |
---|
879 | 955 | goto out_unlock; |
---|
880 | 956 | } |
---|
881 | 957 | |
---|
882 | | - if (mode & FALLOC_FL_ZERO_RANGE) |
---|
883 | | - error = xfs_zero_file_space(ip, offset, len); |
---|
884 | | - else { |
---|
885 | | - if (mode & FALLOC_FL_UNSHARE_RANGE) { |
---|
886 | | - error = xfs_reflink_unshare(ip, offset, len); |
---|
887 | | - if (error) |
---|
888 | | - goto out_unlock; |
---|
| 958 | + if (mode & FALLOC_FL_ZERO_RANGE) { |
---|
| 959 | + /* |
---|
| 960 | + * Punch a hole and prealloc the range. We use a hole |
---|
| 961 | + * punch rather than unwritten extent conversion for two |
---|
| 962 | + * reasons: |
---|
| 963 | + * |
---|
| 964 | + * 1.) Hole punch handles partial block zeroing for us. |
---|
| 965 | + * 2.) If prealloc returns ENOSPC, the file range is |
---|
| 966 | + * still zero-valued by virtue of the hole punch. |
---|
| 967 | + */ |
---|
| 968 | + unsigned int blksize = i_blocksize(inode); |
---|
| 969 | + |
---|
| 970 | + trace_xfs_zero_file_space(ip); |
---|
| 971 | + |
---|
| 972 | + error = xfs_free_file_space(ip, offset, len); |
---|
| 973 | + if (error) |
---|
| 974 | + goto out_unlock; |
---|
| 975 | + |
---|
| 976 | + len = round_up(offset + len, blksize) - |
---|
| 977 | + round_down(offset, blksize); |
---|
| 978 | + offset = round_down(offset, blksize); |
---|
| 979 | + } else if (mode & FALLOC_FL_UNSHARE_RANGE) { |
---|
| 980 | + error = xfs_reflink_unshare(ip, offset, len); |
---|
| 981 | + if (error) |
---|
| 982 | + goto out_unlock; |
---|
| 983 | + } else { |
---|
| 984 | + /* |
---|
| 985 | + * In always_cow mode we can't use preallocations and |
---|
| 986 | + * thus should not create them. |
---|
| 987 | + */ |
---|
| 988 | + if (xfs_is_always_cow_inode(ip)) { |
---|
| 989 | + error = -EOPNOTSUPP; |
---|
| 990 | + goto out_unlock; |
---|
889 | 991 | } |
---|
| 992 | + } |
---|
| 993 | + |
---|
| 994 | + if (!xfs_is_always_cow_inode(ip)) { |
---|
890 | 995 | error = xfs_alloc_file_space(ip, offset, len, |
---|
891 | 996 | XFS_BMAPI_PREALLOC); |
---|
| 997 | + if (error) |
---|
| 998 | + goto out_unlock; |
---|
892 | 999 | } |
---|
893 | | - if (error) |
---|
894 | | - goto out_unlock; |
---|
895 | 1000 | } |
---|
896 | | - |
---|
897 | | - if (file->f_flags & O_DSYNC) |
---|
898 | | - flags |= XFS_PREALLOC_SYNC; |
---|
899 | | - |
---|
900 | | - error = xfs_update_prealloc_flags(ip, flags); |
---|
901 | | - if (error) |
---|
902 | | - goto out_unlock; |
---|
903 | 1001 | |
---|
904 | 1002 | /* Change file size if needed */ |
---|
905 | 1003 | if (new_size) { |
---|
.. | .. |
---|
918 | 1016 | * leave shifted extents past EOF and hence losing access to |
---|
919 | 1017 | * the data that is contained within them. |
---|
920 | 1018 | */ |
---|
921 | | - if (do_file_insert) |
---|
| 1019 | + if (do_file_insert) { |
---|
922 | 1020 | error = xfs_insert_file_space(ip, offset, len); |
---|
| 1021 | + if (error) |
---|
| 1022 | + goto out_unlock; |
---|
| 1023 | + } |
---|
| 1024 | + |
---|
| 1025 | + if (file->f_flags & O_DSYNC) |
---|
| 1026 | + error = xfs_log_force_inode(ip); |
---|
923 | 1027 | |
---|
924 | 1028 | out_unlock: |
---|
925 | 1029 | xfs_iunlock(ip, iolock); |
---|
.. | .. |
---|
927 | 1031 | } |
---|
928 | 1032 | |
---|
929 | 1033 | STATIC int |
---|
930 | | -xfs_file_clone_range( |
---|
931 | | - struct file *file_in, |
---|
932 | | - loff_t pos_in, |
---|
933 | | - struct file *file_out, |
---|
934 | | - loff_t pos_out, |
---|
935 | | - u64 len) |
---|
| 1034 | +xfs_file_fadvise( |
---|
| 1035 | + struct file *file, |
---|
| 1036 | + loff_t start, |
---|
| 1037 | + loff_t end, |
---|
| 1038 | + int advice) |
---|
936 | 1039 | { |
---|
937 | | - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, |
---|
938 | | - len, false); |
---|
| 1040 | + struct xfs_inode *ip = XFS_I(file_inode(file)); |
---|
| 1041 | + int ret; |
---|
| 1042 | + int lockflags = 0; |
---|
| 1043 | + |
---|
| 1044 | + /* |
---|
| 1045 | + * Operations creating pages in page cache need protection from hole |
---|
| 1046 | + * punching and similar ops |
---|
| 1047 | + */ |
---|
| 1048 | + if (advice == POSIX_FADV_WILLNEED) { |
---|
| 1049 | + lockflags = XFS_IOLOCK_SHARED; |
---|
| 1050 | + xfs_ilock(ip, lockflags); |
---|
| 1051 | + } |
---|
| 1052 | + ret = generic_fadvise(file, start, end, advice); |
---|
| 1053 | + if (lockflags) |
---|
| 1054 | + xfs_iunlock(ip, lockflags); |
---|
| 1055 | + return ret; |
---|
939 | 1056 | } |
---|
940 | 1057 | |
---|
941 | | -STATIC int |
---|
942 | | -xfs_file_dedupe_range( |
---|
943 | | - struct file *file_in, |
---|
944 | | - loff_t pos_in, |
---|
945 | | - struct file *file_out, |
---|
946 | | - loff_t pos_out, |
---|
947 | | - u64 len) |
---|
| 1058 | +/* Does this file, inode, or mount want synchronous writes? */ |
---|
| 1059 | +static inline bool xfs_file_sync_writes(struct file *filp) |
---|
948 | 1060 | { |
---|
949 | | - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, |
---|
950 | | - len, true); |
---|
| 1061 | + struct xfs_inode *ip = XFS_I(file_inode(filp)); |
---|
| 1062 | + |
---|
| 1063 | + if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC) |
---|
| 1064 | + return true; |
---|
| 1065 | + if (filp->f_flags & (__O_SYNC | O_DSYNC)) |
---|
| 1066 | + return true; |
---|
| 1067 | + if (IS_SYNC(file_inode(filp))) |
---|
| 1068 | + return true; |
---|
| 1069 | + |
---|
| 1070 | + return false; |
---|
| 1071 | +} |
---|
| 1072 | + |
---|
| 1073 | +STATIC loff_t |
---|
| 1074 | +xfs_file_remap_range( |
---|
| 1075 | + struct file *file_in, |
---|
| 1076 | + loff_t pos_in, |
---|
| 1077 | + struct file *file_out, |
---|
| 1078 | + loff_t pos_out, |
---|
| 1079 | + loff_t len, |
---|
| 1080 | + unsigned int remap_flags) |
---|
| 1081 | +{ |
---|
| 1082 | + struct inode *inode_in = file_inode(file_in); |
---|
| 1083 | + struct xfs_inode *src = XFS_I(inode_in); |
---|
| 1084 | + struct inode *inode_out = file_inode(file_out); |
---|
| 1085 | + struct xfs_inode *dest = XFS_I(inode_out); |
---|
| 1086 | + struct xfs_mount *mp = src->i_mount; |
---|
| 1087 | + loff_t remapped = 0; |
---|
| 1088 | + xfs_extlen_t cowextsize; |
---|
| 1089 | + int ret; |
---|
| 1090 | + |
---|
| 1091 | + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) |
---|
| 1092 | + return -EINVAL; |
---|
| 1093 | + |
---|
| 1094 | + if (!xfs_sb_version_hasreflink(&mp->m_sb)) |
---|
| 1095 | + return -EOPNOTSUPP; |
---|
| 1096 | + |
---|
| 1097 | + if (XFS_FORCED_SHUTDOWN(mp)) |
---|
| 1098 | + return -EIO; |
---|
| 1099 | + |
---|
| 1100 | + /* Prepare and then clone file data. */ |
---|
| 1101 | + ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, |
---|
| 1102 | + &len, remap_flags); |
---|
| 1103 | + if (ret || len == 0) |
---|
| 1104 | + return ret; |
---|
| 1105 | + |
---|
| 1106 | + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); |
---|
| 1107 | + |
---|
| 1108 | + ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len, |
---|
| 1109 | + &remapped); |
---|
| 1110 | + if (ret) |
---|
| 1111 | + goto out_unlock; |
---|
| 1112 | + |
---|
| 1113 | + /* |
---|
| 1114 | + * Carry the cowextsize hint from src to dest if we're sharing the |
---|
| 1115 | + * entire source file to the entire destination file, the source file |
---|
| 1116 | + * has a cowextsize hint, and the destination file does not. |
---|
| 1117 | + */ |
---|
| 1118 | + cowextsize = 0; |
---|
| 1119 | + if (pos_in == 0 && len == i_size_read(inode_in) && |
---|
| 1120 | + (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && |
---|
| 1121 | + pos_out == 0 && len >= i_size_read(inode_out) && |
---|
| 1122 | + !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) |
---|
| 1123 | + cowextsize = src->i_d.di_cowextsize; |
---|
| 1124 | + |
---|
| 1125 | + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, |
---|
| 1126 | + remap_flags); |
---|
| 1127 | + if (ret) |
---|
| 1128 | + goto out_unlock; |
---|
| 1129 | + |
---|
| 1130 | + if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) |
---|
| 1131 | + xfs_log_force_inode(dest); |
---|
| 1132 | +out_unlock: |
---|
| 1133 | + xfs_iunlock2_io_mmap(src, dest); |
---|
| 1134 | + if (ret) |
---|
| 1135 | + trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); |
---|
| 1136 | + return remapped > 0 ? remapped : ret; |
---|
951 | 1137 | } |
---|
952 | 1138 | |
---|
953 | 1139 | STATIC int |
---|
.. | .. |
---|
959 | 1145 | return -EFBIG; |
---|
960 | 1146 | if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) |
---|
961 | 1147 | return -EIO; |
---|
962 | | - file->f_mode |= FMODE_NOWAIT; |
---|
| 1148 | + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; |
---|
963 | 1149 | return 0; |
---|
964 | 1150 | } |
---|
965 | 1151 | |
---|
.. | .. |
---|
981 | 1167 | * certain to have the next operation be a read there. |
---|
982 | 1168 | */ |
---|
983 | 1169 | mode = xfs_ilock_data_map_shared(ip); |
---|
984 | | - if (ip->i_d.di_nextents > 0) |
---|
985 | | - error = xfs_dir3_data_readahead(ip, 0, -1); |
---|
| 1170 | + if (ip->i_df.if_nextents > 0) |
---|
| 1171 | + error = xfs_dir3_data_readahead(ip, 0, 0); |
---|
986 | 1172 | xfs_iunlock(ip, mode); |
---|
987 | 1173 | return error; |
---|
988 | 1174 | } |
---|
.. | .. |
---|
1036 | 1222 | default: |
---|
1037 | 1223 | return generic_file_llseek(file, offset, whence); |
---|
1038 | 1224 | case SEEK_HOLE: |
---|
1039 | | - offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops); |
---|
| 1225 | + offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops); |
---|
1040 | 1226 | break; |
---|
1041 | 1227 | case SEEK_DATA: |
---|
1042 | | - offset = iomap_seek_data(inode, offset, &xfs_iomap_ops); |
---|
| 1228 | + offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops); |
---|
1043 | 1229 | break; |
---|
1044 | 1230 | } |
---|
1045 | 1231 | |
---|
.. | .. |
---|
1052 | 1238 | * Locking for serialisation of IO during page faults. This results in a lock |
---|
1053 | 1239 | * ordering of: |
---|
1054 | 1240 | * |
---|
1055 | | - * mmap_sem (MM) |
---|
| 1241 | + * mmap_lock (MM) |
---|
1056 | 1242 | * sb_start_pagefault(vfs, freeze) |
---|
1057 | 1243 | * i_mmaplock (XFS - truncate serialisation) |
---|
1058 | 1244 | * page_lock (MM) |
---|
.. | .. |
---|
1079 | 1265 | if (IS_DAX(inode)) { |
---|
1080 | 1266 | pfn_t pfn; |
---|
1081 | 1267 | |
---|
1082 | | - ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops); |
---|
| 1268 | + ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, |
---|
| 1269 | + (write_fault && !vmf->cow_page) ? |
---|
| 1270 | + &xfs_direct_write_iomap_ops : |
---|
| 1271 | + &xfs_read_iomap_ops); |
---|
1083 | 1272 | if (ret & VM_FAULT_NEEDDSYNC) |
---|
1084 | 1273 | ret = dax_finish_sync_fault(vmf, pe_size, pfn); |
---|
1085 | 1274 | } else { |
---|
1086 | 1275 | if (write_fault) |
---|
1087 | | - ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); |
---|
| 1276 | + ret = iomap_page_mkwrite(vmf, |
---|
| 1277 | + &xfs_buffered_write_iomap_ops); |
---|
1088 | 1278 | else |
---|
1089 | 1279 | ret = filemap_fault(vmf); |
---|
1090 | 1280 | } |
---|
.. | .. |
---|
1146 | 1336 | return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); |
---|
1147 | 1337 | } |
---|
1148 | 1338 | |
---|
| 1339 | +static vm_fault_t |
---|
| 1340 | +xfs_filemap_map_pages( |
---|
| 1341 | + struct vm_fault *vmf, |
---|
| 1342 | + pgoff_t start_pgoff, |
---|
| 1343 | + pgoff_t end_pgoff) |
---|
| 1344 | +{ |
---|
| 1345 | + struct inode *inode = file_inode(vmf->vma->vm_file); |
---|
| 1346 | + vm_fault_t ret; |
---|
| 1347 | + |
---|
| 1348 | + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); |
---|
| 1349 | + ret = filemap_map_pages(vmf, start_pgoff, end_pgoff); |
---|
| 1350 | + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); |
---|
| 1351 | + return ret; |
---|
| 1352 | +} |
---|
| 1353 | + |
---|
1149 | 1354 | static const struct vm_operations_struct xfs_file_vm_ops = { |
---|
1150 | 1355 | .fault = xfs_filemap_fault, |
---|
1151 | 1356 | .huge_fault = xfs_filemap_huge_fault, |
---|
1152 | | - .map_pages = filemap_map_pages, |
---|
| 1357 | + .map_pages = xfs_filemap_map_pages, |
---|
1153 | 1358 | .page_mkwrite = xfs_filemap_page_mkwrite, |
---|
1154 | 1359 | .pfn_mkwrite = xfs_filemap_pfn_mkwrite, |
---|
1155 | 1360 | }; |
---|
1156 | 1361 | |
---|
1157 | 1362 | STATIC int |
---|
1158 | 1363 | xfs_file_mmap( |
---|
1159 | | - struct file *filp, |
---|
1160 | | - struct vm_area_struct *vma) |
---|
| 1364 | + struct file *file, |
---|
| 1365 | + struct vm_area_struct *vma) |
---|
1161 | 1366 | { |
---|
| 1367 | + struct inode *inode = file_inode(file); |
---|
| 1368 | + struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode)); |
---|
| 1369 | + |
---|
1162 | 1370 | /* |
---|
1163 | | - * We don't support synchronous mappings for non-DAX files. At least |
---|
1164 | | - * until someone comes with a sensible use case. |
---|
| 1371 | + * We don't support synchronous mappings for non-DAX files and |
---|
| 1372 | + * for DAX files if the underlying dax_device is not synchronous. |
---|
1165 | 1373 | */ |
---|
1166 | | - if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC)) |
---|
| 1374 | + if (!daxdev_mapping_supported(vma, target->bt_daxdev)) |
---|
1167 | 1375 | return -EOPNOTSUPP; |
---|
1168 | 1376 | |
---|
1169 | | - file_accessed(filp); |
---|
| 1377 | + file_accessed(file); |
---|
1170 | 1378 | vma->vm_ops = &xfs_file_vm_ops; |
---|
1171 | | - if (IS_DAX(file_inode(filp))) |
---|
| 1379 | + if (IS_DAX(inode)) |
---|
1172 | 1380 | vma->vm_flags |= VM_HUGEPAGE; |
---|
1173 | 1381 | return 0; |
---|
1174 | 1382 | } |
---|
.. | .. |
---|
1179 | 1387 | .write_iter = xfs_file_write_iter, |
---|
1180 | 1388 | .splice_read = generic_file_splice_read, |
---|
1181 | 1389 | .splice_write = iter_file_splice_write, |
---|
| 1390 | + .iopoll = iomap_dio_iopoll, |
---|
1182 | 1391 | .unlocked_ioctl = xfs_file_ioctl, |
---|
1183 | 1392 | #ifdef CONFIG_COMPAT |
---|
1184 | 1393 | .compat_ioctl = xfs_file_compat_ioctl, |
---|
.. | .. |
---|
1190 | 1399 | .fsync = xfs_file_fsync, |
---|
1191 | 1400 | .get_unmapped_area = thp_get_unmapped_area, |
---|
1192 | 1401 | .fallocate = xfs_file_fallocate, |
---|
1193 | | - .clone_file_range = xfs_file_clone_range, |
---|
1194 | | - .dedupe_file_range = xfs_file_dedupe_range, |
---|
| 1402 | + .fadvise = xfs_file_fadvise, |
---|
| 1403 | + .remap_file_range = xfs_file_remap_range, |
---|
1195 | 1404 | }; |
---|
1196 | 1405 | |
---|
1197 | 1406 | const struct file_operations xfs_dir_file_operations = { |
---|