hc
2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/fs/ext4/file.c
....@@ -29,10 +29,60 @@
2929 #include <linux/pagevec.h>
3030 #include <linux/uio.h>
3131 #include <linux/mman.h>
32
+#include <linux/backing-dev.h>
3233 #include "ext4.h"
3334 #include "ext4_jbd2.h"
3435 #include "xattr.h"
3536 #include "acl.h"
37
+#include "truncate.h"
38
+
39
+static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
40
+{
41
+ struct inode *inode = file_inode(iocb->ki_filp);
42
+
43
+ if (!fscrypt_dio_supported(iocb, iter))
44
+ return false;
45
+ if (fsverity_active(inode))
46
+ return false;
47
+ if (ext4_should_journal_data(inode))
48
+ return false;
49
+ if (ext4_has_inline_data(inode))
50
+ return false;
51
+ return true;
52
+}
53
+
54
+static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
55
+{
56
+ ssize_t ret;
57
+ struct inode *inode = file_inode(iocb->ki_filp);
58
+
59
+ if (iocb->ki_flags & IOCB_NOWAIT) {
60
+ if (!inode_trylock_shared(inode))
61
+ return -EAGAIN;
62
+ } else {
63
+ inode_lock_shared(inode);
64
+ }
65
+
66
+ if (!ext4_dio_supported(iocb, to)) {
67
+ inode_unlock_shared(inode);
68
+ /*
69
+ * Fallback to buffered I/O if the operation being performed on
70
+ * the inode is not supported by direct I/O. The IOCB_DIRECT
71
+ * flag needs to be cleared here in order to ensure that the
72
+ * direct I/O path within generic_file_read_iter() is not
73
+ * taken.
74
+ */
75
+ iocb->ki_flags &= ~IOCB_DIRECT;
76
+ return generic_file_read_iter(iocb, to);
77
+ }
78
+
79
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
80
+ is_sync_kiocb(iocb));
81
+ inode_unlock_shared(inode);
82
+
83
+ file_accessed(iocb->ki_filp);
84
+ return ret;
85
+}
3686
3787 #ifdef CONFIG_FS_DAX
3888 static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
....@@ -65,16 +115,21 @@
65115
66116 static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
67117 {
68
- if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
118
+ struct inode *inode = file_inode(iocb->ki_filp);
119
+
120
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
69121 return -EIO;
70122
71123 if (!iov_iter_count(to))
72124 return 0; /* skip atime */
73125
74126 #ifdef CONFIG_FS_DAX
75
- if (IS_DAX(file_inode(iocb->ki_filp)))
127
+ if (IS_DAX(inode))
76128 return ext4_dax_read_iter(iocb, to);
77129 #endif
130
+ if (iocb->ki_flags & IOCB_DIRECT)
131
+ return ext4_dio_read_iter(iocb, to);
132
+
78133 return generic_file_read_iter(iocb, to);
79134 }
80135
....@@ -92,23 +147,15 @@
92147 /* if we are the last writer on the inode, drop the block reservation */
93148 if ((filp->f_mode & FMODE_WRITE) &&
94149 (atomic_read(&inode->i_writecount) == 1) &&
95
- !EXT4_I(inode)->i_reserved_data_blocks)
96
- {
150
+ !EXT4_I(inode)->i_reserved_data_blocks) {
97151 down_write(&EXT4_I(inode)->i_data_sem);
98
- ext4_discard_preallocations(inode);
152
+ ext4_discard_preallocations(inode, 0);
99153 up_write(&EXT4_I(inode)->i_data_sem);
100154 }
101155 if (is_dx(inode) && filp->private_data)
102156 ext4_htree_free_dir_info(filp->private_data);
103157
104158 return 0;
105
-}
106
-
107
-static void ext4_unwritten_wait(struct inode *inode)
108
-{
109
- wait_queue_head_t *wq = ext4_ioend_wq(inode);
110
-
111
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
112159 }
113160
114161 /*
....@@ -120,19 +167,25 @@
120167 * threads are at work on the same unwritten block, they must be synchronized
121168 * or one thread will zero the other's data, causing corruption.
122169 */
123
-static int
124
-ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
170
+static bool
171
+ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
125172 {
126173 struct super_block *sb = inode->i_sb;
127
- int blockmask = sb->s_blocksize - 1;
128
-
129
- if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
130
- return 0;
174
+ unsigned long blockmask = sb->s_blocksize - 1;
131175
132176 if ((pos | iov_iter_alignment(from)) & blockmask)
133
- return 1;
177
+ return true;
134178
135
- return 0;
179
+ return false;
180
+}
181
+
182
+static bool
183
+ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
184
+{
185
+ if (offset + len > i_size_read(inode) ||
186
+ offset + len > EXT4_I(inode)->i_disksize)
187
+ return true;
188
+ return false;
136189 }
137190
138191 /* Is IO overwriting allocated and initialized blocks? */
....@@ -158,17 +211,18 @@
158211 return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
159212 }
160213
161
-static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
214
+static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
215
+ struct iov_iter *from)
162216 {
163217 struct inode *inode = file_inode(iocb->ki_filp);
164218 ssize_t ret;
165219
220
+ if (unlikely(IS_IMMUTABLE(inode)))
221
+ return -EPERM;
222
+
166223 ret = generic_write_checks(iocb, from);
167224 if (ret <= 0)
168225 return ret;
169
-
170
- if (unlikely(IS_IMMUTABLE(inode)))
171
- return -EPERM;
172226
173227 /*
174228 * If we have encountered a bitmap-format file, the size limit
....@@ -181,15 +235,394 @@
181235 return -EFBIG;
182236 iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
183237 }
238
+
184239 return iov_iter_count(from);
240
+}
241
+
242
+static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
243
+{
244
+ ssize_t ret, count;
245
+
246
+ count = ext4_generic_write_checks(iocb, from);
247
+ if (count <= 0)
248
+ return count;
249
+
250
+ ret = file_modified(iocb->ki_filp);
251
+ if (ret)
252
+ return ret;
253
+ return count;
254
+}
255
+
256
+static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
257
+ struct iov_iter *from)
258
+{
259
+ ssize_t ret;
260
+ struct inode *inode = file_inode(iocb->ki_filp);
261
+
262
+ if (iocb->ki_flags & IOCB_NOWAIT)
263
+ return -EOPNOTSUPP;
264
+
265
+ inode_lock(inode);
266
+ ret = ext4_write_checks(iocb, from);
267
+ if (ret <= 0)
268
+ goto out;
269
+
270
+ current->backing_dev_info = inode_to_bdi(inode);
271
+ ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
272
+ current->backing_dev_info = NULL;
273
+
274
+out:
275
+ inode_unlock(inode);
276
+ if (likely(ret > 0)) {
277
+ iocb->ki_pos += ret;
278
+ ret = generic_write_sync(iocb, ret);
279
+ }
280
+
281
+ return ret;
282
+}
283
+
284
+static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
285
+ ssize_t written, size_t count)
286
+{
287
+ handle_t *handle;
288
+ bool truncate = false;
289
+ u8 blkbits = inode->i_blkbits;
290
+ ext4_lblk_t written_blk, end_blk;
291
+ int ret;
292
+
293
+ /*
294
+ * Note that EXT4_I(inode)->i_disksize can get extended up to
295
+ * inode->i_size while the I/O was running due to writeback of delalloc
296
+ * blocks. But, the code in ext4_iomap_alloc() is careful to use
297
+ * zeroed/unwritten extents if this is possible; thus we won't leave
298
+ * uninitialized blocks in a file even if we didn't succeed in writing
299
+ * as much as we intended.
300
+ */
301
+ WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
302
+ if (offset + count <= EXT4_I(inode)->i_disksize) {
303
+ /*
304
+ * We need to ensure that the inode is removed from the orphan
305
+ * list if it has been added prematurely, due to writeback of
306
+ * delalloc blocks.
307
+ */
308
+ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
309
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
310
+
311
+ if (IS_ERR(handle)) {
312
+ ext4_orphan_del(NULL, inode);
313
+ return PTR_ERR(handle);
314
+ }
315
+
316
+ ext4_orphan_del(handle, inode);
317
+ ext4_journal_stop(handle);
318
+ }
319
+
320
+ return written;
321
+ }
322
+
323
+ if (written < 0)
324
+ goto truncate;
325
+
326
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
327
+ if (IS_ERR(handle)) {
328
+ written = PTR_ERR(handle);
329
+ goto truncate;
330
+ }
331
+
332
+ if (ext4_update_inode_size(inode, offset + written)) {
333
+ ret = ext4_mark_inode_dirty(handle, inode);
334
+ if (unlikely(ret)) {
335
+ written = ret;
336
+ ext4_journal_stop(handle);
337
+ goto truncate;
338
+ }
339
+ }
340
+
341
+ /*
342
+ * We may need to truncate allocated but not written blocks beyond EOF.
343
+ */
344
+ written_blk = ALIGN(offset + written, 1 << blkbits);
345
+ end_blk = ALIGN(offset + count, 1 << blkbits);
346
+ if (written_blk < end_blk && ext4_can_truncate(inode))
347
+ truncate = true;
348
+
349
+ /*
350
+ * Remove the inode from the orphan list if it has been extended and
351
+ * everything went OK.
352
+ */
353
+ if (!truncate && inode->i_nlink)
354
+ ext4_orphan_del(handle, inode);
355
+ ext4_journal_stop(handle);
356
+
357
+ if (truncate) {
358
+truncate:
359
+ ext4_truncate_failed_write(inode);
360
+ /*
361
+ * If the truncate operation failed early, then the inode may
362
+ * still be on the orphan list. In that case, we need to try to
363
+ * remove the inode from the in-memory linked list.
364
+ */
365
+ if (inode->i_nlink)
366
+ ext4_orphan_del(NULL, inode);
367
+ }
368
+
369
+ return written;
370
+}
371
+
372
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
373
+ int error, unsigned int flags)
374
+{
375
+ loff_t pos = iocb->ki_pos;
376
+ struct inode *inode = file_inode(iocb->ki_filp);
377
+
378
+ if (error)
379
+ return error;
380
+
381
+ if (size && flags & IOMAP_DIO_UNWRITTEN) {
382
+ error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
383
+ if (error < 0)
384
+ return error;
385
+ }
386
+ /*
387
+ * If we are extending the file, we have to update i_size here before
388
+ * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
389
+ * buffered reads could zero out too much from page cache pages. Update
390
+ * of on-disk size will happen later in ext4_dio_write_iter() where
391
+ * we have enough information to also perform orphan list handling etc.
392
+ * Note that we perform all extending writes synchronously under
393
+ * i_rwsem held exclusively so i_size update is safe here in that case.
394
+ * If the write was not extending, we cannot see pos > i_size here
395
+ * because operations reducing i_size like truncate wait for all
396
+ * outstanding DIO before updating i_size.
397
+ */
398
+ pos += size;
399
+ if (pos > i_size_read(inode))
400
+ i_size_write(inode, pos);
401
+
402
+ return 0;
403
+}
404
+
405
+static const struct iomap_dio_ops ext4_dio_write_ops = {
406
+ .end_io = ext4_dio_write_end_io,
407
+};
408
+
409
+/*
410
+ * The intention here is to start with shared lock acquired then see if any
411
+ * condition requires an exclusive inode lock. If yes, then we restart the
412
+ * whole operation by releasing the shared lock and acquiring exclusive lock.
413
+ *
414
+ * - For unaligned_io we never take shared lock as it may cause data corruption
415
+ * when two unaligned IOs try to modify the same block, e.g. while zeroing.
416
+ *
417
+ * - For the extending-write case we don't take the shared lock, since it requires
418
+ * updating inode i_disksize and/or orphan handling with exclusive lock.
419
+ *
420
+ * - The shared lock is kept mostly for overwrites only. Otherwise we will
421
+ * switch to exclusive i_rwsem lock.
422
+ */
423
+static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
424
+ bool *ilock_shared, bool *extend)
425
+{
426
+ struct file *file = iocb->ki_filp;
427
+ struct inode *inode = file_inode(file);
428
+ loff_t offset;
429
+ size_t count;
430
+ ssize_t ret;
431
+
432
+restart:
433
+ ret = ext4_generic_write_checks(iocb, from);
434
+ if (ret <= 0)
435
+ goto out;
436
+
437
+ offset = iocb->ki_pos;
438
+ count = ret;
439
+ if (ext4_extending_io(inode, offset, count))
440
+ *extend = true;
441
+ /*
442
+ * Determine whether the IO operation will overwrite allocated
443
+ * and initialized blocks.
444
+ * We need exclusive i_rwsem for changing security info
445
+ * in file_modified().
446
+ */
447
+ if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
448
+ !ext4_overwrite_io(inode, offset, count))) {
449
+ if (iocb->ki_flags & IOCB_NOWAIT) {
450
+ ret = -EAGAIN;
451
+ goto out;
452
+ }
453
+ inode_unlock_shared(inode);
454
+ *ilock_shared = false;
455
+ inode_lock(inode);
456
+ goto restart;
457
+ }
458
+
459
+ ret = file_modified(file);
460
+ if (ret < 0)
461
+ goto out;
462
+
463
+ return count;
464
+out:
465
+ if (*ilock_shared)
466
+ inode_unlock_shared(inode);
467
+ else
468
+ inode_unlock(inode);
469
+ return ret;
470
+}
471
+
472
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
473
+{
474
+ ssize_t ret;
475
+ handle_t *handle;
476
+ struct inode *inode = file_inode(iocb->ki_filp);
477
+ loff_t offset = iocb->ki_pos;
478
+ size_t count = iov_iter_count(from);
479
+ const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
480
+ bool extend = false, unaligned_io = false;
481
+ bool ilock_shared = true;
482
+
483
+ /*
484
+ * We initially start with shared inode lock unless it is
485
+ * unaligned IO which needs exclusive lock anyways.
486
+ */
487
+ if (ext4_unaligned_io(inode, from, offset)) {
488
+ unaligned_io = true;
489
+ ilock_shared = false;
490
+ }
491
+ /*
492
+ * Quick check here without any i_rwsem lock to see if it is extending
493
+ * IO. A more reliable check is done in ext4_dio_write_checks() with
494
+ * proper locking in place.
495
+ */
496
+ if (offset + count > i_size_read(inode))
497
+ ilock_shared = false;
498
+
499
+ if (iocb->ki_flags & IOCB_NOWAIT) {
500
+ if (ilock_shared) {
501
+ if (!inode_trylock_shared(inode))
502
+ return -EAGAIN;
503
+ } else {
504
+ if (!inode_trylock(inode))
505
+ return -EAGAIN;
506
+ }
507
+ } else {
508
+ if (ilock_shared)
509
+ inode_lock_shared(inode);
510
+ else
511
+ inode_lock(inode);
512
+ }
513
+
514
+ /* Fall back to buffered I/O if the inode does not support direct I/O. */
515
+ if (!ext4_dio_supported(iocb, from)) {
516
+ if (ilock_shared)
517
+ inode_unlock_shared(inode);
518
+ else
519
+ inode_unlock(inode);
520
+ return ext4_buffered_write_iter(iocb, from);
521
+ }
522
+
523
+ ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
524
+ if (ret <= 0)
525
+ return ret;
526
+
527
+ /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
528
+ if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
529
+ ret = -EAGAIN;
530
+ goto out;
531
+ }
532
+ /*
533
+ * Make sure inline data cannot be created anymore since we are going
534
+ * to allocate blocks for DIO. We know the inode does not have any
535
+ * inline data now because ext4_dio_supported() checked for that.
536
+ */
537
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
538
+
539
+ offset = iocb->ki_pos;
540
+ count = ret;
541
+
542
+ /*
543
+ * Unaligned direct IO must be serialized among each other as zeroing
544
+ * of partial blocks of two competing unaligned IOs can result in data
545
+ * corruption.
546
+ *
547
+ * So we make sure we don't allow any unaligned IO in flight.
548
+ * For IOs where we need not wait (like unaligned non-AIO DIO),
549
+ * below inode_dio_wait() may anyway become a no-op, since we start
550
+ * with exclusive lock.
551
+ */
552
+ if (unaligned_io)
553
+ inode_dio_wait(inode);
554
+
555
+ if (extend) {
556
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
557
+ if (IS_ERR(handle)) {
558
+ ret = PTR_ERR(handle);
559
+ goto out;
560
+ }
561
+
562
+ ret = ext4_orphan_add(handle, inode);
563
+ if (ret) {
564
+ ext4_journal_stop(handle);
565
+ goto out;
566
+ }
567
+
568
+ ext4_journal_stop(handle);
569
+ }
570
+
571
+ if (ilock_shared)
572
+ iomap_ops = &ext4_iomap_overwrite_ops;
573
+ ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
574
+ is_sync_kiocb(iocb) || unaligned_io || extend);
575
+ if (ret == -ENOTBLK)
576
+ ret = 0;
577
+
578
+ if (extend)
579
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
580
+
581
+out:
582
+ if (ilock_shared)
583
+ inode_unlock_shared(inode);
584
+ else
585
+ inode_unlock(inode);
586
+
587
+ if (ret >= 0 && iov_iter_count(from)) {
588
+ ssize_t err;
589
+ loff_t endbyte;
590
+
591
+ offset = iocb->ki_pos;
592
+ err = ext4_buffered_write_iter(iocb, from);
593
+ if (err < 0)
594
+ return err;
595
+
596
+ /*
597
+ * We need to ensure that the pages within the page cache for
598
+ * the range covered by this I/O are written to disk and
599
+ * invalidated. This is an attempt to preserve the expected
600
+ * direct I/O semantics in the case we fall back to buffered I/O
601
+ * to complete the I/O request.
602
+ */
603
+ ret += err;
604
+ endbyte = offset + err - 1;
605
+ err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
606
+ offset, endbyte);
607
+ if (!err)
608
+ invalidate_mapping_pages(iocb->ki_filp->f_mapping,
609
+ offset >> PAGE_SHIFT,
610
+ endbyte >> PAGE_SHIFT);
611
+ }
612
+
613
+ return ret;
185614 }
186615
187616 #ifdef CONFIG_FS_DAX
188617 static ssize_t
189618 ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
190619 {
191
- struct inode *inode = file_inode(iocb->ki_filp);
192620 ssize_t ret;
621
+ size_t count;
622
+ loff_t offset;
623
+ handle_t *handle;
624
+ bool extend = false;
625
+ struct inode *inode = file_inode(iocb->ki_filp);
193626
194627 if (iocb->ki_flags & IOCB_NOWAIT) {
195628 if (!inode_trylock(inode))
....@@ -197,17 +630,35 @@
197630 } else {
198631 inode_lock(inode);
199632 }
633
+
200634 ret = ext4_write_checks(iocb, from);
201635 if (ret <= 0)
202636 goto out;
203
- ret = file_remove_privs(iocb->ki_filp);
204
- if (ret)
205
- goto out;
206
- ret = file_update_time(iocb->ki_filp);
207
- if (ret)
208
- goto out;
637
+
638
+ offset = iocb->ki_pos;
639
+ count = iov_iter_count(from);
640
+
641
+ if (offset + count > EXT4_I(inode)->i_disksize) {
642
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
643
+ if (IS_ERR(handle)) {
644
+ ret = PTR_ERR(handle);
645
+ goto out;
646
+ }
647
+
648
+ ret = ext4_orphan_add(handle, inode);
649
+ if (ret) {
650
+ ext4_journal_stop(handle);
651
+ goto out;
652
+ }
653
+
654
+ extend = true;
655
+ ext4_journal_stop(handle);
656
+ }
209657
210658 ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
659
+
660
+ if (extend)
661
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
211662 out:
212663 inode_unlock(inode);
213664 if (ret > 0)
....@@ -220,10 +671,6 @@
220671 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
221672 {
222673 struct inode *inode = file_inode(iocb->ki_filp);
223
- int o_direct = iocb->ki_flags & IOCB_DIRECT;
224
- int unaligned_aio = 0;
225
- int overwrite = 0;
226
- ssize_t ret;
227674
228675 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
229676 return -EIO;
....@@ -232,61 +679,10 @@
232679 if (IS_DAX(inode))
233680 return ext4_dax_write_iter(iocb, from);
234681 #endif
235
- if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
236
- return -EOPNOTSUPP;
237
-
238
- if (!inode_trylock(inode)) {
239
- if (iocb->ki_flags & IOCB_NOWAIT)
240
- return -EAGAIN;
241
- inode_lock(inode);
242
- }
243
-
244
- ret = ext4_write_checks(iocb, from);
245
- if (ret <= 0)
246
- goto out;
247
-
248
- /*
249
- * Unaligned direct AIO must be serialized among each other as zeroing
250
- * of partial blocks of two competing unaligned AIOs can result in data
251
- * corruption.
252
- */
253
- if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
254
- !is_sync_kiocb(iocb) &&
255
- ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
256
- unaligned_aio = 1;
257
- ext4_unwritten_wait(inode);
258
- }
259
-
260
- iocb->private = &overwrite;
261
- /* Check whether we do a DIO overwrite or not */
262
- if (o_direct && !unaligned_aio) {
263
- if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
264
- if (ext4_should_dioread_nolock(inode))
265
- overwrite = 1;
266
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
267
- ret = -EAGAIN;
268
- goto out;
269
- }
270
- }
271
-
272
- ret = __generic_file_write_iter(iocb, from);
273
- /*
274
- * Unaligned direct AIO must be the only IO in flight. Otherwise
275
- * overlapping aligned IO after unaligned might result in data
276
- * corruption.
277
- */
278
- if (ret == -EIOCBQUEUED && unaligned_aio)
279
- ext4_unwritten_wait(inode);
280
- inode_unlock(inode);
281
-
282
- if (ret > 0)
283
- ret = generic_write_sync(iocb, ret);
284
-
285
- return ret;
286
-
287
-out:
288
- inode_unlock(inode);
289
- return ret;
682
+ if (iocb->ki_flags & IOCB_DIRECT)
683
+ return ext4_dio_write_iter(iocb, from);
684
+ else
685
+ return ext4_buffered_write_iter(iocb, from);
290686 }
291687
292688 #ifdef CONFIG_FS_DAX
....@@ -368,20 +764,25 @@
368764 .fault = ext4_filemap_fault,
369765 .map_pages = filemap_map_pages,
370766 .page_mkwrite = ext4_page_mkwrite,
767
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
768
+ .allow_speculation = filemap_allow_speculation,
769
+#endif
371770 };
372771
373772 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
374773 {
375774 struct inode *inode = file->f_mapping->host;
775
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
776
+ struct dax_device *dax_dev = sbi->s_daxdev;
376777
377
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
778
+ if (unlikely(ext4_forced_shutdown(sbi)))
378779 return -EIO;
379780
380781 /*
381
- * We don't support synchronous mappings for non-DAX files. At least
382
- * until someone comes with a sensible use case.
782
+ * We don't support synchronous mappings for non-DAX files and
783
+ * for DAX files if underneath dax_device is not synchronous.
383784 */
384
- if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
785
+ if (!daxdev_mapping_supported(vma, dax_dev))
385786 return -EOPNOTSUPP;
386787
387788 file_accessed(file);
....@@ -403,13 +804,13 @@
403804 handle_t *handle;
404805 int err;
405806
406
- if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
807
+ if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
407808 return 0;
408809
409810 if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
410811 return 0;
411812
412
- sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
813
+ ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
413814 /*
414815 * Sample where the filesystem has been mounted and
415816 * store it in the superblock for sysadmin convenience
....@@ -432,7 +833,7 @@
432833 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
433834 if (err)
434835 goto out_journal;
435
- strlcpy(sbi->s_es->s_last_mounted, cp,
836
+ strncpy(sbi->s_es->s_last_mounted, cp,
436837 sizeof(sbi->s_es->s_last_mounted));
437838 ext4_handle_dirty_super(handle, sb);
438839 out_journal:
....@@ -442,7 +843,7 @@
442843 return err;
443844 }
444845
445
-static int ext4_file_open(struct inode * inode, struct file * filp)
846
+static int ext4_file_open(struct inode *inode, struct file *filp)
446847 {
447848 int ret;
448849
....@@ -471,7 +872,7 @@
471872 return ret;
472873 }
473874
474
- filp->f_mode |= FMODE_NOWAIT;
875
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
475876 return dquot_file_open(inode, filp);
476877 }
477878
....@@ -496,12 +897,14 @@
496897 maxbytes, i_size_read(inode));
497898 case SEEK_HOLE:
498899 inode_lock_shared(inode);
499
- offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
900
+ offset = iomap_seek_hole(inode, offset,
901
+ &ext4_iomap_report_ops);
500902 inode_unlock_shared(inode);
501903 break;
502904 case SEEK_DATA:
503905 inode_lock_shared(inode);
504
- offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
906
+ offset = iomap_seek_data(inode, offset,
907
+ &ext4_iomap_report_ops);
505908 inode_unlock_shared(inode);
506909 break;
507910 }
....@@ -515,6 +918,7 @@
515918 .llseek = ext4_llseek,
516919 .read_iter = ext4_file_read_iter,
517920 .write_iter = ext4_file_write_iter,
921
+ .iopoll = iomap_dio_iopoll,
518922 .unlocked_ioctl = ext4_ioctl,
519923 #ifdef CONFIG_COMPAT
520924 .compat_ioctl = ext4_compat_ioctl,