..
 #include <linux/pagevec.h>
 #include <linux/uio.h>
 #include <linux/mman.h>
+#include <linux/backing-dev.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "truncate.h"
+
+static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (!fscrypt_dio_supported(iocb, iter))
+		return false;
+	if (fsverity_active(inode))
+		return false;
+	if (ext4_should_journal_data(inode))
+		return false;
+	if (ext4_has_inline_data(inode))
+		return false;
+	return true;
+}
+
+static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	ssize_t ret;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock_shared(inode))
+			return -EAGAIN;
+	} else {
+		inode_lock_shared(inode);
+	}
+
+	if (!ext4_dio_supported(iocb, to)) {
+		inode_unlock_shared(inode);
+		/*
+		 * Fall back to buffered I/O if the operation being performed
+		 * on the inode is not supported by direct I/O. The IOCB_DIRECT
+		 * flag needs to be cleared here in order to ensure that the
+		 * direct I/O path within generic_file_read_iter() is not
+		 * taken.
+		 */
+		iocb->ki_flags &= ~IOCB_DIRECT;
+		return generic_file_read_iter(iocb, to);
+	}
+
+	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
+			   is_sync_kiocb(iocb));
+	inode_unlock_shared(inode);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
 
 #ifdef CONFIG_FS_DAX
 static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
..
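Context for reviewers: IOCB_DIRECT is set on the kiocb whenever the file was opened with O_DIRECT, so the new ext4_dio_read_iter() is what now services ordinary O_DIRECT reads through iomap. A minimal userspace sketch of such a read follows; the mount path and the 4096-byte block size are assumptions (real code should query the filesystem block size, e.g. with fstatfs()).

```c
/* Sketch: a block-aligned direct read, as ext4_dio_read_iter() sees it.
 * Assumptions: /mnt/ext4/testfile exists and the fs block size is 4096.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	const size_t blksz = 4096;	/* assumed block size */
	void *buf;
	ssize_t n;
	int fd;

	fd = open("/mnt/ext4/testfile", O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Offset, length and buffer address must all be block aligned. */
	if (posix_memalign(&buf, blksz, blksz)) {
		close(fd);
		return 1;
	}

	n = pread(fd, buf, blksz, 0);
	if (n < 0)
		perror("pread");
	else
		printf("read %zd bytes via direct I/O\n", n);

	free(buf);
	close(fd);
	return 0;
}
```

If the inode cannot do DIO (verity, inline data, data journaling), the read still succeeds: the patch clears IOCB_DIRECT and falls back to buffered I/O rather than returning an error.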
 
 static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-	if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
 
 	if (!iov_iter_count(to))
 		return 0; /* skip atime */
 
 #ifdef CONFIG_FS_DAX
-	if (IS_DAX(file_inode(iocb->ki_filp)))
+	if (IS_DAX(inode))
 		return ext4_dax_read_iter(iocb, to);
 #endif
+	if (iocb->ki_flags & IOCB_DIRECT)
+		return ext4_dio_read_iter(iocb, to);
+
 	return generic_file_read_iter(iocb, to);
 }
 
..
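Both new iters honor IOCB_NOWAIT by using inode_trylock*() and returning -EAGAIN instead of sleeping. Userspace requests this per call with preadv2()/pwritev2() and RWF_NOWAIT; a sketch (the path is a placeholder, and RWF_NOWAIT requires a kernel and glibc recent enough to expose it):

```c
/* Sketch: a read that fails with EAGAIN rather than blocking. */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd = open("/mnt/ext4/testfile", O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return 1;

	n = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
	if (n < 0 && errno == EAGAIN)
		fprintf(stderr, "would block; retry from a slower path\n");
	else if (n >= 0)
		printf("read %zd bytes without blocking\n", n);

	close(fd);
	return 0;
}
```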
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1) &&
-			!EXT4_I(inode)->i_reserved_data_blocks)
-	{
+			!EXT4_I(inode)->i_reserved_data_blocks) {
 		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_preallocations(inode);
+		ext4_discard_preallocations(inode, 0);
 		up_write(&EXT4_I(inode)->i_data_sem);
 	}
 	if (is_dx(inode) && filp->private_data)
 		ext4_htree_free_dir_info(filp->private_data);
 
 	return 0;
-}
-
-static void ext4_unwritten_wait(struct inode *inode)
-{
-	wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
-	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
 }
 
 /*
..
  * threads are at work on the same unwritten block, they must be synchronized
  * or one thread will zero the other's data, causing corruption.
  */
-static int
-ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
+static bool
+ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
 {
 	struct super_block *sb = inode->i_sb;
-	int blockmask = sb->s_blocksize - 1;
-
-	if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
-		return 0;
+	unsigned long blockmask = sb->s_blocksize - 1;
 
 	if ((pos | iov_iter_alignment(from)) & blockmask)
-		return 1;
+		return true;
 
-	return 0;
+	return false;
+}
+
+static bool
+ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
+{
+	if (offset + len > i_size_read(inode) ||
+	    offset + len > EXT4_I(inode)->i_disksize)
+		return true;
+	return false;
 }
 
 /* Is IO overwriting allocated and initialized blocks? */
..
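The rewritten helper collapses three alignment conditions into a single test: because s_blocksize is a power of two, OR-ing the file position with the iterator's worst-case alignment (itself the OR of buffer addresses and lengths) and masking with blocksize - 1 is non-zero iff anything is misaligned. Note the rewrite also drops the old past-EOF early return, so writes past i_size now count as unaligned unless block-aligned. A standalone illustration, with a simplified stand-in for iov_iter_alignment():

```c
/* Illustration of the single-mask alignment test in ext4_unaligned_io(). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for iov_iter_alignment(): worst alignment of address and length. */
static uintptr_t iter_alignment(const void *base, size_t len)
{
	return (uintptr_t)base | len;
}

static bool unaligned_io(uint64_t pos, const void *base, size_t len,
			 uint64_t blocksize)
{
	uint64_t blockmask = blocksize - 1;	/* blocksize: power of two */

	return ((pos | iter_alignment(base, len)) & blockmask) != 0;
}

int main(void)
{
	static char buf[8192] __attribute__((aligned(4096)));

	printf("%d\n", unaligned_io(0,   buf,     4096, 4096)); /* 0: aligned */
	printf("%d\n", unaligned_io(512, buf,     4096, 4096)); /* 1: pos    */
	printf("%d\n", unaligned_io(0,   buf + 1, 4096, 4096)); /* 1: addr   */
	printf("%d\n", unaligned_io(0,   buf,     4095, 4096)); /* 1: len    */
	return 0;
}
```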
 	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
 }
 
-static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
+					 struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
 
+	if (unlikely(IS_IMMUTABLE(inode)))
+		return -EPERM;
+
 	ret = generic_write_checks(iocb, from);
 	if (ret <= 0)
 		return ret;
-
-	if (unlikely(IS_IMMUTABLE(inode)))
-		return -EPERM;
 
 	/*
 	 * If we have encountered a bitmap-format file, the size limit
..
 			return -EFBIG;
 		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
 	}
+
 	return iov_iter_count(from);
+}
+
+static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+	ssize_t ret, count;
+
+	count = ext4_generic_write_checks(iocb, from);
+	if (count <= 0)
+		return count;
+
+	ret = file_modified(iocb->ki_filp);
+	if (ret)
+		return ret;
+	return count;
+}
+
+static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
+					struct iov_iter *from)
+{
+	ssize_t ret;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		return -EOPNOTSUPP;
+
+	inode_lock(inode);
+	ret = ext4_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	current->backing_dev_info = inode_to_bdi(inode);
+	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+	current->backing_dev_info = NULL;
+
+out:
+	inode_unlock(inode);
+	if (likely(ret > 0)) {
+		iocb->ki_pos += ret;
+		ret = generic_write_sync(iocb, ret);
+	}
+
+	return ret;
+}
+
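The buffered path advances ki_pos itself and then defers to generic_write_sync(), which flushes when the file is opened O_SYNC/O_DSYNC or the write carries IOCB_DSYNC. The per-call userspace form of that flag, as a sketch (placeholder path):

```c
/* Sketch: one durable write via RWF_DSYNC, roughly write() + fdatasync()
 * in a single call; this is what makes generic_write_sync() flush after
 * ext4_buffered_write_iter() returns.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char msg[] = "durable record\n";
	struct iovec iov = { .iov_base = msg, .iov_len = strlen(msg) };
	int fd = open("/mnt/ext4/log", O_WRONLY | O_CREAT | O_APPEND, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Offset -1: use (and update) the current file offset. */
	if (pwritev2(fd, &iov, 1, -1, RWF_DSYNC) < 0)
		perror("pwritev2");

	close(fd);
	return 0;
}
```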
+static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
+					   ssize_t written, size_t count)
+{
+	handle_t *handle;
+	bool truncate = false;
+	u8 blkbits = inode->i_blkbits;
+	ext4_lblk_t written_blk, end_blk;
+	int ret;
+
+	/*
+	 * Note that EXT4_I(inode)->i_disksize can get extended up to
+	 * inode->i_size while the I/O was running due to writeback of delalloc
+	 * blocks. But, the code in ext4_iomap_alloc() is careful to use
+	 * zeroed/unwritten extents if this is possible; thus we won't leave
+	 * uninitialized blocks in a file even if we didn't succeed in writing
+	 * as much as we intended.
+	 */
+	WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
+	if (offset + count <= EXT4_I(inode)->i_disksize) {
+		/*
+		 * We need to ensure that the inode is removed from the orphan
+		 * list if it has been added prematurely, due to writeback of
+		 * delalloc blocks.
+		 */
+		if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+
+			if (IS_ERR(handle)) {
+				ext4_orphan_del(NULL, inode);
+				return PTR_ERR(handle);
+			}
+
+			ext4_orphan_del(handle, inode);
+			ext4_journal_stop(handle);
+		}
+
+		return written;
+	}
+
+	if (written < 0)
+		goto truncate;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle)) {
+		written = PTR_ERR(handle);
+		goto truncate;
+	}
+
+	if (ext4_update_inode_size(inode, offset + written)) {
+		ret = ext4_mark_inode_dirty(handle, inode);
+		if (unlikely(ret)) {
+			written = ret;
+			ext4_journal_stop(handle);
+			goto truncate;
+		}
+	}
+
+	/*
+	 * We may need to truncate allocated but not written blocks beyond EOF.
+	 */
+	written_blk = ALIGN(offset + written, 1 << blkbits);
+	end_blk = ALIGN(offset + count, 1 << blkbits);
+	if (written_blk < end_blk && ext4_can_truncate(inode))
+		truncate = true;
+
+	/*
+	 * Remove the inode from the orphan list if it has been extended and
+	 * everything went OK.
+	 */
+	if (!truncate && inode->i_nlink)
+		ext4_orphan_del(handle, inode);
+	ext4_journal_stop(handle);
+
+	if (truncate) {
+truncate:
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If the truncate operation failed early, then the inode may
+		 * still be on the orphan list. In that case, we need to try
+		 * to remove the inode from the in-memory linked list.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+	return written;
+}
+
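The control flow above deserves a close read: the `truncate:` label sits inside the `if (truncate)` block, so failure paths `goto` straight into the cleanup code, while the success path enters it only when the condition holds. Jumping into a compound statement is legal C (only jumps into the scope of a variably modified type are forbidden); a tiny standalone demo:

```c
/* Demo of the label-inside-if pattern used by ext4_handle_inode_extension().
 * On the goto path the if-condition is simply never evaluated.
 */
#include <stdbool.h>
#include <stdio.h>

static void demo(bool fail_early, bool need_cleanup)
{
	if (fail_early)
		goto cleanup;		/* jumps into the if-body below */

	printf("normal path\n");

	if (need_cleanup) {
cleanup:
		printf("cleanup runs\n");
	}
}

int main(void)
{
	demo(true, false);	/* prints only "cleanup runs" */
	demo(false, true);	/* prints both lines */
	return 0;
}
```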
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
+				 int error, unsigned int flags)
+{
+	loff_t pos = iocb->ki_pos;
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (error)
+		return error;
+
+	if (size && flags & IOMAP_DIO_UNWRITTEN) {
+		error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
+		if (error < 0)
+			return error;
+	}
+	/*
+	 * If we are extending the file, we have to update i_size here before
+	 * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
+	 * buffered reads could zero out too much from page cache pages. Update
+	 * of on-disk size will happen later in ext4_dio_write_iter() where
+	 * we have enough information to also perform orphan list handling etc.
+	 * Note that we perform all extending writes synchronously under
+	 * i_rwsem held exclusively so i_size update is safe here in that case.
+	 * If the write was not extending, we cannot see pos > i_size here
+	 * because operations reducing i_size like truncate wait for all
+	 * outstanding DIO before updating i_size.
+	 */
+	pos += size;
+	if (pos > i_size_read(inode))
+		i_size_write(inode, pos);
+
+	return 0;
+}
+
+static const struct iomap_dio_ops ext4_dio_write_ops = {
+	.end_io = ext4_dio_write_end_io,
+};
+
+/*
+ * The intention here is to start with a shared lock acquired, then see if any
+ * condition requires an exclusive inode lock. If yes, then we restart the
+ * whole operation by releasing the shared lock and acquiring the exclusive
+ * lock.
+ *
+ * - For unaligned IO we never take the shared lock, as it may cause data
+ *   corruption when two unaligned IOs try to modify the same block, e.g.
+ *   while zeroing.
+ *
+ * - For extending writes we don't take the shared lock, since extending
+ *   requires updating inode i_disksize and/or orphan handling with an
+ *   exclusive lock.
+ *
+ * - Shared locking will mostly only apply to overwrites; otherwise we
+ *   switch to the exclusive i_rwsem lock.
+ */
+static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
+				     bool *ilock_shared, bool *extend)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	loff_t offset;
+	size_t count;
+	ssize_t ret;
+
+restart:
+	ret = ext4_generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	offset = iocb->ki_pos;
+	count = ret;
+	if (ext4_extending_io(inode, offset, count))
+		*extend = true;
+	/*
+	 * Determine whether the IO operation will overwrite allocated
+	 * and initialized blocks.
+	 * We need exclusive i_rwsem for changing security info
+	 * in file_modified().
+	 */
+	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
+	     !ext4_overwrite_io(inode, offset, count))) {
+		if (iocb->ki_flags & IOCB_NOWAIT) {
+			ret = -EAGAIN;
+			goto out;
+		}
+		inode_unlock_shared(inode);
+		*ilock_shared = false;
+		inode_lock(inode);
+		goto restart;
+	}
+
+	ret = file_modified(file);
+	if (ret < 0)
+		goto out;
+
+	return count;
+out:
+	if (*ilock_shared)
+		inode_unlock_shared(inode);
+	else
+		inode_unlock(inode);
+	return ret;
+}
+
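The same escalate-and-restart protocol can be sketched in userspace with a pthread_rwlock_t. The essential point, and the reason for the `goto restart` above, is that all checks must be repeated after escalation because the state may have changed while no lock was held. needs_exclusive() below is a stand-in for the overwrite/extend/IS_NOSEC tests:

```c
/* Sketch of the shared-then-exclusive locking pattern from
 * ext4_dio_write_checks(). Build with -lpthread.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

/* Stand-in for "does this write extend the file / change security info?" */
static bool needs_exclusive(void)
{
	return true;
}

static void locked_write(void)
{
	bool shared = true;

	pthread_rwlock_rdlock(&lock);
restart:
	if (shared && needs_exclusive()) {
		pthread_rwlock_unlock(&lock);
		shared = false;
		pthread_rwlock_wrlock(&lock);
		/* Revalidate: the world may have changed while unlocked. */
		goto restart;
	}

	printf("writing under %s lock\n", shared ? "shared" : "exclusive");
	pthread_rwlock_unlock(&lock);
}

int main(void)
{
	locked_write();
	return 0;
}
```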
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	ssize_t ret;
+	handle_t *handle;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	loff_t offset = iocb->ki_pos;
+	size_t count = iov_iter_count(from);
+	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
+	bool extend = false, unaligned_io = false;
+	bool ilock_shared = true;
+
+	/*
+	 * We initially start with a shared inode lock unless it is
+	 * unaligned IO, which needs an exclusive lock anyway.
+	 */
+	if (ext4_unaligned_io(inode, from, offset)) {
+		unaligned_io = true;
+		ilock_shared = false;
+	}
+	/*
+	 * Quick check here without any i_rwsem lock to see if it is extending
+	 * IO. A more reliable check is done in ext4_dio_write_checks() with
+	 * proper locking in place.
+	 */
+	if (offset + count > i_size_read(inode))
+		ilock_shared = false;
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (ilock_shared) {
+			if (!inode_trylock_shared(inode))
+				return -EAGAIN;
+		} else {
+			if (!inode_trylock(inode))
+				return -EAGAIN;
+		}
+	} else {
+		if (ilock_shared)
+			inode_lock_shared(inode);
+		else
+			inode_lock(inode);
+	}
+
+	/* Fall back to buffered I/O if the inode does not support direct I/O. */
+	if (!ext4_dio_supported(iocb, from)) {
+		if (ilock_shared)
+			inode_unlock_shared(inode);
+		else
+			inode_unlock(inode);
+		return ext4_buffered_write_iter(iocb, from);
+	}
+
+	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
+	if (ret <= 0)
+		return ret;
+
+	/* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
+	if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+	/*
+	 * Make sure inline data cannot be created anymore since we are going
+	 * to allocate blocks for DIO. We know the inode does not have any
+	 * inline data now because ext4_dio_supported() checked for that.
+	 */
+	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
+	offset = iocb->ki_pos;
+	count = ret;
+
+	/*
+	 * Unaligned direct IO must be serialized among each other as zeroing
+	 * of partial blocks of two competing unaligned IOs can result in data
+	 * corruption.
+	 *
+	 * So we make sure we don't allow any unaligned IO in flight.
+	 * For IOs where we need not wait (like unaligned non-AIO DIO),
+	 * below inode_dio_wait() may anyway become a no-op, since we start
+	 * with an exclusive lock.
+	 */
+	if (unaligned_io)
+		inode_dio_wait(inode);
+
+	if (extend) {
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+
+		ret = ext4_orphan_add(handle, inode);
+		if (ret) {
+			ext4_journal_stop(handle);
+			goto out;
+		}
+
+		ext4_journal_stop(handle);
+	}
+
+	if (ilock_shared)
+		iomap_ops = &ext4_iomap_overwrite_ops;
+	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
+			   is_sync_kiocb(iocb) || unaligned_io || extend);
+	if (ret == -ENOTBLK)
+		ret = 0;
+
+	if (extend)
+		ret = ext4_handle_inode_extension(inode, offset, ret, count);
+
+out:
+	if (ilock_shared)
+		inode_unlock_shared(inode);
+	else
+		inode_unlock(inode);
+
+	if (ret >= 0 && iov_iter_count(from)) {
+		ssize_t err;
+		loff_t endbyte;
+
+		offset = iocb->ki_pos;
+		err = ext4_buffered_write_iter(iocb, from);
+		if (err < 0)
+			return err;
+
+		/*
+		 * We need to ensure that the pages within the page cache for
+		 * the range covered by this I/O are written to disk and
+		 * invalidated. This is in an attempt to preserve the expected
+		 * direct I/O semantics in the case we fall back to buffered
+		 * I/O to complete the I/O request.
+		 */
+		ret += err;
+		endbyte = offset + err - 1;
+		err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
+						   offset, endbyte);
+		if (!err)
+			invalidate_mapping_pages(iocb->ki_filp->f_mapping,
+						 offset >> PAGE_SHIFT,
+						 endbyte >> PAGE_SHIFT);
+	}
+
+	return ret;
 }
 
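The flush-plus-invalidate after the buffered fallback has a rough userspace analogue: sync_file_range() plays the role of filemap_write_and_wait_range(), and posix_fadvise(POSIX_FADV_DONTNEED) approximates invalidate_mapping_pages() (it only drops clean pages, hence the flush first). A sketch with a placeholder path; the analogy is approximate:

```c
/* Sketch: write through the page cache, push the range to disk, then
 * drop the cached pages so later O_DIRECT I/O doesn't see stale cache.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static const char data[] = "tail completed via page cache\n";
	const off_t off = 0;
	const size_t len = sizeof(data) - 1;
	int fd = open("/mnt/ext4/testfile", O_WRONLY | O_CREAT, 0644);
	int err;

	if (fd < 0)
		return 1;

	if (pwrite(fd, data, len, off) < 0)
		perror("pwrite");

	/* Userspace counterpart of filemap_write_and_wait_range(). */
	if (sync_file_range(fd, off, len,
			    SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER))
		perror("sync_file_range");

	/* Userspace counterpart of invalidate_mapping_pages(); note that
	 * posix_fadvise() returns the error number instead of setting errno. */
	err = posix_fadvise(fd, off, len, POSIX_FADV_DONTNEED);
	if (err)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));

	close(fd);
	return 0;
}
```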
 #ifdef CONFIG_FS_DAX
 static ssize_t
 ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
+	size_t count;
+	loff_t offset;
+	handle_t *handle;
+	bool extend = false;
+	struct inode *inode = file_inode(iocb->ki_filp);
 
 	if (iocb->ki_flags & IOCB_NOWAIT) {
 		if (!inode_trylock(inode))
..
 	} else {
 		inode_lock(inode);
 	}
+
 	ret = ext4_write_checks(iocb, from);
 	if (ret <= 0)
 		goto out;
-	ret = file_remove_privs(iocb->ki_filp);
-	if (ret)
-		goto out;
-	ret = file_update_time(iocb->ki_filp);
-	if (ret)
-		goto out;
+
+	offset = iocb->ki_pos;
+	count = iov_iter_count(from);
+
+	if (offset + count > EXT4_I(inode)->i_disksize) {
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+
+		ret = ext4_orphan_add(handle, inode);
+		if (ret) {
+			ext4_journal_stop(handle);
+			goto out;
+		}
+
+		extend = true;
+		ext4_journal_stop(handle);
+	}
 
 	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+
+	if (extend)
+		ret = ext4_handle_inode_extension(inode, offset, ret, count);
 out:
 	inode_unlock(inode);
 	if (ret > 0)
..
 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
-	int o_direct = iocb->ki_flags & IOCB_DIRECT;
-	int unaligned_aio = 0;
-	int overwrite = 0;
-	ssize_t ret;
 
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
..
 	if (IS_DAX(inode))
 		return ext4_dax_write_iter(iocb, from);
 #endif
-	if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
-		return -EOPNOTSUPP;
-
-	if (!inode_trylock(inode)) {
-		if (iocb->ki_flags & IOCB_NOWAIT)
-			return -EAGAIN;
-		inode_lock(inode);
-	}
-
-	ret = ext4_write_checks(iocb, from);
-	if (ret <= 0)
-		goto out;
-
-	/*
-	 * Unaligned direct AIO must be serialized among each other as zeroing
-	 * of partial blocks of two competing unaligned AIOs can result in data
-	 * corruption.
-	 */
-	if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
-	    !is_sync_kiocb(iocb) &&
-	    ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
-		unaligned_aio = 1;
-		ext4_unwritten_wait(inode);
-	}
-
-	iocb->private = &overwrite;
-	/* Check whether we do a DIO overwrite or not */
-	if (o_direct && !unaligned_aio) {
-		if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
-			if (ext4_should_dioread_nolock(inode))
-				overwrite = 1;
-		} else if (iocb->ki_flags & IOCB_NOWAIT) {
-			ret = -EAGAIN;
-			goto out;
-		}
-	}
-
-	ret = __generic_file_write_iter(iocb, from);
-	/*
-	 * Unaligned direct AIO must be the only IO in flight. Otherwise
-	 * overlapping aligned IO after unaligned might result in data
-	 * corruption.
-	 */
-	if (ret == -EIOCBQUEUED && unaligned_aio)
-		ext4_unwritten_wait(inode);
-	inode_unlock(inode);
-
-	if (ret > 0)
-		ret = generic_write_sync(iocb, ret);
-
-	return ret;
-
-out:
-	inode_unlock(inode);
-	return ret;
+	if (iocb->ki_flags & IOCB_DIRECT)
+		return ext4_dio_write_iter(iocb, from);
+	else
+		return ext4_buffered_write_iter(iocb, from);
 }
 
 #ifdef CONFIG_FS_DAX
..
 	.fault		= ext4_filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= ext4_page_mkwrite,
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	.allow_speculation = filemap_allow_speculation,
+#endif
 };
 
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_mapping->host;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct dax_device *dax_dev = sbi->s_daxdev;
 
-	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+	if (unlikely(ext4_forced_shutdown(sbi)))
 		return -EIO;
 
 	/*
-	 * We don't support synchronous mappings for non-DAX files. At least
-	 * until someone comes with a sensible use case.
+	 * We don't support synchronous mappings for non-DAX files, nor for
+	 * DAX files whose underlying dax_device is not itself synchronous.
 	 */
-	if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
+	if (!daxdev_mapping_supported(vma, dax_dev))
 		return -EOPNOTSUPP;
 
 	file_accessed(file);
..
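daxdev_mapping_supported() is what now decides whether a MAP_SYNC request can be honored: the inode must be DAX and the underlying dax_device synchronous. From userspace the request looks like the sketch below; MAP_SHARED_VALIDATE is mandatory so the kernel rejects the flag instead of silently ignoring it (older toolchains may need <linux/mman.h> for MAP_SYNC, and the path is a placeholder):

```c
/* Sketch: request a synchronous mapping; fails with EOPNOTSUPP unless
 * the file is DAX on a synchronous dax_device.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const size_t len = 4096;
	int fd = open("/mnt/ext4/testfile", O_RDWR);
	void *p;

	if (fd < 0)
		return 1;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");		/* EOPNOTSUPP on non-DAX ext4 */
		close(fd);
		return 1;
	}

	/* With MAP_SYNC, stores are durable once CPU caches are flushed;
	 * no msync() is needed for the mapped metadata. */
	munmap(p, len);
	close(fd);
	return 0;
}
```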
 	handle_t *handle;
 	int err;
 
-	if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
+	if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
 		return 0;
 
 	if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
 		return 0;
 
-	sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+	ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
 	/*
 	 * Sample where the filesystem has been mounted and
 	 * store it in the superblock for sysadmin convenience
..
 	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
 	if (err)
 		goto out_journal;
-	strlcpy(sbi->s_es->s_last_mounted, cp,
+	strncpy(sbi->s_es->s_last_mounted, cp,
 		sizeof(sbi->s_es->s_last_mounted));
 	ext4_handle_dirty_super(handle, sb);
 out_journal:
..
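On the strlcpy() to strncpy() switch: s_last_mounted is a fixed-width on-disk field, so the intended semantics are zero padding with no guaranteed NUL terminator, and any reader must bound its accesses to sizeof(field). A small demo of exactly what strncpy() does here:

```c
/* Demo: strncpy() zero-pads short strings but does NOT terminate when
 * the source fills the destination, unlike strlcpy()/strscpy().
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char field[8];

	strncpy(field, "/mnt", sizeof(field));
	/* field = "/mnt\0\0\0\0": padded, terminated as a side effect. */
	printf("short: '%.*s'\n", (int)sizeof(field), field);

	strncpy(field, "/mnt/data", sizeof(field));
	/* field = "/mnt/dat": truncated, NO terminator inside the field,
	 * so reads must be bounded, as with %.*s here. */
	printf("long:  '%.*s'\n", (int)sizeof(field), field);
	return 0;
}
```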
 	return err;
 }
 
-static int ext4_file_open(struct inode * inode, struct file * filp)
+static int ext4_file_open(struct inode *inode, struct file *filp)
 {
 	int ret;
 
..
 		return ret;
 	}
 
-	filp->f_mode |= FMODE_NOWAIT;
+	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
 	return dquot_file_open(inode, filp);
 }
 
..
 						maxbytes, i_size_read(inode));
 	case SEEK_HOLE:
 		inode_lock_shared(inode);
-		offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
+		offset = iomap_seek_hole(inode, offset,
+					 &ext4_iomap_report_ops);
 		inode_unlock_shared(inode);
 		break;
 	case SEEK_DATA:
 		inode_lock_shared(inode);
-		offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
+		offset = iomap_seek_data(inode, offset,
+					 &ext4_iomap_report_ops);
 		inode_unlock_shared(inode);
 		break;
 	}
..
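ext4_iomap_report_ops is a read-only iomap variant used purely to report extents, which is all the seek path needs; switching away from ext4_iomap_ops keeps allocation out of this path. The userspace interface served here is lseek() with SEEK_DATA/SEEK_HOLE; a sketch with a placeholder path:

```c
/* Sketch: enumerate a sparse file's data extents via SEEK_DATA/SEEK_HOLE,
 * the interface backed by iomap_seek_data()/iomap_seek_hole() above.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ext4/sparsefile", O_RDONLY);
	off_t data = 0, hole;

	if (fd < 0)
		return 1;

	/* lseek() fails with ENXIO once no data remains past the offset. */
	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);	/* end of this extent */
		if (hole < 0)
			break;
		printf("data: [%lld, %lld)\n", (long long)data, (long long)hole);
		data = hole;
	}

	close(fd);
	return 0;
}
```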
 	.llseek		= ext4_llseek,
 	.read_iter	= ext4_file_read_iter,
 	.write_iter	= ext4_file_write_iter,
+	.iopoll		= iomap_dio_iopoll,
 	.unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext4_compat_ioctl,