```diff
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * fs/direct-io.c
  *
@@ -38,6 +39,8 @@
 #include <linux/uio.h>
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
+
+#include "internal.h"
 
 /*
  * How many user pages to map in one call to get_user_pages(). This determines
@@ -221,29 +224,7 @@
 }
 
 /*
- * Warn about a page cache invalidation failure during a direct io write.
- */
-void dio_warn_stale_pagecache(struct file *filp)
-{
-	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
-	char pathname[128];
-	struct inode *inode = file_inode(filp);
-	char *path;
-
-	errseq_set(&inode->i_mapping->wb_err, -EIO);
-	if (__ratelimit(&_rs)) {
-		path = file_path(filp, pathname, sizeof(pathname));
-		if (IS_ERR(path))
-			path = "(unknown)";
-		pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
-		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
-			current->comm);
-	}
-}
-
-/**
  * dio_complete() - called when all DIO BIO I/O has been completed
- * @offset: the byte offset in the file of the completed operation
  *
  * This drops i_dio_count, lets interested parties know that a DIO operation
  * has completed, and calculates the resulting return code for the operation.
```
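
Note on the removal above: dio_warn_stale_pagecache() is deleted here together with its kernel-doc, while the earlier hunk adds `#include "internal.h"` — which reads as the helper being relocated out of fs/direct-io.c (presumably with its declaration in fs/internal.h) rather than dropped. Call sites keep the same shape; roughly, a direct-write completion path invokes it when page cache invalidation fails. A sketch of that pattern (the kernel API names are real; the surrounding variables follow this file's dio_complete() and are illustrative):

```c
/*
 * Sketch of the typical call site: after a direct write, the overlapping
 * page cache range is invalidated; a failure means buffered readers could
 * see stale data, so the (relocated) helper logs it and marks the mapping
 * with -EIO. Illustrative, not the exact code in this tree.
 */
err = invalidate_inode_pages2_range(dio->inode->i_mapping,
				    offset >> PAGE_SHIFT,
				    (offset + ret - 1) >> PAGE_SHIFT);
if (err)
	dio_warn_stale_pagecache(dio->iocb->ki_filp);
```
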
```diff
@@ -406,25 +387,6 @@
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 }
 
-/**
- * dio_end_io - handle the end io action for the given bio
- * @bio: The direct io bio thats being completed
- *
- * This is meant to be called by any filesystem that uses their own dio_submit_t
- * so that the DIO specific endio actions are dealt with after the filesystem
- * has done it's completion work.
- */
-void dio_end_io(struct bio *bio)
-{
-	struct dio *dio = bio->bi_private;
-
-	if (dio->is_async)
-		dio_bio_end_aio(bio);
-	else
-		dio_bio_end_io(bio);
-}
-EXPORT_SYMBOL_GPL(dio_end_io);
-
 static inline void
 dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 	      struct block_device *bdev,
```
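
The dio_end_io() export goes away entirely. It existed so a filesystem supplying its own dio_submit_t could run the generic DIO end-io bookkeeping after its private completion work; with no in-tree users left, completion routing happens only through the bi_end_io callback picked when the bio is allocated. The excerpt below shows that selection as it appears in this file's dio_bio_alloc() (unchanged by this patch):

```c
/* Excerpt from dio_bio_alloc(): the end-io handler is chosen once at
 * allocation time, so no external dispatch helper is required. */
if (dio->is_async)
	bio->bi_end_io = dio_bio_end_aio;
else
	bio->bi_end_io = dio_bio_end_io;
```
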
```diff
@@ -523,8 +485,8 @@
 		dio->waiter = current;
 		spin_unlock_irqrestore(&dio->bio_lock, flags);
 		if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_poll(dio->bio_disk->queue, dio->bio_cookie))
-			io_schedule();
+		    !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
+			blk_io_schedule();
 		/* wake up sets us TASK_RUNNING */
 		spin_lock_irqsave(&dio->bio_lock, flags);
 		dio->waiter = NULL;
```
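
Two related fixes in the synchronous wait loop: blk_poll() takes a third argument selecting whether to spin (`true` here preserves the old busy-poll behaviour), and io_schedule() is replaced by blk_io_schedule(), which bounds each sleep so a long-running direct I/O doesn't trip the hung-task watchdog. For reference, blk_io_schedule() in mainline is approximately:

```c
/* Approximate mainline implementation (block/blk-core.c). */
void blk_io_schedule(void)
{
	/* Prevent hang_check timer from firing at us during very long I/O */
	unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;

	if (timeout)
		io_schedule_timeout(timeout);
	else
		io_schedule();
}
```
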
```diff
@@ -542,9 +504,8 @@
  */
 static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
 {
-	struct bio_vec *bvec;
-	unsigned i;
 	blk_status_t err = bio->bi_status;
+	bool should_dirty = dio->op == REQ_OP_READ && dio->should_dirty;
 
 	if (err) {
 		if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT))
@@ -553,17 +514,10 @@
 			dio->io_error = -EIO;
 	}
 
-	if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
+	if (dio->is_async && should_dirty) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 	} else {
-		bio_for_each_segment_all(bvec, bio, i) {
-			struct page *page = bvec->bv_page;
-
-			if (dio->op == REQ_OP_READ && !PageCompound(page) &&
-			    dio->should_dirty)
-				set_page_dirty_lock(page);
-			put_page(page);
-		}
+		bio_release_pages(bio, should_dirty);
 		bio_put(bio);
 	}
 	return err;
```
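
The hand-rolled loop that re-dirtied and released the bio's pages is replaced by bio_release_pages(), which implements the same policy in one place and additionally honours BIO_NO_PAGE_REF for bios whose pages were never pinned. The helper looks approximately like this (block/bio.c, circa v5.4):

```c
/* Approximate shape of the helper this hunk switches to. */
void bio_release_pages(struct bio *bio, bool mark_dirty)
{
	struct bvec_iter_all iter_all;
	struct bio_vec *bvec;

	if (bio_flagged(bio, BIO_NO_PAGE_REF))
		return;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		if (mark_dirty && !PageCompound(bvec->bv_page))
			set_page_dirty_lock(bvec->bv_page);
		put_page(bvec->bv_page);
	}
}
```
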
```diff
@@ -1206,22 +1160,13 @@
 	 * the early prefetch in the caller enough time.
 	 */
 
-	if (align & blocksize_mask) {
-		if (bdev)
-			blkbits = blksize_bits(bdev_logical_block_size(bdev));
-		blocksize_mask = (1 << blkbits) - 1;
-		if (align & blocksize_mask)
-			goto out;
-	}
-
 	/* watch out for a 0 len io from a tricksy fs */
 	if (iov_iter_rw(iter) == READ && !count)
 		return 0;
 
 	dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
-	retval = -ENOMEM;
 	if (!dio)
-		goto out;
+		return -ENOMEM;
 	/*
 	 * Believe it or not, zeroing out the page array caused a .5%
 	 * performance regression in a database benchmark. So, we take
@@ -1230,32 +1175,32 @@
 	memset(dio, 0, offsetof(struct dio, pages));
 
 	dio->flags = flags;
-	if (dio->flags & DIO_LOCKING) {
-		if (iov_iter_rw(iter) == READ) {
-			struct address_space *mapping =
-					iocb->ki_filp->f_mapping;
-
-			/* will be released by direct_io_worker */
-			inode_lock(inode);
-
-			retval = filemap_write_and_wait_range(mapping, offset,
-							      end - 1);
-			if (retval) {
-				inode_unlock(inode);
-				kmem_cache_free(dio_cache, dio);
-				goto out;
-			}
-		}
+	if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
+		/* will be released by direct_io_worker */
+		inode_lock(inode);
 	}
 
 	/* Once we sampled i_size check for reads beyond EOF */
 	dio->i_size = i_size_read(inode);
 	if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
-		if (dio->flags & DIO_LOCKING)
-			inode_unlock(inode);
-		kmem_cache_free(dio_cache, dio);
 		retval = 0;
-		goto out;
+		goto fail_dio;
+	}
+
+	if (align & blocksize_mask) {
+		if (bdev)
+			blkbits = blksize_bits(bdev_logical_block_size(bdev));
+		blocksize_mask = (1 << blkbits) - 1;
+		if (align & blocksize_mask)
+			goto fail_dio;
+	}
+
+	if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) {
+		struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+		retval = filemap_write_and_wait_range(mapping, offset, end - 1);
+		if (retval)
+			goto fail_dio;
 	}
 
 	/*
@@ -1280,6 +1225,8 @@
 	} else {
 		dio->op = REQ_OP_READ;
 	}
+	if (iocb->ki_flags & IOCB_HIPRI)
+		dio->op_flags |= REQ_HIPRI;
 
 	/*
 	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
```
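
Setting REQ_HIPRI whenever the iocb carries IOCB_HIPRI is what lets the blk_poll() wait shown earlier actually find a pollable request. From userspace the flag is normally requested with RWF_HIPRI; a minimal polled O_DIRECT read might look like this (illustrative: the device path and 4 KiB alignment are placeholders):

```c
/* Minimal polled O_DIRECT read using RWF_HIPRI (illustrative). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>

int main(void)
{
	struct iovec iov;
	void *buf;
	int fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	iov.iov_base = buf;
	iov.iov_len = 4096;
	/* RWF_HIPRI sets IOCB_HIPRI on the kernel-side iocb */
	if (preadv2(fd, &iov, 1, 0, RWF_HIPRI) < 0) {
		perror("preadv2");
		return 1;
	}
	return 0;
}
```
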
```diff
@@ -1297,14 +1244,8 @@
 			 */
 			retval = sb_init_dio_done_wq(dio->inode->i_sb);
 		}
-		if (retval) {
-			/*
-			 * We grab i_mutex only for reads so we don't have
-			 * to release it here
-			 */
-			kmem_cache_free(dio_cache, dio);
-			goto out;
-		}
+		if (retval)
+			goto fail_dio;
 	}
 
 	/*
@@ -1328,7 +1269,7 @@
 	spin_lock_init(&dio->bio_lock);
 	dio->refcount = 1;
 
-	dio->should_dirty = (iter->type == ITER_IOVEC);
+	dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ;
 	sdio.iter = iter;
 	sdio.final_block_in_request = end >> blkbits;
 
@@ -1407,7 +1348,13 @@
 	} else
 		BUG_ON(retval != -EIOCBQUEUED);
 
-out:
+	return retval;
+
+fail_dio:
+	if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ)
+		inode_unlock(inode);
+
+	kmem_cache_free(dio_cache, dio);
 	return retval;
 }
 
```
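
Taken together, the hunks above collapse four slightly different unwind sequences (unlock, free, `goto out`) into a single fail_dio label that undoes exactly what has been set up so far: the inode lock is dropped only in the DIO_LOCKING read case that took it, then the dio is freed. The alignment check also moves after the allocation and EOF checks so it can unwind the same way. The idiom in miniature (a standalone illustrative userspace example, not kernel code):

```c
/* Single-exit unwinding in miniature (illustrative userspace C). */
#include <stdlib.h>

static int do_work(void)
{
	char *a, *b;
	int ret = -1;

	a = malloc(64);
	if (!a)
		return -1;		/* nothing set up yet: bail directly */
	b = malloc(64);
	if (!b)
		goto fail_free_a;	/* one label undoes all prior setup */

	/* ... real work would go here ... */
	ret = 0;

	free(b);
fail_free_a:
	free(a);
	return ret;
}

int main(void)
{
	return do_work() ? EXIT_FAILURE : EXIT_SUCCESS;
}
```
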
```diff
@@ -1426,14 +1373,14 @@
 	 * Attempt to prefetch the pieces we likely need later.
 	 */
 	prefetch(&bdev->bd_disk->part_tbl);
-	prefetch(bdev->bd_queue);
-	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+	prefetch(bdev->bd_disk->queue);
+	prefetch((char *)bdev->bd_disk->queue + SMP_CACHE_BYTES);
 
 	return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
 			end_io, submit_io, flags);
 }
 
-EXPORT_SYMBOL(__blockdev_direct_IO);
+EXPORT_SYMBOL_NS(__blockdev_direct_IO, ANDROID_GKI_VFS_EXPORT_ONLY);
 
 static __init int dio_init(void)
 {
```
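
Two last changes here: the prefetch hints now go through bdev->bd_disk->queue, apparently because the bd_queue shortcut field was removed from struct block_device, and the export becomes a namespaced EXPORT_SYMBOL_NS, the GKI mechanism for restricting which modules may link against core VFS symbols. Any module that still calls __blockdev_direct_IO must import the namespace explicitly, or it will fail to load with an unresolved-namespace error:

```c
/* In the calling module's source: import the namespace guarding the
 * symbol (the namespace name comes from the export in this patch). */
#include <linux/module.h>

MODULE_IMPORT_NS(ANDROID_GKI_VFS_EXPORT_ONLY);
```
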
|---|